4 #include "../include/mycuda.h"
6 using namespace mycuda;
// Folds x[0..n-1] into x[0] with a fixed cascade of halving steps.
// For each span s = 1024, 512, ..., 1 the step runs only when n > s: after a
// block barrier, every thread with i < s whose partner index i+s is still
// inside the array adds x[i+s] into x[i]. The `i+s < n` guard is what allows
// n to be any length, not just a power of two.
// Indices 2048 and above are never read, so the fold is complete only for
// n <= 2048. When n <= 1 no step runs and x is left untouched.
// NOTE(review): __syncthreads() sits inside `if (n > s)`; that is only valid
// when every thread of the block sees the same n -- confirm at the call site.
// NOTE(review): assumes the block provides a thread for every i that has to
// perform an add -- confirm against the launch configuration.
27 const int i = threadIdx.x;
// The barrier precedes each step so that the sums written by the previous
// step (or by the caller) are visible before they are read here.
34 if ( n>1024) { __syncthreads();
if (i<1024 && i+1024<n) x[i] += x[i+1024]; }
35 if ( n> 512) { __syncthreads();
if (i< 512 && i+ 512<n) x[i] += x[i+ 512]; }
36 if ( n> 256) { __syncthreads();
if (i< 256 && i+ 256<n) x[i] += x[i+ 256]; }
37 if ( n> 128) { __syncthreads();
if (i< 128 && i+ 128<n) x[i] += x[i+ 128]; }
38 if ( n> 64) { __syncthreads();
if (i< 64 && i+ 64<n) x[i] += x[i+ 64]; }
39 if ( n> 32) { __syncthreads();
if (i< 32 && i+ 32<n) x[i] += x[i+ 32]; }
40 if ( n> 16) { __syncthreads();
if (i< 16 && i+ 16<n) x[i] += x[i+ 16]; }
41 if ( n> 8) { __syncthreads();
if (i< 8 && i+ 8<n) x[i] += x[i+ 8]; }
42 if ( n> 4) { __syncthreads();
if (i< 4 && i+ 4<n) x[i] += x[i+ 4]; }
43 if ( n> 2) { __syncthreads();
if (i< 2 && i+ 2<n) x[i] += x[i+ 2]; }
// Final step: only thread 0 writes (x[0] += x[1]). No barrier follows it, so
// thread 0 can read x[0] directly but any other thread would need a barrier first.
44 if ( n> 1) { __syncthreads();
if (i< 1 && i+ 1<n) x[i] += x[i+ 1]; }
// Per-block setup: tid is the thread's index within its block, j is the block
// index, and each block works on the slice of x that starts at j*stride.
69 const int tid = threadIdx.x;
70 const int j = blockIdx.x;
71 const int offset = j*stride;
// Each thread copies one element of the block's slice into r[tid].
// NOTE(review): the declaration of r is not visible here -- presumably a
// per-block scratch buffer; confirm its storage and size.
77 r[tid] = x[offset+tid];
// Thread 0 stores r[0] as this block's entry of xsum.
// NOTE(review): the statements between the load above and this store are not
// shown -- presumably r is reduced into r[0] there; confirm.
81 if (tid==0) xsum[j] = r[0];
// Buffers: x holds M*N floats and xsum holds N floats.
// NOTE(review): mapped_malloc is a project helper whose definition is not
// visible -- presumably it returns memory usable from both host and device,
// since the same pointers are passed to kernels and dereferenced on the host
// below; confirm.
89 float * x = mapped_malloc<float>(M*N);
90 float * xsum = mapped_malloc<float>(N);
// Both kernels are launched with N blocks of M threads. reduce1 receives M as
// its third argument.
92 seq <<< N, M >>> ( x, 0.0f, 0.1f, M*N );
93 reduce1 <<< N, M >>> ( xsum, x, M );
// NOTE(review): kernel launches are asynchronous; the host reads of x and xsum
// below need a synchronization after the launches. The intervening lines are
// not visible here -- confirm one is present.
// Host reference: for each j, accumulate the M consecutive values x[j*M .. j*M+M-1]
// into xsum_gold[j].
98 for (
int j=0; j<N; j++) {
100 for (
int i=0; i<M; i++ ) {
101 xsum_gold[j] += x[j*M+i];
// Print the host reference, the kernel result, and their difference.
103 printf(
"%10.2f %10.2f % 10.4f\n", xsum_gold[j], xsum[j], xsum_gold[j] - xsum[j] );