9 #include "../include/mycuda.h"
10 #include "../include/mycuda_reduce.h"
12 using namespace mycuda;
// NOTE(review): this listing is an excerpt -- the kernel signature, the
// definition of i, the loop headers and the closing braces sit on lines that
// are not shown. The statements below use xsum, x, n, stride and op, which
// lines up with the argument list of the reduce2 launch further down;
// presumably this is that kernel -- confirm against the full file.
29 template <
typename T,
typename R >
// Thread index inside the block, and block index j; j selects the segment of
// x that begins at element j*stride.
36 const int tid = threadIdx.x;
37 const int j = blockIdx.x;
38 const int offset = j*stride;
// Dynamically sized shared array: its byte size is supplied by the third
// launch-configuration argument. Slots up to tid+blockDim.x are touched below,
// so it has to hold at least 2*blockDim.x elements of T.
40 extern __shared__ T r[];
// Store one input element in the upper-half slot (tid+blockDim.x) of r.
49 r[tid+blockDim.x] = x[i];
// Leave the enclosing loop (header not visible here) once i has moved past
// the n elements of this block's segment.
54 if (i>=offset+n)
break;
// Fold x[i] into this thread's lower-half partial result.
55 r[tid] = op( r[tid], x[i] );
// Same bounds check for the upper-half accumulation.
58 if (i>=offset+n)
break;
// Fold x[i] into this thread's upper-half partial result.
59 r[tid+blockDim.x] = op( r[tid+blockDim.x], x[i]);
// nn = min(n, 2*blockDim.x). Presumably the number of slots of r that hold
// data -- how nn is consumed is not visible in this excerpt; TODO confirm.
// NOTE(review): blockDim.x is unsigned, so if n is a signed int the comparison
// is performed in unsigned arithmetic -- verify n can never be negative.
63 int nn = (n<2*blockDim.x ? n : 2*blockDim.x );
// Thread 0 writes r[0] out as the result for block j.
65 if (tid==0) xsum[j] = r[0];
// NOTE(review): excerpt of the host-side check. The enclosing function header,
// the definitions of blocksize, n and xerr, and several closing braces are on
// lines that are not shown here.
// Distance, in elements, between the starts of consecutive segments of x.
72 const int stride = 256;
// Number of blocks launched; also the number of entries in xsum.
74 const int gridsize = 64;
// Buffers from the project helper mapped_malloc (definition not visible).
// Both are passed to kernels and also dereferenced on the host below.
76 float * x = mapped_malloc<float>(stride*gridsize);
77 float * xsum = mapped_malloc<float>( gridsize );
// Fill x with the seq kernel (defined elsewhere). NOTE(review): the arguments
// look like (buffer, start, step, count) -- confirm against seq's definition.
78 seq <<< gridsize, blocksize >>> ( x, 0.0f, 0.1f, stride*gridsize );
// Dynamic shared memory request: two floats per thread.
80 int sharedMemorySize = 2*blocksize*
sizeof(float);
// One block per segment; each block is expected to leave its result in
// xsum[blockIdx.x].
81 reduce2 <<< gridsize, blocksize, sharedMemorySize >>> ( xsum, x,
n, stride,
mycuda_reduce::SUM() );
// Host-side reference sums, one per segment.
85 float xsum_gold[gridsize];
87 for (
int j=0; j<gridsize; j++) {
89 for (
int i=0; i<
n; i++) {
// NOTE(review): += needs xsum_gold[j] to start at zero; the initialisation is
// not visible in this excerpt -- confirm it happens before this loop.
90 xsum_gold[j]+= x[j*stride+i];
// NOTE(review): x and xsum are read on the host here, and no synchronisation
// after the kernel launches is visible in this excerpt -- confirm the full
// file waits for the device before reaching this point.
92 xerr[j] = xsum_gold[j] - xsum[j];
// Print reference sum, device sum and their difference for segment j.
93 printf(
"%10.2f %10.2f %10.5f\n", xsum_gold[j], xsum[j], xerr[j] );
// Fail if the relative error of any segment is 1e-4 or more.
// NOTE(review): this divides by xsum_gold[j]; a zero reference sum yields
// inf/NaN in the comparison.
95 for (
int j=0; j<gridsize; j++) assert( fabs( xerr[j]/xsum_gold[j] ) <0.0001 );