3 namespace mycuda_reduce {
// Fragment: call operator of a device-side binary reduction functor —
// combines two floats into one.
// NOTE(review): the enclosing struct, the body, and the closing brace are
// in elided lines of the original file; the actual combine semantics
// (sum / min / max / ...) cannot be confirmed from this excerpt.
15 __device__
float operator () (
const float x,
const float y )
const {
// Fragment: host/device binary reduction functor with explicit NaN
// propagation — if the left operand is NaN it is returned unchanged, so a
// NaN anywhere in the input poisons the reduction result (unlike plain
// fminf/fmaxf, which would silently drop the NaN operand).
// NOTE(review): only the NaN guard is visible here; the non-NaN combine
// path is in elided lines — presumably a min/max/sum on (x, y), confirm
// against the full source.
31 __host__ __device__
float operator () (
const float x,
const float y )
const {
32 if (isnan(x))
return x;
// Fragment: a second host/device functor with the same NaN-propagating
// prologue as the one above (original line 31) — returns x unchanged when
// it is NaN so the reduction cannot mask NaNs.
// NOTE(review): the non-NaN combine path is elided; likely the dual of the
// previous functor (e.g. min vs. max) — verify in the full file.
48 __host__ __device__
float operator () (
const float x,
const float y )
const {
49 if (isnan(x))
return x;
// Fragment: in-block tree reduction over n elements, halving the active
// stride each step (1024 -> 512 -> ... -> 1). T is presumably the element
// type and R the reduction-functor type; the function signature itself is
// in elided lines.
72 template <
typename T,
typename R>
75 const int i = threadIdx.x;
// Each step barriers first, then lets the low half of the threads fold in
// the element `stride` positions away. The __syncthreads() sits inside an
// `if` on n, which is safe only because n is uniform across the block —
// every thread takes the same branch, so all threads reach the barrier.
77 if ( n>1024) { __syncthreads();
if (i<1024 && i+1024<n) x[i] = op( x[i], x[i+1024]); }
78 if ( n> 512) { __syncthreads();
if (i< 512 && i+ 512<n) x[i] = op( x[i], x[i+ 512]); }
79 if ( n> 256) { __syncthreads();
if (i< 256 && i+ 256<n) x[i] = op( x[i], x[i+ 256]); }
80 if ( n> 128) { __syncthreads();
if (i< 128 && i+ 128<n) x[i] = op( x[i], x[i+ 128]); }
81 if ( n> 64) { __syncthreads();
if (i< 64 && i+ 64<n) x[i] = op( x[i], x[i+ 64]); }
// Sub-warp phase (strides 32..1), now operating on `smem` rather than `x`
// — the hand-off from x to smem happens in elided lines 82-86.
// NOTE(review): no __syncthreads()/__syncwarp() is visible between these
// steps. If this relies on implicit warp-synchronous execution it is
// unsafe on Volta+ (independent thread scheduling) unless smem is declared
// volatile or __syncwarp() appears in the elided lines — confirm, and
// prefer explicit __syncwarp() between dependent steps.
87 if ( i+32<n ) smem[i] = op( smem[i], smem[i+32]);
88 if ( i+16<n ) smem[i] = op( smem[i], smem[i+16]);
89 if ( i+ 8<n ) smem[i] = op( smem[i], smem[i+ 8]);
90 if ( i+ 4<n ) smem[i] = op( smem[i], smem[i+ 4]);
91 if ( i+ 2<n ) smem[i] = op( smem[i], smem[i+ 2]);
92 if ( i+ 1<n ) smem[i] = op( smem[i], smem[i+ 1]);
// Fragment: grid-level reduction kernel. Launched on a 2D grid where
// blockIdx.y selects a column/row j (rows separated by `stride` elements)
// and blockIdx.x selects tile k along the reduced dimension; each block
// consumes 2*blocksize elements per pass and emits one partial per block
// into xsum. The kernel signature and most control-flow braces are in
// elided lines.
110 template <
typename T,
typename R >
117 const int tid = threadIdx.x;
118 const int j = blockIdx.y;
119 const int k = blockIdx.x;
// Start of this block's first 2*blocksize-wide tile within row j.
121 const int offset = j*stride + 2*k*
blocksize;
122 const int nblocks_per_col = gridDim.x;
// Dynamic shared memory; usage of r[tid] and r[tid+blocksize] below implies
// the launch must supply at least 2*blocksize*sizeof(T) bytes — TODO confirm
// against the host-side launch.
125 extern __shared__ T r[];
127 int i = offset + tid;
// Bounds checks are all against the end of row j (j*stride + n).
129 if (i < j*stride+n ) {
134 if (i < j*stride+n ) {
// Advance to the next tile this block owns; combined with the per-iteration
// +blocksize hop (in elided lines), consecutive iterations stride the full
// grid: 2*nblocks_per_col*blocksize per element slot.
136 i += (2*nblocks_per_col - 1)*blocksize;
140 if (i>=j*stride+n)
break;
// Fold the first element of the pair into the low half of shared memory.
141 r[tid] = op( r[tid], x[i] );
// NOTE(review): between lines 141 and 144 an `i += blocksize` (or similar)
// must occur in the elided lines for the second fold to read a distinct
// element — verify.
144 if (i>=j*stride+n)
break;
// Fold the second element into the high half of shared memory.
145 r[tid+
blocksize] = op( r[tid+blocksize], x[i]);
146 i += (2*nblocks_per_col - 1)*blocksize;
// nn = number of valid elements in this block's tile (tail tiles are short).
149 int nn = ( n - 2*k*blocksize < 2*blocksize ? n-2*k*blocksize : 2*
blocksize );
// Thread 0 publishes this block's partial. NOTE(review): reading r[0] here
// requires a __syncthreads() after the in-block reduction — presumably in
// the elided lines (150-151); confirm.
152 if (tid==0) xsum[j*nblocks_per_col+k] = r[0];
// Empty-tile branch (presumably the `else` of a check on nn in elided
// lines): emit the reduction identity so downstream combination is correct.
154 if (tid==0) xsum[j*nblocks_per_col+k] = op.zero;