GPU Workshop Sample Code
 All Data Structures Namespaces Files Functions Variables Macros Pages
mycuda_reduce.h
Go to the documentation of this file.
1 
3 namespace mycuda_reduce {
4 
5 
6 
// Reduction functor: addition.
// `zero` is the identity element of the operation (0 for sums); the
// reduce kernel writes it out for blocks that receive no data.
class SUM
{
 public:

  // Identity element of the reduction.
  const float zero;

  SUM() : zero(0.0f) {}

  // Combine two partial results.  Marked __host__ __device__ for
  // consistency with the MAX/MIN functors so the same functor type can
  // be used on both the host and the device.
  __host__ __device__ float operator () ( const float x, const float y ) const {
    return x + y;
  }
};
19 
20 
21 
22 
// Reduction functor: maximum.
// `zero` is the identity element of the operation (-infinity); the
// reduce kernel writes it out for blocks that receive no data.
// A NaN in either argument propagates to the result: a NaN x is
// returned directly, and a NaN y makes the comparison fail so y is
// returned.
class MAX
{
 public:

  // Identity element of the reduction (-infinity).
  const float zero;

  MAX() : zero(-1.0f/0.0f) {}

  // Combine two partial results, keeping the larger.
  __host__ __device__ float operator () ( const float x, const float y ) const {
    if (isnan(x)) return x;          // without this, a NaN x would be dropped
    return (y < x) ? x : y;
  }
};
36 
37 
38 
39 
// Reduction functor: minimum.
// `zero` is the identity element of the operation (+infinity); the
// reduce kernel writes it out for blocks that receive no data.
// A NaN in either argument propagates to the result (see operator()).
class MIN
{
 public:

  // Identity element of the reduction (+infinity).
  const float zero;

  // Use float literals throughout: the original 1.0/0.0f promoted the
  // division to double and then narrowed; 1.0f/0.0f matches MAX's style
  // and stays in single precision.
  MIN() : zero(1.0f/0.0f) {}

  // Combine two partial results, keeping the smaller.
  __host__ __device__ float operator () ( const float x, const float y ) const {
    if (isnan(x)) return x;          // without this, a NaN x would be dropped
    return x < y ? x : y;
  }
};
53 
54 
55 
57 
// In-place tree reduction of x[0..n-1] down to x[0] using functor op.
// x must point to shared memory, and every thread of the block must
// call this (it contains __syncthreads()).  Assumes n <= 2048.  The
// final phase synchronizes the first warp; it assumes blockDim.x >= 32
// whenever n > 32 so the full warp exists — TODO confirm against all
// launch sites.
//
// There are a couple of optimizations possible for special cases.
// * If n is a power of 2, can omit checks that i+???<n at each level.
// * If n>=64, can omit checks in the last 6 levels (single warp).
template <typename T, typename R>
__device__ void reduce_dev( T* x, int n, R op )
{
  const int i = threadIdx.x;

  // Block-wide levels.  n is uniform across the block, so every thread
  // takes the same branch and reaches each __syncthreads().
  if ( n>1024) { __syncthreads(); if (i<1024 && i+1024<n) x[i] = op( x[i], x[i+1024]); }
  if ( n> 512) { __syncthreads(); if (i< 512 && i+ 512<n) x[i] = op( x[i], x[i+ 512]); }
  if ( n> 256) { __syncthreads(); if (i< 256 && i+ 256<n) x[i] = op( x[i], x[i+ 256]); }
  if ( n> 128) { __syncthreads(); if (i< 128 && i+ 128<n) x[i] = op( x[i], x[i+ 128]); }
  if ( n>  64) { __syncthreads(); if (i<  64 && i+  64<n) x[i] = op( x[i], x[i+  64]); }
  __syncthreads();

  // Final levels run inside the first warp.  Relying on implicit
  // lockstep execution is not valid on Volta+ (independent thread
  // scheduling), so each dependent step is separated by __syncwarp().
  // volatile keeps the compiler from caching shared values in registers.
  volatile T* smem = x;
  if (i<32) {
    if ( i+32<n ) smem[i] = op( smem[i], smem[i+32]);
    __syncwarp();
    if ( i+16<n ) smem[i] = op( smem[i], smem[i+16]);
    __syncwarp();
    if ( i+ 8<n ) smem[i] = op( smem[i], smem[i+ 8]);
    __syncwarp();
    if ( i+ 4<n ) smem[i] = op( smem[i], smem[i+ 4]);
    __syncwarp();
    if ( i+ 2<n ) smem[i] = op( smem[i], smem[i+ 2]);
    __syncwarp();
    if ( i+ 1<n ) smem[i] = op( smem[i], smem[i+ 1]);
  }
  __syncthreads();
}
96 
97 
99 
// One-pass partial reduction of each column of a strided 2-D array.
//
// Launch layout expected:
//   gridDim.y = number of columns; column j occupies x[j*stride ..
//               j*stride+n-1].
//   gridDim.x = blocks per column; block k first gathers its share of
//               the column into 2*blockDim.x shared-memory slots, then
//               tree-reduces them with reduce_dev.
//   dynamic shared memory: 2 * blockDim.x * sizeof(T) bytes.
//
// Output: xsum[j*gridDim.x + k] receives block (k,j)'s partial result,
// or op.zero (the functor's identity) if the block got no elements.
// A further reduction over xsum completes the result.
template <typename T, typename R >
__global__ void reduce( T * xsum,
                        T * x,
                        int n,      // column length
                        int stride, // column stride
                        R op )      // reduction functor
{
  const int tid = threadIdx.x;
  const int j = blockIdx.y;
  const int k = blockIdx.x;
  const int blocksize = blockDim.x;
  const int offset = j*stride + 2*k*blocksize;
  const int nblocks_per_col = gridDim.x;

  // "extern __shared__ T r[];" inside a template kernel is ill-formed
  // once the kernel is instantiated for more than one T: all
  // instantiations name the same shared symbol with conflicting types.
  // Declare untyped storage once and cast.  NOTE(review): assumes T's
  // alignment does not exceed the dynamic shared-memory base alignment.
  extern __shared__ unsigned char mycuda_reduce_smem[];
  T* r = reinterpret_cast<T*>(mycuda_reduce_smem);

  int i = offset + tid;

  // First touch: initialize the two shared slots this thread owns.
  // Slots past the end of the column are left untouched; reduce_dev's
  // bounds checks (via nn below) never read them.
  if (i < j*stride+n ) {
    r[tid] = x[i];
    i += blocksize;
  }

  if (i < j*stride+n ) {
    r[tid+blocksize] = x[i];
    i += (2*nblocks_per_col - 1)*blocksize;  // jump to this block's next chunk
  }

  // Accumulate the rest of this block's strided share of the column.
  while ( 1 ) {
    if (i>=j*stride+n) break;
    r[tid] = op( r[tid], x[i] );
    i += blocksize;

    if (i>=j*stride+n) break;
    r[tid+blocksize] = op( r[tid+blocksize], x[i]);
    i += (2*nblocks_per_col - 1)*blocksize;
  }

  // Number of valid shared-memory entries for this block, at most
  // 2*blocksize; <= 0 means the block received no data.
  int nn = ( n - 2*k*blocksize < 2*blocksize ? n-2*k*blocksize : 2*blocksize );
  if (nn>0) {
    reduce_dev( r, nn, op );
    if (tid==0) xsum[j*nblocks_per_col+k] = r[0];
  } else {
    // Empty block: emit the functor's identity element.
    if (tid==0) xsum[j*nblocks_per_col+k] = op.zero;
  }
}
157 
158 
159 } // end namespace mycuda_reduce