GPU Workshop Sample Code
 All Data Structures Namespaces Files Functions Variables Macros Pages
reduce2.cu
Go to the documentation of this file.
1 
9 #include "../include/mycuda.h"
10 #include "../include/mycuda_reduce.h"
11 
12 using namespace mycuda;
13 
14 
16 
/**
 * @brief Reduce each column of a column-major matrix with a binary operator.
 *
 * Launch layout: one thread block per column (blockIdx.x is the column
 * index j, and the result is stored in xsum[j]).  The kernel needs
 * 2*blockDim.x*sizeof(T) bytes of dynamic shared memory, passed as the
 * third launch parameter.
 *
 * Each thread owns two shared-memory slots, tid and tid+blockDim.x.  Slot s
 * accumulates rows s, s+2*blockDim.x, s+4*blockDim.x, ... of the column in
 * increasing row order; the partial results are then combined by
 * mycuda_reduce::reduce_dev.
 *
 * @param xsum    output, one value per column (indexed by blockIdx.x)
 * @param x       input matrix; column j starts at x + j*stride
 * @param n       number of rows to reduce in each column
 * @param stride  distance, in elements, between consecutive columns
 * @param op      binary functor used to combine two values of type T
 *
 * If n <= 0 there is nothing to combine and xsum[j] is left untouched.
 */
template <typename T, typename R >
__global__ void reduce2( T * xsum,
                         T * x,
                         int n,       // nrows of matrix
                         int stride,  // column stride
                         R op )
{
    // n is uniform across the grid, so every thread leaves together and no
    // thread is left waiting at the barrier below.  Without this guard an
    // uninitialised r[0] would be written to xsum (and a negative n would
    // make the whole uninitialised buffer look valid).
    if (n <= 0) return;

    const int tid      = threadIdx.x;
    const int j        = blockIdx.x;
    const int nthreads = blockDim.x;
    const int nslots   = 2*nthreads;   // shared-memory slots per block

    // Dynamic shared memory is declared as raw bytes and reinterpreted:
    // "extern __shared__ T r[]" cannot be instantiated for two different T
    // in the same translation unit because the declarations would conflict.
    extern __shared__ __align__(16) unsigned char reduce2_smem[];
    T * r = reinterpret_cast<T *>( reduce2_smem );

    // 64-bit arithmetic for the column base so j*stride cannot overflow int.
    T * col = x + static_cast<long long>(j)*stride;

    // Two passes: slot tid, then slot tid+nthreads.  Slots >= n are left
    // unwritten; they are excluded from the reduction through nn below.
    for (int s = tid; s < nslots; s += nthreads) {
        if (s < n) {
            T acc = col[s];
            for (int i = s + nslots; i < n; i += nslots) {
                acc = op( acc, col[i] );
            }
            r[s] = acc;
        }
    }

    // Make every thread's partial results visible before they are combined
    // across threads.  Reached by all threads of the block.
    __syncthreads();

    // Number of shared-memory slots that actually hold data.
    const int nn = (n < nslots ? n : nslots);
    mycuda_reduce::reduce_dev( r, nn, op );
    if (tid==0) xsum[j] = r[0];
}
67 
68 
69 
// Report a failed CUDA call on stderr; returns true when err is cudaSuccess.
static bool reduce2_cuda_ok( cudaError_t err, const char * what )
{
    if (err == cudaSuccess) return true;
    fprintf( stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString( err ) );
    return false;
}

/**
 * Test driver: fills a matrix on the device, reduces every column with
 * reduce2 using mycuda_reduce::SUM, recomputes the column sums on the host
 * and checks that the relative error of each column is below 1e-4.
 *
 * Returns 0 on success and 1 if allocation or any CUDA call fails.
 */
int main () {
    const int n = 187;          // nrows
    const int stride = 256;     // column stride
    const int blocksize = 64;
    const int gridsize = 64;    // ncolumns

    // NOTE(review): mapped_malloc is presumably a mapped (zero-copy) host
    // allocation, since the pointers are used both in kernels and in the
    // host loop below -- confirm against mycuda.h.
    float * x = mapped_malloc<float>(stride*gridsize);
    float * xsum = mapped_malloc<float>( gridsize );
    if (x == 0 || xsum == 0) {
        fprintf( stderr, "mapped_malloc failed\n" );
        return 1;
    }

    seq <<< gridsize, blocksize >>> ( x, 0.0f, 0.1f, stride*gridsize );
    if (!reduce2_cuda_ok( cudaGetLastError(), "seq launch" )) return 1;

    // reduce2 uses two shared-memory slots per thread.
    int sharedMemorySize = 2*blocksize*sizeof(float);
    reduce2 <<< gridsize, blocksize, sharedMemorySize >>> ( xsum, x, n, stride, mycuda_reduce::SUM() );
    if (!reduce2_cuda_ok( cudaGetLastError(), "reduce2 launch" )) return 1;

    // Kernel launches are asynchronous: wait for both kernels to finish (and
    // surface any execution error) before the host touches x or xsum.
    if (!reduce2_cuda_ok( cudaDeviceSynchronize(), "kernel execution" )) return 1;

    // Compute sum on host and check.
    float xsum_gold[gridsize];
    float xerr[gridsize];
    for (int j=0; j<gridsize; j++) {
        xsum_gold[j]=0.0f;
        for (int i=0; i<n; i++) {
            xsum_gold[j]+= x[j*stride+i];
        }
        xerr[j] = xsum_gold[j] - xsum[j];
        printf( "%10.2f %10.2f %10.5f\n", xsum_gold[j], xsum[j], xerr[j] );
    }
    // Checked in a second pass so the full table is printed before any failure.
    for (int j=0; j<gridsize; j++) assert( fabs( xerr[j]/xsum_gold[j] ) <0.0001 );

    return 0;
}
97 
98 
99 
100 
101 
102 
103 
104 
105 
106 
107 
108 
109 
110 
111 
112 
113 
114 
115 
116 
117 
118 
119 
120 
121