GPU Workshop Sample Code
 All Data Structures Namespaces Files Functions Variables Macros Pages
reduce1.cu
Go to the documentation of this file.
1 
4 #include "../include/mycuda.h"
5 
6 using namespace mycuda;
7 
// Capacity, in floats, of the statically sized __shared__ buffer declared in
// the reduce1 kernel.
static const int blocksize = 256;
9 
10 
11 
13 
// Block-wide, in-place sum of x[0..n): on return x[0] holds the total.
//
// The array is folded onto itself by repeated halving.  For each power-of-two
// distance d = 1024, 512, ..., 1 that is smaller than n, thread t (with t < d)
// adds x[t+d] into x[t], provided t+d is still inside the array.  Because the
// largest distance is 1024, elements at index 2048 and above are never read.
//
//   x  array of at least n floats that every thread of the block can see
//      (reduce1 passes a __shared__ buffer); its contents are overwritten
//      with partial sums
//   n  number of valid elements in x; must have the same value in every
//      thread of the block, because the barriers are only executed for
//      distances with n > d
//
// Must be called by all threads of the block.
__device__ void reduce1_dev( float* x, int n )
{
    const int t = threadIdx.x;

    #pragma unroll
    for (int d = 1024; d > 0; d >>= 1) {
        if (n <= d) continue;       // distance too large for this array length

        __syncthreads();            // make the previous step's sums visible
        const int partner = t + d;
        if (t < d && partner < n) x[t] = x[t] + x[partner];
    }

    // Final barrier so every thread sees the finished value in x[0].
    __syncthreads();
}
47 
48 
49 
51 
// Column sums: block j adds up blockDim.x consecutive floats of x, starting at
// x[j*stride], and stores the total in xsum[j].
//
// Launch layout: 1-D grid, one block per column; 1-D blocks, one thread per
// column element.  Uses blocksize floats of static shared memory.
//
// The shared buffer has only blocksize slots.  If the kernel is launched with
// more than blocksize threads per block, the first blocksize threads each
// pre-accumulate the elements tid, tid+blocksize, tid+2*blocksize, ... so
// that nothing is written outside the buffer; for launches with at most
// blocksize threads every thread simply loads its own element.
//
//   xsum    output vector, one entry per block (written by thread 0)
//   x       input vector, read at x[j*stride + k] for k in [0, blockDim.x)
//   stride  distance in elements between the starts of consecutive columns
__global__
void reduce1( float * xsum,  // output vector
              float * x,     // input vector
              int stride )   // column stride
{
    const int tid      = threadIdx.x;
    const int j        = blockIdx.x;
    const int nthreads = blockDim.x;

    // Widen before multiplying so that j*stride cannot overflow int.
    const long long offset = (long long)j * stride;

    // static allocation
    __shared__ float r[blocksize];

    // Number of slots of r that are actually filled.
    const int n = nthreads < blocksize ? nthreads : blocksize;

    // Each participating thread copies its element from global to shared
    // memory, folding in any elements that lie beyond the buffer capacity.
    if (tid < n) {
        float acc = x[offset + tid];
        for (int k = tid + blocksize; k < nthreads; k += blocksize) {
            acc += x[offset + k];
        }
        r[tid] = acc;
    }

    // All threads call the reduction (it contains block-wide barriers).
    // The sum is returned in element 0 of the shared array.
    reduce1_dev( r, n );
    if (tid==0) xsum[j] = r[0];
}
83 
84 
// Host driver: fills an M x N array on the device, sums each column with the
// reduce1 kernel, recomputes the sums on the host and prints, per column,
// the host sum, the device sum and their difference.
//
// Returns 0 on success, 1 if a kernel launch or execution error is reported.
int main() {
    const int M=256;   // elements per column == threads per block
    const int N=256;   // number of columns   == number of blocks

    // The same pointers are handed to kernels and dereferenced on the host
    // below, so mapped_malloc presumably returns host memory mapped into the
    // device address space -- confirm in mycuda.h.  The buffers are not
    // released here; the matching free routine is not visible in this file.
    float * x = mapped_malloc<float>(M*N);
    float * xsum = mapped_malloc<float>(N);

    // seq is a project kernel; presumably it writes a sequence starting at
    // 0.0 with step 0.1 into x -- confirm in mycuda.h.
    seq <<< N, M >>> ( x, 0.0f, 0.1f, M*N );
    reduce1 <<< N, M >>> ( xsum, x, M );

    // Kernel launches are asynchronous: wait for both kernels to finish (and
    // pick up any launch or execution error) before the host reads x and xsum.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString(err) );
        return 1;
    }

    // Reference result computed sequentially on the host.
    float xsum_gold[N];
    for (int j=0; j<N; j++) {
        xsum_gold[j] = 0.0f;
        for (int i=0; i<M; i++ ) {
            xsum_gold[j] += x[j*M+i];
        }
        printf( "%10.2f %10.2f % 10.4f\n", xsum_gold[j], xsum[j], xsum_gold[j] - xsum[j] );
    }

    return 0;
}
106 
107 
108 
109 
110 
111