GPU Workshop Sample Code
 All Data Structures Namespaces Files Functions Variables Macros Pages
reduce3.cu
Go to the documentation of this file.
1 
10 #include "../include/mycuda.h"
11 #include "../include/mycuda_reduce.h"
12 
13 using namespace mycuda;
14 using namespace mycuda_reduce;
15 
16 
17 
18 
/**
 * Sample driver: sums a column-major matrix on the GPU at three levels
 * (per block, per column, global) and checks the column and global sums
 * against a double-precision host reference.
 *
 * The matrix has `ncol` columns of `nrow` used rows; consecutive columns
 * start `stride` elements apart, so each column carries `stride - nrow`
 * padding elements that the host reference does not include.
 *
 * Prints one line per column and one final line for the global sum, each
 * as: GPU sum, host sum, difference.
 *
 * @return 0 on success, 1 if a kernel launch or execution error is reported.
 */
int main () {
    const int nrow   = 987;
    const int ncol   = 64;
    const int stride = 1024;

    const int blocksize       = 128;
    const int nblocks_per_col = 3;

    const int nblocks   = nblocks_per_col * ncol;
    const int nelements = stride * ncol;

    // These buffers are passed to kernels and also dereferenced on the host
    // below, so mapped_malloc must return memory visible to both sides.
    // NOTE(review): presumably mapped (zero-copy) pinned host memory -- confirm
    // in mycuda.h.  The buffers are never released here; the matching free
    // routine is not visible from this file.
    float * x           = mapped_malloc<float>( nelements );
    float * xsum_block  = mapped_malloc<float>( nblocks );
    float * xsum_column = mapped_malloc<float>( ncol );
    float * xsum_global = mapped_malloc<float>( 1 );

    // Fill the input.
    // NOTE(review): this launches nblocks*blocksize threads, fewer than
    // nelements; that is only correct if seq loops over the remaining
    // elements itself -- confirm against its definition.
    seq <<< nblocks, blocksize >>> ( x, 0.0f, 0.01f, nelements );


    // Get block sums: nblocks_per_col blocks along x for each of the ncol
    // columns along y.
    dim3 grid = dim3( nblocks_per_col, ncol);
    int smem = 2*blocksize*sizeof(float);   // dynamic shared memory per block, in bytes
    reduce <<< grid, blocksize, smem >>> ( xsum_block, x, nrow, stride, SUM() );

    // Get column sums: one block per column combines that column's
    // nblocks_per_col block sums.
    grid = dim3( 1, ncol );
    reduce <<< grid, blocksize, smem >>> ( xsum_column, xsum_block, nblocks_per_col, nblocks_per_col, SUM() );

    // Get global sum: a single block combines all nblocks block sums.  It
    // reads xsum_block, not xsum_column, so it does not depend on the
    // column-sum kernel above.
    reduce <<< 1, blocksize, smem >>> ( xsum_global, xsum_block, nblocks, nblocks, SUM() );

    // Kernel launches are asynchronous.  The host must wait for them to
    // finish before it reads x, xsum_column or xsum_global, otherwise the
    // check below races with the GPU.  cudaGetLastError reports launch
    // configuration failures; cudaDeviceSynchronize waits for completion and
    // reports errors raised while the kernels were running.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString( err ) );
        return 1;
    }


    // Compute sums on host and check.  Accumulate in double so the reference
    // is more accurate than the single-precision GPU result.
    double xsum_global_gold = 0.0;
    for (int j=0; j<ncol; j++) {
        double xsum_column_gold = 0.0;
        for (int i=0; i<nrow; i++) {
            xsum_column_gold += x[j*stride+i];   // only the first nrow entries of column j
        }
        double xerr = xsum_column[j] - xsum_column_gold;
        printf( "%12.2f %12.2f %12.6f\n", xsum_column[j], xsum_column_gold, xerr );
        xsum_global_gold += xsum_column_gold;
    }
    double xerr = xsum_global[0] - xsum_global_gold;
    printf( "%12.2f %12.2f %12.6f\n", xsum_global[0], xsum_global_gold, xerr );

    return 0;
}
65 
66 
67 
68 
69 
70 
71 
72 
73 
74 
75 
76 
77 
78 
79 
80 
81 
82 
83 
84 
85 
86 
87 
88 
89