10 #include "../include/mycuda.h"
11 #include "../include/mycuda_reduce.h"
13 using namespace mycuda;
14 using namespace mycuda_reduce;
// Problem dimensions, buffers and input initialisation for the reduction test.
// NOTE(review): ncol and blocksize are used here but defined outside this excerpt.
// Distance, in floats, between the starts of consecutive columns in x
// (the host check further down indexes x[j*stride+i]).
22 const int stride = 1024;
// Number of thread blocks assigned to each column in the first reduction pass.
25 const int nblocks_per_col = 3;
// Total thread blocks in the first pass, and total number of partial sums.
27 const int nblocks = nblocks_per_col * ncol;
// Total floats allocated for x: one stride-sized slot per column.
28 const int nelements = stride * ncol;
// These buffers are passed to kernels and also dereferenced directly on the host
// later on. Presumably mapped_malloc returns host-mapped (zero-copy) memory --
// TODO confirm in mycuda.h.
30 float * x = mapped_malloc<float>( nelements );
// One partial sum per thread block of the first pass.
31 float * xsum_block = mapped_malloc<float>( nblocks );
// One sum per column.
32 float * xsum_column = mapped_malloc<float>( ncol );
// Single grand total.
33 float * xsum_global = mapped_malloc<float>( 1 );
// Fill x on the device. Presumably seq writes a sequence starting at 0.0f with
// step 0.01f over nelements entries -- TODO confirm against seq's definition.
// NOTE(review): the launch provides nblocks*blocksize threads for nelements
// values; this assumes seq either loops over the data or that the thread count
// is sufficient -- confirm.
34 seq <<< nblocks, blocksize >>> ( x, 0.0f, 0.01f, nelements );
// Three reduction launches: per-block partial sums, per-column sums, grand total.
// NOTE(review): the argument order of reduce is presumed to be
// (destination, source, element count, stride, operator) -- confirm against
// mycuda_reduce.h. nrow and blocksize are defined outside this excerpt.
// Pass 1: a 2D grid with nblocks_per_col blocks in x and one row of blocks per
// column in y; results go to xsum_block.
38 dim3 grid = dim3( nblocks_per_col, ncol);
// Dynamic shared memory size in bytes (third launch parameter): two floats per thread.
39 int smem = 2*blocksize*
sizeof(float);
40 reduce <<< grid, blocksize, smem >>> ( xsum_block, x, nrow, stride,
SUM() );
// Pass 2: one block per column; reads nblocks_per_col entries of xsum_block per
// column and writes xsum_column.
// NOTE(review): this assumes pass 1 stores each column's partial sums
// contiguously in xsum_block -- confirm in reduce.
43 grid = dim3( 1, ncol );
44 reduce <<< grid, blocksize, smem >>> ( xsum_column, xsum_block, nblocks_per_col, nblocks_per_col,
SUM() );
// Pass 3: a single block reduces all nblocks partial sums into xsum_global[0].
47 reduce <<< 1, blocksize, smem >>> ( xsum_global, xsum_block, nblocks, nblocks,
SUM() );
// Host-side reference check: recompute the column sums and the grand total in
// double precision and print them next to the values produced by the kernels.
// NOTE(review): kernel launches are asynchronous and this code reads x,
// xsum_column and xsum_global on the host. No synchronization or error check is
// visible in this excerpt -- confirm that one happens before this point.
// NOTE(review): the closing braces of both loops are not visible in this excerpt.
52 double xsum_global_gold = 0.0;
// Outer loop: one iteration per column.
53 for (
int j=0; j<ncol; j++) {
// Reference sum for column j (the float literal 0.0f is converted to double).
54 double xsum_column_gold = 0.0f;
// Inner loop: accumulate the nrow entries of column j; columns start stride floats apart.
55 for (
int i=0; i<nrow; i++) {
56 xsum_column_gold += x[j*stride+i];
// Difference between the kernel result and the host reference for this column.
58 double xerr = xsum_column[j] - xsum_column_gold;
// Prints: kernel column sum, reference column sum, difference.
59 printf(
"%12.2f %12.2f %12.6f\n", xsum_column[j], xsum_column_gold, xerr );
// Fold the column reference into the grand-total reference.
60 xsum_global_gold += xsum_column_gold;
// Difference between the kernel grand total and the host reference.
62 double xerr = xsum_global[0] - xsum_global_gold;
// Prints: kernel grand total, reference grand total, difference.
63 printf(
"%12.2f %12.2f %12.6f\n", xsum_global[0], xsum_global_gold, xerr );