GPU Workshop Sample Code
 All Data Structures Namespaces Files Functions Variables Macros Pages
prefixScan.cu
Go to the documentation of this file.
1 
#include <vector>

#include "../include/mycuda.h"
#include "../include/mycuda_scan.h"
6 
7 using namespace mycuda;
8 using namespace mycuda_scan;
9 
10 
12 
13 template < typename T >
14 void show( T* x,
15  T* x_gold,
16  int first,
17  int n )
18 {
19  printf( " Device Host Error\n");
20  printf( "===================================\n");
21  for (int i=first; i<first+n; i++)
22  printf( " %10.3f %10.3f %10.6f\n", x[i], x_gold[i], x[i]-x_gold[i] );
23 }
24 
25 
26 
28 
/**
 * Compare an array against a reference using a relative tolerance.
 *
 * An element passes when it is exactly equal to the reference, or when
 * |x[i] - x_gold[i]| / max(1e-5, |x_gold[i]|) <= 1e-5.  The comparison is
 * carried out in double precision so that integral T does not truncate the
 * denominator to zero.  A NaN in either array (without exact equality) is
 * reported as a failure rather than silently accepted.
 *
 * One line is printed to stdout for every failing element.
 *
 * @param x       values under test (read on the host)
 * @param x_gold  reference values
 * @param n       number of elements to compare; n <= 0 compares nothing
 * @param str     label used in the printed messages
 * @return        number of elements that failed the comparison
 */
template < typename T>
int check( T* x,
           T* x_gold,
           int n,
           const char* str )
{
    const double tolerance   = 0.00001;  // maximum accepted relative error
    const double denom_floor = 0.00001;  // keeps the denominator away from zero

    printf( "Checking: %s\n", str );
    int status = 0;
    for (int i=0; i<n; i++) {
        const double got  = static_cast<double>( x[i] );
        const double want = static_cast<double>( x_gold[i] );

        // Exact agreement always passes; this also covers matching
        // infinities, whose difference would otherwise be NaN.
        if ( got == want )
            continue;

        const double magnitude = fabs( want );
        const double denom     = ( magnitude > denom_floor ) ? magnitude : denom_floor;
        const double rel_error = fabs( (got - want) / denom );

        // Written as !(<=) rather than (>) so that a NaN relative error
        // counts as a failure: every ordered comparison with NaN is false.
        if ( !( rel_error <= tolerance ) ) {
            printf( "%s failed at element %d: %12.4f %12.4f\n", str, i, got, want );
            status++;
        }
    }
    return status;
}
46 
47 
49 
// Report a CUDA error, if any.  Returns nonzero when err is not cudaSuccess.
static int cuda_failed( cudaError_t err, const char* what )
{
    if ( err == cudaSuccess )
        return 0;
    fprintf( stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err) );
    return 1;
}

/**
 * Driver: fill an input vector on the device, run the per-block scan kernel,
 * recompute the same per-block running sums on the host, print a sample of
 * the results and verify the whole output.
 *
 * The launch uses gridsize blocks of blocksize threads, and the host
 * reference treats each run of blocksize consecutive elements as an
 * independent scan whose last value is that block's sum.
 *
 * @return 0 when every element of w and blocksum matches the host
 *         reference; 1 on allocation failure, CUDA error or mismatch.
 */
int main ()
{
    const int blocksize = 256;
    const int gridsize = 256;
    const int n = blocksize*gridsize;


    // allocate and set data
    // NOTE(review): these buffers are dereferenced on the host below and
    // passed to kernels, so mapped_malloc presumably returns host-mapped
    // memory -- confirm in mycuda.h.  No matching release call is visible
    // from this file, so the buffers are left to process teardown.
    float * x = mapped_malloc<float>(n);                // input vector
    float * w = mapped_malloc<float>(n);                // output vector
    float * blocksum = mapped_malloc<float>(gridsize);  // block sums
    if ( !x || !w || !blocksum ) {
        fprintf( stderr, "mapped_malloc failed\n" );
        return 1;
    }

    seq <<< gridsize, blocksize >>> ( x, 0.0, 0.1, n );
    // A launch does not return a status; a bad configuration is only
    // visible through cudaGetLastError().
    if ( cuda_failed( cudaGetLastError(), "seq launch" ) )
        return 1;


    // scan
    size_t shared_size = 2*blocksize*sizeof(float);  // dynamic shared memory per block
    int flag = 1;  // NOTE(review): meaning is defined by scan_blocks -- confirm
    scan_blocks <<< gridsize, blocksize, shared_size >>> ( w, x, blocksum, flag );
    if ( cuda_failed( cudaGetLastError(), "scan_blocks launch" ) )
        return 1;
    // Wait for both kernels before the host reads x, w and blocksum; faults
    // raised while the kernels ran are reported here.
    if ( cuda_failed( cudaDeviceSynchronize(), "cudaDeviceSynchronize" ) )
        return 1;


    // Compute on host for comparison
    // Heap-backed storage: a variable-length array is not standard C++ and
    // would place n floats on the stack.
    std::vector<float> w_gold( n );
    std::vector<float> blocksum_gold( gridsize );
    for (int j=0; j<gridsize; j++) {
        const int base = j*blocksize;
        w_gold[base] = x[base];
        for (int i=1; i<blocksize; i++) {
            const int ij = base+i;
            w_gold[ij] = w_gold[ij-1] + x[ij];
        }
        blocksum_gold[j] = w_gold[base+blocksize-1];
    }


    // Print a few elements from some arbitrary group.
    int j=20;
    show( w, w_gold.data(), j*blocksize, 50 );

    // verify
    // Labels live in writable arrays so they bind to a char* parameter
    // without relying on the literal-to-char* conversion.
    char w_label[] = "w";
    char blocksum_label[] = "blocksum";
    int failures = 0;
    failures += check( w, w_gold.data(), n, w_label );
    failures += check( blocksum, blocksum_gold.data(), gridsize, blocksum_label );

    // Propagate the verification result through the exit status.
    return failures == 0 ? 0 : 1;
}
97 
98 
99 
100