4 #include "../include/mycuda.h"
5 #include "../include/mycuda_scan.h"
7 using namespace mycuda;
8 using namespace mycuda_scan;
13 template <
typename T >
19 printf(
" Device Host Error\n");
20 printf(
"===================================\n");
21 for (
int i=first; i<first+
n; i++)
22 printf(
" %10.3f %10.3f %10.6f\n", x[i], x_gold[i], x[i]-x_gold[i] );
29 template <
typename T>
35 printf(
"Checking: %s\n", str );
37 for (
int i=0; i<
n; i++) {
38 T denom = max( 0.00001, fabs(x_gold[i]) );
39 if ( fabs((x[i] - x_gold[i]) / denom) > 0.00001 ) {
40 printf(
"%s failed at element %d: %12.4f %12.4f\n", str, i, x[i], x_gold[i] );
59 int n = blocksize*gridsize;
63 float * x = mapped_malloc<float>(
n);
64 float * w = mapped_malloc<float>(
n);
65 float * blocksum = mapped_malloc<float>(gridsize);
66 seq <<< gridsize, blocksize >>> ( x, 0.0, 0.1,
n );
70 size_t shared_size = 2*blocksize*
sizeof(float);
72 scan_blocks <<< gridsize, blocksize, shared_size >>> ( w, x, blocksum, flag );
73 cudaDeviceSynchronize();
78 float blocksum_gold[gridsize];
79 for (
int j=0; j<gridsize; j++) {
83 w_gold[ij] = w_gold[ij-1] + x[ij];
85 blocksum_gold[j] = w_gold[(j+1)*blocksize-1];
91 show( w, w_gold, j*blocksize, 50 );
94 check( w, w_gold, n,
"w" );
95 check( blocksum, blocksum_gold, gridsize,
"blocksum" );