/// \file prefixScan.cu
/// \brief This sample demonstrates code for performing prefix scans using CUDA.

#include "../include/mycuda.h"
#include "../include/mycuda_scan.h"

using namespace mycuda;
using namespace mycuda_scan;


/// Utility function for showing some test results.

/**
 *     #### Notes
 *     * Prints one header and then one row per element for indices
 *       `first` .. `first+n-1`: device value, host value, and their difference.
 *     * No bounds checking is done; the caller must make sure that range is
 *       valid for both vectors.
 *     * Values are converted to double before printing so that the `%f`
 *       conversions are well defined for any arithmetic type T (passing e.g.
 *       an int to `%f` is undefined behaviour).
 */

template < typename T >
void show( T*   x,          ///< (in) Vector of CUDA results
           T*   x_gold,     ///< (in) Vector of host results
           int  first,      ///< (in) Index of first element to show
           int  n )         ///< (in) Number of elements to show
{
    printf( "     Device        Host       Error\n");
    printf( "===================================\n");
    for (int i=first; i<first+n; i++) {
        // The difference is formed in T (as before) and only then widened,
        // so the printed digits for float/double inputs are unchanged.
        const double device_value = static_cast<double>( x[i] );
        const double host_value   = static_cast<double>( x_gold[i] );
        const double error        = static_cast<double>( x[i] - x_gold[i] );
        printf( " %10.3f  %10.3f  %10.6f\n", device_value, host_value, error );
    }
}



/// Verify that host and device results agree to within a relative tolerance.

/**
 *     #### Notes
 *     * An element passes when it equals the host value exactly, or when
 *       |x - x_gold| / max(1e-5, |x_gold|) <= 1e-5.
 *     * The comparison is done in double precision, so it is also safe for
 *       integral T (no truncated, possibly zero, denominator).
 *     * The test is written as "not within tolerance" so that a NaN on either
 *       side is reported as a failure instead of slipping through.
 *     * Every failing element is printed.
 *
 *     \return Number of elements that failed the comparison (0 on success).
 */

template < typename T>
int check( T*           x,           ///< (in) Vector of CUDA results
           T*           x_gold,      ///< (in) Vector of host results
           int          n,           ///< (in) Number of elements to compare
           const char*  str )        ///< (in) Label indicating vector being tested
{
    // Relative tolerance; also used as the floor for the denominator so that
    // reference values at or near zero do not blow up the relative error.
    const double tol = 0.00001;

    printf( "Checking: %s\n", str );
    int status = 0;
    for (int i=0; i<n; i++) {
        const double device_value = static_cast<double>( x[i] );
        const double host_value   = static_cast<double>( x_gold[i] );

        // Exact agreement always passes (this also covers matching infinities,
        // whose difference would otherwise be NaN).
        if ( device_value == host_value )
            continue;

        double denom = fabs( host_value );
        if ( denom < tol )
            denom = tol;

        const double rel_err = fabs( (device_value - host_value) / denom );

        // Negated "<=" rather than ">" so that a NaN relative error counts as a failure.
        if ( !( rel_err <= tol ) ) {
            printf( "%s failed at element %d: %12.4f  %12.4f\n", str, i, device_value, host_value );
            status++;
        }
    }
    return status;
}


/// Report a failed CUDA runtime call or kernel launch on stderr.
///
/// \return true when \p err is anything other than cudaSuccess.
static bool scan_sample_cuda_failed( cudaError_t  err,     ///< (in) Status to test
                                     const char*  what )   ///< (in) Label for the failed step
{
    if ( err == cudaSuccess )
        return false;
    fprintf( stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString( err ) );
    return true;
}


/// Perform blockwise prefix scan.

/**
 *     #### Notes
 *     * Each thread handles one element of vector
 *       (n = blocksize*gridsize, launched as gridsize blocks of blocksize threads).
 *     * The scan kernel is given 2*blocksize floats of dynamic shared memory
 *       per block.
 *     * The host reference is an inclusive running sum that restarts at every
 *       block boundary; the last value of each block is that block's sum.
 *     * Exit status is 0 when both vectors verify, 1 on a CUDA error or on
 *       any mismatch.
 */

int main ()
{
    // Compile-time constants, so the host reference buffers below are
    // ordinary fixed-size arrays rather than (non-standard) variable-length arrays.
    const int blocksize = 256;
    const int gridsize  = 256;
    const int n = blocksize*gridsize;


    // allocate and set data
    // NOTE(review): these buffers are never released; the routine that pairs
    // with mapped_malloc is not visible from this file -- confirm and add it.
    float * x = mapped_malloc<float>(n);                   //  input vector
    float * w = mapped_malloc<float>(n);                   //  output vector
    float * blocksum = mapped_malloc<float>(gridsize);     // block sums
    seq <<< gridsize, blocksize >>> ( x, 0.0, 0.1, n );
    // A launch does not return a status; a bad configuration only shows up here.
    if ( scan_sample_cuda_failed( cudaGetLastError(), "seq launch" ) )
        return 1;


    // scan
    size_t shared_size = 2*blocksize*sizeof(float);
    int    flag = 1;   // NOTE(review): meaning is defined by scan_blocks, not visible here
    scan_blocks <<< gridsize, blocksize, shared_size >>> ( w, x, blocksum, flag );
    if ( scan_sample_cuda_failed( cudaGetLastError(), "scan_blocks launch" ) )
        return 1;
    // Wait for both kernels before the host reads x, w and blocksum; errors
    // raised while the kernels ran are reported by this call.
    if ( scan_sample_cuda_failed( cudaDeviceSynchronize(), "kernel execution" ) )
        return 1;


    // Compute on host for comparison
    // Static storage keeps roughly 256 KiB of reference data off the stack.
    static float w_gold[n];
    static float blocksum_gold[gridsize];
    for (int j=0; j<gridsize; j++) {
        w_gold[j*blocksize]=x[j*blocksize];
        for (int i=1; i<blocksize; i++) {
            int ij=j*blocksize+i;
            w_gold[ij] = w_gold[ij-1] + x[ij];
        }
        blocksum_gold[j] = w_gold[(j+1)*blocksize-1];
    }


    // Print a few elements from some arbitrary group.
    int j=20;
    show( w, w_gold, j*blocksize, 50 );

    // verify
    int failures = 0;
    failures += check( w, w_gold, n, "w" );
    failures += check( blocksum, blocksum_gold, gridsize, "blocksum" );

    // Let scripts and test harnesses see a failed verification.
    return failures == 0 ? 0 : 1;
}




