/** \file  reduce3.cu
 *  \brief Compute column sums of an M x N array.  Each column is reduced by several thread blocks.
 *
 *     #### Notes
 *     * The actual reduction kernel is included from include/mycuda_reduce.cu
 *
 */


#include "../include/mycuda.h"
#include "../include/mycuda_reduce.h"

using namespace mycuda;
using namespace mycuda_reduce;




/** \brief Print a pending CUDA runtime error, if there is one.
 *
 *  Uses cudaGetLastError(), which returns the runtime's last recorded error
 *  and resets it.  Kernel launches return nothing themselves, so this is
 *  called right after each launch and after the synchronization point.
 *
 *  \param what  Label printed in front of the CUDA error string.
 *  \return      true if an error was pending (a message was printed), false otherwise.
 */
static bool reduce3_cuda_failed (const char * what) {
    const cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) return false;
    printf( "%s: CUDA error: %s\n", what, cudaGetErrorString(err) );
    return true;
}


/** \brief Sum the columns of an array on the device and compare against sums computed on the host.
 *
 *  Steps:
 *    1. fill the input with `seq`,
 *    2. reduce every column with `nblocks_per_col` blocks, giving one partial sum per block,
 *    3. reduce the partial sums of each column to one value per column,
 *    4. reduce all partial sums to a single global value,
 *    5. recompute the sums on the host in double precision and print
 *       device value, host value and their difference, one line per column
 *       followed by one line for the global sum.
 *
 *  \return 0 on completion, 1 if a CUDA error was detected.
 *
 *  NOTE(review): the buffers obtained from mapped_malloc are never released;
 *  the matching free routine is not visible in this file.
 */
int main () {
    const int nrow            =  987;   // entries summed in each column
    const int ncol            =   64;   // number of columns
    const int stride          = 1024;   // distance (in elements) between the starts of consecutive columns

    const int blocksize       =  128;   // threads per block for every launch
    const int nblocks_per_col =    3;   // blocks cooperating on one column

    const int nblocks   = nblocks_per_col * ncol;
    const int nelements = stride * ncol;

    // The same pointers are passed to kernels and dereferenced on the host below.
    float * x           = mapped_malloc<float>( nelements );
    float * xsum_block  = mapped_malloc<float>( nblocks );
    float * xsum_column = mapped_malloc<float>( ncol );
    float * xsum_global = mapped_malloc<float>( 1 );

    // Fill the input.  The grid provides one thread per element, so the fill
    // is complete whether or not seq iterates over the array with a grid stride.
    const int nblocks_fill = (nelements + blocksize - 1) / blocksize;
    seq <<< nblocks_fill, blocksize >>> ( x, 0.0f, 0.01f, nelements );
    if (reduce3_cuda_failed( "seq" )) return 1;


    // Get block sums: a (nblocks_per_col x ncol) grid of blocks,
    // each launched with 2*blocksize floats of dynamic shared memory.
    dim3 grid = dim3( nblocks_per_col, ncol); 
    int  smem = 2*blocksize*sizeof(float);
    reduce <<< grid, blocksize, smem  >>> ( xsum_block, x, nrow, stride, SUM() );
    if (reduce3_cuda_failed( "reduce (block sums)" )) return 1;

    // Get column sums: one block per column, each reducing nblocks_per_col partial sums
    grid = dim3( 1, ncol );
    reduce <<< grid, blocksize, smem >>> ( xsum_column, xsum_block, nblocks_per_col, nblocks_per_col, SUM() );
    if (reduce3_cuda_failed( "reduce (column sums)" )) return 1;

    // Get global sum: a single block reducing all nblocks partial sums
    reduce <<< 1, blocksize, smem >>> ( xsum_global, xsum_block, nblocks, nblocks, SUM() );
    if (reduce3_cuda_failed( "reduce (global sum)" )) return 1;


    // Compute sums on host and check.
    // Wait for the kernels before the host reads any of the buffers.
    device_synchronize();
    if (reduce3_cuda_failed( "device_synchronize" )) return 1;

    double xsum_global_gold = 0.0;
    for (int j=0; j<ncol; j++) {
        double xsum_column_gold = 0.0;
        // Only the first nrow entries of each stride-long column take part in the sum.
        for (int i=0; i<nrow; i++) {
            xsum_column_gold += x[j*stride+i];
        }
        double xerr = xsum_column[j] - xsum_column_gold; 
        printf( "%12.2f  %12.2f  %12.6f\n", xsum_column[j], xsum_column_gold, xerr );
        xsum_global_gold += xsum_column_gold;
    }
    double xerr = xsum_global[0] - xsum_global_gold; 
    printf( "%12.2f  %12.2f  %12.6f\n", xsum_global[0], xsum_global_gold, xerr );

    return 0;
}

























