/** \file reduce2.cu
 *  \brief Reduces each column of an M x N matrix with a generic reduction operation; returns one result per column.
 *
 *     #### Notes  
 *     * Columns can be of arbitrary length, but one block reduces each column. 
 */


#include <cassert>
#include <cmath>
#include <cstdio>

#include "../include/mycuda.h"
#include "../include/mycuda_reduce.h"

using namespace mycuda;


/// Reduction kernel 

/**
 *     #### Notes 
 *     * Uses dynamic allocation for shared memory.  Allocation size 
 *       is determined by configuration of kernel launch.
 *     * Each block reduces one column.
 *   
 *     #### Requires
 *     * blocksize >= 32
 *     * shared_memory_size = 2*blocksize*sizeof(T) bytes (dynamic shared memory)
 *     * gridsize = number of columns of the matrix (one block per column)
 */

template <typename T, typename R >
__global__ void reduce2( T   * xsum, 
                         T   * x, 
                         int   n,        // nrows of matrix
                         int   stride,   // column stride
                         R     op ) 
{
    // Block j reduces the n elements x[j*stride] .. x[j*stride + n-1] with `op`
    // and thread 0 stores the result in xsum[j].
    //
    // Launch requirements: dynamic shared memory of 2*blockDim.x*sizeof(T) bytes,
    // one block per column.

    const int tid      = threadIdx.x;
    const int nthreads = blockDim.x;
    const int j        = blockIdx.x;

    // Nothing to reduce.  No identity element is known for a generic `op`, so
    // xsum[j] is left untouched rather than written with uninitialised shared
    // memory.  n is a kernel argument, hence uniform across the block: every
    // thread leaves here together and no barrier below is skipped by a subset.
    if (n <= 0) return;

    // Dynamic shared memory is declared as raw bytes and then viewed as T.
    // Declaring `extern __shared__ T r[]` directly would redeclare the same
    // extern array with a different type for every T this template is
    // instantiated with, which does not compile.  16-byte alignment covers the
    // built-in scalar and vector types.
    extern __shared__ __align__(16) unsigned char reduce2_smem[];
    T * r = reinterpret_cast<T *>( reduce2_smem );

    // 64-bit indices so that j*stride cannot overflow for large matrices.
    const long long first = static_cast<long long>(j) * stride;
    const long long last  = first + n;      // one past the end of this column
    long long       i     = first + tid;

    // Seed the two shared-memory slots owned by this thread (tid and
    // tid+nthreads) with the first two column elements it is responsible for.
    if (i < last) {
        r[tid] = x[i];
        i += nthreads;
    }
    if (i < last) {
        r[tid + nthreads] = x[i];
        i += nthreads;
    }

    // Fold the remaining elements of the column into the two slots,
    // alternating between them and starting with the lower one.
    for (int slot = tid; i < last; i += nthreads) {
        r[slot] = op( r[slot], x[i] );
        slot    = (slot == tid ? tid + nthreads : tid);
    }

    // Make every thread's partial results visible to the whole block before
    // the block-wide reduction reads slots written by other threads.
    // (Redundant but harmless if reduce_dev synchronises on entry.)
    __syncthreads();

    // Only the first nn slots were written: all 2*nthreads of them when the
    // column is at least that long, otherwise exactly n.
    const int width = 2*nthreads;
    const int nn    = ( n < width ? n : width );

    // reduce_dev is defined in mycuda_reduce.h; presumably it combines
    // r[0..nn) into r[0] -- confirm against that header.
    mycuda_reduce::reduce_dev( r, nn, op );
    if (tid==0) xsum[j] = r[0];
}



// Returns true when no CUDA runtime error is pending; otherwise prints the
// error together with `what` (a short description of the step just attempted).
static bool reduce2_check_cuda( const char * what ) {
    const cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) return true;
    fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString(err) );
    return false;
}

// Test driver: fills a strided matrix on the device, reduces every column with
// reduce2 using SUM, recomputes the column sums on the host and compares.
// Returns 0 when all columns agree within tolerance, 1 otherwise.
int main () {
    const int   n         = 187;         // nrows
    const int   stride    = 256;         // column stride
    const int   blocksize = 64;
    const int   gridsize  = 64;          // ncolumns
    const float tol       = 0.0001f;     // allowed relative error per column

    // NOTE(review): these allocations are never released; the matching free
    // routine of mycuda is not visible from this file.
    float * x    = mapped_malloc<float>(stride*gridsize);
    float * xsum = mapped_malloc<float>( gridsize );

    seq <<< gridsize, blocksize >>> ( x, 0.0f, 0.1f, stride*gridsize );
    if (!reduce2_check_cuda( "seq launch" )) return 1;

    // The kernel needs two shared-memory slots per thread.
    const size_t sharedMemorySize = 2*blocksize*sizeof(float);
    reduce2 <<< gridsize, blocksize, sharedMemorySize >>> ( xsum, x, n, stride, mycuda_reduce::SUM() );
    if (!reduce2_check_cuda( "reduce2 launch" )) return 1;

    // Wait for the device before the host reads x and xsum; errors raised
    // while the kernels were running surface here.
    device_synchronize();
    if (!reduce2_check_cuda( "kernel execution" )) return 1;

    // Compute sum on host and check.
    int nbad = 0;
    for (int j=0; j<gridsize; j++) {
        float gold = 0.0f;
        for (int i=0; i<n; i++) {
            gold += x[j*stride+i];
        }
        const float err = gold - xsum[j];
        printf( "%10.2f  %10.2f  %10.5f\n", gold, xsum[j], err );

        // Relative-error test written without a division so that a zero
        // reference value does not produce NaN/inf; the negated form also
        // counts a NaN result as a mismatch.
        if (!( fabs(err) <= tol*fabs(gold) )) {
            fprintf( stderr, "column %d: device result differs from host reference\n", j );
            nbad++;
        }
    }

    // An explicit exit status keeps the check effective even when assertions
    // are compiled out with NDEBUG.
    return nbad ? 1 : 0;
}

























