/// Reduction operators and kernels.

namespace mycuda_reduce {



/// Addition functor for the reduction routines.
class SUM 
{
    public:

    /// Identity element of the operation (0 for addition); used as the
    /// result of an empty reduction.
    const float zero;

    SUM() : zero(0.0f) {}

    /// Returns x + y.  Callable from both host and device code, like the
    /// MAX and MIN functors.
    __host__ __device__ float operator () ( const float x, const float y ) const {
        return x + y;
    }
};




/// Maximum functor for the reduction routines.
class MAX
{
public:

    /// Identity element of the operation: negative infinity, so that
    /// op(zero, v) yields v for any non-NaN v.
    const float zero;

    MAX() : zero( -1.0f / 0.0f ) {}

    /// Returns the larger of x and y.
    /// A NaN in x is returned as is; a NaN in y is returned as well, because
    /// the comparison x > y is then false.
    __host__ __device__ float operator () ( const float x, const float y ) const {
        if ( isnan(x) || x > y ) {
            return x;
        }
        return y;
    }
};




/// Minimum functor for the reduction routines.
class MIN 
{
    public:

    /// Identity element of the operation: positive infinity, so that
    /// op(zero, v) yields v for any non-NaN v.
    const float zero;

    // Single-precision literals throughout: the previous `1.0/0.0f` was
    // evaluated in double and then narrowed to float.
    MIN() : zero(1.0f/0.0f) {}

    /// Returns the smaller of x and y.
    /// A NaN in x is returned as is; a NaN in y is returned as well, because
    /// the comparison x < y is then false.
    __host__ __device__ float operator () ( const float x, const float y ) const {
        if (isnan(x)) return x;
        return x < y ? x : y;
    }
};



/// Reduces a vector x of length n

/**
 *  #### Notes
 *    * x should be stored in shared memory for best performance.
 *    * the reduction is computed in place and the result is returned in x[0].
 *    * reduction operator implemented as a functor class.
 *
 *  #### Requires
 *    * n <= 2*blocksize
 *    * blocksize >= 32
 */     

// There are a couple of optimizations possible for special cases.
// * If n is a power of 2, can omit the checks that i+s<n (s = offset of the level) at each level.
// * If n>=64, can omit checks in the last 6 levels (single warp).
template <typename T, typename R>
__device__ void reduce_dev( T* x, int n, R op )
{
    // Contract:
    //  * must be called by every thread of the block with the same n, because
    //    it contains block-wide barriers;
    //  * on return only x[0] holds a meaningful value (the reduction of
    //    x[0..n-1]); the other elements are left partially reduced.
    const int i = threadIdx.x;

    // Levels that span more than one warp.  At offset s, thread i (i < s)
    // folds x[i+s] into x[i]; the barrier in front of each level makes the
    // previous level's writes (or the caller's writes) visible.
    #pragma unroll
    for (int s = 1024; s >= 64; s >>= 1) {
        if ( n > s ) {
            __syncthreads();
            if ( i < s && i + s < n ) x[i] = op( x[i], x[i+s] );
        }
    }
    __syncthreads();

    // The remaining levels only involve threads of the first warp, so a
    // warp-level barrier is enough.  It must not be omitted: threads of a warp
    // are not guaranteed to run in lockstep (independent thread scheduling,
    // Volta and later), so relying on implicit warp synchrony is a data race.
    // Restricting each level to i < s ensures no thread reads an element that
    // another thread writes within the same level.
    if ( i < 32 ) {
        #pragma unroll
        for (int s = 32; s > 0; s >>= 1) {
            if ( i < s && i + s < n ) x[i] = op( x[i], x[i+s] );
            __syncwarp();   // all 32 lanes of warp 0 get here (blocksize >= 32)
        }
    }
    __syncthreads();
}        


/// Reduces each column of an M x N matrix.  Given a K x N grid of blocks, returns a K x N array of partial results, one per block.

/**
 * #### Notes 
 *  * Uses dynamic allocation for shared memory.  Allocation size 
 *      is determined by configuration of kernel launch.
 *
 * #### Requires 
 *   * blocksize >=32
 *   * shared_memory_size = 2*blocksize*sizeof(T)
 */  

    template <typename T, typename R >
__global__ void reduce( T   * xsum, 
                        T   * x, 
                        int   n,            // column length
                        int   stride,       // column stride
                        R     op )          // reduction functor
{
    // Launch layout: blockIdx.y selects the column, blockIdx.x selects which
    // of the gridDim.x blocks working on that column this is.  Block k of
    // column j writes its partial result to xsum[j*gridDim.x + k].
    // Dynamic shared memory must hold 2*blockDim.x elements of type T.
    const int tid       = threadIdx.x;
    const int j         = blockIdx.y;
    const int k         = blockIdx.x;
    const int blocksize = blockDim.x;
    const int nblocks_per_col = gridDim.x;

    // The dynamic shared memory is declared as raw bytes and cast to T:
    // declaring `extern __shared__ T r[]` inside a template leads to
    // conflicting declarations of the same symbol once the kernel is
    // instantiated for more than one T.
    extern __shared__ __align__(16) unsigned char reduce_smem_raw[];
    T* const r = reinterpret_cast<T*>( reduce_smem_raw );

    // 64-bit index arithmetic so that j*stride and the running index cannot
    // overflow for large matrices.
    const long long col_begin = (long long)j * stride;
    const long long col_end   = col_begin + n;          // one past the column
    // Distance from the second half of one chunk to the first half of this
    // block's next chunk (chunks of 2*blocksize are dealt round-robin to the
    // nblocks_per_col blocks of the column).
    const long long far_step  = (2LL*nblocks_per_col - 1) * blocksize;

    long long i = col_begin + 2LL*k*blocksize + tid;

    // First chunk: plain loads initialise the two shared-memory slots of
    // this thread (where the column has data for them).
    if ( i < col_end ) {
        r[tid] = x[i];
        i += blocksize;
    }

    if ( i < col_end ) {
        r[tid+blocksize] = x[i];
        i += far_step;
    }

    // Further chunks are accumulated into the same two slots.
    while ( i < col_end ) {
        r[tid] = op( r[tid], x[i] );
        i += blocksize;

        if ( i >= col_end ) break;
        r[tid+blocksize] = op( r[tid+blocksize], x[i] );
        i += far_step;
    }

    // Number of shared-memory slots that were initialised by the first chunk.
    const long long remaining = n - 2LL*k*blocksize;
    const long long out       = (long long)j*nblocks_per_col + k;

    if ( remaining > 0 ) {
        const int nn = (int)( remaining < 2*blocksize ? remaining : 2*blocksize );
        reduce_dev( r, nn, op );    // starts with a barrier, result in r[0]
        if (tid==0) xsum[out] = r[0];
    } else {
        // This block has no data at all: emit the identity of the operation.
        if (tid==0) xsum[out] = op.zero;
    }
}


} // end namespace mycuda_reduce
