/// \file reduce1.cu
/// \brief Compute blockwise sums of a vector x of length n.

#include <cstdio>

#include "../include/mycuda.h"

using namespace mycuda;

/// Capacity, in floats, of the statically sized shared-memory buffer that the
/// reduce1 kernel declares (`__shared__ float r[blocksize]`).
const int blocksize = 256; 



/// Block-level tree reduction, callable from device code only

/**
 *    Adds up the first n entries of x in place; the total is left in x[0].
 *
 *    #### Notes
 *    * The lower entries of x are overwritten with partial sums.
 *    * x should live in shared memory for best performance (shared memory
 *      is much faster to access than global memory).
 *    * The routine contains __syncthreads() barriers, so every thread of
 *      the block has to call it, and all of them with the same n.
 *    * A barrier precedes each pass, so the caller does not need to
 *      synchronize between filling x and calling this routine.  A final
 *      barrier makes x[0] visible to every thread on return.
 *
 *    #### Requires
 *    * n <= 2*blockDim.x   (a pass with stride L uses threads 0 .. L-1)
 *    * n <= 2048           (the largest stride handled is 1024)
 */

__device__ void reduce1_dev( float* x, int n )
{
    const int lane = threadIdx.x;

    // Fold the upper half onto the lower half, halving the stride each pass:
    // 1024, 512, ..., 2, 1.  Passes whose stride is >= n are skipped; that
    // test depends only on n, so the barrier inside is reached by either
    // all threads of the block or none of them.
#pragma unroll
    for ( int half = 1024; half > 0; half >>= 1 ) {
        if ( n > half ) {
            // Wait until the previous pass (or the caller's fill) is complete
            __syncthreads();

            // `lane + half < n` handles lengths that are not powers of two
            if ( lane < half && lane + half < n ) {
                x[lane] += x[lane + half];
            }
        }
    }

    // Publish the final value of x[0] to the whole block
    __syncthreads();
}



/// Reduction kernel 

/** 
 *    Given an M x N array, return the column sums.
 *   
 *    #### Notes
 *    * Uses static shared memory allocation (`blocksize` floats per block).
 *    * Each block reduces one column; column j starts at x[j*stride] and
 *      its sum is written to xsum[j].
 *    * The column length M is taken from blockDim.x.  When M is larger than
 *      `blocksize`, the first `blocksize` threads each pre-accumulate a
 *      strided slice of the column, so the shared buffer is never overrun.
 *      For M <= `blocksize` every thread loads exactly one element.
 *     
 *    #### Requires
 *    * gridDim.x  = N   (one block per column)
 *    * blockDim.x = M   (one thread per row)
 */

__global__ 
void reduce1( float * xsum,      // output vector
              float * x,         // input vector
              int     stride )   // column stride
{
    const int tid = threadIdx.x;
    const int j   = blockIdx.x;

    // 64-bit offset: j*stride can exceed the range of int for large arrays
    const long long offset = static_cast<long long>(j) * stride;

    // Column length, and the number of shared-memory slots actually used
    const int m      = blockDim.x;
    const int active = ( m < blocksize ) ? m : blocksize;

    // static allocation
    __shared__ float r[blocksize];

    // Each of the first `active` threads copies one element from global to
    // shared memory, then folds in any further elements of the column that
    // lie a multiple of `blocksize` away (only happens when m > blocksize).
    if ( tid < active ) {
        float acc = x[offset + tid];
        for ( int k = tid + blocksize; k < m; k += blocksize ) {
            acc += x[offset + k];
        }
        r[tid] = acc;
    }

    // Called by ALL threads (it contains barriers), including idle ones.
    // The sum is returned in element 0 of the shared array.
    reduce1_dev( r, active );
    if (tid==0) xsum[j] = r[0];
}


/// Host driver: fills an M x N array on the device, reduces every column with
/// reduce1, and prints each GPU column sum next to a CPU reference value.
///
/// Output: one line per column with the reference sum, the GPU sum, and
/// their difference.  Returns 0 on success, 1 if a kernel launch failed.
int main() {
    const int M=256;    // rows    = elements per column = threads per block
    const int N=256;    // columns = number of blocks

    // Both buffers are handed to kernels and also read directly by the host
    // below, so mapped_malloc has to return memory addressable from both.
    // NOTE(review): neither buffer is released before exit; the matching
    // deallocation routine is not visible from this file -- confirm in mycuda.h.
    float * x    = mapped_malloc<float>(M*N);
    float * xsum = mapped_malloc<float>(N);

    // seq is declared outside this file; presumably it fills x with the
    // sequence 0.0, 0.1, 0.2, ... -- TODO confirm.
    seq <<< N, M >>> ( x, 0.0f, 0.1f, M*N );
    reduce1 <<< N, M >>> ( xsum, x, M );

    // The <<< >>> syntax does not report launch failures by itself; an invalid
    // configuration would otherwise go unnoticed and garbage would be printed.
    const cudaError_t launch_status = cudaGetLastError();
    if ( launch_status != cudaSuccess ) {
        fprintf( stderr, "kernel launch failed: %s\n",
                 cudaGetErrorString( launch_status ) );
        return 1;
    }

    // Must complete before the host reads x and xsum (presumably waits for
    // all queued device work -- wrapper from mycuda.h).
    device_synchronize();

    for (int j=0; j<N; j++) {
         // Accumulate the reference in double so that its own rounding error
         // is negligible and the last column reflects the kernel's error.
         double gold = 0.0;
         for (int i=0; i<M; i++ ) {
             gold += x[j*M+i];
         }
         printf( "%10.2f  %10.2f  % 10.4f\n", gold, xsum[j], gold - xsum[j] ); 
    }

    return 0;
}






