/// Module for computing prefix scans

/**
 *  #### Notes
 *  * See `scan` example in Nvidia CUDA toolkit.
 */    


namespace mycuda_scan
{


/// Device code for prefix scan of a single thread block

/**
 *  Computes an inclusive prefix sum of the values `x` passed in by the
 *  calling threads. Thread `i` contributes `x`; on return `r[n+i]` holds
 *  `x(0) + x(1) + ... + x(i)`.
 *
 *  #### Notes
 *  * Uses shared memory to perform scan.
 *  * The lower half `r[0..n-1]` is zero-filled so that the look-back reads
 *    `r[i - offset]` never need a bounds test; the values being scanned live
 *    in the upper half `r[n..2n-1]`.
 *  * Each pass adds the element `offset` slots to the left, with `offset`
 *    doubling every pass (1, 2, 4, ... while `offset < n`).
 *
 *  #### Requires
 *  * `r` has room for `2*n` elements of type `T`.
 *  * Contains `__syncthreads()`, so it must be called by every thread of the
 *    block, each with a distinct `i` in `0 .. n-1`.
 */


template <typename T>
__device__ void scan_dev( T   * r,     ///< (out) shared memory, `2*n` elements; result in `r[n..2n-1]`
                          T     x,     ///< (in) ith element of vector to be scanned
                          int   i,     ///< (in) index of element to be scanned
                          int   n)     ///< (in) size of vector to be scanned
{
    // Zero padding in the lower half; T(0) keeps the code generic
    // (the previous float literal forced a float -> T conversion).
    r[i] = T(0);
    i   += n;
    r[i] = x;

    for( int offset=1; offset<n; offset<<=1){
        // Make the writes of the previous pass (or the initial stores) visible.
        __syncthreads();
        T t = r[i] + r[i - offset];
        // Every thread must finish reading before any slot is overwritten.
        __syncthreads();
        r[i] = t;
    }
    // Final results are visible to the whole block on return.
    __syncthreads();
}





/// Kernel for doing prefix scan
 
/** 
 *   Each thread block scans its own slice of `x` independently and stores
 *   the slice total in `blocksum`.
 *
 *   #### Notes
 *   * _Inclusive scan_:
 *      y(i) = x(0) + x(1) + ... + x(i)
 *
 *    _Exclusive scan_: 
 *      y(0) = 0,  y(i) = x(0) + x(1) + ... + x(i-1)
 *
 *     where the indices are relative to the start of the block's slice.
 *
 *   * block `j` scans elements `x[j*blocksize], ... x[(j+1)*blocksize-1]`
 *   * `blocksum[j]` receives the sum of all elements of block `j`
 *     (for both the inclusive and the exclusive variant).
 *   * A thread reads only `x[i]` and writes only `w[i]` for its own index `i`.
 *
 *   #### Requires
 *   * 1-D launch: only `blockIdx.x`, `blockDim.x` and `threadIdx.x` are used.
 *   * `sharedMemorySize = 2*blocksize*sizeof(T)` (dynamic shared memory)
 *   * `w` and `x` hold at least `gridDim.x*blocksize` elements; there is no
 *     bounds check, so a vector whose length is not a multiple of `blocksize`
 *     must be padded by the caller.
 *   * `blocksum` holds at least `gridDim.x` elements.
 */

template <typename T>
__global__ 
void scan_blocks( T*  w,           ///< (out)  scanned vector 
                  T*  x,           ///< (in) vector to be scanned
                  T*  blocksum,    ///< (out) Block sums
                  int inclusive )  ///< (in) flag indicating if scan is to be inclusive or exclusive
{
    const int n   = blockDim.x;
    const int j   = blockIdx.x;
    const int tid = threadIdx.x;
    // Widen before multiplying so that large vectors cannot overflow `int`.
    const size_t i = static_cast<size_t>(j)*n + tid;

    // Dynamic shared memory is declared as raw bytes and then viewed as T.
    // Declaring `extern __shared__ T r[]` directly makes the declarations of
    // different template instantiations clash with each other.
    extern __shared__ __align__(16) unsigned char mycuda_scan_smem[];
    T* r = reinterpret_cast<T*>(mycuda_scan_smem);

    scan_dev( r, x[i], tid, n );
    if ( inclusive ) {
        w[i] = r[tid+n];
    } 
    else {
        // T(0) instead of 0.0f: with a float literal the conditional has type
        // float for integer T and large sums would be rounded.
        w[i] = ( tid==0 ? T(0) : r[tid+n-1] );
    }
    // Last slot of the upper half holds the total of the whole block.
    if (tid==0) blocksum[j] = r[2*n-1]; 
}
}  // end namespace mycuda_scan
