#include <curand_kernel.h>
#define SQRT2             1.414213562373095f
#define ONE_OVER_SQRT2    0.707106781186547f





///  Kernels for generating parallel streams of random numbers

namespace mycuda_random {
    
    
/**
 *  Initialise the cuRAND generator state belonging to each thread.
 *
 *  Thread `tid` walks the indices i = tid, tid + nthreads, ... below `n` and
 *  calls curand_init( seed, i, 0, &state[tid] ), i.e. subsequence `i` with a
 *  zero offset.  Threads with tid >= n do not touch `state`.
 *
 *  @param state  array of generator states, indexed by global thread id
 *  @param seed   seed passed unchanged to every curand_init call
 *  @param n      exclusive upper bound on the subsequence index
 */
__global__ void random_init( curandState_t *state, const int seed, const int n ) 
{
    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
    const int nthreads = blockDim.x*gridDim.x;

    // The loop index is 64-bit: with an int index, i + nthreads overflows
    // (undefined behaviour) once n is within nthreads of INT_MAX.
    for ( long long i=tid; i<n; i+=nthreads ) {
        // NOTE(review): every pass re-initialises the same state[tid], so only
        // the last subsequence visited by this thread survives.  If `n` is
        // meant to be the number of states, &state[i] may have been intended
        // -- confirm against the callers before changing.
        curand_init( seed, i, 0, &state[tid] );
    }
}


/**
 *  Fill z[0..n) with normally distributed floats drawn with curand_normal()
 *  (mean 0, standard deviation 1 according to the cuRAND documentation).
 *
 *  Thread `tid` produces elements tid, tid + nthreads, ... from the generator
 *  state stored in state[tid].  The state is copied into a local variable,
 *  advanced there, and written back once at the end.
 *
 *  @param z      output array with at least `n` elements
 *  @param state  generator states, indexed by global thread id; assumed to
 *                have been initialised beforehand (e.g. by random_init)
 *  @param n      number of samples to generate
 */
__global__ void random_normal( float *z, curandState_t *state, const int n ) 
{
    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
    const int nthreads = blockDim.x*gridDim.x;

    // A thread with nothing to generate would only copy its state out and
    // back unchanged; skip that round trip (and the access to state[tid]).
    if ( tid >= n ) return;

    curandState_t s = state[tid];
    // The loop index is 64-bit: with an int index, i + nthreads overflows
    // (undefined behaviour) once n is within nthreads of INT_MAX.
    for ( long long i=tid; i<n; i+=nthreads ) {
        z[i] = curand_normal( &s );
    }
    state[tid] = s;
}



/**
 *   #### Notes
 *   * Loading and storing the generator state in global memory is costly, so
 *     it is better to copy the state into a local variable, generate many
 *     random numbers, and then write the updated state back to global memory.
 */

/**
 *  Fill z[0..n) with uniformly distributed floats drawn with curand_uniform()
 *  (range (0, 1] according to the cuRAND documentation).
 *
 *  Thread `tid` produces elements tid, tid + nthreads, ... from the generator
 *  state stored in state[tid].  The state is copied into a local variable,
 *  advanced there, and written back once at the end.
 *
 *  @param z      output array with at least `n` elements
 *  @param state  generator states, indexed by global thread id; assumed to
 *                have been initialised beforehand (e.g. by random_init)
 *  @param n      number of samples to generate
 */
__global__ void random_uniform( float *z, curandState_t *state, const int n ) 
{
    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
    const int nthreads = blockDim.x*gridDim.x;

    // A thread with nothing to generate would only copy its state out and
    // back unchanged; skip that round trip (and the access to state[tid]).
    if ( tid >= n ) return;

    curandState_t s = state[tid];
    // The loop index is 64-bit: with an int index, i + nthreads overflows
    // (undefined behaviour) once n is within nthreads of INT_MAX.
    for ( long long i=tid; i<n; i+=nthreads ) {
        z[i] = curand_uniform( &s );
    }
    state[tid] = s;
}






    /**
     *  Inverse cumulative distribution function (quantile) of the standard
     *  normal distribution, evaluated as -sqrt(2) * erfcinv( 2*q ).
     *
     *  The constants are converted to T, so a double instantiation is computed
     *  with double-precision constants instead of float-rounded ones; the
     *  float instantiation is unchanged.
     *
     *  @param q  probability; erfcinv takes arguments in [0, 2], so q is
     *            expected to lie in [0, 1]
     *  @return   the value z whose standard normal CDF equals q
     */
    template <typename T>
__host__ __device__ T norm_cdf_inv( T q ) 
{
    const T sqrt2 = static_cast<T>( 1.4142135623730950488 );
    return -sqrt2 * erfcinv( static_cast<T>( 2 ) * q );
}



    /**
     *  Cumulative distribution function of the standard normal distribution,
     *  evaluated as 0.5 * ( 1 + erf( z / sqrt(2) ) ).
     *
     *  The constants are converted to T, so a double instantiation is computed
     *  with double-precision constants instead of float-rounded ones; the
     *  float instantiation is unchanged.
     *
     *  @param z  point at which the CDF is evaluated
     *  @return   probability that a standard normal variate is <= z
     */
    template <typename T> 
__host__ __device__ T norm_cdf( T z ) 
{
    const T one_over_sqrt2 = static_cast<T>( 0.70710678118654752440 );
    return static_cast<T>( 0.5 ) * ( static_cast<T>( 1 ) + erf( one_over_sqrt2 * z ));
}


} // end namespace mycuda_random


