/*!  \file
 * \brief Functions for generating multiple streams of random numbers on the GPU.
 */


#include <math.h>
/*! \brief 2*pi held as a single-precision constant; random_normal multiplies a uniform draw by it to obtain an angle. */
const float  TWOPI  = 6.283185307179586;



/*!
 * \brief One step of a Tausworthe random number generator.
 *
 *  Advances the state word in place and returns the new state.
 *
 *  \param seed  generator state; read, then overwritten with the new state
 *  \param S1    left shift used when forming the feedback term
 *  \param S2    right shift used when forming the feedback term
 *  \param S3    left shift applied to the masked state
 *  \param M     mask applied to the state before it is shifted by S3
 *  \return      the value written back to *seed
 *
 *  #### Notes
 *    * S1, S2, S3 and M are the generator's constants; *seed is its state.
 *    * The state should be initialized to a value greater than 128.
 *    * Multiple streams should use uncorrelated seeds.
 *    * See http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html
 */

__device__
unsigned int TausStep( unsigned int * seed,
                       int            S1,
                       int            S2,
                       int            S3,
                       unsigned int   M )
{
    const unsigned int state    = *seed;
    const unsigned int feedback = ( ( state << S1 ) ^ state ) >> S2;
    const unsigned int next     = ( ( state & M ) << S3 ) ^ feedback;

    *seed = next;
    return next;
}


/*!
 * \brief One step of a linear congruential random number generator.
 *
 *  Replaces the state with A * state + C, evaluated in unsigned int
 *  arithmetic (so the result wraps), and returns the new state.
 *
 *  \param seed  generator state; read, then overwritten with the new state
 *  \param A     multiplier
 *  \param C     increment
 *  \return      the value written back to *seed
 *
 * #### Notes
 *   * See http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html
 */

__device__
unsigned int LCGStep( unsigned int * seed,
                      unsigned int   A,
                      unsigned int   C )
{
    const unsigned int next = A * ( *seed ) + C;

    *seed = next;
    return next;
}



/*!
 * \brief Hybrid generator: three Tausworthe steps combined with one LCG step.
 *
 *  Each call advances all four state words, XORs the four component
 *  outputs and scales the 32-bit result by 2.3283064365387e-10
 *  (approximately 2^-32) before returning it as a float.
 *
 *  \param seed1  state of the first Tausworthe component  (updated)
 *  \param seed2  state of the second Tausworthe component (updated)
 *  \param seed3  state of the third Tausworthe component  (updated)
 *  \param seed4  state of the LCG component               (updated)
 *  \return       the combined output scaled by 2.3283064365387e-10
 *
 * #### Notes
 *  * Combined period is lcm(p1,p2,p3,p4)~ 2^121
 *  * See http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html
 */

__device__
float hybrid_tausworth_rng( unsigned int * seed1,
                            unsigned int * seed2,
                            unsigned int * seed3,
                            unsigned int * seed4 )
{
    // Component outputs                                                       // Periods
    const unsigned int taus1 = TausStep( seed1, 13, 19, 12, 4294967294UL );    // p1=2^31-1
    const unsigned int taus2 = TausStep( seed2,  2, 25,  4, 4294967288UL );    // p2=2^30-1
    const unsigned int taus3 = TausStep( seed3,  3, 11, 17, 4294967280UL );    // p3=2^28-1
    const unsigned int lcg   = LCGStep(  seed4, 1664525, 1013904223UL );       // p4=2^32

    const unsigned int combined = taus1 ^ taus2 ^ taus3 ^ lcg;

    // The product is formed in double precision and narrowed to float on return.
    return combined * 2.3283064365387e-10;
}




/*!
 * \brief Multiply-with-carry RNG  (George Marsaglia).
 *
 *  Advances both state words and returns a 32-bit value built from them:
 *  the new first word shifted left by 16, plus the new second word.
 *
 *  \param seed1  first state word  (updated; multiplier 36969)
 *  \param seed2  second state word (updated; multiplier 18000)
 *  \return       ( *seed1 << 16 ) + *seed2, using the updated words
 *
 *  #### Notes
 *    * Intended for producing seeds for multiple streams, which is why
 *      it returns an unsigned int rather than a float.
 *    * Multiply the result by 2.3283064365387e-10 to get a unif(0,1).
 *    * Check that the result is >128 before using it as a seed for the
 *      Tausworthe generator.
 *    * A state word equal to zero stays zero under this update.
 *    * See http://en.wikipedia.org/wiki/Random_number_generation
 */

__device__ __host__
unsigned int mult_with_carry_rng( unsigned int * seed1,
                                  unsigned int * seed2 )
{
    // First word: low 16 bits times the multiplier, plus the high 16 bits as carry.
    unsigned int z = *seed1;
    z = 36969u * ( z & 0xFFFFu ) + ( z >> 16 );
    *seed1 = z;

    // Second word: same recurrence with its own multiplier.
    unsigned int w = *seed2;
    w = 18000u * ( w & 0xFFFFu ) + ( w >> 16 );
    *seed2 = w;

    return ( *seed1 << 16 ) + w;
}



/*!
 * \brief Generate seeds for n streams
 *
 * Starting from the two-word state in meta_seed, draws values from
 * mult_with_carry_rng() and stores 4*n of them in seeds[0 .. 4*n-1].
 * A draw below 128 is discarded and redrawn.  On return meta_seed holds
 * the advanced generator state, so a later call continues the sequence.
 *
 * #### Notes
 *   * This creates seeds on host.  They should typically then be copied to
 *     GPU in order to use.
 *   * seeds must have room for 4*n values; it is not written when n <= 0.
 *   * An all-zero meta_seed never leaves zero under the multiply-with-carry
 *     update, so every draw would be 0 and the rejection loop would never
 *     finish.  That state is therefore replaced by a fixed non-zero default
 *     before any value is drawn.
 */

void initialize_rng_seeds( unsigned int * seeds,       ///< (out) receives 4*n seed values
                           unsigned int * meta_seed,   ///< (in/out) two-word generator state
                           int            n )          ///< (in) number of seeds to generate
{
    unsigned int seed1 = meta_seed[0];
    unsigned int seed2 = meta_seed[1];

    // Guard against the degenerate all-zero state (see Notes): without this
    // the do/while below would spin forever.
    if (seed1 == 0u && seed2 == 0u) {
        seed1 = 362436069u;
        seed2 = 521288629u;
    }

    for (int i = 0; i < n*4; i++) {
        unsigned int draw;
        do {
            draw = mult_with_carry_rng( &seed1, &seed2 );
        } while (draw < 128u);        // reject values below 128 (Tausworthe seed requirement)
        seeds[i] = draw;
    }

    // Hand the advanced state back to the caller.
    meta_seed[0] = seed1;
    meta_seed[1] = seed2;
}



/*!
 * \brief Copy the four state words of stream i out of rng_state.
 *
 *  Word k of stream i is read from rng_state[k*n + i], for k = 0..3.
 *
 *  \param seed       (out) receives the four words
 *  \param rng_state  (in)  packed state array
 *  \param i          (in)  stream index
 *  \param n          (in)  stride between consecutive words of one stream
 *                          (presumably the number of streams -- confirm with caller)
 */
__device__
void get_seed( unsigned int * seed,
               unsigned int * rng_state,
               int i,
               int n )
{
    seed[0] = rng_state[      i];
    seed[1] = rng_state[  n + i];
    seed[2] = rng_state[2*n + i];
    seed[3] = rng_state[3*n + i];
}



/*!
 * \brief Store the four state words of stream i back into rng_state.
 *
 *  Word k of stream i is written to rng_state[k*n + i], for k = 0..3.
 *
 *  \param seed       (in)  the four words to store
 *  \param rng_state  (out) packed state array
 *  \param i          (in)  stream index
 *  \param n          (in)  stride between consecutive words of one stream
 *                          (presumably the number of streams -- confirm with caller)
 */
__device__
void put_seed( unsigned int * seed,
               unsigned int * rng_state,
               int i,
               int n )
{
    int word = 0;
    while (word < 4) {
        const int slot = word*n + i;
        rng_state[slot] = seed[word];
        ++word;
    }
}




/*!
 * \brief Draw one uniform value from a four-word stream state.
 *
 *  Passes the four consecutive words seed[0..3] to hybrid_tausworth_rng(),
 *  which updates them, and returns its result.
 */
__device__
float random_uniform( unsigned int * seed )
{
    return hybrid_tausworth_rng( seed, seed + 1, seed + 2, seed + 3 );
}




/*!
 * \brief Draw a uniform value, redrawing while the result is <= 0.
 *
 *  Calls random_uniform() at least once; every call advances the
 *  four-word state in seed.
 */
__device__
float random_uniform_positive( unsigned int * seed )
{
    float draw = random_uniform( seed );
    while (draw <= 0) {
        draw = random_uniform( seed );
    }
    return draw;
}



/*!
 * \brief Box-Muller normal RNG
 *
 *  Draws two uniforms u0 (redrawn until > 0) and u1 from the stream state
 *  and transforms them as
 *      r = sqrt( -2 log(u0) ),  theta = TWOPI * u1,
 *      z1 = r sin(theta),       z2 = r cos(theta).
 *
 *  \param z1    (out) first generated value
 *  \param z2    (out) second generated value
 *  \param seed  (in/out) four-word stream state, advanced by the two draws
 *
 * #### Notes
 * * Suggested by GPU Gems 3, ch 37 (see
 *   http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html).
 * * The uniforms are produced as float, whatever T is.
 */

template <typename T>
__device__
void random_normal( T            * z1,
                    T            * z2,
                    unsigned int * seed )
{
    // u0 must be strictly positive so that log(u0) is finite.
    const T u0 = random_uniform_positive( seed );
    const T u1 = random_uniform( seed );

    // The factor -2 is written in type T: a bare -2.0 literal is a double and
    // would promote the multiply and the sqrt to double when T is float.
    const T r     = sqrt( static_cast<T>( -2 ) * log( u0 ) );
    const T theta = TWOPI * u1;

    *z1 = r * sin( theta );
    *z2 = r * cos( theta );
}

