#include <cstdio>
#include <cstdlib>
#include <string>
#include <iostream>
#include <cassert>

#include <cuda.h>
#include <cuda_runtime.h>


// NOTE: NDEBUG must be defined BEFORE <cassert> is included to disable assertions.
//
// Debug-only macros (expand to nothing when NDEBUG is defined):
//   CUDA_TRACE()                   print "file(line): CUDA_TRACE" to stderr
//   CUDA_SYNC_TRACE()              call mycuda::device_synchronize(), then CUDA_TRACE()
//   CUDA_DEBUG(_msg)               print a C-string message to stderr
//   CUDA_DEBUG_PRINTF(_fmt, ...)   printf-style message; _fmt must be a string literal
//                                  and at least one argument must follow it
//
// Always-available macros:
//   CUDA_LOGGER(_msg) / CUDA_LOGGER_PRINTF(_fmt, ...)   as above, tagged CUDA_LOGGER
//   CUDA_CHECK_ERROR() / CUDA_ASYNC_CHECK_ERROR()       forward __FILE__/__LINE__ to mycuda checkers
//   CUDA_SAFE_CALL(err)                                 check a cudaError_t returned by a call
//   CUDA_CATCH() / CUDA_HANDLER(e)                      route a std::exception to mycuda::handler

#ifdef NDEBUG
#define CUDA_TRACE()
#define CUDA_SYNC_TRACE()
#define CUDA_DEBUG(_msg)
#define CUDA_DEBUG_PRINTF(_fmt, ...)
#else
#define CUDA_TRACE()                     fprintf( stderr, "%s(%i): CUDA_TRACE\n", __FILE__, __LINE__)
// Wrapped in do/while(0) so the two calls form ONE statement; previously
// `if (cond) CUDA_SYNC_TRACE();` guarded only the synchronize call.
#define CUDA_SYNC_TRACE()                do { mycuda::device_synchronize(); CUDA_TRACE(); } while (0)
#define CUDA_DEBUG(_msg)                 fprintf( stderr, "%s(%i): CUDA_DEBUG   ::  %s\n", __FILE__, __LINE__, _msg )
#define CUDA_DEBUG_PRINTF(_fmt, ...)     fprintf( stderr, "%s(%i): CUDA_DEBUG   ::  " _fmt "\n", __FILE__, __LINE__, __VA_ARGS__ )
#endif
#define CUDA_LOGGER(_msg)                fprintf( stderr, "%s(%i): CUDA_LOGGER   ::  %s\n", __FILE__, __LINE__, _msg )
#define CUDA_LOGGER_PRINTF(_fmt, ...)    fprintf( stderr, "%s(%i): CUDA_LOGGER   ::  " _fmt "\n", __FILE__, __LINE__, __VA_ARGS__ )
#define CUDA_CHECK_ERROR()               mycuda::check_error( __FILE__, __LINE__ )
#define CUDA_ASYNC_CHECK_ERROR()         mycuda::async_check_error( __FILE__, __LINE__ )
#define CUDA_SAFE_CALL(err)              mycuda::safe_call(err, __FILE__, __LINE__)
#define CUDA_CATCH()                     catch( std::exception& e) { mycuda::handler( e, __FILE__, __LINE__ ); }
#define CUDA_HANDLER(e)                  mycuda::handler( e, __FILE__, __LINE__ )



/// Core macros and utilities

namespace mycuda
{


/// Terminate if cudaGetLastError() returns error.

/**
 *   Fetches the last error recorded by the CUDA runtime (the call also resets
 *   it).  On error, prints `file(line)` plus the CUDA error string to stderr
 *   and calls `exit(-1)`; otherwise returns normally.
 *
 *   #### Parameters
 *   * `file` - source file name to report (normally `__FILE__`).
 *   * `line` - source line to report (normally `__LINE__`).
 *
 *   #### Notes
 *   * Call using macro, e.g., `CUDA_CHECK_ERROR()`.
 *   * Does not synchronize with the device first.
 *   * Declared `inline` so the definition may be included from more than one
 *     translation unit.
 */

inline void check_error( const char* file, const int line )
{
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        // Diagnostics go to stderr, like the CUDA_LOGGER/CUDA_DEBUG macros.
        fprintf( stderr, "%s(%i): mycuda::check_error  ::  %s\nAborting...\n",
                 file, line, cudaGetErrorString( err ) );
        exit(-1);
    }
}


/// Terminate if cudaDeviceSynchronize() or cudaGetLastError() returns error.

/**
 *   Calls `cudaDeviceSynchronize()` first; if that succeeds, also inspects
 *   `cudaGetLastError()` so an error that was only recorded (and not returned
 *   by the synchronize call) is reported too.  On error, prints `file(line)`
 *   plus the CUDA error string to stderr and calls `exit(-1)`.
 *
 *   #### Parameters
 *   * `file` - source file name to report (normally `__FILE__`).
 *   * `line` - source line to report (normally `__LINE__`).
 *
 *   #### Notes
 *   * Call using macro, e.g., `CUDA_ASYNC_CHECK_ERROR()`.
 *   * Declared `inline` so the definition may be included from more than one
 *     translation unit.
 */

inline void async_check_error( const char* file, const int line )
{
    cudaError_t err = cudaDeviceSynchronize();
    // Previously the recorded last error was never examined, contrary to the
    // documented contract of this function.
    if (err == cudaSuccess) {
        err = cudaGetLastError();
    }
    if (err != cudaSuccess) {
        fprintf( stderr, "%s(%i): mycuda::async_check_error  ::  %s\nAborting...\n",
                 file, line, cudaGetErrorString( err ) );
        exit(-1);
    }
}


/// Wrapper to check error code for any cuda call that returns one. Terminates if error is returned.

/**
 *   Returns normally when `err` is `cudaSuccess`.  Otherwise prints
 *   `file(line)` plus the CUDA error string to stderr and calls `exit(-1)`.
 *
 *   #### Parameters
 *   * `err`  - status returned by the wrapped CUDA call.
 *   * `file` - source file name to report (normally `__FILE__`).
 *   * `line` - source line to report (normally `__LINE__`).
 *
 *   #### Notes
 *   * Call using macro, e.g., `CUDA_SAFE_CALL( cudaMalloc( &ptr, n*sizeof(float) ))`.
 *   * Declared `inline` so the definition may be included from more than one
 *     translation unit.
 */

inline void safe_call( cudaError_t err, const char* file, const int line )
{
    if (err != cudaSuccess) {
        // Diagnostics go to stderr, like the CUDA_LOGGER/CUDA_DEBUG macros.
        fprintf( stderr, "%s(%i): mycuda::safe_call  ::  %s\nAborting...\n",
                 file, line, cudaGetErrorString( err ) );
        exit(-1);
    }
}





/// Exception class for CUDA errors.

/**
 *   Holds a text message that `what()` hands back as a C string.
 *
 *   * Default construction takes the message from
 *     `cudaGetErrorString( cudaGetLastError() )`.
 *   * Construction from a string stores a copy of that string.
 *
 *   #### Example
 *   >      void foo() {
 *   >         if ( BAD_STUFF ) throw mycuda::exception( "Bad stuff" );
 *   >      }
 */

class exception : public std::exception
{
    public:
    /// Message is the CUDA runtime's description of its last recorded error.
    exception()
    {
        const cudaError_t last = cudaGetLastError();
        text = cudaGetErrorString( last );
    }

    /// Message is a copy of the supplied string.
    exception( std::string msg ) : text( msg ) {}

    ~exception() throw() {}

    /// Pointer to the stored message; valid while this object is alive.
    const char* what() const throw()
    {
        return text.c_str();
    }

    private:
    std::string text;
};




/// Simple exception handler.

/**
 *    Prints `file(line)` and `e.what()` to stderr, then calls `exit(-1)`;
 *    it never returns to the caller.
 *
 *    #### Parameters
 *    * `e`    - the caught exception; taken by reference so `what()` of a
 *               derived class is used.
 *    * `file` - source file name to report (normally `__FILE__`).
 *    * `line` - source line to report (normally `__LINE__`).
 *
 *    #### Example
 *    > void bar() {
 *    >    try {
 *    >        foo();
 *    >    }
 *    >    catch( std::exception& e ) {
 *    >        CUDA_HANDLER( e );    // i.e. mycuda::handler( e, __FILE__, __LINE__ )
 *    >    }
 *    > }
 *
 *    #### Notes
 *    * `CUDA_CATCH()` expands to the whole catch clause shown above.
 *    * Declared `inline` so the definition may be included from more than one
 *      translation unit.
 */

// NOTE: need to catch reference to get derived classes.
inline void handler( const std::exception& e, const char* file, const int line )
{
    // Diagnostics go to stderr, like the CUDA_LOGGER/CUDA_DEBUG macros.
    fprintf( stderr, "%s(%i): mycuda::handler caught exception ::  %s\nAborting...\n",
             file, line, e.what() );
    exit(-1);
}


/// Allocate memory on device

/**
 *   Requests `n*sizeof(T)` bytes through `cudaMalloc`.
 *
 *   #### Parameters
 *   * `n` - number of elements of type T (not bytes).
 *
 *   #### Returns
 *   Pointer produced by `cudaMalloc`; release it with `device_free`.
 *
 *   #### Throws
 *   `mycuda::exception` when `cudaMalloc` reports an error; the message is
 *   "mycuda::device_malloc: " followed by the CUDA error string.
 *
 *   #### Example
 *   >     float *x = device_malloc<float>( 1024 );
 *
 *   #### Notes
 *   * No dynamic exception specification: `throw(exception)` is not valid
 *     C++17 and later.
 *   * `n` is converted to `size_t` for the multiplication, so a negative `n`
 *     becomes a very large request.
 */

    template <typename T>
T * device_malloc( int n )
{
    T * ptr = NULL;
    const cudaError_t err = cudaMalloc( &ptr, n*sizeof(T) );
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::device_malloc: " ) + cudaGetErrorString( err ) );
    return ptr;
}



/// Allocate mapped memory on host

/**
 *    Requests `n*sizeof(T)` bytes through `cudaHostAlloc` with the
 *    `cudaHostAllocMapped` flag.
 *
 *    #### Parameters
 *    * `n` - number of elements of type T (not bytes).
 *
 *    #### Returns
 *    Pointer produced by `cudaHostAlloc`; release it with `mapped_free`.
 *
 *    #### Throws
 *    `mycuda::exception` when `cudaHostAlloc` reports an error; the message
 *    is "mycuda::mapped_malloc: " followed by the CUDA error string.
 *
 *    #### Example:
 *    >    float *x = mapped_malloc<float>( 1024 );
 *
 *    #### Notes
 *        * call `cudaDeviceSynchronize()` before accessing data written on device
 *          from host.
 *        * No dynamic exception specification: `throw(exception)` is not
 *          valid C++17 and later.
 */

    template <typename T>
T * mapped_malloc( int n )
{
    T * ptr = NULL;
    const cudaError_t err = cudaHostAlloc( &ptr, n*sizeof(T), cudaHostAllocMapped );
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::mapped_malloc: " ) + cudaGetErrorString( err ) );
    return ptr;
}



/// Allocate host memory

/**
 *    Requests `n*sizeof(T)` bytes from `malloc`; the memory is not initialised
 *    and no constructors run.
 *
 *    #### Parameters
 *    * `n` - number of elements of type T (not bytes).
 *
 *    #### Returns
 *    Pointer from `malloc`; release it with `host_free`.  May be NULL when
 *    the requested size is zero.
 *
 *    #### Throws
 *    `mycuda::exception( "mycuda::host_malloc" )` when a non-zero request
 *    cannot be satisfied.
 *
 *    #### Notes
 *    * Included for consistency.  Better:
 *    >     float * x = new float[1024];
 *    >     delete[] x;
 *    * No dynamic exception specification: `throw(exception)` is not valid
 *      C++17 and later.
 */

    template <typename T>
T * host_malloc( int n )
{
    const size_t bytes = n*sizeof(T);
    T * ptr = static_cast<T*>( malloc( bytes ) );
    // malloc(0) may legitimately return NULL; only a failed non-empty request is an error.
    if (ptr == NULL && bytes != 0) throw exception( "mycuda::host_malloc" );
    return ptr;
}



/// Free device memory

/**
 *   Passes `ptr` to `cudaFree`.
 *
 *   #### Parameters
 *   * `ptr` - pointer to release (e.g. one returned by `device_malloc`).
 *
 *   #### Throws
 *   `mycuda::exception` when `cudaFree` reports an error; the message is
 *   "mycuda::device_free: " followed by the CUDA error string.
 *
 *   #### Notes
 *   * No dynamic exception specification: `throw(exception)` is not valid
 *     C++17 and later.
 */

    template <typename T>
void device_free( T* ptr  )
{
    const cudaError_t err = cudaFree( ptr );
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::device_free: " ) + cudaGetErrorString( err ) );
}



/// Free mapped memory.

/**
 *   Passes `ptr` to `cudaFreeHost`.
 *
 *   #### Parameters
 *   * `ptr` - pointer to release (e.g. one returned by `mapped_malloc`).
 *
 *   #### Throws
 *   `mycuda::exception` when `cudaFreeHost` reports an error; the message is
 *   "mycuda::mapped_free: " followed by the CUDA error string.
 *
 *   #### Notes
 *   * No dynamic exception specification: `throw(exception)` is not valid
 *     C++17 and later.
 */

    template <typename T>
void mapped_free( T* ptr  )
{
    const cudaError_t err = cudaFreeHost( ptr );
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::mapped_free: " ) + cudaGetErrorString( err ) );
}



/// Free host memory (included just for consistency).

/**
 *   Hands `ptr` to the C library `free`; counterpart of `host_malloc`.
 *
 *   #### Parameters
 *   * `ptr` - pointer to release.
 */

    template <typename T>
void host_free( T* ptr )
{
    void * raw = ptr;
    std::free( raw );
}


/// Copy from device to host

/**
 *    Blocking `cudaMemcpy` of `n*sizeof(T)` bytes with
 *    `cudaMemcpyDeviceToHost`.
 *
 *    #### Parameters
 *    * `dest` - host destination.
 *    * `src`  - device source (now `const`, so read-only sources are accepted).
 *    * `n`    - number of elements of type T (not bytes).
 *
 *    #### Throws
 *    `mycuda::exception` when `cudaMemcpy` reports an error; the message is
 *    "mycuda::copy_device_to_host: " followed by the CUDA error string.
 *
 *    #### Example
 *    >   copy_device_to_host( dest, src, n );
 *
 *    #### Notes
 *    * No dynamic exception specification: `throw(exception)` is not valid
 *      C++17 and later.
 */

    template <typename T>
void copy_device_to_host( T * dest, const T * src, int n )
{
    const cudaError_t err = cudaMemcpy( dest, src, n*sizeof(T), cudaMemcpyDeviceToHost );
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::copy_device_to_host: " ) + cudaGetErrorString( err ) );
}



/// Copy from host to device

/**
 *    Blocking `cudaMemcpy` of `n*sizeof(T)` bytes with
 *    `cudaMemcpyHostToDevice`.
 *
 *    #### Parameters
 *    * `dest` - device destination.
 *    * `src`  - host source (now `const`, so read-only sources are accepted).
 *    * `n`    - number of elements of type T (not bytes).
 *
 *    #### Throws
 *    `mycuda::exception` when `cudaMemcpy` reports an error; the message is
 *    "mycuda::copy_host_to_device: " followed by the CUDA error string.
 *
 *    #### Example
 *    >   copy_host_to_device( dest, src, n );
 *
 *    #### Notes
 *    * The message previously named a non-existent "mycuda::copy_to_device".
 *    * No dynamic exception specification: `throw(exception)` is not valid
 *      C++17 and later.
 */

    template <typename T>
void copy_host_to_device( T * dest, const T * src, int n )
{
    const cudaError_t err = cudaMemcpy( dest, src, n*sizeof(T), cudaMemcpyHostToDevice );
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::copy_host_to_device: " ) + cudaGetErrorString( err ) );
}


/// Copy from device to device

/**
 *    `cudaMemcpy` of `n*sizeof(T)` bytes with `cudaMemcpyDeviceToDevice`.
 *
 *    #### Parameters
 *    * `dest` - device destination.
 *    * `src`  - device source (now `const`, so read-only sources are accepted).
 *    * `n`    - number of elements of type T (not bytes).
 *
 *    #### Throws
 *    `mycuda::exception` when `cudaMemcpy` reports an error; the message is
 *    "mycuda::copy_device_to_device: " followed by the CUDA error string.
 *
 *    #### Example
 *    >   copy_device_to_device( dest, src, n );
 *
 *    #### Notes
 *    * No dynamic exception specification: `throw(exception)` is not valid
 *      C++17 and later.
 */

    template <typename T>
void copy_device_to_device( T * dest, const T * src, int n )
{
    const cudaError_t err = cudaMemcpy( dest, src, n*sizeof(T), cudaMemcpyDeviceToDevice );
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::copy_device_to_device: " ) + cudaGetErrorString( err ) );
}


/// Copy from host to host

/**
 *   `cudaMemcpy` of `n*sizeof(T)` bytes with `cudaMemcpyHostToHost`.
 *
 *   #### Parameters
 *   * `dest` - host destination.
 *   * `src`  - host source (now `const`, so read-only sources are accepted).
 *   * `n`    - number of elements of type T (not bytes).
 *
 *   #### Throws
 *   `mycuda::exception` when `cudaMemcpy` reports an error; the message is
 *   "mycuda::copy_host_to_host: " followed by the CUDA error string.
 *
 *   #### Notes
 *      * Use for mapped memory copies.
 *      * No dynamic exception specification: `throw(exception)` is not valid
 *        C++17 and later.
 *
 *   #### Example
 *   >   copy_host_to_host( dest, src, n );
 */

    template <typename T>
void copy_host_to_host( T * dest, const T * src, int n )
{
    const cudaError_t err = cudaMemcpy( dest, src, n*sizeof(T), cudaMemcpyHostToHost );
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::copy_host_to_host: " ) + cudaGetErrorString( err ) );
}



/// Wrapper for cudaDeviceSynchronize()

/**
 *   Calls `cudaDeviceSynchronize()` and converts a failure into an exception.
 *
 *   #### Throws
 *   `mycuda::exception` when the call reports an error; the message is
 *   "mycuda::device_synchronize: " followed by the CUDA error string.
 *
 *   #### Notes
 *   * Used by the `CUDA_SYNC_TRACE()` macro.
 *   * No dynamic exception specification: `throw(exception)` is not valid
 *     C++17 and later.
 *   * Declared `inline` so the definition may be included from more than one
 *     translation unit.
 */

inline void device_synchronize()
{
    const cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::device_synchronize: " ) + cudaGetErrorString( err ) );
}



/// Wrapper for cudaMemGetInfo()

/**
 *   Queries `cudaMemGetInfo` and prints the free and total byte counts to
 *   stdout, converted to MB (divided by 2^20) with two decimals.
 *
 *   #### Throws
 *   `mycuda::exception` when `cudaMemGetInfo` reports an error; the message
 *   is "mycuda::get_memory_info: " followed by the CUDA error string.
 *
 *   #### Notes
 *   * The conversion is done in `double`; `float` cannot represent large byte
 *     counts exactly.
 *   * No dynamic exception specification: `throw(exception)` is not valid
 *     C++17 and later.
 *   * Declared `inline` so the definition may be included from more than one
 *     translation unit.
 */

inline void get_memory_info()
{
    // Named *_bytes so the locals do not shadow the C library `free`.
    size_t free_bytes = 0, total_bytes = 0;
    const cudaError_t err = cudaMemGetInfo( &free_bytes, &total_bytes );
    if (err != cudaSuccess)
        throw exception( std::string( "mycuda::get_memory_info: " ) + cudaGetErrorString( err ) );

    const double bytes_per_mb = 1 << 20;
    printf( "Free memory: %8.2f MB\nTotal memory: %8.2f MB\n",
            free_bytes / bytes_per_mb, total_bytes / bytes_per_mb );
}



/// Fill device array z with constant a

/**
 *   Grid-stride loop: each thread starts at its global x index and advances by
 *   the total number of launched threads, so any 1-D launch covers all n
 *   elements.
 *
 *   #### Parameters
 *   * `z` - output array; elements 0..n-1 are written.
 *   * `a` - value assigned to every element (converted to T1 on assignment).
 *   * `n` - number of elements; nothing is written when n <= 0.
 *
 *   #### Notes
 *   * Only the x dimension of grid and block is used.
 *   * Index and stride are 64-bit: the previous `int` arithmetic could
 *     overflow for very large grids or when n is close to INT_MAX.
 *
 *   #### Example
 *   >     mycuda::fill<<<64, 256>>>( z, 0.0f, n );
 */

    template <typename T1, typename T2>
__global__ void fill( T1 * z, T2 a, int n )
{
    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        z[i] = a;
    }
}



/// Fill device array z with sequence

/**
 *    E.g., z = {first, first+1, ..., first+n-1 }
 *
 *    Grid-stride loop over a 1-D launch (only the x dimension is used).
 *
 *    #### Parameters
 *    * `z`     - output array; elements 0..n-1 are written.
 *    * `first` - value stored in z[0].
 *    * `n`     - number of elements; nothing is written when n <= 0.
 *
 *    #### Notes
 *    * `first + i` is evaluated with `i` as an `int`, then converted to T1.
 *    * Index and stride are 64-bit: the previous `int` arithmetic could
 *      overflow for very large grids or when n is close to INT_MAX.
 */

    template <typename T1, typename T2>
__global__ void seq( T1 * z, T2 first, int n )
{
    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        // i < n <= INT_MAX here, so the narrowing keeps the original int arithmetic.
        z[i] = first + static_cast<int>( i );
    }
}



/// Fill device array z with sequence

/**
 *      E.g., z = {first, first+inc, ..., first+(n-1)*inc }
 *
 *      Grid-stride loop over a 1-D launch (only the x dimension is used).
 *
 *      #### Parameters
 *      * `z`     - output array; elements 0..n-1 are written.
 *      * `first` - value stored in z[0].
 *      * `inc`   - difference between consecutive elements.
 *      * `n`     - number of elements; nothing is written when n <= 0.
 *
 *      #### Notes
 *      * `first + i*inc` is evaluated with `i` as an `int`, then converted to T1.
 *      * Index and stride are 64-bit: the previous `int` arithmetic could
 *        overflow for very large grids or when n is close to INT_MAX.
 */

    template <typename T1, typename T2>
__global__ void seq( T1 * z, T2 first, T2 inc, int n )
{
    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        // i < n <= INT_MAX here, so the narrowing keeps the original int arithmetic.
        z[i] = first + static_cast<int>( i )*inc;
    }
}


/// Fill device array z with values from x, repeated ncopies times.

/**
 * E.g, if x={1,2,3} and ncopies=2, then z={1,1,2,2,3,3}
 *
 * Grid-stride loop over a 1-D launch (only the x dimension is used);
 * z[i] receives x[i / ncopies].
 *
 * #### Parameters
 * * `z`       - output array of length n.
 * * `n`       - number of elements of z to consider.
 * * `x`       - input array of length nx.
 * * `nx`      - number of elements in x.
 * * `ncopies` - how many consecutive outputs each x element fills.
 *
 * #### Notes
 * * Returns without writing when ncopies <= 0 (previously a division by
 *   zero or a negative index).
 * * Elements of z whose source index would be >= nx are left untouched;
 *   previously `nx` was ignored and x was read out of bounds.
 * * Index and stride are 64-bit: the previous `int` arithmetic could
 *   overflow for very large grids or when n is close to INT_MAX.
 */

    template <typename T1, typename T2>
__global__ void rep( T1 * z, int n, T2 * x, int nx, int ncopies )
{
    if (ncopies <= 0) return;

    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        const long long src = i / ncopies;
        if (src < nx) {
            z[i] = x[src];
        }
    }
}



/// Fill device array z with values from x, tiled ncopies times.

/**
 * E.g, if x={1,2,3} and ncopies=2, then z={1,2,3,1,2,3}
 *
 * Grid-stride loop over a 1-D launch (only the x dimension is used);
 * z[i] receives x[i % nx].
 *
 * #### Parameters
 * * `z`       - output array; elements 0..n-1 are written.
 * * `n`       - number of elements of z to write.
 * * `x`       - input array of length nx.
 * * `nx`      - number of elements in x (the tiling period).
 * * `ncopies` - not read by this kernel; the number of repetitions is
 *               determined by n and nx.
 *
 * #### Notes
 * * Returns without writing when nx == 0 (previously a modulo by zero).
 * * Index and stride are 64-bit: the previous `int` arithmetic could
 *   overflow for very large grids or when n is close to INT_MAX.
 */

    template <typename T1, typename T2>
__global__ void tile( T1 * z, int n, T2 * x, int nx, int ncopies )
{
    if (nx == 0) return;

    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        z[i] = x[i % nx];
    }
}


/// Elementwise operation, z = a*x + b*y

/**
 *   Grid-stride loop over a 1-D launch (only the x dimension is used).
 *
 *   #### Parameters
 *   * `z` - output array; elements 0..n-1 are written.
 *   * `a` - scalar multiplying x.
 *   * `x` - first input array (n elements read).
 *   * `b` - scalar multiplying y.
 *   * `y` - second input array (n elements read).
 *   * `n` - number of elements; nothing is written when n <= 0.
 *
 *   #### Notes
 *   * Index and stride are 64-bit: the previous `int` arithmetic could
 *     overflow for very large grids or when n is close to INT_MAX.
 */

__global__ void aX_plus_bY( float * z, float a, float * x, float b, float * y, int n )
{
    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        z[i] = a*x[i] + b*y[i];
    }
}



/// Elementwise operation, z = max( x, y )

/**
 *   Grid-stride loop over a 1-D launch (only the x dimension is used).
 *
 *   #### Parameters
 *   * `z` - output array; elements 0..n-1 are written.
 *   * `x` - first input array (n elements read).
 *   * `y` - second input array (n elements read).
 *   * `n` - number of elements; nothing is written when n <= 0.
 *
 *   #### Notes
 *   * NaN propagates from either input: a NaN in x is copied directly, and a
 *     NaN in y is selected because `x > y` is then false.
 *   * Index and stride are 64-bit: the previous `int` arithmetic could
 *     overflow for very large grids or when n is close to INT_MAX.
 */

__global__ void max_X_Y( float * z, float * x, float * y, int n )
{
    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        if (isnan(x[i])) {
            z[i] = x[i];
        } else {
            z[i] = (x[i]>y[i] ? x[i] : y[i] );
        }
    }
}



/// Elementwise operation, z = min( x, y)

/**
 *   Grid-stride loop over a 1-D launch (only the x dimension is used).
 *
 *   #### Parameters
 *   * `z` - output array; elements 0..n-1 are written.
 *   * `x` - first input array (n elements read).
 *   * `y` - second input array (n elements read).
 *   * `n` - number of elements; nothing is written when n <= 0.
 *
 *   #### Notes
 *   * NaN propagates from either input: a NaN in x is copied directly, and a
 *     NaN in y is selected because `x < y` is then false.
 *   * Index and stride are 64-bit: the previous `int` arithmetic could
 *     overflow for very large grids or when n is close to INT_MAX.
 */

__global__ void min_X_Y( float * z, float * x, float * y, int n )
{
    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        if (isnan(x[i])) {
            z[i] = x[i];
        } else {
            z[i] = (x[i]<y[i] ? x[i] : y[i] );
        }
    }
}



/// Elementwise operation, z = log( x )

/**
 *   Grid-stride loop over a 1-D launch (only the x dimension is used).
 *
 *   #### Parameters
 *   * `z` - output array; elements 0..n-1 are written.
 *   * `x` - input array (n elements read).
 *   * `n` - number of elements; nothing is written when n <= 0.
 *
 *   #### Notes
 *   * Index and stride are 64-bit: the previous `int` arithmetic could
 *     overflow for very large grids or when n is close to INT_MAX.
 */

__global__ void log_X( float * z, float * x, int n )
{
    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        z[i] = log(x[i]);
    }
}



/// Elementwise operation, z = exp( x )

/**
 *   Grid-stride loop over a 1-D launch (only the x dimension is used).
 *
 *   #### Parameters
 *   * `z` - output array; elements 0..n-1 are written.
 *   * `x` - input array (n elements read).
 *   * `n` - number of elements; nothing is written when n <= 0.
 *
 *   #### Notes
 *   * Index and stride are 64-bit: the previous `int` arithmetic could
 *     overflow for very large grids or when n is close to INT_MAX.
 */

__global__ void exp_X( float * z, float * x, int n )
{
    const long long stride = static_cast<long long>( blockDim.x ) * gridDim.x;
    long long i = static_cast<long long>( blockIdx.x ) * blockDim.x + threadIdx.x;
    for ( ; i < n; i += stride) {
        z[i] = exp(x[i]);
    }
}
}  // end namespace mycuda
