/// \file  simpleMath.cu
/// \brief This example demonstrates several kernels that do simple math operations


#include <cmath>
#include <cstdio>

#include <cuda_runtime.h>

#include "../include/mycuda.h"

using namespace mycuda;



/// \brief Report whether the most recent kernel launch was accepted by the runtime.
///
/// Kernel launches do not return a status themselves; a bad launch configuration
/// is only visible through cudaGetLastError(), which also clears the error.
///
/// \param kernel_name  Name used in the diagnostic message.
/// \return true if no launch error is pending, false otherwise (after printing
///         the error string to stderr).
static bool simple_math_launch_ok( const char *kernel_name )
{
    const cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) return true;

    fprintf( stderr, "launch of kernel '%s' failed: %s\n",
             kernel_name, cudaGetErrorString(err) );
    return false;
}


/// \brief Fill two device vectors, compute z = a*exp(x) + b*y on the device,
///        and verify the result against a host-side reference.
///
/// \return 0 when every launch succeeded and every element matched the host
///         reference within tolerance, 1 otherwise.
int main()
{
    const int n         = 8192;
    const int gridsize  = 32;
    const int blocksize = 32;
    // NOTE(review): gridsize*blocksize = 1024 threads for n = 8192 elements, so
    // each kernel must process several elements per thread to cover the whole
    // vector. The kernel bodies live in mycuda.h and are not visible here —
    // TODO confirm.


    float *x_h = host_malloc<float>(n);
    float *y_h = host_malloc<float>(n);
    float *z_h = host_malloc<float>(n);

    float *x_d = device_malloc<float>(n);
    float *y_d = device_malloc<float>(n);
    float *z_d = device_malloc<float>(n);


    float a=2.0f, b=1.0f;
    bool launches_ok = true;

    fill <<< gridsize, blocksize >>> ( x_d, 3.0f, n );
    launches_ok = simple_math_launch_ok( "fill" ) && launches_ok;

    // NOTE(review): seq is the only launch here that is not passed n. Verify
    // against its declaration that this argument list is what is intended.
    seq  <<< gridsize, blocksize >>> ( y_d, 5.0f, 2.0f );
    launches_ok = simple_math_launch_ok( "seq" ) && launches_ok;

    // Snapshot x and y on the host *before* x_d is handed to exp_X as both of
    // its pointer arguments; the host reference below applies exp() itself.
    copy_device_to_host( x_h, x_d, n );
    copy_device_to_host( y_h, y_d, n );

    exp_X <<< gridsize, blocksize >>> ( x_d, x_d, n );
    launches_ok = simple_math_launch_ok( "exp_X" ) && launches_ok;

    aX_plus_bY <<< gridsize, blocksize >>> ( z_d, a, x_d, b, y_d, n );
    launches_ok = simple_math_launch_ok( "aX_plus_bY" ) && launches_ok;

    copy_device_to_host( z_h, z_d, n );


    int n_bad = 0;
    if (launches_ok)
    {
        // Show a few elements (never more than the vector actually holds)
        const int n_show = (n < 50) ? n : 50;
        for (int i=0; i<n_show; i++) printf( "%8.2f = %8.2f * exp(%8.2f) + %8.2f * %8.2f\n", 
                                             z_h[i], a, x_h[i], b, y_h[i] ); 


        // Check results.
        // The reference is evaluated in double precision and the tolerance scales
        // with the magnitude of the expected value: a fixed absolute bound of 1e-5
        // is smaller than one single-precision ulp once values exceed ~100, so it
        // would reject results that differ only by float rounding. The bound is
        // never tighter than the original 1e-5 for |expected| <= 1.
        // An explicit check is used instead of assert() so that the verification
        // is not compiled out when NDEBUG is defined.
        const double rel_tol = 1.0e-5;
        for (int i=0; i<n; i++)
        {
            const double expected = a * exp( (double) x_h[i] ) + b * y_h[i];
            const double scale    = (fabs(expected) > 1.0) ? fabs(expected) : 1.0;

            // Written as !(diff < tol) so that a NaN result counts as a mismatch.
            if ( !( fabs( z_h[i] - expected ) < rel_tol * scale ) )
            {
                if (n_bad == 0)
                    fprintf( stderr, "mismatch at i=%d: got %.6f, expected %.6f\n",
                             i, z_h[i], expected );
                n_bad++;
            }
        }
        if (n_bad > 0) fprintf( stderr, "%d of %d elements failed verification\n", n_bad, n );
    }

    // Free memory
    device_free( x_d );
    device_free( y_d );
    device_free( z_d );
    host_free(x_h);
    host_free(y_h);
    host_free(z_h);

    return (launches_ok && n_bad == 0) ? 0 : 1;
}





