/// \file vecAdd.cu
/// \brief Simple example of a kernel to add two vectors elementwise


#include "../include/mycuda.h"

using namespace mycuda;

/// Kernel computing the elementwise scaled sum  z[i] = a*x[i] + b*y[i]
/// for every i in [0, n).
///
/// Each thread starts at its flat global index along the x dimension and
/// advances by the total number of threads in the launch, so any 1-D launch
/// configuration (including a single thread) covers all n elements.
/// Only the x components of the grid and block dimensions are used.
///
/// \param z  output array, written at indices [0, n)
/// \param a  scalar multiplying the elements of x
/// \param x  first input array, read at indices [0, n)
/// \param b  scalar multiplying the elements of y
/// \param y  second input array, read at indices [0, n)
/// \param n  number of elements to process; nothing is written when n <= 0
///
/// The pointers are intentionally not marked __restrict__, so in-place use
/// (z aliasing x or y) remains valid.

__global__ void
vecAdd( float * z, float a, const float * x, float b, const float * y, int n ) {
    // Do the index arithmetic in 64 bits: blockDim.x*gridDim.x can exceed the
    // 32-bit range for very large grids, and "i += stride" on an int would
    // overflow (undefined behaviour) when n is close to INT_MAX.
    const long long stride = (long long)blockDim.x * gridDim.x;
    const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    for (long long i = first; i < n; i += stride) {
        z[i] = a*x[i] + b*y[i];
    }
}



/// Host driver for the vecAdd example.
///
/// Fills two host vectors (x[i] = i, y[i] = 2*i), copies them to the device,
/// launches vecAdd to compute z = a*x + b*y, copies z back, prints the first
/// few results and asserts every element against the same expression
/// evaluated on the host (the check is compiled out when NDEBUG is defined).
///
/// The allocation/copy/free helpers are not defined in this file; presumably
/// they come from mycuda.h via the mycuda namespace.
///
/// \return 0 on success, 1 if the kernel launch or its execution reported a
///         CUDA error.
int main()
{
    const int n         = 8192;
    const int gridsize  = 32;   // number of thread blocks in the launch
    const int blocksize = 32;   // number of threads per block


//// Try these variants (one at a time) to test error checking:
//    const int n1 = 1<<30;
//    float *x_d;
//    cudaMalloc( &x_d, n1*sizeof(float) );
//    cudaMalloc( &x_d, n1*sizeof(float) ); CUDA_CHECK_ERROR();
//    CUDA_SAFE_CALL( cudaMalloc( &x_d, n1*sizeof(float) ));
//    x_d = device_malloc<float>(n1);


    float *x_h = host_malloc<float>(n);
    float *y_h = host_malloc<float>(n);
    float *z_h = host_malloc<float>(n);

    float *x_d = device_malloc<float>(n);
    float *y_d = device_malloc<float>(n);
    float *z_d = device_malloc<float>(n);

    for (int i=0; i<n; i++) x_h[i] = i;
    for (int i=0; i<n; i++) y_h[i] = 2*i;
    float a=3.0f;
    float b=4.0f;

    copy_host_to_device( x_d, x_h, n );
    copy_host_to_device( y_d, y_h, n );

    // Execution configuration order is <<< grid, block >>>.
    vecAdd <<< gridsize, blocksize >>> ( z_d, a, x_d, b, y_d, n );

    // A kernel launch returns nothing: configuration errors are reported by
    // cudaGetLastError(), faults during execution by the next synchronization.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) status = cudaDeviceSynchronize();

    if (status != cudaSuccess) {
        fprintf( stderr, "vecAdd failed: %s\n", cudaGetErrorString(status) );
    } else {
        copy_device_to_host( z_h, z_d, n );

        // Show a few elements (never more than there are)
        const int nshow = (n < 50) ? n : 50;
        for (int i=0; i<nshow; i++) printf( "%8.2f = %8.2f * %8.2f + %8.2f * %8.2f\n",
                                            z_h[i], a, x_h[i], b, y_h[i] );

        // Check results. All operands and results here are integers below
        // 2^24, hence exactly representable in float, so the tight absolute
        // tolerance is safe for this data set.
        for (int i=0;i<n;i++) assert( fabs( z_h[i] - (a*x_h[i] + b*y_h[i]) ) < 0.00001 );
    }

    // Free memory (also on the error path)
    device_free( x_d );
    device_free( y_d );
    device_free( z_d );
    host_free(x_h);
    host_free(y_h);
    host_free(z_h);

    return (status == cudaSuccess) ? 0 : 1;
}





