/** \file deviceQualifier.cu
 *  \brief This sample demonstrates the `__device__` qualifier.
 *
 *    #### Notes
 *    * `__device__` variables (statically allocated global device arrays).
 *    * `__device__` functions (callable from kernels).
 */

#include "../include/mycuda.h"

/// Number of elements in each device vector. It is a compile-time constant
/// because it sizes the statically allocated arrays declared below.
const int n = 8192;

/// Statically allocated global device array; first operand read by `XplusY`.
__device__ float x_d[n];
/// Statically allocated global device array; second operand read by `XplusY`.
__device__ float y_d[n];
/// Statically allocated global device array; receives the sums written by `XplusY`.
__device__ float z_d[n];


/// Device-side helper that adds two floats (invoked from the `XplusY` kernel).
///
/// \param x  First addend.
/// \param y  Second addend.
/// \return   The sum `x + y`.

__device__ float XplusY_dev( float x, float y ) {
    const float sum = x + y;
    return sum;
}


/// CUDA kernel computing `z_d[i] = x_d[i] + y_d[i]` for every `i` in `[0, n)`.
///
/// Takes no arguments: it operates directly on the global device arrays
/// `x_d`, `y_d` and `z_d`. Each thread starts at its global 1-D index and
/// advances by the total number of launched threads, so all `n` elements are
/// covered whatever 1-D grid/block sizes are used for the launch.

__global__ void XplusY() {
    // Total number of threads in the launch; used as the per-thread step.
    const int stride = blockDim.x*gridDim.x;
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    while (idx < n) {
        z_d[idx] = XplusY_dev( x_d[idx], y_d[idx] );
        idx += stride;
    }
}



/// Check the status of a CUDA runtime call and report a failure on stderr.
///
/// \param err   Status code returned by the CUDA runtime.
/// \param what  Short description of the operation that produced `err`.
/// \return `true` when `err` is `cudaSuccess`; otherwise prints a message and
///         returns `false`.
static bool cuda_ok( cudaError_t err, const char *what ) {
    if (err == cudaSuccess) return true;
    fprintf( stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString( err ) );
    return false;
}


/// Host entry point.
///
/// Obtains the device addresses of the global arrays `x_d`, `y_d` and `z_d`,
/// initialises `x_d` and `y_d` through `mycuda::seq` (presumably a
/// sequence-filling kernel -- its definition lives in `mycuda.h`), launches
/// `XplusY`, copies all three arrays to the host and prints the leading
/// elements.
///
/// \return 0 on success, 1 if a CUDA runtime call or kernel launch reports
///         an error.
int main() {
    const int blocksize=64;
    const int gridsize=64;

    // Device symbols cannot be passed to kernels or copies as plain pointers
    // from host code; look up their device addresses first.
    float *x_ptr, *y_ptr, *z_ptr;
    if (!cuda_ok( cudaGetSymbolAddress( (void**)&x_ptr, x_d ), "cudaGetSymbolAddress(x_d)" )) return 1;
    if (!cuda_ok( cudaGetSymbolAddress( (void**)&y_ptr, y_d ), "cudaGetSymbolAddress(y_d)" )) return 1;
    if (!cuda_ok( cudaGetSymbolAddress( (void**)&z_ptr, z_d ), "cudaGetSymbolAddress(z_d)" )) return 1;

    // Kernel launches return no status; query the runtime right after each one
    // so that an invalid launch configuration is not silently ignored.
    mycuda::seq <<< gridsize, blocksize >>> ( x_ptr, 1.0, 2.0, n );
    if (!cuda_ok( cudaGetLastError(), "launch of mycuda::seq for x_d" )) return 1;
    mycuda::seq <<< gridsize, blocksize >>> ( y_ptr, 3.0, n );
    if (!cuda_ok( cudaGetLastError(), "launch of mycuda::seq for y_d" )) return 1;

    XplusY  <<< gridsize, blocksize >>> ();
    if (!cuda_ok( cudaGetLastError(), "launch of XplusY" )) return 1;

    // Wait for the kernels so that execution-time faults surface here rather
    // than as garbage in the printed output.
    if (!cuda_ok( cudaDeviceSynchronize(), "cudaDeviceSynchronize" )) return 1;

    // NOTE(review): these host buffers are never released; the matching
    // deallocation routine in mycuda.h is not visible from this file, so they
    // are left to be reclaimed at process exit -- confirm and free explicitly.
    float *x_h = mycuda::host_malloc<float>( n );
    float *y_h = mycuda::host_malloc<float>( n );
    float *z_h = mycuda::host_malloc<float>( n );

    mycuda::copy_device_to_host( x_h, x_ptr, n );
    mycuda::copy_device_to_host( y_h, y_ptr, n);
    mycuda::copy_device_to_host( z_h, z_ptr, n);

    // Print at most the first 50 sums, never reading past the end of the
    // buffers should `n` ever be made smaller than 50.
    const int nprint = (n < 50) ? n : 50;
    for (int i=0;i<nprint;i++) printf( "%8.2f + %8.2f = %8.2f\n", x_h[i], y_h[i], z_h[i]);
    printf( "\n" );

    return 0;
}














