8 #include <cuda_runtime.h>
// --- Debug / trace / error-check macro layer for the mycuda wrappers. ---
// NOTE(review): CUDA_SYNC_TRACE, CUDA_DEBUG and CUDA_DEBUG_PRINTF are defined
// twice below (empty no-op forms first, then printing forms). The original
// lines 14/18 between them are not visible in this extract — presumably an
// #ifdef DEBUG / #else pair selects one set; confirm against the full file.
15 #define CUDA_SYNC_TRACE()
16 #define CUDA_DEBUG(_msg)
17 #define CUDA_DEBUG_PRINTF(_fmt, ...)
// Printing variants: each stamps the call site via __FILE__ / __LINE__.
19 #define CUDA_TRACE() fprintf( stderr, "%s(%i): CUDA_TRACE\n", __FILE__, __LINE__)
// NOTE(review): two statements not wrapped in do { ... } while(0) —
// `if (cond) CUDA_SYNC_TRACE();` would only guard the synchronize call.
20 #define CUDA_SYNC_TRACE() mycuda::device_synchronize(); CUDA_TRACE();
21 #define CUDA_DEBUG(_msg) fprintf( stderr, "%s(%i): CUDA_DEBUG :: %s\n", __FILE__, __LINE__, _msg )
// _fmt is pasted into the format string by literal concatenation; requires at
// least one variadic argument (pre-C++20 __VA_ARGS__ has no empty-pack form).
22 #define CUDA_DEBUG_PRINTF(_fmt, ...) fprintf( stderr, "%s(%i): CUDA_DEBUG :: " _fmt "\n", __FILE__, __LINE__, __VA_ARGS__ )
// Logger variants: same shape as the debug macros but always active here.
24 #define CUDA_LOGGER(_msg) fprintf( stderr, "%s(%i): CUDA_LOGGER :: %s\n", __FILE__, __LINE__, _msg )
25 #define CUDA_LOGGER_PRINTF(_fmt, ...) fprintf( stderr, "%s(%i): CUDA_LOGGER :: " _fmt "\n", __FILE__, __LINE__, __VA_ARGS__ )
// Error-check entry points: forward to the mycuda:: helpers defined below,
// supplying the caller's file/line for the report.
26 #define CUDA_CHECK_ERROR() mycuda::check_error( __FILE__, __LINE__ )
27 #define CUDA_ASYNC_CHECK_ERROR() mycuda::async_check_error( __FILE__, __LINE__ )
28 #define CUDA_SAFE_CALL(err) mycuda::safe_call(err, __FILE__, __LINE__)
// CUDA_CATCH expands to a full catch-clause; intended to follow a try block.
29 #define CUDA_CATCH() catch( std::exception& e) { mycuda::handler( e, __FILE__, __LINE__ ); }
30 #define CUDA_HANDLER(e) mycuda::handler( e, __FILE__, __LINE__ )
// Fragment of mycuda::check_error(file, line) — the signature, braces and the
// error branch fall on lines not included in this extract.
// Reads (and clears) the sticky CUDA error state left by earlier calls.
49 cudaError_t err = cudaGetLastError();
// Presumably guarded by `if (err != cudaSuccess)` on the missing line 50;
// "Aborting..." suggests an exit()/abort() follows on lines not shown.
51 fprintf( stdout,
"%s(%i): mycuda::check_error :: %s\nAborting...\n",
52 file, line, cudaGetErrorString( err ) );
// Fragment of mycuda::async_check_error(file, line). Unlike check_error above,
// this blocks on cudaDeviceSynchronize() so asynchronous kernel-execution
// errors are surfaced, not just launch errors.
68 cudaError_t err = cudaDeviceSynchronize();
// Presumably guarded by an `if (err)` on the missing line 69; "Aborting..."
// suggests an exit()/abort() follows on lines not shown.
70 fprintf( stdout,
"%s(%i): mycuda::async_check_error :: %s\nAborting...\n",
71 file, line, cudaGetErrorString( err ) );
// mycuda::safe_call — report a cudaError_t returned by a runtime call at the
// given call site. Invoked via CUDA_SAFE_CALL(err), which supplies
// __FILE__ / __LINE__.
84 void safe_call( cudaError_t err,
const char* file,
const int line )
// NOTE(review): the opening brace and the expected `if (err)` guard are not
// visible in this extract; "Aborting..." implies an exit()/abort() follows.
87 fprintf( stdout,
"%s(%i): mycuda::safe_call :: %s\nAborting...\n",
88 file, line, cudaGetErrorString( err ) );
// Fragment of the mycuda exception class (the class declaration and the `msg`
// string member are outside this extract; msg.c_str() below implies msg is a
// std::string or similar).
// Default ctor: captures the message text for the current sticky CUDA error
// (cudaGetLastError also clears that sticky state).
112 exception() : msg( cudaGetErrorString( cudaGetLastError() )) {}
// Return the stored message. The pre-C++11 `throw()` no-throw specification
// matches the std::exception::what signature this presumably overrides.
118 const char*
what()
const throw() {
return msg.c_str(); }
// mycuda::handler — last-resort reporter for exceptions caught via the
// CUDA_CATCH() / CUDA_HANDLER(e) macros; prints the catch site and message.
139 void handler( std::exception& e,
const char* file,
const int line )
// NOTE(review): the opening brace is on a line not shown; "Aborting..."
// implies an exit()/abort() follows on lines not included in this extract.
141 fprintf( stdout,
"%s(%i): mycuda::handler caught exception :: %s\nAborting...\n",
142 file, line, e.what() );
// Fragment of mycuda::device_malloc<T> — the signature and the declarations of
// `ptr` and `n` fall on lines not included in this extract.
154 template <
typename T>
// Allocate room for n elements of T in device global memory; any non-zero
// cudaError_t from cudaMalloc is treated as failure.
158 if ( cudaMalloc( &ptr,
n*
sizeof(T) ))
159 throw exception(
"mycuda::device_malloc" );
// Fragment of mycuda::mapped_malloc<T> — signature and declarations of `ptr`
// and `n` are on lines not included in this extract.
176 template <
typename T>
// Allocate n elements of T as page-locked host memory with the
// cudaHostAllocMapped flag (mapped into the device address space per the CUDA
// runtime docs); non-zero status is treated as failure.
180 if ( cudaHostAlloc( &ptr,
n*
sizeof(T), cudaHostAllocMapped ))
181 throw exception(
"mycuda::mapped_malloc" );
// Fragment of mycuda::host_malloc<T> — the enclosing signature (and where the
// returned/assigned `ptr` goes) is on lines not included in this extract.
196 template <
typename T>
// Plain pageable host allocation of n elements of T via malloc; NULL result
// is converted into a mycuda exception instead of being returned.
199 T * ptr = (T*)malloc(
n*
sizeof(T) );
200 if (ptr==NULL)
throw exception(
"mycuda::host_malloc" );
// Fragment of mycuda::device_free<T> — signature and `ptr` declaration are on
// lines not included in this extract.
208 template <
typename T>
// Release a cudaMalloc'd pointer; non-zero status becomes an exception.
211 if ( cudaFree( ptr ) )
212 throw exception(
"mycuda::device_free" );
// Fragment of mycuda::mapped_free<T> — signature and `ptr` declaration are on
// lines not included in this extract.
219 template <
typename T>
// Release pinned host memory obtained via cudaHostAlloc; non-zero status
// becomes an exception.
222 if ( cudaFreeHost( ptr ) )
223 throw exception(
"mycuda::mapped_free" );
// Orphan template header (original line 230): the definition it introduces is
// entirely outside this extract — presumably a host_free counterpart to the
// device_free/mapped_free helpers above; confirm against the full file.
230 template <
typename T>
// Fragment of mycuda::copy_device_to_host<T> (name per the exception message
// below) — the signature declaring `dest`, `src` and `n` is on lines not
// included in this extract.
244 template <
typename T>
// Blocking copy of n elements of T from device memory (src) to host memory
// (dest); any non-zero cudaError_t becomes an exception.
247 cudaError_t err = cudaMemcpy( dest, src,
n*
sizeof(T), cudaMemcpyDeviceToHost );
248 if (err)
throw(
exception(
"mycuda::copy_device_to_host"));
// Fragment of mycuda::copy_to_device<T> (name per the exception message
// below) — the signature declaring `dest`, `src` and `n` is on lines not
// included in this extract.
260 template <
typename T>
// Blocking copy of n elements of T from host (src) to device (dest);
// non-zero status becomes an exception.
263 cudaError_t err = cudaMemcpy( dest, src,
n*
sizeof(T), cudaMemcpyHostToDevice );
264 if (err)
throw(
exception(
"mycuda::copy_to_device" ));
// Fragment of mycuda::copy_device_to_device<T> (name per the exception
// message below) — the signature declaring `dest`, `src` and `n` is on lines
// not included in this extract.
275 template <
typename T>
// Blocking device-to-device copy of n elements of T; non-zero status becomes
// an exception.
278 cudaError_t err = cudaMemcpy( dest, src,
n*
sizeof(T), cudaMemcpyDeviceToDevice );
279 if (err)
throw(
exception(
"mycuda::copy_device_to_device" ));
// Fragment of mycuda::copy_host_to_host<T> (name per the exception message
// below) — the signature declaring `dest`, `src` and `n` is on lines not
// included in this extract.
293 template <
typename T>
// Host-to-host copy of n elements of T routed through the CUDA runtime;
// non-zero status becomes an exception.
296 cudaError_t err = cudaMemcpy( dest, src,
n*
sizeof(T), cudaMemcpyHostToHost );
297 if (err)
throw(
exception(
"mycuda::copy_host_to_host" ));
// Fragment of mycuda::device_synchronize — the enclosing signature is on lines
// not included in this extract. Blocks the host until all preceding device
// work completes; a non-zero status (including deferred kernel-execution
// errors surfaced here) becomes an exception.
306 cudaError_t err = cudaDeviceSynchronize();
307 if (err)
throw( exception(
"mycuda::device_synchronize" ) );
// Fragment of mycuda::get_memory_info — the enclosing signature and the
// declarations of `free`/`total` (size_t per the cudaMemGetInfo API) are on
// lines not included in this extract.
// Query free and total device memory in bytes; failure becomes an exception.
317 cudaError_t err = cudaMemGetInfo ( &free, &total);
318 if (err)
throw( exception(
"mycuda::get_memory_info" ));
// Report both figures scaled to MB (1<<20 bytes) on stdout.
319 printf(
"Free memory: %8.2f MB\nTotal memory: %8.2f MB\n",
float(free) /(1<<20),
320 float(total)/(1<<20) );
// Kernel fragment: fill — writes the scalar `a` (type T2) across the n-element
// array z (type T1). The loop body and closing braces fall on lines not
// included in this extract.
327 template <
typename T1,
typename T2>
328 __global__
void fill( T1 * z, T2 a,
int n )
// Flat 1-D global thread id and total number of launched threads.
330 int tid = blockIdx.x*blockDim.x + threadIdx.x;
331 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop: each thread handles i = tid, tid+nthreads, ... so any
// launch configuration covers all n elements.
332 for (
int i=tid; i<
n; i+=nthreads) {
// Kernel fragment: seq(z, first, n) — by its name and parameters, presumably
// fills z with a sequence starting at `first`; the loop body is on lines not
// included in this extract, so confirm against the full file.
345 template <
typename T1,
typename T2>
346 __global__
void seq( T1 * z, T2 first,
int n )
// Flat 1-D global thread id and total thread count.
348 int tid = blockIdx.x*blockDim.x + threadIdx.x;
349 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop over all n output elements.
350 for (
int i=tid; i<
n; i+=nthreads) {
// Kernel fragment: seq(z, first, inc, n) — overload of seq above with an
// explicit increment; presumably z[i] = first + i*inc, but the loop body is on
// lines not included in this extract, so confirm against the full file.
363 template <
typename T1,
typename T2>
364 __global__
void seq( T1 * z, T2 first, T2 inc,
int n )
// Flat 1-D global thread id and total thread count.
366 int tid = blockIdx.x*blockDim.x + threadIdx.x;
367 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop over all n output elements.
368 for (
int i=tid; i<
n; i+=nthreads) {
// Kernel fragment: rep — by its name and the (x, nx, ncopies) parameters,
// presumably replicates the nx-element input x into the n-element output z
// (R-style rep semantics); the loop body is on lines not included in this
// extract, so confirm against the full file.
380 template <
typename T1,
typename T2>
381 __global__
void rep( T1 * z,
int n, T2 * x,
int nx,
int ncopies )
// Flat 1-D global thread id and total thread count.
383 int tid = blockIdx.x*blockDim.x + threadIdx.x;
384 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop over all n output elements.
385 for (
int i=tid; i<
n; i+=nthreads) {
// Kernel fragment: tile — same parameter shape as rep above; by its name,
// presumably lays out copies of x into z in a tiled (element-repeated) order
// distinct from rep's; the loop body is on lines not included in this
// extract, so confirm against the full file.
398 template <
typename T1,
typename T2>
399 __global__
void tile( T1 * z,
int n, T2 * x,
int nx,
int ncopies )
// Flat 1-D global thread id and total thread count.
401 int tid = blockIdx.x*blockDim.x + threadIdx.x;
402 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop over all n output elements.
403 for (
int i=tid; i<
n; i+=nthreads) {
// Kernel fragment: aX_plus_bY — elementwise z[i] = a*x[i] + b*y[i] over n
// floats (a generalized SAXPY with two scale factors). The opening brace and
// the loop/function closers fall on lines not included in this extract.
411 __global__
void aX_plus_bY(
float * z,
float a,
float * x,
float b,
float * y,
int n )
// Flat 1-D global thread id and total thread count.
413 int tid = blockIdx.x*blockDim.x + threadIdx.x;
414 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop over all n output elements.
415 for (
int i=tid; i<
n; i+=nthreads) {
416 z[i] = a*x[i] + b*y[i];
// Kernel fragment: max_X_Y — elementwise maximum, z[i] = max(x[i], y[i]),
// over n floats via a branchless ternary. Opening brace and closers fall on
// lines not included in this extract.
424 __global__
void max_X_Y(
float * z,
float * x,
float * y,
int n )
// Flat 1-D global thread id and total thread count.
426 int tid = blockIdx.x*blockDim.x + threadIdx.x;
427 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop over all n output elements.
428 for (
int i=tid; i<
n; i+=nthreads) {
432 z[i] = (x[i]>y[i] ? x[i] : y[i] );
// Kernel fragment: min_X_Y — elementwise minimum, z[i] = min(x[i], y[i]),
// over n floats; mirror of max_X_Y above. Opening brace and closers fall on
// lines not included in this extract.
441 __global__
void min_X_Y(
float * z,
float * x,
float * y,
int n )
// Flat 1-D global thread id and total thread count.
443 int tid = blockIdx.x*blockDim.x + threadIdx.x;
444 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop over all n output elements.
445 for (
int i=tid; i<
n; i+=nthreads) {
449 z[i] = (x[i]<y[i] ? x[i] : y[i] );
// Kernel fragment: log_X — by its name, presumably z[i] = log of x[i] over n
// floats; the loop body is on lines not included in this extract, so confirm
// (and check it uses the float overload logf, not double log) in the full file.
458 __global__
void log_X(
float * z,
float * x,
int n )
// Flat 1-D global thread id and total thread count.
460 int tid = blockIdx.x*blockDim.x + threadIdx.x;
461 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop over all n output elements.
462 for (
int i=tid; i<
n; i+=nthreads) {
// Kernel fragment: exp_X — by its name, presumably z[i] = exp of x[i] over n
// floats; the loop body runs past the end of this extract, so confirm (and
// check it uses the float overload expf, not double exp) in the full file.
471 __global__
void exp_X(
float * z,
float * x,
int n )
// Flat 1-D global thread id and total thread count.
473 int tid = blockIdx.x*blockDim.x + threadIdx.x;
474 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop over all n output elements.
475 for (
int i=tid; i<
n; i+=nthreads) {