GPU Workshop Sample Code
 All Data Structures Namespaces Files Functions Variables Macros Pages
mycuda.h
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <string>
4 #include <iostream>
5 #include <cassert>
6 
7 #include <cuda.h>
8 #include <cuda_runtime.h>
9 
10 
// NOTE: NDEBUG must be defined BEFORE assert is included to disable assertions.

#ifdef NDEBUG
// Release builds: tracing/debug macros compile away to nothing.
#define CUDA_TRACE()
#define CUDA_SYNC_TRACE()
#define CUDA_DEBUG(_msg)
#define CUDA_DEBUG_PRINTF(_fmt, ...)
#else
// Debug builds: emit file/line-stamped diagnostics on stderr.
#define CUDA_TRACE() fprintf( stderr, "%s(%i): CUDA_TRACE\n", __FILE__, __LINE__)
// Wrapped in do { } while(0) so the macro expands to a single statement.
// (The original expansion was two statements with a trailing ';', which
// misbehaves inside an un-braced if/else.)
#define CUDA_SYNC_TRACE() do { mycuda::device_synchronize(); CUDA_TRACE(); } while (0)
#define CUDA_DEBUG(_msg) fprintf( stderr, "%s(%i): CUDA_DEBUG :: %s\n", __FILE__, __LINE__, _msg )
// ##__VA_ARGS__ (GNU extension, supported by gcc/clang/nvcc) swallows the
// preceding comma so the macro also works with a format string and no args.
#define CUDA_DEBUG_PRINTF(_fmt, ...) fprintf( stderr, "%s(%i): CUDA_DEBUG :: " _fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__ )
#endif
// Always-on logging macros (active in both debug and release builds).
#define CUDA_LOGGER(_msg) fprintf( stderr, "%s(%i): CUDA_LOGGER :: %s\n", __FILE__, __LINE__, _msg )
#define CUDA_LOGGER_PRINTF(_fmt, ...) fprintf( stderr, "%s(%i): CUDA_LOGGER :: " _fmt "\n", __FILE__, __LINE__, ##__VA_ARGS__ )
// Error-checking helpers: stamp the call site into mycuda's checkers/handlers.
#define CUDA_CHECK_ERROR() mycuda::check_error( __FILE__, __LINE__ )
#define CUDA_ASYNC_CHECK_ERROR() mycuda::async_check_error( __FILE__, __LINE__ )
#define CUDA_SAFE_CALL(err) mycuda::safe_call(err, __FILE__, __LINE__)
#define CUDA_CATCH() catch( std::exception& e) { mycuda::handler( e, __FILE__, __LINE__ ); }
#define CUDA_HANDLER(e) mycuda::handler( e, __FILE__, __LINE__ )
31 
32 
33 
35 
36 namespace mycuda
37 {
38 
39 
41 
// Abort the program if the CUDA runtime has a pending error.
//
// Reads (and clears) the last runtime error via cudaGetLastError(); on
// error, prints the call site plus the CUDA error string and exits.
// Normally invoked through the CUDA_CHECK_ERROR() macro.
//
// \param file call-site source file (use __FILE__)
// \param line call-site source line (use __LINE__)
void check_error( const char* file, const int line )
{
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        // Report on stderr, consistent with the CUDA_LOGGER/CUDA_DEBUG macros
        // (the original wrote to stdout, mixing errors into normal output).
        fprintf( stderr, "%s(%i): mycuda::check_error :: %s\nAborting...\n",
                 file, line, cudaGetErrorString( err ) );
        exit(-1);
    }
}
56 
57 
59 
// Synchronize with the device and abort on any asynchronous error.
//
// cudaDeviceSynchronize() blocks until all queued work completes, so this
// also surfaces errors from kernels launched earlier. Normally invoked
// through the CUDA_ASYNC_CHECK_ERROR() macro.
//
// \param file call-site source file (use __FILE__)
// \param line call-site source line (use __LINE__)
void async_check_error( const char* file, const int line )
{
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        // Report on stderr, consistent with the CUDA_LOGGER/CUDA_DEBUG macros
        // (the original wrote to stdout, mixing errors into normal output).
        fprintf( stderr, "%s(%i): mycuda::async_check_error :: %s\nAborting...\n",
                 file, line, cudaGetErrorString( err ) );
        exit(-1);
    }
}
75 
76 
78 
// Check the result of a CUDA runtime call and abort on failure.
//
// Wrap calls with the CUDA_SAFE_CALL(err) macro so the call site's
// file/line are stamped into the report.
//
// \param err  status returned by a CUDA runtime API call
// \param file call-site source file (use __FILE__)
// \param line call-site source line (use __LINE__)
void safe_call( cudaError_t err, const char* file, const int line )
{
    if (err != cudaSuccess) {
        // Report on stderr, consistent with the CUDA_LOGGER/CUDA_DEBUG macros
        // (the original wrote to stdout, mixing errors into normal output).
        fprintf( stderr, "%s(%i): mycuda::safe_call :: %s\nAborting...\n",
                 file, line, cudaGetErrorString( err ) );
        exit(-1);
    }
}
92 
93 
94 
95 
96 
98 
// Exception type carrying a CUDA error description.
//
// The default constructor captures the message of the runtime's most
// recent error (cudaGetLastError, which also clears the sticky error);
// the string constructor attaches a caller-supplied description.
class exception : public std::exception
{
  private:
    std::string message;   // human-readable error description

  public:
    // Capture the current CUDA runtime error string.
    exception() : message( cudaGetErrorString( cudaGetLastError() )) {}

    // Wrap a caller-supplied description.
    exception( std::string msg ) : message( msg ) {}

    ~exception() throw() {}

    // Stored description; pointer is valid for the exception's lifetime.
    const char* what() const throw() { return message.c_str(); }
};
120 
121 
122 
123 
125 
// NOTE: need to catch reference to get derived classes.
// Last-resort exception handler: report the exception with its call site
// and terminate. Normally invoked via CUDA_CATCH() / CUDA_HANDLER(e).
//
// \param e    exception being handled (caught by reference so what() of
//             derived classes, e.g. mycuda::exception, is used)
// \param file call-site source file (use __FILE__)
// \param line call-site source line (use __LINE__)
void handler( std::exception& e, const char* file, const int line )
{
    // Report on stderr so the message is not lost in redirected stdout
    // (the original wrote diagnostics to stdout).
    fprintf( stderr, "%s(%i): mycuda::handler caught exception :: %s\nAborting...\n",
             file, line, e.what() );
    exit(-1);
}
145 
146 
148 
// Allocate room for n elements of type T in device global memory.
//
// \param n number of elements to allocate (bytes = n*sizeof(T))
// \return device pointer; release with device_free
// \throws mycuda::exception when cudaMalloc fails
template <typename T>
T * device_malloc( int n ) throw (exception)
{
    T * devPtr;
    cudaError_t status = cudaMalloc( &devPtr, sizeof(T)*n );
    if (status != cudaSuccess)
        throw exception( "mycuda::device_malloc" );
    return devPtr;
}
162 
163 
164 
166 
// Allocate page-locked host memory for n elements of type T, mapped into
// the device address space (cudaHostAllocMapped).
//
// \param n number of elements to allocate (bytes = n*sizeof(T))
// \return host pointer; release with mapped_free
// \throws mycuda::exception when cudaHostAlloc fails
template <typename T>
T * mapped_malloc( int n ) throw (exception)
{
    T * hostPtr;
    cudaError_t status = cudaHostAlloc( &hostPtr, sizeof(T)*n, cudaHostAllocMapped );
    if (status != cudaSuccess)
        throw exception( "mycuda::mapped_malloc" );
    return hostPtr;
}
184 
185 
186 
188 
// Allocate ordinary (pageable) host memory for n elements of type T.
//
// \param n number of elements to allocate
// \return host pointer; release with host_free
// \throws mycuda::exception when malloc fails
template <typename T>
T * host_malloc( int n ) throw ( exception )
{
    size_t bytes = static_cast<size_t>(n) * sizeof(T);
    T * ptr = (T*)malloc( bytes );
    // malloc(0) may legally return NULL; only treat NULL as a failure
    // when bytes were actually requested (the original threw spuriously
    // for n == 0 on such implementations).
    if (ptr == NULL && bytes != 0) throw exception( "mycuda::host_malloc" );
    return ptr;
}
203 
204 
205 
207 
// Release a device allocation obtained from device_malloc.
//
// \param ptr device pointer to free
// \throws mycuda::exception if cudaFree reports an error
template <typename T>
void device_free( T* ptr ) throw( exception )
{
    cudaError_t status = cudaFree( ptr );
    if (status != cudaSuccess)
        throw exception( "mycuda::device_free" );
}
214 
215 
216 
218 
// Release mapped/page-locked host memory obtained from mapped_malloc.
//
// \param ptr host pointer to free
// \throws mycuda::exception if cudaFreeHost reports an error
template <typename T>
void mapped_free( T* ptr ) throw( exception )
{
    cudaError_t status = cudaFreeHost( ptr );
    if (status != cudaSuccess)
        throw exception( "mycuda::mapped_free" );
}
225 
226 
227 
229 
// Release host memory obtained from host_malloc. Never throws.
//
// \param ptr host pointer to free (NULL is a no-op for free)
template <typename T>
void host_free( T* ptr )
{
    free( ptr );
}
235 
236 
238 
// Copy n elements of type T from device memory into host memory.
//
// \param dest host destination buffer (capacity >= n elements)
// \param src  device source pointer (holds >= n elements)
// \param n    number of elements to transfer
// \throws mycuda::exception if the underlying cudaMemcpy fails
template <typename T>
void copy_device_to_host( T * dest, T * src, int n ) throw( exception )
{
    if ( cudaMemcpy( dest, src, sizeof(T)*n, cudaMemcpyDeviceToHost ) != cudaSuccess )
        throw exception( "mycuda::copy_device_to_host");
}
250 
251 
252 
254 
// Copy n elements of type T from host memory into device memory.
//
// \param dest device destination pointer (capacity >= n elements)
// \param src  host source pointer (holds >= n elements)
// \param n    number of elements to transfer
// \throws mycuda::exception if cudaMemcpy fails
template <typename T>
void copy_host_to_device( T * dest, T * src, int n ) throw( exception )
{
    cudaError_t err = cudaMemcpy( dest, src, n*sizeof(T), cudaMemcpyHostToDevice );
    // Exception tag corrected to match the function name (was
    // "mycuda::copy_to_device", which made error reports misleading).
    if (err) throw( exception( "mycuda::copy_host_to_device" ));
}
266 
267 
269 
// Copy n elements of type T between two device memory regions.
//
// \param dest device destination pointer (capacity >= n elements)
// \param src  device source pointer (holds >= n elements)
// \param n    number of elements to transfer
// \throws mycuda::exception if the underlying cudaMemcpy fails
template <typename T>
void copy_device_to_device( T * dest, T * src, int n ) throw( exception )
{
    if ( cudaMemcpy( dest, src, sizeof(T)*n, cudaMemcpyDeviceToDevice ) != cudaSuccess )
        throw exception( "mycuda::copy_device_to_device" );
}
281 
282 
284 
// Copy n elements of type T between two host memory regions via the
// CUDA runtime (cudaMemcpyHostToHost).
//
// \param dest host destination buffer (capacity >= n elements)
// \param src  host source pointer (holds >= n elements)
// \param n    number of elements to transfer
// \throws mycuda::exception if the underlying cudaMemcpy fails
template <typename T>
void copy_host_to_host( T * dest, T * src, int n ) throw( exception )
{
    if ( cudaMemcpy( dest, src, sizeof(T)*n, cudaMemcpyHostToHost ) != cudaSuccess )
        throw exception( "mycuda::copy_host_to_host" );
}
299 
300 
301 
303 
305 {
306  cudaError_t err = cudaDeviceSynchronize();
307  if (err) throw( exception( "mycuda::device_synchronize" ) );
308 }
309 
310 
311 
313 
// Print the current device's free and total memory (in MB) to stdout.
//
// \throws mycuda::exception if cudaMemGetInfo fails
void get_memory_info() throw( exception )
{
    // Locals renamed so the free-byte count does not shadow ::free.
    size_t freeBytes = 0, totalBytes = 0;
    if ( cudaMemGetInfo( &freeBytes, &totalBytes ) != cudaSuccess )
        throw exception( "mycuda::get_memory_info" );
    printf( "Free memory: %8.2f MB\nTotal memory: %8.2f MB\n",
            float(freeBytes)/(1<<20), float(totalBytes)/(1<<20) );
}
322 
323 
324 
326 
// Kernel: set every element of z[0..n) to the value a.
// Grid-stride loop: correct for any 1-D launch configuration.
template <typename T1, typename T2>
__global__ void fill( T1 * z, T2 a, int n )
{
    const int stride = blockDim.x*gridDim.x;
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) {
        z[i] = a;
    }
}
336 
337 
338 
340 
345  template <typename T1, typename T2>
346 __global__ void seq( T1 * z, T2 first, int n )
347 {
348  int tid = blockIdx.x*blockDim.x + threadIdx.x;
349  int nthreads = blockDim.x*gridDim.x;
350  for (int i=tid; i<n; i+=nthreads) {
351  z[i] = first +i;
352  }
353 }
354 
355 
356 
358 
363  template <typename T1, typename T2>
364 __global__ void seq( T1 * z, T2 first, T2 inc, int n )
365 {
366  int tid = blockIdx.x*blockDim.x + threadIdx.x;
367  int nthreads = blockDim.x*gridDim.x;
368  for (int i=tid; i<n; i+=nthreads) {
369  z[i] = first +i*inc;
370  }
371 }
372 
373 
375 
380  template <typename T1, typename T2>
381 __global__ void rep( T1 * z, int n, T2 * x, int nx, int ncopies )
382 {
383  int tid = blockIdx.x*blockDim.x + threadIdx.x;
384  int nthreads = blockDim.x*gridDim.x;
385  for (int i=tid; i<n; i+=nthreads) {
386  z[i] = x[i/ncopies];
387  }
388 }
389 
390 
391 
393 
398  template <typename T1, typename T2>
399 __global__ void tile( T1 * z, int n, T2 * x, int nx, int ncopies )
400 {
401  int tid = blockIdx.x*blockDim.x + threadIdx.x;
402  int nthreads = blockDim.x*gridDim.x;
403  for (int i=tid; i<n; i+=nthreads) {
404  z[i] = x[i % nx];
405  }
406 }
407 
408 
410 
// Kernel: elementwise z = a*x + b*y over n floats (grid-stride loop).
__global__ void aX_plus_bY( float * z, float a, float * x, float b, float * y, int n )
{
    const int stride = blockDim.x*gridDim.x;
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) {
        z[i] = a*x[i] + b*y[i];
    }
}
419 
420 
421 
423 
// Kernel: elementwise maximum, z[i] = max(x[i], y[i]).
// NaN semantics: a NaN in x[i] is propagated directly; a NaN in y[i]
// also comes through because the comparison is then false and y[i] wins.
__global__ void max_X_Y( float * z, float * x, float * y, int n )
{
    const int stride = blockDim.x*gridDim.x;
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) {
        const float xi = x[i];
        const float yi = y[i];
        z[i] = isnan(xi) ? xi : (xi > yi ? xi : yi);
    }
}
436 
437 
438 
440 
// Kernel: elementwise minimum, z[i] = min(x[i], y[i]).
// NaN semantics: a NaN in x[i] is propagated directly; a NaN in y[i]
// also comes through because the comparison is then false and y[i] wins.
__global__ void min_X_Y( float * z, float * x, float * y, int n )
{
    const int stride = blockDim.x*gridDim.x;
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) {
        const float xi = x[i];
        const float yi = y[i];
        z[i] = isnan(xi) ? xi : (xi < yi ? xi : yi);
    }
}
453 
454 
455 
457 
// Kernel: elementwise natural logarithm, z[i] = log(x[i]), grid-stride loop.
// Uses logf explicitly so the operation is guaranteed single-precision
// (the unsuffixed log relied on overload resolution picking the float form).
__global__ void log_X( float * z, float * x, int n )
{
    const int stride = blockDim.x*gridDim.x;
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) {
        z[i] = logf(x[i]);
    }
}
466 
467 
468 
470 
// Kernel: elementwise exponential, z[i] = exp(x[i]), grid-stride loop.
// Uses expf explicitly so the operation is guaranteed single-precision
// (the unsuffixed exp relied on overload resolution picking the float form).
__global__ void exp_X( float * z, float * x, int n )
{
    const int stride = blockDim.x*gridDim.x;
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) {
        z[i] = expf(x[i]);
    }
}
479 } // end namespace mycuda