5 #include "../include/mycuda.h"
7 using namespace mycuda;
// vecAdd: element-wise scaled vector sum, z[i] = a*x[i] + b*y[i].
//
//   z    - output array; element i is overwritten for every index i visited
//   a, b - scalar coefficients applied to x[i] and y[i] respectively
//   x, y - input arrays; only read here
//   n    - loop bound: indices 0 .. n-1 are processed
//
// Uses blockIdx/blockDim/gridDim/threadIdx, so this is device code meant to be
// launched over a 1-D grid (only the .x components are consulted).
// NOTE(review): z, x and y are presumably device pointers - confirm at the
// call site.
12 vecAdd(
float * z,
float a,
float * x,
float b,
float * y,
int n ) {
// Flat global index of this thread across the whole 1-D grid.
13 int tid = blockIdx.x*blockDim.x + threadIdx.x;
// Total number of threads in the launch; used as the loop stride.
14 int nthreads = blockDim.x*gridDim.x;
// Grid-stride loop: each thread handles i = tid, tid + nthreads, ... while
// i < n, so any launch configuration covers all n elements and threads with
// tid >= n simply do nothing.
// NOTE(review): i and nthreads are int; i += nthreads could overflow if n is
// close to INT_MAX - confirm the expected range of n.
15 for (
int i=tid; i<
n; i+=nthreads) {
16 z[i] = a*x[i] + b*y[i];
// Launch-configuration constant consumed by the vecAdd launch.
const int gridsize = 32;

// Host-side buffers obtained through the project's host_malloc helper
// (presumably n floats each - helper is declared in mycuda.h, confirm there).
float *x_h = host_malloc<float>( n );
float *y_h = host_malloc<float>( n );
float *z_h = host_malloc<float>( n );

// Device-side counterparts obtained through device_malloc
// (presumably n floats each as well - confirm in mycuda.h).
float *x_d = device_malloc<float>( n );
float *y_d = device_malloc<float>( n );
float *z_d = device_malloc<float>( n );

// Fill the host inputs: x_h[k] = k and y_h[k] = 2*k for k in [0, n).
for ( int k = 0; k < n; ++k ) {
    x_h[k] = k;
    y_h[k] = 2*k;
}
// Execution configuration is <<< blocks in the grid, threads per block >>>,
// so the grid size comes first and the block size second. The previous order
// (blocksize, gridsize) launched `blocksize` blocks of `gridsize` threads;
// results were still correct only because vecAdd strides over the whole
// grid, but the configuration did not match the variable names.
// NOTE(review): blocksize is defined outside this excerpt and must not exceed
// the device's maximum threads per block (commonly 1024) - confirm.
vecAdd <<< gridsize, blocksize >>> ( z_d, a, x_d, b, y_d, n );

// A kernel launch returns no status itself; an invalid configuration is only
// visible through cudaGetLastError(), so check it right away instead of
// letting a failed launch show up later as wrong results.
const cudaError_t vecAdd_launch_status = cudaGetLastError();
if ( vecAdd_launch_status != cudaSuccess )
    printf( "vecAdd launch failed: %s\n",
            cudaGetErrorString( vecAdd_launch_status ) );
assert( vecAdd_launch_status == cudaSuccess );
// Print up to the first 50 results as "z = a * x + b * y".
// The loop is bounded by n as well as by 50 so that a run with fewer than
// 50 elements does not read past the end of z_h / x_h / y_h.
for ( int i = 0; i < 50 && i < n; i++ )
    printf( "%8.2f = %8.2f * %8.2f + %8.2f * %8.2f\n",
            z_h[i], a, x_h[i], b, y_h[i] );
// Check every element of z_h against a reference computed on the host.
// NOTE(review): z_h is presumably the result copied back from the device;
// the copy is not part of this excerpt.
//
// The tolerance is relative (with a floor of 1 on the magnitude) rather than
// a fixed 0.00001: the inputs grow with i, and for values above roughly 1e2
// a single float rounding step is already larger than 1e-5, so any rounding
// difference between host and device arithmetic (e.g. fused multiply-add
// contraction on one side only) would trip an absolute check. Every result
// that satisfied the old absolute bound still satisfies this one.
for ( int i = 0; i < n; i++ ) {
    const double expected  = a*x_h[i] + b*y_h[i];
    const double magnitude = fabs( expected );
    const double tolerance = 0.00001 * ( magnitude > 1.0 ? magnitude : 1.0 );
    assert( fabs( z_h[i] - expected ) < tolerance );
}