// Vector addition example: adds two N-element arrays on the GPU and verifies the result on the host.
#include <stdio.h>
#include <math.h>

// Number of elements in each of the vectors handled by this program.
#define N 100

// Absolute tolerance applied when a computed element is compared with its
// expected value.
const double EPS = 1E-6;
/**
 * Element-wise vector addition: z[i] = x[i] + y[i] for every i in [0, n).
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
 * index and then advances by the total number of launched threads, so the
 * result is complete for any grid/block size, including a single thread.
 * Shared memory: none.
 *
 * x, y  input arrays of at least n elements, dereferenced in device code
 * z     output array of at least n elements, written in device code
 * n     number of elements to process; nothing is written when n <= 0
 */
void __global__ add(const double *x, const double *y, double *z, int n)
{
    // Index arithmetic is done in 64 bits: with 32-bit values the product
    // blockDim.x * blockIdx.x can wrap, and `i += stride` can overflow a
    // signed int once n approaches INT_MAX on a large grid.
    const long long count = n;
    const long long stride = (long long)blockDim.x * gridDim.x;
    long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x;

    for (; i < count; i += stride) {
        z[i] = x[i] + y[i];
    }
}
/**
 * Compares every element of a result array with an expected value and prints
 * the verdict followed by the largest absolute deviation found.
 *
 * Output (stdout): one line "Pass" or "Errors", then one line with the
 * maximum deviation (its label is the Chinese text for "maximum error").
 *
 * z         array of n results, read on the host
 * n         number of elements to inspect; when n <= 0 nothing is inspected
 *           and the verdict is "Pass" with a maximum deviation of 0
 * expected  value every element is supposed to hold; defaults to 70
 */
void check(const double *z, const int n, const double expected = 70.0)
{
    bool error = false;
    double maxError = 0;

    for (int i = 0; i < n; i++) {
        const double deviation = fabs(z[i] - expected);

        // fmax returns the non-NaN operand, so a NaN deviation does not
        // change maxError; it is caught by the comparison below instead.
        maxError = fmax(maxError, deviation);

        // Written as !(<=) rather than (>): every ordered comparison with
        // NaN is false, so (deviation > EPS) would let a NaN result pass.
        if (!(deviation <= EPS)) {
            error = true;
        }
    }

    printf("%s\n", error ? "Errors" : "Pass");
    printf("最大误差: %lf\n", maxError);
}
// Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Fills two N-element host vectors with 50 and 20, adds them with the add
 * kernel, copies the sum back to the host and passes it to check().
 *
 * Every CUDA runtime call and the kernel launch are checked. On the first
 * failure a message is written to stderr, the remaining steps are skipped and
 * whatever was allocated so far is released.
 *
 * Returns 0 when all CUDA calls succeeded (independent of the verdict that
 * check() prints), 1 otherwise.
 */
int main()
{
    const size_t arraySize = sizeof(double) * N;

    // Start from NULL so the cleanup code can tell which buffers exist.
    double *h_x = NULL, *h_y = NULL, *h_z = NULL;
    double *d_x = NULL, *d_y = NULL, *d_z = NULL;

    // && short-circuits: nothing after the first failing call is executed.
    bool ok = cudaSucceeded(cudaMallocHost(&h_x, arraySize), "cudaMallocHost(h_x)")
           && cudaSucceeded(cudaMallocHost(&h_y, arraySize), "cudaMallocHost(h_y)")
           && cudaSucceeded(cudaMallocHost(&h_z, arraySize), "cudaMallocHost(h_z)")
           && cudaSucceeded(cudaMalloc((void **)&d_x, arraySize), "cudaMalloc(d_x)")
           && cudaSucceeded(cudaMalloc((void **)&d_y, arraySize), "cudaMalloc(d_y)")
           && cudaSucceeded(cudaMalloc((void **)&d_z, arraySize), "cudaMalloc(d_z)");

    if (ok) {
        for (int i = 0; i < N; i++) {
            h_x[i] = 50;
            h_y[i] = 20;
        }

        ok = cudaSucceeded(cudaMemcpy(d_x, h_x, arraySize, cudaMemcpyHostToDevice),
                           "cudaMemcpy(d_x <- h_x)")
          && cudaSucceeded(cudaMemcpy(d_y, h_y, arraySize, cudaMemcpyHostToDevice),
                           "cudaMemcpy(d_y <- h_y)");
    }

    if (ok) {
        dim3 blockSize(128);
        // Round up so that gridSize.x * blockSize.x >= N.
        dim3 gridSize((N + blockSize.x - 1) / blockSize.x);

        add<<<gridSize, blockSize>>>(d_x, d_y, d_z, N);

        // A kernel launch yields no status of its own; an invalid launch
        // configuration is reported through cudaGetLastError().
        ok = cudaSucceeded(cudaGetLastError(), "add kernel launch");
    }

    if (ok) {
        // This blocking copy also surfaces faults raised while the kernel ran.
        ok = cudaSucceeded(cudaMemcpy(h_z, d_z, arraySize, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(h_z <- d_z)");
    }

    if (ok) {
        check(h_z, N);
    }

    // Release whatever was allocated. Return codes are ignored here because
    // nothing useful can be done about a failing free during teardown.
    if (h_x) cudaFreeHost(h_x);
    if (h_y) cudaFreeHost(h_y);
    if (h_z) cudaFreeHost(h_z);
    if (d_x) cudaFree(d_x);
    if (d_y) cudaFree(d_y);
    if (d_z) cudaFree(d_z);

    return ok ? 0 : 1;
}
|