NVIDIA CUDA 2023 Spring Bootcamp (5): CUDA Vector Addition and Matrix Multiplication

Vector Addition with a 1D Grid and 1D Blocks

Basic Implementation
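The kernel uses a grid-stride loop: each thread starts at its global index and advances by the total thread count of the grid, so a fixed launch configuration can process an array of any length. Host buffers are allocated as page-locked (pinned) memory via cudaMallocHost, which allows faster transfers to and from the device than ordinary pageable memory.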

#include <stdio.h>
#include <math.h>
#define N 100
const double EPS = 1E-6;

__global__ void add(const double *x, const double *y, double *z, int n) {
    // Global thread index
    const int index = blockDim.x * blockIdx.x + threadIdx.x;

    // Grid-stride loop: step by the total number of threads in the grid
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        z[i] = x[i] + y[i];
    }
}

// Error check: every element should equal 50 + 20 = 70
void check(const double *z, const int n) {
    bool error = false;
    double maxError = 0;

    for (int i = 0; i < n; i++) {
        maxError = fmax(maxError, fabs(z[i] - 70));
        if (fabs(z[i] - 70) > EPS) {
            error = true;
        }
    }

    printf("%s\n", error ? "Errors" : "Pass");
    printf("Max error: %lf\n", maxError);
}

int main() {
    const int arraySize = sizeof(double) * N;

    // Allocate page-locked (pinned) host memory
    double *h_x, *h_y, *h_z;
    cudaMallocHost(&h_x, arraySize);
    cudaMallocHost(&h_y, arraySize);
    cudaMallocHost(&h_z, arraySize);

    // Initialize the data
    for (int i = 0; i < N; i++) {
        h_x[i] = 50;
        h_y[i] = 20;
    }

    // Allocate device memory
    double *d_x, *d_y, *d_z;
    cudaMalloc((void **)&d_x, arraySize);
    cudaMalloc((void **)&d_y, arraySize);
    cudaMalloc((void **)&d_z, arraySize);

    // Copy input data from host to device
    cudaMemcpy(d_x, h_x, arraySize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, arraySize, cudaMemcpyHostToDevice);

    // Kernel launch configuration
    dim3 blockSize(128);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);

    // Launch the kernel
    add<<<gridSize, blockSize>>>(d_x, d_y, d_z, N);

    // Copy the result from device back to host
    // (this cudaMemcpy synchronizes implicitly, so no explicit sync is needed)
    cudaMemcpy(h_z, d_z, arraySize, cudaMemcpyDeviceToHost);

    // Verify the result
    check(h_z, N);

    // Free pinned host memory
    cudaFreeHost(h_x);
    cudaFreeHost(h_y);
    cudaFreeHost(h_z);

    // Free device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);

    return 0;
}
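All the listings in this post build with the usual nvcc workflow. Assuming this one is saved as vector_add.cu (a hypothetical file name), nvcc vector_add.cu -o vector_add compiles it, and running the binary should print Pass with a max error of 0.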

Unified Memory Implementation
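With cudaMallocManaged, a single allocation is visible to both host and device, so the explicit cudaMemcpy calls disappear. Kernel launches are asynchronous, though: the previous version synchronized implicitly through the device-to-host cudaMemcpy, whereas here an explicit cudaDeviceSynchronize() is required before the host reads z.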

#include <stdio.h>
#include <math.h>
#define N 100
const double EPS = 1E-6;

__global__ void add(const double *x, const double *y, double *z, int n) {
    // Global thread index
    const int index = blockDim.x * blockIdx.x + threadIdx.x;

    // Grid-stride loop: step by the total number of threads in the grid
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        z[i] = x[i] + y[i];
    }
}

// Error check: every element should equal 50 + 20 = 70
void check(const double *z, const int n) {
    bool error = false;
    double maxError = 0;

    for (int i = 0; i < n; i++) {
        maxError = fmax(maxError, fabs(z[i] - 70));
        if (fabs(z[i] - 70) > EPS) {
            error = true;
        }
    }

    printf("%s\n", error ? "Errors" : "Pass");
    printf("Max error: %lf\n", maxError);
}

int main() {
    // Allocate unified memory
    const int arraySize = sizeof(double) * N;
    double *x, *y, *z;
    cudaMallocManaged((void**)&x, arraySize);
    cudaMallocManaged((void**)&y, arraySize);
    cudaMallocManaged((void**)&z, arraySize);

    // Initialize the data
    for (int i = 0; i < N; i++) {
        x[i] = 50;
        y[i] = 20;
    }

    // Kernel launch configuration
    dim3 blockSize(128);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);

    // Launch the kernel
    add<<<gridSize, blockSize>>>(x, y, z, N);

    // Wait for the kernel to finish before the host touches z
    cudaDeviceSynchronize();

    // Verify the result
    check(z, N);

    // Free unified memory
    cudaFree(x);
    cudaFree(y);
    cudaFree(z);

    return 0;
}
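As a possible refinement, managed memory can be migrated to the GPU ahead of the launch with cudaMemPrefetchAsync, avoiding demand-paging faults on first access. A minimal sketch, assuming a single GPU (device 0) that supports managed-memory prefetching:

// Hint the driver to migrate the managed buffers to device 0 before
// the kernel runs (sketch; unsupported devices simply fall back to
// demand paging)
int device = 0;
cudaMemPrefetchAsync(x, arraySize, device, 0);
cudaMemPrefetchAsync(y, arraySize, device, 0);
cudaMemPrefetchAsync(z, arraySize, device, 0);
add<<<gridSize, blockSize>>>(x, y, z, N);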

Matrix Multiplication with a 2D Grid and 2D Blocks

Basic Implementation
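Each thread computes a single element of the result: row comes from the y indices and col from the x indices, matching the dim3 grid(grid_cols, grid_rows) configuration below. All matrices are stored row-major, so element $(i, j)$ of the $m \times k$ result lives at offset $i \cdot k + j$.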

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define BLOCK_SIZE 16
const double EPS = 1E-6;

// GPU matrix multiplication
__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k) {

    // Global coordinates of this thread, i.e. the row and column in the result matrix
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int sum = 0;
    if (row < m && col < k) {
        for (int i = 0; i < n; i++) {
            sum += a[row * n + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
}

// CPU matrix multiplication
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            int sum = 0;
            for (int h = 0; h < n; h++) {
                sum += h_a[i * n + h] * h_b[h * k + j];
            }
            h_result[i * k + j] = sum;
        }
    }
}

// Compare the GPU result against the CPU result
void check(int *h_c_cpu, int *h_c_gpu, int m, int k) {
    bool error = false;
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            if (fabs(h_c_gpu[i * k + j] - h_c_cpu[i * k + j]) > EPS) {
                error = true;
            }
        }
    }
    printf("Check: %s\n", error ? "Errors" : "Pass");
}

int main() {
    int m = 5;
    int n = 5;
    int k = 5;

    // Allocate unified memory
    int *a, *b, *c_gpu, *c_cpu;

    cudaMallocManaged((void **) &a, sizeof(int) * m * n);
    cudaMallocManaged((void **) &b, sizeof(int) * n * k);
    cudaMallocManaged((void **) &c_gpu, sizeof(int) * m * k);
    cudaMallocManaged((void **) &c_cpu, sizeof(int) * m * k);

    // Initialize the data
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < k; j++) {
            b[i * k + j] = rand() % 1024;
        }
    }

    // Kernel launch configuration
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 grid(grid_cols, grid_rows);
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    // Launch the kernel
    gpu_matrix_mult<<<grid, block>>>(a, b, c_gpu, m, n, k);

    // Wait for the kernel to finish
    cudaDeviceSynchronize();

    // Print the GPU result
    printf("GPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_gpu[i * k + j]);
        }
        printf("\n");
    }

    // CPU matrix product
    cpu_matrix_mult(a, b, c_cpu, m, n, k);

    // Print the CPU result
    printf("CPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_cpu[i * k + j]);
        }
        printf("\n");
    }

    // Verify the result
    check(c_cpu, c_gpu, m, k);

    // Free unified memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(c_cpu);
    cudaFree(c_gpu);

    return 0;
}

Shared-Memory-Optimized Implementation

For matrix multiplication, suppose matrix $M$ is $m \times n$, matrix $N$ is $n \times k$, and the resulting matrix $P$ is $m \times k$.

For example, an element $M(\text{row}, \text{col})$ is read from global memory once by every thread in the grid whose index satisfies row = blockIdx.y * blockDim.y + threadIdx.y, i.e. $k$ times in total, once per thread computing that row of $P$.

Rather than repeating all these global-memory reads, the value can be staged in shared memory, which cuts the access time on every reuse.
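Concretely, each block loads the tiles it needs into shared memory once per pass over the shared dimension, so an element of $M$ is fetched from global memory only $\lceil k / \text{BLOCK\_SIZE} \rceil$ times instead of $k$ times, roughly a BLOCK_SIZE-fold reduction in global-memory traffic for large matrices.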

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define BLOCK_SIZE 16
const double EPS = 1E-6;

// GPU matrix multiplication using shared memory
__global__ void gpu_matrix_mult_shared(int *a, int *b, int *c, int m, int n, int k) {

    // Global coordinates of this thread, i.e. the row and column in the result matrix
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Shared-memory tiles, one pair per block
    __shared__ int s_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int s_b[BLOCK_SIZE][BLOCK_SIZE];

    int sum = 0;
    // Walk over the tiles along the shared dimension n;
    // sub * BLOCK_SIZE is the offset of the current tile
    int numTiles = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int sub = 0; sub < numTiles; sub++) {
        /** index_a: the element of a this thread loads
         *  - sub * BLOCK_SIZE + threadIdx.x: its column within a
         *  - row * n: offset of the start of this thread's row
         **/
        int index_a = (sub * BLOCK_SIZE + threadIdx.x) + row * n;
        if (row < m && (sub * BLOCK_SIZE + threadIdx.x) < n) {
            s_a[threadIdx.y][threadIdx.x] = a[index_a];
        } else {
            s_a[threadIdx.y][threadIdx.x] = 0;   // pad out-of-range entries with 0
        }
        /** index_b: the element of b this thread loads
         *  - sub * BLOCK_SIZE + threadIdx.y: its row within b (row stride is k)
         *  - col: its column within b
         **/
        int index_b = col + k * (sub * BLOCK_SIZE + threadIdx.y);
        if (col < k && (sub * BLOCK_SIZE + threadIdx.y) < n) {
            s_b[threadIdx.y][threadIdx.x] = b[index_b];
        } else {
            s_b[threadIdx.y][threadIdx.x] = 0;
        }

        // Synchronize so the whole tile is loaded before anyone reads it
        __syncthreads();

        // Accumulate the partial dot product from shared memory
        for (int i = 0; i < BLOCK_SIZE; i++) {
            sum += s_a[threadIdx.y][i] * s_b[i][threadIdx.x];
        }
        // Synchronize again so no thread starts overwriting the tile
        // while others are still reading it
        __syncthreads();
    }

    if (row < m && col < k) {
        c[row * k + col] = sum;
    }
}

// CPU matrix multiplication
void cpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            int sum = 0;
            for (int h = 0; h < n; h++) {
                sum += a[i * n + h] * b[h * k + j];
            }
            c[i * k + j] = sum;
        }
    }
}

// Compare the GPU result against the CPU result
void check(int *c_cpu, int *c_gpu, int m, int k) {
    bool error = false;
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            if (fabs(c_gpu[i * k + j] - c_cpu[i * k + j]) > EPS) {
                error = true;
            }
        }
    }
    printf("Check: %s\n", error ? "Errors" : "Pass");
}

int main() {
    int m = 5;
    int n = 5;
    int k = 5;

    // Allocate unified memory
    int *a, *b, *c_gpu, *c_cpu;

    cudaMallocManaged((void **) &a, sizeof(int) * m * n);
    cudaMallocManaged((void **) &b, sizeof(int) * n * k);
    cudaMallocManaged((void **) &c_gpu, sizeof(int) * m * k);
    cudaMallocManaged((void **) &c_cpu, sizeof(int) * m * k);

    // Initialize the data
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < k; j++) {
            b[i * k + j] = rand() % 1024;
        }
    }

    // Kernel launch configuration
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 grid(grid_cols, grid_rows);
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    // Launch the kernel
    gpu_matrix_mult_shared<<<grid, block>>>(a, b, c_gpu, m, n, k);

    // Wait for the kernel to finish
    cudaDeviceSynchronize();

    // Print the GPU result
    printf("GPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_gpu[i * k + j]);
        }
        printf("\n");
    }

    // CPU matrix product
    cpu_matrix_mult(a, b, c_cpu, m, n, k);

    // Print the CPU result
    printf("CPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_cpu[i * k + j]);
        }
        printf("\n");
    }

    // Verify the result
    check(c_cpu, c_gpu, m, k);

    // Free unified memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(c_cpu);
    cudaFree(c_gpu);

    return 0;
}
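Two details of the kernel are worth noting. Out-of-range threads store 0 into the tiles, so the inner loop can always run the full BLOCK_SIZE iterations without bounds checks; the zeros contribute nothing to the sum. And both __syncthreads() calls are load-bearing: the first guarantees the tile is fully populated before any thread reads it, while the second keeps a fast thread from overwriting the tile for the next pass while a slower one is still reading.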

Square-Matrix Transpose with a 2D Grid and 2D Blocks
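In the naive kernel below, reads of a are coalesced, but writes to b have a stride of N elements. The shared-memory version stages a BLOCK_SIZE x BLOCK_SIZE tile per block so that both the global reads and the global writes touch consecutive addresses; CUDA events are used to time the two variants.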

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define N 10
#define BLOCK_SIZE 16
const double EPS = 1E-6;

// Transpose the N*N matrix a into matrix b
__global__ void transposition(int *a, int *b) {
    // Global indices
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < N && y < N) {
        b[N * x + y] = a[N * y + x];
    }
}

// Transpose the N*N matrix a into matrix b, shared-memory version
__global__ void transposition_shared(int *a, int *b) {

    // Shared-memory tile
    __shared__ int s[BLOCK_SIZE][BLOCK_SIZE];

    // Indices into the source matrix a
    int a_x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int a_y = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    if (a_x < N && a_y < N) {
        s[threadIdx.y][threadIdx.x] = a[N * a_y + a_x];
    }

    __syncthreads();

    // Indices into the destination matrix b
    int b_x = blockIdx.x * BLOCK_SIZE + threadIdx.y;
    int b_y = blockIdx.y * BLOCK_SIZE + threadIdx.x;

    if (b_x < N && b_y < N) {
        b[N * b_x + b_y] = s[threadIdx.x][threadIdx.y];
    }
}

// Compare the two transposed results
void check(int *a, int *b) {
    bool error = false;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            if (fabs(a[i * N + j] - b[i * N + j]) > EPS) {
                error = true;
            }
        }
    }
    printf("Check: %s\n", error ? "Errors" : "Pass");
}

void print(int *arr) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", arr[i * N + j]);
        }
        printf("\n");
    }
}

int main() {
    // Allocate unified memory
    int *a, *b, *b_shared;
    cudaMallocManaged(&a, sizeof(int) * N * N);
    cudaMallocManaged(&b, sizeof(int) * N * N);
    cudaMallocManaged(&b_shared, sizeof(int) * N * N);

    // Initialize the N*N matrix a
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i * N + j] = rand() % 1024;
        }
    }

    // Create timing events
    float time, time_shared;
    cudaEvent_t start, stop;
    cudaEvent_t start_shared, stop_shared;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventCreate(&start_shared);
    cudaEventCreate(&stop_shared);

    // Kernel launch configuration
    unsigned int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 grid(grid_size, grid_size);
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    // Transpose without shared memory
    cudaEventRecord(start);
    transposition<<<grid, block>>>(a, b);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GPU time, transpose without shared memory: %g ms\n", time);

    // Transpose with shared memory
    cudaEventRecord(start_shared);
    transposition_shared<<<grid, block>>>(a, b_shared);
    cudaEventRecord(stop_shared);
    cudaEventSynchronize(stop_shared);
    cudaEventElapsedTime(&time_shared, start_shared, stop_shared);
    printf("GPU time, transpose with shared memory: %g ms\n", time_shared);

    // Print the results
    printf("Original matrix:\n");
    print(a);
    printf("Transpose without shared memory:\n");
    print(b);
    printf("Transpose with shared memory:\n");
    print(b_shared);
    check(b, b_shared);

    // Destroy the events
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaEventDestroy(start_shared);
    cudaEventDestroy(stop_shared);

    // Free unified memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(b_shared);

    return 0;
}
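One further refinement, not applied above: in transposition_shared, the read s[threadIdx.x][threadIdx.y] walks down a column of the tile, so threads of a warp collide on a handful of shared-memory banks (the row stride of 16 ints maps many rows to the same bank). The standard fix is to pad the tile by one column; a minimal sketch:

// Padding shifts each row of the tile into a different shared-memory
// bank, so the column-wise reads during the transposed write no longer
// conflict (standard trick; the rest of the kernel is unchanged)
__shared__ int s[BLOCK_SIZE][BLOCK_SIZE + 1];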