▶ 使用 kernels 导语并行化 for 循环
● 一重循环
#include <stdio.h>
#include <time.h>
#include <openacc.h>
/* One-loop vector add c = a + b, timed with or without OpenACC; the listing's line layout is kept (pragma on line 16, loop on line 17). */
const int row = 128 * 256 * 512;
/* NOTE(review): row is not a C constant expression, so a/b/c are huge automatic VLAs (64 MiB each if int is 4 bytes); may overflow the stack on some toolchains. */
int main(void)
{
    int i, a[row], b[row], c[row];
    clock_t ticks;                          /* clock() readings, in clock ticks */
    for (i = 0; i < row; i++)               /* fill both inputs with the element index */
        a[i] = b[i] = i;
    /* Exactly one of the two timed sections below is compiled, depending on whether the compiler defines _OPENACC. */
#ifdef _OPENACC
    ticks = clock();
#pragma acc kernels
    for (i = 0; i < row; i++)
        c[i] = a[i] + b[i];
    ticks = clock() - ticks;
    printf("\nTime with acc:%ld ms\n", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC)); /* ticks -> ms; clock_t must not be passed to %d */
#else
    ticks = clock();
    for (i = 0; i < row; i++)
        c[i] = a[i] + b[i];
    ticks = clock() - ticks;
    printf("\nTime without acc:%ld ms\n", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC)); /* ticks -> ms; clock_t must not be passed to %d */
#endif
    getchar();                              /* wait for input before exiting (presumably to keep the console window open) */
    return 0;
}
● 输出结果
D:\Code\OpenACC\OpenACCProject\OpenACCProject>set PGI_ACC_NOTIFY=1    // 设置环境变量,让 PGI 的 OpenACC 运行时在程序执行时输出每次 CUDA 内核启动的执行配置
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe    // 编译,-Minfo 要求输出编译优化信息
main:
     16, Generating implicit copyin(b[:row])    // 在第 16 行的位置添加了导语和子句
         Generating implicit copyout(c[:row])
         Generating implicit copyin(a[:row])
     17, Loop is parallelizable    // 启用并行优化
         Accelerator kernel generated
         Generating Tesla code
         17, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */    // 使用默认 vector 尺寸,注释是自动生成的
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -Minfo main.c -o main_no_acc.exe    // 非 OpenACC 优化的编译,没有额外输出信息
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h    // 这里本来是一行,我把它拆开了
    function=main line=17 device=0 threadid=1 num_gangs=65535 num_workers=1 vector_length=128 grid=65535 block=128
    // 对代码第 17 行的 for 进行了并行优化,使用第 0 号设备(GPU)
    // 线程编号 1,使用 65535 个 gang,1 个 worker,vector 宽度为 128
    // CUDA 配置为 gridDim.x = 65535,blockDim.x = 128
Time with acc:300 ms
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_no_acc.exe
Time without acc:27 ms    // 计算量相对于数据传输量太小(没有较高的计算/传输比),串行反而较快
● 二重循环
#include <stdio.h>
#include <time.h>
#include <openacc.h>
/* Two-loop matrix add c = a + b, timed with or without OpenACC; the listing's line layout is kept (pragma on line 19, loops on lines 20 and 22). */
const int row = 128 * 256, col = 512;
/* NOTE(review): row/col are not C constant expressions, so a/b/c are huge automatic VLAs (64 MiB each if int is 4 bytes); may overflow the stack on some toolchains. */
int main(void)
{
    int i, j, a[row][col], b[row][col], c[row][col];
    clock_t ticks;                          /* clock() readings, in clock ticks */
    for (i = 0; i < row; i++)               /* fill both inputs with i * j */
    {
        for (j = 0; j < col; j++)
            a[i][j] = b[i][j] = i * j;
    }
    /* Exactly one of the two timed sections below is compiled, depending on whether the compiler defines _OPENACC. */
#ifdef _OPENACC
    ticks = clock();
#pragma acc kernels
    for (i = 0; i < row; i++)
    {
        for (j = 0; j < col; j++)
            c[i][j] = a[i][j] + b[i][j];
    }
    ticks = clock() - ticks;
    printf("\nTime with acc:%ld ms\n", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC)); /* ticks -> ms; clock_t must not be passed to %d */
#else
    ticks = clock();
    for (i = 0; i < row; i++)
    {
        for (j = 0; j < col; j++)
            c[i][j] = a[i][j] + b[i][j];
    }
    ticks = clock() - ticks;
    printf("\nTime without acc:%ld ms\n", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC)); /* ticks -> ms; clock_t must not be passed to %d */
#endif
    getchar();                              /* wait for input before exiting (presumably to keep the console window open) */
    return 0;
}
● 输出结果
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
main:
     19, Generating implicit copyin(a[:row][:col])
         Generating implicit copyout(c[:row][:col])
         Generating implicit copyin(b[:row][:col])
     20, Loop is parallelizable
     22, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         20, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */    // 高一层的循环的 vector(4) 映射到 threadIdx.y,对应运行时输出中的 num_workers=4
         22, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -Minfo main.c -o main_no_acc.exe
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h
    function=main line=22 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=16x2048 block=32x4
    // 注意参数变化,仍有 num_gangs = grid(16 × 2048 = 32768),num_workers * vector_length = block(4 × 32 = 128)
Time with acc:310 ms
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_no_acc.exe
Time without acc:36 ms
● 三重循环
#include <stdio.h>
#include <time.h>
#include <openacc.h>
/* Three-loop array add c = a + b, timed with or without OpenACC; the listing's line layout is kept (pragma on line 22, loops on lines 23, 25 and 27). */
const int row = 128, col = 256, page = 512;
/* NOTE(review): row/col/page are not C constant expressions, so a/b/c are huge automatic VLAs (64 MiB each if int is 4 bytes); may overflow the stack on some toolchains. */
int main(void)
{
    int i, j, k, a[row][col][page], b[row][col][page], c[row][col][page];
    clock_t ticks;                          /* clock() readings, in clock ticks */
    for (i = 0; i < row; i++)               /* fill both inputs with i * j + k */
    {
        for (j = 0; j < col; j++)
        {
            for (k = 0; k < page; k++)
                a[i][j][k] = b[i][j][k] = i * j + k;
        }
    }
    /* Exactly one of the two timed sections below is compiled, depending on whether the compiler defines _OPENACC. */
#ifdef _OPENACC
    ticks = clock();
#pragma acc kernels
    for (i = 0; i < row; i++)
    {
        for (j = 0; j < col; j++)
        {
            for (k = 0; k < page; k++)
                c[i][j][k] = a[i][j][k] + b[i][j][k];
        }
    }
    ticks = clock() - ticks;
    printf("\nTime with acc:%ld ms\n", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC)); /* ticks -> ms; clock_t must not be passed to %d */
#else
    ticks = clock();
    for (i = 0; i < row; i++)
    {
        for (j = 0; j < col; j++)
        {
            for (k = 0; k < page; k++)
                c[i][j][k] = a[i][j][k] + b[i][j][k];
        }
    }
    ticks = clock() - ticks;
    printf("\nTime without acc:%ld ms\n", (long)((double)ticks * 1000.0 / CLOCKS_PER_SEC)); /* ticks -> ms; clock_t must not be passed to %d */
#endif
    getchar();                              /* wait for input before exiting (presumably to keep the console window open) */
    return 0;
}
● 输出结果
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
main:
     22, Generating implicit copyin(b[:row][:col][:page])
         Generating implicit copyout(c[:row][:col][:page])
         Generating implicit copyin(a[:row][:col][:page])
     23, Loop is parallelizable
     25, Loop is parallelizable
     27, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         23, #pragma acc loop gang /* blockIdx.y */    // 最外层循环只使用 gang(映射到 blockIdx.y),没有 vector 部分,而是通过增加 grid 的维度来并行
         25, #pragma acc loop gang, vector(4) /* blockIdx.z threadIdx.y */
         27, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -Minfo main.c -o main_no_acc.exe
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h
    function=main line=27 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=16x128x16 block=32x4    // grid 变成了三维(16 × 128 × 16 = 32768 = num_gangs)
Time with acc:304 ms
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_no_acc.exe
Time without acc:54 ms