CUDA学习笔记(二)

前言

线程的组织形式对程序的性能影响是至关重要的,本篇博文主要以下面一种情况来介绍线程组织形式:

  • 2D grid 2D block

一些基本的描述:

gridDim.x-线程网络X维度上线程块的数量

gridDim.y-线程网络Y维度上线程块的数量

blockDim.x-一个线程块X维度上的线程数量

blockDim.y-一个线程块Y维度上的线程数量

blockIdx.x-线程网络X维度上的线程块索引

blockIdx.y-线程网络Y维度上的线程块索引

threadIdx.x-线程块X维度上的线程索引

threadIdx.y-线程块Y维度上的线程索引

线程索引

一般来说,矩阵在global memory中是线性存储的,并且按行(row-major)的方式实现线性排列。

 

在kernel里,线程的唯一索引非常有用,为了确定一个线程的索引,我们以2D为例:

  • 线程和block索引
  • 矩阵中元素坐标
  • 线性global memory 的偏移

首先可以将thread和block索引映射到矩阵坐标

ix = threadIdx.x + blockIdx.x * blockDim.x

iy = threadIdx.y + blockIdx.y * blockDim.y

之后可以利用上述变量计算线性地址

idx = iy * nx + ix

 

上图展示了block和thread索引,矩阵坐标以及线性地址之间的关系,谨记,相邻的thread拥有连续的threadIdx.x,也就是索引为(0,0)(1,0)(2,0)(3,0)...的thread连续,而不是(0,0)(0,1)(0,2)(0,3)...连续,跟我们线代里玩矩阵的时候不一样。

现在可以验证出下面的关系:

thread_id (2,1) block_id (1,0) coordinate (6,1) global index 14 ival 14

下图显示了三者之间的关系:

 

代码

 
  1. int main(int argc, char **argv) {

  2.   printf("%s Starting...\n", argv[0]);

  3.   // set up device

  4.   int dev = 0;

  5.   cudaDeviceProp deviceProp;

  6.   CHECK(cudaGetDeviceProperties(&deviceProp, dev));

  7.   printf("Using Device %d: %s\n", dev, deviceProp.name);

  8.   CHECK(cudaSetDevice(dev));

  9.  
  10.   // set up date size of matrix

  11.   int nx = 1<<14;

  12.   int ny = 1<<14;

  13.   int nxy = nx*ny;

  14.   int nBytes = nxy * sizeof(float);

  15.   printf("Matrix size: nx %d ny %d\n",nx, ny);

  16.  
  17.   // malloc host memory

  18.   float *h_A, *h_B, *hostRef, *gpuRef;

  19.   h_A = (float *)malloc(nBytes);

  20.   h_B = (float *)malloc(nBytes);

  21.   hostRef = (float *)malloc(nBytes);

  22.   gpuRef = (float *)malloc(nBytes);

  23.   

  24.   // initialize data at host side

  25.   double iStart = cpuSecond();

  26.   initialData (h_A, nxy);

  27.   initialData (h_B, nxy);

  28.   double iElaps = cpuSecond() - iStart;

  29.   memset(hostRef, 0, nBytes);

  30.   memset(gpuRef, 0, nBytes);

  31.  
  32.   // add matrix at host side for result checks

  33.   iStart = cpuSecond();

  34.   sumMatrixOnHost (h_A, h_B, hostRef, nx,ny);

  35.   iElaps = cpuSecond() - iStart;

  36.  
  37.   // malloc device global memory

  38.   float *d_MatA, *d_MatB, *d_MatC;

  39.   cudaMalloc((void **)&d_MatA, nBytes);

  40.   cudaMalloc((void **)&d_MatB, nBytes);

  41.   cudaMalloc((void **)&d_MatC, nBytes);

  42.   

  43.   // transfer data from host to device

  44.   cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);

  45.   cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);

  46.  
  47.   // invoke kernel at host side

  48.   int dimx = 32;

  49.   int dimy = 32;

  50.   dim3 block(dimx, dimy);

  51.   dim3 grid((nx+block.x-1)/block.x, (ny+block.y-1)/block.y);

  52.   iStart = cpuSecond();

  53.   sumMatrixOnGPU2D <<< grid, block >>>(d_MatA, d_MatB, d_MatC, nx, ny);

  54.   cudaDeviceSynchronize();

  55.   iElaps = cpuSecond() - iStart;

  56.   printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x,

  57.   grid.y, block.x, block.y, iElaps);

  58.  
  59.   // copy kernel result back to host side

  60.   cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);

  61.  
  62.   // check device results

  63.   checkResult(hostRef, gpuRef, nxy);

  64.   

  65.   // free device global memory

  66.   cudaFree(d_MatA);

  67.   cudaFree(d_MatB);

  68.   cudaFree(d_MatC);

  69.  
  70.   // free host memory

  71.   free(h_A);

  72.   free(h_B);

  73.   free(hostRef);

  74.   free(gpuRef);

  75.  
  76.   // reset device

  77.   cudaDeviceReset();

  78.   return (0);

  79. }

编译运行:

 
  1. $ nvcc -arch=sm_20 sumMatrixOnGPU-2D-grid-2D-block.cu -o matrix2D

  2. $ ./matrix2D

输出:

 
  1. ./a.out Starting...

  2. Using Device 0: Tesla M2070

  3. Matrix size: nx 16384 ny 16384

  4. sumMatrixOnGPU2D <<<(512,512), (32,32)>>> elapsed 0.060323 sec

  5. Arrays match.

接下来,我们更改block配置为32x16,重新编译,输出为:

sumMatrixOnGPU2D <<<(512,1024), (32,16)>>> elapsed 0.038041 sec

可以看到,性能有明显提升(0.060 s → 0.038 s,约为原来的1.6倍)。直观来看,第二个配置的block数量是第一个的两倍,我们会认为性能提升源于block增多,实际上也确实是因为block增加了。但是,如果继续增加block的数量,性能反而会下降:

sumMatrixOnGPU2D <<< (1024,1024), (16,16) >>> elapsed 0.045535 sec

下图展示了不同配置下的性能对比:

 

关于性能的分析将在之后的博文中总结,现在只是了解下,本文在于掌握线程组织的方法。

猜你喜欢

转载自blog.csdn.net/qq_36387683/article/details/81125418