cuda初始化代码

#include <stdio.h>
#include <cuda_runtime.h>

/*
 * CUDA_initial - select the first CUDA device whose properties can be
 * queried and make it the current device for the calling host thread.
 *
 * Steps:
 *   1. Ask the runtime how many devices exist (cudaGetDeviceCount).
 *   2. Walk the devices in index order; for the first one whose
 *      cudaGetDeviceProperties() call succeeds, print its properties and
 *      remember its index.
 *   3. Make that device current with cudaSetDevice().
 *
 * Returns true when a device was selected and set, false when the device
 * count could not be obtained, no device exists, no device's properties
 * could be read, or cudaSetDevice() failed. A diagnostic line is printed to
 * stdout in every failure case.
 */
bool CUDA_initial(void)
{
    int device_count = 0;
    cudaError_t err = cudaGetDeviceCount(&device_count);
    if (err != cudaSuccess)
    {
        /* Report the real cause instead of pretending the count was zero. */
        printf(" There is zero device beyond 1.0 (%s)\n", cudaGetErrorString(err));
        return false;
    }
    if (device_count <= 0)
    {
        printf(" There is zero device beyond 1.0\n");
        return false;
    }
    printf("There is %d device beyond 1.0\n", device_count);

    /* Index of the chosen device; -1 means "none found yet". */
    int selected = -1;
    for (int i = 0; i < device_count; i++)
    {
        struct cudaDeviceProp device_prop;
        if (cudaGetDeviceProperties(&device_prop, i) != cudaSuccess)
            continue;   /* try the next device */

        /* size_t members must be printed with %zu; %d is undefined
         * behaviour where size_t is wider than int and would corrupt every
         * argument that follows. */
        printf("device properties is :\n"
               "\t device name is %s\n"
               "\t totalGlobalMem is %zu\n"
               "\t sharedMemPerBlock is %zu\n"
               "\t regsPerBlock is %d\n"
               "\t warpSize is %d\n"
               "\t memPitch is %zu\n"
               "\t maxThreadsPerBlock is %d\n"
               "\t maxThreadsDim [3] is %d X %d X %d\n"
               "\t maxGridSize [3] is %d X %d X %d\n"
               "\t totalConstMem is %zu\n"
               "\t device version is major %d ,minor %d\n"
               "\t clockRate is %d\n"
               "\t textureAlignment is %zu\n"
               "\t deviceOverlap is %d\n"
               "\t multiProcessorCount is %d\n",
               device_prop.name,
               device_prop.totalGlobalMem,
               device_prop.sharedMemPerBlock,
               device_prop.regsPerBlock,
               device_prop.warpSize,
               device_prop.memPitch,
               device_prop.maxThreadsPerBlock,
               device_prop.maxThreadsDim[0], device_prop.maxThreadsDim[1], device_prop.maxThreadsDim[2],
               device_prop.maxGridSize[0], device_prop.maxGridSize[1], device_prop.maxGridSize[2],
               device_prop.totalConstMem,
               device_prop.major, device_prop.minor,
               device_prop.clockRate,
               device_prop.textureAlignment,
               device_prop.deviceOverlap,
               device_prop.multiProcessorCount);

        selected = i;
        break;          /* the first usable device wins */
    }

    if (selected < 0)
    {
        printf("Get the propertites of device occurred error\n");
        return false;
    }

    /* Treat every non-success status as failure, not only
     * cudaErrorInvalidDevice. */
    err = cudaSetDevice(selected);
    if (err != cudaSuccess)
    {
        printf("Set Device occurred error (%s)\n", cudaGetErrorString(err));
        return false;
    }

    return true;
}

/*
 * Program entry point: runs CUDA_initial() once.
 *
 * Returns 0 after printing a success line when initialization succeeded,
 * and 1 when it failed (CUDA_initial() has already printed the reason), so
 * that callers and scripts can detect the failure from the exit status.
 */
int main()
{
    if (!CUDA_initial())
        return 1;

    printf("CUDA initial successed!\n");
    return 0;
}

其中遇到的一些函数解释：

1.1.1 cudaGetDeviceCount

名称

cudaGetDeviceCount – 返回具有计算能力的设备的数量

概要

cudaError_t cudaGetDeviceCount( int* count )

说明

以*count形式返回计算能力大于等于1.0、可用于执行的设备数量。如果不存在此类设备，cudaGetDeviceCount()会将*count置为1，且设备0仅支持设备模拟模式。由于该设备能够模拟所有硬件特性，它报告的主要和次要计算能力版本号均为9999。

返回值

1.1.2 cudaSetDevice

名称

cudaSetDevice – 设置设备以供GPU执行使用

概要

cudaError_t cudaSetDevice(int dev)

说明

将dev记录为当前活动主机线程执行设备代码时所使用的设备。

返回值

1.1.3 cudaGetDevice

名称

cudaGetDevice – 返回当前使用的设备

概要

cudaError_t cudaGetDevice(int *dev)

说明

以*dev形式返回当前活动主机线程执行设备代码时所使用的设备。

返回值

1.1.4 cudaGetDeviceProperties

名称

cudaGetDeviceProperties – 返回关于计算设备的信息

概要

cudaError_t cudaGetDeviceProperties( struct cudaDeviceProp* prop, int dev )

说明

以*prop形式返回设备dev的属性。cudaDeviceProp结构定义如下：

struct cudaDeviceProp {
char name [256];

size_t totalGlobalMem;
size_t sharedMemPerBlock;
int regsPerBlock;

int warpSize;

size_t memPitch;

int maxThreadsPerBlock;
int maxThreadsDim [3];
int maxGridSize [3];
size_t totalConstMem;
int major;

int minor;

int clockRate;

size_t textureAlignment;
int deviceOverlap;
int multiProcessorCount;

};

其中：

name

用于标识设备的ASCII字符串；

totalGlobalMem

设备上可用的全局存储器的总量，以字节为单位；

sharedMemPerBlock

线程块可以使用的共享存储器的最大值，以字节为单位；多处理器上的所有线程块可以同时共享这些存储器；

regsPerBlock

线程块可以使用的32位寄存器的最大值；多处理器上的所有线程块可以同时共享这些寄存器；

warpSize

按线程计算的warp块大小；
memPitch

对于涉及通过cudaMallocPitch()分配的存储器区域的存储器复制函数，所允许的最大间距（pitch），以字节为单位；

maxThreadsPerBlock

每个块中的最大线程数；

maxThreadsDim[3]

块各个维度的最大值；

maxGridSize[3]

网格各个维度的最大值；

totalConstMem

设备上可用的常量存储器（constant memory）总量，以字节为单位；
major，minor

定义设备计算能力的主要修订号和次要修订号；
clockRate

以千赫为单位的时钟频率；

textureAlignment

对齐要求；与textureAlignment字节对齐的纹理基址无需对纹理取样应用偏移；

deviceOverlap

如果设备可在主机和设备之间并发复制存储器，同时又能执行内核，则此值为 1；否则此值为 0；

multiProcessorCount

设备上多处理器的数量。

返回值

1.1.1 cudaGetDeviceCount

1.1.2 cudaSetDevice

1.1.3 cudaGetDevice

1.1.4 cudaGetDeviceProperties

猜你喜欢