#include<stdlib.h>
#include<string.h>
#include<time.h>
#include<stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<Windows.h>
/*
 * CHECK(call): evaluate a CUDA runtime call exactly once and abort the
 * program if it did not return cudaSuccess.
 *
 * On failure the source location, the numeric error code and the text from
 * cudaGetErrorString() are written to stderr, then the process exits with
 * status 1.
 *
 * The body is a single do { ... } while (0) statement so that `CHECK(x);`
 * behaves like an ordinary statement (safe inside unbraced if/else). Every
 * line of the definition must end in a backslash; a missing continuation
 * silently truncates the macro and spills the rest to file scope.
 */
#define CHECK(call)                                                          \
    do {                                                                     \
        const cudaError_t check_err_ = (call);                               \
        if (check_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);           \
            fprintf(stderr, "code:%d, reason: %s\n", (int)check_err_,        \
                    cudaGetErrorString(check_err_));                         \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
/*
 * CPU reference implementation of element-wise addition.
 *
 * Writes C[k] = A[k] + B[k] for every k in [0, N). Elements at index N and
 * beyond are not touched; a non-positive N performs no work.
 */
void sumArraysOnHost(float* A, float* B, float* C, const int N) {
    int pos = 0;
    while (pos < N) {
        const float lhs = A[pos];
        const float rhs = B[pos];
        C[pos] = lhs + rhs;
        ++pos;
    }
}
/*
 * Element-wise addition kernel: C[i] = A[i] + B[i] for every i in [0, N).
 *
 * Launch layout: 1-D grid of 1-D blocks (only the .x components are used).
 * The kernel uses a grid-stride loop, so any grid/block size produces a
 * complete result: each thread starts at its flat global index and advances
 * by the total number of launched threads until it passes N. When the grid
 * already covers N (one thread per element) every thread runs the loop body
 * at most once.
 *
 * A, B and C must be device-accessible pointers to at least N floats.
 * No shared memory is used.
 */
__global__ void sumArraysOnGPU(float* A, float* B, float* C,const int N) {
    // 64-bit arithmetic: the 32-bit product blockIdx.x * blockDim.x would
    // wrap (or go negative once stored in an int) for very large launches.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < N; i += stride) {
        C[i] = A[i] + B[i];
    }
}
/*
 * Fill ip[0 .. size) with pseudo-random values.
 *
 * Each element is (rand() & 0xFF) / 10.0f, i.e. one of the 256 values
 * 0.0f, 0.1f, ..., 25.5f. A non-positive size writes nothing.
 *
 * The generator is seeded from the wall clock on the FIRST call only.
 * Reseeding on every call with a one-second-resolution clock restarts the
 * same sequence whenever two calls fall within the same second, which made
 * back-to-back calls fill different arrays with identical data.
 */
void initialData(float* ip, int size) {
    static int seeded = 0;
    if (!seeded) {
        srand((unsigned int)time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
/*
 * Debug kernel: every launched thread prints one line containing its own
 * threadIdx and blockIdx together with the launch's blockDim and gridDim
 * (x, y and z components of each). Takes no arguments and writes no memory.
 */
__global__ void checkIndex(void) {
    printf("threadIdx:(%d, %d, %d) "
           "blockIdx:(%d, %d, %d) "
           "blockDim:(%d, %d, %d) "
           "gridDim:(%d, %d, %d)\n",
           threadIdx.x, threadIdx.y, threadIdx.z,
           blockIdx.x, blockIdx.y, blockIdx.z,
           blockDim.x, blockDim.y, blockDim.z,
           gridDim.x, gridDim.y, gridDim.z);
}
/*
 * Compare two float arrays element by element and report the outcome on
 * stdout.
 *
 * hostRef / gpuRef : arrays of at least N floats (neither is modified).
 * N                : number of elements to compare.
 *
 * Two elements agree when they compare equal, or when their difference lies
 * within +/- 1.0E-8. At the first disagreement the function prints
 * "Arrays do not match!" followed by both values and the index, then stops.
 * If no disagreement is found (including N <= 0) it prints "Arrays match.".
 *
 * NaN handling: a NaN on either side is reported as a mismatch. The
 * tolerance test is written as a negated "within range" check because every
 * ordered comparison against NaN is false, so a plain "diff > eps ||
 * diff < -eps" test would let NaN through as a match.
 */
void checkResult(float* hostRef, float* gpuRef, const int N) {
    const double epsilon = 1.0E-8;
    for (int i = 0; i < N; i++) {
        // Exact equality first: keeps equal infinities matching, whose
        // difference would otherwise be NaN.
        if (hostRef[i] == gpuRef[i]) {
            continue;
        }
        const double diff = hostRef[i] - gpuRef[i];
        if (!(diff <= epsilon && diff >= -epsilon)) {
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n",
                   hostRef[i], gpuRef[i], i);
            return;
        }
    }
    printf("Arrays match.\n\n");
}
/*
 * Driver: adds two arrays of 65536 * 1024 floats on the CPU and on the GPU,
 * prints the elapsed wall-clock time of each path and then compares the two
 * results.
 *
 * What each timer covers (unchanged from the original sample):
 *   "cpu cost" - host allocation, random initialisation of both inputs and
 *                the host-side sum.
 *   "gpu cost" - device allocation, both host-to-device copies, the kernel
 *                and the device-to-host copy of the result.
 *
 * Returns 0 on success and 1 if a host allocation fails; any failing CUDA
 * call terminates the process through CHECK.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const int nElem = 65536 * 1024;
    const size_t nBytes = (size_t)nElem * sizeof(float);

    // Launch configuration derived from the problem size (ceil-division) so
    // the two cannot drift apart; for this nElem it is 65536 blocks of 1024.
    const int threadsPerBlock = 1024;
    const int blocksPerGrid = (nElem + threadsPerBlock - 1) / threadsPerBlock;

    CHECK(cudaDeviceReset());

    LARGE_INTEGER freq, tBegin, tEnd;
    QueryPerformanceFrequency(&freq);

    // ---- CPU reference path -------------------------------------------
    QueryPerformanceCounter(&tBegin);
    float* h_A = (float*)malloc(nBytes);
    float* h_B = (float*)malloc(nBytes);
    float* h_C = (float*)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host allocation of %llu bytes failed\n",
                (unsigned long long)nBytes);
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    sumArraysOnHost(h_A, h_B, h_C, nElem);
    QueryPerformanceCounter(&tEnd);
    const double cpuSeconds =
        (double)(tEnd.QuadPart - tBegin.QuadPart) / (double)freq.QuadPart;
    printf("cpu cost %lf s\n", cpuSeconds);

    // Host buffer that receives the GPU result (allocated outside both timers).
    float* g_C = (float*)malloc(nBytes);
    if (g_C == NULL) {
        fprintf(stderr, "host allocation of %llu bytes failed\n",
                (unsigned long long)nBytes);
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }

    // ---- GPU path -----------------------------------------------------
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    QueryPerformanceCounter(&tBegin);
    CHECK(cudaMalloc((void**)&d_A, nBytes));
    CHECK(cudaMalloc((void**)&d_B, nBytes));
    CHECK(cudaMalloc((void**)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    sumArraysOnGPU<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, nElem);
    // A kernel launch returns no status; an invalid launch configuration is
    // only visible through cudaGetLastError().
    CHECK(cudaGetLastError());
    // This blocking copy runs after the kernel in the default stream, so it
    // also acts as the synchronisation point and reports execution errors.
    CHECK(cudaMemcpy(g_C, d_C, nBytes, cudaMemcpyDeviceToHost));
    QueryPerformanceCounter(&tEnd);
    const double gpuSeconds =
        (double)(tEnd.QuadPart - tBegin.QuadPart) / (double)freq.QuadPart;
    printf("gpu cost %lf s\n", gpuSeconds);

    // Host reference first, GPU result second, matching checkResult's
    // parameter order so a mismatch report labels the values correctly.
    checkResult(h_C, g_C, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);
    free(g_C);
    return 0;
}
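// CUDA primer: summing two arrays
// [web-page residue] You may also like
// Reposted from blog.csdn.net/weixin_39057744/article/details/121869520
// [web-page residue] Today's recommendations
// [web-page residue] Weekly ranking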