Host and Device Memory Functions
STANDARD C FUNCTIONS | CUDA C FUNCTIONS |
---|---|
malloc | cudaMalloc |
memcpy | cudaMemcpy |
memset | cudaMemset |
free | cudaFree |
Function Signature
cudaError_t cudaMalloc(void ** devPtr, size_t size);
// may return cudaSuccess, cudaErrorMemoryAllocation, etc.
cudaError_t cudaMemcpy(void * dst, const void * src, size_t count, cudaMemcpyKind kind);
// kind must be one of the following:
// cudaMemcpyHostToHost
// cudaMemcpyHostToDevice
// cudaMemcpyDeviceToHost
// cudaMemcpyDeviceToDevice
// get the human-readable message for an error code
const char* cudaGetErrorString(cudaError_t error);
Examples
- C Example
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
/*
 * Element-wise addition on the host: c[k] = a[k] + b[k] for every k in
 * [0, size). Elements are processed in ascending index order; nothing is
 * written when size <= 0.
 */
void sumArrayOnHost(float * a, float * b, float * c, const int size) {
    int k = 0;
    while (k < size) {
        const float lhs = a[k];
        const float rhs = b[k];
        c[k] = lhs + rhs;
        ++k;
    }
}
/*
 * Fill ip[0..size) with pseudo-random values k / 10.0f, where k is the low
 * byte of rand() (so each value lies in [0.0, 25.5]).
 *
 * The generator is seeded from the wall clock on the first call only.
 * Reseeding on every call would hand two calls made within the same second
 * the same seed, and therefore exactly the same data.
 */
void initData(float * ip, const int size) {
    static int seeded = 0;
    if (!seeded) {
        srand((unsigned int) time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < size; ++i) {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
/*
 * Print the first `size` floats of ip on one line of stdout, each formatted
 * with six decimals and followed by a space, then terminate the line.
 */
void showData(float * ip, const int size) {
    int idx = 0;
    while (idx < size) {
        printf("%.6f ", ip[idx]);
        idx += 1;
    }
    printf("\n");
}
/*
 * Host-only demo: allocate three float arrays, fill two with random data,
 * add them element-wise into the third, print all three, and release them.
 * Returns EXIT_FAILURE if any host allocation fails.
 */
int main() {
    const int n = 16;
    const size_t nBytes = n * sizeof(float);

    // host buffers: inputs a and b, output c
    float * h_a = (float *)malloc(nBytes);
    float * h_b = (float *)malloc(nBytes);
    float * h_c = (float *)malloc(nBytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "failed to allocate host buffers\n");
        // free(NULL) is a no-op, so releasing all three is safe here
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // init data
    initData(h_a, n);
    initData(h_b, n);

    // a + b = c
    sumArrayOnHost(h_a, h_b, h_c, n);

    // check result
    showData(h_a, n);
    showData(h_b, n);
    showData(h_c, n);

    // free memory
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}
- CUDA Example
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Fill ip[0..size) with pseudo-random values k / 10.0f, where k is the low
// byte of rand() (so each value lies in [0.0, 25.5]).
//
// The generator is seeded from the wall clock on the first call only.
// Reseeding on every call would hand two calls made within the same second
// the same seed, and therefore exactly the same data.
void initData(float * ip, const int size) {
    static int seeded = 0;
    if (!seeded) {
        srand((unsigned int)time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < size; ++i) {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
// Kernel version of the element-wise sum: each thread adds one pair of
// elements, c[t] = a[t] + b[t], where t is the thread's threadIdx.x.
// Only threadIdx.x is used for indexing (blockIdx is ignored) and there is
// no bounds check, so every array must hold at least blockDim.x floats.
__global__ void sumArrayOnDevice(float * a, float * b, float * c) {
    const unsigned int tid = threadIdx.x;
    const float lhs = a[tid];
    const float rhs = b[tid];
    c[tid] = lhs + rhs;
}
// Print the first `size` floats of ip on one line of stdout, each formatted
// with six decimals and followed by a space, then terminate the line.
void showData(float * ip, const int size) {
    int idx = 0;
    while (idx < size) {
        printf("%.6f ", ip[idx]);
        idx += 1;
    }
    printf("\n");
}
// Abort with a readable message when a CUDA runtime call reports an error.
static void checkCuda(cudaError_t status, const char * what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Device demo: fill two host arrays with random data, copy them to the GPU,
// add them element-wise with sumArrayOnDevice, copy the result back, print
// all three arrays, and release host and device memory.
// Exits with EXIT_FAILURE if a host allocation or any CUDA call fails.
int main() {
    const int n = 4;
    const size_t nBytes = sizeof(float) * n;

    // ptr on host
    float * h_a = (float *)malloc(nBytes);
    float * h_b = (float *)malloc(nBytes);
    float * h_c = (float *)malloc(nBytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "failed to allocate host buffers\n");
        // free(NULL) is a no-op, so releasing all three is safe here
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // ptr on device
    float * d_a = nullptr;
    float * d_b = nullptr;
    float * d_c = nullptr;
    checkCuda(cudaMalloc((void **)&d_a, nBytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void **)&d_b, nBytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void **)&d_c, nBytes), "cudaMalloc(d_c)");

    initData(h_a, n);
    initData(h_b, n);

    // copy the inputs from host to device; d_c is written entirely by the
    // kernel, so it needs no initial copy
    checkCuda(cudaMemcpy(d_a, h_a, nBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");
    checkCuda(cudaMemcpy(d_b, h_b, nBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

    // summation: a single block of n threads, one thread per element
    sumArrayOnDevice<<<1, n>>>(d_a, d_b, d_c);
    // a launch returns no status itself: query the launch error explicitly,
    // then synchronize so failures during execution are reported here
    checkCuda(cudaGetLastError(), "sumArrayOnDevice launch");
    checkCuda(cudaDeviceSynchronize(), "sumArrayOnDevice execution");

    // copy data back
    checkCuda(cudaMemcpy(h_c, d_c, nBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");

    // show data
    showData(h_a, n);
    showData(h_b, n);
    showData(h_c, n);

    free(h_a);
    free(h_b);
    free(h_c);
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");
    return 0;
}