《CUDA 编程入门》笔记

在这里插入图片描述

helloworld:vec-add

#include <stdio.h>
#include <cuda.h>

#include "_aux.h"

/* host, add */
void vec_add_host(float *x, float *y, float *z, int N);

/* GPU kernel: every thread whose global index (as returned by get_tid(),
 * declared outside this block) is below N updates one element:
 * z[i] = z[i] + y[i] + x[i]. Threads with an index >= N do nothing. */
__global__ void vec_add(float *x, float *y, float *z, int N)
{
    const int gid = get_tid();

    /* surplus threads of the last blocks have no element to process */
    if (gid >= N) return;

    /* accumulate left to right: (z + y) + x */
    float acc = z[gid];
    acc = acc + y[gid];
    acc = acc + x[gid];
    z[gid] = acc;
}

/* CPU reference for vec_add: for each i in [0, N), z[i] = z[i] + y[i] + x[i]. */
void vec_add_host(float *x, float *y, float *z, int N)
{
    int k = 0;

    while (k < N) {
        /* same left-to-right order as z + y + x */
        const float partial = z[k] + y[k];
        z[k] = partial + x[k];
        ++k;
    }
}

/* Benchmark driver: runs the vec_add kernel and vec_add_host `itr` times each
 * on N-element vectors filled with 1, then prints both timings and the
 * speedup.
 *
 * Returns 0 on success, -1 if GPU memory could not be allocated, -2 if host
 * memory could not be allocated, -3 if a copy or the kernel failed.
 * All buffers are released on every exit path. */
int main()
{
    int N = 20000000;
    size_t nbytes = (size_t)N * sizeof(float);

    /* 1D block */
    int bs = 256;

    /* 2D grid of s * s blocks; s * s * bs >= N, so every element gets a thread */
    int s = ceil(sqrt((N + bs - 1.) / bs));
    dim3 grid = dim3(s, s);

    float *dx = NULL, *hx = NULL;
    float *dy = NULL, *hy = NULL;
    float *dz = NULL, *hz = NULL;

    int itr = 30;
    int i;
    int rc = 0;
    double th, td;
    cudaError_t err;

    /* allocate GPU mem; the chain stops at the first failure, so buffers that
     * were never requested stay NULL */
    if (cudaMalloc((void **)&dx, nbytes) != cudaSuccess
        || cudaMalloc((void **)&dy, nbytes) != cudaSuccess
        || cudaMalloc((void **)&dz, nbytes) != cudaSuccess) {
        printf("couldn't allocate GPU memory\n");
        rc = -1;
        goto cleanup;
    }

    printf("allocated %.2f MB on GPU\n", nbytes / (1024.f * 1024.f));

    /* allocate CPU mem */
    hx = (float *) malloc(nbytes);
    hy = (float *) malloc(nbytes);
    hz = (float *) malloc(nbytes);

    if (hx == NULL || hy == NULL || hz == NULL) {
        printf("couldn't allocate CPU memory\n");
        rc = -2;
        goto cleanup;
    }
    printf("allocated %.2f MB on CPU\n", nbytes / (1024.f * 1024.f));

    /* init */
    for (i = 0; i < N; i++) {
        hx[i] = 1;
        hy[i] = 1;
        hz[i] = 1;
    }

    /* copy data to GPU */
    err = cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dz, hz, nbytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("host-to-device copy failed: %s\n", cudaGetErrorString(err));
        rc = -3;
        goto cleanup;
    }

    /* call GPU: drain pending work first so only the kernels are timed */
    cudaDeviceSynchronize();
    td = get_time();

    for (i = 0; i < itr; i++) vec_add<<<grid, bs>>>(dx, dy, dz, N);

    /* launches are asynchronous: wait for them before stopping the clock */
    err = cudaDeviceSynchronize();
    td = get_time() - td;

    /* an invalid launch configuration is only reported by cudaGetLastError() */
    if (err == cudaSuccess) err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("vec_add kernel failed: %s\n", cudaGetErrorString(err));
        rc = -3;
        goto cleanup;
    }

    /* CPU */
    th = get_time();
    for (i = 0; i < itr; i++) vec_add_host(hx, hy, hz, N);
    th = get_time() - th;

    printf("GPU time: %e, CPU time: %e, speedup: %g\n", td, th, th / td);

cleanup:
    /* cudaFree(NULL) and free(NULL) are no-ops, so partial setups are fine */
    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);

    free(hx);
    free(hy);
    free(hz);

    return rc;
}

内存管理

#include <stdio.h>
#include <cuda.h>


/* Adds 1 to x[threadIdx.x]. There is no bounds check and blockIdx is not
 * used, so x must hold at least blockDim.x elements and every block would
 * touch the same elements. */
__global__ void sum(float *x)
{
    float *slot = x + threadIdx.x;

    *slot = *slot + 1;
}

/* Memory-management demo: fills a pinned host buffer with 0..N-1, copies it to
 * the GPU, lets the sum kernel add 1 to every element, copies it back and
 * prints the values before and after.
 *
 * Returns 0 on success, -1 if GPU memory could not be allocated, -2 if host
 * memory could not be allocated, -3 if a copy or the kernel failed. */
int main()
{
    int N = 32;
    int nbytes = N * sizeof(float);

    float *dx = NULL, *hx = NULL;
    int i;
    int rc = 0;
    cudaError_t err;

    /* allocate GPU mem */
    if (cudaMalloc((void **)&dx, nbytes) != cudaSuccess) {
        printf("couldn't allocate GPU memory\n");
        return -1;
    }

    /* allocate pinned (page-locked) host memory with cudaMallocHost instead of
     * malloc: host<->device copies from pinned memory are typically faster */
    if (cudaMallocHost((void **)&hx, nbytes) != cudaSuccess) {
        printf("couldn't allocate CPU memory\n");
        cudaFree(dx);   /* do not leak the device buffer on this path */
        return -2;
    }

    /* init */
    printf("hx original: \n");
    for (i = 0; i < N; i++) {
        hx[i] = i;
        printf("%g\n", hx[i]);
    }

    /* copy data to GPU */
    err = cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);

    /* call GPU: one block of N threads, one thread per element */
    if (err == cudaSuccess) {
        sum<<<1, N>>>(dx);
        err = cudaGetLastError();   /* reports an invalid launch configuration */
    }

    /* let GPU finish */
    if (err == cudaSuccess) err = cudaDeviceSynchronize();

    /* copy data from GPU */
    if (err == cudaSuccess) err = cudaMemcpy(hx, dx, nbytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        rc = -3;
    }
    else {
        printf("\nhx from GPU: \n");
        for (i = 0; i < N; i++) {
            printf("%g\n", hx[i]);
        }
    }

    cudaFree(dx);
    /* memory from cudaMallocHost must be released with cudaFreeHost, not free */
    cudaFreeHost(hx);

    return rc;
}

规约算法

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

volatile 关键字可以防止编译器把对共享内存的读写优化(缓存)到寄存器中;否则在展开后的 warp 内规约里,线程可能读到过期的值,造成结果错误。
在这里插入图片描述
注意:下面的程序若去掉volatile关键字,则函数reduction_2和reduction_3的计算结果有误。

#include <stdio.h>
#include <cuda.h>


/* Sums x[0..255] inside a single thread block and stores the total in *y.
 * The shared buffer has 256 entries indexed by threadIdx.x and the first step
 * reads entry tid + 128, so the kernel expects 256 threads per block. */
__global__ void reduction_1(const float* x, float* y)
{
	__shared__ float partial[256];
	const int lane = threadIdx.x;

	/* stage the input in shared memory */
	partial[lane] = x[lane];
	__syncthreads();

	/* tree reduction: the active width halves from 128 down to 2, with a
	 * block-wide barrier after every step (executed by all threads) */
	for (int width = 128; width >= 2; width >>= 1) {
		if (lane < width) partial[lane] += partial[lane + width];
		__syncthreads();
	}

	/* thread 0 folds the last remaining pair itself */
	if (lane == 0) *y = partial[0] + partial[1];
}

/* Sums x[0..255] inside a single thread block and stores the total in y[0].
 * Launch with one 1D block of 256 threads (the shared buffer has 256 entries
 * and the first step reads entry tid + 128).
 *
 * The last six steps run inside the first warp only. They no longer rely on
 * the 32 lanes executing in lockstep: each step separates its shared-memory
 * read from its write with __syncwarp(), which is required on GPUs with
 * independent thread scheduling (Volta and newer). __syncwarp needs CUDA 9+. */
__global__ void reduction_2(const float* x, float* y)
{
	__shared__ volatile float sdata[256];
	int tid = threadIdx.x;

	/* load data to shared mem */
	sdata[tid] = x[tid];
	__syncthreads();

	/* reduction using shared mem: block-wide steps need a block barrier */
	if (tid < 128) sdata[tid] += sdata[tid + 128];
	__syncthreads();
	if (tid < 64) sdata[tid] += sdata[tid + 64];
	__syncthreads();

	/* warp-wide steps: threads 0..31 form the first warp of a 1D block */
	if (tid < 32) {
		float v = sdata[tid];

		for (int offset = 32; offset > 0; offset >>= 1) {
			v += sdata[tid + offset];
			__syncwarp();   /* every lane has read before any lane overwrites */
			sdata[tid] = v;
			__syncwarp();   /* every write is visible before the next reads */
		}
	}

	/* thread 0 reads back the value it wrote itself */
	if (tid == 0) y[0] = sdata[0];
}

/* Final warp-level stage of a shared-memory reduction: on return sdata[0]
 * holds sdata[0] + ... + sdata[63]; entries 1..31 are overwritten with
 * intermediate partial sums.
 *
 * Must be called by all 32 threads of one warp with tid = 0..31, and sdata
 * must have at least 64 entries (the first step reads sdata[tid + 32]).
 *
 * Each step separates its read from its write with __syncwarp() instead of
 * assuming the lanes run in lockstep; that assumption does not hold on GPUs
 * with independent thread scheduling (Volta and newer). Needs CUDA 9+. */
__device__ void warpReduce(volatile float* sdata, int tid)
{
	/* only this thread ever writes sdata[tid], so v mirrors it */
	float v = sdata[tid];

	for (int offset = 32; offset > 0; offset >>= 1) {
		v += sdata[tid + offset];
		__syncwarp();   /* every lane has read before any lane overwrites */
		sdata[tid] = v;
		__syncwarp();   /* every write is visible before the next reads */
	}
}

/* Sums x[0..255] inside a single thread block and stores the total in y[0].
 * Same scheme as a 256 -> 64 shared-memory tree reduction followed by
 * warpReduce() for the last 64 entries; expects 256 threads per block. */
__global__ void reduction_3(const float* x, float* y)
{
	__shared__ float buf[256];
	const int lane = threadIdx.x;

	/* stage the input in shared memory */
	buf[lane] = x[lane];
	__syncthreads();

	/* two block-wide steps (width 128, then 64), each followed by a barrier */
	for (int width = 128; width >= 64; width >>= 1) {
		if (lane < width) buf[lane] += buf[lane + width];
		__syncthreads();
	}

	/* the remaining 64 entries are folded by the first 32 threads */
	if (lane < 32) warpReduce(buf, lane);

	if (lane == 0) y[0] = buf[0];
}


/* Reduction demo: sums 256 ones on the GPU with one 256-thread block and
 * prints the result next to the expected value.
 *
 * Returns 0 on success, -1 if GPU memory could not be allocated, -2 if host
 * memory could not be allocated, -3 if a copy or the kernel failed. */
int main()
{
	int N = 256;
	int nbytes = N * sizeof(float);

	float* dx = NULL, *hx = NULL;
	float* dy = NULL;
	int i;
	int rc = 0;
	float as = 0;
	cudaError_t err;

	if (cudaMalloc((void**)&dx, nbytes) != cudaSuccess
		|| cudaMalloc((void**)&dy, sizeof(float)) != cudaSuccess) {
		/* dx may already be allocated; cudaFree(NULL) is a no-op */
		cudaFree(dx);
		return -1;
	}

	hx = (float*)malloc(nbytes);
	if (hx == NULL) {
		cudaFree(dx);
		cudaFree(dy);
		return -2;
	}

	for (i = 0; i < N; i++)
		hx[i] = 1;

	err = cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);

	if (err == cudaSuccess) {
		/* pick one of the three variants; all expect one block of N = 256 threads */
		reduction_1<<<1, N>>>(dx, dy);
		//reduction_2<<<1, N>>>(dx, dy);
		//reduction_3<<<1, N>>>(dx, dy);

		/* an invalid launch configuration is only reported here */
		err = cudaGetLastError();
	}

	if (err == cudaSuccess) err = cudaDeviceSynchronize();
	if (err == cudaSuccess) err = cudaMemcpy(&as, dy, sizeof(float), cudaMemcpyDeviceToHost);

	if (err != cudaSuccess) {
		printf("CUDA error: %s\n", cudaGetErrorString(err));
		rc = -3;
	}
	else {
		printf("answer: 256, calculated by GPU:%g\n", as);
	}

	cudaFree(dx);
	cudaFree(dy);
	free(hx);

	return rc;
}

完整的三阶段规约算法
1. 块大小取 256:每个块规约出一个部分和,数组长度降低 256 倍——但大规模数组的部分和依旧很长,例如 256 万个元素只降低到 1 万个
2.对部分和继续使用上一步的算法
3.使用一个块,将最后结果规约


/* asum: sum of all entries of a vector */

#include "aux.h"

typedef double FLOAT;

/* Final warp-level stage of a shared-memory reduction: on return sdata[0]
 * holds sdata[0] + ... + sdata[63]; entries 1..31 are overwritten with
 * intermediate partial sums.
 *
 * Must be called by all 32 threads of one warp with tid = 0..31, and sdata
 * must have at least 64 entries (the first step reads sdata[tid + 32]).
 *
 * Each step separates its read from its write with __syncwarp() instead of
 * assuming the lanes run in lockstep; that assumption does not hold on GPUs
 * with independent thread scheduling (Volta and newer). Needs CUDA 9+. */
__device__ void warpReduce(volatile FLOAT *sdata, int tid)
{
    /* only this thread ever writes sdata[tid], so v mirrors it */
    FLOAT v = sdata[tid];

    for (int offset = 32; offset > 0; offset >>= 1) {
        v += sdata[tid + offset];
        __syncwarp();   /* every lane has read before any lane overwrites */
        sdata[tid] = v;
        __syncwarp();   /* every write is visible before the next reads */
    }
}

/* Per-block partial sums: block b adds up its 256-element slice of x (entries
 * with index >= N count as 0) and stores the result in y[b].
 *
 * Block dim must be 256 (the shared buffer has 256 entries).
 * y needs (N + 255) / 256 elements. The launch grid may contain more blocks
 * than that (a square s x s grid is rounded up); such surplus blocks hold no
 * input and do NOT write to y, so they cannot run past the end of the buffer.
 * NOTE(review): this assumes get_bid() is the linear block index and that
 * block b covers inputs [256 * b, 256 * b + 255] via get_tid() - confirm
 * against the helper header. */
__global__ void asum_stg_1(const FLOAT *x, FLOAT *y, int N)
{
    __shared__ FLOAT sdata[256];
    int idx = get_tid();
    int tid = threadIdx.x;
    int bid = get_bid();

    /* number of blocks that cover at least one input element */
    const int nparts = (N + 255) / 256;

    /* load data to shared mem; pad the tail with zeros */
    if (idx < N) {
        sdata[tid] = x[idx];
    }
    else {
        sdata[tid] = 0;
    }

    __syncthreads();

    /* reduction using shared mem */
    if (tid < 128) sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid < 64) sdata[tid] += sdata[tid + 64];
    __syncthreads();

    if (tid < 32) warpReduce(sdata, tid);

    /* thread 0 reads back its own result; surplus blocks skip the store */
    if (tid == 0 && bid < nparts) y[bid] = sdata[0];
}

/* Final stage: a single block of 128 threads sums x[0..N-1] and writes the
 * total back to x[0]. The shared buffer has 128 entries indexed by
 * threadIdx.x, so the kernel expects exactly one block of 128 threads. */
__global__ void asum_stg_3(FLOAT *x, int N)
{
    __shared__ FLOAT acc[128];
    const int lane = threadIdx.x;

    /* each thread privately sums x[lane], x[lane + 128], ... below N */
    FLOAT mine = 0;
    for (int pos = lane; pos < N; pos += 128) {
        mine += x[pos];
    }
    acc[lane] = mine;

    __syncthreads();

    /* fold 128 entries down to 64 */
    if (lane < 64) acc[lane] = acc[lane] + acc[lane + 64];
    __syncthreads();

    /* fold the remaining 64 entries inside the first 32 threads */
    if (lane < 32) warpReduce(acc, lane);
    __syncthreads();

    if (lane == 0) x[0] = acc[0];
}

/* Three-stage sum of the N entries of dx, all on the GPU.
 * dy and dz are scratch buffers; the final result ends up in dz[0].
 * The kernels are only launched here; nothing is synchronized or checked. */
void asum(FLOAT *dx, FLOAT *dy, FLOAT *dz, int N)
{
    /* threads per 1D block for stages 1 and 2 */
    const int threads = 256;

    /* stage 1: square 2D grid, partial sums of dx go to dy */
    const int side1 = ceil(sqrt((N + threads - 1.) / threads));
    asum_stg_1<<<dim3(side1, side1), threads>>>(dx, dy, N);

    /* stage 2: same kernel again on the ceil(N / 256) partial sums, dy -> dz */
    const int parts1 = (N + threads - 1) / threads;
    const int side2 = ceil(sqrt((parts1 + threads - 1.) / threads));
    asum_stg_1<<<dim3(side2, side2), threads>>>(dy, dz, parts1);

    /* stage 3: one block of 128 threads folds the stage-2 partial sums */
    const int parts2 = (parts1 + threads - 1) / threads;
    asum_stg_3<<<1, 128>>>(dz, parts2);
}

FLOAT asum_host(FLOAT *x, int N);

/* CPU reference: returns x[0] + x[1] + ... + x[N-1], accumulated left to
 * right starting from 0 (0 when N <= 0). */
FLOAT asum_host(FLOAT *x, int N)
{
    FLOAT total = 0;
    int k = 0;

    while (k < N) {
        total += x[k];
        ++k;
    }

    return total;
}

/* Benchmark driver for the three-stage GPU sum: sums N ones `itr` times on the
 * GPU (asum) and on the CPU (asum_host) and prints results, timings and the
 * speedup. N defaults to 10000070 and can be overridden by a positive integer
 * given as the only command-line argument.
 *
 * Returns 0 on success, -1 if GPU memory could not be allocated, -2 if host
 * memory could not be allocated, -3 if a copy or a kernel failed. */
int main(int argc, char **argv)
{
    int N = 10000070;
    size_t nbytes, part_bytes;

    FLOAT *dx = NULL, *hx = NULL;
    FLOAT *dy = NULL, *dz = NULL;
    int i, itr = 20;
    int rc = 0;
    FLOAT asd = 0, ash = 0;
    double td, th;
    cudaError_t err;

    if (argc == 2) {
        int an;

        an = atoi(argv[1]);
        if (an > 0) N = an;
    }

    /* buffer sizes depend on N, so they are computed only after the command
     * line has been parsed; size_t avoids overflowing int for large N */
    nbytes = (size_t)N * sizeof(FLOAT);
    part_bytes = sizeof(FLOAT) * (((size_t)N + 255) / 256);

    /* allocate GPU mem; the chain stops at the first failure */
    if (cudaMalloc((void **)&dx, nbytes) != cudaSuccess
        || cudaMalloc((void **)&dy, part_bytes) != cudaSuccess
        || cudaMalloc((void **)&dz, part_bytes) != cudaSuccess) {
        printf("couldn't allocate GPU memory\n");
        rc = -1;
        goto cleanup;
    }

    printf("allocated %e MB on GPU\n", nbytes / (1024.f * 1024.f));

    /* allocate CPU mem */
    hx = (FLOAT *) malloc(nbytes);

    if (hx == NULL) {
        printf("couldn't allocate CPU memory\n");
        rc = -2;
        goto cleanup;
    }
    printf("allocated %e MB on CPU\n", nbytes / (1024.f * 1024.f));

    /* init */
    for (i = 0; i < N; i++) {
        hx[i] = 1;
    }

    /* copy data to GPU */
    err = cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("host-to-device copy failed: %s\n", cudaGetErrorString(err));
        rc = -3;
        goto cleanup;
    }

    /* drain pending work so only the kernels are timed */
    cudaDeviceSynchronize();
    td = get_time();

    /* call GPU */
    for (i = 0; i < itr; i++) asum(dx, dy, dz, N);

    /* let GPU finish before stopping the clock */
    err = cudaDeviceSynchronize();
    td = get_time() - td;

    /* an invalid launch configuration is only reported by cudaGetLastError() */
    if (err == cudaSuccess) err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("asum kernels failed: %s\n", cudaGetErrorString(err));
        rc = -3;
        goto cleanup;
    }

    th = get_time();
    for (i = 0; i < itr; i++) ash = asum_host(hx, N);
    th = get_time() - th;

    /* copy data from GPU */
    err = cudaMemcpy(&asd, dz, sizeof(FLOAT), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        printf("device-to-host copy failed: %s\n", cudaGetErrorString(err));
        rc = -3;
        goto cleanup;
    }

    printf("asum, answer: %d, calculated by GPU:%f, calculated by CPU:%f\n", N, asd, ash);
    printf("GPU time: %e, CPU time: %e, speedup: %g\n", td, th, th / td);

cleanup:
    /* cudaFree(NULL) and free(NULL) are no-ops, so partial setups are fine */
    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
    free(hx);

    return rc;
}
#include "_aux.h"
#include <assert.h>


/* CPU reference dot product: returns x[0]*y[0] + ... + x[N-1]*y[N-1],
 * accumulated left to right in single precision (0 when N <= 0).
 * Both pointers are asserted to be non-NULL. */
float dot_host(float *x, float *y, int N)
{
    assert(x != NULL);
    assert(y != NULL);

    float acc = 0;
    int k = 0;

    while (k < N) {
        acc += x[k] * y[k];
        ++k;
    }

    return acc;
}

/* Final warp-level stage of a shared-memory reduction: on return sdata[0]
 * holds sdata[0] + ... + sdata[63]; entries 1..31 are overwritten with
 * intermediate partial sums.
 *
 * Must be called by all 32 threads of one warp with tid = 0..31, and sdata
 * must have at least 64 entries (the first step reads sdata[tid + 32]).
 *
 * Each step separates its read from its write with __syncwarp() instead of
 * assuming the lanes run in lockstep; that assumption does not hold on GPUs
 * with independent thread scheduling (Volta and newer). Needs CUDA 9+. */
__device__ void warpReduce(volatile float *sdata, int tid)
{
    /* only this thread ever writes sdata[tid], so v mirrors it */
    float v = sdata[tid];

    for (int offset = 32; offset > 0; offset >>= 1) {
        v += sdata[tid + offset];
        __syncwarp();   /* every lane has read before any lane overwrites */
        sdata[tid] = v;
        __syncwarp();   /* every write is visible before the next reads */
    }
}

/* Partial dot product: block b multiplies its 256-element slices of x and y
 * element-wise (entries with index >= N count as 0), sums the products and
 * stores the result in z[b].
 *
 * Block dim must be 256 (the shared buffer has 256 entries).
 * z needs (N + 255) / 256 elements. The launch grid may contain more blocks
 * than that (a square s x s grid is rounded up); such surplus blocks hold no
 * input and do NOT write to z, so they cannot run past the end of the buffer.
 * NOTE(review): this assumes get_bid() is the linear block index and that
 * block b covers inputs [256 * b, 256 * b + 255] via get_tid() - confirm
 * against the helper header. */
__global__ void dot_stg_1(const float *x, float *y, float *z, int N)
{
    __shared__ float sdata[256];
    int idx = get_tid();
    int tid = threadIdx.x;
    int bid = get_bid();

    /* number of blocks that cover at least one input element */
    const int nparts = (N + 255) / 256;

    /* load products to shared mem; pad the tail with zeros */
    if (idx < N) {
        sdata[tid] = x[idx] * y[idx];
    }
    else {
        sdata[tid] = 0;
    }

    __syncthreads();

    /* reduction using shared mem */
    if (tid < 128) sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid < 64) sdata[tid] += sdata[tid + 64];
    __syncthreads();

    if (tid < 32) warpReduce(sdata, tid);

    /* thread 0 reads back its own result; surplus blocks skip the store */
    if (tid == 0 && bid < nparts) z[bid] = sdata[0];
}

/* Per-block partial sums: each block adds up 256 entries of x (entries with a
 * global index >= N count as 0) and stores the result in y[get_bid()].
 * Block dim must be 256. Every launched block stores a value, so y must have
 * at least as many elements as there are blocks in the grid. */
__global__ void dot_stg_2(const float *x, float *y, int N)
{
    __shared__ float buf[256];
    const int gid = get_tid();
    const int lane = threadIdx.x;
    const int block = get_bid();

    /* stage the input in shared memory, zero-padding past the end */
    buf[lane] = (gid < N) ? x[gid] : 0;
    __syncthreads();

    /* two block-wide steps (width 128, then 64), each followed by a barrier */
    for (int width = 128; width >= 64; width >>= 1) {
        if (lane < width) buf[lane] += buf[lane + width];
        __syncthreads();
    }

    /* the remaining 64 entries are folded by the first 32 threads */
    if (lane < 32) warpReduce(buf, lane);

    if (lane == 0) y[block] = buf[0];
}

/* Final stage: a single block of 128 threads sums x[0..N-1] and writes the
 * total back to x[0]. The shared buffer has 128 entries indexed by
 * threadIdx.x, so the kernel expects exactly one block of 128 threads. */
__global__ void dot_stg_3(float *x, int N)
{
    __shared__ float acc[128];
    const int lane = threadIdx.x;

    /* each thread privately sums x[lane], x[lane + 128], ... below N */
    float mine = 0;
    for (int pos = lane; pos < N; pos += 128) {
        mine += x[pos];
    }
    acc[lane] = mine;

    __syncthreads();

    /* fold 128 entries down to 64 */
    if (lane < 64) acc[lane] = acc[lane] + acc[lane + 64];
    __syncthreads();

    /* fold the remaining 64 entries inside the first 32 threads */
    if (lane < 32) warpReduce(acc, lane);

    if (lane == 0) x[0] = acc[0];
}

/* Three-stage dot product of the N-element vectors dx and dy, all on the GPU.
 * dz and d are scratch buffers; the final result ends up in d[0].
 * The kernels are only launched here; nothing is synchronized or checked. */
void dot_device(float *dx, float *dy, float *dz, float *d, int N)
{
    /* threads per 1D block for stages 1 and 2 */
    const int threads = 256;

    /* stage 1: square 2D grid, partial dot products go to dz */
    const int side1 = ceil(sqrt((N + threads - 1.) / threads));
    dot_stg_1<<<dim3(side1, side1), threads>>>(dx, dy, dz, N);

    /* stage 2: sum the ceil(N / 256) partial results, dz -> d */
    const int parts1 = (N + threads - 1) / threads;
    const int side2 = ceil(sqrt((parts1 + threads - 1.) / threads));
    dot_stg_2<<<dim3(side2, side2), threads>>>(dz, d, parts1);

    /* stage 3: one block of 128 threads folds the stage-2 partial sums */
    const int parts2 = (parts1 + threads - 1) / threads;
    dot_stg_3<<<1, 128>>>(d, parts2);
}

/* Benchmark driver for the three-stage GPU dot product: computes the dot
 * product of a vector of ones with a vector of twos `itr` times on the GPU
 * (dot_device) and on the CPU (dot_host) and prints results, timings and the
 * speedup. N defaults to 10000070 and can be overridden by a positive integer
 * given as the only command-line argument.
 *
 * Returns 0 on success, -1 if GPU memory could not be allocated, -2 if host
 * memory could not be allocated, -3 if a copy or a kernel failed. */
int main(int argc, char **argv)
{
    int N = 10000070;
    size_t nbytes, part_bytes;

    float *hx = NULL, *hy = NULL;
    float *dx = NULL, *dy = NULL, *dz = NULL, *d = NULL;
    int i, itr = 20;
    int rc = 0;
    float asd = 0, ash = 0;
    double td, th;
    cudaError_t err;

    if (argc == 2) {
        int an;

        an = atoi(argv[1]);
        if (an > 0) N = an;
    }

    /* buffer sizes depend on N, so they are computed only after the command
     * line has been parsed; size_t avoids overflowing int for large N */
    nbytes = (size_t)N * sizeof(float);
    part_bytes = sizeof(float) * (((size_t)N + 255) / 256);

    /* allocate GPU mem; the chain stops at the first failure */
    if (cudaMalloc((void **)&dx, nbytes) != cudaSuccess
        || cudaMalloc((void **)&dy, nbytes) != cudaSuccess
        || cudaMalloc((void **)&dz, part_bytes) != cudaSuccess
        || cudaMalloc((void **)&d, part_bytes) != cudaSuccess) {
        printf("couldn't allocate GPU memory\n");
        rc = -1;
        goto cleanup;
    }

    printf("allocated %e MB on GPU\n", nbytes / (1024.f * 1024.f));

    /* allocate CPU mem */
    hx = (float *) malloc(nbytes);
    hy = (float *) malloc(nbytes);

    if (hx == NULL || hy == NULL) {
        printf("couldn't allocate CPU memory\n");
        rc = -2;
        goto cleanup;
    }
    printf("allocated %e MB on CPU\n", nbytes / (1024.f * 1024.f));

    /* init */
    for (i = 0; i < N; i++) {
        hx[i] = 1;
        hy[i] = 2;
    }

    /* copy data to GPU */
    err = cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        printf("host-to-device copy failed: %s\n", cudaGetErrorString(err));
        rc = -3;
        goto cleanup;
    }

    /* drain pending work so only the kernels are timed */
    cudaDeviceSynchronize();
    td = get_time();

    /* call GPU */
    for (i = 0; i < itr; i++) dot_device(dx, dy, dz, d, N);

    /* let GPU finish before stopping the clock */
    err = cudaDeviceSynchronize();
    td = get_time() - td;

    /* an invalid launch configuration is only reported by cudaGetLastError() */
    if (err == cudaSuccess) err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("dot kernels failed: %s\n", cudaGetErrorString(err));
        rc = -3;
        goto cleanup;
    }

    th = get_time();
    for (i = 0; i < itr; i++) ash = dot_host(hx, hy, N);
    th = get_time() - th;

    /* copy data from GPU */
    err = cudaMemcpy(&asd, d, sizeof(float), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        printf("device-to-host copy failed: %s\n", cudaGetErrorString(err));
        rc = -3;
        goto cleanup;
    }

    printf("dot, answer: %d, calculated by GPU:%f, calculated by CPU:%f\n", 2 * N, asd, ash);
    printf("GPU time: %e, CPU time: %e, speedup: %g\n", td, th, th / td);

cleanup:
    /* cudaFree(NULL) and free(NULL) are no-ops, so partial setups are fine */
    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
    cudaFree(d);

    free(hx);
    free(hy);

    return rc;
}

猜你喜欢

转载自blog.csdn.net/taifyang/article/details/131179029