版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/a19990412/article/details/85222642
简述
矩阵向量乘法。
- 读取文件 data.txt
- 并将结果输出到 output.txt 文件中
- 用 typedef 方便地修改数据类型(要是写成模板也是可以的)
代码
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <climits>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <stdio.h>
#include <vector>
// Element type of the matrix and vectors, used by both the host code and the
// kernel. Change this single typedef to switch the numeric type.
typedef double DATA;
// Kernel: matrix-vector product c = A * b.
//
//   a : N x N matrix stored row-major (element (i, k) is a[i * N + k])
//   b : input vector of length N
//   c : output vector of length N
//   N : matrix dimension; the kernel does nothing when N <= 0
//
// Launch layout: 1D grid of 1D blocks. Each thread computes whole rows of the
// result, starting at its global thread index and advancing by the total
// number of launched threads, so any grid size yields a complete result.
// No shared memory is used.
//
// Note: threads with adjacent indices read elements that are N apart in `a`,
// so the loads of `a` are strided rather than contiguous.
__global__ void MatrixMultiply(const DATA *a, const DATA *b, DATA *c, int N) {
    if (N <= 0) {
        return;
    }

    // All index arithmetic is done in size_t: row * N exceeds the range of
    // int as soon as N * N is larger than INT_MAX.
    const size_t n = static_cast<size_t>(N);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t row = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         row < n;
         row += stride) {
        const DATA *rowData = a + row * n;
        DATA sum = 0;
        for (size_t k = 0; k < n; ++k) {
            sum += rowData[k] * b[k];
        }
        c[row] = sum;
    }
}
// Forward declaration of the host-side wrapper that main() calls.
// The last parameter is named `size` here and `N` in the definition; main()
// passes the matrix dimension N for it.
cudaError_t matrixMultiplyWithCuda(DATA *a, DATA *b, DATA *c, size_t size);
// Reads N, an N x N matrix (row by row) and a vector of length N from
// "data.txt", computes the matrix-vector product on the GPU and writes the
// result to "output.txt", one value per line in fixed notation.
//
// Returns 0 on success and 1 on any input, CUDA or output failure.
int main()
{
    std::ifstream in("data.txt");
    int N = 0;
    in >> N;
    // Stop here on a failed read: continuing would size the buffers from an
    // unusable N. A negative dimension is rejected for the same reason.
    if (in.fail() || N < 0) {
        printf("Something wrong\n");
        return 1;
    }
    printf("Success read\n");

    // Host buffers. Sizes are computed in size_t so that N * N cannot
    // overflow int; std::vector releases the memory on every return path.
    const size_t n = static_cast<size_t>(N);
    std::vector<DATA> a(n * n);
    std::vector<DATA> b(n);
    std::vector<DATA> c(n);

    // Read the matrix in row-major order, then the vector.
    for (size_t i = 0; i < a.size(); ++i) {
        in >> a[i];
    }
    for (size_t i = 0; i < b.size(); ++i) {
        in >> b[i];
    }
    if (in.fail()) {
        // Missing or malformed values: do not compute on partial data.
        printf("Something wrong\n");
        return 1;
    }

    cudaError_t cudaStatus = matrixMultiplyWithCuda(a.data(), b.data(), c.data(), n);
    if (cudaStatus != cudaSuccess) {
        // Do not write output.txt: c does not hold a valid result.
        fprintf(stderr, "matrixMultiplyWithCuda failed: %s\n",
                cudaGetErrorString(cudaStatus));
        cudaDeviceReset();
        return 1;
    }

    std::ofstream out("output.txt");
    if (!out) {
        fprintf(stderr, "Cannot open output.txt\n");
        cudaDeviceReset();
        return 1;
    }
    out << std::setiosflags(std::ios::fixed);
    for (size_t i = 0; i < c.size(); ++i) {
        out << c[i] << " " << '\n';
    }
    out.flush();
    const bool writeOk = !out.fail();
    if (!writeOk) {
        fprintf(stderr, "Failed to write output.txt\n");
    }

    // cudaDeviceReset() replaces the deprecated cudaThreadExit().
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed: %s\n",
                cudaGetErrorString(cudaStatus));
        return 1;
    }
    return writeOk ? 0 : 1;
}
// Computes c = A * b on the GPU.
//
//   a : host pointer to an N x N matrix (N * N elements are copied)
//   b : host pointer to the input vector (N elements)
//   c : host pointer receiving the result (N elements)
//   N : matrix dimension
//
// Allocates device buffers, copies a and b to the device, launches
// MatrixMultiply, waits for it to finish and copies the result back into c.
// The device buffers are always released before returning.
//
// Returns cudaSuccess on success (including N == 0, where nothing is done),
// cudaErrorInvalidValue when N is too large to be handled, or the first CUDA
// error encountered.
cudaError_t matrixMultiplyWithCuda(DATA *a, DATA *b, DATA *c, size_t N) {
    // Nothing to compute; this also avoids launching a zero-sized grid.
    if (N == 0) {
        return cudaSuccess;
    }
    // The kernel receives the dimension as int, and N * N * sizeof(DATA)
    // must not wrap around in size_t.
    if (N > static_cast<size_t>(INT_MAX) ||
        N > static_cast<size_t>(-1) / sizeof(DATA) / N) {
        printf("Something wrong\n");
        return cudaErrorInvalidValue;
    }

    const size_t vectorBytes = N * sizeof(DATA);
    const size_t matrixBytes = N * vectorBytes;

    DATA *dev_a = 0;
    DATA *dev_b = 0;
    DATA *dev_c = 0;

    // Each step runs only if every previous one succeeded, so cudaStatus
    // always holds the first error instead of being overwritten.
    cudaError_t cudaStatus = cudaMalloc((void**)&dev_a, matrixBytes);
    if (cudaStatus == cudaSuccess) {
        cudaStatus = cudaMalloc((void**)&dev_b, vectorBytes);
    }
    if (cudaStatus == cudaSuccess) {
        cudaStatus = cudaMalloc((void**)&dev_c, vectorBytes);
    }
    if (cudaStatus == cudaSuccess) {
        cudaStatus = cudaMemcpy(dev_a, a, matrixBytes, cudaMemcpyHostToDevice);
    }
    if (cudaStatus == cudaSuccess) {
        cudaStatus = cudaMemcpy(dev_b, b, vectorBytes, cudaMemcpyHostToDevice);
    }

    if (cudaStatus != cudaSuccess) {
        printf("Something wrong\n");
    }
    else {
        // Kernel invocation: 256 threads per block (a multiple of the warp
        // size) and enough blocks to give every row its own thread.
        const unsigned int threadsPerBlock = 256;
        const unsigned int numBlocks =
            static_cast<unsigned int>((N + threadsPerBlock - 1) / threadsPerBlock);
        MatrixMultiply<<<numBlocks, threadsPerBlock>>>(dev_a, dev_b, dev_c,
                                                       static_cast<int>(N));

        // A launch does not return a status: launch errors are reported by
        // cudaGetLastError(), execution errors by the synchronizing call.
        cudaStatus = cudaGetLastError();
        if (cudaStatus == cudaSuccess) {
            cudaStatus = cudaDeviceSynchronize();
        }

        if (cudaStatus != cudaSuccess) {
            printf("Calculate wrong\n");
        }
        else {
            cudaStatus = cudaMemcpy(c, dev_c, vectorBytes, cudaMemcpyDeviceToHost);
        }
    }

    // cudaFree accepts null pointers, so this is safe after a partial setup.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return cudaStatus;
}