This example uses the most basic CUDA runtime functions such as cudaMalloc() and cudaMemcpy(), together with a kernel function, to implement parallel element-wise vector multiplication:
#include <cstdio>
#include <cstdlib>
#include <iostream>

using namespace std;

// Print the CUDA error message and abort if a runtime call fails.
static void HandleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))

// Each thread multiplies one pair of elements, indexed by its thread ID.
__global__ void product(const int *a, const int *b, int *c) {
    unsigned int index = threadIdx.x;
    c[index] = a[index] * b[index];
}

int main() {
    const int n = 10;
    int a[n];
    int b[n];
    for (int i = 0; i < n; i++) {
        a[i] = i;
        b[i] = i;
    }

    // Allocate device memory for the two input vectors and the output vector.
    int *aDev, *bDev, *cDev;
    HANDLE_ERROR(cudaMalloc((void **) &aDev, sizeof(int) * n));
    HANDLE_ERROR(cudaMalloc((void **) &bDev, sizeof(int) * n));
    HANDLE_ERROR(cudaMalloc((void **) &cDev, sizeof(int) * n));

    // Copy the input vectors from host to device.
    HANDLE_ERROR(cudaMemcpy((void *) aDev, (void *) a, sizeof(int) * n, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy((void *) bDev, (void *) b, sizeof(int) * n, cudaMemcpyHostToDevice));

    // Launch a single block of n threads; each thread handles one element.
    dim3 gridSize(1);
    dim3 blockSize(n);
    product<<<gridSize, blockSize>>>(aDev, bDev, cDev);

    // Copy the result back to the host; this cudaMemcpy also waits for the kernel to finish.
    int c[n];
    HANDLE_ERROR(cudaMemcpy(c, cDev, sizeof(int) * n, cudaMemcpyDeviceToHost));
    for (int i : c) {
        cout << i << endl;
    }

    // Release the device memory.
    HANDLE_ERROR(cudaFree(aDev));
    HANDLE_ERROR(cudaFree(bDev));
    HANDLE_ERROR(cudaFree(cDev));
    return 0;
}
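For reference, the program can be built with nvcc (for example `nvcc vector_product.cu -o vector_product`, where the file name is just an assumed placeholder). Since both input vectors are initialized to 0..9, the output should be the element-wise products i * i, i.e. 0, 1, 4, 9, ..., 81, one value per line.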