This post is based mainly on the implementation in the "cuBLAS 库" entry it references. Compared with that original, this post:
- wraps the cuBLAS multiplication in a helper function, making it easier to call from other code;
- stores the result un-transposed rather than transposed, so the computed matrix can be used directly;
- tests and adjusts the multiplication parameters, fixing the error the original hit when the matrix dimensions were changed.
In short, the code below uses the cuBLAS library to multiply two matrices, which speeds up matrix multiplication; the relevant cublasSgemm signature is recapped right below, before the code.
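For reference, cublasSgemm computes C = alpha * op(A) * op(B) + beta * C with every matrix stored in column-major order. A row-major array reinterpreted as column-major is the transpose of the original matrix, which is why the wrapper below passes CUBLAS_OP_T for both operands and uses the row-major column counts as leading dimensions. The documented signature (annotated here for convenience) is:

cublasStatus_t cublasSgemm(cublasHandle_t handle,
                           cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,        // op(A): m x k, op(B): k x n, C: m x n
                           const float *alpha,
                           const float *A, int lda,    // lda/ldb/ldc are the leading (stored-row)
                           const float *B, int ldb,    // dimensions of the column-major arrays
                           const float *beta,
                           float *C, int ldc);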
test.cpp
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include <time.h>
#include <iostream>
using namespace std;
// Multiply A (rowSizeA x colSizeA) by B (colSizeA x colSizeB) on the GPU and
// return the product C (rowSizeA x colSizeB) as a newly allocated int matrix.
int **matMult_cuBLAS(int **A, int **B, int rowSizeA, int colSizeA, int colSizeB, cublasHandle_t cuHandle) {
    // Allocate and zero the result matrix.
    int **C = new int*[rowSizeA];
    for (int i = 0; i < rowSizeA; i++) {
        C[i] = new int[colSizeB];
        for (int j = 0; j < colSizeB; j++) {
            C[i][j] = 0;
        }
    }

    // Flatten A and B into row-major float buffers on the host.
    float *h_A = (float*)malloc(rowSizeA * colSizeA * sizeof(float));
    float *h_B = (float*)malloc(colSizeA * colSizeB * sizeof(float));
    float *h_C = (float*)malloc(rowSizeA * colSizeB * sizeof(float));
    for (int i = 0; i < rowSizeA; i++) {
        for (int j = 0; j < colSizeA; j++) {
            h_A[i * colSizeA + j] = (float)A[i][j];
        }
    }
    for (int i = 0; i < colSizeA; i++) {
        for (int j = 0; j < colSizeB; j++) {
            h_B[i * colSizeB + j] = (float)B[i][j];
        }
    }

    // Allocate device buffers and copy the operands to the GPU.
    float *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, rowSizeA * colSizeA * sizeof(float));
    cudaMalloc((void**)&d_B, colSizeA * colSizeB * sizeof(float));
    cudaMalloc((void**)&d_C, rowSizeA * colSizeB * sizeof(float));
    cublasSetVector(rowSizeA * colSizeA, sizeof(float), h_A, 1, d_A, 1);
    cublasSetVector(colSizeA * colSizeB, sizeof(float), h_B, 1, d_B, 1);

    // C = 1 * op(A) * op(B) + 0 * C. cuBLAS expects column-major storage, and a
    // row-major array reinterpreted as column-major is the transpose of the
    // original matrix, so both operands are passed with CUBLAS_OP_T and the
    // leading dimensions are the row-major column counts.
    float a = 1;
    float b = 0;
    cublasSgemm(
        cuHandle,
        CUBLAS_OP_T,        // op(A) = (A^T)^T = A
        CUBLAS_OP_T,        // op(B) = (B^T)^T = B
        rowSizeA,           // m: rows of C
        colSizeB,           // n: columns of C
        colSizeA,           // k: shared dimension
        &a,
        d_A, colSizeA,      // lda
        d_B, colSizeB,      // ldb
        &b,
        d_C, rowSizeA       // ldc
    );

    // Copy the product back. d_C is column-major, so element (i, j) of the
    // result lives at h_C[j * rowSizeA + i].
    cublasGetVector(rowSizeA * colSizeB, sizeof(float), d_C, 1, h_C, 1);
    for (int i = 0; i < rowSizeA; i++) {
        for (int j = 0; j < colSizeB; j++) {
            C[i][j] = (int)h_C[j * rowSizeA + i];
        }
    }

    // Release host and device buffers.
    free(h_A); free(h_B); free(h_C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return C;
}
// Generate a rowSize x colSize matrix of uniformly distributed random integers
// in the closed range [minValue, maxValue].
int** uniformMat(int rowSize, int colSize, int minValue, int maxValue) {
    int** mat = new int*[rowSize];
    for (int i = 0; i < rowSize; i++)
        mat[i] = new int[colSize];
    srand((unsigned)time(NULL));
    for (int i = 0; i < rowSize; i++) {
        for (int j = 0; j < colSize; j++) {
            mat[i][j] = rand() % (maxValue - minValue + 1) + minValue;
        }
    }
    return mat;
}
int main(void)
{
    // Create the cuBLAS handle used by every cuBLAS call.
    cublasHandle_t cuHandle;
    cublasStatus_t status = cublasCreate(&cuHandle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
            cout << "Failed to initialize the CUBLAS object" << endl;
        }
        getchar();
        return EXIT_FAILURE;
    }

    // Matrix sizes: A is rowSizeA x colSizeA, B is colSizeA x colSizeB.
    int rowSizeA = 3;
    int colSizeA = 4;
    int colSizeB = 2;

    // Fill A and B with random integers and print them.
    int **A = uniformMat(rowSizeA, colSizeA, 0, 4);
    int **B = uniformMat(colSizeA, colSizeB, 5, 9);
    cout << "Matrix A:" << endl;
    for (int i = 0; i < rowSizeA; i++) {
        for (int j = 0; j < colSizeA; j++) {
            cout << A[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;
    cout << "Matrix B:" << endl;
    for (int i = 0; i < colSizeA; i++) {
        for (int j = 0; j < colSizeB; j++) {
            cout << B[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;

    // Multiply on the GPU and print the product.
    int **C = matMult_cuBLAS(A, B, rowSizeA, colSizeA, colSizeB, cuHandle);
    cout << "Matrix C:" << endl;
    for (int i = 0; i < rowSizeA; i++) {
        for (int j = 0; j < colSizeB; j++) {
            cout << C[i][j] << " ";
        }
        cout << endl;
    }
    cout << endl;

    cublasDestroy(cuHandle);
    return 0;
}
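As a side note, the same row-major product can be obtained without the swapped read-back index at the end of matMult_cuBLAS. The sketch below is an untested alternative using the same d_A/d_B/d_C buffers: it computes C^T = B^T * A^T in cuBLAS's column-major terms by passing the operands in reverse order with CUBLAS_OP_N, which leaves h_C laid out row-major so element (i, j) is simply h_C[i * colSizeB + j].

// Alternative to the cublasSgemm call above (sketch): compute C^T = B^T * A^T.
// Row-major data read as column-major is already the transpose, so no
// CUBLAS_OP_T is needed and the result comes back in row-major layout.
cublasSgemm(cuHandle,
            CUBLAS_OP_N, CUBLAS_OP_N,
            colSizeB, rowSizeA, colSizeA,   // m = cols of C, n = rows of C, k = shared dim
            &a,
            d_B, colSizeB,
            d_A, colSizeA,
            &b,
            d_C, colSizeB);
// Read back directly: C[i][j] = h_C[i * colSizeB + j].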
Compile and run in a terminal:
nvcc -lcublas test.cpp -o t
./t
Sample output:
Matrix A:
1 3 2 0
2 1 2 1
4 3 2 4
Matrix B:
6 8
7 5
7 6
7 6
Matrix C:
41 35
40 39
87 83
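To back up the speed claim, one way to check it is to time matMult_cuBLAS against a naive triple-loop CPU multiplication. The snippet below is a rough sketch using std::chrono; matMultCPU is a hypothetical helper that is not part of test.cpp. Note that for matrices as small as the 3 x 4 example above, the GPU version will typically lose because of the host-device transfer overhead; the benefit shows up for large matrices.

#include <chrono>

// Hypothetical CPU reference: naive triple loop (for comparison only).
int **matMultCPU(int **A, int **B, int rowSizeA, int colSizeA, int colSizeB) {
    int **C = new int*[rowSizeA];
    for (int i = 0; i < rowSizeA; i++) {
        C[i] = new int[colSizeB]();   // zero-initialized row
        for (int k = 0; k < colSizeA; k++)
            for (int j = 0; j < colSizeB; j++)
                C[i][j] += A[i][k] * B[k][j];
    }
    return C;
}

// Inside main(), after A and B have been generated:
auto t0 = chrono::steady_clock::now();
int **Cgpu = matMult_cuBLAS(A, B, rowSizeA, colSizeA, colSizeB, cuHandle);
auto t1 = chrono::steady_clock::now();
int **Ccpu = matMultCPU(A, B, rowSizeA, colSizeA, colSizeB);
auto t2 = chrono::steady_clock::now();
cout << "cuBLAS: " << chrono::duration<double, milli>(t1 - t0).count() << " ms, "
     << "CPU: "    << chrono::duration<double, milli>(t2 - t1).count() << " ms" << endl;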