NVIDIA's website hosts a series of GPU programming courses; the course page is: https://www.nvidia.cn/developer/online-training/community-training/
Below are partial notes from the courses, for study reference only: https://gg2ksnq1wg.feishu.cn/docs/doccnh2QtoPeGfUHR4eJIAvcGzd?from=from_copylink
The code covered in the courses: https://github.com/jhzhang19/NVIDIA_CUDA_program.git
1. Implementing Convolution with CUDA
This part, which compiles CUDA and OpenCV together, never built successfully for me; OpenCV may need a CUDA-enabled build, and the OpenCV include/library flags (e.g. from pkg-config --cflags --libs opencv4) plus -lcudnn also have to be passed to nvcc. Anyone who has gotten it to work is welcome to share their setup.
#include"cuda_runtime.h"
#include<cudnn.h>
#include<cuda.h>
#include<device_functions.h>
#include<opencv4/opencv2/opencv.hpp>
#include<iostream>
using namespace std;
using namespace cv;
// Sobel-style filter weights: 27 float3 = 81 floats, matching the 3 (out) x 3 (in) x 3 x 3
// filter descriptor below. Note the packing must agree with CUDNN_TENSOR_NCHW.
float3 data_kernel[] = {
make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),
make_float3(-2.0f, -2.0f, -2.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(2.0f, 2.0f, 2.0f),
make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),
make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),
make_float3(-2.0f, -2.0f, -2.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(2.0f, 2.0f, 2.0f),
make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),
make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),
make_float3(-2.0f, -2.0f, -2.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(2.0f, 2.0f, 2.0f),
make_float3(-1.0f, -1.0f, -1.0f), make_float3(0.0f, 0.0f, 0.0f), make_float3(1.0f, 1.0f, 1.0f),
};
int main(){
cv::Mat img = cv::imread("/home/zjh19/图片/00000.png");
int imgWidth = img.cols;
int imgHeight = img.rows;
int imgChannel = img.channels();
cv::Mat dst_gpu(imgHeight, imgWidth, CV_8UC3, cv::Scalar(0, 0, 0));
size_t num = imgChannel * imgHeight * imgWidth * sizeof(unsigned char);
unsigned char *in_gpu;
unsigned char *out_gpu;
float *filt_data;
cudaMalloc((void **)&filt_data, 3 * 3 * 3 * sizeof(float3));
cudaMalloc((void **)&in_gpu, num);
cudaMalloc((void **)&out_gpu, num);
cudnnHandle_t handle;
cudnnCreate(&handle);
cudnnTensorDescriptor_t input_descriptor;
cudnnCreateTensorDescriptor(&input_descriptor);
cudnnSetTensor4dDescriptor(input_descriptor, CUDNN_TENSOR_NHWC,
CUDNN_DATA_FLOAT, 1, 3, imgHeight, imgWidth);
cudnnTensorDescriptor_t output_descriptor;
cudnnCreateTensorDescriptor(&output_descriptor);
cudnnSetTensor4dDescriptor(output_descriptor, CUDNN_TENSOR_NHWC,
CUDNN_DATA_FLOAT, 1, 3, imgHeight, imgWidth);
cudnnFilterDescriptor_t kernel_descriptor;
cudnnCreateFilterDescriptor(&kernel_descriptor);
cudnnSetFilter4dDescriptor(kernel_descriptor, CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW, 3, 3, 3, 3);
cudnnConvolutionDescriptor_t conv_descriptor;
cudnnCreateConvolutionDescriptor(&conv_descriptor);
cudnnSetConvolution2dDescriptor(conv_descriptor, 1, 1, 1, 1, 1, 1,
CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
cudnnConvolutionFwdAlgoPerf_t algo;
int returnedAlgoCount = 0;
cudnnGetConvolutionForwardAlgorithm_v7(handle, input_descriptor, kernel_descriptor,
conv_descriptor, output_descriptor, 1, &returnedAlgoCount, &algo);
size_t workspace_size = 0;
cudnnGetConvolutionForwardWorkspaceSize(handle, input_descriptor, kernel_descriptor, conv_descriptor, output_descriptor, algo.algo, &workspace_size);
void *workspace = nullptr;
cudaMalloc(&workspace, workspace_size);
cudaMemcpy((void *)filt_data, (void *)data_kernel, 3 * 3 * 3 * sizeof(float3), cudaMemcpyHostToDevice);
// NOTE: the tensor descriptors declare CUDNN_DATA_FLOAT, but img.data holds unsigned char;
// the image should be converted to float (e.g. with img.convertTo) before this copy.
cudaMemcpy(in_gpu, img.data, num, cudaMemcpyHostToDevice);
auto alpha = 1.0f, beta = 0.0f;
cudnnConvolutionForward(handle, &alpha, input_descriptor, in_gpu,
kernel_descriptor, filt_data, conv_descriptor, algo.algo, workspace, workspace_size, &beta, output_descriptor, out_gpu);
cudaMemcpy(dst_gpu.data, out_gpu, num, cudaMemcpyDeviceToHost);
cudaFree(in_gpu);
cudaFree(out_gpu);
cudaFree(workspace);
cudnnDestroyTensorDescriptor(input_descriptor);
cudnnDestroyTensorDescriptor(output_descriptor);
cudnnDestroyFilterDescriptor(kernel_descriptor);
cudnnDestroyConvolutionDescriptor(conv_descriptor);
cudnnDestroy(handle);
return 0;
}
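Since none of the CUDA/cuDNN calls above check their return codes, failures are hard to localize, which does not help with build or runtime trouble like that mentioned above. A minimal error-checking sketch that could wrap each call; the macro names are my own, not from the course:
#include <cstdio>
#include <cstdlib>
// Abort with file/line on any CUDA runtime error.
#define CHECK_CUDA(call) do { \
cudaError_t s_ = (call); \
if (s_ != cudaSuccess) { \
fprintf(stderr, "CUDA error: %s at %s:%d\n", cudaGetErrorString(s_), __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
// The same pattern for cuDNN status codes.
#define CHECK_CUDNN(call) do { \
cudnnStatus_t s_ = (call); \
if (s_ != CUDNN_STATUS_SUCCESS) { \
fprintf(stderr, "cuDNN error: %s at %s:%d\n", cudnnGetErrorString(s_), __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
// Usage: CHECK_CUDA(cudaMalloc((void **)&in_gpu, num));
//        CHECK_CUDNN(cudnnCreate(&handle));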
2. Implementing the Sobel Edge Detection Operator with CUDA
#include<cuda.h>
#include<cudnn.h>
#include<cuda_runtime.h>
#include<opencv2/opencv.hpp>
#include<device_functions.h>
#include<iostream>
using namespace std;
using namespace cv;
// CPU reference: the same 3x3 Sobel as the GPU kernel below, for comparison.
void sobel_cpu(Mat srcImg, Mat dstImg, int imgHeight, int imgWidth){
unsigned char *in = srcImg.data;
unsigned char *out = dstImg.data;
for (int y = 1; y < imgHeight - 1; y++){
for (int x = 1; x < imgWidth - 1; x++){
int Gx = in[(y - 1) * imgWidth + x - 1] + 2 * in[y * imgWidth + x - 1] + in[(y + 1) * imgWidth + x - 1]
- (in[(y - 1) * imgWidth + x + 1] + 2 * in[y * imgWidth + x + 1] + in[(y + 1) * imgWidth + x + 1]);
int Gy = in[(y + 1) * imgWidth + x - 1] + 2 * in[(y + 1) * imgWidth + x] + in[(y + 1) * imgWidth + x + 1]
- (in[(y - 1) * imgWidth + x - 1] + 2 * in[(y - 1) * imgWidth + x] + in[(y - 1) * imgWidth + x + 1]);
out[y * imgWidth + x] = (abs(Gx) + abs(Gy)) / 2;
}
}
}
__global__ void sobel_gpu(unsigned char* in, unsigned char* out, int imgHeight, int imgWidth){
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int index = y * imgWidth + x;
int Gx = 0;
int Gy = 0;
unsigned char x0, x1, x2, x3, x4, x5, x6, x7, x8;
// interior pixels only, so that the x±1 / y±1 reads stay inside the image
if(x > 0 && x < imgWidth - 1 && y > 0 && y < imgHeight - 1){
x0 = in[(y - 1) * imgWidth + x - 1];
x1= in[(y - 1) * imgWidth + x ];
x2= in[(y - 1) * imgWidth + x + 1 ];
x3= in[y * imgWidth + x - 1 ];
x4= in[y * imgWidth + x ];
x5= in[y * imgWidth + x + 1];
x6= in[(y + 1) * imgWidth + x - 1 ];
x7= in[(y + 1) * imgWidth + x ];
x8= in[(y + 1) * imgWidth + x + 1 ];
Gx = x0 + 2 * x3 + x6 - (x2 + 2 * x5 + x8);
Gy = x6 + 2 * x7 + x8 - (x0 + 2 * x1 + x2);
out[index] = (abs(Gx) + abs(Gy)) / 2;
}
}
int main(){
Mat grayImg = imread("1.jpg", 0);
int imgWidth = grayImg.cols;
int imgHeight = grayImg.rows;
Mat gaussImg;
GaussianBlur(grayImg, gaussImg, Size(3, 3), 0, 0, BORDER_DEFAULT);
Mat dst_cpu(imgHeight, imgWidth, CV_8UC1, Scalar(0));
Mat dst_gpu(imgHeight, imgWidth, CV_8UC1, Scalar(0));
sobel_cpu(gaussImg, dst_cpu, imgHeight, imgWidth);
size_t num = imgHeight * imgWidth * sizeof(unsigned char);
unsigned char *in_gpu;
unsigned char *out_gpu;
cudaMalloc((void **)&in_gpu, num);
cudaMalloc((void **)&out_gpu, num);
cudaMemset(out_gpu, 0, num); // clear so the untouched one-pixel border stays black
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
(imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);
cudaMemcpy(in_gpu, gaussImg.data, num, cudaMemcpyHostToDevice);
sobel_gpu<<<blocksPerGrid, threadsPerBlock>>>(in_gpu, out_gpu, imgHeight, imgWidth);
cudaMemcpy(dst_gpu.data, out_gpu, num, cudaMemcpyDeviceToHost);
imshow("gpu", dst_gpu);
imshow("cpu", dst_cpu);
waitKey(0);
cudaFree(in_gpu);
cudaFree(out_gpu);
return 0;
}
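To put a number on the GPU version, the kernel launch can be timed with CUDA events, the same mechanism the stream section below uses. A minimal sketch, assuming the buffers and launch configuration from the listing above:
float ms = 0.0f;
cudaEvent_t t0, t1;
cudaEventCreate(&t0);
cudaEventCreate(&t1);
cudaEventRecord(t0, 0);
sobel_gpu<<<blocksPerGrid, threadsPerBlock>>>(in_gpu, out_gpu, imgHeight, imgWidth);
cudaEventRecord(t1, 0);
cudaEventSynchronize(t1); // block until the kernel (and the stop event) have completed
cudaEventElapsedTime(&ms, t0, t1); // elapsed time in milliseconds
cout << "sobel_gpu: " << ms << " ms" << endl;
cudaEventDestroy(t0);
cudaEventDestroy(t1);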
3. CUDA Multi-Stream Operations (Pinned Memory)
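Both versions below depend on page-locked (pinned) host memory: cudaMemcpyAsync is only guaranteed to be asynchronous, and therefore able to overlap with kernel execution, when the host buffer is page-locked; with ordinary pageable memory the transfer may silently fall back to a synchronous copy. A minimal sketch of the allocation difference, reusing N, dev_a and stream from the listings below:
int *pageable = (int *)malloc(N * sizeof(int)); // pageable: an async copy may degrade to synchronous
int *pinned;
cudaHostAlloc((void **)&pinned, N * sizeof(int), cudaHostAllocDefault); // page-locked (pinned)
cudaMemcpyAsync(dev_a, pinned, N * sizeof(int), cudaMemcpyHostToDevice, stream); // can overlap with kernels
cudaFreeHost(pinned); // pinned memory is released with cudaFreeHost, not free()
free(pageable);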
Single stream:
#include<stdio.h>
#include<iostream>
#include<cuda.h>
#include<cudnn.h>
#include<cuda_runtime.h>
#include<device_functions.h>
using namespace std;
#define N (1024*1024)
#define FULL (N*20)
__global__ void kernel(int *a, int *b, int *c){
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if(idx < N){
c[idx] = (a[idx] + b[idx]) / 2;
}
}
int main(){
cudaDeviceProp prop;
int whichDevice;
cudaGetDevice(&whichDevice);
cudaGetDeviceProperties(&prop, whichDevice);
if(!prop.deviceOverlap){
cout << "Device does not support copy/compute overlap!" << endl;
return 0;
}
else{
cout << "Device supports copy/compute overlap." << endl;
}
cudaEvent_t start, stop;
float elapsedTime;
cudaStream_t stream;
int *host_a, *host_b, *host_c;
int *dev_a, *dev_b, *dev_c;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaStreamCreate(&stream);
cudaMalloc((void **)&dev_a, N * sizeof(int));
cudaMalloc((void **)&dev_b, N * sizeof(int));
cudaMalloc((void **)&dev_c, N * sizeof(int));
cudaHostAlloc((void **)&host_a, FULL * sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void **)&host_b, FULL * sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void **)&host_c, FULL * sizeof(int), cudaHostAllocDefault);
for (int i = 0; i < FULL;i++){
host_a[i] = rand();
host_b[i] = rand();
}
cudaEventRecord(start, 0);
for (int i = 0; i < FULL;i+=N){
cudaMemcpyAsync(dev_a, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(dev_b, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream);
kernel<<<N / 256, 256, 0, stream>>>(dev_a, dev_b, dev_c);
cudaMemcpyAsync(host_c + i, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost, stream);
}
cudaStreamSynchronize(stream);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cout << "Single Time is:" << float(elapsedTime) << " s" << endl;
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
cudaFreeHost(host_a);
cudaFreeHost(host_b);
cudaFreeHost(host_c);
cudaStreamDestroy(stream);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return 0;
}
Multi-stream:
#include<stdio.h>
#include<iostream>
#include<cuda.h>
#include<cudnn.h>
#include<cuda_runtime.h>
#include<device_functions.h>
using namespace std;
#define N (1024*1024)
#define FULL (N*20)
__global__ void kernel(int *a, int *b, int *c){
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if(idx < N){
c[idx] = (a[idx] + b[idx]) / 2;
}
}
int main(){
cudaDeviceProp prop;
int whichDevice;
cudaGetDevice(&whichDevice);
cudaGetDeviceProperties(&prop, whichDevice);
if(!prop.deviceOverlap){
cout << "Device does not support copy/compute overlap!" << endl;
return 0;
}
else{
cout << "Device supports copy/compute overlap." << endl;
}
cudaEvent_t start, stop;
float elapsedTime;
cudaStream_t stream0;
cudaStream_t stream1;
int *host_a, *host_b, *host_c;
int *dev_a0, *dev_b0, *dev_c0;
int *dev_a1, *dev_b1, *dev_c1;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);
cudaMalloc((void **)&dev_a0, N * sizeof(int));
cudaMalloc((void **)&dev_b0, N * sizeof(int));
cudaMalloc((void **)&dev_c0, N * sizeof(int));
cudaMalloc((void **)&dev_a1, N * sizeof(int));
cudaMalloc((void **)&dev_b1, N * sizeof(int));
cudaMalloc((void **)&dev_c1, N * sizeof(int));
cudaHostAlloc((void **)&host_a, FULL * sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void **)&host_b, FULL * sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void **)&host_c, FULL * sizeof(int), cudaHostAllocDefault);
for (int i = 0; i < FULL;i++){
host_a[i] = rand();
host_b[i] = rand();
}
cudaEventRecord(start, 0);
for (int i = 0; i < FULL;i+=2*N){
cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(dev_a1, host_a + i+N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(dev_b1, host_b + i+N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
cudaMemcpyAsync(host_c + i+N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1);
}
cudaStreamSynchronize(stream0);
cudaStreamSynchronize(stream1);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cout << "Multi Time is:" << float(elapsedTime) << " s" << endl;
cudaFree(dev_a0);
cudaFree(dev_b0);
cudaFree(dev_c0);
cudaFree(dev_a1);
cudaFree(dev_b1);
cudaFree(dev_c1);
cudaFreeHost(host_a);
cudaFreeHost(host_b);
cudaFreeHost(host_c);
cudaStreamDestroy(stream0);
cudaStreamDestroy(stream1);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return 0;
}
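A note on the issue order: the loop above enqueues work breadth-first, alternating between the two streams for each copy and kernel launch. The depth-first alternative sketched below (the whole stream0 chunk first, then the whole stream1 chunk) can lose most of the overlap on older GPUs that have a single copy engine, because stream1's uploads queue up behind stream0's download:
// depth-first issue order (can serialize on single-copy-engine GPUs):
cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
cudaMemcpyAsync(dev_b1, host_b + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1);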
4. CUDA-Python Image Processing
Operating on the image's pixel values: each channel value v is mapped to clamp(2*v + 30, 0, 255), a simple brightness/contrast adjustment, first on the GPU and then on the CPU for comparison.
import cv2 as cv
import numpy as np
import numba
from numba import cuda
import time
import math
@cuda.jit
def process_gpu(img, channels):
    tx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    ty = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    # the grid is rounded up, so guard against threads outside the image
    if tx >= img.shape[0] or ty >= img.shape[1]:
        return
    for c in range(channels):
        color = img[tx, ty][c] * 2.0 + 30.0
        if color > 255:
            img[tx, ty][c] = 255
        elif color < 0:
            img[tx, ty][c] = 0
        else:
            img[tx, ty][c] = color
def process_cpu(img, dst):
    rows, cols, channels = img.shape
    for i in range(rows):
        for j in range(cols):
            for c in range(channels):
                color = img[i, j][c] * 2.0 + 30.0
                if color > 255:
                    dst[i, j][c] = 255
                elif color < 0:
                    dst[i, j][c] = 0
                else:
                    dst[i, j][c] = color
if __name__ == "__main__":
    img = cv.imread("test.png")
    rows, cols, channels = img.shape
    dst_cpu = img.copy()
    dst_gpu = img.copy()
    dImg = cuda.to_device(img)  # copy the image into device memory once
    threadsperblock = (16, 16)
    blockspergrid_x = int(math.ceil(rows / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(cols / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    cuda.synchronize()
    print("GPU processing:")
    start_gpu = time.time()
    process_gpu[blockspergrid, threadsperblock](dImg, channels)
    cuda.synchronize()
    end_gpu = time.time()
    time_gpu = end_gpu - start_gpu
    dst_gpu = dImg.copy_to_host()
    print("GPU process time is: " + str(time_gpu) + "s")
    print("CPU processing:")
    start_cpu = time.time()
    process_cpu(img, dst_cpu)
    end_cpu = time.time()
    time_cpu = end_cpu - start_cpu
    print("CPU process time is: " + str(time_cpu) + "s")
    cv.imwrite("result_cpu.png", dst_cpu)
    cv.imwrite("result_gpu.png", dst_gpu)
    print("Process Done!")
5. CUDA-Python Memory Operations
Matrix multiplication A*B = C, i.e. C[x, y] = sum_k A[x, k] * B[k, y], computed and timed three ways: on the CPU, on the GPU using only global memory, and on the GPU using shared memory.
import numba
from numba import cuda
import math
import numpy as np
import time
TPB = 16
@numba.jit(nopython=True)
def matmul_cpu(A, B, C):
    for y in range(B.shape[1]):
        for x in range(A.shape[0]):
            tmp = 0.
            for k in range(A.shape[1]):
                tmp += A[x, k] * B[k, y]  # accumulate the dot product
            C[x, y] = tmp
@cuda.jit
def matmul_gpu(A, B, C):
    row, col = cuda.grid(2)
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[row, k] * B[k, col]
        C[row, col] = tmp
@cuda.jit
def matmul_shared_mem(A, B, C):
    # per-block tiles of A and B staged in shared memory
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=numba.float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=numba.float32)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= C.shape[0] or y >= C.shape[1]:
        return
    tmp = 0.
    # assumes A.shape[1] is a multiple of TPB (true here: TPB*500)
    for i in range(int(A.shape[1] / TPB)):
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()
    C[x, y] = tmp
A = np.full((TPB*500, TPB*500), 3, np.float64)  # np.float was removed from NumPy; use an explicit dtype
B = np.full((TPB*500, TPB*500), 4, np.float64)
C_cpu = np.full((A.shape[0], B.shape[1]), 0, np.float64)
print("Start processing in CPU")
start_cpu = time.time()
matmul_cpu(A, B, C_cpu)
end_cpu = time.time()
time_cpu = end_cpu - start_cpu
print("CPU process time is: "+ str(time_cpu)+" s")
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)
C_global_mem = cuda.device_array((A.shape[0], B.shape[1]))
C_shared_mem = cuda.device_array((A.shape[0], B.shape[1]))
threadsperblock = (TPB, TPB)
blockspergrid_x = int(math.ceil(A.shape[0]/threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1]/threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)
print("GPU processing")
start_gpu = time.time()
matmul_gpu[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
cuda.synchronize()
end_gpu = time.time()
time_gpu = end_gpu - start_gpu
C_global_gpu = C_global_mem.copy_to_host()
print("GPU time is: "+str(time_gpu)+" s")
start_gpu_shared = time.time()
matmul_shared_mem[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_shared_mem)
cuda.synchronize()
end_gpu_shared = time.time()
time_gpu_shared = end_gpu_shared - start_gpu_shared
print("GPU time(shared memory) is: " + str(time_gpu_shared) + " s")
C_shared_gpu = C_shared_mem.copy_to_host()
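For reference, the tiled kernel computes each output element as C[x, y] = sum over tiles i of (sum over j in [0, TPB) of A[x, i*TPB + j] * B[i*TPB + j, y]), where i runs over the A.shape[1] / TPB tiles. The inner sum reads only the shared-memory tiles sA and sB, so each tile element is fetched from global memory once per thread block rather than once per thread, which is where the speedup over the global-memory version comes from.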