你看标题hhhh, 我也想笑 但其实它更好笑: hhhhhhhhhhhhhhhhhhhhh
就是你的程序没报错,但是结果不对,又找不到问题的时候,不妨看看这个hhhhh
两年前有幸上过程润伟老师讲的CUDA C高性能编程引论,当时的课只有一周,不过老师讲得风趣幽默,以至于我现在一些重要的点都还记得hhhh
不扯了,切入正题:
// 64 * 64 * 1 = 4096 threads per block. CUDA devices cap a block at 1024
// threads, so a kernel launched with this block shape is rejected; the
// rejection is only visible through the runtime's error state.
dim3 threads_per_block(64, 64, 1);
// 16 * 16 * 1 = 256 blocks in the grid.
dim3 number_of_blocks(16, 16, 1);
这个你看上去好像没啥问题,但实际上它已经超过了每个线程块允许的最大线程数(64×64×1 = 4096,而上限是 1024),没错就这个简单的东西…
原问题,参考附录
但是它执行并不会报错的,所以需要手动去读取错误,如下面所示:
(PS:顺便学到了CUDA的错误处理,两年前我一直不知道程润伟老师这个操作到底是啥意思)
创建一个包装 CUDA 函数调用的宏对于检查错误十分有用。以下是一个宏示例,可以在余下练习中随时使用: (摘自NVIDIA官方CUDA C编程教学notebook)
#include <stdio.h>
#include <assert.h>
/**
 * Report a failed CUDA runtime call.
 *
 * When `result` is anything other than cudaSuccess, the matching error string
 * is written to stderr and an assertion on the status fires (the assertion is
 * compiled out when NDEBUG is defined).
 *
 * @param result status code returned by a CUDA runtime API call
 * @return `result`, unchanged, so the call can be wrapped inline
 */
inline cudaError_t checkCuda(cudaError_t result)
{
    // Fast path: nothing to report.
    if (result == cudaSuccess) {
        return result;
    }

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}
/**
 * Minimal usage example for checkCuda: wrap a runtime call so that a failure
 * is printed instead of being silently dropped.
 */
int main()
{
    // cudaDeviceSynchronize() blocks until the device is idle and also reports
    // errors raised by earlier asynchronous work. The original example lacked
    // the terminating semicolon and did not compile.
    checkCuda( cudaDeviceSynchronize() );
    return 0;
}
但是我在实际使用的时候,发现这个玩意就是个玩具,一点儿都不好用,这是我看到的另一种写法:
// Block shape and a grid that is one block larger than the integer quotient
// in each direction, so the whole nj x ni range is covered.
dim3 tblocks(32, 16, 1);
dim3 grid(nj / tblocks.x + 1, ni / tblocks.y + 1, 1);

for (istep = 0; istep < nstep; ++istep) {
    step_kernel_mod<<< grid, tblocks >>>(ni, nj, tfac, temp1, temp2);

    // A kernel launch returns nothing, so the status has to be fetched by hand:
    // first the error recorded for the launch itself, then the one reported
    // once the device has finished the work.
    const cudaError_t launchStatus = cudaGetLastError();
    const cudaError_t execStatus = cudaDeviceSynchronize();

    if (launchStatus != cudaSuccess) {
        printf("Sync error: %s\n", cudaGetErrorString(launchStatus));
    }
    if (execStatus != cudaSuccess) {
        printf("Async error: %s\n", cudaGetErrorString(execStatus));
    }

    // Swap the buffers: this step's output is the next step's input.
    temp_tmp = temp1;
    temp1 = temp2;
    temp2 = temp_tmp;
}
上边那个玩意儿还不如 cudaGetLastError 有用hhh,另外我也懒得解释上边这几句都啥意思了,大家一看就懂,这个写法其实还可以封装一下,今天就只在这里做个记录
附录
原问题是将 step_kernel_mod 改写为核函数,并调用
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// Flatten a 2D coordinate (column c, row r) into a linear offset for an array
// stored row by row with `num` elements per row.
#define I2D(num, c, r) ((r)*(num)+(c))
/**
 * Apply one step of a five-point stencil update on the host.
 *
 * Every interior cell (1 <= i < ni-1, 1 <= j < nj-1) of `temp_out` receives
 * the matching `temp_in` value plus `fact` times the sum of the second
 * differences along both grid directions. Boundary cells are not written.
 *
 * @param ni       number of elements per row
 * @param nj       number of rows
 * @param fact     multiplier applied to the summed second differences
 * @param temp_in  input field, ni * nj values indexed through I2D
 * @param temp_out output field, same layout as temp_in
 */
void step_kernel_mod(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
    for (int row = 1; row < nj - 1; ++row) {
        for (int col = 1; col < ni - 1; ++col) {
            // Linear offsets of the cell and its four direct neighbours.
            const int centre = I2D(ni, col, row);
            const int left   = I2D(ni, col - 1, row);
            const int right  = I2D(ni, col + 1, row);
            const int below  = I2D(ni, col, row - 1);
            const int above  = I2D(ni, col, row + 1);

            // Second differences along each direction.
            const float d2tdx2 = temp_in[left] - 2*temp_in[centre] + temp_in[right];
            const float d2tdy2 = temp_in[below] - 2*temp_in[centre] + temp_in[above];

            temp_out[centre] = temp_in[centre] + fact*(d2tdx2 + d2tdy2);
        }
    }
}
/**
 * Host reference implementation of one five-point stencil step.
 *
 * For each interior cell (1 <= i < ni-1, 1 <= j < nj-1) the output is the
 * input value plus `fact` times the sum of the second differences along both
 * grid directions. Boundary cells of `temp_out` are left untouched.
 *
 * @param ni       number of elements per row
 * @param nj       number of rows
 * @param fact     multiplier applied to the summed second differences
 * @param temp_in  input field, ni * nj values indexed through I2D
 * @param temp_out output field, same layout as temp_in
 */
void step_kernel_ref(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
    for (int row = 1; row < nj - 1; ++row) {
        for (int col = 1; col < ni - 1; ++col) {
            // Linear offsets of the cell and its four direct neighbours.
            const int centre = I2D(ni, col, row);
            const int left   = I2D(ni, col - 1, row);
            const int right  = I2D(ni, col + 1, row);
            const int below  = I2D(ni, col, row - 1);
            const int above  = I2D(ni, col, row + 1);

            // Second differences along each direction.
            const float d2tdx2 = temp_in[left] - 2*temp_in[centre] + temp_in[right];
            const float d2tdy2 = temp_in[below] - 2*temp_in[centre] + temp_in[above];

            temp_out[centre] = temp_in[centre] + fact*(d2tdx2 + d2tdy2);
        }
    }
}
/**
 * Exercise driver (host-only version).
 *
 * Fills four ni x nj buffers with the same pseudo-random values, advances one
 * pair with step_kernel_ref and the other with step_kernel_mod for `nstep`
 * steps, then prints the largest absolute difference between the two results.
 *
 * @return 0 after the comparison has been printed, 1 if a buffer could not be
 *         allocated
 */
int main()
{
    int istep;
    const int nstep = 200;

    const int ni = 200;
    const int nj = 100;
    const float tfac = 8.418e-5f;

    float *temp1_ref, *temp2_ref, *temp1, *temp2, *temp_tmp;

    const size_t size = (size_t)ni * nj * sizeof(float);

    temp1_ref = (float*)malloc(size);
    temp2_ref = (float*)malloc(size);
    temp1 = (float*)malloc(size);
    temp2 = (float*)malloc(size);

    // A failed allocation would otherwise be dereferenced in the init loop.
    if (!temp1_ref || !temp2_ref || !temp1 || !temp2) {
        fprintf(stderr, "Failed to allocate %zu bytes per buffer.\n", size);
        free( temp1_ref );   // free(NULL) is a no-op
        free( temp2_ref );
        free( temp1 );
        free( temp2 );
        return 1;
    }

    // Same starting values in all four buffers, in the range [0, 100].
    for ( int i = 0; i < ni*nj; ++i ) {
        temp1_ref[i] = temp2_ref[i] = temp1[i] = temp2[i] = (float)rand()/(float)(RAND_MAX/100.0f);
    }

    // Reference run; the buffers swap roles after every step.
    for (istep = 0; istep < nstep; istep++) {
        step_kernel_ref(ni, nj, tfac, temp1_ref, temp2_ref);

        temp_tmp = temp1_ref;
        temp1_ref = temp2_ref;
        temp2_ref = temp_tmp;
    }

    // Run under test, same number of steps and same swapping scheme.
    for (istep = 0; istep < nstep; istep++) {
        step_kernel_mod(ni, nj, tfac, temp1, temp2);

        temp_tmp = temp1;
        temp1 = temp2;
        temp2 = temp_tmp;
    }

    // fabsf keeps the comparison in floating point; a bare abs() can resolve
    // to the integer version and truncate every sub-1.0 difference to 0.
    float maxError = 0;
    for ( int i = 0; i < ni*nj; ++i ) {
        const float diff = fabsf(temp1[i] - temp1_ref[i]);
        if (diff > maxError) { maxError = diff; }
    }

    if (maxError > 0.0005f)
        printf("Problem! The Max Error of %.5f is NOT within acceptable bounds.\n", maxError);
    else
        printf("The Max Error of %.5f is within acceptable bounds.\n", maxError);

    free( temp1_ref );
    free( temp2_ref );
    free( temp1 );
    free( temp2 );

    return 0;
}
NVIDIA 教程给的标准答案:
#include <stdio.h>
#include <math.h>
// Flatten a 2D coordinate (column c, row r) into a linear offset for an array
// stored row by row with `num` elements per row.
#define I2D(num, c, r) ((r)*(num)+(c))
/**
 * Device version of one five-point stencil step.
 *
 * Every interior cell (1 <= i < ni-1, 1 <= j < nj-1) of `temp_out` receives
 * the matching `temp_in` value plus `fact` times the sum of the second
 * differences along both grid directions. Boundary cells are not written.
 *
 * Launch layout: any 2D grid/block shape is valid. The x dimension walks `i`
 * (the contiguous index inside a row) so that neighbouring threads of a warp
 * read and write neighbouring addresses; the y dimension walks `j`. Both
 * directions use a stride equal to the total thread count of that dimension,
 * so a grid smaller than the field still covers every cell. No shared memory
 * is used.
 *
 * @param ni       number of elements per row
 * @param nj       number of rows
 * @param fact     multiplier applied to the summed second differences
 * @param temp_in  input field, ni * nj values indexed through I2D; must not
 *                 overlap temp_out
 * @param temp_out output field, same layout as temp_in
 */
__global__
void step_kernel_mod(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
    // Total number of threads along each dimension of the launch.
    const int stride_i = gridDim.x * blockDim.x;
    const int stride_j = gridDim.y * blockDim.y;

    const int first_i = blockIdx.x * blockDim.x + threadIdx.x;
    const int first_j = blockIdx.y * blockDim.y + threadIdx.y;

    for (int j = first_j; j < nj - 1; j += stride_j) {
        if (j < 1) continue;   // row 0 is boundary
        for (int i = first_i; i < ni - 1; i += stride_i) {
            if (i < 1) continue;   // column 0 is boundary

            // Linear offsets of the cell and its four direct neighbours.
            const int i00  = I2D(ni, i, j);
            const int im10 = I2D(ni, i-1, j);
            const int ip10 = I2D(ni, i+1, j);
            const int i0m1 = I2D(ni, i, j-1);
            const int i0p1 = I2D(ni, i, j+1);

            // Second differences along each direction.
            const float d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10];
            const float d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1];

            temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2);
        }
    }
}
/**
 * Host reference implementation of one five-point stencil step, used to
 * validate the device kernel.
 *
 * For each interior cell (1 <= i < ni-1, 1 <= j < nj-1) the output is the
 * input value plus `fact` times the sum of the second differences along both
 * grid directions. Boundary cells of `temp_out` are left untouched.
 *
 * @param ni       number of elements per row
 * @param nj       number of rows
 * @param fact     multiplier applied to the summed second differences
 * @param temp_in  input field, ni * nj values indexed through I2D
 * @param temp_out output field, same layout as temp_in
 */
void step_kernel_ref(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
    for (int row = 1; row < nj - 1; ++row) {
        for (int col = 1; col < ni - 1; ++col) {
            // Linear offsets of the cell and its four direct neighbours.
            const int centre = I2D(ni, col, row);
            const int left   = I2D(ni, col - 1, row);
            const int right  = I2D(ni, col + 1, row);
            const int below  = I2D(ni, col, row - 1);
            const int above  = I2D(ni, col, row + 1);

            // Second differences along each direction.
            const float d2tdx2 = temp_in[left] - 2*temp_in[centre] + temp_in[right];
            const float d2tdy2 = temp_in[below] - 2*temp_in[centre] + temp_in[above];

            temp_out[centre] = temp_in[centre] + fact*(d2tdx2 + d2tdy2);
        }
    }
}
/**
 * Exercise driver (GPU version).
 *
 * Fills two host buffers and two managed buffers with the same pseudo-random
 * values, advances the host pair with step_kernel_ref and the managed pair
 * with the step_kernel_mod kernel for `nstep` steps, then prints the largest
 * absolute difference between the two results.
 *
 * @return 0 after the comparison has been printed, 1 if an allocation failed
 *         or a kernel launch / execution reported an error
 */
int main()
{
    int istep;
    const int nstep = 200;

    const int ni = 200;
    const int nj = 100;
    const float tfac = 8.418e-5f;

    float *temp1_ref, *temp2_ref, *temp_tmp;
    float *temp1 = NULL;
    float *temp2 = NULL;

    const size_t size = (size_t)ni * nj * sizeof(float);

    temp1_ref = (float*)malloc(size);
    temp2_ref = (float*)malloc(size);

    // Managed memory: the same pointers are used by the host init loop, the
    // kernel, and the host comparison below.
    const cudaError_t alloc1 = cudaMallocManaged(&temp1, size);
    const cudaError_t alloc2 = cudaMallocManaged(&temp2, size);

    // Stop before touching any buffer that was not actually allocated.
    if (!temp1_ref || !temp2_ref || alloc1 != cudaSuccess || alloc2 != cudaSuccess) {
        if (alloc1 != cudaSuccess) { fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(alloc1)); }
        if (alloc2 != cudaSuccess) { fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(alloc2)); }
        if (!temp1_ref || !temp2_ref) { fprintf(stderr, "Failed to allocate %zu bytes of host memory.\n", size); }
        free( temp1_ref );   // free(NULL) is a no-op
        free( temp2_ref );
        if (alloc1 == cudaSuccess) { cudaFree( temp1 ); }
        if (alloc2 == cudaSuccess) { cudaFree( temp2 ); }
        return 1;
    }

    // Same starting values in all four buffers, in the range [0, 100].
    for ( int i = 0; i < ni*nj; ++i ) {
        temp1_ref[i] = temp2_ref[i] = temp1[i] = temp2[i] = (float)rand()/(float)(RAND_MAX/100.0f);
    }

    // Host reference run; the buffers swap roles after every step.
    for (istep = 0; istep < nstep; istep++) {
        step_kernel_ref(ni, nj, tfac, temp1_ref, temp2_ref);

        temp_tmp = temp1_ref;
        temp1_ref = temp2_ref;
        temp2_ref = temp_tmp;
    }

    // 32 x 16 = 512 threads per block. The grid is sized by ceiling division:
    // x from nj, y from ni.
    dim3 tblocks(32, 16, 1);
    dim3 grid((nj + tblocks.x - 1) / tblocks.x, (ni + tblocks.y - 1) / tblocks.y, 1);
    cudaError_t ierrSync, ierrAsync;

    int gpuFailed = 0;
    for (istep = 0; istep < nstep; istep++) {
        step_kernel_mod<<< grid, tblocks >>>(ni, nj, tfac, temp1, temp2);

        // A launch returns nothing: fetch the launch status explicitly, then
        // wait for the device so execution errors surface on this step.
        ierrSync = cudaGetLastError();
        ierrAsync = cudaDeviceSynchronize();

        if (ierrSync != cudaSuccess) { printf("Sync error: %s\n", cudaGetErrorString(ierrSync)); }
        if (ierrAsync != cudaSuccess) { printf("Async error: %s\n", cudaGetErrorString(ierrAsync)); }

        // Once a step has failed the buffers no longer hold a valid result, so
        // do not keep launching or repeat the same message for every step.
        if (ierrSync != cudaSuccess || ierrAsync != cudaSuccess) {
            gpuFailed = 1;
            break;
        }

        temp_tmp = temp1;
        temp1 = temp2;
        temp2 = temp_tmp;
    }

    if (gpuFailed) {
        free( temp1_ref );
        free( temp2_ref );
        cudaFree( temp1 );
        cudaFree( temp2 );
        return 1;
    }

    // fabsf keeps the comparison in floating point; a bare abs() can resolve
    // to the integer version and truncate every sub-1.0 difference to 0.
    float maxError = 0;
    for ( int i = 0; i < ni*nj; ++i ) {
        const float diff = fabsf(temp1[i] - temp1_ref[i]);
        if (diff > maxError) { maxError = diff; }
    }

    if (maxError > 0.0005f)
        printf("Problem! The Max Error of %.5f is NOT within acceptable bounds.\n", maxError);
    else
        printf("The Max Error of %.5f is within acceptable bounds.\n", maxError);

    free( temp1_ref );
    free( temp2_ref );
    cudaFree( temp1 );
    cudaFree( temp2 );

    return 0;
}
|