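// The key point is how the threads are assigned: the large matrix is divided
// into tiles, and each thread's ny and nx denote its matrix row and column
// respectively. Because a matrix is normally stored in a one-dimensional
// array, the row and column must finally be converted into a one-dimensional
// index in order to address global memory.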
#include <stdio.h>
#include <stdlib.h>
// Edge length of the square matrix; each buffer holds N * N ints in a flat array.
const int N = 128;
// Edge length of the square thread block (tile) used to cover the matrix.
const int TILE_DIM = 32;
// Size in bytes of one N x N int matrix.
const int SIZE = N * N * static_cast<int>(sizeof(int));
void __global__ copy(int *da, int *db, const int N);

/*
 * Copies an N x N int matrix from da to db, one element per thread.
 *
 * Each thread derives its row (ny) and column (nx) from its block and thread
 * indices and flattens them to the offset ny * N + nx. The block offset is
 * computed with the compile-time TILE_DIM rather than blockDim, so the kernel
 * must be launched with TILE_DIM x TILE_DIM thread blocks and a grid of at
 * least ceil(N / TILE_DIM) blocks in both x and y to cover the whole matrix.
 *
 * da - source buffer (read), at least N * N ints
 * db - destination buffer (written), at least N * N ints
 * N  - matrix edge length; this parameter shadows the file-scope N
 */
void __global__ copy(int *da, int *db, const int N)
{
    const int ny = blockIdx.y * TILE_DIM + threadIdx.y; // row
    const int nx = blockIdx.x * TILE_DIM + threadIdx.x; // column

    // Threads that fall outside the matrix (partial tiles at the right and
    // bottom edges) have no element to copy.
    if (ny >= N || nx >= N)
    {
        return;
    }

    // Flatten only after the bounds check and in 64-bit arithmetic, so that
    // ny * N cannot overflow a signed int once N exceeds 46340.
    const size_t index = static_cast<size_t>(ny) * N + nx;
    db[index] = da[index];
}
/* Prints a diagnostic to stderr and terminates the process if a CUDA runtime call failed. */
static void checkCudaOrExit(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills an N x N host matrix with 100, copies it to the device, runs the
 * copy kernel into a second device buffer, copies that buffer back and
 * prints element 100 of the result.
 *
 * Returns 0 on success. A failed host allocation returns EXIT_FAILURE; a
 * failed CUDA call exits the process with EXIT_FAILURE after printing the
 * CUDA error string.
 */
int main(int argc, char *argv[])
{
    (void)argc; // command-line arguments are not used
    (void)argv;

    int *ha = (int *)malloc(SIZE);
    int *hb = (int *)malloc(SIZE);
    if (ha == NULL || hb == NULL)
    {
        fprintf(stderr, "host allocation of %d bytes failed\n", SIZE);
        free(ha); // free(NULL) is a no-op, so this is safe for either pointer
        free(hb);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N * N; ++i)
    {
        ha[i] = 100;
    }

    int *da = NULL;
    int *db = NULL;
    checkCudaOrExit(cudaMalloc((void **)&da, SIZE), "cudaMalloc(da)");
    checkCudaOrExit(cudaMalloc((void **)&db, SIZE), "cudaMalloc(db)");
    checkCudaOrExit(cudaMemcpy(da, ha, SIZE, cudaMemcpyHostToDevice), "host-to-device copy");

    // One thread per element: TILE_DIM x TILE_DIM threads per block, and
    // enough blocks (rounded up) to cover N in each dimension.
    const dim3 block_size(TILE_DIM, TILE_DIM);
    const int grid_size_x = (N + TILE_DIM - 1) / TILE_DIM;
    const int grid_size_y = grid_size_x;
    const dim3 grid_size(grid_size_x, grid_size_y);

    copy<<<grid_size, block_size>>>(da, db, N);
    // A kernel launch returns nothing itself; configuration errors are
    // reported through cudaGetLastError().
    checkCudaOrExit(cudaGetLastError(), "copy kernel launch");

    // The blocking copy also surfaces any error raised while the kernel ran.
    checkCudaOrExit(cudaMemcpy(hb, db, SIZE, cudaMemcpyDeviceToHost), "device-to-host copy");

    printf("%d\n", hb[100]);

    free(ha);
    free(hb);
    checkCudaOrExit(cudaFree(da), "cudaFree(da)");
    checkCudaOrExit(cudaFree(db), "cudaFree(db)");
    return 0;
}
|