前言

今天学习了如何使用CUDA获取GPU设备的属性信息，这里做个学习笔记并分享给大家。

1. 学习内容

CUDA获取设备数量与设备名称，CUDA获取设备通用属性信息，CUDA获取设备内存相关属性信息，CUDA获取设备线程相关属性信息等。

2. VS2017 CUDA代码实现

#include <stdio.h>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

int main()
{
	//1. 获取CUDA支持的设备数量
	int device_count = 0;
	// get the count of number of CUDA enable devices , if no CUDA capable devices, get 0
	cudaGetDeviceCount(&device_count);

	if (device_count == 0)
	{
		std::cout << "There are no available device(s) that support CUDA" << std::endl;
	}
	else
	{
		std::cout << "Detected < " << device_count <<" > CUDA Capable device(s)\n" << std::endl;
	}

	// 2. GPU通用属性信息
	cudaDeviceProp device_Property;
	int device = 0;
	cudaGetDevice(&device);
	cudaGetDeviceProperties(&device_Property, device);
	printf("Device %d: \"%s\"\n", device, device_Property.name);

	int driverVersion = 0;
	int runtimeVersion = 0;

	cudaDriverGetVersion(&driverVersion);
	cudaRuntimeGetVersion(&runtimeVersion);
	printf("CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion / 1000,
		(driverVersion % 100) / 10, runtimeVersion / 1000, (runtimeVersion % 100) / 10);
	
	printf("(%2d) Multiprocessors\n", device_Property.multiProcessorCount);

	printf("GPU Max Clock rate: %.0f MHz(%0.2f GHz)\n", device_Property.clockRate * 1e-3f, device_Property.clockRate * 1e-6f);

	// 3. GPU内存相关属性
	// GPU内存为分层结构，分为L1缓存， L2缓存，全局内存，纹理内存和共享内存

	//全局内存大小
	printf("Total amount of global memory: %.0f MBytes(%llu bytes)\n",
		(float)device_Property.totalGlobalMem / 1048576.0f, (unsigned long long)device_Property.totalGlobalMem);

	//显存频率
	printf("Memory Clock rate: %.0f MHz\n", device_Property.memoryClockRate * 1e-3f);

	//显存带宽
	printf("Memory Bus Width: %d-bit\n", device_Property.memoryBusWidth);

	//L2缓存大小
	if (device_Property.l2CacheSize)
	{
		printf("L2 Cache Size : %d bytes\n", device_Property.l2CacheSize);
	}

	//设备中总常量显存
	printf("Total amount of constant memory: %lu bytes\n", device_Property.totalConstMem);

	//设备中每个块的最大可用共享缓存
	printf("Total amount of shared memory per block: %lu bytes\n", device_Property.sharedMemPerBlock);

	//设备中每个块最大可用寄存器数量
	printf("Total number of registers available per block: %d\n", device_Property.regsPerBlock);

	//4. 线程相关属性

	//块和线程可以是多维的，最好知道每个维度中可以并行启动多少线程和块，每个多处理器的线程数量和每个块的线程数量有限制
	//在内核参数配置中，如果每个块中启动的线程数量超过每个块中可能的最大线程数量，则程序可能崩溃。

	//每个多处理器线程数量
	printf("Maximum number of threads per multiprocessor: %d\n", device_Property.maxThreadsPerMultiProcessor);

	//每个块最大线程数
	printf("Maximum number of threads per block: %d\n", device_Property.maxThreadsPerBlock);

	//每个块中各个维度最大线程数量
	printf("Max dimension size of a thread block (x, y, z):(%d, %d, %d)\n",
		device_Property.maxThreadsDim[0],
		device_Property.maxThreadsDim[1],
		device_Property.maxThreadsDim[2]);

	//每个维度中每个网格的最大维度大小
	printf("Max dimension size of a grid size (x, y, z): (%d, %d, %d)\n",
		device_Property.maxGridSize[0],
		device_Property.maxGridSize[1],
		device_Property.maxGridSize[2]);

	//为什么要了解设备属性呢？答案就是如果有多个GPU设备，将帮助选择更多多处理的GPU设备

	//设备属性的一种用法：假设有一个应用程序，要求双精度浮点操作。并不是所有的GPU设备都支持这种操作。
	//确定设备是否支持双精度浮点操作并为应用程序设置该设备
	int device1;
	cudaGetDevice(&device1);

	cudaDeviceProp deviceProp;
	memset(&deviceProp, 0, sizeof(cudaDeviceProp));
	//major > 1 && minor > 3, 该设备支持双精度操作
	deviceProp.major = 1;
	deviceProp.minor = 3;
	cudaChooseDevice(&device1, &deviceProp);
	printf("ID of device which support double precision is :%d\n",
		device1);
	cudaSetDevice(device1);

	system("pause");
	return 0;
}