系列文章目录

文章目录

系列文章目录
前言
一、统一虚拟寻址
- 1. 统一虚拟寻址技术
- 2. 示例程序
二、统一内存寻址
- 1.统一内存寻址技术
- 2.示例程序
总结
参考资料

前言

这里开始介绍内存管理中的统一虚拟寻址和统一内存寻址技术的相关知识点。

一、统一虚拟寻址

1. 统一虚拟寻址技术

只有计算力在2.0及以上版本的设备支持一种特殊的寻址方式，即为统一虚拟寻址（UVA）。有了UVA，主机内存和设备内存共享统一虚拟地址空间，说白了，和之前介绍的零拷贝内存技术的功能相同，不需要 $c u d a M e m c p y$ 函数完成主机内存与设备内存数据的相互传输，即可以在主机和设备中都可以直接读写。
在这里插入图片描述通过UVA, 有 $c u d a H o s t A l l o c$ 分配的固定主机内存具有相同的主机和设备指针。

与零拷贝内存相比，使用UVA无须获取设备指针或管理物理上数据完全相同的两个指针。从 $c u d a H o s t A l l o c$ 函数返回的指针可以直接传递给核函数，这样减少了代码量，提高了应用程序的可读性和可维护性。

2. 示例程序

#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <device_functions.h>

void InitData(float* data, size_t nElem)
{
	for (size_t i = 0; i < nElem; i++)
	{
		data[i] = i % 255;
	}
}

void SumArraysOnHost(float* h_A, float* h_B, float* hostRef, size_t nElem)
{
	for (size_t i = 0; i < nElem; i++)
	{
		hostRef[i] = h_A[i] + h_B[i];
	}
}

void CheckResults(float* hostRef, float* gpuRef, size_t nElem)
{
	bool bSame = true;
	for (size_t i = 0; i < nElem; i++)
	{
		if (abs(gpuRef[i] - hostRef[i]) > 1e-5)
		{
			bSame = false;
		}
	}

	if (bSame)
	{
		printf("Result is correct!\n");
	}
	else
	{
		printf("Result is error!\n");
	}
}

__global__ void GpuSumArrays(float* d_A, float* d_B, float* d_C, size_t nElem)
{
	int tid = blockDim.x * blockIdx.x + threadIdx.x;
	if (tid < nElem)
		d_C[tid] = d_A[tid] + d_B[tid];
}




int main()
{
	int nDev = 0;
	cudaSetDevice(nDev);

	cudaDeviceProp stDeviceProp;
	cudaGetDeviceProperties(&stDeviceProp, nDev);

	//check whether support mapped memory
	if (!stDeviceProp.canMapHostMemory)
	{
		printf("Device %d does not support mapping CPU host memory!\n", nDev);
		goto EXIT;
	}

	printf("Using device %d: %s\n", nDev, stDeviceProp.name);

	// set up data size of vector
	int nPower = 10;
	int nElem = 1 << nPower;
	size_t nBytes = nElem * sizeof(float);
	if (nPower < 18) {
		printf("Vector size %d power %d nbytes %3.0f KB\n",
			nElem, nPower, (float)nBytes / (1024.0f));
	}
	else {
		printf("Vector size %d power %d nbytes %3.0f MB\n",
			nElem, nPower, (float)nBytes / (1024.0f * 1024.0f));
	}

	// part 1: using device memory
	// malloc host memory
	float *h_A, *h_B, *hostRef, *gpuRef;
	h_A = (float*)malloc(nBytes);
	h_B = (float*)malloc(nBytes);
	hostRef = (float*)malloc(nBytes);
	gpuRef = (float*)malloc(nBytes);

	// initialize data at host side
	InitData(h_A, nElem);
	InitData(h_B, nElem);
	memset(hostRef, 0, nBytes);
	memset(gpuRef, 0, nBytes);

	// add vector at host side for result checks
	SumArraysOnHost(h_A, h_B, hostRef, nElem);

	// malloc device global memory
	float* d_A, *d_B, *d_C;
	cudaMalloc(&d_A, nBytes);
	cudaMalloc(&d_B, nBytes);
	cudaMalloc(&d_C, nBytes);

	//transfer data from host to device
	cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);

	// set up execution configuration
	int nLen = 512;
	dim3 block(nLen);
	dim3 grid((nElem + block.x - 1) / block.x);

	//invoke kernel at host side
	GpuSumArrays << <grid, block >> > (d_A, d_B, d_C, nElem);

	//copy kernel result back to host side
	cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);

	//check device results
	CheckResults(hostRef, gpuRef, nElem);

	// free device globl memory
	cudaFree(d_A);
	cudaFree(d_B);
	free(h_A);
	free(h_B);

	// part2: using UVA - global memory for array A , B, C
	// allocate UVA memory
	unsigned int nFlags = cudaHostAllocMapped;
	cudaHostAlloc(&h_A, nBytes, nFlags);
	cudaHostAlloc(&h_B, nBytes, nFlags);
	//cudaHostAlloc(&h_C, nBytes, nFlags);

	// initialize data at host side
	InitData(h_A, nElem);
	InitData(h_B, nElem);
	//InitData(h_C, nElem);
	memset(hostRef, 0, nBytes);
	//memset(h_C, 0, nBytes);

	// pass the pointer to device
	//cudaHostGetDevicePointer(&d_A, h_A, 0);
	//cudaHostGetDevicePointer(&d_B, h_B, 0);

	// add at host side for result checks
	SumArraysOnHost(h_A, h_B, hostRef, nElem);

	//execute kernle with zero copy memory
	GpuSumArrays << <grid, block >> > (h_A, h_B, d_C, nElem);

	//copy kernel result back to host side
	cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);

	//check device results
	CheckResults(hostRef, gpuRef, nElem);

	// free memory
	cudaFreeHost(h_A);
	cudaFreeHost(h_B);
	
	free(hostRef);
	free(gpuRef);
	cudaFree(d_C);

EXIT:
	cudaDeviceReset();

	system("pause");
	return 0;
}

运行结果：
在这里插入图片描述
这里有个疑问，统一虚拟寻址只支持设备函数读UVA中的数据，无法写UVA数据吗？个人将设备函数调用由：

GpuSumArrays << <grid, block >> > (h_A, h_B, d_C, nElem);

替换为：

GpuSumArrays << <grid, block >> > (h_A, h_B, h_C, nElem);

其中 $h\_C$ 变量的分配内存方式和 $h\_A,h\_B$ 相同，运行结果出错，发现 $h_C$ 变量存储的值没被设备函数改写。

二、统一内存寻址

1.统一内存寻址技术

统一内存寻址用于简化CUDA编程模型中的内存管理。统一内存中创建了一个托管内存池，内存池中已分配的内存空间可以用相同的内存地址在CPU和GPU上访问。底层系统在统一内存空间中自动在主机和设备间进行数据传输，而这种传输对于应用程序而言是透明的，简化了程序代码。

统一内存寻址依赖于UVA支持，但是完全不同的技术。UVA为系统中的所有处理器提供一个单一的虚拟内存地址空间。但是UVA不会自动将数据从一个物理位置转移到另一个位置，但是统一内存寻址有这个作用。

统一内存寻址提供了一个“单指针到数据”模型，功能上类似零拷贝内存，但是零拷贝内存在主机内存中进行分配，由于收到PCIe总线上访问零拷贝内存影响，核函数性能具有较高延迟。

统一内存寻址将内存和执行空间分离，可以根据需要将数据透明地传输到主机或设备上，以提升局部性和性能。

托管内存是指底层系统自动分配的统一内存，可以与设备的分配内存互操作，因此可以在核函数中使用两类内存：
（1）由系统控制的托管内存
（2）由应用程序明确分配和调用的未托管内存

所有在设备内存上有效地CUDA操作同样适用于托管内存。主机也能引用和访问托管内存。

托管内存可被静态分配，也可被动态分配。
静态分配使用修饰符 $_ _ \_\_managed\_\_$ 将设备变量作为托管变量，这个变量可从主机或设备代码中直接被引用：
$_\_device\_\_ \_\_managed\_\_\ int\ y;$

动态分配可采用CUDA运行时函数实现：
$cudaError\_t\ cudaMallocManaged(void\ **devPtr,\ size\_t\ size,\ unsigned\ int\ flags=0);$

上述函数返回的指针在所有设备和主机上都有效，使用托管内存的程序可以利用自动数据传输和重复指针消除功能。

所有的托管内存必须在主机端动态声明或者在全局范围内静态声明。

2.示例程序

#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <device_functions.h>

void InitData(float* data, size_t nElem)
{
	for (size_t i = 0; i < nElem; i++)
	{
		data[i] = i % 255;
	}
}

void SumArraysOnHost(float* h_A, float* h_B, float* hostRef, size_t nElem)
{
	for (size_t i = 0; i < nElem; i++)
	{
		hostRef[i] = h_A[i] + h_B[i];
	}
}

void CheckResults(float* hostRef, float* gpuRef, size_t nElem)
{
	bool bSame = true;
	for (size_t i = 0; i < nElem; i++)
	{
		if (abs(gpuRef[i] - hostRef[i]) > 1e-5)
		{
			bSame = false;
		}
	}

	if (bSame)
	{
		printf("Result is correct!\n");
	}
	else
	{
		printf("Result is error!\n");
	}
}

__global__ void GpuSumArrays(float* d_A, float* d_B, float* d_C, size_t nElem)
{
	int tid = blockDim.x * blockIdx.x + threadIdx.x;
	if (tid < nElem)
		d_C[tid] = d_A[tid] + d_B[tid];
}




int main()
{
	int nDev = 0;
	cudaSetDevice(nDev);

	cudaDeviceProp stDeviceProp;
	cudaGetDeviceProperties(&stDeviceProp, nDev);

	cudaGetDevice(&nDev);

	//check whether support mapped memory
	if (!stDeviceProp.canMapHostMemory)
	{
		printf("Device %d does not support mapping CPU host memory!\n", nDev);
		goto EXIT;
	}

	printf("Using device %d: %s\n", nDev, stDeviceProp.name);

	// set up data size of vector
	int nPower = 10;
	int nElem = 1 << nPower;
	size_t nBytes = nElem * sizeof(float);
	if (nPower < 18) {
		printf("Vector size %d power %d nbytes %3.0f KB\n",
			nElem, nPower, (float)nBytes / (1024.0f));
	}
	else {
		printf("Vector size %d power %d nbytes %3.0f MB\n",
			nElem, nPower, (float)nBytes / (1024.0f * 1024.0f));
	}

	// part 1: using device memory
	// malloc host memory
	float *h_A, *h_B, *h_C, *hostRef, *gpuRef;
	h_A = (float*)malloc(nBytes);
	h_B = (float*)malloc(nBytes);
	hostRef = (float*)malloc(nBytes);
	gpuRef = (float*)malloc(nBytes);

	// initialize data at host side
	InitData(h_A, nElem);
	InitData(h_B, nElem);
	memset(hostRef, 0, nBytes);
	memset(gpuRef, 0, nBytes);

	// add vector at host side for result checks
	SumArraysOnHost(h_A, h_B, hostRef, nElem);

	// malloc device global memory
	float* d_A, *d_B, *d_C;
	cudaMalloc(&d_A, nBytes);
	cudaMalloc(&d_B, nBytes);
	cudaMalloc(&d_C, nBytes);

	//transfer data from host to device
	cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);

	// set up execution configuration
	int nLen = 512;
	dim3 block(nLen);
	dim3 grid((nElem + block.x - 1) / block.x);

	//invoke kernel at host side
	GpuSumArrays << <grid, block >> > (d_A, d_B, d_C, nElem);

	//copy kernel result back to host side
	cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);

	//check device results
	CheckResults(hostRef, gpuRef, nElem);

	// free device globl memory
	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);
	free(h_A);
	free(h_B);

	// part2: using UMA - managed memory for array A , B, C
	// allocate UMA memory
	cudaError_t err = cudaMallocManaged(&h_A, nBytes);
	if (err != cudaSuccess)
	{
		printf("Not support cudaMallocManaged!\n");
		goto EXIT;
	}

	cudaMallocManaged(&h_B, nBytes, cudaMemAttachGlobal);
	cudaMallocManaged(&h_C, nBytes, cudaMemAttachGlobal);


	// initialize data at host side
	InitData(h_A, nElem);
	InitData(h_B, nElem);
	InitData(h_C, nElem);
	memset(hostRef, 0, nBytes);
	
	// add at host side for result checks
	SumArraysOnHost(h_A, h_B, hostRef, nElem);

	//execute kernle with zero copy memory
	GpuSumArrays << <grid, block >> > (h_A, h_B, h_C, nElem);

	// must be add the code before access the unified managed memory, 
	// otherwise will throw undefined exception
	cudaDeviceSynchronize();

	//check device results
	CheckResults(hostRef, h_C, nElem);

	// free memory
	cudaFreeHost(h_A);
	cudaFreeHost(h_B);
	cudaFreeHost(h_C);
	
	
	free(hostRef);
	free(gpuRef);

EXIT:
	cudaDeviceReset();

	system("pause");
	return 0;
}