环境搭建
环境搭建参考文档 TensorRT使用入门
简介
本节主要介绍使用tensorrt来计算conv2d,主要介绍 addConvolutionNd接口的使用,以及IConvolutionLayer中参数的设置。
代码
建议先熟悉环境搭建文档中的demo,熟悉基本的c++ api使用。主要代码如下。 先引入必要的头文件
#include "NvInfer.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <vector>
#include <sstream>
#include <assert.h>
using namespace nvinfer1;
#define DEFAULT_VALUE 1.0
定义logger
// Minimal ILogger implementation: prints messages at WARNING severity or
// worse to stdout and silently drops INFO/VERBOSE output.
class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // Lower enum values are more severe, so "<= kWARNING" keeps
        // kINTERNAL_ERROR, kERROR and kWARNING.
        const bool important = severity <= Severity::kWARNING;
        if (important)
        {
            std::cout << msg << std::endl;
        }
    }
};
定义一些辅助函数
// Returns the number of elements a tensor of shape `dims` holds,
// i.e. the product of all dimension extents.
size_t ProductOfDims(Dims dims) {
    size_t result = 1;
    // Dims::nbDims is a signed int32_t: index with a signed type to avoid a
    // signed/unsigned comparison (and an effectively unbounded loop if
    // nbDims were ever negative, e.g. from an uninitialized Dims).
    for (int32_t i = 0; i < dims.nbDims; i++) {
        result *= dims.d[i];
    }
    return result;
}
// Renders `dims` as a space-separated list, e.g. "1 3 8 8 "
// (the original trailing space is kept intentionally).
std::string DimsToStr(Dims dims) {
    std::stringstream ss;
    // nbDims is signed (int32_t); use a matching signed index.
    for (int32_t i = 0; i < dims.nbDims; i++) {
        ss << dims.d[i] << " ";
    }
    return ss.str();
}
// Computes the NCHW output shape of a 2-D convolution with symmetric
// padding: O = floor((I + 2*P - K) / S) + 1 per spatial axis.
Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, DimsHW paddings) {
    // Input channels must match the filter's channel dimension, and the
    // filter's spatial extents must agree with kernel_size.
    assert(input.d[1] == filter.d[1]);
    assert(filter.d[2] == kernel_size.d[0]);
    assert(filter.d[3] == kernel_size.d[1]);

    Dims4 output;
    output.d[0] = input.d[0];  // batch
    output.d[1] = filter.d[0]; // output channels
    for (int32_t axis = 0; axis < 2; ++axis) {
        const int32_t extent = input.d[2 + axis];
        const int32_t padded = extent + 2 * paddings.d[axis];
        output.d[2 + axis] = (padded - kernel_size.d[axis]) / stride.d[axis] + 1;
    }
    return output;
}
三个函数的意思显而易见,ProductOfDims计算输入Dims各维度数值的乘积(即元素个数),CalculateConv2dOutput为了计算conv2d的输出shape。注意:CalculateConv2dOutput函数目前只考虑下paddings,未考虑prepadding, postpadding以及padding mode。这在后面会讨论到。
创建一个Network
int main() {
Logger logger;
IBuilder* builder = createInferBuilder(logger);
uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
INetworkDefinition* network = builder->createNetworkV2(flag);
定义conv2d的输入以及一些必要属性的shape信息
Dims4 input_shape{3, 3, 8, 8};
Dims4 filter_shape{3, 3, 2, 2};
DimsHW kernel_size{2, 2};
DimsHW stride{1, 1};
DimsHW paddings{1, 1};
Dims4 output_shape = CalculateConv2dOutput(input_shape, filter_shape, kernel_size, stride, paddings);
auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
添加一个convolution layer,设置stride, padding,
std::vector<float>filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
Weights filter_w{DataType::kFLOAT, filter.data(), filter.size()};
Weights bias_w{DataType::kFLOAT, nullptr, 0};
int32_t output_channel = filter_shape.d[0];
auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
conv2d->setStrideNd(stride);
conv2d->setPaddingNd(paddings);
完整代码见附录1
设置prepadding和postpadding
2d conv的padding有四个方向,上左下右。 前面demo中设置的padding是二维的,即HW两个方向,由于IConvolutionLayer中padding这个属性是对称的,所以padding[0]是上下padding, padding[1]左右padding。
但是实际应用中padding不一定是对称的,即上下左右padding可能各不相同。这时就需要设置IConvolutionLayer中的prepadding和postpadding属性。对于2d conv,prepadding[0]和prepadding[1]分别表示上padding,左padding,postpadding[0]和postpadding[1]分别表示下padding,右padding。 设置prepadding 和 postpadding 的demo如附录2, 我们简化下输入:
Dims4 input_shape{1, 1, 2, 2};
Dims4 filter_shape{1, 1, 2, 2};
DimsHW kernel_size{2, 2};
DimsHW stride{1, 1};
Dims2 prepadding{1,0};
Dims2 postpadding{0,1};
这里定义了上padding和右padding为1。同时conv2d的layer设置对应的属性
auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
conv2d->setStrideNd(stride);
conv2d->setPrePadding(prepadding);
conv2d->setPostPadding(postpadding);
同时注意CalculateConv2dOutput的修改
// Computes the NCHW output shape of a 2-D convolution with asymmetric
// padding: O = floor((I + Ppre + Ppost - K) / S) + 1 per spatial axis.
// prepadding = {top, left}, postpadding = {bottom, right}.
Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, Dims2 prepadding, Dims2 postpadding) {
    // Input channels must match the filter, and the filter's spatial
    // extents must agree with kernel_size.
    assert(input.d[1] == filter.d[1]);
    assert(filter.d[2] == kernel_size.d[0]);
    assert(filter.d[3] == kernel_size.d[1]);

    Dims4 output;
    output.d[0] = input.d[0];  // batch
    output.d[1] = filter.d[0]; // output channels
    for (int32_t axis = 0; axis < 2; ++axis) {
        const int32_t extent = input.d[2 + axis];
        const int32_t padded = extent + prepadding.d[axis] + postpadding.d[axis];
        output.d[2 + axis] = (padded - kernel_size.d[axis]) / stride.d[axis] + 1;
    }
    return output;
}
最后的运行结果如下:
output shape : 1 1 2 2
output data :
2 1 4 2
完整代码见附录2
prepadding、postpadding与paddingNd冲突时
通过前面的介绍可知,IConvolutionLayer有三个接口设置不同情况的padding,即
conv2d->setPaddingNd(padding);
conv2d->setPrePadding(prepadding);
conv2d->setPostPadding(postpadding);
如果三个同时设置了,靠后的设置会覆盖之前的设置。比如上面这种情况,prepadding和postpadding有效,padding无效。读者可自行实验验证。根本原因是:调用setPaddingNd时,内部会调用接口setPrePadding和setPostPadding。也就是说,setPaddingNd接口更改的底层数据和setPrePadding、setPostPadding是同一份数据(prepadding, postpadding)。
我们可以通过一个小case验证一下:
DimsHW padding{1, 1};
conv2d->setPaddingNd(padding);
std::cout << DimsToStr(conv2d->getPaddingNd()) << "\n";
std::cout << DimsToStr(conv2d->getPrePadding()) << "\n";
std::cout << DimsToStr(conv2d->getPostPadding()) << "\n";
输出为:
1 1
1 1
1 1
设置padding mode
IConvolutionLayer设置padding mode的api是
conv2d->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_DOWN);
PaddingMode主要包括三种情况,kEXPLICIT_ROUND_DOWN是默认值,此时计算方式就是前面写的CalculateConv2dOutput中的计算方式。 PaddingMode详细信息可以参考官方文档: PaddingMode官方文档 注意,从图中公式可知,padding mode为某些场景时,会自动计算prepadding,postpadding(虚拟的值,实际计算o时,如ceil(I/S),不需要pre/post padding的值),这种情况下padding mode的优先级比手动设置的prepadding, postpadding更高。(原文 padding mode takes precedence if setPaddingMode() and setPrePadding() are also used.)意思是当你设置的padding mode需要自己推导prepadding, postpadding时,通过setPaddingNd,setPrePadding,setPostPadding这些接口设置的prepadding,postpadding值无效。读者可自行验证。
总结
经过上述的描述后,我们知道,关于padding,本质内部只有两个数据结构,prepadding和postpadding,setPaddingNd(),setPrePadding(),setPostPadding()这三个API都会去设置这两个数据结构,同时,当padding mode为某些场景时,也会去自动设置这两个数据,并且拥有更高的优先级。基于上述讨论,我们可以完善CalculateConv2dOutput()这个函数的逻辑。
// Computes the NCHW output shape of a 2-D convolution under a TensorRT
// PaddingMode:
//   kEXPLICIT_ROUND_DOWN (default): O = floor((I + Ppre + Ppost - K) / S) + 1
//   kEXPLICIT_ROUND_UP:             O = ceil((I + Ppre + Ppost - K) / S) + 1
//   kSAME_LOWER / kSAME_UPPER:      output depends only on I and S; the
//                                   explicit pre/post padding values are ignored.
// prepadding = {top, left}, postpadding = {bottom, right}.
Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, Dims2 prepadding,
                            Dims2 postpadding, PaddingMode paddingmode = PaddingMode::kEXPLICIT_ROUND_DOWN) {
    assert(input.d[1] == filter.d[1]);
    assert(filter.d[2] == kernel_size.d[0]);
    assert(filter.d[3] == kernel_size.d[1]);
    Dims4 output;
    output.d[0] = input.d[0];  // batch
    output.d[1] = filter.d[0]; // output channels
    int32_t kh = kernel_size.d[0];
    int32_t kw = kernel_size.d[1];
    int32_t sh = stride.d[0];
    int32_t sw = stride.d[1];
    int32_t uppadding = prepadding.d[0];
    int32_t leftpadding = prepadding.d[1];
    int32_t downpadding = postpadding.d[0];
    int32_t rightpadding = postpadding.d[1];
    switch (paddingmode)
    {
    case PaddingMode::kEXPLICIT_ROUND_UP:
        // ceil((I + P - K) / S) + 1, written with integer arithmetic.
        output.d[2] = ((input.d[2] + uppadding + downpadding - kh + sh - 1) / sh) + 1;
        output.d[3] = ((input.d[3] + leftpadding + rightpadding - kw + sw - 1) / sw) + 1;
        break;
    case PaddingMode::kSAME_LOWER:
        // floor(I / S). The original added a spurious "+ 1".
        // NOTE(review): TensorRT's docs define both SAME modes as
        // O = ceil(I / S) (they differ only in where the extra padding
        // goes); floor follows this article's text - verify against the
        // official PaddingMode documentation.
        output.d[2] = input.d[2] / sh;
        output.d[3] = input.d[3] / sw;
        break;
    case PaddingMode::kSAME_UPPER:
        // ceil(I / S) = (I + S - 1) / S. The original wrote
        // "I + sh - 1 / sh", which parses as I + sh - (1/sh) due to
        // operator precedence, and also added a spurious "+ 1".
        output.d[2] = (input.d[2] + sh - 1) / sh;
        output.d[3] = (input.d[3] + sw - 1) / sw;
        break;
    default: // kEXPLICIT_ROUND_DOWN
        output.d[2] = ((input.d[2] + uppadding + downpadding - kh) / sh) + 1;
        output.d[3] = ((input.d[3] + leftpadding + rightpadding - kw) / sw) + 1;
        break;
    }
    return output;
}
代码中如 output.d[2] = input.d[2] / sh 对应公式 floor(I / S), output.d[2] = (input.d[2] + sh - 1) / sh 则对应向上取整的 ceil(I / S)。注意整数运算中 (I + S - 1) 必须加括号后再除以 S,否则按运算符优先级会先计算 1 / S。其他paddingmode的计算方式读者可自行添加。 完整测试代码见附录3
附录
附录1
#include "NvInfer.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <vector>
#include <sstream>
#include <assert.h>
using namespace nvinfer1;
#define DEFAULT_VALUE 1.0
// Minimal ILogger implementation: prints messages at WARNING severity or
// worse to stdout and silently drops INFO/VERBOSE output.
class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // Lower enum values are more severe, so "<= kWARNING" keeps
        // kINTERNAL_ERROR, kERROR and kWARNING.
        const bool important = severity <= Severity::kWARNING;
        if (important)
        {
            std::cout << msg << std::endl;
        }
    }
};
// Returns the number of elements a tensor of shape `dims` holds,
// i.e. the product of all dimension extents.
size_t ProductOfDims(Dims dims) {
    size_t result = 1;
    // Dims::nbDims is a signed int32_t: index with a signed type to avoid a
    // signed/unsigned comparison (and an effectively unbounded loop if
    // nbDims were ever negative, e.g. from an uninitialized Dims).
    for (int32_t i = 0; i < dims.nbDims; i++) {
        result *= dims.d[i];
    }
    return result;
}
// Renders `dims` as a space-separated list, e.g. "1 3 8 8 "
// (the original trailing space is kept intentionally).
std::string DimsToStr(Dims dims) {
    std::stringstream ss;
    // nbDims is signed (int32_t); use a matching signed index.
    for (int32_t i = 0; i < dims.nbDims; i++) {
        ss << dims.d[i] << " ";
    }
    return ss.str();
}
// Computes the NCHW output shape of a 2-D convolution with symmetric
// padding: O = floor((I + 2*P - K) / S) + 1 per spatial axis.
Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, DimsHW paddings) {
    // Input channels must match the filter's channel dimension, and the
    // filter's spatial extents must agree with kernel_size.
    assert(input.d[1] == filter.d[1]);
    assert(filter.d[2] == kernel_size.d[0]);
    assert(filter.d[3] == kernel_size.d[1]);

    Dims4 output;
    output.d[0] = input.d[0];  // batch
    output.d[1] = filter.d[0]; // output channels
    for (int32_t axis = 0; axis < 2; ++axis) {
        const int32_t extent = input.d[2 + axis];
        const int32_t padded = extent + 2 * paddings.d[axis];
        output.d[2 + axis] = (padded - kernel_size.d[axis]) / stride.d[axis] + 1;
    }
    return output;
}
// Demo: build a single-conv2d network with symmetric padding, run it on
// all-ones input, and print the output shape and values.
int main() {
    Logger logger;

    // --- Build phase: describe the network -------------------------------
    IBuilder* builder = createInferBuilder(logger);
    uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(flag);

    Dims4 input_shape{3, 3, 8, 8};    // NCHW
    Dims4 filter_shape{3, 3, 2, 2};   // out_channels, in_channels, kH, kW
    DimsHW kernel_size{2, 2};
    DimsHW stride{1, 1};
    DimsHW paddings{1, 1};            // symmetric: top==bottom, left==right
    Dims4 output_shape = CalculateConv2dOutput(input_shape, filter_shape, kernel_size, stride, paddings);
    auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);

    // Constant weights (all 1.0) and an empty bias. `filter` must outlive
    // engine building: Weights only borrows the pointer.
    std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
    // Weights::count is int64_t; cast avoids a narrowing conversion from size_t.
    Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
    Weights bias_w{DataType::kFLOAT, nullptr, 0};
    int32_t output_channel = filter_shape.d[0];
    auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
    conv2d->setStrideNd(stride);
    conv2d->setPaddingNd(paddings);
    conv2d->getOutput(0)->setName("output");
    network->markOutput(*conv2d->getOutput(0));

    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);

    // --- Runtime phase: deserialize and execute --------------------------
    IRuntime* runtime = createInferRuntime(logger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());
    int32_t inputIndex = engine->getBindingIndex("input");
    int32_t outputIndex = engine->getBindingIndex("output");

    std::vector<float> input(ProductOfDims(input_shape), DEFAULT_VALUE);
    std::vector<float> output(ProductOfDims(output_shape));
    void* GPU_input_Buffer_ptr = nullptr;
    void* GPU_output_Buffer_ptr = nullptr;
    void* buffers[2];
    cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float) * input.size());
    cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float) * output.size());
    cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size() * sizeof(float), cudaMemcpyHostToDevice);
    buffers[inputIndex] = GPU_input_Buffer_ptr;
    buffers[outputIndex] = GPU_output_Buffer_ptr;

    IExecutionContext* context = engine->createExecutionContext();
    context->executeV2(buffers);
    cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size() * sizeof(float), cudaMemcpyDeviceToHost);

    std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
    std::cout << "output data : \n";
    for (auto i : output)
        std::cout << i << " ";
    std::cout << std::endl;

    // --- Cleanup (the original leaked every object and both GPU buffers).
    // TensorRT >= 8.0 destroys interfaces with `delete` (destroy() is
    // deprecated); buildSerializedNetwork already requires TRT 8+.
    cudaFree(GPU_input_Buffer_ptr);
    cudaFree(GPU_output_Buffer_ptr);
    delete context;
    delete engine;
    delete runtime;
    delete serializedModel;
    delete config;
    delete network;
    delete builder;
    return 0;
}
附录2
#include "NvInfer.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <vector>
#include <sstream>
#include <assert.h>
using namespace nvinfer1;
#define DEFAULT_VALUE 1.0
// Minimal ILogger implementation: prints messages at WARNING severity or
// worse to stdout and silently drops INFO/VERBOSE output.
class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // Lower enum values are more severe, so "<= kWARNING" keeps
        // kINTERNAL_ERROR, kERROR and kWARNING.
        const bool important = severity <= Severity::kWARNING;
        if (important)
        {
            std::cout << msg << std::endl;
        }
    }
};
// Returns the number of elements a tensor of shape `dims` holds,
// i.e. the product of all dimension extents.
size_t ProductOfDims(Dims dims) {
    size_t result = 1;
    // Dims::nbDims is a signed int32_t: index with a signed type to avoid a
    // signed/unsigned comparison (and an effectively unbounded loop if
    // nbDims were ever negative, e.g. from an uninitialized Dims).
    for (int32_t i = 0; i < dims.nbDims; i++) {
        result *= dims.d[i];
    }
    return result;
}
// Renders `dims` as a space-separated list, e.g. "1 3 8 8 "
// (the original trailing space is kept intentionally).
std::string DimsToStr(Dims dims) {
    std::stringstream ss;
    // nbDims is signed (int32_t); use a matching signed index.
    for (int32_t i = 0; i < dims.nbDims; i++) {
        ss << dims.d[i] << " ";
    }
    return ss.str();
}
// Computes the NCHW output shape of a 2-D convolution with asymmetric
// padding: O = floor((I + Ppre + Ppost - K) / S) + 1 per spatial axis.
// prepadding = {top, left}, postpadding = {bottom, right}.
Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, Dims2 prepadding, Dims2 postpadding) {
    // Input channels must match the filter, and the filter's spatial
    // extents must agree with kernel_size.
    assert(input.d[1] == filter.d[1]);
    assert(filter.d[2] == kernel_size.d[0]);
    assert(filter.d[3] == kernel_size.d[1]);

    Dims4 output;
    output.d[0] = input.d[0];  // batch
    output.d[1] = filter.d[0]; // output channels
    for (int32_t axis = 0; axis < 2; ++axis) {
        const int32_t extent = input.d[2 + axis];
        const int32_t padded = extent + prepadding.d[axis] + postpadding.d[axis];
        output.d[2 + axis] = (padded - kernel_size.d[axis]) / stride.d[axis] + 1;
    }
    return output;
}
// Demo: build a single-conv2d network with asymmetric (pre/post) padding,
// run it on all-ones input, and print the output shape and values.
int main() {
    Logger logger;

    // --- Build phase: describe the network -------------------------------
    IBuilder* builder = createInferBuilder(logger);
    uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(flag);

    Dims4 input_shape{1, 1, 2, 2};    // NCHW
    Dims4 filter_shape{1, 1, 2, 2};   // out_channels, in_channels, kH, kW
    DimsHW kernel_size{2, 2};
    DimsHW stride{1, 1};
    Dims2 prepadding{1, 0};           // {top, left}: pad 1 on top only
    Dims2 postpadding{0, 1};          // {bottom, right}: pad 1 on the right only
    Dims4 output_shape = CalculateConv2dOutput(input_shape, filter_shape, kernel_size, stride, prepadding, postpadding);
    auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);

    // Constant weights (all 1.0) and an empty bias. `filter` must outlive
    // engine building: Weights only borrows the pointer.
    std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
    // Weights::count is int64_t; cast avoids a narrowing conversion from size_t.
    Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
    Weights bias_w{DataType::kFLOAT, nullptr, 0};
    int32_t output_channel = filter_shape.d[0];
    auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
    conv2d->setStrideNd(stride);
    conv2d->setPrePadding(prepadding);
    conv2d->setPostPadding(postpadding);
    conv2d->getOutput(0)->setName("output");
    network->markOutput(*conv2d->getOutput(0));

    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);

    // --- Runtime phase: deserialize and execute --------------------------
    IRuntime* runtime = createInferRuntime(logger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());
    int32_t inputIndex = engine->getBindingIndex("input");
    int32_t outputIndex = engine->getBindingIndex("output");

    std::vector<float> input(ProductOfDims(input_shape), DEFAULT_VALUE);
    std::vector<float> output(ProductOfDims(output_shape));
    void* GPU_input_Buffer_ptr = nullptr;
    void* GPU_output_Buffer_ptr = nullptr;
    void* buffers[2];
    cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float) * input.size());
    cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float) * output.size());
    cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size() * sizeof(float), cudaMemcpyHostToDevice);
    buffers[inputIndex] = GPU_input_Buffer_ptr;
    buffers[outputIndex] = GPU_output_Buffer_ptr;

    IExecutionContext* context = engine->createExecutionContext();
    context->executeV2(buffers);
    cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size() * sizeof(float), cudaMemcpyDeviceToHost);

    std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
    std::cout << "output data : \n";
    for (auto i : output)
        std::cout << i << " ";
    std::cout << std::endl;

    // --- Cleanup (the original leaked every object and both GPU buffers).
    // TensorRT >= 8.0 destroys interfaces with `delete` (destroy() is
    // deprecated); buildSerializedNetwork already requires TRT 8+.
    cudaFree(GPU_input_Buffer_ptr);
    cudaFree(GPU_output_Buffer_ptr);
    delete context;
    delete engine;
    delete runtime;
    delete serializedModel;
    delete config;
    delete network;
    delete builder;
    return 0;
}
附录3
#include "NvInfer.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <vector>
#include <sstream>
#include <assert.h>
using namespace nvinfer1;
#define DEFAULT_VALUE 1.0
// Minimal ILogger implementation: prints messages at WARNING severity or
// worse to stdout and silently drops INFO/VERBOSE output.
class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // Lower enum values are more severe, so "<= kWARNING" keeps
        // kINTERNAL_ERROR, kERROR and kWARNING.
        const bool important = severity <= Severity::kWARNING;
        if (important)
        {
            std::cout << msg << std::endl;
        }
    }
};
// Returns the number of elements a tensor of shape `dims` holds,
// i.e. the product of all dimension extents.
size_t ProductOfDims(Dims dims) {
    size_t result = 1;
    // Dims::nbDims is a signed int32_t: index with a signed type to avoid a
    // signed/unsigned comparison (and an effectively unbounded loop if
    // nbDims were ever negative, e.g. from an uninitialized Dims).
    for (int32_t i = 0; i < dims.nbDims; i++) {
        result *= dims.d[i];
    }
    return result;
}
// Renders `dims` as a space-separated list, e.g. "1 3 8 8 "
// (the original trailing space is kept intentionally).
std::string DimsToStr(Dims dims) {
    std::stringstream ss;
    // nbDims is signed (int32_t); use a matching signed index.
    for (int32_t i = 0; i < dims.nbDims; i++) {
        ss << dims.d[i] << " ";
    }
    return ss.str();
}
// Computes the NCHW output shape of a 2-D convolution under a TensorRT
// PaddingMode:
//   kEXPLICIT_ROUND_DOWN (default): O = floor((I + Ppre + Ppost - K) / S) + 1
//   kEXPLICIT_ROUND_UP:             O = ceil((I + Ppre + Ppost - K) / S) + 1
//   kSAME_LOWER / kSAME_UPPER:      output depends only on I and S; the
//                                   explicit pre/post padding values are ignored.
// prepadding = {top, left}, postpadding = {bottom, right}.
Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, Dims2 prepadding,
                            Dims2 postpadding, PaddingMode paddingmode = PaddingMode::kEXPLICIT_ROUND_DOWN) {
    assert(input.d[1] == filter.d[1]);
    assert(filter.d[2] == kernel_size.d[0]);
    assert(filter.d[3] == kernel_size.d[1]);
    Dims4 output;
    output.d[0] = input.d[0];  // batch
    output.d[1] = filter.d[0]; // output channels
    int32_t kh = kernel_size.d[0];
    int32_t kw = kernel_size.d[1];
    int32_t sh = stride.d[0];
    int32_t sw = stride.d[1];
    int32_t uppadding = prepadding.d[0];
    int32_t leftpadding = prepadding.d[1];
    int32_t downpadding = postpadding.d[0];
    int32_t rightpadding = postpadding.d[1];
    switch (paddingmode)
    {
    case PaddingMode::kEXPLICIT_ROUND_UP:
        // ceil((I + P - K) / S) + 1, written with integer arithmetic.
        output.d[2] = ((input.d[2] + uppadding + downpadding - kh + sh - 1) / sh) + 1;
        output.d[3] = ((input.d[3] + leftpadding + rightpadding - kw + sw - 1) / sw) + 1;
        break;
    case PaddingMode::kSAME_LOWER:
        // floor(I / S). The original added a spurious "+ 1".
        // NOTE(review): TensorRT's docs define both SAME modes as
        // O = ceil(I / S) (they differ only in where the extra padding
        // goes); floor follows this article's text - verify against the
        // official PaddingMode documentation.
        output.d[2] = input.d[2] / sh;
        output.d[3] = input.d[3] / sw;
        break;
    case PaddingMode::kSAME_UPPER:
        // ceil(I / S) = (I + S - 1) / S. The original wrote
        // "I + sh - 1 / sh", which parses as I + sh - (1/sh) due to
        // operator precedence, and also added a spurious "+ 1".
        output.d[2] = (input.d[2] + sh - 1) / sh;
        output.d[3] = (input.d[3] + sw - 1) / sw;
        break;
    default: // kEXPLICIT_ROUND_DOWN
        output.d[2] = ((input.d[2] + uppadding + downpadding - kh) / sh) + 1;
        output.d[3] = ((input.d[3] + leftpadding + rightpadding - kw) / sw) + 1;
        break;
    }
    return output;
}
// Demo: build a single-conv2d network that uses PaddingMode::kSAME_UPPER,
// run it on all-ones input, and print the output shape and values.
int main() {
    Logger logger;

    // --- Build phase: describe the network -------------------------------
    IBuilder* builder = createInferBuilder(logger);
    uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(flag);

    Dims4 input_shape{1, 1, 4, 4};    // NCHW
    Dims4 filter_shape{1, 1, 2, 2};   // out_channels, in_channels, kH, kW
    DimsHW kernel_size{2, 2};
    DimsHW stride{1, 1};
    // Under a SAME padding mode TensorRT derives the padding itself; these
    // values only feed CalculateConv2dOutput's non-SAME branches.
    Dims2 prepadding{1, 0};
    Dims2 postpadding{0, 1};
    // BUG FIX: the host-side shape must be computed with the same padding
    // mode the layer actually uses (kSAME_UPPER below); the original used
    // the default kEXPLICIT_ROUND_DOWN, so the output buffer size did not
    // match what the engine produces.
    Dims4 output_shape = CalculateConv2dOutput(input_shape, filter_shape, kernel_size, stride, prepadding,
                                               postpadding, PaddingMode::kSAME_UPPER);
    auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);

    // Constant weights (all 1.0) and an empty bias. `filter` must outlive
    // engine building: Weights only borrows the pointer.
    std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
    // Weights::count is int64_t; cast avoids a narrowing conversion from size_t.
    Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
    Weights bias_w{DataType::kFLOAT, nullptr, 0};
    int32_t output_channel = filter_shape.d[0];
    auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
    conv2d->setStrideNd(stride);
    conv2d->setPaddingMode(PaddingMode::kSAME_UPPER);
    conv2d->getOutput(0)->setName("output");
    network->markOutput(*conv2d->getOutput(0));

    IBuilderConfig* config = builder->createBuilderConfig();
    IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);

    // --- Runtime phase: deserialize and execute --------------------------
    IRuntime* runtime = createInferRuntime(logger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());
    int32_t inputIndex = engine->getBindingIndex("input");
    int32_t outputIndex = engine->getBindingIndex("output");

    std::vector<float> input(ProductOfDims(input_shape), DEFAULT_VALUE);
    std::vector<float> output(ProductOfDims(output_shape));
    void* GPU_input_Buffer_ptr = nullptr;
    void* GPU_output_Buffer_ptr = nullptr;
    void* buffers[2];
    cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float) * input.size());
    cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float) * output.size());
    cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size() * sizeof(float), cudaMemcpyHostToDevice);
    buffers[inputIndex] = GPU_input_Buffer_ptr;
    buffers[outputIndex] = GPU_output_Buffer_ptr;

    IExecutionContext* context = engine->createExecutionContext();
    context->executeV2(buffers);
    cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size() * sizeof(float), cudaMemcpyDeviceToHost);

    std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
    std::cout << "output data : \n";
    for (auto i : output)
        std::cout << i << " ";
    std::cout << std::endl;

    // --- Cleanup (the original leaked every object and both GPU buffers).
    // TensorRT >= 8.0 destroys interfaces with `delete` (destroy() is
    // deprecated); buildSerializedNetwork already requires TRT 8+.
    cudaFree(GPU_input_Buffer_ptr);
    cudaFree(GPU_output_Buffer_ptr);
    delete context;
    delete engine;
    delete runtime;
    delete serializedModel;
    delete config;
    delete network;
    delete builder;
    return 0;
}
|