一. 下载对应的安装包
查看cuda版本:nvcc -V 显示我的版本是10.2版本 查看cudnn版本:cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2 显示我的cudnn版本为7.5.0版本
在官网下载trt安装包:根据系统、cuda、cudnn下载,下载地址: https://developer.nvidia.cn/nvidia-tensorrt-8x-download
由于我需要安装的是TensorRT 8.x,它要求cudnn的最低版本为8.2(本机为7.5.0),所以需要另外下载cudnn: https://developer.nvidia.com/rdp/cudnn-archive
二. 环境配置
cd TensorRT-8.0.1.6/python/
pip install tensorrt-8.0.1.6-cp37-none-linux_x86_64.whl # 根据python版本安装
添加依赖环境cudnn: export LD_LIBRARY_PATH=/home/work/guopei/cuda/lib64:$LD_LIBRARY_PATH
添加依赖环境tensorrt: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/work/guopei/TensorRT-8.0.1.6/lib
添加这个就可以使用trtexec export PATH=/home/work/guopei/TensorRT-8.0.1.6/bin:$PATH trtexec --onnx=resnet50/model.onnx --saveEngine=resnet_engine.trt --explicitBatch --device=5
注意:在onnx转trt之前,最好先执行以下命令简化模型(需要先安装onnx-simplifier): python -m onnxsim test1.onnx test2.onnx
三. 测试安装环境
查看onnx结构:https://lutzroeder.github.io/netron/
四. 代码示例
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np
# Bitmask passed to builder.create_network(): one bit set at the position of
# the EXPLICIT_BATCH network-definition creation flag.
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
# Module-level TensorRT logger, constructed with WARNING severity.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def GiB(val):
    """Convert a size given in gibibytes into a whole number of bytes.

    The multiplier is parenthesised on purpose: the previous expression
    ``val * 1 << 30`` parsed as ``(val * 1) << 30`` and therefore raised
    ``TypeError`` for fractional sizes such as ``0.5``.

    :param val: size in GiB (int or float).
    :return: size in bytes as an ``int`` (any fractional byte is truncated).
    """
    return int(val * (1 << 30))
def ONNX_to_TRT(onnx_model_path=None, trt_engine_path=None, fp16_mode=False):
    """
    Build a TensorRT engine from an ONNX model and save it to a file.

    Written for the TensorRT 8 Python API. Only fixed input sizes are
    supported (no optimization profile is configured here).

    :param onnx_model_path: path of the ONNX model to load.
    :param trt_engine_path: path the serialized engine is written to.
    :param fp16_mode: when True, the FP16 builder flag is enabled.
    :raises RuntimeError: if the ONNX model cannot be parsed or the engine
        cannot be built.
    """
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    config = builder.create_builder_config()
    config.max_workspace_size = GiB(1)
    if fp16_mode:
        config.set_flag(trt.BuilderFlag.FP16)

    with open(onnx_model_path, 'rb') as model:
        # Parse outside of an ``assert``: asserts are stripped under
        # ``python -O``, which would silently skip parsing altogether.
        parsed = parser.parse(model.read())
    if not parsed:
        errors = '\n'.join(
            str(parser.get_error(i)) for i in range(parser.num_errors)
        )
        raise RuntimeError(
            'Failed to parse ONNX model {}:\n{}'.format(onnx_model_path, errors)
        )

    serialized_engine = builder.build_serialized_network(network, config)
    # Guard against a failed build so that we do not try to write ``None``.
    if serialized_engine is None:
        raise RuntimeError(
            'TensorRT failed to build an engine from {}'.format(onnx_model_path)
        )

    with open(trt_engine_path, 'wb') as f:
        f.write(serialized_engine)
    print('TensorRT file in ' + trt_engine_path)
    print('============ONNX->TensorRT SUCCESS============')
class TrtModel():
    '''
    TensorRT inference wrapper around a serialized engine file.

    The constructor creates its own CUDA context on device 0, deserializes
    the engine and allocates one page-locked host buffer plus one device
    buffer per engine binding. Calling the instance runs inference using the
    first input binding and returns the first output binding.

    Call destroy() once when finished to pop the context created by
    __init__.
    '''

    def __init__(self, trt_path):
        '''
        Load the engine stored at ``trt_path`` and allocate I/O buffers.

        :param trt_path: path of the serialized TensorRT engine file.
        '''
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            with open(trt_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('bingding:', binding, shape)
                size = trt.volume(shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Page-locked host buffer paired with a device buffer of the
                # same byte size.
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                bindings.append(int(cuda_mem))
                if engine.binding_is_input(binding):
                    # Last two dimensions of the input binding shape.
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except Exception:
            # Construction failed, so the caller never gets an object to call
            # destroy() on; release the context we made before re-raising.
            self.ctx.pop()
            raise

        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def __call__(self, img_np_nchw):
        '''
        Run TensorRT inference on one input array.

        :param img_np_nchw: input image array; it is flattened and copied
            into the first input buffer, so its element count must match
            that buffer.
        :return: the first output host buffer as a flat array. The same
            buffer object is reused on every call, so copy the result if it
            must outlive the next call.
        '''
        self.ctx.push()
        try:
            np.copyto(self.host_inputs[0], img_np_nchw.ravel())
            cuda.memcpy_htod_async(
                self.cuda_inputs[0], self.host_inputs[0], self.stream
            )
            # NOTE(review): execute_async(batch_size=...) is the
            # implicit-batch entry point; for engines built from an
            # explicit-batch network TensorRT documents execute_async_v2 -
            # confirm which one this engine needs.
            self.context.execute_async(
                batch_size=self.batch_size,
                bindings=self.bindings,
                stream_handle=self.stream.handle,
            )
            cuda.memcpy_dtoh_async(
                self.host_outputs[0], self.cuda_outputs[0], self.stream
            )
            self.stream.synchronize()
        finally:
            # Always undo the push above, even when a copy or the execution
            # raises, so the context stack stays balanced.
            self.ctx.pop()
        return self.host_outputs[0]

    def destroy(self):
        '''Pop the CUDA context that __init__ created via make_context().'''
        self.ctx.pop()
yolov5-face测试代码:
import os
import sys
import cv2
import copy
import torch
import argparse
# Parent directory of the directory that contains this script.
root_path=os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
# Added to sys.path before the imports below; presumably this is where the
# utils, detect_face and torch2trt modules live - TODO confirm.
sys.path.append(root_path)
from utils.general import check_img_size,non_max_suppression_face,scale_coords,xyxy2xywh
from utils.datasets import letterbox
from detect_face import scale_coords_landmarks,show_results
from torch2trt.trt_model import TrtModel
# Absolute path of the directory that contains this script; used below for the
# default test image and for the saved result image.
cur_path=os.path.abspath(os.path.dirname(__file__))
def img_process(img_path, long_side=512, stride_max=32):
    '''
    Load an image from disk and turn it into a network input tensor.

    Steps: resize so that the longer side equals ``long_side``, letterbox to
    the size returned by ``check_img_size(long_side, s=stride_max)``, reverse
    the channel axis (BGR -> RGB for cv2-loaded images), move channels first,
    convert to float, divide by 255 and add a leading batch dimension.

    :param img_path: path of the image file to read.
    :param long_side: target length of the longer image side.
    :param stride_max: stride passed to ``check_img_size``.
    :return: ``(img, orgimg)`` where ``img`` is the preprocessed float tensor
        and ``orgimg`` is the unmodified image returned by ``cv2.imread``.
    :raises OSError: if the image cannot be read.
    '''
    orgimg = cv2.imread(img_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than on a later attribute access.
    if orgimg is None:
        raise OSError('Failed to read image: {}'.format(img_path))

    img0 = copy.deepcopy(orgimg)
    h0, w0 = orgimg.shape[:2]
    r = long_side / max(h0, w0)
    if r != 1:
        # Area interpolation when shrinking, linear when enlarging.
        interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
        img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)

    imgsz = check_img_size(long_side, s=stride_max)
    img = letterbox(img0, new_shape=imgsz, auto=False)[0]
    # Reverse the channel axis and go from HWC to CHW; copy() makes the
    # result contiguous so torch.from_numpy accepts it.
    img = img[:, :, ::-1].transpose(2, 0, 1).copy()
    img = torch.from_numpy(img)
    img = img.float()
    img /= 255.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    return img, orgimg
def img_vis(img, orgimg, pred, vis_thres=0.3):
    '''
    Draw the predictions on the original image and save it as result.jpg.

    Boxes (columns 0-3) and landmarks (columns 5-14) of every detection
    tensor in ``pred`` are rescaled in place from the network input size to
    the original image size. Detections whose confidence (column 4) is below
    ``vis_thres`` are not drawn. The image is written to ``result.jpg`` in
    the directory of this script.

    :param img: preprocessed network input; only ``img.shape[2:]`` is used.
    :param orgimg: original image the detections are drawn on.
    :param pred: iterable of per-image detection tensors; modified in place.
    :param vis_thres: minimum confidence for a detection to be drawn.
    '''
    print('img.shape: ', img.shape)
    print('orgimg.shape: ', orgimg.shape)
    for det in pred:
        if not len(det):
            continue
        # Normalisation gains built from the original image shape, ordered
        # (w, h, w, h) for boxes and five (w, h) pairs for the landmarks.
        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]]
        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]]

        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()
        det[:, 5:15] = scale_coords_landmarks(img.shape[2:], det[:, 5:15], orgimg.shape).round()

        for row in det:
            # Converted once; kept as a numpy value so show_results receives
            # the same type as before.
            conf = row[4].cpu().numpy()
            if conf < vis_thres:
                continue
            xywh = (xyxy2xywh(row[:4].view(1, 4)) / gn).view(-1).tolist()
            landmarks = (row[5:15].view(1, 10) / gn_lks).view(-1).tolist()
            class_num = row[15].cpu().numpy()
            orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)

    cv2.imwrite(cur_path+'/result.jpg', orgimg)
    print('result save in '+cur_path+'/result.jpg')
if __name__ == '__main__':
    # Command-line entry point: preprocess one image, run it through the
    # TensorRT engine, apply NMS and save the visualised result.
    parser = argparse.ArgumentParser()
    parser.add_argument('--img_path', type=str, default=cur_path+"/test.jpg", help='img path')
    parser.add_argument('--trt_path', type=str, required=True, help='trt_path')
    # Three ints instead of ``type=list`` (which would split the argument
    # string into single characters). When omitted, the middle dimension is
    # inferred from the engine output.
    parser.add_argument('--output_shape', type=int, nargs=3, default=None,
                        help='output shape as three ints, e.g. "1 25200 16" for input [1,3,640,640]; '
                             'defaults to [1,-1,16] (second dimension inferred)')
    opt = parser.parse_args()

    img, orgimg = img_process(opt.img_path)
    model = TrtModel(opt.trt_path)
    try:
        raw_pred = model(img.numpy())
    finally:
        # Pop the model's CUDA context even if inference fails.
        model.destroy()

    output_shape = opt.output_shape if opt.output_shape is not None else [1, -1, 16]
    pred = raw_pred.reshape(output_shape)
    pred = non_max_suppression_face(torch.from_numpy(pred), conf_thres=0.3, iou_thres=0.5)
    img_vis(img, orgimg, pred)
|