Ubuntu配置TensorRT及验证_竹叶青lvye的博客-CSDN博客博主的一些基本环境配置可见之前博客非虚拟机环境下Ubuntu配置_jiugeshao的专栏-CSDN博客第一步: 准备安装AnacondaAnaconda3-5.2.0-Linux-x86_64.sh的版本是2.4.0, cuda、cudnn、tensorrt的版本见上面博客。tensorrt的python库安装如下,几个whl文件安装下。


实验一: tensorflow的pb模型转化为uff模型,tensorrt加载uff模型去预测图片



pb模型转uff模型(tensorflow2.x)_竹叶青lvye的博客-CSDN博客大多数的博客只是提到tensorflow1.x系列下的转换。大概步骤就是安装tensorrt,同时安装tensorrt下的几个python的whl文件。可参见博主之前的博客:1.tensorrt的安装Ubuntu配置TensorRT及验证_竹叶青lvye的博客-CSDN博客2.tensorrt下几个whl文件的安装TensorRT加速方法介绍(python pytorch模型)_竹叶青lvye的博客-CSDN博客_tensorrt加速pytorch安装uff-0.6.9-py2.py3-none-可参考tensorrt下自带的例子来使用uff文件进行加速验证(结合自己的路径查找)


from random import randint

import tensorrt
from PIL import Image
import numpy as np
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
import pycuda.driver as cuda
# This import causes pycuda to automatically manage CUDA context creation and cleanup.
import pycuda.autoinit
from tensorflow.keras.preprocessing import image
import tensorrt as trt

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
import common
import cv2
import time

# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Frozen model layers:
# Input
# resnet50/conv1_pad/Pad/paddings
# resnet50/conv1_pad/Pad
class ModelData:
    """Static description of the UFF model consumed by this script."""

    # Tensor names inside the frozen graph.
    INPUT_NAME = "Input"
    OUTPUT_NAME = "resnet50/predictions/Softmax"

    # Input tensor dimensions as registered with the UFF parser.
    INPUT_SHAPE = (224, 224, 3)

    # Path of the serialized UFF file handed to the parser.
    MODEL_FILE = "weights.uff"
# resnet50/predictions/BiasAdd
# resnet50/predictions/Softmax
# Identity

def build_engine():
    """Parse the UFF model described by ``ModelData`` and build a TensorRT engine.

    Returns:
        The engine produced by ``builder.build_engine``.

    Raises:
        RuntimeError: if the UFF file cannot be parsed or the builder does not
            produce an engine.
    """
    # For more information on TRT basics, refer to the introductory samples.
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network() as network, \
            builder.create_builder_config() as config, \
            trt.UffParser() as parser:
        config.max_workspace_size = common.GiB(1)

        # The frozen graph is channels-last, hence the NHWC input order.
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE, trt.UffInputOrder.NHWC)
        # NOTE(review): no parser.register_output(...) call is visible in this
        # listing; confirm against the original script which output tensor(s)
        # must be registered before parsing.

        # Fail loudly instead of building an engine from a half-parsed network.
        if not parser.parse(ModelData.MODEL_FILE, network):
            raise RuntimeError("Failed to parse UFF model: {}".format(ModelData.MODEL_FILE))

        engine = builder.build_engine(network, config)
        if engine is None:
            raise RuntimeError("TensorRT failed to build an engine from {}".format(ModelData.MODEL_FILE))

        return engine

def main():
    """Build the engine, classify one image and print the top-5 predictions."""
    trt_engine = build_engine()

    # Host/device buffers plus the CUDA stream come from the shared sample helper.
    inputs, outputs, bindings, stream = common.allocate_buffers(trt_engine)

    with trt_engine.create_execution_context() as context:
        # Load the picture at the network resolution and apply the ResNet50
        # preprocessing, then add a leading batch axis.
        picture = image.load_img('2008_002682.jpg', target_size=(224, 224))
        pixels = image.img_to_array(picture)
        pixels = preprocess_input(pixels)
        batch = pixels[np.newaxis, :]
        inputs[0].host = batch.ravel()

        started = time.perf_counter()
        result = common.do_inference(
            context,
            bindings=bindings,
            inputs=inputs,
            outputs=outputs,
            stream=stream,
        )
        print(f'do inference cost:{time.perf_counter() - started:.8f}s')

        # NOTE(review): this reads the second entry of the returned list;
        # confirm the engine really exposes two outputs.
        scores = np.array(result[1])[np.newaxis, :]
        print('Predicted:', decode_predictions(scores, top=5)[0])

# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()




parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE, tensorrt.UffInputOrder.NHWC)





pip install -U tf2onnx


python -m tf2onnx.convert --graphdef weights.pb --output model.onnx --inputs Input:0 --outputs resnet50/predictions/Softmax:0



TensorRT加速方法介绍(python pytorch模型)_竹叶青lvye的博客-CSDN博客_tensorrt加速pytorch中所提到的第一种加速方法,先转换为tensorrt模型

./trtexec --onnx=/home/sxhlvye/Trial1/Tensorrt/model.onnx --saveEngine=/home/sxhlvye/Trial1/Tensorrt/model.trt


import sys
import cv2
from PIL import Image
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications import resnet50
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
import tensorflow as tf
import time
import numpy as np
# This import causes pycuda to automatically manage CUDA context creation and cleanup.
import pycuda.autoinit
import tensorrt as trt
import common
import pycuda.driver as cuda
import time
import matplotlib.pyplot as plt
import cv2
# Logger shared by the TensorRT objects created in this script.
# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Name of the sample image to classify.
filename = "2008_002682.jpg"
# Serialized TensorRT engine that main() reads and deserializes.
engine_file_path = "model.trt"

class HostDeviceMem(object):
    """Pair of buffers belonging to one engine binding.

    Within this context, ``host`` means the CPU memory and ``device`` means
    the GPU memory.
    """

    def __init__(self, host_mem, device_mem):
        """Store the host buffer and its matching device allocation."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Args:
        engine: TensorRT engine whose bindings are iterated.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds the
        device addresses as ints in binding order, and ``stream`` is a newly
        created CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one inference pass and return the host-side output buffers.

    Args:
        context: execution context on which ``execute_async`` is called.
        bindings: device addresses passed to ``execute_async``.
        inputs: ``HostDeviceMem`` objects whose host data is copied to the device.
        outputs: ``HostDeviceMem`` objects that receive the results.
        stream: CUDA stream used for the copies and the execution.
        batch_size: batch size forwarded to ``execute_async``.

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    # Transfer data from CPU to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)

    # Run inference.
    t_model = time.perf_counter()
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # The call above is asynchronous, so this only times the enqueue.
    print(f'only one line cost:{time.perf_counter() - t_model:.8f}s')

    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)

    # Synchronize the stream
    stream.synchronize()

    # Return only the host outputs.
    return [out.host for out in outputs]

def main():
    """Deserialize the saved engine, classify one image and print the top-5 classes.

    Raises:
        RuntimeError: if the engine file cannot be deserialized.
    """
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    if engine is None:
        raise RuntimeError("Failed to deserialize engine from {}".format(engine_file_path))

    # create the context for this engine
    context = engine.create_execution_context()
    # allocate buffers for input and output
    inputs, outputs, bindings, stream = allocate_buffers(engine)  # input, output: host # bindings
    # read an image at the network resolution and apply ResNet50 preprocessing
    img = image.load_img(filename, target_size=(224, 224))
    img = image.img_to_array(img)
    img = preprocess_input(img)
    img = img[np.newaxis, :]

    # Load data to the buffer
    inputs[0].host = img.ravel()

    # Do Inference
    t_model = time.perf_counter()
    result = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # numpy data
    print(f'do inference cost:{time.perf_counter() - t_model:.8f}s')

    # decode_predictions expects a leading batch axis.
    output = np.array(result[0])
    output = output[np.newaxis, :]
    print('Predicted:', decode_predictions(output, top=5)[0])

# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()


/home/sxhlvye/anaconda3/bin/python /home/sxhlvye/Trial1/Tensorrt/
2022-03-20 18:13:45.682920: I tensorflow/stream_executor/platform/default/] Successfully opened dynamic library
Reading engine from file model.trt
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.2.0 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.0 but loaded cuDNN 8.0.5
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.2.0 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.0 but loaded cuDNN 8.0.5
(224, 224, 3)
only one line cost:0.33635211s
do inference cost:0.33758112s
(1, 1000)
Predicted: [('n02123597', 'Siamese_cat', 0.16550788), ('n02108915', 'French_bulldog', 0.14138032), ('n04409515', 'tennis_ball', 0.08570899), ('n02095314', 'wire-haired_fox_terrier', 0.052046295), ('n02123045', 'tabby', 0.05069564)]

Process finished with exit code 0






TensorRT加速模型推断时间方法介绍(c++ pytorch模型)_竹叶青lvye的博客-CSDN博客




#include "BatchStream.h"
#include "EntropyCalibrator.h"
#include "argsParser.h"
#include "buffers.h"
#include "common.h"
#include "logger.h"

#include "NvInfer.h"
#include "NvUffParser.h"
#include <cuda_runtime_api.h>
#include "parserOnnxConfig.h"

#include <opencv2/core/core.hpp>
#include <opencv2/opencv.hpp>

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>

using namespace std;
using namespace cv;

//! \brief  The SampleUffSSD class implements the SSD sample
//! \details It creates the network using an UFF model
class SampleUffSSD
    template <typename T>
    using SampleUniquePtr = std::unique_ptr<T, samplesCommon::InferDeleter>;


    //! \brief Function builds the network engine
    bool build();

    //! \brief Runs the TensorRT inference engine for this sample
    bool infer();

    //! \brief Cleans up any state created in the sample class
    bool teardown();


    nvinfer1::Dims mInputDims; //!< The dimensions of the input to the network.

    std::vector<samplesCommon::PPM<3, 224, 224>> mPPMs; //!< PPMs of test images

    std::shared_ptr<nvinfer1::ICudaEngine> mEngine; //!< The TensorRT engine used to run the network

    //! \brief Parses an UFF model for SSD and creates a TensorRT network
    bool constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
        SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
        SampleUniquePtr<nvuffparser::IUffParser>& parser);

    //! \brief Reads the input and mean data, preprocesses, and stores the result in a managed buffer
    bool processInput(const samplesCommon::BufferManager& buffers);

    //! \brief Filters output detections and verify results
    bool verifyOutput(const samplesCommon::BufferManager& buffers);

//! \brief Creates the network, configures the builder and creates the network engine
//! \details This function creates the SSD network by parsing the UFF model and builds
//!          the engine that will be used to run SSD (mEngine)
//! \return Returns true if the engine was created successfully and false otherwise
bool SampleUffSSD::build()
    auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
    if (!builder)
        return false;

    auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(0));
    if (!network)
        return false;

    auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    if (!config)
        return false;

    auto parser = SampleUniquePtr<nvuffparser::IUffParser>(nvuffparser::createUffParser());
    if (!parser)
        return false;

    auto constructed = constructNetwork(builder, network, config, parser);
    if (!constructed)
        return false;

    ASSERT(network->getNbInputs() == 1);
    mInputDims = network->getInput(0)->getDimensions();
    ASSERT(mInputDims.nbDims == 3);

    ASSERT(network->getNbOutputs() == 2);

    return true;

//! \brief Uses a UFF parser to create the network and builds the engine
//! \param builder Pointer to the engine builder
//! \param network Pointer to the network that will be populated from the UFF file
//! \param config Pointer to the builder configuration used for the engine build
//! \param parser Pointer to the UFF parser
//! \return true when the model was parsed and mEngine was created, false otherwise
bool SampleUffSSD::constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
    SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
    SampleUniquePtr<nvuffparser::IUffParser>& parser)
{
    // The frozen graph is channels-last, hence the kNHWC input order.
    parser->registerInput("Input", Dims3(224, 224, 3), nvuffparser::UffInputOrder::kNHWC);
    // NOTE(review): no registerOutput(...) call is visible in this listing, although build()
    // asserts two outputs and verifyOutput() reads "resnet50/predictions/Softmax".
    // Restore the output registration from the original source before parsing.

    auto parsed = parser->parse("/home/sxhlvye/Trial1/Tensorrt/weights.uff", *network, nvinfer1::DataType::kFLOAT);
    if (!parsed)
    {
        return false;
    }

    // NOTE(review): builder/config settings (batch size, workspace size) appear to be missing
    // from this listing as well; confirm against the original source.

    // Calibrator life time needs to last until after the engine is built.

    mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
        builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter());
    if (!mEngine)
    {
        return false;
    }

    return true;
}

//! \brief Runs the TensorRT inference engine for this sample
//! \details This function is the main execution function of the sample. It allocates the buffer,
//!          sets inputs, executes the engine and verifies the detection outputs.
bool SampleUffSSD::infer()
    // Create RAII buffer manager object
    samplesCommon::BufferManager buffers(mEngine, 1);

    auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
    if (!context)
        return false;

    if (!processInput(buffers))
        return false;

    // Memcpy from host input buffers to device input buffers

    const bool status = context->execute(1, buffers.getDeviceBindings().data());
    if (!status)
        return false;

    // Memcpy from device output buffers to host output buffers

    // Post-process detections and verify results
    if (!verifyOutput(buffers))
        return false;

    return true;

//! \brief Cleans up any state created in the sample class
bool SampleUffSSD::teardown()
    //! Clean up the libprotobuf files as the parsing is complete
    //! \note It is not safe to use any other part of the protocol buffers library after
    //! ShutdownProtobufLibrary() has been called.
    return true;

//! \brief Reads the test image, preprocesses it, and stores the result in a managed buffer
//! \param buffers Buffer manager whose "Input" host buffer is filled
//! \return true on success; false when the image cannot be read, the crop has an unexpected
//!         size, or the "Input" host buffer is unavailable
bool SampleUffSSD::processInput(const samplesCommon::BufferManager& buffers)
{
    cv::Mat image = cv::imread("/home/sxhlvye/Trial1/Tensorrt/2008_002682.jpg", cv::IMREAD_COLOR);
    if (image.empty())
    {
        // imread returns an empty matrix when the file is missing or unreadable.
        return false;
    }
    //cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
    cout << image.channels() << "," << image.size().width << "," << image.size().height << std::endl;

    // Resize to 256 (width) x 341 (height); the pixel data is read back as 8-bit below.
    cv::Mat dst = cv::Mat::zeros(341, 256, CV_32FC3);
    cv::resize(image, dst, dst.size());
    cout << dst.channels() << "," << dst.size().width << "," << dst.size().height << std::endl;

    // Crop a 224 x 224 window (rows 58..281, columns 16..239).
    cv::Mat dst1 = dst(Range(58, 282), Range(16, 240)).clone();
    cout << dst1.channels() << "," << dst1.size().width << "," << dst1.size().height << std::endl;

    const int channel = 3;
    // Was 244 in the original, which made the copy loop run past the 224 x 224 x 3 input.
    const int inputH = 224;
    const int inputW = 224;
    if (dst1.rows != inputH || dst1.cols != inputW)
    {
        return false;
    }

    std::vector<float> fileData(inputH * inputW * channel);

    // Repack interleaved pixels into planar (channel-major) order.
    // NOTE(review): (1 - pixel) / 255 differs from the Keras ResNet50 preprocessing used by the
    // Python scripts; confirm this is the intended normalization.
    for (int c = 0; c < channel; ++c)
    {
        for (int i = 0; i < dst1.rows; ++i)
        {
            cv::Vec3b* p1 = dst1.ptr<cv::Vec3b>(i);
            for (int j = 0; j < dst1.cols; ++j)
            {
                fileData[c * dst1.cols * dst1.rows + i * dst1.cols + j] = (1 - p1[j][c]) / 255.0f;
            }
        }
    }

    float* hostDataBuffer = static_cast<float*>(buffers.getHostBuffer("Input"));
    if (hostDataBuffer == nullptr)
    {
        return false;
    }
    for (int i = 0; i < inputH * inputW * channel; i++)
    {
        hostDataBuffer[i] = fileData[i];
    }

    return true;
}

//! \brief Normalizes the network output and prints the five highest-scoring classes
//! \param buffers Buffer manager holding the "resnet50/predictions/Softmax" host buffer
//! \return true when the results were printed, false when the output buffer is unavailable
bool SampleUffSSD::verifyOutput(const samplesCommon::BufferManager& buffers)
{
    const int outputSize = 1000;
    const int topK = 5;
    std::cout << "outputSize: " << outputSize << std::endl;
    float* output = static_cast<float*>(buffers.getHostBuffer("resnet50/predictions/Softmax"));
    if (output == nullptr)
    {
        return false;
    }

    // Calculate Softmax
    // NOTE(review): the tensor name suggests the values are already softmax probabilities;
    // normalizing again keeps the ranking but changes the printed scores. Confirm intent.
    float sum{0.0f};
    for (int i = 0; i < outputSize; i++)
    {
        output[i] = exp(output[i]);
        sum += output[i];
    }

    for (int i = 0; i < outputSize; i++)
    {
        output[i] /= sum;
    }

    // Rank class indices instead of swapping the scores themselves. The original swapped
    // through an int temporary (truncating the float scores) and then recovered the indices
    // by comparing floats for equality.
    std::vector<int> order(outputSize);
    for (int i = 0; i < outputSize; i++)
    {
        order[i] = i;
    }

    // Partial exchange sort: after pass i, order[i] holds the i-th best class.
    for (int i = 0; i < topK; i++)
    {
        for (int j = i + 1; j < outputSize; j++)
        {
            if (output[order[i]] < output[order[j]])
            {
                const int temp = order[i];
                order[i] = order[j];
                order[j] = temp;
            }
        }
    }

    for (int i = 0; i < topK; i++)
    {
        cout << output[order[i]] << std::endl;
    }

    // One label per line; the line number is used as the class index.
    std::vector<std::string> labels;
    std::string line;

    std::ifstream readFile("/home/sxhlvye/Trial/yolov3-9.5.0/imagenet_classes.txt");
    while (std::getline(readFile, line))
    {
        labels.push_back(line);
    }

    cout << "top 5: " << std::endl;

    for (int i = 0; i < topK; i++)
    {
        const std::size_t cls = static_cast<std::size_t>(order[i]);
        // Fall back to the numeric index when the label file is missing or too short.
        const std::string name = (cls < labels.size()) ? labels[cls] : ("class " + std::to_string(cls));
        cout << name << "--->" << output[order[i]] << std::endl;
    }

    return true;
}

int32_t main()
    SampleUffSSD sample;

    if (!
        std::cout << "faile" << std::endl;
        return 0;

    if (!sample.infer())
        std::cout << "faile" << std::endl;
        return 0;

    if (!sample.teardown())
        std::cout << "faile" << std::endl;
        return 0;






将keras的模型转化为onnx模型 - 简书

tensorflow 小于_用TensorRT C++ API加速TensorFlow模型实例_莫祖兰的博客-CSDN博客


tensorflow中ckpt转pb - 知乎


2022年3月20日 22 : 46


【Python】 14-CVS文件操作
