Libtorch在visual studio 2019 上的部署
Libtorch对于YOLOv5的部署(供自己以后复习使用)
libtorch是pytorch的c++前端,C++对torch的各种操作还是比较友好,通过torch::或者方法名后加_的形式都可以找到对应实现,而我也是为了学习c++,所以练习一下
各库的安装
1.libtorch libtorch直接可以在官网下载,但是使用的话还是要注意,就是你后面训练的模型使用的pytorch版本要和libtorch对应(如我使用的是torch1.8.1+cu11.1和libtorch1.8.1GPU版本)。还有一点要注意,libtorch有两个版本:release版本和debug版本。debug版本带调试符号,体积更大,适合开发调试;release版本体积更小、运行更快,适合正式部署。下载地址见pytorch官网
安装成功,测试代码:
#include "torch/torch.h"
#include "torch/script.h"

// Smoke test: build a random 3x2 tensor and print it, which verifies that
// libtorch headers resolve and the libraries link correctly.
int main()
{
    const torch::Tensor sample = torch::randn({ 3,2 });
    std::cout << sample;
    return 0;
}
2.opencv和cmake opencv和cmake也直接在官网下载
环境变量和各种与visual studio 2019搭配的头文件、库文件、依赖的设置
参考这篇博文 跟着一步步来,应该没什么问题,我觉得需要注意的是 这两个头文件路径中常用的头文件分别是: #include “torch/script.h” #include “torch/torch.h” 第二个头文件,但是一般都没有说这个头文件所在路径,可能导致程序找不到很多定义。
模型转换(pytorch模型 → TorchScript模型)
参考这篇知乎文章(https://zhuanlan.zhihu.com/p/146453159),讲的很详细,我在这是用的yolov5官方最新5.0代码和权重。官方默认使用cpu转换,好像CPU转换的模型用libtorch GPU版本可以编译,而GPU转换的模型用libtorch CPU版本则不行。
import argparse
import sys
import time
from pathlib import Path
import torch
import torch.nn as nn
from torch.utils.mobile_optimizer import optimize_for_mobile
FILE = Path(__file__).absolute()
sys.path.append(FILE.parents[0].as_posix())
from models.common import Conv
from models.yolo import Detect
from models.experimental import attempt_load
from utils.activations import Hardswish, SiLU
from utils.general import colorstr, check_img_size, check_requirements, file_size, set_logging
from utils.torch_utils import select_device
def run(weights='./last6.9.pt',  # checkpoint path
        img_size=(640, 640),  # (height, width) of the traced input
        batch_size=1,
        device=torch.device('cpu'),  # cuda device string or 'cpu'
        include=('torchscript', 'onnx', 'coreml'),  # formats to export
        half=False,  # FP16 export (GPU only)
        inplace=False,  # set Detect() inplace=True
        train=False,  # export in train mode
        optimize=False,  # TorchScript: optimize for mobile
        dynamic=False,  # ONNX: dynamic axes
        simplify=False,  # ONNX: simplify (unused in this excerpt)
        opset_version=12,  # ONNX opset (unused in this excerpt)
        ):
    """Export a YOLOv5 .pt checkpoint to TorchScript.

    Loads the model, patches export-unfriendly modules (in-place
    activations, Detect head), runs warm-up passes, then traces and
    saves the model as `<weights>.torchscript.pt`.
    """
    t = time.time()
    include = [x.lower() for x in include]
    img_size *= 2 if len(img_size) == 1 else 1  # (s,) -> (s, s): square input
    device = select_device(device)
    assert not (device.type == 'cpu' and half), '--half only compatible with GPU export, i.e. use --device 0'
    model = attempt_load(weights, map_location=device)  # FP32 model
    labels = model.names
    gs = int(max(model.stride))  # grid size: the model's max stride
    img_size = [check_img_size(x, gs) for x in img_size]  # force multiples of gs
    img = torch.zeros(batch_size, 3, *img_size).to(device)  # dummy input for tracing
    if half:
        img, model = img.half(), model.half()
    model.train() if train else model.eval()
    for k, m in model.named_modules():
        m._non_persistent_buffers_set = set()  # torch 1.6.0 compatibility shim
        if isinstance(m, Conv):
            # Swap in export-friendly activation implementations.
            if isinstance(m.act, nn.Hardswish):
                m.act = Hardswish()
            elif isinstance(m.act, nn.SiLU):
                m.act = SiLU()
        elif isinstance(m, Detect):
            m.inplace = inplace
            m.onnx_dynamic = dynamic
    for _ in range(2):
        y = model(img)  # dry runs to settle lazy init / warm caches
    print(f"\n{colorstr('PyTorch:')} starting from {weights} ({file_size(weights):.1f} MB)")
    if 'torchscript' in include or 'coreml' in include:
        prefix = colorstr('TorchScript:')
        try:
            print(f'\n{prefix} starting export with torch {torch.__version__}...')
            f = weights.replace('.pt', '.torchscript.pt')
            ts = torch.jit.trace(model, img, strict=False)
            # Sanity check: compare traced vs eager output on the SAME dummy
            # input.  BUG FIX: the original built a fresh CPU/FP32
            # torch.ones(1, 3, 640, 640) here, which crashes for non-640
            # sizes, --half, or GPU devices; reusing `img` is always valid.
            output1 = ts(img)
            output2 = model(img)
            print(output1)
            print(output2)
            (optimize_for_mobile(ts) if optimize else ts).save(f)
            print(f'{prefix} export success, saved as {f} ({file_size(f):.1f} MB)')
        except Exception as e:
            print(f'{prefix} export failure: {e}')
def parse_opt():
parser = argparse.ArgumentParser()
parser.add_argument('--weights', type=str, default='./last6.9.pt', help='weights path')
parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image (height, width)')
parser.add_argument('--batch-size', type=int, default=1, help='batch size')
parser.add_argument('--device', default='cpu', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
parser.add_argument('--include', nargs='+', default=['torchscript', 'onnx', 'coreml'], help='include formats')
parser.add_argument('--half', action='store_true', help='FP16 half-precision export')
parser.add_argument('--inplace', action='store_true', help='set YOLOv5 Detect() inplace=True')
parser.add_argument('--train', action='store_true', help='model.train() mode')
parser.add_argument('--optimize', action='store_true', help='TorchScript: optimize for mobile')
parser.add_argument('--dynamic', action='store_true', help='ONNX: dynamic axes')
parser.add_argument('--simplify', action='store_true', help='ONNX: simplify model')
parser.add_argument('--opset-version', type=int, default=12, help='ONNX: opset version')
opt = parser.parse_args()
return opt
def main(opt):
    """Entry point: set up logging, echo the chosen options, run the export."""
    set_logging()
    summary = ', '.join(f'{k}={v}' for k, v in vars(opt).items())
    print(colorstr('export: ') + summary)
    run(**vars(opt))


if __name__ == "__main__":
    main(parse_opt())
Libtorch在visual studio 2019上的部署
我是用了这篇博文的代码 使用yolov5x.torchscript.pt,我把它修改成了只测试图片,但还有点问题
#include <opencv2/opencv.hpp>
#include <torch/script.h>
#include <torch/torch.h>
#include <algorithm>
#include <iostream>
#include <time.h>
using namespace std;
// Greedy (CPU) non-maximum suppression over raw YOLOv5 predictions.
//
// preds: [batch, num_boxes, 5 + num_classes]; each row looks like
//        (cx, cy, w, h, objectness, class scores...) — assumed from the
//        standard YOLOv5 output layout, TODO confirm against the export.
// Returns one [num_kept, 6] tensor per image: (x1, y1, x2, y2, score, class).
vector<torch::Tensor> non_max_suppression(torch::Tensor preds, float score_thresh = 0.25, float iou_thresh = 0.45)
{
    std::vector<torch::Tensor> output;
    for (size_t i = 0; i < preds.sizes()[0]; ++i)  // one image of the batch at a time
    {
        torch::Tensor pred = preds.select(0, i);
        pred = pred.to(at::kCPU);
        // Confidence = objectness * best class score; drop rows below threshold.
        torch::Tensor scores = pred.select(1, 4) * std::get<0>(torch::max(pred.slice(1, 5, pred.sizes()[1]), 1));
        pred = torch::index_select(pred, 0, torch::nonzero(scores > score_thresh).select(1, 0));
        if (pred.sizes()[0] == 0) continue;
        // Convert (cx, cy, w, h) -> (x1, y1, x2, y2) in place.  Statement
        // order matters: columns 0/1 become x1/y1 first, then columns 2/3
        // are computed FROM the already-rewritten x1/y1 (x2 = x1 + w).
        pred.select(1, 0) = pred.select(1, 0) - pred.select(1, 2) / 2;
        pred.select(1, 1) = pred.select(1, 1) - pred.select(1, 3) / 2;
        pred.select(1, 2) = pred.select(1, 0) + pred.select(1, 2);
        pred.select(1, 3) = pred.select(1, 1) + pred.select(1, 3);
        // Fold the best class score into column 4 and the class index into column 5.
        std::tuple<torch::Tensor, torch::Tensor> max_tuple = torch::max(pred.slice(1, 5, pred.sizes()[1]), 1);
        pred.select(1, 4) = pred.select(1, 4) * std::get<0>(max_tuple);
        pred.select(1, 5) = std::get<1>(max_tuple);
        torch::Tensor dets = pred.slice(1, 0, 6);  // rows: (x1, y1, x2, y2, conf, cls)
        torch::Tensor keep = torch::empty({ dets.sizes()[0] });  // float scratch; cast to int64 below
        torch::Tensor areas = (dets.select(1, 3) - dets.select(1, 1)) * (dets.select(1, 2) - dets.select(1, 0));
        // Sort box indices by confidence, descending (3rd arg = descending flag).
        std::tuple<torch::Tensor, torch::Tensor> indexes_tuple = torch::sort(dets.select(1, 4), 0, 1);
        torch::Tensor v = std::get<0>(indexes_tuple);
        torch::Tensor indexes = std::get<1>(indexes_tuple);
        int count = 0;
        // Greedy NMS: keep the current best box, drop every remaining box
        // whose IoU with it exceeds iou_thresh, repeat on the survivors.
        while (indexes.sizes()[0] > 0)
        {
            keep[count] = (indexes[0].item().toInt());
            count += 1;
            // Intersection of the kept box with each remaining box.
            torch::Tensor lefts = torch::empty(indexes.sizes()[0] - 1);
            torch::Tensor tops = torch::empty(indexes.sizes()[0] - 1);
            torch::Tensor rights = torch::empty(indexes.sizes()[0] - 1);
            torch::Tensor bottoms = torch::empty(indexes.sizes()[0] - 1);
            torch::Tensor widths = torch::empty(indexes.sizes()[0] - 1);
            torch::Tensor heights = torch::empty(indexes.sizes()[0] - 1);
            for (size_t i = 0; i < indexes.sizes()[0] - 1; ++i)  // NOTE: shadows the batch index i
            {
                lefts[i] = std::max(dets[indexes[0]][0].item().toFloat(), dets[indexes[i + 1]][0].item().toFloat());
                tops[i] = std::max(dets[indexes[0]][1].item().toFloat(), dets[indexes[i + 1]][1].item().toFloat());
                rights[i] = std::min(dets[indexes[0]][2].item().toFloat(), dets[indexes[i + 1]][2].item().toFloat());
                bottoms[i] = std::min(dets[indexes[0]][3].item().toFloat(), dets[indexes[i + 1]][3].item().toFloat());
                widths[i] = std::max(float(0), rights[i].item().toFloat() - lefts[i].item().toFloat());
                heights[i] = std::max(float(0), bottoms[i].item().toFloat() - tops[i].item().toFloat());
            }
            // IoU = intersection / (area_kept + area_other - intersection).
            torch::Tensor overlaps = widths * heights;
            torch::Tensor ious = overlaps / (areas.select(0, indexes[0].item().toInt()) + torch::index_select(areas, 0, indexes.slice(0, 1, indexes.sizes()[0])) - overlaps);
            // Survivors: IoU <= threshold; the +1 skips past the kept box itself.
            indexes = torch::index_select(indexes, 0, torch::nonzero(ious <= iou_thresh).select(1, 0) + 1);
        }
        keep = keep.toType(torch::kInt64);
        output.push_back(torch::index_select(dets, 0, keep.slice(0, 0, count)));
    }
    return output;
}
#include <torch/script.h>
#include <iostream>
#include <memory>
int main()
{
std::cout << "cuda::is_available():" << torch::cuda::is_available() << std::endl;
std::cout << "cudnn::is_available():" << torch::cuda::cudnn_is_available() << std::endl;
torch::DeviceType device_type = at::kCPU;
if (torch::cuda::is_available())
device_type = at::kCUDA;
char* model_path= "E:\\Project2\\yolov5s.torchscript.pt";
torch::jit::script::Module module = torch::jit::load(model_path);
module.to(device_type);
std::vector<std::string> classnames;
std::ifstream f("coco.names");
std::string name = "";
while (std::getline(f, name))
{
classnames.push_back(name);
}
double t_start, t_end, t_cost;
t_start = cv::getTickCount();
std::string video = "E:\\Project2\\bicycle.jpg";
cv::Mat frame, img;
frame = cv::imread(video);
int width = frame.size().width;
int height = frame.size().height;
int count = 0;
cv::resize(frame, img, cv::Size(640, 640));
cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
img.convertTo(img, CV_32FC3, 1.0f / 255.0f);
auto imgTensor = torch::from_blob(img.data, { 1, img.rows, img.cols, img.channels() }).to(device_type);
imgTensor = imgTensor.permute({ 0, 3, 1, 2 }).contiguous();
std::vector<torch::jit::IValue> inputs;
inputs.emplace_back(imgTensor);
torch::jit::IValue output = module.forward(inputs);
auto preds = output.toTuple()->elements()[0].toTensor();
std::vector<torch::Tensor> dets = non_max_suppression(preds, 0.35, 0.5);
std::cout << "预测结果:" << dets<<std::endl;
t_end = cv::getTickCount();
t_cost = (t_end - t_start)/ cv::getTickFrequency();
printf("time cost:", t_cost);
if (dets.size() > 0)
{
for (size_t i = 0; i < dets[0].sizes()[0]; ++i)
{
float left = dets[0][i][0].item().toFloat() * frame.cols / 640;
float top = dets[0][i][1].item().toFloat() * frame.rows / 640;
float right = dets[0][i][2].item().toFloat() * frame.cols / 640;
float bottom = dets[0][i][3].item().toFloat() * frame.rows / 640;
float score = dets[0][i][4].item().toFloat();
int classID = dets[0][i][5].item().toInt();
cv::rectangle(frame, cv::Rect(left, top, (right - left), (bottom - top)), cv::Scalar(0, 255, 0), 2);
}
}
std::cout << "-[INFO] Frame:" << std::to_string(count) << std::endl;
cv::imshow("", frame);
cv::resize(frame, frame, cv::Size(width, height));
cv::waitKey(0);
return 0;
}
(1)这一段代码总是报错,我没解决掉
(2)这一段代码,我也修改了他的,因为总是报c10::Error的错误,他是直接加载权重,我先定义变量再加载。
char* model_path= "E:\\Project2\\yolov5s.torchscript.pt";
torch::jit::script::Module module = torch::jit::load(model_path);
运行结果
因为没解决cv::putText的错误,就只有这个了
pytorch和libtorch在CPU上的运行比较
libtorch上的cpu推理421 ms(其实是我GPU真的用不上,以后在解决吧,废了!结果是不是也有点不太对,算了) pytorch上cpu推理264 ms(听说在cpu上c++比pytorch慢是正常的?)
|