题目要求:学习了解单目深度估计模型 MonoDepthv2,并根据其 Python 源码将其集成到现有的 ONNX 系列模型中。MonoDepthv2 论文:Digging Into Self-Supervised Monocular Depth Estimation;MonoDepthv2 源码:Monodepth2 GitHub。
分析:1)了解 MonoDepthv2 的基本原理并理解其代码;2)将模型转换为更加方便高效的 ONNX 模型,并在 OpenCV 中完成推理过程(并验证)。
- 结果展示:
- Pytorch转ONNX模型
- 合并Encoder和Decoder为一个模型
import matplotlib as mpl
import matplotlib.cm as cm
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms, datasets
import networks
from layers import disp_to_depth
from utils import download_model_if_doesnt_exist
from evaluate_depth import STEREO_SCALE_FACTOR
from collections import OrderedDict
from layers import *
import cv2
class Encoder_Decoder(nn.Module):
    """Chain an encoder and a depth decoder into one module.

    Holding both stages in a single ``nn.Module`` means one call runs the
    whole pipeline: the input goes through ``encoder`` and the encoder's
    result is handed straight to ``depth_decoder``.
    """

    def __init__(self, encoder, decoder):
        """Keep references to the two sub-networks.

        Args:
            encoder: module applied to the input first.
            decoder: module applied to whatever ``encoder`` returns.
        """
        super().__init__()
        self.encoder = encoder
        self.depth_decoder = decoder

    def forward(self, x):
        """Return ``depth_decoder(encoder(x))``."""
        return self.depth_decoder(self.encoder(x))
- Pytorch权重转ONNX权重
def parse_args():
    """Parse the command-line options used by the prediction/export script.

    Returns:
        argparse.Namespace with the attributes ``image_path``, ``model_name``,
        ``ext``, ``no_cuda`` and ``pred_metric_depth``.
    """
    parser = argparse.ArgumentParser(
        description='Simple testing funtion for Monodepthv2 models.')

    parser.add_argument('--image_path', type=str, default='assets/test_image.jpg',
                        help='path to a test image or folder of images')
    parser.add_argument('--model_name', type=str, default='mono_640x192',
                        help='name of a pretrained model to use')
    parser.add_argument('--ext', type=str,
                        help='image extension to search for in folder', default="jpg")
    # NOTE(review): only the help text of this option survived in the source;
    # the flag name `--no_cuda` is reconstructed from it -- confirm.
    parser.add_argument('--no_cuda',
                        help='if set, disables CUDA',
                        action='store_true')
    # `test_simple` reads `args.pred_metric_depth`, so this flag must exist.
    parser.add_argument('--pred_metric_depth',
                        help='if set, predicts metric depth instead of disparity. (This only '
                             'makes sense for stereo-trained KITTI models).',
                        action='store_true')

    return parser.parse_args()
def test_simple(args):
    """Predict disparity for a single image or folder of images, then export to ONNX.

    The pretrained encoder/decoder weights are read from
    ``models/<args.model_name>/``, wrapped in one ``Encoder_Decoder`` module,
    run on every input image (a colour-mapped ``*_disp.jpeg`` is written next
    to each input), and finally exported to ``mono.onnx``.

    NOTE(review): the code treats the decoder output as a single tensor
    (``disp.shape``, ``interpolate(disp, ...)``); this assumes
    ``networks.DepthDecoder`` returns a tensor rather than a dict -- confirm.

    Args:
        args: namespace providing ``model_name``, ``image_path``, ``ext`` and
            ``pred_metric_depth``.

    Raises:
        AssertionError: if ``args.model_name`` is None.
        Exception: if ``args.image_path`` is neither a file nor a directory.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    device = torch.device("cpu")

    if args.pred_metric_depth and "stereo" not in args.model_name:
        print("Warning: The --pred_metric_depth flag only makes sense for stereo-trained KITTI "
              "models. For mono-trained models, output depths will not in metric space.")

    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    print(" Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # The checkpoint also stores the input size the model was trained with.
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    # Keep only the entries that are real encoder parameters, then apply them;
    # without load_state_dict the network would keep its initial weights.
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder_keys}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print(" Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # Collect the images to process.
    if os.path.isfile(args.image_path):
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        paths = glob.glob(os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # Build the merged model once so it also exists for the export below,
    # even when no image ends up being processed.
    model = Encoder_Decoder(encoder=encoder, decoder=depth_decoder)

    with torch.no_grad():
        for idx, image_path in enumerate(paths):
            if image_path.endswith("_disp.jpg"):
                # Skip files that are themselves disparity outputs.
                continue

            # Load the image and bring it to the network input size.
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)
            input_image = input_image.to(device)

            disp = model(input_image)
            print('disp: ', disp.shape)

            # Resize the prediction back to the original image resolution.
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear", align_corners=False)
            disp_resized_np = disp_resized.squeeze().cpu().numpy()

            # Colour-map the disparity, clipping at the 95th percentile.
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            print(" Processed {:d} of {:d} images - saved predictions to:".format(
                idx + 1, len(paths)))
            print(" - {}".format(name_dest_im))

    print('-> Done!')

    # Export the merged encoder+decoder with a fixed 1x3x192x640 dummy input.
    x = torch.rand(1, 3, 192, 640)
    input_names = ['input']
    output_names = ['output']
    torch.onnx.export(model, x, 'mono.onnx', input_names=input_names,
                      output_names=output_names, opset_version=11, verbose=True)
def onnx_inference():
    """Run ``mono.onnx`` on the test image with OpenCV DNN and ONNX Runtime.

    The OpenCV DNN pass prints the shape of the network output. The ONNX
    Runtime pass produces the disparity map that is colour-mapped and written
    to ``disp_color.png``.

    NOTE(review): the post-processing assumes the model has one output of
    shape (1, 1, H, W) -- confirm against the exported graph.

    Raises:
        FileNotFoundError: if ``assets/test_image.jpg`` cannot be read.
    """
    img = cv2.imread("assets/test_image.jpg")
    if img is None:
        # cv2.imread returns None instead of raising when the read fails.
        raise FileNotFoundError("Can not read image: assets/test_image.jpg")

    # --- OpenCV DNN ---
    # swapRB=True converts OpenCV's BGR order to RGB.
    blobImage = cv2.dnn.blobFromImage(img, 1.0 / 255.0, (640, 192), None, True, False)
    net = cv2.dnn.readNet('mono.onnx')
    net.setInput(blobImage)  # the blob must be fed to the net before forward()
    outNames = net.getUnconnectedOutLayersNames()
    outs = net.forward(outNames)
    print('cv outs: ', outs[0].shape)

    # --- ONNX Runtime ---
    model = onnx.load('mono.onnx')
    onnx.checker.check_model(model)
    session = ort.InferenceSession('mono.onnx')

    # Feed the same channel order (RGB) as the OpenCV DNN pass above.
    rgb = cv2.cvtColor(cv2.resize(img, (640, 192)), cv2.COLOR_BGR2RGB)
    scaled = np.asarray(rgb) / 255.0
    input_image = scaled[np.newaxis, :].astype(np.float32).transpose((0, 3, 1, 2))

    outs = session.run(None, input_feed={'input': input_image})
    disp = np.squeeze(outs[0], axis=(0, 1))  # (1, 1, H, W) -> (H, W)
    disp = (disp * 255.0).astype(np.uint8)

    colored = cv2.applyColorMap(disp, colormap=cv2.COLORMAP_SUMMER)
    cv2.imwrite('disp_color.png', colored)
if __name__ == '__main__':
    args = parse_args()
    # Run the PyTorch prediction and ONNX export, then check the exported
    # model with the OpenCV / ONNX Runtime inference pass.
    test_simple(args)
    onnx_inference()
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>
#include <fstream>
using namespace cv;
using namespace dnn;
using namespace std;
class baseDepth
baseDepth(int h, int w, const string& model_path = "model/mono.onnx") {
this->inHeight = h;
this->inWidth = w;
cout << "start" << endl;
this->net = readNetFromONNX(model_path);
cout << "end" << endl;
Mat depth(Mat& frame);
Mat viewer(vector<Mat> imgs, double alpha=0.80);
Net net;
int inWidth;
int inHeight;
// Runs the depth network on `frame` and returns the prediction as a
// colour-mapped (COLORMAP_MAGMA) image resized to the size of `frame`.
// The result is also written to "inference/depth_color.png".
// Throws cv::Exception if `frame` is empty or the network output is not 4-D.
Mat baseDepth::depth(Mat& frame) {
    CV_Assert(!frame.empty());
    const int ori_h = frame.rows;
    const int ori_w = frame.cols;
    cout << "ori: " << ori_h << " , " << ori_w << endl;

    // Scale to [0, 1], resize to the network input size, swap B and R channels.
    Mat blobImage = blobFromImage(frame, 1.0 / 255.0, Size(this->inWidth, this->inHeight), Scalar(0, 0, 0), true, false);
    // The blob must be handed to the network before forward() is called.
    this->net.setInput(blobImage);
    cout << "read model" << endl;

    vector<Mat> scores;
    this->net.forward(scores, this->net.getUnconnectedOutLayersNames());
    CV_Assert(!scores.empty() && scores[0].dims == 4);

    const int channel = scores[0].size[1];
    const int h = scores[0].size[2];
    const int w = scores[0].size[3];
    cout << "c: " << channel << " , h: " << h << " , w: " << w << endl;

    // View the first channel of the first batch item as an h x w float image
    // (no copy: the header points into scores[0]).
    Mat depthMap(h, w, CV_32F, scores[0].ptr<float>(0, 0));
    cout << depthMap.size() << endl;
    depthMap *= 255.0;

    // Convert into a separate 8-bit buffer instead of aliasing the network output.
    Mat depth8u;
    depthMap.convertTo(depth8u, CV_8UC1);
    resize(depth8u, depth8u, Size(ori_w, ori_h));

    Mat depthColor;
    applyColorMap(depth8u, depthColor, COLORMAP_MAGMA);
    if (!imwrite("inference/depth_color.png", depthColor)) {
        // imwrite reports failure (e.g. missing directory) via its return value.
        cerr << "Warning: could not write inference/depth_color.png" << endl;
    }
    return depthColor;
}
// Builds a side-by-side view of imgs[0] and imgs[1].
// Both images are resized to alpha * size-of-imgs[0] and copied into one
// canvas with a 2-pixel margin around and between them.
// Throws cv::Exception if fewer than two images are given, if either image
// is empty, or if the two images have different types.
Mat baseDepth::viewer(vector<Mat> imgs, double alpha){
    CV_Assert(imgs.size() >= 2);
    CV_Assert(!imgs[0].empty() && !imgs[1].empty());
    // copyTo into an ROI of a different type would reallocate the ROI header
    // and leave the canvas untouched, so require matching types.
    CV_Assert(imgs[0].type() == imgs[1].type());

    const Size imgOriSize = imgs[0].size();
    const Size imgStdSize(static_cast<int>(imgOriSize.width * alpha),
                          static_cast<int>(imgOriSize.height * alpha));
    const int delta_h = 2, delta_w = 2;

    // Zero-initialise so the margins are black rather than uninitialised memory.
    Mat imgWindow(imgStdSize.height + 2 * delta_h,
                  imgStdSize.width * 2 + 3 * delta_w,
                  imgs[0].type(), Scalar::all(0));

    Mat imgStd;
    // Left tile.
    resize(imgs[0], imgStd, imgStdSize, 0, 0, INTER_LINEAR);
    imgStd.copyTo(imgWindow(Rect(Point2i(delta_w, delta_h), imgStdSize)));
    // Right tile.
    resize(imgs[1], imgStd, imgStdSize, 0, 0, INTER_LINEAR);
    imgStd.copyTo(imgWindow(Rect(Point2i(imgStdSize.width + 2 * delta_w, delta_h), imgStdSize)));
    return imgWindow;
}
// Usage excerpt: when the configured model is "monodepth", estimate depth for
// `srcimg` and show the source image next to the colour-mapped depth map.
// NOTE(review): `config` and `srcimg` are defined outside this excerpt, and the
// closing brace of this `if` is not visible here.
if(config.model_name == "monodepth"){
// Network input size handed to baseDepth: height 192, width 640.
int h = 192, w = 640;
baseDepth model(h, w);
Mat depthMap = model.depth(srcimg);
static const string kWinName = "Deep learning Mono depth estimation in OpenCV";
namedWindow(kWinName, WINDOW_NORMAL);
// Side-by-side view, each image scaled to 90% of the source size.
Mat res = model.viewer({srcimg, depthMap}, 0.90);
imshow(kWinName, res);
- 小结
转换过程主要遇到的问题: 1)MonoDepth模型较丰富,内容上涉及单目和双目估计,模型结构上又分为Encoder和Decoder两部分,转ONNX时需要合并成一个模型测试; 2)MonoDepth的Decoder部分需要输入多个特征层,而ONNX forward通常只支持单个输入,因此合并模型只forward了第一个特征层(实际也只用到了第一个特征层); 3)PIL、matplotlib、cv2对图像的排列顺序不尽相同,可能出现ONNX转换成功而结果很奇怪的情况,此时需要重点排查图像读取和存储方式的差异; 4)深度估计只看深度结果图很难了解细节,需要跟原图放在一起对比才能清楚地理解深度,因此输出时尽量将两者并排展示,并添加颜色渲染,以提高辨识度。