onnx效率问题:和Module & DataParallel比较
1、实验1 - 人脸定位 + 人脸关键点检测
使用mbv2模型进行人脸关键点检测,算法链接如下:https://github1s.com/cosanlab/py-feat/blob/HEAD/feat/landmark_detectors/basenet_test.py
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
class MobileNet_GDConv(nn.Module):
    """MobileNetV2 backbone followed by two ``ConvBlock`` heads.

    The backbone keeps every child module of torchvision's ``mobilenet_v2``
    except the last one. ``linear7`` and ``linear1`` are then applied in
    sequence and the result is flattened to one row per sample.

    Args:
        num_classes: Output channel count of the final ``ConvBlock``.
    """

    def __init__(self, num_classes):
        super().__init__()
        # No torchvision weights are requested here (pretrained=False).
        self.pretrain_net = models.mobilenet_v2(pretrained=False)
        # Keep all child modules of the torchvision model except the last.
        self.base_net = nn.Sequential(*list(self.pretrain_net.children())[:-1])
        # ConvBlock is defined outside this snippet; presumably dw=True means
        # depthwise and linear=True means no activation -- TODO confirm.
        self.linear7 = ConvBlock(1280, 1280, (7, 7), 1, 0, dw=True, linear=True)
        self.linear1 = ConvBlock(1280, num_classes, 1, 1, 0, linear=True)

    def forward(self, x):
        """Run backbone and both heads, then flatten to ``(batch, -1)``."""
        x = self.base_net(x)
        x = self.linear7(x)
        x = self.linear1(x)
        x = x.view(x.size(0), -1)
        return x
1)使用Module加载mbv2模型(Bad)
class MyTest():
    """Run the landmark model as a single, plain ``nn.Module``."""

    def test_get_landmark_from_mbv2_Single(self):
        """Average the wall-clock time of 1000 landmark calls on one image."""
        frame = cv2.imread("./img/calibrate_glasses.jpg")
        # (The original snippet carried a leftover "pth -> onnx" marker here;
        # no conversion happens in this test.)
        faces = scrfd_detector.detect_faces(frame)[0]
        landmark_detector = MobileNet_GDConv(136)
        checkpoint = torch.load("./pth/mobilenet_224_model_best_gdconv_external.pth.tar", map_location="cpu")
        model_dict = landmark_detector.state_dict()
        # Strip the "module." prefix from the checkpoint keys so they match
        # the parameter names of the unwrapped model.
        state_dict = {k.replace("module.", ""): v for k, v in checkpoint["state_dict"].items()}
        model_dict.update(state_dict)
        landmark_detector.load_state_dict(model_dict)
        # NOTE(review): nothing in this block calls eval() or torch.no_grad(),
        # and the first (warm-up) call is included in the average -- verify
        # what get_landmark_from_mbv2 does before comparing runs.
        count = 1000
        landmark_time = 0
        for i in range(count):
            print(i)
            startT = time.time()
            landmarks = get_landmark_from_mbv2(landmark_detector, frame, faces)
            endT = time.time()
            landmark_time += (endT - startT)
        print(f"consume = {(landmark_time / count) * 1000} ms ")
运算开销为:
consume = 357.7960877418518 ms #Module
2)使用DataParallel加载mbv2模型(Perfect)
class MyTest():
    """Run the landmark model wrapped in ``torch.nn.DataParallel``."""

    def test_get_landmark_from_mbv2_DP(self):
        """Average the wall-clock time of 1000 landmark calls on one image."""
        frame = cv2.imread("./img/calibrate_glasses.jpg")
        # (The original snippet carried a leftover "pth -> onnx" marker here;
        # no conversion happens in this test.)
        faces = scrfd_detector.detect_faces(frame)[0]
        landmark_detector = MobileNet_GDConv(136)
        landmark_detector = torch.nn.DataParallel(landmark_detector)
        checkpoint = torch.load("./pth/mobilenet_224_model_best_gdconv_external.pth.tar", map_location="cpu")
        # The checkpoint keys are loaded unchanged into the wrapped model.
        landmark_detector.load_state_dict(checkpoint["state_dict"])
        # NOTE(review): nothing in this block calls eval() or torch.no_grad(),
        # and the first (warm-up) call is included in the average -- verify
        # what get_landmark_from_mbv2 does before comparing runs.
        count = 1000
        landmark_time = 0
        for i in range(count):
            print(i)
            startT = time.time()
            landmarks = get_landmark_from_mbv2(landmark_detector, frame, faces)
            endT = time.time()
            landmark_time += (endT - startT)
        print(f"consume = {(landmark_time / count) * 1000} ms ")
计算开销为:
consume = 18.832219123840332 ms #DataParallel
3)使用onnx加载mbv2模型(Good)
先将pth.tar权重文件导入到模型中(注意不是DataParallel模型),接着导出onnx权重文件。
class MyTest(unittest.TestCase):
    """Run the landmark model through an ONNX Runtime session."""

    def test_get_landmark_from_mbv2(self):
        """Average the wall-clock time of 1000 landmark calls on one image."""
        frame = cv2.imread("./img/calibrate_glasses.jpg")
        # (The original snippet carried a "pth -> onnx" marker here; this test
        # only loads an already exported .onnx file.)
        faces = scrfd_detector.detect_faces(frame)[0]
        ort_sess = onnxruntime.InferenceSession(
            './weights/landmark_mbv2.onnx')
        # NOTE(review): the first (warm-up) call is included in the average.
        count = 1000
        landmark_time = 0
        for i in range(count):
            print(i)
            startT = time.time()
            landmarks = detect_landmarks(ort_sess, frame, [faces])
            endT = time.time()
            landmark_time += (endT - startT)
        print(f"consume = {(landmark_time / count) * 1000} ms ")
计算开销为
consume = 76.81396794319153 ms #onnx
4)疑惑
为什么使用onnxRuntime运行模型时,计算开销虽比Module加载的模型小,但是比DataParallel加载的高出那么多?
2、实验2 - 人脸定位 + 人脸关键点 + 皱眉检测
1)使用Module加载mbv3_small模型(Well)
# Lazily build the frown classifier as a single (plain nn.Module) model.
if frownModel is None:
    frownModel = mobilenetv3.MobileNetV3_Small(num_classes=2)
    frownModel.load_state_dict(torch.load("frownMobileNet_CE_sobel.pth"))
for i in range(1000):
    ...
    # Timed section: frown inference plus mapping the result to a label.
    startTime = time.time()
    brow_detect = frown_infer_CE(ROI, frownModel)
    brow_state = "frown" if brow_detect == 1 else "normal"
    endTime = time.time()
    frownDet_time_total += (endTime - startTime)
    ...
各模块计算开销为:
locate_time_mean = 30.982503175735474ms #人脸定位 onnx
landmark_time_mean = 40.17165660858154ms #人脸关键点检测 DataParallel
blink_time_mean = 2.109795331954956ms
headpose_time_mean = 1.8869452476501465ms
distance_time_mean = 0.00399017333984375ms
browCropAndProcess_time_mean = 3.628519296646118ms
frownDet_time_mean = 46.93113684654236ms #皱眉检测 Module
browRaise_time_mean = 0.12495064735412598ms
yawn_time_mean = 0.006979942321777344ms
#总耗时约为30 + 40 + 46 = 116ms
2)使用DataParallel加载mbv3_small模型(perfect)
# Lazily build the frown classifier wrapped in torch.nn.DataParallel.
if frownModel is None:
    frownModel = mobilenetv3.MobileNetV3_Small(num_classes=2)
    frownModel = torch.nn.DataParallel(frownModel)
    checkPoint = torch.load("frownMobileNet_CE_sobel.pth")
    # Prefix every checkpoint key with "module." before loading it into the
    # wrapped model.
    state_dict = {"module." + k: v for k, v in checkPoint.items()}
    frownModel.load_state_dict(state_dict)
for i in range(1000):
    ...
    # Timed section: frown inference plus mapping the result to a label.
    startTime = time.time()
    brow_detect = frown_infer_CE(ROI, frownModel)
    brow_state = "frown" if brow_detect == 1 else "normal"
    endTime = time.time()
    frownDet_time_total += (endTime - startTime)
    ...
计算开销
locate_time_mean = 26.766409873962402ms #人脸定位 onnx
landmark_time_mean = 38.56730318069458ms #人脸关键点检测 DataParallel
blink_time_mean = 2.069387912750244ms
headpose_time_mean = 1.7124550342559814ms
distance_time_mean = 0.001995086669921875ms
browCropAndProcess_time_mean = 3.3850138187408447ms
frownDet_time_mean = 45.7048454284668ms #人脸皱眉检测 DataParallel
browRaise_time_mean = 0.13062715530395508ms
yawn_time_mean = 0.010970592498779297ms
#总耗时约为26 + 38 + 45 = 109ms
3)使用onnx加载mbv3_small模型(Good)
# Lazily create the ONNX Runtime session for the frown classifier.
if frownModel is None:
    frownModel = onnxruntime.InferenceSession("frownClassifier.onnx")
for i in range(1000):
    ...
    # Timed section: frown inference plus mapping the result to a label.
    startTime = time.time()
    brow_detect = frown_infer_CE(ROI, frownModel)
    brow_state = "frown" if brow_detect == 1 else "normal"
    endTime = time.time()
    frownDet_time_total += (endTime - startTime)
    ...
计算开销
locate_time_mean = 33.561965465545654ms #人脸定位 onnx
landmark_time_mean = 64.68117618560791ms #人脸关键点检测 DataParallel
blink_time_mean = 2.0567142963409424ms
headpose_time_mean = 1.7259035110473633ms
distance_time_mean = 0.001996755599975586ms
browCropAndProcess_time_mean = 3.628046989440918ms
frownDet_time_mean = 25.39611053466797ms #人脸皱眉检测 onnx
browRaise_time_mean = 0.15384936332702637ms
yawn_time_mean = 0.00897359848022461ms
#总耗时约为33 + 64 + 25 = 122ms
4)使用onnx加载mbv2,mbv3_small模型(Bad)
步骤即为实验1第3)步(onnx加载mbv2)和实验2第3)步(onnx加载mbv3_small)的组合
for i in range(1000):
    ...
    # Timed section 1: landmark detection.
    startTime = time.time()
    landmark = detector.detect_landmarks(img, [dets])
    endTime = time.time()
    landmark_time_total += (endTime - startTime)
    # Timed section 2: frown inference plus mapping the result to a label.
    startTime = time.time()
    brow_detect = frown_infer_CE(ROI, frownModel)
    brow_state = "frown" if brow_detect == 1 else "normal"
    endTime = time.time()
    frownDet_time_total += (endTime - startTime)
    ...
计算开销
locate_time_mean = 22.34066390991211ms #人脸定位 onnx
landmark_time_mean = 103.81071853637695ms #人脸关键点检测 onnx
blink_time_mean = 1.5990486145019531ms
headpose_time_mean = 1.2555391788482666ms
distance_time_mean = 0.005983591079711914ms
browCropAndProcess_time_mean = 2.7420103549957275ms
frownDet_time_mean = 18.23524832725525ms #人脸皱眉检测 onnx
browRaise_time_mean = 0.1012418270111084ms
yawn_time_mean = 0.013963460922241211ms
#总耗时: 22 + 18 + 103 = 143 ms
5)疑惑
为什么onnx加载mbv3_small模型时,计算开销都比Module和DataParallel小;而对于mbv2模型来说,计算开销比Module小,但比DataParallel高?
mbv2_landmark:DataParallel(18.83ms) < onnx(76.81ms) < Module(357.79 ms)
mbv3_small:onnx(25.39ms)< DataParallel(45.70ms) < Module(46.93ms)
3、小总结 & 推测
我有两点猜测
- onnx推理引擎的确可以提高推理速度,减少计算开销。对于onnxRuntime推理引擎来说,它需要为onnx模型分配指定资源(CPU、内存),这些资源有可能被其他onnx模型或其他进程占用,此时onnxRuntime需要等待这些资源释放。
- 至于为什么onnxRuntime在仅使用CPU资源的情况下,推理速度比DataParallel慢,我猜测是因为上面的mbv2模型使用的是torchvision.models里面的MobileNetV2,在推理性能上,原生的DataParallel相较于onnxRuntime实现了更好的模型推理优化。
|