RCNN系列算法指的是RCNN、Fast-RCNN、Faster-RCNN等一系列由RCNN算法演变出的算法。这类算法通常采用两个步骤来实现对目标的定位及检测,即定位+检测。RCNN系列算法中可用的定位算法有很多,主要包括滑动窗口模型和选择性搜索模型等。特征分类网络一般采用ResNet系列模型及VGG系列模型。当然我们也可尝试使用GoogleNet或者Inception系列模型进行训练,以提高复杂分类场景中的分类准确性。RCNN系列模型也被称作Two Stage模型。 Fast-RCNN论文链接 大致实现步骤如下: a、输入图片 b、搜索目标区域 c、提取特征 d、图像分类
import cv2
if __name__ == '__main__':
    # Interactive demo: run OpenCV selective search on one image and show the
    # region proposals whose size falls inside a fixed window.
    #   m -> show more proposals, l -> show fewer proposals, q -> quit.
    image_path = '6.tif'
    im = cv2.imread(image_path)
    if im is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise FileNotFoundError("Could not read image: {}".format(image_path))

    # Downscale by an integer factor so selective search has fewer pixels
    # to process.
    resize_value = 2
    new_height = int(im.shape[0] / resize_value)
    new_width = int(im.shape[1] / resize_value)
    im = cv2.resize(im, (new_width, new_height))

    # The segmentation object must be given its input image and a search
    # strategy before process() is called.
    ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()
    ss.setBaseImage(im)
    ss.switchToSelectiveSearchFast()
    rects = ss.process()
    print('Total Number of Region Proposals: {}'.format(len(rects)))

    # Start with every proposal eligible for display; the keys below adjust
    # the count in steps of `increment`.
    numShowRects = len(rects)
    increment = 50

    key_more, key_less, key_quit = ord('m'), ord('l'), ord('q')  # 109, 108, 113

    while True:
        imOut = im.copy()
        for x, y, w, h in rects[:numShowRects]:
            # Only draw proposals inside the hard-coded width/height window.
            if 400 > w > 350 and 600 > h > 500:
                cv2.rectangle(imOut, (x, y), (x + w, y + h), (0, 255, 0), 1, cv2.LINE_AA)
        cv2.imshow("Output", imOut)

        k = cv2.waitKey(0) & 0xFF
        if k == key_more:
            numShowRects += increment
        elif k == key_less and numShowRects > increment:
            numShowRects -= increment
        elif k == key_quit:
            break

    cv2.destroyAllWindows()
总结: RCNN系列算法本质上都是一样的,无非是对Two Stage的两个步骤反复优化。首先需要优化的必然是候选区域的生成方法:无论是滑动窗口法还是选择性搜索法,效率都不高。在此基础上,Faster-RCNN提出了区域候选网络(RPN),用网络配合bbox regression直接生成候选区域,使Two Stage方法真正意义上变成了完全基于深度学习的目标检测方法,并且提升了算法的效率。然后就是特征提取模型,一般特征提取模型有三种,即深度模型、广度模型和混合模型。这里深度模型指的是网络层数不断叠加的模型,典型的模型为ResNet系列模型;广度模型指的是通过不同的卷积窗口对图像特征进行提取的模型,典型的模型为VGG系列模型;混合模型则是既注重深度特征提取、又注重广度特征提取的模型,如Inception系列模型以及GoogleNet等。
YoloV1原文链接 Yolo系列算法是典型的One Stage算法。同样,在算法设计上也注重目标区域的检测以及特征的分类,这里目标区域的检测是采用对图像区域进行分类和定位的方式实现的。
import torch
import torch.nn as nn
from utils import SPP, SAM, BottleneckCSP, Conv
from backbone import resnet18
import numpy as np
import tools
class myYOLO(nn.Module):
    """Single-scale YOLO-style detector on top of a ResNet-18 backbone.

    The network predicts, for every cell of a stride-32 grid, one objectness
    score, ``num_classes`` class scores and four box offsets
    ``[tx, ty, tw, th]``. In inference mode (``trainable=False``) ``forward``
    decodes, thresholds and NMS-filters the predictions of the first image in
    the batch; in training mode it returns the losses computed by
    ``tools.loss``.
    """

    def __init__(self, device, input_size=None, num_classes=20, trainable=False,
                 conf_thresh=0.01, nms_thresh=0.5, hr=False):
        """Build the detector.

        Args:
            device: torch device on which the grid/scale tensors are created.
            input_size: ``(height, width)`` of the network input. Required.
            num_classes: number of object classes.
            trainable: if True, ``forward`` returns losses instead of boxes.
            conf_thresh: minimum class score kept by ``postprocess``.
            nms_thresh: IoU threshold used by ``nms``.
            hr: accepted for interface compatibility; unused in this class.

        Raises:
            TypeError: if ``input_size`` is None.
        """
        super(myYOLO, self).__init__()
        if input_size is None:
            # Fail early with a clear message instead of an opaque
            # "'NoneType' object is not subscriptable".
            raise TypeError("input_size must be a (height, width) pair, got None")

        self.device = device
        self.num_classes = num_classes
        self.trainable = trainable
        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh
        self.stride = 32
        # Sets input_size, grid_cell, scale and scale_torch in one place.
        self.set_grid(input_size)

        self.backbone = resnet18(pretrained=True)

        # Neck. The 1x1 Conv outputs 256 channels while the following
        # BottleneckCSP expects 256*4, so the imported SPP layer sits between
        # them.
        # NOTE(review): SPP() was missing from the listing; confirm it
        # concatenates 4 feature maps (256 -> 1024 channels).
        self.SPP = nn.Sequential(
            Conv(512, 256, k=1),
            SPP(),
            BottleneckCSP(256 * 4, 512, n=1, shortcut=False),
        )
        self.SAM = SAM(512)
        self.conv_set = BottleneckCSP(512, 512, n=3, shortcut=False)

        # Per-cell output: 1 objectness + num_classes scores + 4 box offsets.
        self.pred = nn.Conv2d(512, 1 + self.num_classes + 4, 1)

    def create_grid(self, input_size):
        """Return the (x, y) index of every grid cell as a (1, hs*ws, 2) tensor.

        ``input_size`` is ``(height, width)``; the grid has one cell per
        ``self.stride`` pixels and is laid out row by row.
        """
        w, h = input_size[1], input_size[0]
        ws, hs = w // self.stride, h // self.stride
        # Broadcasting instead of torch.meshgrid keeps the result independent
        # of meshgrid's version-specific `indexing` default.
        grid_x = torch.arange(ws).view(1, ws).expand(hs, ws)
        grid_y = torch.arange(hs).view(hs, 1).expand(hs, ws)
        grid_xy = torch.stack([grid_x, grid_y], dim=-1).float()
        return grid_xy.view(1, hs * ws, 2).to(self.device)

    def set_grid(self, input_size):
        """Recompute the grid and box-normalisation scale for a new input size."""
        self.input_size = input_size
        self.grid_cell = self.create_grid(input_size)
        # [w, h, w, h]: divides [xmin, ymin, xmax, ymax] into the 0..1 range.
        self.scale = np.array([[[input_size[1], input_size[0], input_size[1], input_size[0]]]])
        self.scale_torch = torch.tensor(self.scale.copy(), device=self.device).float()

    def decode_boxes(self, pred):
        """Convert raw offsets to corner boxes.

        input box :  [tx, ty, tw, th]
        output box : [xmin, ymin, xmax, ymax]

        The input tensor is left unmodified.
        """
        # Centre = (sigmoid offset inside the cell + cell index) * stride.
        centers = (torch.sigmoid(pred[:, :, :2]) + self.grid_cell) * self.stride
        sizes = torch.exp(pred[:, :, 2:])
        half = sizes / 2
        return torch.cat([centers - half, centers + half], dim=-1)

    def nms(self, dets, scores):
        """Pure Python NMS baseline.

        Args:
            dets: (N, 4) numpy array of [x1, y1, x2, y2] boxes.
            scores: (N,) numpy array of scores.

        Returns:
            List of indices into ``dets`` that survive suppression, in
            descending score order.
        """
        x1 = dets[:, 0]
        y1 = dets[:, 1]
        x2 = dets[:, 2]
        y2 = dets[:, 3]

        areas = (x2 - x1) * (y2 - y1)
        order = scores.argsort()[::-1]

        keep = []
        while order.size > 0:
            i = order[0]
            # The highest-scoring remaining box is always kept.
            keep.append(i)
            rest = order[1:]

            xx1 = np.maximum(x1[i], x1[rest])
            yy1 = np.maximum(y1[i], y1[rest])
            xx2 = np.minimum(x2[i], x2[rest])
            yy2 = np.minimum(y2[i], y2[rest])

            # Tiny positive floor: non-overlapping boxes get ~0 intersection.
            w = np.maximum(1e-28, xx2 - xx1)
            h = np.maximum(1e-28, yy2 - yy1)
            inter = w * h
            ovr = inter / (areas[i] + areas[rest] - inter)

            # Keep only boxes that do not overlap box i too much; +1 maps the
            # positions in `rest` back to positions in `order`.
            inds = np.where(ovr <= self.nms_thresh)[0]
            order = order[inds + 1]
        return keep

    def postprocess(self, all_local, all_conf, exchange=True, im_shape=None):
        """Threshold scores and run per-class NMS.

        Args:
            all_local: bbox_pred, (HxW, 4), bsize = 1.
            all_conf: prob_pred, (HxW, num_classes), bsize = 1.
            exchange: accepted for interface compatibility; unused here.
            im_shape: if given, boxes are passed through ``self.clip_boxes``.

        Returns:
            ``(bbox_pred, scores, cls_inds)`` numpy arrays of the kept boxes.
        """
        bbox_pred = all_local
        prob_pred = all_conf

        # Best class and its score for every cell.
        cls_inds = np.argmax(prob_pred, axis=1)
        scores = prob_pred[np.arange(prob_pred.shape[0]), cls_inds]

        # Drop low-confidence predictions.
        confident = scores >= self.conf_thresh
        bbox_pred = bbox_pred[confident]
        scores = scores[confident]
        cls_inds = cls_inds[confident]

        # Per-class NMS. A boolean mask replaces the removed `np.int` dtype.
        keep = np.zeros(len(bbox_pred), dtype=bool)
        for i in range(self.num_classes):
            inds = np.where(cls_inds == i)[0]
            if len(inds) == 0:
                continue
            c_keep = self.nms(bbox_pred[inds], scores[inds])
            keep[inds[c_keep]] = True

        bbox_pred = bbox_pred[keep]
        scores = scores[keep]
        cls_inds = cls_inds[keep]

        if im_shape is not None:
            # NOTE(review): clip_boxes is not defined in the visible class;
            # it must be provided elsewhere for this path to work -- confirm.
            bbox_pred = self.clip_boxes(bbox_pred, im_shape)

        return bbox_pred, scores, cls_inds

    def forward(self, x, target=None):
        """Run the network.

        Args:
            x: input image batch.
            target: training labels forwarded to ``tools.loss``; only used
                when ``self.trainable`` is True.

        Returns:
            Inference: ``(bboxes, scores, cls_inds)`` numpy arrays for the
            first image in the batch, boxes divided by the input size and
            clamped to [0, 1].
            Training: ``(conf_loss, cls_loss, txtytwth_loss, total_loss)``.
        """
        # Only the last backbone feature map is used.
        _, _, C_5 = self.backbone(x)

        C_5 = self.SPP(C_5)
        C_5 = self.SAM(C_5)
        C_5 = self.conv_set(C_5)

        # [B, C, H, W] -> [B, H*W, C] with C = 1 + num_classes + 4.
        prediction = self.pred(C_5)
        prediction = prediction.view(C_5.size(0), 1 + self.num_classes + 4, -1).permute(0, 2, 1)

        conf_pred = prediction[:, :, :1]
        cls_pred = prediction[:, :, 1: 1 + self.num_classes]
        txtytwth_pred = prediction[:, :, 1 + self.num_classes:]

        if self.trainable:
            # NOTE(review): the listing was truncated after `pred_cls=`; the
            # remaining keyword arguments are reconstructed -- confirm against
            # the signature of tools.loss.
            conf_loss, cls_loss, txtytwth_loss, total_loss = tools.loss(
                pred_conf=conf_pred,
                pred_cls=cls_pred,
                pred_txtytwth=txtytwth_pred,
                label=target,
            )
            return conf_loss, cls_loss, txtytwth_loss, total_loss

        with torch.no_grad():
            # Inference handles batch element 0 only.
            all_conf = torch.sigmoid(conf_pred)[0]
            all_bbox = torch.clamp((self.decode_boxes(txtytwth_pred) / self.scale_torch)[0], 0., 1.)
            # Class score = class probability * objectness.
            all_class = torch.softmax(cls_pred[0, :, :], 1) * all_conf

            all_class = all_class.to('cpu').numpy()
            all_bbox = all_bbox.to('cpu').numpy()

            bboxes, scores, cls_inds = self.postprocess(all_bbox, all_class)
            return bboxes, scores, cls_inds
基本实现思路: A、获取目标区域
- 将图片划分为S×S个网格
- 确认边界及概率
- 对目标的概率映射
- 获取最终的目标边框
- 将目标区域转换为特定大小的尺寸
- 将转换后的图像输入特征提取网络
- 对输入目标进行分类
总结: Yolo系列算法是一种比较成熟的目标检测算法框架,基于这种框架的算法还在不断地迭代中,解决的问题也越来越细化,比如候选区精度、小尺度目标检测等。基本上YoloV3及以上版本的算法已经可以在很多场景下得到实际应用。当然,新的问题总是在不断出现并得到解决,期待能够看到更加高效、准确的基于Yolo算法设计思路的新算法。