Faster RCNN代码整体框架
前言
Faster RCNN是经典的双阶段目标检测算法,掌握其中的思想和代码实现方法,对于我们实现单阶段或者双阶段目标检测都很有帮助。 相较于单阶段目标检测,双阶段目标检测主要多了一步生成proposal,也就是候选框的生成。在Faster RCNN中,对于图像中生成的每一个anchor,首先要经过RPN(这里只区分前景和背景)做第一次筛选,选出目标概率较大的一部分anchor(train时取预测目标概率最高的前2000个anchor,test时取前1000个)作为proposal,proposal再用于后续Faster RCNN的计算,这种方式也就称为双阶段目标检测。
我以源码为基础,对代码做了一些精简,使其实现起来更加容易;同时对代码做了细化,并在每一部分对代码以及shape的变化做了详细的注释,以便更好地理解代码的原理和实现过程。
整体组成
整体Faster RCNN主要由transform、backbone、RegionProposalNetwork(RPN)、RoIHead、postprocess五部分组成。 其中:transform的作用是将输入图像缩放到给定的大小,并组成一个batch输入给网络。由于输入图像的尺寸各不相同,无法直接输入到网络做并行计算,因此需要通过transform将输入图像缩放到指定大小,组成一个batch,然后输入到网络中。 backbone:对输入图像进行特征提取,得到特征图。这里采用ResNet50+FPN作为backbone。特征图包括ResNet50中layer1、layer2、layer3、layer4的输出并做FPN,将输出的通道数(256, 512, 1024, 2048)都调整为256,并且在layer4生成的特征图后使用一个maxpool,得到'pool'。一共生成5个预测特征图,分别对应layer1、layer2、layer3、layer4、maxpool,对应的键为0, 1, 2, 3, pool。 RegionProposalNetwork:对图像中生成的anchors进行筛选,生成proposal。 RoIHead:由RoI pooling(RoIAlign pooling)+ MLP(FastRCNN中RoI pooling后的展平操作+两个全连接层)+ FastRCNNPredictor(输出的预测部分,包括bbox回归参数和类别概率)组成,对RPN生成的proposal进行bbox回归参数和类别概率的预测。 postprocess:将网络的预测结果还原到原图像尺寸上。
Faster RCNN Model整体代码
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torchvision.ops import MultiScaleRoIAlign
from transforms import RcnnTransforms
from RPN import AnchorsGenerator, RPNHead, RegionProposalNetwork
from RoIHead import RoIHead
class FasterRCNNBase(nn.Module):
    """Two-stage detector skeleton: transform -> backbone -> RPN -> RoI heads.

    Args:
        backbone: module called on ``images.tensors`` to produce the features.
        rpn: module called as ``rpn(images, features, targets)``; must return
            ``(proposals, proposal_losses)``.
        roi_heads: module called as
            ``roi_heads(features, proposals, image_sizes, targets)``; must
            return ``(detections, detector_losses)``.
        transform: module called as ``transform(images, targets)``; must also
            expose ``postprocess(detections, image_sizes, original_sizes)``.
    """

    def __init__(self, backbone, rpn, roi_heads, transform):
        super().__init__()
        # Registration order is kept as transform, backbone, rpn, roi_heads.
        self.transform = transform
        self.backbone = backbone
        self.rpn = rpn
        self.roi_heads = roi_heads

    def forward(self, images, targets=None):
        """Run the full detection pipeline.

        Args:
            images: iterable of image objects exposing ``.shape``; the last
                two dimensions are taken as the original size of each image.
            targets: optional ground truth, forwarded unchanged to the
                transform, the RPN and the RoI heads.

        Returns:
            In training mode, a dict merging the RoI-head losses and the RPN
            losses; otherwise the post-processed detections.
        """
        # Record each image's trailing two dimensions before the transform
        # is applied, so postprocess can map results back to them.
        original_image_sizes = []
        for image in images:
            height_width = image.shape[-2:]
            assert len(height_width) == 2
            original_image_sizes.append(tuple(height_width))

        images, targets = self.transform(images, targets)
        features = self.backbone(images.tensors)
        proposals, proposal_losses = self.rpn(images, features, targets)

        batch_sizes = images.image_sizes
        detections, detector_losses = self.roi_heads(
            features, proposals, batch_sizes, targets
        )
        detections = self.transform.postprocess(
            detections, batch_sizes, original_image_sizes
        )

        # Detector losses first, then RPN losses (later keys win on collision).
        losses = dict(detector_losses)
        losses.update(proposal_losses)

        return losses if self.training else detections
class TwoMLPHead(nn.Module):
    """Flatten the input and apply two linear layers, each followed by ReLU.

    Args:
        in_channels: number of features per sample after flattening.
        representation_size: output width of both linear layers.
    """

    def __init__(self, in_channels, representation_size):
        super().__init__()
        self.fc6 = nn.Linear(in_channels, representation_size)
        self.fc7 = nn.Linear(representation_size, representation_size)

    def forward(self, x):
        """Return the activated output of ``fc7`` for input ``x``."""
        # Collapse every dimension after the first into one feature axis.
        flat = torch.flatten(x, start_dim=1)
        hidden = F.relu(self.fc6(flat))
        return F.relu(self.fc7(hidden))
class FastRCNNPredictor(nn.Module):
    """Two parallel linear layers producing class scores and box deltas.

    Args:
        in_channels: number of input features per sample.
        num_classes: number of output classes; the box layer emits
            ``num_classes * 4`` values per sample.
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.cls_score = nn.Linear(in_channels, num_classes)
        self.bbox_pred = nn.Linear(in_channels, num_classes * 4)

    def forward(self, x):
        """Return ``(scores, bbox_deltas)`` computed from the same input."""
        return self.cls_score(x), self.bbox_pred(x)
class FasterRCNN(FasterRCNNBase):
    """Assemble a Faster R-CNN from a backbone plus default or custom parts.

    Any component left as ``None`` is built here with the defaults below.

    Args:
        backbone: feature extractor; must expose an ``out_channels`` attribute.
        num_classes: number of classes for the default ``FastRCNNPredictor``.
            Required whenever ``box_predictor`` is not supplied.
        min_size, max_size: forwarded to ``RcnnTransforms``.
        image_mean, image_std: forwarded to ``RcnnTransforms``; default to
            ``[0.485, 0.456, 0.406]`` and ``[0.229, 0.224, 0.225]``.
        rpn_anchor_generator: anchor generator; by default an
            ``AnchorsGenerator`` with one size per level (32..512) and
            aspect ratios (0.5, 1.0, 2.0) on each level.
        rpn_head: RPN head; by default an ``RPNHead`` built from
            ``out_channels`` and the first per-location anchor count.
        rpn_pre_nms_top_n_train, rpn_pre_nms_top_n_test: packed into a dict
            with keys ``training``/``testing`` for ``RegionProposalNetwork``.
        rpn_post_nms_top_n_train, rpn_post_nms_top_n_test: packed the same way.
        rpn_nms_thresh, rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction: forwarded to
            ``RegionProposalNetwork``.
        box_roi_pool: RoI pooling module; by default a ``MultiScaleRoIAlign``
            over feature maps '0'..'3' with a 7x7 output and sampling ratio 2.
        box_head: by default a ``TwoMLPHead`` with representation size 1024.
        box_predictor: by default a ``FastRCNNPredictor`` fed 1024 features.
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        box_fg_iou_thresh, box_bg_iou_thresh, box_batch_size_per_image,
        box_positive_fraction, bbox_reg_weights: forwarded to ``RoIHead``.

    Raises:
        TypeError: if both ``num_classes`` and ``box_predictor`` are ``None``.
    """

    def __init__(self, backbone, num_classes=None,
                 min_size=800, max_size=1333,
                 image_mean=None, image_std=None,
                 rpn_anchor_generator=None, rpn_head=None,
                 rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
                 rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
                 rpn_nms_thresh=0.7,
                 rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
                 rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
                 box_roi_pool=None, box_head=None, box_predictor=None,
                 box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
                 box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
                 box_batch_size_per_image=512, box_positive_fraction=0.25,
                 bbox_reg_weights=None):
        out_channels = backbone.out_channels

        # Fail early with a clear message: without this check the default
        # predictor would pass None into nn.Linear and raise an opaque
        # TypeError only after the RPN and box head had been constructed.
        if box_predictor is None and num_classes is None:
            raise TypeError(
                "num_classes must be an int when box_predictor is not specified"
            )

        # ---- input transform -------------------------------------------
        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = RcnnTransforms(min_size, max_size, image_mean, image_std)

        # ---- region proposal network -----------------------------------
        if rpn_anchor_generator is None:
            # One anchor size per level, three aspect ratios on every level.
            anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            rpn_anchor_generator = AnchorsGenerator(anchor_sizes, aspect_ratios)
        if rpn_head is None:
            # Only the first level's anchor count is used to size the head.
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0],
            )

        rpn_pre_nms_top_n = dict(
            training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test
        )
        rpn_post_nms_top_n = dict(
            training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test
        )
        rpn = RegionProposalNetwork(
            rpn_anchor_generator, rpn_head,
            rpn_fg_iou_thresh, rpn_bg_iou_thresh,
            rpn_batch_size_per_image, rpn_positive_fraction,
            rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

        # ---- RoI heads ---------------------------------------------------
        # Width shared by the default box head output and predictor input.
        representation_size = 1024

        if box_roi_pool is None:
            # The 'pool' feature map is not included in RoI pooling.
            box_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],
                output_size=[7, 7],
                sampling_ratio=2)
        if box_head is None:
            # Flattened input size: channels * resolution^2 (square output).
            resolution = box_roi_pool.output_size[0]
            box_head = TwoMLPHead(
                out_channels * resolution ** 2,
                representation_size,
            )
        if box_predictor is None:
            box_predictor = FastRCNNPredictor(representation_size, num_classes)

        roi_heads = RoIHead(
            box_roi_pool, box_head, box_predictor,
            box_fg_iou_thresh, box_bg_iou_thresh,
            box_batch_size_per_image, box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh, box_nms_thresh, box_detections_per_img)

        super().__init__(backbone, rpn, roi_heads, transform)
接下来的几篇文章,依次实现transform、backbone、RPN、RoIHead部分。
|