RegionProposalNetwork
In Faster R-CNN, the first stage is the RegionProposalNetwork (RPN): it lays anchors over the backbone feature maps, scores and regresses them, and filters the results (top-k selection, clipping, small-box removal and NMS) to produce the proposals passed to the second stage. Each part of the process is annotated in the code below; a small illustrative usage sketch is appended after the full listing.
import torch
import torchvision
from torch import nn, Tensor
from torch.nn import functional as F
import math
from typing import Dict
def smooth_l1_loss(input, target, beta: float = 1. / 9, size_average: bool = True):
"""
very similar to the smooth_l1_loss from pytorch, but with
the extra beta parameter
"""
n = torch.abs(input - target)
cond = torch.lt(n, beta)
loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
if size_average:
return loss.mean()
return loss.sum()
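# Illustrative numeric check (added sketch, not part of the original code):
# with beta = 1/9 the loss is quadratic for |x| < beta and linear beyond it.
def _demo_smooth_l1_loss():
    x = torch.tensor([0.05, 1.0])
    # per-element losses: 0.5 * 0.05**2 / (1/9) = 0.01125 and 1.0 - 0.5 / 9 = 0.9444...
    return smooth_l1_loss(x, torch.zeros_like(x), size_average=False)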
def nms(boxes, scores, iou_threshold):
"""
Performs non-maximum suppression (NMS) on the boxes according
to their intersection-over-union (IoU).
NMS iteratively removes lower scoring boxes which have an
IoU greater than iou_threshold with another (higher scoring)
box.
Parameters
----------
boxes : Tensor[N, 4])
boxes to perform NMS on. They
are expected to be in (x1, y1, x2, y2) format
scores : Tensor[N]
scores for each one of the boxes
iou_threshold : float
discards all overlapping
boxes with IoU > iou_threshold
Returns
-------
keep : Tensor
int64 tensor with the indices
of the elements that have been kept
by NMS, sorted in decreasing order of scores
"""
return torchvision.ops.nms(boxes, scores, iou_threshold)
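# Minimal usage sketch for nms (illustrative): the first two boxes overlap with
# IoU of about 0.68, so with iou_threshold=0.5 the lower-scoring one is suppressed.
def _demo_nms():
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.],
                          [50., 50., 60., 60.]])
    scores = torch.tensor([0.9, 0.8, 0.7])
    return nms(boxes, scores, iou_threshold=0.5)  # expected: tensor([0, 2])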
class RPNHead(nn.Module):
def __init__(self, in_channels, num_anchors):
super(RPNHead, self).__init__()
self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
for layer in self.children():
if isinstance(layer, nn.Conv2d):
torch.nn.init.normal_(layer.weight, std=0.01)
torch.nn.init.constant_(layer.bias, 0)
def forward(self, x):
logits = []
bbox_reg = []
for i, feature in enumerate(x):
output_33 = F.relu(self.conv(feature))
logits.append(self.cls_logits(output_33))
bbox_reg.append(self.bbox_pred(output_33))
return logits, bbox_reg
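# Shape sketch for RPNHead (illustrative): with 256 input channels and 3 anchors
# per location, each feature map yields a 3-channel objectness map and a
# 3 * 4 = 12 channel regression map of the same spatial size.
def _demo_rpn_head():
    head = RPNHead(in_channels=256, num_anchors=3)
    logits, bbox_reg = head([torch.randn(2, 256, 38, 50)])
    return logits[0].shape, bbox_reg[0].shape  # ([2, 3, 38, 50], [2, 12, 38, 50])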
class AnchorsGenerator(nn.Module):
    def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
        super(AnchorsGenerator, self).__init__()
        # normalize to one tuple of sizes / aspect ratios per feature map so the
        # flat defaults also work with set_cell_anchors and num_anchors_per_location
        if not isinstance(sizes[0], (list, tuple)):
            sizes = tuple((s,) for s in sizes)
        if not isinstance(aspect_ratios[0], (list, tuple)):
            aspect_ratios = (aspect_ratios,) * len(sizes)
        self.sizes = sizes
        self.aspect_ratios = aspect_ratios
        self.cell_anchors = None
        self._cache = {}
def generate_anchors(self, scale, aspect_ratios, dtype=torch.float32, device='cpu'):
scale = torch.as_tensor(scale, dtype=dtype, device=device)
aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
h_ratios = torch.sqrt(aspect_ratios)
w_ratios = 1.0 / h_ratios
ws = (w_ratios[:, None] * scale[None, :]).view(-1)
hs = (h_ratios[:, None] * scale[None, :]).view(-1)
base_anchor = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
return base_anchor.round()
def set_cell_anchors(self, dtype, device):
if self.cell_anchors is not None:
cell_anchors = self.cell_anchors
assert cell_anchors is not None
if cell_anchors[0].device == device:
return
cell_anchors = [
self.generate_anchors(sizes, aspect_ratios, dtype, device)
for sizes, aspect_ratios in zip(self.sizes, self.aspect_ratios)]
self.cell_anchors = cell_anchors
def cached_grid_anchors(self, grid_sizes, strides):
key = str(grid_sizes) + str(strides)
if key in self._cache:
return self._cache[key]
anchors = self.grid_anchors(grid_sizes, strides)
self._cache[key] = anchors
return anchors
def grid_anchors(self, grid_sizes, strides):
anchors = []
cell_anchors = self.cell_anchors
for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
grid_height, grid_width = size
stride_height, stride_width = stride
device = base_anchors.device
shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) * stride_width
shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) * stride_height
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
shift_x = shift_x.reshape(-1)
shift_y = shift_y.reshape(-1)
shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)
shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
anchors.append(shifts_anchor.reshape(-1, 4))
return anchors
def num_anchors_per_location(self):
return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
def forward(self, image_list, feature_maps):
grid_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])
image_size = image_list.tensors.shape[-2:]
dtype, device = feature_maps[0].dtype, feature_maps[0].device
strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
self.set_cell_anchors(dtype, device)
anchors_over_all_feature_maps = self.cached_grid_anchors(grid_sizes, strides)
anchors = []
for i, (image_height, image_width) in enumerate(image_list.image_sizes):
anchors_in_image = []
for anchors_per_feature_map in anchors_over_all_feature_maps:
anchors_in_image.append(anchors_per_feature_map)
anchors.append(anchors_in_image)
anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
self._cache.clear()
return anchors
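# Illustrative anchor-grid sketch: one 2x2 feature map with stride 16 and
# 3 scales x 3 ratios gives 2 * 2 * 9 = 36 anchors in (x1, y1, x2, y2) form,
# each base anchor shifted to every grid point.
def _demo_anchor_grid():
    gen = AnchorsGenerator(sizes=((32, 64, 128),), aspect_ratios=((0.5, 1.0, 2.0),))
    gen.set_cell_anchors(torch.float32, torch.device('cpu'))
    anchors = gen.grid_anchors(grid_sizes=[(2, 2)], strides=[(16, 16)])
    return anchors[0].shape  # expected: torch.Size([36, 4])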
def box_area(boxes):
"""
Computes the area of a set of bounding boxes, which are specified by its
(x1, y1, x2, y2) coordinates.
Arguments:
boxes (Tensor[N, 4]): boxes for which the area will be computed. They
are expected to be in (x1, y1, x2, y2) format
Returns:
area (Tensor[N]): area for each box
"""
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
wh = (right_bottom - left_top).clamp(min=0)
inter = wh[:, :, 0] * wh[:, :, 1]
iou = inter / (area1[:, None] + area2 - inter)
return iou
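# Worked IoU example (illustrative): the two boxes below overlap in a 1 x 1
# region, so IoU = 1 / (4 + 4 - 1) = 1/7 (about 0.143).
def _demo_box_iou():
    boxes1 = torch.tensor([[0., 0., 2., 2.]])
    boxes2 = torch.tensor([[1., 1., 3., 3.]])
    return box_iou(boxes1, boxes2)  # expected: tensor([[0.1429]])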
def permute_and_flatten(layer, N, A, C, H, W):
layer = layer.view(N, -1, C, H, W)
layer = layer.permute(0, 3, 4, 1, 2)
layer = layer.reshape(N, -1, C)
return layer
def concat_box_pred_layers(box_cls, box_regression):
box_cls_flattened = []
box_regression_flattened = []
for box_cls_per_level, box_regression_per_level in zip(box_cls, box_regression):
N, AxC, H, W = box_cls_per_level.shape
Ax4 = box_regression_per_level.shape[1]
A = Ax4 // 4
C = AxC // A
box_cls_per_level = permute_and_flatten(box_cls_per_level, N, A, C, H, W)
box_cls_flattened.append(box_cls_per_level)
box_regression_per_level = permute_and_flatten(box_regression_per_level, N, A, 4, H, W)
box_regression_flattened.append(box_regression_per_level)
box_cls = torch.cat(box_cls_flattened, dim=1).flatten(0, -2)
box_regression = torch.cat(box_regression_flattened, dim=1).reshape(-1, 4)
return box_cls, box_regression
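# Shape sketch for concat_box_pred_layers (illustrative): per-level head outputs
# are permuted to (N, H, W, A, C) and flattened, so classification scores end up
# as [N * sum(H*W*A), C] and regression deltas as [N * sum(H*W*A), 4].
def _demo_concat_box_pred_layers():
    box_cls = [torch.randn(2, 3, 4, 5)]    # N=2, A=3, C=1, H=4, W=5
    box_reg = [torch.randn(2, 12, 4, 5)]   # A * 4 = 12
    cls_flat, reg_flat = concat_box_pred_layers(box_cls, box_reg)
    return cls_flat.shape, reg_flat.shape  # expected: ([120, 1], [120, 4])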
def clip_boxes_to_image(boxes, size):
    # clamp x and y coordinates to the image, then re-interleave them back into
    # (x1, y1, x2, y2) order with stack + reshape (a plain cat along dim=1 would
    # scramble the coordinate order into (x1, x2, y1, y2))
    boxes_x = boxes[..., 0::2]
    boxes_y = boxes[..., 1::2]
    height, width = size
    boxes_x = boxes_x.clamp(min=0, max=width)
    boxes_y = boxes_y.clamp(min=0, max=height)
    clipped_boxes = torch.stack((boxes_x, boxes_y), dim=boxes.dim())
    return clipped_boxes.reshape(boxes.shape)
def remove_small_boxes(boxes, min_size):
ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]
keep = (ws >= min_size) & (hs >= min_size)
keep = keep.nonzero().squeeze(1)
return keep
def batched_nms(boxes, scores, level_idxs, iou_threshold):
if boxes.numel() == 0:
return torch.empty((0, ), dtype=torch.int64, device=boxes.device)
max_coordinate = boxes.max()
offset = level_idxs.to(boxes) * (max_coordinate + 1)
boxes_offset = boxes + offset[:, None]
keep = nms(boxes_offset, scores, iou_threshold)
return keep
def encode_boxes(reference_boxes, anchors, weights):
wx = weights[0]
wy = weights[1]
ww = weights[2]
wh = weights[3]
anchors_x1 = anchors[:, 0].unsqueeze(1)
anchors_y1 = anchors[:, 1].unsqueeze(1)
anchors_x2 = anchors[:, 2].unsqueeze(1)
anchors_y2 = anchors[:, 3].unsqueeze(1)
reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)
ex_width = anchors_x2 - anchors_x1
ex_height = anchors_y2 - anchors_y1
ex_ctr_x = anchors_x1 + 0.5 * ex_width
ex_ctr_y = anchors_y1 + 0.5 * ex_height
gt_widths = reference_boxes_x2 - reference_boxes_x1
gt_heights = reference_boxes_y2 - reference_boxes_y1
gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights
target_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_width
target_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_height
target_dw = ww * torch.log(gt_widths / ex_width)
target_dh = wh * torch.log(gt_heights / ex_height)
    # the regression targets are the raw (dx, dy, dw, dh) deltas between anchors
    # and ground-truth boxes; decode_single later inverts exactly this mapping
    targets = torch.cat((target_dx, target_dy, target_dw, target_dh), dim=1)
    return targets
class box_Coder(object):
def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)):
self.weights = weights
self.bbox_xform_clip = bbox_xform_clip
def encode(self, reference_boxes, anchores):
boxes_per_image = [len(b) for b in reference_boxes]
reference_boxes = torch.cat(reference_boxes, dim=0)
anchors = torch.cat(anchores, dim=0)
targets = self.encode_single(reference_boxes, anchors)
return targets.split(boxes_per_image, 0)
def encode_single(self, reference_boxes, anchors):
dtype = reference_boxes.dtype
device = reference_boxes.device
weights = torch.as_tensor(self.weights, dtype=dtype, device=device)
targets = encode_boxes(reference_boxes, anchors, weights)
return targets
def decode_single(self, rel_codes, boxes):
boxes = boxes.to(rel_codes.dtype)
width = boxes[:, 2] - boxes[:, 0]
height = boxes[:, 3] - boxes[:, 1]
center_x = boxes[:, 0] + 0.5 * width
center_y = boxes[:, 1] + 0.5 * height
wx, wy, ww, wh = self.weights
dx = rel_codes[:, 0::4] / wx
dy = rel_codes[:, 1::4] / wy
dw = rel_codes[:, 2::4] / ww
dh = rel_codes[:, 3::4] / wh
dw = torch.clamp(dw, max=self.bbox_xform_clip)
dh = torch.clamp(dh, max=self.bbox_xform_clip)
pred_center_x = dx * width[:, None] + center_x[:, None]
pred_center_y = dy * height[:, None] + center_y[:, None]
pred_w = torch.exp(dw) * width[:, None]
pred_h = torch.exp(dh) * height[:, None]
pred_boxes_xmin = pred_center_x - torch.tensor(0.5, dtype=pred_center_x.dtype, device=pred_w.device) * pred_w
pred_boxes_ymin = pred_center_y - torch.tensor(0.5, dtype=pred_center_x.dtype, device=pred_w.device) * pred_h
pred_boxes_xmax = pred_center_x + torch.tensor(0.5, dtype=pred_center_x.dtype, device=pred_w.device) * pred_w
pred_boxes_ymax = pred_center_y + torch.tensor(0.5, dtype=pred_center_x.dtype, device=pred_w.device) * pred_h
pred_boxes = torch.stack((pred_boxes_xmin, pred_boxes_ymin, pred_boxes_xmax, pred_boxes_ymax), dim=2).flatten(1)
return pred_boxes
def decode(self, rel_codes, boxes):
concat_boxes = torch.cat(boxes, dim=0)
box_sum = concat_boxes.shape[0]
pred_boxes = self.decode_single(rel_codes, concat_boxes)
pred_boxes = pred_boxes.reshape(box_sum, -1, 4)
return pred_boxes
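# Consistency sketch (illustrative): encoding ground-truth boxes against anchors
# and decoding the resulting deltas should reproduce the ground-truth boxes up
# to floating point error.
def _demo_box_coder_roundtrip():
    coder = box_Coder(weights=(1.0, 1.0, 1.0, 1.0))
    anchors = torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 25.]])
    gt_boxes = torch.tensor([[1., 2., 11., 12.], [4., 6., 22., 24.]])
    deltas = coder.encode_single(gt_boxes, anchors)
    decoded = coder.decode_single(deltas, anchors)
    return torch.allclose(decoded, gt_boxes, atol=1e-4)  # expected: True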
class Matcher(object):
def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
self.BELOW_LOW_THRESHOLD = -1
self.high_threshold = high_threshold
self.low_threshold = low_threshold
self.allow_low_quality_matches = allow_low_quality_matches
def __call__(self, match_quality_matrix):
matched_value, matches_idx = match_quality_matrix.max(dim=0)
if self.allow_low_quality_matches:
all_matches = matches_idx.clone()
else:
all_matches = None
below_low_threshold = matched_value < self.low_threshold
between_threshold = (matched_value >= self.low_threshold) & (matched_value < self.high_threshold)
        matches_idx[below_low_threshold] = -1   # below low_threshold: background
        matches_idx[between_threshold] = -2     # between thresholds: ignored in training
if self.allow_low_quality_matches:
self.set_low_quality_matches_(matches_idx, all_matches, match_quality_matrix)
return matches_idx
def set_low_quality_matches_(self, matches_idx, all_matches, match_quality_matrix):
highest_quality_gt_value, _ = match_quality_matrix.max(dim=1)
gt_anchor_matches_highest_coordiate = torch.nonzero(match_quality_matrix == highest_quality_gt_value[:, None])
gt_anchor_matches_highest_coordiate_update = gt_anchor_matches_highest_coordiate[:, 1]
matches_idx[gt_anchor_matches_highest_coordiate_update] = all_matches[gt_anchor_matches_highest_coordiate_update]
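# Illustrative Matcher example: rows are ground-truth boxes, columns are anchors.
# Anchor 0 is a clear positive (gt 0), anchor 1 falls between the thresholds
# (-2, ignored), anchor 2 is background (-1), and anchor 3 is rescued by
# allow_low_quality_matches because it is the best anchor for gt 1.
def _demo_matcher():
    matcher = Matcher(high_threshold=0.7, low_threshold=0.3, allow_low_quality_matches=True)
    iou = torch.tensor([[0.9, 0.5, 0.1, 0.00],
                        [0.0, 0.1, 0.2, 0.25]])
    return matcher(iou)  # expected: tensor([ 0, -2, -1,  1])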
class BalancedPositiveNegativeSampler(object):
def __init__(self, batch_size_per_image, positive_fraction):
self.batch_size_per_image = batch_size_per_image
self.positive_fraction = positive_fraction
def __call__(self, matched_idxs):
pos_idx = []
neg_idx = []
for matched_idxs_per_image in matched_idxs:
positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)
num_pos = int(self.batch_size_per_image * self.positive_fraction)
num_pos = min(positive.numel(), num_pos)
num_neg = self.batch_size_per_image - num_pos
num_neg = min(negative.numel(), num_neg)
perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
pos_idx_per_image = positive[perm1]
neg_idx_per_image = negative[perm2]
pos_idx_per_image_mask = torch.zeros_like(
matched_idxs_per_image, dtype=torch.uint8
)
neg_idx_per_image_mask = torch.zeros_like(
matched_idxs_per_image, dtype=torch.uint8
)
pos_idx_per_image_mask[pos_idx_per_image] = 1
neg_idx_per_image_mask[neg_idx_per_image] = 1
pos_idx.append(pos_idx_per_image_mask)
neg_idx.append(neg_idx_per_image_mask)
return pos_idx, neg_idx
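# Illustrative sampler example: with batch_size_per_image=4 and
# positive_fraction=0.5 at most 2 positives (label 1) are drawn and the rest of
# the minibatch is filled with negatives (label 0); label -1 is never sampled.
def _demo_fg_bg_sampler():
    sampler = BalancedPositiveNegativeSampler(batch_size_per_image=4, positive_fraction=0.5)
    labels = torch.tensor([1., 1., 1., 0., 0., 0., -1., -1.])
    pos_mask, neg_mask = sampler([labels])
    return pos_mask[0].sum(), neg_mask[0].sum()  # expected: 2 positives, 2 negatives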
class RegionProposalNetwork(nn.Module):
def __init__(self, anchor_generate, rpn_head, fg_iou_thresh, bg_iou_thresh, batch_size_per_image,
positive_fraction, pre_nms_top_n, post_nms_top_n, nms_thresh):
super(RegionProposalNetwork, self).__init__()
self.anchor_generator = anchor_generate
self.head = rpn_head
self.box_coder = box_Coder(weights=(1.0, 1.0, 1.0, 1.0))
self.box_similarity = box_iou
self.proposal_matcher = Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=True)
self.fg_bg_sampler = BalancedPositiveNegativeSampler(
batch_size_per_image, positive_fraction
)
self._pre_nms_top_n = pre_nms_top_n
self._post_nms_top_n = post_nms_top_n
self.nms_thresh = nms_thresh
self.min_size = 1e-3
def pre_nms_top_n(self):
if self.training:
return self._pre_nms_top_n['training']
return self._pre_nms_top_n['testing']
def post_nms_top_n(self):
if self.training:
return self._post_nms_top_n['training']
return self._post_nms_top_n['testing']
def _get_top_n_idx(self, objectness, num_anchors_per_level):
r = []
offset = 0
for ob in objectness.split(num_anchors_per_level, 1):
num_anchors = ob.shape[1]
pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)
_, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
r.append(top_n_idx + offset)
offset += num_anchors
return torch.cat(r, dim=1)
def assign_targets_to_anchors(self, anchors, targets):
labels = []
matched_gt_boxes = []
for anchors_per_image, targets_per_image in zip(anchors, targets):
gt_boxes = targets_per_image['boxes']
if gt_boxes.numel() == 0:
device = anchors_per_image.device
matched_gt_boxes_per_image = torch.zeros(anchors_per_image.shape, dtype=torch.float32, device=device)
labels_per_image = torch.zeros((anchors_per_image.shape[0],), dtype=torch.float32, device=device)
else:
match_quality_matrix = box_iou(gt_boxes, anchors_per_image)
matched_idxs = self.proposal_matcher(match_quality_matrix)
matched_gt_boxes_per_image = gt_boxes[matched_idxs.clamp(min=0)]
labels_per_image = matched_idxs >= 0
labels_per_image = labels_per_image.to(dtype=torch.float32)
bg_indices = matched_idxs == -1
labels_per_image[bg_indices] = 0.0
between_indices = matched_idxs == -2
labels_per_image[between_indices] = -1.0
labels.append(labels_per_image)
matched_gt_boxes.append(matched_gt_boxes_per_image)
return labels, matched_gt_boxes
def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_level):
num_images = proposals.shape[0]
device = proposals.device
objectness = objectness.detach()
objectness = objectness.view(num_images, -1)
levels = [torch.full((n,), idx, dtype=torch.int64, device=device) for idx, n in enumerate(num_anchors_per_level)]
levels = torch.cat(levels, dim=0)
levels = levels.reshape(1, -1).expand_as(objectness)
top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)
image_range = torch.arange(num_images, device=device)
batch_idx = image_range[:, None]
objectness = objectness[batch_idx, top_n_idx]
levels = levels[batch_idx, top_n_idx]
proposals = proposals[batch_idx, top_n_idx]
final_boxes = []
final_scores = []
for boxes, scores, level, img_shape in zip(proposals, objectness, levels, image_shapes):
boxes = clip_boxes_to_image(boxes, img_shape)
keep = remove_small_boxes(boxes, self.min_size)
boxes, scores, level = boxes[keep], scores[keep], level[keep]
keep = batched_nms(boxes, scores, level, self.nms_thresh)
keep = keep[: self.post_nms_top_n()]
boxes, scores = boxes[keep], scores[keep]
final_boxes.append(boxes)
final_scores.append(scores)
return final_boxes, final_scores
def compute_loss(self, objectness, pred_bbox_deltas, labels, regression_targets):
sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)
sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)
objectness = objectness.flatten()
labels = torch.cat(labels, dim=0)
regression_targets = torch.cat(regression_targets, dim=0)
box_loss = smooth_l1_loss(pred_bbox_deltas[sampled_pos_inds], regression_targets[sampled_pos_inds], beta=1 / 9, size_average=False) / (sampled_inds.numel())
objectness_loss = F.binary_cross_entropy_with_logits(
objectness[sampled_inds], labels[sampled_inds]
)
return objectness_loss, box_loss
def forward(self, image_list, features, targets=None):
features = list(features.values())
objectness, pred_bbox_deltas = self.head(features)
anchors = self.anchor_generator(image_list, features)
num_images = len(anchors)
num_anchors_per_level_shape_tensor = [o[0].shape for o in objectness]
num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensor]
objectness, pred_bbox_deltas = concat_box_pred_layers(objectness, pred_bbox_deltas)
proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
proposals = proposals.view(num_images, -1, 4)
boxes, scores = self.filter_proposals(proposals, objectness, image_list.image_sizes, num_anchors_per_level)
losses = {}
if self.training:
labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
loss_objectness, loss_rpn_box_reg = self.compute_loss(objectness, pred_bbox_deltas, labels, regression_targets)
losses = {
'loss_objectness': loss_objectness,
'loss_rpn_box_reg': loss_rpn_box_reg
}
return boxes, losses
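# End-to-end usage sketch (illustrative, with assumptions): the ImageList
# container is borrowed from torchvision's detection internals, and the feature
# map size and hyper-parameters below are arbitrary. The RPN is run in eval
# mode so that no targets are required.
def _demo_region_proposal_network():
    from torchvision.models.detection.image_list import ImageList  # assumed helper
    anchor_generator = AnchorsGenerator(sizes=((32, 64, 128),),
                                        aspect_ratios=((0.5, 1.0, 2.0),))
    rpn_head = RPNHead(in_channels=256,
                       num_anchors=anchor_generator.num_anchors_per_location()[0])
    rpn = RegionProposalNetwork(anchor_generator, rpn_head,
                                fg_iou_thresh=0.7, bg_iou_thresh=0.3,
                                batch_size_per_image=256, positive_fraction=0.5,
                                pre_nms_top_n=dict(training=2000, testing=1000),
                                post_nms_top_n=dict(training=2000, testing=1000),
                                nms_thresh=0.7)
    rpn.eval()  # training mode would additionally require targets with 'boxes'
    images = ImageList(torch.randn(1, 3, 224, 224), [(224, 224)])
    features = {'0': torch.randn(1, 256, 14, 14)}  # e.g. one backbone level
    boxes, losses = rpn(images, features)
    return boxes[0].shape, losses  # one [num_proposals, 4] tensor; losses == {}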