1. The SSD network model
SSD, short for Single Shot MultiBox Detector, is a one-stage object detection model; its network structure is shown in the accompanying figure. 1. The backbone of the SSD model is a VGG module with several modifications:
- Convert the FC6 and FC7 layers of VGG16 (originally used for classification) into convolutional layers
- Remove all Dropout layers and the FC8 layer
- Add the new Conv6, Conv7, Conv8 and Conv9 layers
2. The feature maps used by the SSD model are (the numbering differs from that in the figure):
- the third convolution of Conv4: 38×38×512
- the Conv7 convolution: 19×19×1024
- the second convolution of Conv8: 10×10×512
- the second convolution of Conv9: 5×5×256
- the second convolution of Conv10: 3×3×256
- the second convolution of Conv11: 1×1×256
SSD uses 6 effective feature layers in total for bounding-box regression and object classification. Each effective feature layer goes through two convolutions: one with num_anchors x 4 output channels and one with num_anchors x num_classes output channels.
num_classes is the number of classes in the dataset plus one background class. num_anchors is the number of prior boxes owned by each feature point of that layer; for the six feature layers above it is {4, 6, 6, 6, 4, 4}.
The prediction shape of every feature layer, together with the number of prior boxes per layer and in total, follows directly from these numbers (see the short sketch below).
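A minimal sketch that tabulates the per-layer prior counts and prediction-head channels (the feature-map sizes and anchor counts are the ones listed above; num_classes = 21 is only an example value for VOC):
feature_map_sizes = [38, 19, 10, 5, 3, 1]
num_anchors = [4, 6, 6, 6, 4, 4]
num_classes = 21  # 20 VOC classes + background (example value)
total = 0
for s, n in zip(feature_map_sizes, num_anchors):
    boxes = s * s * n
    total += boxes
    # loc head output: (batch, n * 4, s, s); conf head output: (batch, n * num_classes, s, s)
    print(f"{s}x{s}: {boxes} priors, loc channels {n * 4}, conf channels {n * num_classes}")
print("total priors:", total)  # 5776 + 2166 + 600 + 150 + 36 + 4 = 8732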
The SSD model is implemented as follows:
import torch
import torch.nn as nn
import torch.nn.functional as F
class SSD300(nn.Module):
def __init__(self, num_classes, base, backbone_name, pretrained=False):
super(SSD300, self).__init__()
self.num_classes = num_classes
self.base = base
self.backbone_name = backbone_name
if self.backbone_name == 'vgg':
print('using vgg backbone')
self.vgg = vgg(self.base, pretrained)
self.extras = add_extras(1024, self.backbone_name)
self.L2Norm = L2Norm(512, 20)
mbox = [4, 6, 6, 6, 4, 4]
loc_layers = []
conf_layers = []
backbone_source = [21, -2]
"""
在SSD中vgg网络模块获得的特征层里
第21层和第33层可以用于进行回归预测和分类预测
分别是conv4-3和conv7的输出
"""
for k, v in enumerate(backbone_source):
loc_layers += [nn.Conv2d(self.vgg[v].out_channels, mbox[k] * 4,
kernel_size=(3, 3),
padding=1)]
conf_layers += [nn.Conv2d(self.vgg[v].out_channels,
mbox[k] * num_classes,
kernel_size=(3, 3),
padding=1)]
"""
在SSD中add_extras网络模块的特征层中
第1层、第三层、第五层、第七层可以用于进行回归预测和分类预测
其大小分别为(10, 10, 512)、(5, 5, 256)、(3, 3, 256)、(1, 1, 256)
"""
for k, v in enumerate(self.extras[1::2], 2):
loc_layers += [nn.Conv2d(v.out_channels,
mbox[k] * 4,
kernel_size=(3, 3),
padding=1)]
conf_layers += [nn.Conv2d(v.out_channels,
mbox[k] * num_classes,
kernel_size=(3, 3),
padding=1)]
else:
print('using mobilenetv2 backbone')
self.mobilenet = mobilenet_v2(pretrained).features
self.extras = add_extras(1280, self.backbone_name)
self.L2Norm = L2Norm(96, 20)
mbox = [6, 6, 6, 6, 6, 6]
loc_layers = []
conf_layers = []
backbone_source = [13, -1]
for k, v in enumerate(backbone_source):
loc_layers += [nn.Conv2d(self.mobilenet[v].out_channels,
mbox[k] * 4,
kernel_size=(3, 3),
padding=1)]
conf_layers += [nn.Conv2d(self.mobilenet[v].out_channels,
mbox[k] * num_classes,
kernel_size=(3, 3),
padding=1)]
for k, v in enumerate(self.extras, 2):
loc_layers += [nn.Conv2d(v.out_channels,
mbox[k] * 4,
kernel_size=(3, 3),
padding=1)]
conf_layers += [nn.Conv2d(v.out_channels,
mbox[k] * num_classes,
kernel_size=(3, 3),
padding=1)]
self.loc = nn.ModuleList(loc_layers)
self.conf = nn.ModuleList(conf_layers)
def forward(self, x):
sources = list()
loc = list()
conf = list()
if self.backbone_name == "vgg":
for k in range(23):
x = self.vgg[k](x)
else:
for k in range(14):
x = self.mobilenet[k](x)
s = self.L2Norm(x)
sources.append(s)
if self.backbone_name == "vgg":
for k in range(23, len(self.vgg)):
x = self.vgg[k](x)
else:
for k in range(14, len(self.mobilenet)):
x = self.mobilenet[k](x)
sources.append(x)
for k, v in enumerate(self.extras):
x = F.relu(v(x), inplace=True)
if self.backbone_name == "vgg":
if k % 2 == 1:
sources.append(x)
else:
sources.append(x)
for (x, l, c) in zip(sources, self.loc, self.conf):
loc.append(l(x).permute(0, 2, 3, 1).contiguous())
conf.append(c(x).permute(0, 2, 3, 1).contiguous())
loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
output = (
loc.view(loc.size(0), -1, 4),
conf.view(conf.size(0), -1, self.num_classes),
)
return output
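A quick smoke test of the class (a sketch: it assumes the vgg and add_extras functions, the base list shown later in this article, and an L2Norm module are all importable):
import torch
num_classes = 21  # 20 VOC classes + background (example value)
model = SSD300(num_classes, base, backbone_name='vgg', pretrained=False)
model.eval()
with torch.no_grad():
    loc, conf = model(torch.randn(1, 3, 300, 300))
print(loc.shape, conf.shape)  # torch.Size([1, 8732, 4]) torch.Size([1, 8732, 21])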
VGG network implementation:
base = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M',
512, 512, 512]
from torch.hub import load_state_dict_from_url
def vgg(base, pretrained=False):
layers = []
in_channels = 3
for v in base:
if v == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
elif v == 'C':
layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
else:
conv2d = nn.Conv2d(in_channels, v, kernel_size=(3, 3), padding=1)
layers += [conv2d, nn.ReLU(inplace=True)]
in_channels = v
pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
conv6 = nn.Conv2d(512, 1024, kernel_size=(3, 3), padding=6, dilation=(6, 6))
conv7 = nn.Conv2d(1024, 1024, kernel_size=(1, 1))
layers += [pool5, conv6,
nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
model = nn.ModuleList(layers)
if pretrained:
state_dict = load_state_dict_from_url("https://download.pytorch.org/models/vgg16-397923af.pth",
model_dir="./model_data")
state_dict = {k.replace('features.', ''): v for k, v in state_dict.items()}
model.load_state_dict(state_dict, strict=False)
return model
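A quick check of the module indices referenced by backbone_source (a sketch that uses the vgg function and the base list above):
model = vgg(base)
print(len(model))  # 35 modules in total
print(model[21])   # Conv2d(512, 512, ...)   -> conv4_3
print(model[-2])   # Conv2d(1024, 1024, ...) -> conv7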
Implementation of the extra feature layers (add_extras):
def add_extras(in_channels, backbone_name):
layers = []
if backbone_name == 'vgg':
layers += [nn.Conv2d(in_channels, 256, kernel_size=(1, 1), stride=(1, 1))]
layers += [nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=1)]
layers += [nn.Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))]
layers += [nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=1)]
layers += [nn.Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))]
layers += [nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))]
layers += [nn.Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))]
layers += [nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))]
else:
layers += [InvertedResidual(in_channels, 512, stride=2, expand_ratio=0.2)]
layers += [InvertedResidual(512, 256, stride=2, expand_ratio=0.25)]
layers += [InvertedResidual(256, 256, stride=2, expand_ratio=0.5)]
layers += [InvertedResidual(256, 64, stride=2, expand_ratio=0.25)]
return nn.ModuleList(layers)
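The spatial sizes quoted in the earlier comment can be verified by pushing the 19x19x1024 conv7 feature through the extra layers (a sketch):
import torch
import torch.nn.functional as F
extras = add_extras(1024, 'vgg')
x = torch.randn(1, 1024, 19, 19)
for k, layer in enumerate(extras):
    x = F.relu(layer(x), inplace=True)
    if k % 2 == 1:
        print(x.shape)  # expected: (1, 512, 10, 10), (1, 256, 5, 5), (1, 256, 3, 3), (1, 256, 1, 1)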
2. Preparing the prior boxes (default box / prior box)
A prior box is a default box that is actually used during training; in other words, the default box is the concept and the prior box is the concrete instance that is selected.
During training, once an image has been fed through the model and the feature maps have been obtained, the prior boxes must first be matched against the ground-truth boxes. A matched prior box contains some real object but still deviates from that object's ground-truth box; the goal of training is to keep the classification confidence of the default box high while regressing the prior box as close as possible to the ground-truth box.
Default box generation rule: for every feature point of every feature map, a set of concentric default boxes is generated around its center. The center coordinates are multiplied by the step, which maps the feature-map position back to the original image. The number of default boxes per feature layer depends on aspect_ratios.
For the SSD model:
aspect_ratios = [[1, 2], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2], [1, 2]]
For each feature layer, its aspect_ratios give two squares plus rectangles of different aspect ratios.
The default box edge lengths are governed by anchors_size = [30, 60, 111, 162, 213, 264, 315]; for the six feature layers this yields six different [min_size, max_size] pairs: [30, 60], [60, 111], [111, 162], [162, 213], [213, 264] and [264, 315].
The default box edge lengths are then computed as follows:
- side length of the small square: $min\_size$
- side length of the large square: $\sqrt{min\_size \times max\_size}$
- height of a rectangular default box: $\frac{1}{\sqrt{aspect\_ratio}} \times min\_size$
- width of a rectangular default box: $\sqrt{aspect\_ratio} \times min\_size$
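As a concrete check, a short worked example using the conv4_3 values from above (min_size = 30, max_size = 60, extra aspect ratio 2) gives the four prior shapes for that layer:
import math
min_size, max_size = 30, 60  # conv4_3 pair from anchors_size above
boxes = [(min_size, min_size)]  # small square: 30 x 30
boxes.append((math.sqrt(min_size * max_size), math.sqrt(min_size * max_size)))  # large square: ~42.4 x 42.4
for ar in [2]:  # extra aspect ratios for this layer
    boxes.append((math.sqrt(ar) * min_size, min_size / math.sqrt(ar)))  # ~42.4 x ~21.2
    boxes.append((min_size / math.sqrt(ar), math.sqrt(ar) * min_size))  # ~21.2 x ~42.4
print(boxes)  # 4 (width, height) pairs -> 4 priors per feature point on conv4_3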
The [min_size, max_size] pair for each layer's default boxes is computed as follows.
import math
min_dim = 300
mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']
min_ratio = 20
max_ratio = 90
step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2)))
print('step:',step)
min_sizes = []
max_sizes = []
for ratio in range(min_ratio, max_ratio + 1, step):
min_sizes.append(min_dim * ratio / 100.)
print(min_sizes)
max_sizes.append(min_dim * (ratio + step) / 100.)
min_sizes = [min_dim * 10 / 100.] + min_sizes
max_sizes = [min_dim * 20 / 100.] + max_sizes
steps = [8, 16, 32, 64, 100, 300]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
print('min_sizes:',min_sizes)
print('max_sizes:',max_sizes)
With the parameters above, this script yields min_sizes = [30, 60, 111, 162, 213, 264] and max_sizes = [60, 111, 162, 213, 264, 315], which together reproduce the anchors_size list. The prior boxes are then prepared as follows:
import numpy as np
class AnchorBox:
def __init__(self, input_shape, min_size, max_size=None, aspect_ratios=None):
self.input_shape = input_shape
self.min_size = min_size
self.max_size = max_size
self.aspect_ratios = []
for ar in aspect_ratios:
self.aspect_ratios.append(ar)
self.aspect_ratios.append(1.0 / ar)
def call(self, layer_shape):
layer_height = layer_shape[0]
layer_width = layer_shape[1]
img_height = self.input_shape[0]
img_width = self.input_shape[1]
box_widths, box_heights = [], []
for ar in self.aspect_ratios:
if ar == 1 and len(box_widths) == 0:
box_widths.append(self.min_size)
box_heights.append(self.min_size)
elif ar == 1 and len(box_widths) > 0:
box_widths.append(np.sqrt(self.min_size * self.max_size))
box_heights.append(np.sqrt(self.min_size * self.max_size))
elif ar != 1:
box_widths.append(self.min_size * np.sqrt(ar))
box_heights.append(self.min_size / np.sqrt(ar))
box_widths = 0.5 * np.array(box_widths)
box_heights = 0.5 * np.array(box_heights)
step_x = img_width / layer_width
step_y = img_height / layer_height
linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x, layer_width)
liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y, layer_height)
centers_x, centers_y = np.meshgrid(linx, liny)
centers_x = centers_x.reshape(-1, 1)
centers_y = centers_y.reshape(-1, 1)
num_anchors = len(self.aspect_ratios)
anchor_boxes = np.concatenate((centers_x, centers_y), axis=1)
anchor_boxes = np.tile(anchor_boxes, (1, 2 * num_anchors))
anchor_boxes[:, ::4] -= box_widths
anchor_boxes[:, 1::4] -= box_heights
anchor_boxes[:, 2::4] += box_widths
anchor_boxes[:, 3::4] += box_heights
anchor_boxes[:, ::2] /= img_width
anchor_boxes[:, 1::2] /= img_height
anchor_boxes = anchor_boxes.reshape(-1, 4)
anchor_boxes = np.minimum(np.maximum(anchor_boxes, 0.0), 1.0)
return anchor_boxes
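A short usage example for the 38x38 conv4_3 layer (a sketch; input_shape, sizes and aspect ratios follow the values used earlier):
anchor_box = AnchorBox(input_shape=(300, 300), min_size=30, max_size=60, aspect_ratios=[1, 2])
anchors = anchor_box.call((38, 38))
print(anchors.shape)  # (5776, 4): 38*38 feature points x 4 priors each, normalized [xmin, ymin, xmax, ymax]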
3. Preparing a VOC-format dataset
First, the VOC image dataset is laid out as follows:
|-VOCdevkit
|---VOC2007
|-----Annotations
|-----ImageSets
|-----JPEGImages
Annotations stores one XML file per image; the object entries in an XML file describe the positions of the detection boxes in that image and their class names, and an image may contain several objects. difficult indicates how hard the object is to recognize. An XML file contains the following information:
<annotation>
<folder>VOC2007</folder>
<filename>000003.jpg</filename>
<source>
<database>The VOC2007 Database</database>
<annotation>PASCAL VOC2007</annotation>
<image>flickr</image>
<flickrid>138563409</flickrid>
</source>
<owner>
<flickrid>RandomEvent101</flickrid>
<name>?</name>
</owner>
<size>
<width>500</width>
<height>375</height>
<depth>3</depth>
</size>
<segmented>0</segmented>
<object>
<name>sofa</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>123</xmin>
<ymin>155</ymin>
<xmax>215</xmax>
<ymax>195</ymax>
</bndbox>
</object>
<object>
<name>chair</name>
<pose>Left</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>239</xmin>
<ymin>156</ymin>
<xmax>307</xmax>
<ymax>205</ymax>
</bndbox>
</object>
</annotation>
The Main folder under ImageSets stores the index files of the train / validation / test splits; each index entry is an image file name. JPEGImages stores the original images.
Dataset preparation consists of generating the .txt files used for training. Each line corresponds to one image and contains the image path followed by the positions and classes of its target boxes, for example:
./data/VOCdevkit/VOC2007/JPEGImages/000005.jpg 263,324,211,339,8 165,253,264,372,8 241,295,194,299,8
./data/VOCdevkit/VOC2007/JPEGImages/000007.jpg 141,500,50,330,6
The generation code is:
import os
import random
import xml.etree.ElementTree as ET
from utils.utils import get_classes
def convert_annotation(voc_devkit_path, year, image_id, list_file, classes):
file_path = os.path.join(voc_devkit_path, 'VOC%s/Annotations/%s.xml' % (year, image_id))
with open(file_path, 'r', encoding='utf-8') as f:
tree = ET.parse(f)
root = tree.getroot()
for obj in root.iter('object'):
difficult = 0
if obj.find('difficult') is not None:
difficult = int(obj.find('difficult').text)
cls = obj.find('name').text
if cls not in classes or difficult == 1:
continue
cls_id = classes.index(cls)
xml_box = obj.find('bndbox')
b = (float(xml_box.find('xmin').text), float(xml_box.find('xmax').text),
float(xml_box.find('ymin').text), float(xml_box.find('ymax').text))
list_file.write(' ' + ','.join([str(int(a)) for a in b]) + ',' + str(cls_id))
if __name__ == '__main__':
random.seed(0)
annotation_mode = 2
classes_path = "data/voc_classes.txt"
trainval_percent = 0.9
train_percent = 0.9
voc_devkit_path = "./data/VOCdevkit"
VOCdevkit_sets = [('2007', 'train'), ('2007', 'val')]
classes, classes_num = get_classes(classes_path)
if annotation_mode == 0 or annotation_mode == 1:
print("Generate txt in ImageSets...")
xml_file_path = os.path.join(voc_devkit_path, 'VOC2007/Annotations')
save_base_path = os.path.join(voc_devkit_path, 'VOC2007/ImageSets/Main')
temp_xml = os.listdir(xml_file_path)
total_xml = []
for xml in temp_xml:
if xml.endswith('.xml'):
total_xml.append(xml)
num = len(total_xml)
list_ = range(num)
tv = int(num * trainval_percent)
tr = int(tv * train_percent)
trainval = random.sample(list_, tv)
train = random.sample(trainval, tr)
print("train and val size: ", tv)
print("train size: ", tr)
ftrainval = open(os.path.join(save_base_path, 'trainval.txt'), 'w')
ftest = open(os.path.join(save_base_path, 'test.txt'), 'w')
ftrain = open(os.path.join(save_base_path, 'train.txt'), 'w')
fval = open(os.path.join(save_base_path, 'val.txt'), 'w')
for i in list_:
name = total_xml[i][:-4] + '\n'
if i in trainval:
ftrainval.write(name)
if i in train:
ftrain.write(name)
else:
fval.write(name)
else:
ftest.write(name)
ftrainval.close()
ftrain.close()
fval.close()
ftest.close()
print("Generate txt in ImageSets done!")
if annotation_mode == 0 or annotation_mode == 2:
print("Generate 2007_train.txt and 2007_val.txt for model training...")
for year, data_name in VOCdevkit_sets:
images_id_path = os.path.join(voc_devkit_path, 'VOC' + year, 'ImageSets', 'Main', data_name + '.txt')
with open(images_id_path, 'r', encoding='utf-8') as f:
image_ids = f.read().strip().split()
with open('./data/%s_%s.txt' % (year, data_name), 'w', encoding='utf-8') as f:
for image_id in image_ids:
f.write('%s/VOC%s/JPEGImages/%s.jpg' % (voc_devkit_path, year, image_id))
convert_annotation(voc_devkit_path, year, image_id, f, classes)
f.write('\n')
print("Generate 2007_train.txt and 2007_val.txt for model training done!")
4. Model training
4.1 How the model obtains predictions from the features
SSD uses 6 effective feature layers. Default boxes are first preset for every feature point of every feature map: mbox_sizes = [4, 6, 6, 6, 4, 4], corresponding to feature_map = [Conv4_3, Conv7, Conv8_2, Conv9_2, Conv10_2, Conv11_2], whose sizes are feature_map_shape = [38*38, 19*19, 10*10, 5*5, 3*3, 1*1]; the total number of default boxes is therefore 8732.
A single image may contain several objects, so the model has to decide which preset default boxes contain an object and which class it belongs to (classification), and, for the default boxes that really do contain an object, adjust them to obtain the final predicted boxes (regression). Therefore two 3*3 convolutions are applied to the output of each of the six effective feature maps, one for regression and one for classification.
- Classification: with num_classes classes in the dataset, the convolution output for feature map k has mbox[k] * num_classes channels.
- Regression: every box is determined by its center $(x, y)$ and its width and height $(w, h)$, so the convolution output for feature map k has mbox[k] * 4 channels.
for k, v in enumerate(backbone_source):
loc_layers += [nn.Conv2d(self.vgg[v].out_channels, mbox[k] * 4, kernel_size=(3, 3), padding=1)]
conf_layers += [nn.Conv2d(self.vgg[v].out_channels, mbox[k] * num_classes, kernel_size=(3, 3), padding=1)]
for k, v in enumerate(self.extras[1::2], 2):
loc_layers += [nn.Conv2d(v.out_channels, mbox[k] * 4, kernel_size=(3, 3), padding=1)]
conf_layers += [nn.Conv2d(v.out_channels, mbox[k] * num_classes, kernel_size=(3, 3), padding=1)]
loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
output = (loc.view(loc.size(0), -1, 4), conf.view(conf.size(0), -1, self.num_classes),)
4.2 Matching ground-truth boxes (gt boxes) with default boxes
SSD first generates many default boxes for every feature layer; these have to be filtered so that the prior boxes corresponding to the gt boxes can be picked out for training.
In short: first obtain the regression predictions and the classification confidences (probabilities) of the image; then decode each image to recover the actual predicted boxes; then, for each class, keep the boxes whose confidence exceeds the threshold and filter them with non-maximum suppression; finally stack the label, the confidence and the box position and return the result.
import numpy as np
import torch
import torch.nn as nn
from torchvision.ops import nms
class BBoxUtility(object):
def __init__(self, num_classes):
self.num_classes = num_classes
def ssd_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
box_yx = box_xy[..., ::-1]
box_hw = box_wh[..., ::-1]
input_shape = np.array(input_shape)
image_shape = np.array(image_shape)
if letterbox_image:
new_shape = np.round(image_shape * np.min(input_shape / image_shape))
offset = (input_shape - new_shape) / 2. / input_shape
scale = input_shape / new_shape
box_yx = (box_yx - offset) * scale
box_hw *= scale
box_mins = box_yx - (box_hw / 2.)
box_maxes = box_yx + (box_hw / 2.)
boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]],
axis=-1)
boxes *= np.concatenate([image_shape, image_shape], axis=-1)
return boxes
def decode_boxes(self, mbox_loc, anchors, variances):
anchor_width = anchors[:, 2] - anchors[:, 0]
anchor_height = anchors[:, 3] - anchors[:, 1]
anchor_center_x = 0.5 * (anchors[:, 2] + anchors[:, 0])
anchor_center_y = 0.5 * (anchors[:, 3] + anchors[:, 1])
decode_bbox_center_x = mbox_loc[:, 0] * anchor_width * variances[0]
decode_bbox_center_x += anchor_center_x
decode_bbox_center_y = mbox_loc[:, 1] * anchor_height * variances[0]
decode_bbox_center_y += anchor_center_y
decode_bbox_width = torch.exp(mbox_loc[:, 2] * variances[1])
decode_bbox_width *= anchor_width
decode_bbox_height = torch.exp(mbox_loc[:, 3] * variances[1])
decode_bbox_height *= anchor_height
decode_bbox_xmin = decode_bbox_center_x - 0.5 * decode_bbox_width
decode_bbox_ymin = decode_bbox_center_y - 0.5 * decode_bbox_height
decode_bbox_xmax = decode_bbox_center_x + 0.5 * decode_bbox_width
decode_bbox_ymax = decode_bbox_center_y + 0.5 * decode_bbox_height
decode_bbox = torch.cat((decode_bbox_xmin[:, None],
decode_bbox_ymin[:, None],
decode_bbox_xmax[:, None],
decode_bbox_ymax[:, None]), dim=-1)
decode_bbox = torch.min(torch.max(decode_bbox, torch.zeros_like(decode_bbox)), torch.ones_like(decode_bbox))
return decode_bbox
def decode_box(self, predictions, anchors, image_shape, input_shape, letterbox_image, variances=[0.1, 0.2],
nms_iou=0.3, confidence=0.5):
mbox_loc = predictions[0]
mbox_conf = nn.Softmax(-1)(predictions[1])
results = []
for i in range(len(mbox_loc)):
results.append([])
decode_bbox = self.decode_boxes(mbox_loc[i], anchors, variances)
for c in range(1, self.num_classes):
c_confs = mbox_conf[i, :, c]
c_confs_m = c_confs > confidence
if len(c_confs[c_confs_m]) > 0:
boxes_to_process = decode_bbox[c_confs_m]
confs_to_process = c_confs[c_confs_m]
keep = nms(
boxes_to_process,
confs_to_process,
nms_iou
)
good_boxes = boxes_to_process[keep]
confs = confs_to_process[keep][:, None]
labels = (c - 1) * torch.ones((len(keep), 1)).cuda() if confs.is_cuda else (c - 1) * torch.ones(
(len(keep), 1))
c_pred = torch.cat((good_boxes, labels, confs), dim=1).cpu().numpy()
results[-1].extend(c_pred)
if len(results[-1]) > 0:
results[-1] = np.array(results[-1])
box_xy, box_wh = (results[-1][:, 0:2] + results[-1][:, 2:4]) / 2, results[-1][:, 2:4] - results[-1][:,
0:2]
results[-1][:, :4] = self.ssd_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
return results
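A minimal usage sketch of BBoxUtility.decode_box (the tensors are dummies; in practice loc and conf come from SSD300, anchors come from AnchorBox, and an nms function such as torchvision.ops.nms must be in scope for the class):
import torch
num_classes = 21
bbox_util = BBoxUtility(num_classes)
loc = torch.randn(1, 8732, 4)                      # raw localization offsets
conf = torch.randn(1, 8732, num_classes)           # raw class scores
anchors = torch.rand(8732, 4).sort(dim=-1).values  # fake normalized priors
results = bbox_util.decode_box((loc, conf), anchors, image_shape=(375, 500), input_shape=(300, 300), letterbox_image=False, confidence=0.5)
# results[i] holds rows of [xmin, ymin, xmax, ymax, label, score] for image i (it may well be empty with random inputs)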
4.3 The loss function
The SSD model handles images containing multiple objects; its loss function has two parts, a confidence loss and a localization loss, computed as:
$$L(x, c, l, g) = \frac{1}{N}\left[L_{conf}(x, c) + \alpha L_{loc}(x, l, g)\right]$$
where:
- c, l and g denote the class confidence predictions, the predicted boxes and the ground-truth box parameters (width, height and center position), respectively
- N is the number of positive samples
- x is an indicator function, $x^{p}_{ij} \in \{1, 0\}$: 1 means that the $i$-th default box is matched to the $j$-th ground-truth box of class $p$; 0 means it is not matched
- $L_{conf}$ and $L_{loc}$ are the confidence loss and the localization loss, respectively
The localization loss uses a $\mathrm{smooth}_{L1}$ loss; it is computed over the offsets of the center, width and height that must be regressed relative to the default bounding box:
$$L_{loc}(x, l, g) = \sum^{N}_{i \in Pos} \sum_{m \in \{cx, cy, w, h\}} x^{k}_{ij}\, \mathrm{smooth}_{L1}\left(l^{m}_{i} - \hat{g}^{m}_{j}\right)$$
Since $x^{p}_{ij} \in \{1, 0\}$, the localization loss is computed over the positive samples only, where:
- $l^{m}_{i}$ and $\hat{g}^{m}_{j}$ denote, respectively, the center, width and height of the $i$-th predicted box $l$ and of the encoded $j$-th gt box.
The confidence loss is a softmax loss over the class confidences:
$$L_{conf}(x, c) = - \sum^{N}_{i \in Pos} x^{p}_{ij} \log(\hat{c}^{p}_{i}) - \sum_{i \in Neg} \log(\hat{c}^{0}_{i}), \qquad \hat{c}^{p}_{i} = \frac{\exp(c^{p}_{i})}{\sum_{p} \exp(c^{p}_{i})}$$
where $\hat{c}^{p}_{i}$ is the confidence of the $i$-th default box for class $p$, and $p = 0$ denotes the background class.
class MultiboxLoss(nn.Module):
def __init__(self, num_classes, alpha=1.0, neg_pos_ratio=3.0,
background_label_id=0, negatives_for_hard=100.0):
super(MultiboxLoss, self).__init__()
self.num_classes = num_classes
self.alpha = alpha
self.neg_pos_ratio = neg_pos_ratio
if background_label_id != 0:
raise Exception('Only 0 as background label id is supported')
self.background_label_id = background_label_id
self.negatives_for_hard = torch.FloatTensor([negatives_for_hard])[0]
def _l1_smooth_loss(self, y_true, y_pred):
abs_loss = torch.abs(y_true - y_pred)
sq_loss = 0.5 * (y_true - y_pred) ** 2
l1_loss = torch.where(abs_loss < 1.0, sq_loss, abs_loss - 0.5)
return torch.sum(l1_loss, -1)
def _softmax_loss(self, y_true, y_pred):
y_pred = torch.clamp(y_pred, min=1e-7)
softmax_loss = -torch.sum(y_true * torch.log(y_pred),
axis=-1)
return softmax_loss
def forward(self, y_true, y_pred):
num_boxes = y_true.size()[1]
y_pred = torch.cat([y_pred[0], nn.Softmax(-1)(y_pred[1])], dim=-1)
conf_loss = self._softmax_loss(y_true[:, :, 4:-1], y_pred[:, :, 4:])
loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
y_pred[:, :, :4])
pos_loc_loss = torch.sum(loc_loss * y_true[:, :, -1],
axis=1)
pos_conf_loss = torch.sum(conf_loss * y_true[:, :, -1],
axis=1)
num_pos = torch.sum(y_true[:, :, -1], axis=-1)
num_neg = torch.min(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
pos_num_neg_mask = num_neg > 0
has_min = torch.sum(pos_num_neg_mask)
num_neg_batch = torch.sum(num_neg) if has_min > 0 else self.negatives_for_hard
confs_start = 4 + self.background_label_id + 1
confs_end = confs_start + self.num_classes - 1
max_confs = torch.sum(y_pred[:, :, confs_start:confs_end], dim=2)
max_confs = (max_confs * (1 - y_true[:, :, -1])).view([-1])
_, indices = torch.topk(max_confs, k=int(num_neg_batch.cpu().numpy().tolist()))
neg_conf_loss = torch.gather(conf_loss.view([-1]), 0, indices)
num_pos = torch.where(num_pos != 0, num_pos, torch.ones_like(num_pos))
total_loss = torch.sum(pos_conf_loss) + torch.sum(neg_conf_loss) + torch.sum(self.alpha * pos_loc_loss)
total_loss = total_loss / torch.sum(num_pos)
return total_loss
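A minimal shape check for MultiboxLoss (a sketch; the y_true layout of 4 regression targets, a one-hot class vector including background, and a trailing positive-sample mask is inferred from the indexing in forward() above):
import torch
num_classes, num_anchors, batch = 21, 8732, 2
criterion = MultiboxLoss(num_classes, neg_pos_ratio=3.0)
y_true = torch.zeros(batch, num_anchors, 4 + num_classes + 1)
y_true[:, :, 4] = 1.0       # default every prior to the background class
y_true[:, :100, 4] = 0.0
y_true[:, :100, 5] = 1.0    # pretend the first 100 priors match class 1
y_true[:, :100, -1] = 1.0   # and mark them as positives
y_pred = (torch.randn(batch, num_anchors, 4), torch.randn(batch, num_anchors, num_classes))
loss = criterion(y_true, y_pred)
print(loss.item())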
4.4 Training the model
- Get the dataset class names and count, the preset prior boxes, the model (initialized from pretrained weights when they are used) and the loss function
class_names, num_classes = get_classes(classes_path)
num_classes += 1
anchors = get_anchors(input_shape, anchors_size, backbone)
model = SSD300(num_classes, backbone, pretrained)
if not pretrained:
weights_init(model)
if model_path != '':
print('Load weights {}.'.format(model_path))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_dict = model.state_dict()
pretrained_dict = torch.load(model_path, map_location=device)
pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)}
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)
criterion = MultiboxLoss(num_classes, neg_pos_ratio=3.0)
- Read the datasets (the generated 2007_train.txt and 2007_val.txt)
with open(train_annotation_path, encoding='utf-8') as f:
train_lines = f.readlines()
with open(val_annotation_path, encoding='utf-8') as f:
val_lines = f.readlines()
num_train = len(train_lines)
num_val = len(val_lines)
train_dataset = SSDDataset(train_lines, input_shape, anchors, batch_size, num_classes, train=True)
val_dataset = SSDDataset(val_lines, input_shape, anchors, batch_size, num_classes, train=False)
gen = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers, pin_memory=True,
drop_last=True, collate_fn=ssd_dataset_collate)
gen_val = DataLoader(val_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers, pin_memory=True,
drop_last=True, collate_fn=ssd_dataset_collate)
- Set up the optimizer
nbs = 64
lr_limit_max = 1e-3 if optimizer_type == 'adam' else 5e-2
lr_limit_min = 3e-4 if optimizer_type == 'adam' else 5e-5
Init_lr_fit = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
Min_lr_fit = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
optimizer = {'adam': optim.Adam(model.parameters(), Init_lr_fit, betas=(momentum, 0.999), weight_decay=weight_decay),
'sgd': optim.SGD(model.parameters(), Init_lr_fit, momentum=momentum, nesterov=True,
weight_decay=weight_decay)}[optimizer_type]
lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
- Start training
for epoch in range(Init_Epoch, UnFreeze_Epoch):
if epoch >= Freeze_Epoch and not UnFreeze_flag and Freeze_Train:
batch_size = Unfreeze_batch_size
nbs = 64
lr_limit_max = 1e-3 if optimizer_type == 'adam' else 5e-2
lr_limit_min = 3e-4 if optimizer_type == 'adam' else 5e-5
Init_lr_fit = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
Min_lr_fit = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
if backbone == "vgg":
for param in model.vgg[:28].parameters():
param.requires_grad = True
else:
for param in model.mobilenet.parameters():
param.requires_grad = True
epoch_step = num_train // batch_size
epoch_step_val = num_val // batch_size
if epoch_step == 0 or epoch_step_val == 0:
raise ValueError("数据集过小,无法继续进行训练,请扩充数据集。")
gen = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers,
pin_memory=True,drop_last=True, collate_fn=ssd_dataset_collate)
gen_val = DataLoader(val_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers,
pin_memory=True,
drop_last=True, collate_fn=ssd_dataset_collate)
UnFreeze_flag = True
set_optimizer_lr(optimizer, lr_scheduler_func, epoch)
fit_one_epoch(model_train, model, criterion, loss_history, optimizer, epoch,
epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch, Cuda, fp16, scaler, save_period,
save_dir)
5. Testing and computing the mAP
Run the model on the test set and compute the mAP.
if __name__ == "__main__":
'''
Unlike AP, Recall and Precision are not areas; their values change with the confidence threshold.
The Recall and Precision reported by the mAP computation are the values obtained with a confidence threshold of 0.5.
The txt files produced under ./map_out/detection-results/ contain more boxes than a direct predict run, because a lower confidence threshold is used here;
the goal is to compute Recall and Precision under different thresholds so that the mAP can be calculated.
'''
map_mode = 0
classes_path = './data/voc_classes.txt'
MINOVERLAP = 0.5
map_vis = False
VOCdevkit_path = './data/VOCdevkit'
map_out_path = 'map_out'
image_ids = open(os.path.join(VOCdevkit_path, "VOC2007/ImageSets/Main/test.txt")).read().strip().split()
if not os.path.exists(map_out_path):
os.makedirs(map_out_path)
if not os.path.exists(os.path.join(map_out_path, 'ground-truth')):
os.makedirs(os.path.join(map_out_path, 'ground-truth'))
if not os.path.exists(os.path.join(map_out_path, 'detection-results')):
os.makedirs(os.path.join(map_out_path, 'detection-results'))
if not os.path.exists(os.path.join(map_out_path, 'images-optional')):
os.makedirs(os.path.join(map_out_path, 'images-optional'))
class_names, _ = get_classes(classes_path)
if map_mode == 0 or map_mode == 1:
print("Load model.")
ssd = SSD(confidence=0.01, nms_iou=0.5)
print("Load model done.")
print("Get predict result.")
for image_id in tqdm(image_ids):
image_path = os.path.join(VOCdevkit_path, "VOC2007/JPEGImages/" + image_id + ".jpg")
image = Image.open(image_path)
if map_vis:
image.save(os.path.join(map_out_path, "images-optional/" + image_id + ".jpg"))
ssd.get_map_txt(image_id, image, class_names, map_out_path)
print("Get predict result done.")
if map_mode == 0 or map_mode == 2:
print("Get ground truth result.")
for image_id in tqdm(image_ids):
with open(os.path.join(map_out_path, "ground-truth/" + image_id + ".txt"), "w") as new_f:
root = ET.parse(os.path.join(VOCdevkit_path, "VOC2007/Annotations/" + image_id + ".xml")).getroot()
for obj in root.findall('object'):
difficult_flag = False
if obj.find('difficult') != None:
difficult = obj.find('difficult').text
if int(difficult) == 1:
difficult_flag = True
obj_name = obj.find('name').text
if obj_name not in class_names:
continue
bndbox = obj.find('bndbox')
left = bndbox.find('xmin').text
top = bndbox.find('ymin').text
right = bndbox.find('xmax').text
bottom = bndbox.find('ymax').text
if difficult_flag:
new_f.write("%s %s %s %s %s difficult\n" % (obj_name, left, top, right, bottom))
else:
new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
print("Get ground truth result done.")
if map_mode == 0 or map_mode == 3:
print("Get map.")
get_map(MINOVERLAP, True, path=map_out_path)
print("Get map done.")
if map_mode == 4:
print("Get map.")
get_coco_map(class_names=class_names, path=map_out_path)
print("Get map done.")
6. Image prediction and FPS testing
if __name__ == "__main__":
ssd = SSD()
mode = "predict"
crop = False
video_path = 0
video_save_path = ""
video_fps = 25.0
test_interval = 100
dir_origin_path = "img/"
dir_save_path = "img_out/"
if mode == "predict":
'''
1. This script cannot run batch prediction as-is. For batch prediction, walk a folder with os.listdir() and open each image with Image.open before predicting;
see get_dr_txt.py for the complete flow, which both iterates over the images and saves the detection results.
2. To save the detected image, call r_image.save("img.jpg"); modify predict.py directly.
3. To obtain the coordinates of the predicted boxes, go into ssd.detect_image and read top, left, bottom and right in the drawing section.
4. To crop out the detected objects, go into ssd.detect_image and use the top, left, bottom and right values obtained in the drawing section
to slice the original image as an array.
5. To write extra text on the predicted image, such as the number of detections of a particular class, go into ssd.detect_image and test predicted_class
in the drawing section, e.g. if predicted_class == 'car': checks whether the current object is a car, then count the detections; use draw.text to write the text.
'''
while True:
img = input('Input image filename:')
try:
image = Image.open(img)
except:
print('Open Error! Try again!')
continue
else:
r_image = ssd.detect_image(image, crop=crop)
r_image.show()
elif mode == "video":
capture = cv2.VideoCapture(video_path)
if video_save_path != "":
fourcc = cv2.VideoWriter_fourcc(*'XVID')
size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
out = cv2.VideoWriter(video_save_path, fourcc, video_fps, size)
ref, frame = capture.read()
if not ref:
raise ValueError("未能正确读取摄像头(视频),请注意是否正确安装摄像头(是否正确填写视频路径)。")
fps = 0.0
while True:
t1 = time.time()
ref, frame = capture.read()
if not ref:
break
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = Image.fromarray(np.uint8(frame))
frame = np.array(ssd.detect_image(frame))
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
fps = (fps + (1. / (time.time() - t1))) / 2
print("fps= %.2f" % fps)
frame = cv2.putText(frame, "fps= %.2f" % fps, (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow("video", frame)
c = cv2.waitKey(1) & 0xff
if video_save_path != "":
out.write(frame)
if c == 27:
capture.release()
break
print("Video Detection Done!")
capture.release()
if video_save_path != "":
print("Save processed video to the path :" + video_save_path)
out.release()
cv2.destroyAllWindows()
elif mode == "fps":
img = Image.open('img/street.jpg')
tact_time = ssd.get_FPS(img, test_interval)
print(str(tact_time) + ' seconds, ' + str(1 / tact_time) + 'FPS, @batch_size 1')
elif mode == "dir_predict":
import os
from tqdm import tqdm
img_names = os.listdir(dir_origin_path)
for img_name in tqdm(img_names):
if img_name.lower().endswith(
('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
image_path = os.path.join(dir_origin_path, img_name)
image = Image.open(image_path)
r_image = ssd.detect_image(image)
if not os.path.exists(dir_save_path):
os.makedirs(dir_save_path)
r_image.save(os.path.join(dir_save_path, img_name.replace(".jpg", ".png")), quality=95, subsampling=0)
else:
raise AssertionError("Please specify the correct mode: 'predict', 'video', 'fps' or 'dir_predict'.")