1. kmeans聚类中心

参考代码：plot_kmeans.py

import numpy as np
from matplotlib import pyplot as plt

# Seed NumPy's global RNG so the generated samples and the randomly chosen
# initial centers are the same on every run.
np.random.seed(0)
# Colour lookup table indexed by cluster label in plot_clusters (two entries).
colors = np.array(['blue', 'black'])

def plot_clusters(data, cls, clusters, title=""):
    """Scatter-plot the samples coloured by label and mark every cluster center.

    Args:
        data: samples, read as ``data[:, 0]`` (x) and ``data[:, 1]`` (y).
        cls: per-sample integer labels used to index the module-level ``colors``
            array, or ``None`` to draw every sample in ``colors[0]``.
        clusters: iterable of centers; the first two entries of each are plotted.
        title: title shown above the figure.
    """
    # One colour per sample: a single colour when no labels exist yet,
    # otherwise the colour looked up from each sample's label.
    point_colors = [colors[0]] * data.shape[0] if cls is None else colors[cls].tolist()

    # Draw the samples first, then each center on top as a gold star.
    plt.scatter(data[:, 0], data[:, 1], c=point_colors)
    for center in clusters:
        plt.scatter(center[0], center[1], c='gold', marker='*', s=150)

    plt.title(title)
    plt.show()
    plt.close()


def distances(data, clusters):
    """Return the squared Euclidean distance from every sample to every center.

    Args:
        data: samples, e.g. shape [N, 2].
        clusters: cluster centers, e.g. shape [K, 2].

    Returns:
        Array of shape [N, K]; entry (i, j) is the squared distance between
        sample i and center j (no square root is taken).
    """
    # Broadcasting [1, K, D] against [N, 1, D] yields all pairwise offsets [N, K, D].
    offsets = clusters[None] - data[:, None]
    # Collapse the coordinate axis by summing the squared offsets.
    return np.square(offsets).sum(axis=-1)


# k-means算法: 坐标系上两个点的欧氏距离作为样本之间的距离进行聚类,流程如下:
# 1 手动设定簇的个数k，假设k=2
# 2 在所有样本中随机选取k个样本作为簇的初始中心
# 3 计算每个样本离每个簇中心的距离（这里以欧式距离为例），然后将样本划分到离它最近的簇中
# 4 更新簇的中心，计算每个簇中所有样本的均值（方法不唯一）作为新的簇中心
# 5 重复第3步到第4步，直到簇中心不再变化，或者簇中心的变化足够小、满足给定的终止条件
def k_means(data, k, dist=np.mean):
    """Cluster ``data`` into ``k`` groups with k-means, plotting every step.

    Args:
        data: samples to cluster, shape [N, 2] (the plots use the first two columns).
        k: number of clusters.
        dist: reduction used to recompute a center from its members; called as
            ``dist(members, axis=0)``.

    Returns:
        Array with the ``k`` final cluster centers.
    """
    data_number = data.shape[0]
    # -1 is never a label produced by argmin, so the first comparison below can
    # never be mistaken for convergence (a zero-filled start would match an
    # all-zero first assignment, e.g. whenever k == 1).
    last_nearest = np.full((data_number,), -1)

    # Pick k distinct samples as the initial centers.
    clusters = data[np.random.choice(data_number, k, replace=False)]
    print(f"random cluster: \n {clusters}")
    plot_clusters(data, None, clusters, "random clusters")

    step = 0
    while True:
        # Squared Euclidean distance of every sample to every center: [N, k].
        d = distances(data, clusters)
        # Assign each sample to its closest center.
        current_nearest = np.argmin(d, axis=1)

        plot_clusters(data, current_nearest, clusters, f"step {step}")

        # Converged once no sample changed cluster since the previous pass.
        if (last_nearest == current_nearest).all():
            break  # clusters won't change

        for cluster in range(k):
            members = data[current_nearest == cluster]
            if len(members) == 0:
                # Reducing an empty slice gives NaN, and a NaN center would win
                # every later argmin; keep the previous center instead.
                continue
            # New center = reduction (mean by default) over the members: [M, 2] -> [2].
            clusters[cluster] = dist(members, axis=0)

        last_nearest = current_nearest
        step += 1

    return clusters


def main():
    """Generate two Gaussian blobs, show them, and cluster them with k_means (k=2)."""
    # 150 samples around (1, 1) and 150 around (5, 5); the draw order
    # (x1, y1, x2, y2) is kept so the seeded output stays reproducible.
    x1, y1 = [np.random.normal(loc=1., size=150) for _ in range(2)]
    x2, y2 = [np.random.normal(loc=5., size=150) for _ in range(2)]

    x = np.concatenate([x1, x2])
    y = np.concatenate([y1, y2])

    plt.scatter(x, y, c='blue')
    plt.title("initial data")
    plt.show()
    plt.close()

    # Pair the coordinates into an [N, 2] sample matrix before clustering.
    samples = np.concatenate([x[:, None], y[:, None]], axis=-1)
    clusters = k_means(samples, k=2)
    # Message typo fixed: "fluster" -> "cluster".
    print(f"k-means cluster: \n {clusters}")


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

2. kmeans聚类anchor

参考代码：yolo_kmeans.py

import numpy as np


def wh_iou(wh1, wh2):
    """Return the pairwise IoU matrix of boxes given only by (width, height).

    The intersection is taken as ``min(w1, w2) * min(h1, h2)``, i.e. the boxes
    are compared by size only.

    Args:
        wh1: array of shape [N, 2].
        wh2: array of shape [M, 2].

    Returns:
        Array of shape [N, M] with IoU(wh1[i], wh2[j]).
    """
    first = wh1[:, None]   # [N, 1, 2]
    second = wh2[None]     # [1, M, 2]

    # Overlap area: product of the smaller width and the smaller height.
    overlap = np.minimum(first, second).prod(2)  # [N, M]
    # Union area: area1 + area2 - overlap.
    union = first.prod(2) + second.prod(2) - overlap
    return overlap / union


# k-means算法: 1-IOU(bboxes, anchors)作为样本之间的距离进行聚类,流程如下:
# 1 在所有的bboxes中随机挑选k个作为簇的中心。
# 2 计算每个bboxes离每个簇的距离1-IOU(bboxes, anchors)
# 3 计算每个bboxes距离最近的簇中心，并分配到离它最近的簇中
# 4 根据每个簇中的bboxes重新计算簇中心，这里默认使用的是计算中值，自己也可以改成其他方法
# 5 重复3到4直到每个簇中元素不再发生变化
def k_means(boxes, k, dist=np.median):
    """Cluster box sizes with k-means, using ``1 - IoU`` as the distance.

    refer: https://github.com/qqwweee/keras-yolo3/blob/master/kmeans.py

    Centers are drawn with ``np.random.choice``; seed ``np.random`` before the
    call if reproducible results are needed.

    Args:
        boxes: box sizes to cluster, shape [N, 2] (width, height).
        k: number of clusters.
        dist: reduction used to recompute a center from its members, called as
            ``dist(members, axis=0)`` (median by default).

    Returns:
        Array of shape [k, 2] with the final cluster centers.
    """
    box_number = boxes.shape[0]
    # -1 is never a label produced by argmin, so the first comparison below can
    # never be mistaken for convergence (a zero-filled start would match an
    # all-zero first assignment, e.g. whenever k == 1, and return the random
    # initial centers unchanged).
    last_nearest = np.full((box_number,), -1)

    # Pick k distinct boxes as the initial centers.
    clusters = boxes[np.random.choice(box_number, k, replace=False)]

    while True:
        # Distance of every box to every center: 1 - IoU(boxes, anchors), [N, k].
        distances = 1 - wh_iou(boxes, clusters)
        # Assign each box to its closest center.
        current_nearest = np.argmin(distances, axis=1)
        # Converged once no box changed cluster since the previous pass.
        if (last_nearest == current_nearest).all():
            break  # clusters won't change

        for cluster in range(k):
            members = boxes[current_nearest == cluster]
            if len(members) == 0:
                # Reducing an empty slice gives NaN, and a NaN center would win
                # every later argmin; keep the previous center instead.
                continue
            # Recompute the center from its members (median by default).
            clusters[cluster] = dist(members, axis=0)

        last_nearest = current_nearest

    return clusters

3. kmeans+遗传算法聚类anchor

首先读取数据集的ground truth，获取每个目标框的尺寸，然后根据这些样本为3个预测特征层各聚类3个anchor，所以一共输出9个anchor；将anchor按面积由小到大排序，依次分配给3个预测特征层即可。

其中聚类的过程中，配合遗传算法，使得anchor存在小幅度的变动，增强了多样性与泛化性。

读取数据集中的gtbox参考代码，read_voc.py：

import os
from tqdm import tqdm
from lxml import etree


class VOCDataSet(object):
    """Collect image sizes and box sizes from Pascal VOC XML annotations.

    Annotation files are looked up under
    ``<voc_root>/VOCdevkit/VOC<year>/Annotations`` using the sample names listed
    in ``<voc_root>/VOCdevkit/VOC<year>/ImageSets/Main/<txt_name>``.
    """

    def __init__(self, voc_root, year="2012", txt_name: str = "train.txt"):
        """Build the list of annotation files and check that each one exists.

        Args:
            voc_root: directory that contains ``VOCdevkit``.
            year: dataset year, "2007" or "2012".
            txt_name: split file inside ``ImageSets/Main`` (e.g. train.txt or val.txt).

        Raises:
            AssertionError: if ``year`` is unsupported, the split file is missing
                or lists nothing, or a listed annotation file does not exist.
        """
        assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
        self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
        self.annotations_root = os.path.join(self.root, "Annotations")

        # read train.txt or val.txt file
        txt_path = os.path.join(self.root, "ImageSets", "Main", txt_name)
        assert os.path.exists(txt_path), "not found {} file.".format(txt_name)

        # One annotation path per non-blank line of the split file.
        with open(txt_path) as read:
            self.xml_list = [os.path.join(self.annotations_root, line.strip() + ".xml")
                             for line in read if line.strip()]

        # check file
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)

    def __len__(self):
        """Return the number of annotation files in the split."""
        return len(self.xml_list)

    def parse_xml_to_dict(self, xml):
        """Recursively convert an XML element into nested dictionaries.

        Modelled on TensorFlow's recursive_parse_xml_to_dict.

        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree

        Returns:
            Python dictionary holding XML contents. A leaf becomes
            ``{tag: text}``; ``object`` children are gathered into a list
            because an annotation may contain several of them.
        """
        if len(xml) == 0:  # leaf element: return its own tag/text pair
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # recurse into the child
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                # Several objects may share the tag, so they are stored in a list.
                result.setdefault(child.tag, []).append(child_result[child.tag])
        return {xml.tag: result}

    @staticmethod
    def _relative_boxes_wh(annotation):
        """Extract the image size and the relative box sizes of one parsed annotation.

        Args:
            annotation: the ``"annotation"`` dictionary produced by
                ``parse_xml_to_dict``.

        Returns:
            ``([im_width, im_height], wh)`` where ``wh`` is a list of
            ``[box_w / im_width, box_h / im_height]``; ``wh`` is empty when the
            annotation has no ``object`` entry.
        """
        im_height = int(annotation["size"]["height"])
        im_width = int(annotation["size"]["width"])

        wh = []
        # An annotation without any <object> has no "object" key at all; treat
        # it as "no boxes" instead of raising KeyError.
        for obj in annotation.get("object", []):
            box = obj["bndbox"]
            xmin = float(box["xmin"])
            xmax = float(box["xmax"])
            ymin = float(box["ymin"])
            ymax = float(box["ymax"])
            wh.append([(xmax - xmin) / im_width, (ymax - ymin) / im_height])
        return [im_width, im_height], wh

    def get_info(self):
        """Read every annotation file and collect image and box sizes.

        Images without any box are skipped in both returned lists.

        Returns:
            ``(im_wh_list, boxes_wh_list)``: ``im_wh_list[i]`` is
            ``[im_width, im_height]`` of image i and ``boxes_wh_list[i]`` is its
            list of ``[w, h]`` box sizes relative to the image size.
        """
        im_wh_list = []
        boxes_wh_list = []
        for xml_path in tqdm(self.xml_list, desc="read data info."):
            # read xml
            with open(xml_path) as fid:
                xml_str = fid.read()
            xml = etree.fromstring(xml_str)
            data = self.parse_xml_to_dict(xml)["annotation"]

            im_wh, wh = self._relative_boxes_wh(data)
            if len(wh) == 0:
                continue

            im_wh_list.append(im_wh)
            boxes_wh_list.append(wh)

        return im_wh_list, boxes_wh_list

kmeans+遗传算法聚类anchor参考代码：

import random
import numpy as np
from tqdm import tqdm
from scipy.cluster.vq import kmeans

from read_voc import VOCDataSet
from yolo_kmeans import k_means, wh_iou


def anchor_fitness(k: np.ndarray, wh: np.ndarray, thr: float):  # mutation fitness
    """Score how well the anchors ``k`` match the box sizes ``wh``.

    Args:
        k: anchors, shape [K, 2].
        wh: box sizes, shape [N, 2].
        thr: a box counts as matched when its best score exceeds this value.

    Returns:
        ``(fitness, bpr)``: the mean best score with unmatched boxes counted as
        zero, and the fraction of matched boxes (best possible recall).
    """
    # Per-side size ratio of every box to every anchor: [N, K, 2].
    ratio = wh[:, None] / k[None]
    # Symmetric ratio (<= 1) per side, then the worse of the two sides: [N, K].
    score = np.minimum(ratio, 1. / ratio).min(2)
    # score = wh_iou(wh, k)  # alternative: IoU metric
    # Best-matching anchor for every box: [N].
    best = score.max(1)

    matched = (best > thr).astype(np.float32)
    fitness = (best * matched).mean()
    recall = matched.mean()  # best possible recall
    return fitness, recall


# k-means聚类 + Genetic Algorithm遗传算法，在k-means聚类的结果上进行mutation变异，流程如下:
# 1 获取数据集中每个样本的长宽im_wh以及每个bounding boxes的长宽boxes_wh
# 2 将每张图片中长宽的最大值等比例缩放到指定大小img_size
# 3 对bounding boxes由相对大小(小数)到绝对大小的转换(整数)，获取完整的bounding boxes列表wh0
# 4 筛选bounding boxes，保留wh都大于等于两个像素的bounding boxes，得到wh列表
# 5 使用k-means聚类得到k个anchors
# 6 使用遗传算法随机对聚类出的k个anchors的长宽进行变异，如果变异后效果变得更好就将变异后的结果赋值给anchors，
# 如果变异后效果变差就跳过，默认变异1000次。 ps:这里使用anchor_fitness方法计算得到的fitness(适应度)进行对变异效果评估
# 7 将最终变异得到的anchors按照面积进行排序并返回
def main(img_size=512, n=9, thr=0.25, gen=1000,
         voc_root=r"E:\学习\机器学习\数据集\VOC2012"):
    """Cluster anchors with k-means and refine them by random mutation.

    Args:
        img_size: size the longer image side is scaled to before clustering.
        n: number of anchors to produce.
        thr: match threshold passed to ``anchor_fitness``.
        gen: number of mutation attempts.
        voc_root: directory that contains ``VOCdevkit`` (raw string so the
            backslashes are not read as escape sequences).

    Returns:
        Array of shape [n, 2] with the final anchors sorted by area, small to large.
    """
    # Read the size of every image and the relative sizes of its boxes.
    dataset = VOCDataSet(voc_root=voc_root, year="2012", txt_name="train.txt")
    im_wh, boxes_wh = dataset.get_info()

    # Scale every image so that its longer side equals img_size.
    im_wh = np.array(im_wh, dtype=np.float32)
    shapes = img_size * im_wh / im_wh.max(1, keepdims=True)
    # Relative box sizes -> pixel sizes in the rescaled image, stacked to [N, 2].
    wh0 = np.concatenate([boxes * shape for shape, boxes in zip(shapes, boxes_wh)])  # wh

    # Filter out tiny boxes.
    tiny = (wh0 < 3.0).any(1).sum()
    if tiny:
        print(f'WARNING: Extremely small objects found. {tiny} of {len(wh0)} labels are < 3 pixels in size.')
    # NOTE(review): `.any(1)` keeps a box when at least one side is >= 2 px,
    # while the step list above this function says both sides - confirm intent.
    wh = wh0[(wh0 >= 2.0).any(1)]

    # Kmeans calculation
    k = k_means(wh, n)

    # Sort by area, small to large.
    k = k[np.argsort(k.prod(1))]
    f, bpr = anchor_fitness(k, wh, thr)
    print("kmeans: " + " ".join([f"[{int(a[0])}, {int(a[1])}]" for a in k]))
    print(f"fitness: {f:.5f}, best possible recall: {bpr:.5f}")

    # Evolve: mutate the k-means result and keep only improvements.
    npr = np.random
    sh, mp, s = k.shape, 0.9, 0.1  # anchor array shape, mutation prob, sigma
    pbar = tqdm(range(gen), desc=f'Evolving anchors with Genetic Algorithm:')  # progress bar
    for _ in pbar:
        # v has the anchors' shape and values close to 1; it perturbs each side slightly.
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)  # min=0.3 / max=3.0
        kg = (k.copy() * v).clip(min=2.0)
        # Re-evaluate; adopt the candidate only when its fitness is higher.
        fg, bpr_g = anchor_fitness(kg, wh, thr)
        if fg > f:
            # Keep the recall together with the accepted anchors so the final
            # report describes `k`, not whichever candidate was tried last.
            f, bpr, k = fg, bpr_g, kg.copy()
            pbar.desc = f'Evolving anchors with Genetic Algorithm: fitness = {f:.4f}'

    # Sort by area, small to large.
    k = k[np.argsort(k.prod(1))]
    print("genetic: " + " ".join([f"[{int(a[0])}, {int(a[1])}]" for a in k]))
    print(f"fitness: {f:.5f}, best possible recall: {bpr:.5f}")
    return k


# Run the anchor clustering only when this file is executed as a script.
if __name__ == "__main__":
    main()