My blog: https://blog.justlovesmile.top/
Object detection is an important research direction in computer vision; it addresses the problem of detecting instances of specific classes of visual objects in digital images. As one of the fundamental problems of computer vision, object detection is the basis and prerequisite of many other vision tasks, such as image captioning, instance segmentation, and object tracking. When working on such problems, we often have to build datasets with our own scripts or with annotation tools, and the resulting formats vary widely. For better training compatibility, most object detection frameworks therefore support a few common annotation formats out of the box, the most common being COCO, Pascal VOC, and YOLO. This article introduces these dataset formats together with my Python conversion scripts (which usually need small adjustments for your own data).
1. COCO
1.1 COCO Dataset Format
The COCO (Common Objects in Context) dataset is a large-scale dataset for object detection, image segmentation, and image captioning, and its annotation format is one of the most widely used. The COCO2017 release is currently the most popular. The official website is COCO - Common Objects in Context (cocodataset.org).
A COCO dataset mainly consists of images (jpg, png, etc.) and annotation files (json). Its directory layout is as follows (/ denotes a folder):
-coco/
  |-train2017/
    |-1.jpg
    |-2.jpg
  |-val2017/
    |-3.jpg
    |-4.jpg
  |-test2017/
    |-5.jpg
    |-6.jpg
  |-annotations/
    |-instances_train2017.json
    |-instances_val2017.json
    |-*.json
The train2017 and val2017 folders hold the training and validation images respectively, while test2017 holds the test set, which may contain only images or images plus annotations and is generally used on its own.
The files in the annotations folder are the annotation files. If you only have xml files, they usually need to be converted into json, whose structure is as follows (see the official website for more details):
{
    "info": info,
    "images": [image],
    "annotations": [annotation],
    "categories": [category],
    "licenses": [license],
}
The info field holds information about the whole dataset, such as the year, version, and description; it is not really important if all you want to do is train a model:
info{
    "year": int,
    "version": str,
    "description": str,
    "contributor": str,
    "url": str,
    "date_created": datetime,
}
Each image entry holds basic information about one image, including its id, width, height, and file name. The id must match the image_id that the entries in annotations refer to, as shown below:
image{
    "id": int,
    "width": int,
    "height": int,
    "file_name": str,
    "license": int,
    "flickr_url": str,
    "coco_url": str,
    "date_captured": datetime,
}
Each annotation entry is the most important part of the file: it contains the annotation id, the id of the image it belongs to, the category id, and so on. The bbox is [x, y, width, height] in absolute pixels, where (x, y) is the top-left corner of the box:
annotation{
    "id": int,
    "image_id": int,
    "category_id": int,
    "segmentation": RLE or [polygon],
    "area": float,
    "bbox": [x,y,width,height],
    "iscrowd": 0 or 1,
}
Each category entry describes one class, including its supercategory, id, and name:
category{
    "id": int,
    "name": str,
    "supercategory": str,
}
Each license entry describes a license covering images in the dataset, including its id, name, and URL:
license{
    "id": int,
    "name": str,
    "url": str,
}
Next, let's look at a simple example:
{
    "info": { ... },
    "images": [
        {"id": 1, "file_name": "1.jpg", "height": 334, "width": 500},
        {"id": 2, "file_name": "2.jpg", "height": 445, "width": 556}
    ],
    "annotations": [
        {"id": 1, "area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 3, "segmentation": []},
        {"id": 2, "area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 2, "segmentation": []},
        {"id": 3, "area": 40448, "iscrowd": 0, "image_id": 2, "bbox": [246, 61, 128, 316], "category_id": 1, "segmentation": []}
    ],
    "categories": [
        {"supercategory": "none", "id": 1, "name": "liner"},
        {"supercategory": "none", "id": 2, "name": "containership"},
        {"supercategory": "none", "id": 3, "name": "bulkcarrier"}
    ],
    "licenses": [{ ... }]
}
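To get a feel for this structure, you can load such a json file with Python's standard json module. The sketch below is just a quick sanity check (the file path is an assumption); it prints the entry counts and resolves each annotation back to its image file name:

import json

# Minimal sanity check of a COCO-style annotation file (path is an assumption)
with open("annotations/instances_train2017.json") as f:
    coco = json.load(f)

print(len(coco["images"]), "images,", len(coco["annotations"]), "annotations,", len(coco["categories"]), "categories")

# Map image id -> file name so each annotation can be traced back to its image
id2name = {img["id"]: img["file_name"] for img in coco["images"]}
for ann in coco["annotations"][:3]:
    x, y, w, h = ann["bbox"]
    print(id2name[ann["image_id"]], ann["category_id"], [x, y, w, h])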
1.2 COCO Conversion Script
The Python conversion script is shown below; it expects the images and the xml annotation files:
import os, random, json
import shutil as sh
from tqdm.auto import tqdm
import xml.etree.ElementTree as xmlET


def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    else:
        print(f"The path ({path}) already exists.")
        return False


def readxml(file):
    # Parse one VOC-style xml file and return [[width, height], [[xmin, ymin, xmax, ymax, label], ...]]
    tree = xmlET.parse(file)
    size = tree.find('size')
    width = int(size.find('width').text)
    height = int(size.find('height').text)
    objs = tree.findall('object')
    bndbox = []
    for obj in objs:
        label = obj.find("name").text
        bnd = obj.find("bndbox")
        xmin = int(bnd.find("xmin").text)
        ymin = int(bnd.find("ymin").text)
        xmax = int(bnd.find("xmax").text)
        ymax = int(bnd.find("ymax").text)
        bbox = [xmin, ymin, xmax, ymax, label]
        bndbox.append(bbox)
    return [[width, height], bndbox]


def tococo(xml_root, image_root, output_root, classes={}, errorId=[], train_percent=0.9):
    assert train_percent <= 1 and len(classes) > 0
    train_root = os.path.join(output_root, "train2017")
    val_root = os.path.join(output_root, "val2017")
    ann_root = os.path.join(output_root, "annotations")
    train_content = {
        "images": [],
        "annotations": [],
        "categories": []
    }
    val_content = {
        "images": [],
        "annotations": [],
        "categories": []
    }
    train_json = 'instances_train2017.json'
    val_json = 'instances_val2017.json'
    # Randomly pick train_percent of the images as the training set
    images = os.listdir(image_root)
    total_num = len(images)
    train_num = int(total_num * train_percent)
    train_file = sorted(random.sample(images, train_num))
    if mkdir(output_root):
        if mkdir(train_root) and mkdir(val_root) and mkdir(ann_root):
            idx1, idx2, dx1, dx2 = 0, 0, 0, 0  # image ids and annotation ids
            for file in tqdm(images):
                name = os.path.splitext(os.path.basename(file))[0]
                if name not in errorId:
                    res = readxml(os.path.join(xml_root, name + '.xml'))
                    if file in train_file:
                        idx1 += 1
                        sh.copy(os.path.join(image_root, file), train_root)
                        train_content['images'].append(
                            {"file_name": file, "width": res[0][0], "height": res[0][1], "id": idx1})
                        for b in res[1]:
                            dx1 += 1
                            x = b[0]
                            y = b[1]
                            w = b[2] - b[0]
                            h = b[3] - b[1]
                            train_content['annotations'].append(
                                {"area": w * h, "iscrowd": 0, "image_id": idx1, "bbox": [x, y, w, h],
                                 "category_id": classes[b[4]], "id": dx1, "segmentation": []})
                    else:
                        idx2 += 1
                        sh.copy(os.path.join(image_root, file), val_root)
                        val_content['images'].append(
                            {"file_name": file, "width": res[0][0], "height": res[0][1], "id": idx2})
                        for b in res[1]:
                            dx2 += 1
                            x = b[0]
                            y = b[1]
                            w = b[2] - b[0]
                            h = b[3] - b[1]
                            val_content['annotations'].append(
                                {"area": w * h, "iscrowd": 0, "image_id": idx2, "bbox": [x, y, w, h],
                                 "category_id": classes[b[4]], "id": dx2, "segmentation": []})
            for i, j in classes.items():
                train_content['categories'].append({"supercategory": "none", "id": j, "name": i})
                val_content['categories'].append({"supercategory": "none", "id": j, "name": i})
            with open(os.path.join(ann_root, train_json), 'w') as f:
                json.dump(train_content, f)
            with open(os.path.join(ann_root, val_json), 'w') as f:
                json.dump(val_content, f)
            print("Number of Train Images:", len(os.listdir(train_root)))
            print("Number of Val Images:", len(os.listdir(val_root)))


def test():
    box_root = "E:/MyProject/Dataset/hwtest/annotations"
    image_root = "E:/MyProject/Dataset/hwtest/images"
    output_root = "E:/MyProject/Dataset/coco"
    classes = {"liner": 0, "bulk carrier": 1, "warship": 2, "sailboat": 3, "canoe": 4,
               "container ship": 5, "fishing boat": 6}
    errorId = []
    train_percent = 0.9
    tococo(box_root, image_root, output_root, classes=classes, errorId=errorId, train_percent=train_percent)


if __name__ == "__main__":
    test()
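If pycocotools is installed, the generated annotation file can also be loaded with its COCO API as a quick consistency check (a minimal sketch; the path below assumes the output_root used in test()):

from pycocotools.coco import COCO

coco = COCO("E:/MyProject/Dataset/coco/annotations/instances_train2017.json")  # assumed output path
print("images:", len(coco.getImgIds()))
print("annotations:", len(coco.getAnnIds()))
print("categories:", coco.loadCats(coco.getCatIds()))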
2. VOC
2.1 VOC Dataset Format
The VOC (Visual Object Classes) dataset comes from the PASCAL VOC challenge, whose main tasks are Object Classification, Object Detection, Object Segmentation, Human Layout, and Action Classification. The official website is The PASCAL Visual Object Classes Homepage (ox.ac.uk). The main releases are VOC2007 and VOC2012.
A VOC dataset mainly consists of images (jpg, png, etc.) and annotation files (xml). Its directory layout is as follows (/ denotes a folder):
-VOC/
  |-JPEGImages/
    |-1.jpg
    |-2.jpg
  |-Annotations/
    |-1.xml
    |-2.xml
  |-ImageSets/
    |-Layout/
      |-*.txt
    |-Main/
      |-train.txt
      |-val.txt
      |-trainval.txt
      |-test.txt
    |-Segmentation/
      |-*.txt
    |-Action/
      |-*.txt
  |-SegmentationClass/
  |-SegmentationObject/
For object detection, the most commonly used and required folders are JPEGImages, Annotations, and ImageSets/Main.
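The txt files under ImageSets/Main simply list image ids (file names without the extension), one per line, for example:

000032
000033
000042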
JPEGImages holds the images and Annotations holds the xml annotation files, whose contents look like this:
<annotation>
    <folder>VOC</folder>  # folder the image belongs to
    <filename>000032.jpg</filename>  # image file name
    <source>  # source of the image
        <database>The VOC Database</database>
        <annotation>PASCAL VOC</annotation>
        <image>flickr</image>
    </source>
    <size>  # image size information
        <width>500</width>  # image width
        <height>281</height>  # image height
        <depth>3</depth>  # number of channels
    </size>
    <segmented>0</segmented>  # whether the image is used for segmentation; 0 means no, which does not matter for detection
    <object>  # information about one object
        <name>aeroplane</name>  # class name of the object
        <pose>Frontal</pose>  # viewpoint; usually Unspecified if not given
        <truncated>0</truncated>  # whether the object is truncated; 0 means complete, not truncated
        <difficult>0</difficult>  # whether the object is hard to recognize; 0 means not difficult
        <bndbox>  # bounding box information
            <xmin>104</xmin>  # top-left x
            <ymin>78</ymin>  # top-left y
            <xmax>375</xmax>  # bottom-right x
            <ymax>183</ymax>  # bottom-right y
        </bndbox>
    </object>
    # information about the remaining objects, omitted here
    <object>
        ... other object entries, omitted here ...
    </object>
</annotation>
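All of the conversion scripts in this post take a classes dict that maps the label names found in the <name> field to integer ids. If you are not sure which names appear in your xml files, a small sketch like the following can collect them (the folder path is an assumption; assigning ids in sorted order is just one possible choice):

import os
import xml.etree.ElementTree as ET

xml_root = "E:/MyProject/Dataset/hwtest/annotations"  # assumed path
names = set()
for fn in os.listdir(xml_root):
    if fn.endswith(".xml"):
        tree = ET.parse(os.path.join(xml_root, fn))
        for obj in tree.findall("object"):
            names.add(obj.find("name").text)

# Assign ids in sorted order; adjust manually if a fixed order is required
classes = {name: i for i, name in enumerate(sorted(names))}
print(classes)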
2.2 VOC Conversion Script
The script below only covers the case where you already have images and xml files; a COCO-to-VOC converter may be added later if needed:
import os, random
from tqdm.auto import tqdm
import shutil as sh


def mkdir(path):
    if not os.path.exists(path):
        os.mkdir(path)
        return True
    else:
        print(f"The path ({path}) already exists.")
        return False


def tovoc(xmlroot, imgroot, saveroot, errorId=[], classes={}, tvp=1.0, trp=0.9):
    '''
    Arguments:
        xmlroot: folder containing the xml annotation files
        imgroot: folder containing the images
        saveroot: root directory where the VOC-format dataset is saved
    Function:
        Load the data and save it in VOC format:
        VOC/
            Annotations/
                - **.xml
            JPEGImages/
                - **.jpg
            ImageSets/
                Main/
                    - train.txt
                    - test.txt
                    - val.txt
                    - trainval.txt
    '''
    assert len(classes) > 0
    VOC = saveroot
    ann_path = os.path.join(VOC, 'Annotations')
    img_path = os.path.join(VOC, 'JPEGImages')
    set_path = os.path.join(VOC, 'ImageSets')
    txt_path = os.path.join(set_path, 'Main')
    if mkdir(VOC):
        if mkdir(ann_path) and mkdir(img_path) and mkdir(set_path):
            mkdir(txt_path)
            images = os.listdir(imgroot)
            list_index = range(len(images))
            trainval_percent = tvp
            train_percent = trp
            val_percent = 1 - train_percent if train_percent < 1 else 0.1
            total_num = len(images)
            trainval_num = int(total_num * trainval_percent)
            train_num = int(trainval_num * train_percent)
            val_num = int(trainval_num * val_percent)
            trainval = random.sample(list_index, trainval_num)
            # train and val are drawn from trainval; when train_percent == 1,
            # val is an overlapping subsample of the training ids
            train = random.sample(trainval, train_num)
            val = random.sample(trainval, val_num)
            for i in tqdm(list_index):
                imgfile = images[i]
                img_id = os.path.splitext(os.path.basename(imgfile))[0]
                xmlfile = img_id + ".xml"
                sh.copy(os.path.join(imgroot, imgfile), os.path.join(img_path, imgfile))
                sh.copy(os.path.join(xmlroot, xmlfile), os.path.join(ann_path, xmlfile))
                if img_id not in errorId:
                    if i in trainval:
                        with open(os.path.join(txt_path, 'trainval.txt'), 'a') as f:
                            f.write(img_id + '\n')
                        if i in train:
                            with open(os.path.join(txt_path, 'train.txt'), 'a') as f:
                                f.write(img_id + '\n')
                        else:
                            with open(os.path.join(txt_path, 'val.txt'), 'a') as f:
                                f.write(img_id + '\n')
                        if train_percent == 1 and i in val:
                            with open(os.path.join(txt_path, 'val.txt'), 'a') as f:
                                f.write(img_id + '\n')
                    else:
                        with open(os.path.join(txt_path, 'test.txt'), 'a') as f:
                            f.write(img_id + '\n')
            print("Dataset to VOC format finished!")


def test():
    box_root = "E:/MyProject/Dataset/hwtest/annotations"
    image_root = "E:/MyProject/Dataset/hwtest/images"
    output_root = "E:/MyProject/Dataset/voc"
    classes = {"liner": 0, "bulk carrier": 1, "warship": 2, "sailboat": 3, "canoe": 4,
               "container ship": 5, "fishing boat": 6}
    errorId = []
    train_percent = 0.9
    tovoc(box_root, image_root, output_root, errorId, classes, trp=train_percent)


if __name__ == "__main__":
    test()
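As a quick check after conversion, you can count how many ids ended up in each split file (a minimal sketch; the path assumes the output_root used in test()):

import os

main_dir = "E:/MyProject/Dataset/voc/ImageSets/Main"  # assumed output path
for name in ["trainval.txt", "train.txt", "val.txt", "test.txt"]:
    p = os.path.join(main_dir, name)
    if os.path.exists(p):
        with open(p) as f:
            print(name, len(f.read().splitlines()))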
3. YOLO
3.1 YOLO Dataset Format
The YOLO dataset format exists mainly for training YOLO models. There is no fixed requirement on the directory layout, since data loading can be configured through the model's configuration files. The only thing to note is that YOLO labels store the box position normalized by the image width and height, one object per line, in the following form:
{class id} {normalized x of the box center} {normalized y of the box center} {normalized box width w} {normalized box height h}
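For example, for a 500×400 image containing a class-2 object with corners (xmin, ymin, xmax, ymax) = (100, 50, 300, 250): the box center is (200, 150) and its size is 200×200, so after dividing by the image width and height the label line is:

2 0.4 0.375 0.4 0.5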
3.2 YOLO Conversion Script
The Python conversion script is shown below:
import os
import random
from tqdm.auto import tqdm
import shutil as sh
try:
    import xml.etree.cElementTree as et
except ImportError:
    import xml.etree.ElementTree as et


def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    else:
        print(f"The path ({path}) already exists.")
        return False


def xml2yolo(xmlpath, savepath, classes={}):
    # Convert one VOC-style xml file into a YOLO label file:
    # one "class_id x_center y_center width height" line per object, all normalized
    namemap = classes
    rt = et.parse(xmlpath).getroot()
    w = int(rt.find("size").find("width").text)
    h = int(rt.find("size").find("height").text)
    with open(savepath, "w") as f:
        for obj in rt.findall("object"):
            name = obj.find("name").text
            xmin = int(obj.find("bndbox").find("xmin").text)
            ymin = int(obj.find("bndbox").find("ymin").text)
            xmax = int(obj.find("bndbox").find("xmax").text)
            ymax = int(obj.find("bndbox").find("ymax").text)
            f.write(
                f"{namemap[name]} {(xmin+xmax)/w/2.} {(ymin+ymax)/h/2.} {(xmax-xmin)/w} {(ymax-ymin)/h}"
                + "\n"
            )


def trainval(xmlroot, imgroot, saveroot, errorId=[], classes={}, tvp=1.0, trp=0.9):
    assert tvp <= 1.0 and trp <= 1.0 and len(classes) > 0
    imglabel = ['images', 'labels']
    trainvaltest = ['train', 'val', 'test']
    mkdir(saveroot)
    for r in imglabel:
        mkdir(os.path.join(saveroot, r))
        for s in trainvaltest:
            mkdir(os.path.join(saveroot, r, s))
    trainval_percent = tvp
    train_percent = trp
    # When train_percent == 1.0, val is an overlapping subsample of the training set
    val_percent = 1 - train_percent if train_percent < 1.0 else 0.15
    total_img = os.listdir(imgroot)
    num = len(total_img)
    list_index = range(num)
    tv = int(num * trainval_percent)
    tr = int(tv * train_percent)
    va = int(tv * val_percent)
    trainval = random.sample(list_index, tv)
    train = random.sample(trainval, tr)
    val = random.sample(trainval, va)
    print(f"trainval_percent:{trainval_percent},train_percent:{train_percent},val_percent:{val_percent}")
    for i in tqdm(list_index):
        name = total_img[i]
        op = os.path.join(imgroot, name)
        file_id = os.path.splitext(os.path.basename(name))[0]
        if file_id not in errorId:
            xmlp = os.path.join(xmlroot, file_id + '.xml')
            if i in trainval:
                if i in train:
                    sp = os.path.join(saveroot, "images", "train", name)
                    xml2yolo(xmlp, os.path.join(saveroot, "labels", "train", file_id + '.txt'), classes)
                    sh.copy(op, sp)
                else:
                    sp = os.path.join(saveroot, "images", "val", name)
                    xml2yolo(xmlp, os.path.join(saveroot, "labels", "val", file_id + '.txt'), classes)
                    sh.copy(op, sp)
                if train_percent == 1.0 and i in val:
                    sp = os.path.join(saveroot, "images", "val", name)
                    xml2yolo(xmlp, os.path.join(saveroot, "labels", "val", file_id + '.txt'), classes)
                    sh.copy(op, sp)
            else:
                sp = os.path.join(saveroot, "images", "test", name)
                xml2yolo(xmlp, os.path.join(saveroot, "labels", "test", file_id + '.txt'), classes)
                sh.copy(op, sp)


def maketxt(dir, saveroot, filename):
    # Write the path of every image in `dir` into a txt list, one path per line
    savetxt = os.path.join(saveroot, filename)
    with open(savetxt, 'w') as f:
        for i in tqdm(os.listdir(dir)):
            f.write(os.path.join(dir, i) + '\n')


def toyolo(xmlroot, imgroot, saveroot, errorId=[], classes={}, tvp=1, train_percent=0.9):
    trainval(xmlroot, imgroot, saveroot, errorId, classes, tvp, train_percent)
    maketxt(os.path.join(saveroot, "images", "train"), saveroot, "train.txt")
    maketxt(os.path.join(saveroot, "images", "val"), saveroot, "val.txt")
    maketxt(os.path.join(saveroot, "images", "test"), saveroot, "test.txt")
    print("Dataset to yolo format success.")


def test():
    box_root = "E:/MyProject/Dataset/hwtest/annotations"
    image_root = "E:/MyProject/Dataset/hwtest/images"
    output_root = "E:/MyProject/Dataset/yolo"
    classes = {"liner": 0, "bulk carrier": 1, "warship": 2, "sailboat": 3, "canoe": 4,
               "container ship": 5, "fishing boat": 6}
    errorId = []
    train_percent = 0.9
    toyolo(box_root, image_root, output_root, errorId, classes, train_percent=train_percent)


if __name__ == "__main__":
    test()
Running this script will produce the following contents in the output folder:
-yolo/
  |-images/
    |-train/
      |-1.jpg
      |-2.jpg
    |-test/
      |-3.jpg
      |-4.jpg
    |-val/
      |-5.jpg
      |-6.jpg
  |-labels/
    |-train/
      |-1.txt
      |-2.txt
    |-test/
      |-3.txt
      |-4.txt
    |-val/
      |-5.txt
      |-6.txt
  |-train.txt
  |-test.txt
  |-val.txt