RDD2022 是一个关于道路损伤的数据集,与 RDD2020 相比增加了两万多张图片。但是由于其标注格式不能被 YOLO 直接使用,且其中有大量图片没有符合要求的标注,特写此文章记录数据清洗与格式化的过程。
数据集下载
在开始前需要自己下载zip格式的RDD2022数据集,大小为12.4G 点击此处下载
之后,在桌面上新建一个名为my_file 的文件夹,将上面下载的压缩包放进去,将我的main.py放进去,文件夹结构如下
在 PyCharm 中运行 main.py 即可。运行完成后 my_file 的结构如下,其中的 my_data 就是你要的文件夹,其他的都没有用:
注意事项
注意:
- 如果运行过程中出现任何 bug 使得程序没有运行到底,需要删除所有自动生成的文件,回到最开始的目录结构,重新开始运行 main.py
- 如果报错说有缺了什么包,自己去安装即可
- 注意设置工作目录为你自己新建的那个my_file文件夹,一般情况下默认就是这个,如果报错找不到目录啥的就看下是不是这个问题
- 按照个人需求,以国家为单位对数据集进行了train:val=7:3的切割
源代码
main.py的代码如下:
import zipfile
import os
import os
import xml.etree.ElementTree as ET
from shutil import copyfile
import shutil
import argparse
from pathlib import Path
import random
from collections import defaultdict
import random
# Every path used by the pipeline is resolved relative to the directory the
# script is started from.
work_dir = os.getcwd()

# Names of the per-country sub-datasets processed by the pipeline.
countries = [
    "China_Drone",
    "China_MotorBike",
    "Czech",
    "India",
    "Japan",
    "Norway",
    "United_States",
]

# Damage classes that are kept; the index of a name in this list is the
# class id reported at the end of the run.
labels = [
    "D00",
    "D10",
    "D20",
    "D40",
]
def unzip_rdd2022():
    """Extract ``RDD2022.zip`` from the working directory into that directory.

    Raises:
        FileNotFoundError: if ``RDD2022.zip`` is not present in ``work_dir``.
        zipfile.BadZipFile: if the file is not a valid zip archive.
    """
    archive_path = os.path.join(work_dir, 'RDD2022.zip')
    # The context manager closes the archive even if extraction fails midway,
    # so a retry does not trip over a still-open file handle.
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(work_dir)
def unzip_RDD2022_all_countries():
    """Extract every zip archive found in ``RDD2022_all_countries``.

    Each archive is unpacked into the ``RDD2022_all_countries`` folder itself.
    Entries that are not zip archives (for example folders produced by an
    earlier, interrupted run) are skipped instead of aborting the whole step.
    """
    dir_path = os.path.join(work_dir, 'RDD2022_all_countries')
    # os.listdir returns a snapshot, so folders created while extracting are
    # not visited by this loop.
    for name in os.listdir(dir_path):
        archive_path = os.path.join(dir_path, name)
        if not zipfile.is_zipfile(archive_path):
            continue
        print('正在解压{}'.format(name))
        # Context manager guarantees the archive is closed on failure too.
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(dir_path)
        print('{}已解压完成'.format(name))
def remove_useless_file():
    """Keep only the training samples that carry at least one wanted label.

    For every entry of ``countries`` the XML files under
    ``<country>/train/annotations/xmls`` are read, every ``<object>`` whose
    ``<name>`` is not listed in ``labels`` is removed, and - if at least one
    object is left - the filtered XML is written to
    ``new_<country>/Annotations`` and the ``.jpg`` with the same stem is
    copied from ``<country>/train/images`` to ``new_<country>/JPEGImages``.
    Samples without any remaining object are dropped.
    """
    all_countries_dir = os.path.join(work_dir, 'RDD2022_all_countries')
    for country in countries:
        print("开始对 {} 的标签与图片进行操作".format(country))
        xml_dir = os.path.join(all_countries_dir, country, 'train', 'annotations', 'xmls')
        image_dir = os.path.join(all_countries_dir, country, 'train', 'images')
        new_country_dir = os.path.join(all_countries_dir, 'new_' + country)
        annotations_dir = os.path.join(new_country_dir, 'Annotations')
        jpegimages_dir = os.path.join(new_country_dir, 'JPEGImages')
        os.makedirs(annotations_dir, exist_ok=True)
        os.makedirs(jpegimages_dir, exist_ok=True)

        for anno_file in os.listdir(xml_dir):
            tree = ET.parse(os.path.join(xml_dir, anno_file))
            root = tree.getroot()
            # findall() returns a list, so removing children while looping
            # over it is safe. findtext() yields None for an object without a
            # <name>, which is then treated like any other unwanted label.
            for obj in root.findall("object"):
                if obj.findtext("name") not in labels:
                    root.remove(obj)
            if root.find("object") is None:
                continue

            tree.write(os.path.join(annotations_dir, anno_file))
            # splitext keeps dots inside the stem; split(".")[0] would cut
            # the name at the first dot.
            image_name = os.path.splitext(anno_file)[0] + ".jpg"
            copyfile(os.path.join(image_dir, image_name),
                     os.path.join(jpegimages_dir, image_name))
        print("{} 的标签与图片操作完毕".format(country))
def copy_file_2_new_train_dir():
    """Merge the filtered per-country files into one ``new_train`` folder.

    Copies everything from ``RDD2022_all_countries/new_<country>/JPEGImages``
    into ``new_train/JPEGImages`` and everything from
    ``.../new_<country>/Annotations`` into ``new_train/Annotations`` for each
    entry of ``countries``.
    """
    # Build paths with os.path.join: plain concatenation with work_dir (which
    # has no trailing separator) creates a stray "<work_dir>new_train" folder
    # next to the working directory.
    new_train_dir = os.path.join(work_dir, 'new_train')
    # generate_txt_file and xml2yolo address these folders as "Annotations"
    # and "JPEGImages"; using the same spelling keeps the pipeline working on
    # case-sensitive file systems.
    jpeg_path = os.path.join(new_train_dir, 'JPEGImages')
    annotation_path = os.path.join(new_train_dir, 'Annotations')
    os.makedirs(jpeg_path, exist_ok=True)
    os.makedirs(annotation_path, exist_ok=True)

    all_countries_dir = os.path.join(work_dir, 'RDD2022_all_countries')
    for country in countries:
        print("{}正在复制".format(country))
        country_dir = os.path.join(all_countries_dir, 'new_' + country)
        for sub_dir, target in (('JPEGImages', jpeg_path), ('Annotations', annotation_path)):
            source_dir = os.path.join(country_dir, sub_dir)
            for name in os.listdir(source_dir):
                shutil.copy(os.path.join(source_dir, name), target)
        print("{}复制完毕".format(country))
def generate_txt_file():
    """Write the list of annotation files that ``xml2yolo`` reads.

    Every file name found in ``new_train/Annotations`` is written, as a full
    path and one per line, to ``./xml2yolo_damage.txt`` (relative to the
    current working directory).
    """
    annotation_dir = os.path.join(work_dir, "new_train", "Annotations")
    # List the folder before opening the output so a missing folder does not
    # leave an empty list file behind.
    anno_files = os.listdir(annotation_dir)
    # "with" closes the list file even if a write fails.
    with open("./xml2yolo_damage.txt", "w") as yolo_file:
        yolo_file.writelines(
            os.path.join(annotation_dir, name) + "\n" for name in anno_files
        )
def xml2yolo():
    """Convert the XML annotations listed in a text file into YOLO labels.

    The list file (default ``./xml2yolo_damage.txt``, overridable with the
    ``--input-file`` command line option) holds one annotation path per line.
    For each XML file a ``.txt`` with the same stem is written into the
    sibling ``labels`` folder (the path with "Annotations" replaced by
    "labels"). Every kept object becomes one line
    ``<class id> <x center> <y center> <width> <height>`` where the four
    numbers are divided by the width/height of the matching image, which is
    looked up under "JPEGImages". Finally the number of images per class is
    printed.

    Raises:
        FileNotFoundError: if a listed annotation file does not exist.
    """
    # Pillow is third-party and only this step needs it, so it is imported
    # here rather than at module level.
    from PIL import Image

    image_types = ["jpeg", "png", "jpg", "JPEG", "JPG", "PNG"]
    class_ids = {'D00': 0, 'D10': 1, 'D20': 2, 'D40': 3}
    class_names = {class_id: name for name, class_id in class_ids.items()}
    images_per_class = defaultdict(set)

    def convert(size, box):
        """Scale a pixel box ``(min_x, max_x, min_y, max_y)`` to ``[x, y, w, h]``.

        ``size`` is ``(image width, image height)``; the result is the box
        centre and extent divided by the image dimensions.
        """
        dw = 1. / size[0]
        dh = 1. / size[1]
        x_center = (box[0] + box[1]) / 2.0
        y_center = (box[2] + box[3]) / 2.0
        box_w = box[1] - box[0]
        box_h = box[3] - box[2]
        return [x_center * dw, y_center * dh, box_w * dw, box_h * dh]

    def read_xml_list(list_path):
        """Return absolute XML paths from the list file, creating label dirs."""
        with open(list_path, "r") as f:
            # Blank lines (e.g. a trailing newline) carry no path; skip them.
            entries = [line.strip() for line in f if line.strip()]
        xml_paths = []
        for entry in entries:
            stem = entry.rsplit('.', 1)[0]
            xml_path = os.path.abspath(stem.replace("JPEGImages", "Annotations") + ".xml")
            labels_dir = os.path.dirname(xml_path).replace("Annotations", "labels")
            os.makedirs(labels_dir, exist_ok=True)
            # An explicit exception instead of assert: asserts are stripped
            # when Python runs with -O.
            if not os.path.exists(xml_path):
                raise FileNotFoundError(xml_path)
            xml_paths.append(xml_path)
        return xml_paths

    def find_image(xml_path):
        """Return the image belonging to ``xml_path`` or None if there is none."""
        image_stem = xml_path[:-4].replace("Annotations", "JPEGImages")
        # Try every known extension, not just the first three.
        for extension in image_types:
            candidate = image_stem + "." + extension
            if os.path.isfile(candidate):
                return candidate
        return None

    def main():
        parser = argparse.ArgumentParser(description='run phase2.')
        parser.add_argument('--input-file', type=str,
                            help='location to the list of images/xml files(absolute path). sample file at "./xml2yolo_damagee.txt"',
                            default='./xml2yolo_damage.txt')
        args = parser.parse_args()

        xml_paths = read_xml_list(args.input_file)
        print("total files:", len(xml_paths))
        print('正在转换......')
        for xml_path in xml_paths:
            root = ET.parse(xml_path).getroot()
            image_file = find_image(xml_path)
            if image_file is None:
                print("File not found:", xml_path[:-4].replace("Annotations", "JPEGImages") + ".*")
                continue

            # The image size is the same for every object of this file, so
            # read it once instead of reopening the image per object.
            with Image.open(image_file) as image:
                image_size = (int(image.size[0]), int(image.size[1]))

            label_path = xml_path[:-4].replace("Annotations", "labels") + ".txt"
            with open(label_path, "w") as label_file:
                for obj in root.findall('object'):
                    surface_type = obj.find('name').text.replace(" ", "")
                    # Skips D30 as before, and any other label without a
                    # class id instead of failing with KeyError.
                    if surface_type not in class_ids:
                        continue
                    bndbox = obj.find('bndbox')
                    # Look coordinates up by tag so the result does not
                    # depend on the order of the children of <bndbox>.
                    min_x, min_y, max_x, max_y = (
                        int(float(bndbox.find(tag).text))
                        for tag in ('xmin', 'ymin', 'xmax', 'ymax')
                    )
                    x, y, w, h = convert(image_size, (min_x, max_x, min_y, max_y))
                    class_id = class_ids[surface_type]
                    label_file.write(
                        str(class_id) + " " + str(x) + " " + str(y) + " " + str(w) + " " + str(h) + "\n")
                    images_per_class[class_id].add(image_file)

        for class_id in images_per_class:
            print(class_names[class_id], ":", len(images_per_class[class_id]))

    main()
def generate_my_data():
    """Split ``new_train`` into the final ``my_data`` folder (train:val = 7:3).

    Creates ``my_data/images/{train,val}`` and ``my_data/labels/{train,val}``
    inside ``work_dir``. Images are grouped by the part of the file name
    before the last underscore; for each entry of ``countries`` a random 30%
    (rounded up) of its images goes to ``val`` and the rest to ``train``.
    The ``.txt`` label with the same stem is copied from ``new_train/labels``
    into the matching labels folder. Progress and the label-to-id mapping are
    printed.
    """
    # os.path.join instead of work_dir + 'my_data/': work_dir has no trailing
    # separator, so concatenation creates a stray "<work_dir>my_data" folder.
    my_data_dir = os.path.join(work_dir, 'my_data')
    images_path = os.path.join(my_data_dir, 'images')
    labels_path = os.path.join(my_data_dir, 'labels')
    for parent in (images_path, labels_path):
        for split in ('train', 'val'):
            os.makedirs(os.path.join(parent, split), exist_ok=True)
    print("最终my_data文件夹基本结构创建完毕")

    new_train_path = os.path.join(work_dir, 'new_train')
    # Accept both spellings: xml2yolo addresses the image folder as
    # "JPEGImages", but it may have been created as "jpegimages".
    jpeg_dir_path = os.path.join(new_train_path, 'JPEGImages')
    if not os.path.isdir(jpeg_dir_path):
        jpeg_dir_path = os.path.join(new_train_path, 'jpegimages')
    labels_dir_path = os.path.join(new_train_path, 'labels')

    all_countries_images = defaultdict(list)
    for name in os.listdir(jpeg_dir_path):
        country_name = '_'.join(name.split('_')[:-1])
        all_countries_images[country_name].append(name)

    images_len = sum(len(names) for names in all_countries_images.values())
    print("一共有{}张图片".format(images_len))
    for country_name, names in all_countries_images.items():
        print("{} 一共有 {}张图片".format(country_name, len(names)))
    print('*************************')
    print("开始切分数据集")

    for country in countries:
        country_images = all_countries_images.get(country, [])
        image_len = len(country_images)
        train_nums = int(image_len * 0.7)
        val_nums = image_len - train_nums
        print("{}一共{}张图片,训练集7/10一共是{}张,测试集3/10一共是{}张,正在切割".format(country, image_len, train_nums, val_nums))
        # A set makes the per-image membership test O(1) instead of O(n).
        val_index = set(random.sample(range(image_len), val_nums))
        for idx, name in enumerate(country_images):
            split = 'val' if idx in val_index else 'train'
            shutil.copy(os.path.join(jpeg_dir_path, name),
                        os.path.join(images_path, split))
            # splitext keeps dots inside the stem intact.
            label_name = os.path.splitext(name)[0] + '.txt'
            shutil.copy(os.path.join(labels_dir_path, label_name),
                        os.path.join(labels_path, split))

    all_train_len = len(os.listdir(os.path.join(images_path, 'train')))
    all_val_len = len(os.listdir(os.path.join(images_path, 'val')))
    print("所有数据切分完毕,训练集一共{}条,验证集一共{}条".format(all_train_len, all_val_len))
    print("\n\n\n\n*************************")
    print("完成,目标文件夹就是my_data, 其他的文件都可以删除")
    print("注意:一共4种损伤类型,4种类型的名称以及对应的编号为")
    for idx, label in enumerate(labels):
        print("{}: {}".format(label, idx))
if __name__ == '__main__':
    # Pipeline stages in execution order, each paired with the progress
    # message printed right before the stage runs.
    pipeline = (
        ("正在解压12.4G大的最外面的压缩包", unzip_rdd2022),
        ("正在解压6个国家的压缩包", unzip_RDD2022_all_countries),
        ("对图片进行去除清洗操作", remove_useless_file),
        ("正在将所有的图片以及标签复制到统一的目录下", copy_file_2_new_train_dir),
        ("正在生成用于标注转换的txt文件", generate_txt_file),
        ("正在转换标签", xml2yolo),
        ('正在生成最终文件夹', generate_my_data),
    )
    for message, stage in pipeline:
        print(message)
        stage()
|