[Python知识库] RDD2022 数据格式转换与清洗

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> Python知识库 -> RDD2022 数据格式转换与清洗 -> 正文阅读

[Python知识库]RDD2022 数据格式转换与清洗

Rdd2022数据集，是关于道路损伤的数据集，与rdd2020相比增加了两万多张图片
但是由于格式不能被yolo直接使用且其中有大量的图片没有符合要求的标注，特写此文章记录数据清洗与格式化的过程

数据集下载

在开始前需要自己下载zip格式的RDD2022数据集，大小为12.4G 点击此处下载

之后，在桌面上新建一个名为 my_file 的文件夹，将上面下载的压缩包（文件名必须是 RDD2022.zip，脚本按这个名字查找）和我的 main.py 一起放进去，文件夹结构如下：

在这里插入图片描述
在 PyCharm 中运行 main.py 文件即可
运行完成后 my_file 结构如下，其中的 my_data 就是你要的文件夹，其他的都没有用，可以删除：

注意事项

注意：

如果运行过程中出现任何 bug 使得程序没有运行到底，需要删除所有自动生成的文件，回到最开始的目录结构，重新开始运行 main.py 文件
如果报错说有缺了什么包，自己去安装即可
注意设置工作目录为你自己新建的那个my_file文件夹，一般情况下默认就是这个，如果报错找不到目录啥的就看下是不是这个问题
按照个人需求，以国家为单位对数据集进行了train:val=7:3的切割

源代码

main.py的代码如下:

import zipfile
import os
import os
import xml.etree.ElementTree as ET
from shutil import copyfile
import shutil
import argparse
from pathlib import Path
import random
from collections import defaultdict
import random

# Directory the script was started from, captured once at import time; the
# functions below build their input and output paths from it.
work_dir = os.getcwd()
# Sub-dataset names. Each one is used as a folder name under
# RDD2022_all_countries and as the grouping key for the train/val split.
countries = ["China_Drone", "China_MotorBike", "Czech", "India", "Japan", "Norway", "United_States"]
# Damage classes that are kept; objects with any other class name are dropped
# during cleaning. The list index is printed as the class number at the end.
labels = ["D00", "D10", "D20", "D40"]


def unzip_rdd2022():
    """Extract ``RDD2022.zip`` (located in the working directory) in place.

    All members of the archive are extracted into ``work_dir``. According to
    the original author's note the archive unpacks to a folder named
    ``RDD2022_all_countries``.

    Raises:
        FileNotFoundError: if ``RDD2022.zip`` is not in the working directory.
        zipfile.BadZipFile: if the file is not a valid zip archive.
    """
    path = os.path.join(work_dir, 'RDD2022.zip')
    # The context manager closes the archive even when extraction fails
    # part-way (for example when the disk runs full).
    with zipfile.ZipFile(path) as zip_file:
        zip_file.extractall(work_dir)


def unzip_RDD2022_all_countries():
    """Extract every zip archive found directly inside ``RDD2022_all_countries``.

    Each archive is extracted into the ``RDD2022_all_countries`` folder
    itself. Entries that are not zip archives are skipped, so the function
    does not crash on already-extracted folders or on stray files.
    """
    dir_path = os.path.join(work_dir, 'RDD2022_all_countries')

    for name in os.listdir(dir_path):
        archive_path = os.path.join(dir_path, name)
        # is_zipfile() returns False for directories and for non-zip files.
        if not zipfile.is_zipfile(archive_path):
            continue
        print('正在解压{}'.format(name))
        # Close the archive even if extraction raises.
        with zipfile.ZipFile(archive_path) as zip_file:
            zip_file.extractall(dir_path)
        print('{}已解压完成'.format(name))


def remove_useless_file():
    """Keep only images that have at least one annotation of a wanted class.

    For every entry of ``countries`` the XML files under
    ``<country>/train/annotations/xmls`` are parsed, objects whose class name
    is not in ``labels`` are removed, and, if at least one object is left,
    the filtered XML is written to ``new_<country>/Annotations`` and the
    matching ``.jpg`` is copied to ``new_<country>/JPEGImages``. Files without
    any remaining object are skipped, which effectively drops unlabelled
    images from all later steps.
    """
    RDD2022_all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries')
    wanted_labels = set(labels)

    for country in countries:
        print("开始对 {} 的标签与图片进行操作".format(country))
        # Source and destination folders are fixed per country, so they are
        # built once here instead of once per annotation file.
        src_xml_dir = os.path.join(RDD2022_all_countries_path, country, "train", "annotations", "xmls")
        src_img_dir = os.path.join(RDD2022_all_countries_path, country, "train", "images")
        newCountry = "new_" + country
        annotations_dir = os.path.join(RDD2022_all_countries_path, newCountry, 'Annotations')
        jpegimages_dir = os.path.join(RDD2022_all_countries_path, newCountry, 'JPEGImages')
        os.makedirs(annotations_dir, exist_ok=True)
        os.makedirs(jpegimages_dir, exist_ok=True)

        for annoFile in os.listdir(src_xml_dir):
            # Ignore anything that is not an annotation (hidden/system files).
            if not annoFile.lower().endswith(".xml"):
                continue

            tree = ET.parse(os.path.join(src_xml_dir, annoFile))
            root = tree.getroot()
            # findall() returns a list, so removing children while looping
            # over it is safe.
            for obj in root.findall("object"):
                name_node = obj.find("name")
                if name_node is None or name_node.text not in wanted_labels:
                    root.remove(obj)

            if root.find("object") is None:
                # No usable annotation left: neither XML nor image is kept.
                continue

            image_name = os.path.splitext(annoFile)[0] + ".jpg"
            tree.write(os.path.join(annotations_dir, annoFile))
            copyfile(os.path.join(src_img_dir, image_name),
                     os.path.join(jpegimages_dir, image_name))
        print("{} 的标签与图片操作完毕".format(country))


def copy_file_2_new_train_dir():
    """Merge the cleaned per-country folders into one ``new_train`` folder.

    All files from ``RDD2022_all_countries/new_<country>/JPEGImages`` are
    copied to ``<work_dir>/new_train/JPEGImages`` and all files from
    ``.../new_<country>/Annotations`` to ``<work_dir>/new_train/Annotations``.

    The target folders use the ``JPEGImages`` / ``Annotations`` spelling
    because ``generate_txt_file`` and ``xml2yolo`` locate files through these
    exact names; lowercase names only work on case-insensitive file systems.
    """
    # Build the path with os.path.join: plain concatenation of work_dir and
    # "new_train/" lacks a separator and would create a stray sibling folder.
    new_train_path = os.path.join(work_dir, 'new_train')
    jpeg_path = os.path.join(new_train_path, 'JPEGImages')
    annotation_path = os.path.join(new_train_path, 'Annotations')
    os.makedirs(jpeg_path, exist_ok=True)
    os.makedirs(annotation_path, exist_ok=True)

    RDD2022_all_countries_path = os.path.join(work_dir, 'RDD2022_all_countries')
    for country in countries:
        print("{}正在复制".format(country))
        country_root = os.path.join(RDD2022_all_countries_path, 'new_' + country)
        # (sub-folder of the country, destination folder in new_train)
        for sub_dir, target in (('JPEGImages', jpeg_path), ('Annotations', annotation_path)):
            source_dir = os.path.join(country_root, sub_dir)
            for name in os.listdir(source_dir):
                shutil.copy(os.path.join(source_dir, name), target)
        print("{}复制完毕".format(country))


def generate_txt_file():
    """Write the list of annotation paths consumed by ``xml2yolo``.

    Creates ``./xml2yolo_damage.txt`` (relative to the current directory)
    containing one line per entry of ``<work_dir>/new_train/Annotations``,
    each line being the full path of that entry.
    """
    annoFiles = os.listdir(os.path.join(work_dir, "new_train/Annotations/"))
    # No explicit encoding on purpose: xml2yolo reads this file back with the
    # platform default encoding, so both sides must use the same default.
    with open("./xml2yolo_damage.txt", "w") as yoloFile:
        yoloFile.writelines(
            work_dir + "/new_train/Annotations/" + name + "\n" for name in annoFiles
        )


def xml2yolo():
    """Convert the listed XML annotations into YOLO ``.txt`` label files.

    The list of XML paths is read from the file given by ``--input-file``
    (default ``./xml2yolo_damage.txt``); note that the arguments are parsed
    from ``sys.argv``. For every XML the matching image is looked up by
    replacing ``Annotations`` with ``JPEGImages`` in the path, and a label
    file is written to the path obtained by replacing ``Annotations`` with
    ``labels``. Each output line is ``<class id> <x> <y> <w> <h>`` with the
    box centre and size normalised by the image width and height.

    Objects whose class is not D00/D10/D20/D40 are skipped. At the end the
    number of images containing each class is printed.

    Raises:
        FileNotFoundError: if a listed XML file does not exist.
    """
    import argparse
    import os
    import xml.etree.ElementTree as ET
    from PIL import Image
    from collections import defaultdict

    # Image extensions tried, in this order, when looking for the picture
    # that belongs to an XML file.
    imageType = ["jpeg", "png", "jpg", "JPEG", "JPG", "PNG"]
    # class id -> set of image paths that contain at least one box of it
    imageListDict = defaultdict(set)

    def convert(size, box):
        """Return normalised ``[x_center, y_center, width, height]``.

        ``size`` is ``(image_width, image_height)`` and ``box`` is
        ``(minX, maxX, minY, maxY)`` in pixels.
        """
        dw = 1. / size[0]
        dh = 1. / size[1]
        x = (box[0] + box[1]) / 2.0
        y = (box[2] + box[3]) / 2.0
        w = box[1] - box[0]
        h = box[3] - box[2]
        return [x * dw, y * dh, w * dw, h * dh]

    def getImageSize(imagePath):
        """Return ``(width, height)`` of the image, closing the file afterwards."""
        with Image.open(imagePath) as image:
            return int(image.size[0]), int(image.size[1])

    def getFileList3(filePath):
        """Read the list file and return absolute XML paths.

        As a side effect the ``labels`` output folder next to each
        ``Annotations`` folder is created if missing.
        """
        xmlFiles = []
        with open(filePath, "r") as f:
            for line in f:
                entry = line.strip()
                if not entry:
                    # Tolerate blank lines in the list file.
                    continue
                # Entries may point at an image or an XML; normalise to the XML.
                stem = entry.rsplit('.', 1)[0]
                xmlPath = os.path.abspath(stem.replace("JPEGImages", "Annotations") + ".xml")
                labels_path = os.path.dirname(xmlPath).replace("Annotations", "labels")
                os.makedirs(labels_path, exist_ok=True)
                # Explicit check instead of assert, which is stripped under -O.
                if not os.path.exists(xmlPath):
                    raise FileNotFoundError(xmlPath)
                xmlFiles.append(xmlPath)
        return xmlFiles

    def main():
        parser = argparse.ArgumentParser(description='run phase2.')

        parser.add_argument('--input-file', type=str,
                            help='location to the list of images/xml files(absolute path). sample file at "./xml2yolo_damagee.txt"',
                            default='./xml2yolo_damage.txt')
        args = parser.parse_args()

        # assign each class of dataset to a number
        outputCtoId = {'D00': 0, 'D10': 1, 'D20': 2, 'D40': 3}
        # reverse mapping, used only for the summary printed at the end
        idToName = {classId: name for name, classId in outputCtoId.items()}

        xmlFiles = getFileList3(args.input_file)

        print("total files:", len(xmlFiles))
        print('正在转换......')

        for filePath in xmlFiles:
            tree = ET.parse(filePath)
            root = tree.getroot()

            # Probe every known extension for the matching image.
            imageStem = filePath[:-4].replace("Annotations", "JPEGImages")
            imageFile = None
            for ext in imageType:
                candidate = imageStem + "." + ext
                if os.path.isfile(candidate):
                    imageFile = candidate
                    break

            if imageFile is None:
                print("File not found:", imageStem)
                continue

            txtFile = filePath[:-4].replace("Annotations", "labels") + ".txt"
            # The image size is the same for all boxes of a file, so the image
            # is opened at most once, and only if there is a box to convert.
            imageSize = None
            with open(txtFile, "w") as yoloOutput:
                for obj in root.findall('object'):
                    surfaceType = obj.find('name').text.replace(" ", "")
                    if surfaceType not in outputCtoId:
                        # Classes outside the four kept ones (e.g. D30) are ignored.
                        continue
                    bndbox = obj.find('bndbox')
                    # Read the corners by tag name so the result does not
                    # depend on the order of the child elements.
                    minX, minY, maxX, maxY = (
                        int(float(bndbox.find(tag).text))
                        for tag in ('xmin', 'ymin', 'xmax', 'ymax')
                    )
                    if imageSize is None:
                        imageSize = getImageSize(imageFile)
                    x, y, w, h = convert(imageSize, (minX, maxX, minY, maxY))
                    classId = outputCtoId[surfaceType]
                    yoloOutput.write(
                        " ".join(str(v) for v in (classId, x, y, w, h)) + "\n")
                    imageListDict[classId].add(imageFile)

        for cl in imageListDict:
            print(idToName[cl], ":", len(imageListDict[cl]))

    main()


def generate_my_data():
    """Build the final ``my_data`` folder with a 7:3 train/val split per country.

    Creates ``my_data/images/{train,val}`` and ``my_data/labels/{train,val}``
    under the working directory. Images in ``new_train`` are grouped by the
    part of the file name before the last underscore; for every entry of
    ``countries`` 70% of its images (rounded down) go to ``train`` and the
    rest to ``val``. Each image is copied together with the ``.txt`` label
    of the same base name from ``new_train/labels``.

    The validation indices are drawn with ``random.sample`` and no seed is
    set here, so the split differs between runs.
    """
    # os.path.join is required here: concatenating work_dir and 'my_data/'
    # lacks a separator and would create a stray sibling folder.
    my_data_path = os.path.join(work_dir, 'my_data')
    images_path = os.path.join(my_data_path, 'images')
    labels_path = os.path.join(my_data_path, 'labels')
    for parent in (images_path, labels_path):
        for split in ('train', 'val'):
            os.makedirs(os.path.join(parent, split), exist_ok=True)
    print("最终my_data文件夹基本结构创建完毕")

    new_train_path = os.path.join(work_dir, 'new_train')
    # Accept either spelling of the image folder so that the lookup also
    # works on case-sensitive file systems.
    jpeg_dir_path = os.path.join(new_train_path, 'jpegimages')
    if not os.path.isdir(jpeg_dir_path):
        jpeg_dir_path = os.path.join(new_train_path, 'JPEGImages')
    labels_dir_path = os.path.join(new_train_path, 'labels')
    all_images_name = os.listdir(jpeg_dir_path)

    # Group image names by country: "<country>_<number>.jpg" -> "<country>".
    all_countries_images = defaultdict(list)
    for name in all_images_name:
        country_name = '_'.join(name.split('_')[:-1])
        all_countries_images[country_name].append(name)
    images_len = sum(len(names) for names in all_countries_images.values())
    print("一共有{}张图片".format(images_len))
    for k, v in all_countries_images.items():
        print("{} 一共有 {}张图片".format(k, len(v)))
    print('*************************')
    print("开始切分数据集")
    for country in countries:
        country_images = all_countries_images[country]
        image_len = len(country_images)
        train_nums = int(image_len * 0.7)
        val_nums = image_len - train_nums
        print("{}一共{}张图片，训练集7/10一共是{}张，测试集3/10一共是{}张，正在切割".format(country, image_len, train_nums, val_nums))
        # Indices of the images that go to the validation set; a set makes
        # the membership test below O(1) instead of O(n).
        val_index = set(random.sample(range(0, image_len), val_nums))
        for idx, name in enumerate(country_images):
            split = 'val' if idx in val_index else 'train'
            shutil.copy(os.path.join(jpeg_dir_path, name),
                        os.path.join(images_path, split))
            # The label shares the image's base name, with a .txt extension.
            label_name = os.path.splitext(name)[0] + '.txt'
            shutil.copy(os.path.join(labels_dir_path, label_name),
                        os.path.join(labels_path, split))
    all_train_len = len(os.listdir(os.path.join(images_path, 'train')))
    all_val_len = len(os.listdir(os.path.join(images_path, 'val')))
    print("所有数据切分完毕,训练集一共{}条，验证集一共{}条".format(all_train_len, all_val_len))
    print("\n\n\n\n*************************")
    print("完成，目标文件夹就是my_data, 其他的文件都可以删除")
    print("注意：一共4种损伤类型，4种类型的名称以及对应的编号为")
    for idx, i in enumerate(labels):
        print("{}: {}".format(i, idx))


if __name__ == '__main__':
    # Pipeline stages in execution order: (progress message, stage function).
    # Each message is printed right before its stage runs.
    pipeline = (
        ("正在解压12.4G大的最外面的压缩包", unzip_rdd2022),
        ("正在解压6个国家的压缩包", unzip_RDD2022_all_countries),
        ("对图片进行去除清洗操作", remove_useless_file),
        ("正在将所有的图片以及标签复制到统一的目录下", copy_file_2_new_train_dir),
        ("正在生成用于标注转换的txt文件", generate_txt_file),
        ("正在转换标签", xml2yolo),
        ("正在生成最终文件夹", generate_my_data),
    )
    for message, stage in pipeline:
        print(message)
        stage()