[人工智能] 用Python将jpg格式文件逐个打标签之后再写入hdf5文件

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> 人工智能 -> 用Python将jpg格式文件逐个打标签之后再写入hdf5文件 -> 正文阅读

[人工智能]用Python将jpg格式文件逐个打标签之后再写入hdf5文件

在用CNN进行图片分类的任务时，发现很多demo都是使用已有的h5文件数据集进行训练

但是其他人的数据集毕竟不能100%契合每个项目

所以还是想自己准备数据集

可照片好拍，打标签和制作h5文件却很麻烦

试了一些第三方打标工具，包括labelme啥的，不是要花钱就是不好用

所以自己用Python写了个小程序，功能是：

从文件夹遍历所有图片，然后逐一打标

最后生成h5文件

程序代码如下：

1.导入所有需要的包：

import os
import cv2
from keras.preprocessing import image
from matplotlib.pyplot import imshow, show
import numpy as np
from keras.applications.imagenet_utils import preprocess_input
import h5py
from sklearn.model_selection import train_test_split

2.手动输入分类数量、训练集h5文件名、测试集h5文件名、训练集占比（0到1之间的小数，例如0.8）：

# Number of classes the user is going to label.
class_num = int(input('Input the number of classes:'))
print(f"{class_num} classes!")

# Create the HDF5 file for the training set ("w" mode truncates an existing file).
file_name = input('Input the hdf5 training dataset file name:')
f = h5py.File(f"{file_name}.hdf5", "w")

# Create the HDF5 file for the test set ("w" mode truncates an existing file).
file_name = input('Input the hdf5 test dataset file name:')
f2 = h5py.File(f"{file_name}.hdf5", "w")

# Share of the whole dataset used for training, entered as a fraction
# (it is multiplied by 100 below to display a percentage).
train_per = float(input('Input the percentage of the training set:'))
print(f'The training set accounts for {train_per * 100}% of the whole dataset.')

3.从当前目录遍历所有图片：

def getFileList(dir, Filelist, ext=None):
    """Recursively collect file paths under ``dir`` into ``Filelist``.

    Args:
        dir: Path of a file or directory; directories are walked recursively.
            A path that is neither is ignored. (The name shadows the builtin
            but is kept so existing keyword callers keep working.)
        Filelist: List that matching paths are appended to (mutated in place).
        ext: Extension to keep, with or without the leading dot (``'jpg'`` or
            ``'.jpg'``), compared case-insensitively. ``None`` or an empty
            string keeps every file.

    Returns:
        The same ``Filelist`` object that was passed in.
    """
    if os.path.isfile(dir):
        if not ext:
            Filelist.append(dir)
        else:
            # Compare the real extension instead of a substring of the last
            # three characters, so 'jpeg' works, 'notjpg' is rejected and
            # '.JPG' is accepted.
            wanted = "." + ext.lower().lstrip(".")
            if os.path.splitext(dir)[1].lower() == wanted:
                Filelist.append(dir)
    elif os.path.isdir(dir):
        # Sort entries so the traversal order is reproducible across runs.
        for entry in sorted(os.listdir(dir)):
            getFileList(os.path.join(dir, entry), Filelist, ext)

    return Filelist
 
org_img_folder='./'

# Collect the image paths first, then count them: the count must be taken
# after imglist exists, otherwise len(imglist) raises NameError.
imglist = getFileList(org_img_folder, [], 'jpg')
img_num = len(imglist)
print('本次执行检索到 '+str(img_num)+' 张图像\n')

4.进行图片逐一标注：

# time.sleep() is used below but `time` is not among the file's top-level
# imports, so bring it into scope here.
import time

# Pixel buffer: one 64x64 image with 3 channels per sample.
X_train = np.zeros((img_num, 64, 64, 3))
print(X_train.shape)
# Label buffer: one class index per sample.
y_train = np.zeros((img_num, 1))

for i, img_path in enumerate(imglist):
    print(img_path)
    # Load the image resized to 64x64 and display it so the user can label it.
    img = image.load_img(img_path, target_size=(64, 64))
    imshow(img)
    show()
    time.sleep(1)

    # Keep asking until the answer is an integer in [0, class_num).
    while True:
        class1 = input('which class?')
        try:
            label = int(class1)
        except ValueError:
            # Non-numeric input used to crash the whole labeling session.
            print('Please enter an integer class index.')
            continue
        if 0 <= label < class_num:
            break
        print('Class index must be between 0 and ' + str(class_num - 1) + '.')

    # Store the pixel values and the validated integer label for this sample.
    X_train[i] = image.img_to_array(img)
    y_train[i] = label

5.分割训练集和测试集：

# train_per is the share of samples meant for *training*, so it has to be
# passed as train_size; passing it as test_size would invert the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, train_size=train_per, random_state=42)

# Print the resulting shapes (a bare tuple expression shows nothing in a script).
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# If you only need the image data as NumPy arrays, you can stop here.

6.将训练集和测试集分别写入hdf5文件中：

# Use both files as context managers so they are closed even if one of the
# writes below raises.
with f, f2:
    # Write the training data to its HDF5 file.
    f.create_dataset("X_train", data=X_train)
    f.create_dataset("y_train", data=y_train)

    # Write the test data to its HDF5 file.
    f2.create_dataset("X_test", data=X_test)
    f2.create_dataset("y_test", data=y_test)

完整代码：

import os
import cv2
from keras.preprocessing import image
from matplotlib.pyplot import imshow, show
import numpy as np
from keras.applications.imagenet_utils import preprocess_input
import h5py
from sklearn.model_selection import train_test_split

# Number of classes the user is going to label.
class_num = int(input('Input the number of classes:'))
print(f"{class_num} classes!")

# Create the HDF5 file for the training set ("w" mode truncates an existing file).
file_name = input('Input the hdf5 training dataset file name:')
f = h5py.File(f"{file_name}.hdf5", "w")

# Create the HDF5 file for the test set ("w" mode truncates an existing file).
file_name = input('Input the hdf5 test dataset file name:')
f2 = h5py.File(f"{file_name}.hdf5", "w")

# Share of the whole dataset used for training, entered as a fraction
# (it is multiplied by 100 below to display a percentage).
train_per = float(input('Input the percentage of the training set:'))
print(f'The training set accounts for {train_per * 100}% of the whole dataset.')

def getFileList(dir, Filelist, ext=None):
    """Recursively collect file paths under ``dir`` into ``Filelist``.

    Args:
        dir: Path of a file or directory; directories are walked recursively.
            A path that is neither is ignored. (The name shadows the builtin
            but is kept so existing keyword callers keep working.)
        Filelist: List that matching paths are appended to (mutated in place).
        ext: Extension to keep, with or without the leading dot (``'jpg'`` or
            ``'.jpg'``), compared case-insensitively. ``None`` or an empty
            string keeps every file.

    Returns:
        The same ``Filelist`` object that was passed in.
    """
    if os.path.isfile(dir):
        if not ext:
            Filelist.append(dir)
        else:
            # Compare the real extension instead of a substring of the last
            # three characters, so 'jpeg' works, 'notjpg' is rejected and
            # '.JPG' is accepted.
            wanted = "." + ext.lower().lstrip(".")
            if os.path.splitext(dir)[1].lower() == wanted:
                Filelist.append(dir)
    elif os.path.isdir(dir):
        # Sort entries so the traversal order is reproducible across runs.
        for entry in sorted(os.listdir(dir)):
            getFileList(os.path.join(dir, entry), Filelist, ext)

    return Filelist
 
org_img_folder='./'

# Collect the image paths first, then count them: the count must be taken
# after imglist exists, otherwise len(imglist) raises NameError.
imglist = getFileList(org_img_folder, [], 'jpg')
img_num = len(imglist)
print('本次执行检索到 '+str(img_num)+' 张图像\n')


# time.sleep() is used below but `time` is not among the file's top-level
# imports, so bring it into scope here.
import time

# Pixel buffer: one 64x64 image with 3 channels per sample.
X_train = np.zeros((img_num, 64, 64, 3))
print(X_train.shape)
# Label buffer: one class index per sample.
y_train = np.zeros((img_num, 1))

for i, img_path in enumerate(imglist):
    print(img_path)
    # Load the image resized to 64x64 and display it so the user can label it.
    img = image.load_img(img_path, target_size=(64, 64))
    imshow(img)
    show()
    time.sleep(1)

    # Keep asking until the answer is an integer in [0, class_num).
    while True:
        class1 = input('which class?')
        try:
            label = int(class1)
        except ValueError:
            # Non-numeric input used to crash the whole labeling session.
            print('Please enter an integer class index.')
            continue
        if 0 <= label < class_num:
            break
        print('Class index must be between 0 and ' + str(class_num - 1) + '.')

    # Store the pixel values and the validated integer label for this sample.
    X_train[i] = image.img_to_array(img)
    y_train[i] = label

# train_per is the share of samples meant for *training*, so it has to be
# passed as train_size; passing it as test_size would invert the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, train_size=train_per, random_state=42)

# Print the resulting shapes (a bare tuple expression shows nothing in a script).
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# If you only need the image data as NumPy arrays, you can stop here.

# Use both files as context managers so they are closed even if one of the
# writes below raises.
with f, f2:
    # Write the training data to its HDF5 file.
    f.create_dataset("X_train", data=X_train)
    f.create_dataset("y_train", data=y_train)

    # Write the test data to its HDF5 file.
    f2.create_dataset("X_test", data=X_test)
    f2.create_dataset("y_test", data=y_test)

程序运行效果：