MNIST数据集包含60000个样本的训练集和10000个样本的测试集,是NIST提供的更大数据集的一个子集。其中的数字已经过尺寸归一化,并在固定大小的图像中居中。对于想在真实世界的数据上尝试学习技术和模式识别方法、同时又希望在预处理和格式化上花费最少精力的人来说,它是一个很好的数据库。
Python 3.8.10
Package Version
----------------------- -------------------
tensorflow-gpu 2.5.0
import os, gzip, struct, PIL, numpy
import matplotlib.pyplot as pyp
def read_mnist(root):
    """Load the full MNIST dataset from its four gzip-compressed IDX files.

    Args:
        root (str): Directory containing `train-images-idx3-ubyte.gz`,
            `train-labels-idx1-ubyte.gz`, `t10k-images-idx3-ubyte.gz` and
            `t10k-labels-idx1-ubyte.gz`.

    Returns:
        tuple: `(x, y, x_t, y_t, shape)`. `x` and `x_t` are the training and
            test images as float arrays of shape `(count, height, width, 1)`
            with pixel values divided by 255. `y` and `y_t` are the matching
            labels as lists of ints. `shape` is the per-image shape
            `(height, width, 1)` read from the training image file.
    """
    TRAIN_X_FILENAME = 'train-images-idx3-ubyte.gz'
    TRAIN_Y_FILENAME = 'train-labels-idx1-ubyte.gz'
    TEST_X_FILENAME = 't10k-images-idx3-ubyte.gz'
    TEST_Y_FILENAME = 't10k-labels-idx1-ubyte.gz'
    # Image header: four big-endian uint32 fields -> magic, count, rows, cols.
    IMAGE_HEADER = struct.Struct('>IIII')
    # Label header: two big-endian uint32 fields (magic, count) = 8 bytes.
    LABEL_HEADER_SIZE = 8

    def read(x_name, y_name, root):
        """Read one image file and its label file from `root`."""
        with gzip.open(os.path.join(root, x_name), 'rb') as gz:
            data = gz.read()
        # The IDX layout stores the row count (height) BEFORE the column
        # count (width); reading them in the other order only happens to work
        # for square images.
        _magic, count, height, width = IMAGE_HEADER.unpack_from(data, 0)
        shape = (height, width, 1)
        # View the pixel bytes directly instead of unpacking them into a huge
        # tuple of Python ints; dividing by 255. still yields a float array.
        pixels = numpy.frombuffer(
            data,
            dtype=numpy.uint8,
            count=count * height * width,
            offset=IMAGE_HEADER.size,
        )
        x = pixels.reshape((count, height, width, 1)) / 255.

        with gzip.open(os.path.join(root, y_name), 'rb') as gz:
            data = gz.read()
        # Every byte after the header is one label.
        y = list(data[LABEL_HEADER_SIZE:])
        return x, y, shape

    x, y, s = read(TRAIN_X_FILENAME, TRAIN_Y_FILENAME, root)
    x_t, y_t, _ = read(TEST_X_FILENAME, TEST_Y_FILENAME, root)
    return x, y, x_t, y_t, s
# Load the dataset, then preview the first ROW*COL training images with
# their labels as titles.
x, y, x_t, y_t, s = read_mnist('/home/ubuntu/Documents/data/mnist')
# `import PIL` alone does not load the `Image` submodule; import it
# explicitly instead of relying on another library having done so.
import PIL.Image
ROW, COL = 1, 8
pyp.figure(dpi=300)
for index in range(ROW*COL):
    # Lay out the grid from ROW/COL so changing the constants keeps the
    # subplot layout and the loop bound in agreement.
    pyp.subplot(ROW, COL, index+1)
    # Undo the division by 255; round before casting so float round-off
    # cannot truncate a pixel value down by one.
    img = numpy.rint(x[index]*255).astype(numpy.uint8)
    # Drop the trailing channel axis so PIL receives a 2-D array.
    pyp.imshow(PIL.Image.fromarray(numpy.reshape(img, img.shape[:-1])))
    pyp.title(y[index])
    pyp.axis('off')
pyp.show()
|