KNN的算法实现
首先创建演示数据集
import numpy as np
import matplotlib.pyplot as plt
def createDataSet():
    """Build a toy 2-D training set and its class labels for the KNN demo."""
    samples = [[1.0, 2.0], [1.2, 0.1], [0.1, 1.4],
               [0.3, 3.5], [1.1, 1.0], [0.5, 1.5]]
    tags = ['A', 'A', 'B', 'B', 'A', 'B']
    return np.array(samples), np.array(tags)
if __name__ == '__main__':
    group, labels = createDataSet()
    # One scatter call per class so each gets its own colour and marker.
    for tag, colour, mark in (('A', 'r', '*'), ('B', 'g', '+')):
        chosen = labels == tag
        plt.scatter(group[chosen, 0], group[chosen, 1], color=colour, marker=mark)
    plt.show()
代码介绍:
- createDataSet用于创建训练数据集及其对应的类别,group对应的是二维训练数据集分别对应x轴和y轴的数据
- labels对应的是训练集的标签
- 使用Matplotlib绘制图形,scatter绘制散点图
Python基于欧氏距离实现KNN分类器
def KNN_classify(k, dis, X_train, x_train, Y_test):
    """k-nearest-neighbour classifier.

    Args:
        k: number of neighbours that vote on each prediction.
        dis: 'E' for Euclidean distance, 'M' for Manhattan distance.
        X_train: (n, d) array of training samples.
        x_train: length-n array of training labels (name kept for callers).
        Y_test: (m, d) array of samples to classify.

    Returns:
        np.ndarray of m predicted labels.
    """
    assert dis == 'E' or dis == 'M', 'dis must E or M,E代表欧氏距离,M代表曼哈顿距离'
    labellist = []
    for i in range(Y_test.shape[0]):
        # Broadcasting replaces np.tile: subtract this test row from every
        # training row, then reduce each row to a scalar distance.
        diff = X_train - Y_test[i]
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            # BUG FIX: the Manhattan branch promised by the assert was
            # missing entirely, so dis='M' silently returned None.
            distances = np.sum(np.abs(diff), axis=1)
        topK = np.argsort(distances)[:k]
        votes = {}
        for j in topK:  # j, not i: the original shadowed the outer loop index
            votes[x_train[j]] = votes.get(x_train[j], 0) + 1
        # max() with the vote count as key removes the dependency on the
        # `operator` module, which was never imported in this snippet.
        labellist.append(max(votes, key=votes.get))
    return np.array(labellist)
测试KNN算法
需要注意的是,我们在输入测试集的时候需要将其转换为NumPy数组,否则系统会提示传入的参数是list类型,没有shape属性
if __name__ == '__main__':
    group, labels = createDataSet()
    # Queries must be an ndarray: a plain list has no .shape attribute.
    queries = np.array([[1.0, 2.1], [0.4, 2.0]])
    print(KNN_classify(1, 'E', group, labels, queries))
完整代码
import operator
import numpy as np
import matplotlib.pyplot as plt
def createDataSet():
    """Return (points, labels): six 2-D demo points and their 'A'/'B' classes."""
    points = np.array([[1.0, 2.0], [1.2, 0.1], [0.1, 1.4],
                       [0.3, 3.5], [1.1, 1.0], [0.5, 1.5]])
    return points, np.array(['A', 'A', 'B', 'B', 'A', 'B'])
if __name__ == '__main__':
    group, labels = createDataSet()
    # Plot class A and class B separately so each keeps its own style.
    for cls, col, sym in (('A', 'r', '*'), ('B', 'g', '+')):
        pick = labels == cls
        plt.scatter(group[pick, 0], group[pick, 1], color=col, marker=sym)
    plt.show()
def KNN_classify(k, dis, X_train, x_train, Y_test):
    """k-nearest-neighbour classifier.

    Args:
        k: number of neighbours that vote on each prediction.
        dis: 'E' for Euclidean distance, 'M' for Manhattan distance.
        X_train: (n, d) array of training samples.
        x_train: length-n array of training labels (name kept for callers).
        Y_test: (m, d) array of samples to classify.

    Returns:
        np.ndarray of m predicted labels.
    """
    assert dis == 'E' or dis == 'M', 'dis must E or M,E为欧氏距离,M为曼哈顿距离'
    labellist = []  # fixes the misspelled local `leballist`
    for i in range(Y_test.shape[0]):
        # Broadcasting replaces np.tile; only the distance formula differs
        # between the two branches, so the duplicated loop bodies are merged.
        diff = X_train - Y_test[i]
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)
        topK = np.argsort(distances)[:k]
        votes = {}
        for j in topK:  # j, not i: the original shadowed the outer loop index
            votes[x_train[j]] = votes.get(x_train[j], 0) + 1
        # First-inserted key wins ties, matching the original stable sort.
        labellist.append(max(votes, key=votes.get))
    return np.array(labellist)
if __name__ == '__main__':
    group, labels = createDataSet()
    # The test set must be an ndarray (a list would lack .shape).
    test_points = np.array([[1.0, 2.1], [0.4, 2.0]])
    predictions = KNN_classify(1, 'E', group, labels, test_points)
    print(predictions)
KNN实战
KNN实现MNIST数据分类
MNIST数据集是一个很经典的且很常用的数据集(类似图像处理中的“Hello World!”),它是一个基本的数据集,因此我们可以直接使用PyTorch框架进行数据下载与读取
执行代码的过程是一个比较漫长的过程
import torch
from torch.utils.data import DataLoader
import torchvision.datasets as dsets
import torchvision.transforms as transforms
batch_size = 100

# Download MNIST; transform=None keeps the samples as raw PIL images
# (the loaders below are never iterated, only .dataset is accessed).
train_dataset = dsets.MNIST(root='/ml/pymnist',
                            train=True,
                            transform=None,
                            download=True)
test_dataset = dsets.MNIST(root='/ml/pymnist', train=False, transform=None, download=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# BUG FIX: this loader previously wrapped train_dataset, so every "test"
# batch was actually training data.
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# NOTE(review): train_data/train_labels are deprecated aliases of
# .data/.targets in newer torchvision — confirm the installed version.
print("train_data:", train_dataset.train_data.size())
print("train_labels:", train_dataset.train_labels.size())
print("test_data:", test_dataset.test_data.size())
print("test_labels:", test_dataset.test_labels.size())
"""
train_data: torch.Size([60000, 28, 28])
train_labels: torch.Size([60000])
test_data: torch.Size([10000, 28, 28])
test_labels: torch.Size([10000])
"""
MNIST数据集在KNN算法下的分类准确度
比较方式:对逐个像素进行比较,最后将差异值全部加起来。如果两张图片完全一样,那么差异值为0;差异值越大,说明两张图片相差越大
import operator
import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision.datasets as dsets
import matplotlib.pyplot as plt
import numpy as np
from KNN import train_loader, test_loader
def createDataSet():
    """Return (points, labels): six 2-D demo points and their 'A'/'B' classes."""
    points = np.array([[1.0, 2.0], [1.2, 0.1], [0.1, 1.4],
                       [0.3, 3.5], [1.1, 1.0], [0.5, 1.5]])
    return points, np.array(['A', 'A', 'B', 'B', 'A', 'B'])
if __name__ == '__main__':
    group, labels = createDataSet()
    # Draw the two classes with distinct colours/markers.
    for cls, col, sym in (('A', 'r', '*'), ('B', 'g', '+')):
        pick = labels == cls
        plt.scatter(group[pick, 0], group[pick, 1], color=col, marker=sym)
    plt.show()
def KNN_classify(k, dis, X_train, x_train, Y_test):
    """k-nearest-neighbour classifier.

    Args:
        k: number of neighbours that vote on each prediction.
        dis: 'E' for Euclidean distance, 'M' for Manhattan distance.
        X_train: (n, d) array of training samples.
        x_train: length-n array of training labels (name kept for callers).
        Y_test: (m, d) array of samples to classify.

    Returns:
        np.ndarray of m predicted labels.
    """
    assert dis == 'E' or dis == 'M', 'dis must E or M,E为欧氏距离,M为曼哈顿距离'
    labellist = []  # fixes the misspelled local `leballist`
    for i in range(Y_test.shape[0]):
        # Broadcasting replaces np.tile; only the distance formula differs
        # between the two branches, so the duplicated loop bodies are merged.
        diff = X_train - Y_test[i]
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)
        topK = np.argsort(distances)[:k]
        votes = {}
        for j in topK:  # j, not i: the original shadowed the outer loop index
            votes[x_train[j]] = votes.get(x_train[j], 0) + 1
        # First-inserted key wins ties, matching the original stable sort.
        labellist.append(max(votes, key=votes.get))
    return np.array(labellist)
batch_size = 100

# Fetch MNIST; transform=None leaves the samples as raw PIL images.
train_dataset = dsets.MNIST(root='/ml/pymnist', train=True,
                            transform=None, download=True)
test_dataset = dsets.MNIST(root='/ml/pymnist', train=False,
                           transform=None, download=True)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size, shuffle=True)

# NOTE(review): train_data/train_labels are deprecated aliases of
# .data/.targets in newer torchvision — confirm the installed version.
print("train_data:", train_dataset.train_data.size())
print("train_labels:", train_dataset.train_labels.size())
print("test_data:", test_dataset.test_data.size())
print("test_labels:", test_dataset.test_labels.size())
first_image = train_loader.dataset.train_data[0]
plt.imshow(first_image, cmap=plt.cm.binary)  # render the first training digit
plt.show()
print(train_loader.dataset.train_labels[0])  # its ground-truth label
def getXmean(X_train):
    """Flatten each sample and return the per-feature mean over all samples."""
    flat = np.reshape(X_train, (X_train.shape[0], -1))
    return flat.mean(axis=0)
def centralized(X_train, mean_image):
    """Flatten each sample and subtract the training mean (zero-centre)."""
    flat = np.reshape(X_train, (X_train.shape[0], -1)).astype(np.float64)
    return flat - mean_image
if __name__ == '__main__':
    # Flatten images to 784-dim row vectors; keep only 1000 test samples
    # so the O(num_test * num_train) distance loop stays tractable.
    X_train = train_loader.dataset.train_data.numpy().reshape(-1, 28 * 28)
    y_train = train_loader.dataset.train_labels.numpy()
    X_test = test_loader.dataset.test_data[:1000].numpy().reshape(-1, 28 * 28)
    y_test = test_loader.dataset.test_labels[:1000].numpy()

    y_test_pred = KNN_classify(5, 'M', X_train, y_train, X_test)

    num_test = y_test.shape[0]
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d /%d correct => accuracy: %f' % (num_correct, num_test, accuracy))
"""
train_data: torch.Size([60000, 28, 28])
train_labels: torch.Size([60000])
test_data: torch.Size([10000, 28, 28])
test_labels: torch.Size([10000])
tensor(5)
Got 368 /1000 correct => accuracy: 0.368000
"""
归一化处理
import operator
import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision.datasets as dsets
import matplotlib.pyplot as plt
import numpy as np
from KNN import train_loader, test_loader
def createDataSet():
    """Return (points, labels): six 2-D demo points and their 'A'/'B' classes."""
    points = np.array([[1.0, 2.0], [1.2, 0.1], [0.1, 1.4],
                       [0.3, 3.5], [1.1, 1.0], [0.5, 1.5]])
    return points, np.array(['A', 'A', 'B', 'B', 'A', 'B'])
if __name__ == '__main__':
    group, labels = createDataSet()
    # Draw the two classes with distinct colours/markers.
    for cls, col, sym in (('A', 'r', '*'), ('B', 'g', '+')):
        pick = labels == cls
        plt.scatter(group[pick, 0], group[pick, 1], color=col, marker=sym)
    plt.show()
def KNN_classify(k, dis, X_train, x_train, Y_test):
    """k-nearest-neighbour classifier.

    Args:
        k: number of neighbours that vote on each prediction.
        dis: 'E' for Euclidean distance, 'M' for Manhattan distance.
        X_train: (n, d) array of training samples.
        x_train: length-n array of training labels (name kept for callers).
        Y_test: (m, d) array of samples to classify.

    Returns:
        np.ndarray of m predicted labels.
    """
    assert dis == 'E' or dis == 'M', 'dis must E or M,E为欧氏距离,M为曼哈顿距离'
    labellist = []  # fixes the misspelled local `leballist`
    for i in range(Y_test.shape[0]):
        # Broadcasting replaces np.tile; only the distance formula differs
        # between the two branches, so the duplicated loop bodies are merged.
        diff = X_train - Y_test[i]
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)
        topK = np.argsort(distances)[:k]
        votes = {}
        for j in topK:  # j, not i: the original shadowed the outer loop index
            votes[x_train[j]] = votes.get(x_train[j], 0) + 1
        # First-inserted key wins ties, matching the original stable sort.
        labellist.append(max(votes, key=votes.get))
    return np.array(labellist)
batch_size = 100

# Fetch MNIST; transform=None leaves the samples as raw PIL images.
train_dataset = dsets.MNIST(root='/ml/pymnist', train=True,
                            transform=None, download=True)
test_dataset = dsets.MNIST(root='/ml/pymnist', train=False,
                           transform=None, download=True)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size, shuffle=True)

# NOTE(review): train_data/train_labels are deprecated aliases of
# .data/.targets in newer torchvision — confirm the installed version.
print("train_data:", train_dataset.train_data.size())
print("train_labels:", train_dataset.train_labels.size())
print("test_data:", test_dataset.test_data.size())
print("test_labels:", test_dataset.test_labels.size())
first_image = train_loader.dataset.train_data[0]
plt.imshow(first_image, cmap=plt.cm.binary)  # render the first training digit
plt.show()
print(train_loader.dataset.train_labels[0])  # its ground-truth label
def getXmean(X_train):
    """Flatten each sample and return the per-feature mean over all samples."""
    flat = np.reshape(X_train, (X_train.shape[0], -1))
    return flat.mean(axis=0)
def centralized(X_train, mean_image):
    """Flatten each sample and subtract the training mean (zero-centre)."""
    flat = np.reshape(X_train, (X_train.shape[0], -1)).astype(np.float64)
    return flat - mean_image
if __name__ == '__main__':
    # Zero-centre both splits with the *training* mean before computing
    # Euclidean distances; this is what lifts accuracy vs the raw pixels.
    X_train = train_loader.dataset.train_data.numpy()
    mean_image = getXmean(X_train)
    X_train = centralized(X_train, mean_image)
    y_train = train_loader.dataset.train_labels.numpy()

    X_test = centralized(test_loader.dataset.test_data[:1000].numpy(), mean_image)
    y_test = test_loader.dataset.test_labels[:1000].numpy()

    predictions = KNN_classify(5, 'E', X_train, y_train, X_test)
    total = y_test.shape[0]
    hits = np.sum(predictions == y_test)
    print('Got %d / %d correct => accuracy: %f' % (hits, total, float(hits) / total))
"""
train_data: torch.Size([60000, 28, 28])
train_labels: torch.Size([60000])
test_data: torch.Size([10000, 28, 28])
test_labels: torch.Size([10000])
tensor(5)
Got 963 / 1000 correct => accuracy: 0.963000
"""
1.复利积累,量变引起质变。比如读书,运动,正确的认知 2.正向循环,变成更好的自己,优秀是一种习惯。比如早起读书写作一小时,运动,冥想,看电影听音乐。 3.成长性思维,享受不断成长的过程。 4.长期主义,比如慢慢变富。 5.沸水效应,类似习惯培养,集中培养一个习惯固定好了,再培养下一个习惯。比如阅读专注,运动。 6.吸引效应,积极的心态去面对问题。 7.以终为始,决策定位。比如定好目标再分解到每天去做。 8.二八定律,重要的事情只有两成,做好这两成,比如阅读只有两成是重点,读书是为了改变行动,重点在落到行动力上。 9.不同的人看法不同,由认知层次决定。 10.终身成长,走出舒适圈,不要给自己设限。
人从来不是一成不变的,生活会追着你一而再的脱胎换骨。只要愿意,你就永远有机会成为一个不断前行着的人,成为那个自己喜欢着的模样。