四、KNN算法
4.1算法流程:
1)给定测试对象,计算它与训练集中每个对象的距离。 2)圈定距离最近的k个训练对象,作为测试对象的邻居。 3)根据这k个近邻对象所属的类别,找到占比最高的那个类别作为测试对象的预测类别。
4.2 影响KNN算法的准确度的两个因素:一是k值的选取(k过小易受噪声影响,k过大会引入过多无关样本);二是距离度量方式(如欧氏距离或曼哈顿距离,且各特征量纲差异大时需先归一化)。
4.3 KNN代码
4.3.1 无框架代码
import numpy as np
import matplotlib.pyplot as plt
import operator
def createDataSet():
    """Build the toy 2-D data set: six points labelled 'A' or 'B'.

    Returns:
        (group, labels): a (6, 2) float feature array and the matching
        (6,) array of class labels.
    """
    group = np.array([
        [1.0, 2.0], [1.2, 0.1], [0.1, 1.4],
        [0.3, 3.5], [1.1, 1.0], [0.5, 1.5],
    ])
    labels = np.array(['A', 'A', 'B', 'B', 'A', 'B'])
    return group, labels
'''
k : 选取的最近 k 个邻居
dis : 'E' 欧氏距离 ; 'M' 曼哈顿距离
X_train : 训练集的特征数据
x_train : 训练集的标签数据
Y_test : 测试集的特征数据
'''
def kNN_classify(k, dis, X_train, x_train, Y_test):
    """Predict a label for each row of Y_test by k-nearest-neighbour vote.

    Args:
        k: number of nearest neighbours that vote.
        dis: 'E' for Euclidean distance, 'M' for Manhattan distance.
        X_train: (n_train, n_features) training feature matrix.
        x_train: (n_train,) training labels aligned with X_train rows.
        Y_test: (n_test, n_features) test feature matrix.

    Returns:
        (n_test,) ndarray of predicted labels.
    """
    assert dis == 'E' or dis == 'M', 'dis must E or M,E代表欧拉距离,M代表曼哈顿距离'
    num_test = Y_test.shape[0]
    labellist = []
    for i in range(num_test):
        diff = X_train - np.tile(Y_test[i], (X_train.shape[0], 1))
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)
        # Indices of the k closest training samples.
        topK = np.argsort(distances)[:k]
        classCount = {}
        # Use a distinct loop variable: the original reused `i`, shadowing the
        # test-sample index and making the debug print report the wrong value.
        for j in topK:
            classCount[x_train[j]] = classCount.get(x_train[j], 0) + 1
        # Most frequent neighbour label wins the vote.
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1), reverse=True)
        labellist.append(sortedClassCount[0][0])
    return np.array(labellist)
if __name__ == '__main__':
    # Demo: classify one query point with Manhattan-distance 5-NN
    # on the toy data set.
    features, targets = createDataSet()
    predictions = kNN_classify(5, 'M', features, targets, np.array([[0.8, 1.2]]))
    print(predictions)
4.3.2 pytorch代码
import torch
from torch.utils.data import DataLoader
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import numpy as np
import operator
batch_size = 100
import warnings
warnings.filterwarnings('ignore')
def kNN_classify(k, dis, X_train, x_train, Y_test):
    """Predict a label for each row of Y_test by k-nearest-neighbour vote.

    Args:
        k: number of nearest neighbours that vote.
        dis: 'E' for Euclidean distance, 'M' for Manhattan distance.
        X_train: (n_train, n_features) training feature matrix.
        x_train: (n_train,) training labels aligned with X_train rows.
        Y_test: (n_test, n_features) test feature matrix.

    Returns:
        (n_test,) ndarray of predicted labels.
    """
    assert dis == 'E' or dis == 'M', 'dis must E or M,E代表欧拉距离,M代表曼哈顿距离'
    num_test = Y_test.shape[0]
    labellist = []
    for i in range(num_test):
        # Per-sample distance to every training row; the two metrics share
        # the same difference matrix.
        diff = X_train - np.tile(Y_test[i], (X_train.shape[0], 1))
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)
        topK = np.argsort(distances)[:k]
        classCount = {}
        # Distinct loop variable: the original reused `i`, shadowing the
        # outer test-sample index.
        for j in topK:
            classCount[x_train[j]] = classCount.get(x_train[j], 0) + 1
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1), reverse=True)
        labellist.append(sortedClassCount[0][0])
    return np.array(labellist)
def main():
    """Score 5-NN (Manhattan distance) on the first 1000 MNIST test images.

    Downloads MNIST to /ml/pymnist on first use, flattens each 28x28 image
    to a 784-dim vector, and prints the raw-pixel k-NN accuracy.
    """
    train_dataset = dsets.MNIST(root='/ml/pymnist', train=True,
                                transform=None, download=True)
    test_dataset = dsets.MNIST(root='/ml/pymnist', train=False,
                               transform=None, download=True)
    # The loaders exist only to reach the underlying tensors below; batching
    # and shuffling play no role in the k-NN computation.
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=True)
    # NOTE(review): train_data/train_labels/test_data/test_labels are
    # deprecated in recent torchvision (replaced by .data/.targets) —
    # confirm against the installed version before running.
    X_train = train_loader.dataset.train_data.numpy()
    X_train = X_train.reshape(X_train.shape[0], 28 * 28)
    y_train = train_loader.dataset.train_labels.numpy()
    # Only the first 1000 test samples are scored to keep the run tractable.
    X_test = test_loader.dataset.test_data[:1000].numpy()
    X_test = X_test.reshape(X_test.shape[0], 28 * 28)
    y_test = test_loader.dataset.test_labels[:1000].numpy()
    num_test = y_test.shape[0]
    y_test_pred = kNN_classify(5, 'M', X_train, y_train, X_test)
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))


if __name__ == '__main__':
    main()
4.5 交叉验证
第一步,使用之前所写的KNN分类器,代码如下:
sortedClassCount =
sorted(classCount.items(), key=operator.itemgetter(1),
reverse=True)
labellist.append(sortedClassCount[0][0])
return np.array(labellist)
if (dis == 'M'):
for i in range(num_test):
distances = np.sum(np.abs(self.Xtr -
np.tile(X_test[i], (self.Xtr.shape[0], 1))), axis=1)
nearest_k = np.argsort(distances)
topK = nearest_k[:k]
classCount = {}
for i in topK:
classCount[self.ytr[i]] =
classCount.get(self.ytr[i], 0) + 1
sortedClassCount =
sorted(classCount.items(), key=operator.itemgetter(1),
reverse=True)
labellist.append(sortedClassCount[0][0])
return np.array(labellist)
第二步,准备测试数据与验证数据,值得注意的是,如果使用方法四,则在选择超参数阶段不需要使用到X_test和y_test的输出,代码如下:
# Flatten the raw MNIST training images to (n_samples, 784) vectors.
X_train = train_loader.dataset.train_data
X_train = X_train.reshape(X_train.shape[0],-1)
# getXmean/centralized are helpers defined elsewhere in these notes —
# presumably per-pixel mean and mean-subtraction; confirm their definitions.
mean_image = getXmean(X_train)
X_train = centralized(X_train,mean_image)
y_train = train_loader.dataset.train_labels
y_train = np.array(y_train)
# Centre the test set with the *training* mean so both share one reference frame.
X_test = test_loader.dataset.test_data
X_test = X_test.reshape(X_test.shape[0],-1)
X_test = centralized(X_test,mean_image)
y_test = test_loader.dataset.test_labels
y_test = np.array(y_test)
# Sanity-check the resulting shapes.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
第三步,将训练数据分成5个部分,每个部分轮流作为验证集,代码如下:
k_to_accuracies = {}
for k in k_choices:
acc = []
for i in range(num_folds):
x = X_train_folds[0:i] + X_train_folds[i+1:]
x = np.concatenate(x, axis=0)
# 将其余各折的训练集拼在一起
y = y_train_folds[0:i] + y_train_folds[i+1:]
y = np.concatenate(y)
test_x = X_train_folds[i]
(k, accuracy))
4.4 KNN的图像分类
1.简介
图像分类问题就是将已有的固定的分类标签集合中最合适的标签分配给输入的图像
如图,机器识别的图像大小248*400像素,有红绿蓝RGB3个颜色通道,计算机需要处理的总数据有248×400×3=297600,每个数据在0 -255之间,0代表黑色,255为白色 标签集合为{cat,dog,hat,mug} 图像分类的任务就是预测一个给定的图像包含了哪个分类标签(或者给出属于一系列不同标签的可能性)。
2.预处理
1)目的:
- 使得原始图像符合某种既定规则以便于进行后续的处理,
- 去除图像中的噪声。
- 减少后续的运算量以及加速收敛。
2)常用的图像预处理操作:
3.归一化
归一化可用于保证所有维度上的数据都在一个变化幅度
例如:在预测房价的例子中,假设房价由面积s和卧室数b决定,面积s在0~200之间,卧室数b在0~5之间,进行归一化的一个实例就是s=s/200,b=b/5。 两种方法来实现归一化 最值归一化,比如将最大值归一化成1,最小值归一化成-1;或者将最大值归一化成1,最小值归一化成0 均值方差归一化,一般是将均值归一化成0,方差归一化成1。
代码链接1_knn.ipynb
|