KNN(k-nearest neighbors)是一种监督学习的分类器:先计算待分类样本与所有训练数据的欧几里得距离,然后取出距离最小的k个训练样本的labels,对这k个labels进行投票,选择票数最多的label作为result输出。
本文参考《Machine Learning in Action》,文中使用的数据可以在这本书的配套资源中找到,也可以去GitHub上直接下载手写数字的image。
程序如下:
import numpy as np
import operator
import os
import cv2
def classfier0(traindata, labels, data, k):
    """Classify ``data`` by majority vote among its k nearest training samples.

    Distances are Euclidean and are computed between ``data`` and every row
    of ``traindata``.

    Args:
        traindata: 2-D array-like with one training sample per row.
        labels: Sequence of hashable labels; ``labels[i]`` belongs to row
            ``i`` of ``traindata``.
        data: Feature vector with as many entries as ``traindata`` has
            columns.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label that received the most votes. On a tie, the label whose
        first vote came from the closest neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``traindata`` has no rows.
    """
    # Work in float so that unsigned integer inputs (e.g. uint8 pixels)
    # cannot wrap around when the difference is negative.
    traindata = np.asarray(traindata, dtype=float)
    data = np.asarray(data, dtype=float)

    datasize = traindata.shape[0]
    if datasize == 0:
        raise ValueError("traindata must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    k = min(k, datasize)

    # Broadcasting subtracts ``data`` from every row; no tiled copy is needed.
    diff = data - traindata
    distance = ((diff ** 2).sum(axis=1)) ** 0.5
    sortedDist = distance.argsort()

    classCount = {}
    for neighbour in sortedDist[:k]:
        votedLabel = labels[neighbour]
        classCount[votedLabel] = classCount.get(votedLabel, 0) + 1

    # max() returns the first maximal key in insertion order, i.e. the tied
    # label that was voted for by the nearest neighbour.
    return max(classCount, key=classCount.get)
在《Machine Learning in Action》这本书中,图像以matrix的形式存储在.txt文件中,作者直接读取.txt文件并将其转换成vector。
本文则先将书中的.txt文件读取出来并保存成.jpg格式的image,再通过OpenCV从image中读取用于训练的vector。
# Directories with the text-encoded digits, paired with the directories that
# hold their .jpg counterparts.
trainingDataPath = "./digits/trainingDigits"
trainingImgDir = "./images/trainingData"
testingDataPath = "./digits/testDigits"
testingImgDir = "./images/testingData"

# File names found in each text directory, in sorted order. The listings are
# taken once, when this module-level code runs.
trainingDir, testingDir = (
    sorted(os.listdir(folder))
    for folder in (trainingDataPath, testingDataPath)
)
def txt2matrix(path, txtDir, saveDir):
    """Convert text-encoded digit images into .jpg files.

    Every file named in ``txtDir`` is read from ``path``. Each character of
    each line is parsed as an integer, multiplied by 255 and used as one
    pixel. The image is written to ``saveDir`` under the same file name with
    its last four characters (the ".txt" suffix) replaced by ".jpg".

    Args:
        path: Directory containing the text files.
        txtDir: Iterable of text file names inside ``path``.
        saveDir: Directory that receives the images; created if missing.

    Raises:
        ValueError: If a line is shorter than the first line of its file or
            contains a character that is not a digit.
        OSError: If OpenCV reports that an image could not be written.
    """
    # Make sure the target exists up front; a failed cv2.imwrite only
    # returns False, which the original code never looked at.
    os.makedirs(saveDir, exist_ok=True)

    for txt in txtDir:
        # The context manager closes the handle even if parsing fails.
        with open(os.path.join(path, txt)) as file:
            rows = [line.rstrip("\r\n") for line in file]
        # Blank lines carry no pixels (typically a trailing newline).
        rows = [row for row in rows if row]
        if not rows:
            continue  # empty file: nothing to convert

        # Width comes from the stripped first line, so it no longer depends
        # on that line ending with a newline character.
        width = len(rows[0])
        image = np.zeros((len(rows), width))
        for i, row in enumerate(rows):
            if len(row) < width:
                raise ValueError(
                    "{}: line {} has {} characters, expected {}".format(
                        txt, i + 1, len(row), width))
            # NOTE: the original used eval() on each character; int() parses
            # the same digits without executing file contents as code.
            image[i, :] = [int(ch) * 255 for ch in row[:width]]

        imagePath = os.path.join(saveDir, txt[0:-4] + ".jpg")
        if not cv2.imwrite(imagePath, image):
            raise OSError("could not write image {}".format(imagePath))
# Convert both data sets from text files to .jpg images: training set first,
# then the test set.
txt2matrix(path=trainingDataPath, txtDir=trainingDir, saveDir=trainingImgDir)
txt2matrix(path=testingDataPath, txtDir=testingDir, saveDir=testingImgDir)
def img2mat(imgPath):
    """Load every readable image in a directory as one flattened sample.

    Files are visited in sorted name order. Each image is read as grayscale
    and flattened into a row of 1024 values; its label is the integer in
    front of the first "_" of the file name.

    Files that OpenCV cannot decode are left out of the result entirely.
    (Previously such a file left behind an all-zero row labelled 0, which
    silently added a bogus sample to the data set.)

    Args:
        imgPath: Directory containing the image files.

    Returns:
        A tuple ``(imgMat, labelsVector)`` of float arrays with shapes
        ``(n, 1024)`` and ``(n,)``, where ``n`` is the number of images that
        were loaded.

    Raises:
        ValueError: If an image does not have exactly 1024 pixels, or if the
            name of a readable image does not start with an integer label.
    """
    pixelCount = 1024
    samples = []
    sampleLabels = []

    for imgFile in sorted(os.listdir(imgPath)):
        fullPath = os.path.join(imgPath, imgFile)
        try:
            # Flag 0 is cv2.IMREAD_GRAYSCALE.
            img = cv2.imread(fullPath, 0)
        except cv2.error:
            img = None
        if img is None:
            # imread signals an unreadable file by returning None.
            continue
        if img.size != pixelCount:
            raise ValueError(
                "{} has {} pixels, expected {}".format(
                    fullPath, img.size, pixelCount))
        sampleLabels.append(int(imgFile.split("_")[0]))
        samples.append(img.reshape(pixelCount))

    # Allocate the outputs as float arrays, the same dtype np.zeros gave
    # the original, and keep a well-formed (0, 1024) shape when empty.
    imgMat = np.zeros((len(samples), pixelCount))
    labelsVector = np.zeros(len(samples))
    if samples:
        imgMat[:] = samples
        labelsVector[:] = sampleLabels
    return imgMat, labelsVector
def dataTest1():
    """Print the error rate of the 5-nearest-neighbour classifier.

    Loads the training and test samples from ``trainingImgDir`` and
    ``testingImgDir``, classifies every test sample against the training
    set with k=5, and prints the fraction of test samples whose predicted
    label differs from the label returned by the loader.
    """
    trainSamples, trainLabels = img2mat(trainingImgDir)
    testSamples, testLabels = img2mat(testingImgDir)

    # Count one miss for every prediction that disagrees with its label.
    misses = sum(
        1
        for sample, truth in zip(testSamples, testLabels)
        if classfier0(trainSamples, trainLabels, sample, 5) != truth
    )
    print("error rate is :{}".format(misses / len(testLabels)))
|