相關的源代碼和數據都在以下 GitHub 連結:
https://github.com/Sangewang/MacEnvPython/tree/master/July/MachineLearn/Knn
#coding=utf-8
import numpy as np
import operator
from os import listdir
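# intX is the single test vector to classify, dataSet is the training matrix (one sample per row), labels holds the training labels, and k is the number of nearest neighbours that vote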
def classifyKnn(intX,dataSet,labels,k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between intX and every row of dataSet.

    Args:
        intX: The sample to classify; either a flat feature vector or a
            single-row matrix whose length matches the columns of dataSet.
        dataSet: Training samples, one sample per row.
        labels: Training labels; labels[i] is the label of dataSet[i].
        k: Number of nearest neighbours that vote. A value larger than the
            number of training rows is clamped to that number.

    Returns:
        The label that received the most votes. With insertion-ordered dicts
        (Python 3.7+), a tie goes to the tied label whose first vote came
        from the closest neighbour.

    Raises:
        ValueError: If dataSet has no rows or k is smaller than 1.
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one training sample")
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    # Asking for more neighbours than exist would index past the end of the
    # sorted distances, so vote with every training sample instead.
    k = min(k, dataSetSize)

    # Broadcasting subtracts intX from every training row without first
    # materialising dataSetSize copies of it.
    diffMat = np.asarray(intX) - dataSet
    # axis=1 sums across the columns of each row: one squared distance per sample.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5

    # argsort returns the indices that would sort the distances, e.g. for
    # distances [4, 3, 5, 2] it yields [3, 1, 0, 2]: training row 3 is the
    # closest, row 1 the next closest, and so on.
    nearestIndices = distances.argsort()[:k]

    classCount = {}
    for index in nearestIndices:
        voteLabel = labels[index]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() keeps the first maximal entry it encounters, matching a strict
    # "greater than" scan over the vote counts.
    return max(classCount, key=classCount.get)
#training 訓練集的每個數字都是一個32*32的二維矩陣
def img2vector(filename):
    """Read a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 lines of the file are read, and the first 32 characters of
    each line are converted with int(); row i of the image fills positions
    32*i .. 32*i+31 of the result.

    Args:
        filename: Path of the text file to read.

    Returns:
        A numpy array of shape (1, 1024) holding the digit values as floats.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character in the image area is not a digit.
    """
    side = 32
    returnVect = np.zeros((1, side * side))
    # The context manager closes the file even when a malformed line raises.
    with open(filename) as fr:
        for i in range(side):
            lineStr = fr.readline()
            rowValues = [int(lineStr[j]) for j in range(side)]
            returnVect[0, side * i:side * (i + 1)] = rowValues
    return returnVect
def _labeledDigitFiles(dirName):
    """Return (label, path) pairs for the digit files found in dirName."""
    labeledFiles = []
    # os.listdir() returns the names of the entries in the given directory.
    for fileNameStr in listdir(dirName):
        # Names look like "1_1.txt" or "7_8.txt": the part before '_' is the
        # digit stored in the file, the part after it is a running index.
        fileStr = fileNameStr.split('.')[0]  # "1_1.txt" -> "1_1"
        try:
            classNum = int(fileStr.split('_')[0])  # "1_1" -> 1
        except ValueError:
            # Entries without a numeric prefix (e.g. ".DS_Store") are not
            # digit samples; skip them instead of aborting the whole run.
            continue
        labeledFiles.append((classNum, '%s/%s' % (dirName, fileNameStr)))
    return labeledFiles


def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit data set.

    Loads every digit file under 'HandTrainingData' as the training set,
    classifies every digit file under 'HandTestData' with k=3, prints each
    prediction next to the true label, and finally prints the number of
    errors and the error rate.
    """
    trainingFiles = _labeledDigitFiles('HandTrainingData')
    hwLabels = [classNum for classNum, _ in trainingFiles]
    # One 1024-element row per training image.
    trainingMat = np.zeros((len(trainingFiles), 1024))
    for i, (_, path) in enumerate(trainingFiles):
        trainingMat[i, :] = img2vector(path)

    testFiles = _labeledDigitFiles('HandTestData')
    errorCount = 0
    for classNumStr, path in testFiles:
        # vectorUnderTest is a 1x1024 vector, i.e. a single test sample.
        vectorUnderTest = img2vector(path)
        classifierResult = classifyKnn(vectorUnderTest, trainingMat, hwLabels, 3)
        print ("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1

    mTest = len(testFiles)
    print ("\nthe total number of errors is: %d" % errorCount)
    # With no test files there is nothing to divide by; report a rate of 0.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print ("\nthe total error rate is: %f" % errorRate)
# Run the handwriting evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    handwritingClassTest()
由於測試集數據比較少,這次運行的分類結果全部正確。