使用 k 近鄰算法改進網站的配對效果
說明:
將數據集文件 ‘datingTestSet2.txt’ 放在當前文件夾
# 導入程序所需要的模塊
import numpy as np
import operator
定義數據集導入函數
file2matrix函數實現的功能是讀取文件數據,函數返回的returnMat和classLabelVector分別是數據集的特徵矩陣和輸出標籤向量。
def file2matrix(filename):
love_dictionary = {'largeDoses':3, 'smallDoses':2, 'didntLike':1} # 三個類別
# 打開文件
fr = open(filename)
# 逐行打開,readlines()方法用於讀取所有行(直到結束符 EOF)並返回列表,該列表可以由 Python 的 for... in ... 結構進行處理。
arrayOLines = fr.readlines()
#得到文件的行數
numberOfLines = len(arrayOLines)
#返回numpy矩陣,numberOfLines行,3列的零元素矩陣//初始化特徵矩陣
returnMat = np.zeros((numberOfLines, 3))
#返回分類的標籤向量//初始化輸出標籤向量
classLabelVector = []
#行的索引值
index = 0
for line in arrayOLines:
# 刪去字符串首尾空格
line = line.strip()
# 按'\t'對字符串進行分割,listFromLine 是列表
listFromLine = line.split('\t')
#將數據前三列提取出來,存放到returnMat的numpy矩陣中,也就是不含標籤變量,只有特徵變量。
# listFromLine的0,1,2元素是特徵,賦值給returnMat的當前行
#一行一行的存儲
returnMat[index, :] = listFromLine[0:3]
# 如果listFromLine最後一個元素是數字
if(listFromLine[-1].isdigit()):
# 直接賦值給classLabelVector
classLabelVector.append(int(listFromLine[-1]))
else:
# 如果listFromLine最後一個元素不是數字,而是字符串。根據字典love_dictionary轉化爲數字
# Python 字典(Dictionary) get() 函數返回指定鍵的值
classLabelVector.append(love_dictionary.get(listFromLine[-1]))
index += 1
return returnMat, classLabelVector # 返回的類別標籤classLabelVector是1,2,3
returnMat,classLabelVector=file2matrix('datingTestSet2.txt')
print(returnMat)
print(classLabelVector)
print(returnMat.shape)
[[4.0920000e+04 8.3269760e+00 9.5395200e-01]
[1.4488000e+04 7.1534690e+00 1.6739040e+00]
[2.6052000e+04 1.4418710e+00 8.0512400e-01]
...
[2.6575000e+04 1.0650102e+01 8.6662700e-01]
[4.8111000e+04 9.1345280e+00 7.2804500e-01]
[4.3757000e+04 7.8826010e+00 1.3324460e+00]]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 2, 3, 3, 3, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 3, 2, 2, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 1, 2, 2, 1, 1, 3, 3, 1, 2, 1, 3, 3, 2, 1, 1, 3, 1, 2, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 3, 3, 1, 2, 3, 2, 2, 3, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 1, 2, 1, 3, 1, 2, 3, 2, 3, 1, 1, 1, 3, 2, 3, 1, 3, 2, 1, 3, 2, 2, 3, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 3, 3, 3, 3, 2, 1, 1, 1, 2, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 3, 1, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 1, 1, 1, 3, 3, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 3, 1, 3, 2, 2, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 3, 2, 3, 3, 2, 3, 1, 3, 1, 3, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 3, 2, 2, 3, 1, 2, 3, 3, 2, 2, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 1, 3, 3, 3, 3, 2, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 1, 2, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 1, 1, 1, 2, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 3, 3, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 1, 2, 1, 1, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 2, 2, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 2, 2, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 3, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 3, 3, 1, 1, 3, 1, 2, 3, 3, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1, 3, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 3, 3, 1, 3, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 3, 1, 2, 2, 2, 3, 2, 1, 2, 1, 2, 3, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3]
(1000, 3)
定義特徵歸一化函數
這裏爲什麼要對特徵進行歸一化?
因爲在處理這種不同取值範圍的特徵值時,數值歸一化能夠將不同特徵的取值範圍限定在同一區間例如[0,1]之間,讓不同特徵對距離的計算影響相同。具體可看《機器學習實戰》第2.2.3節內容。
#歸一化特徵變量中的數據
#歸一化公式爲 : newValue = (oldvalue-min)/(max-min)
def autoNorm(dataSet):
#獲得數據每一列的最小值和最大值
#b.min(k)就是b.min(axis=k),就是在他的第k個軸上投影求最小
minVals = dataSet.min(0)
maxVals = dataSet.max(0)
#最大值和最小值的範圍
ranges = maxVals - minVals
#創建numpy矩陣,裏面全是零元素
normDataSet = np.zeros(np.shape(dataSet))
#返回dataSet的行數
m = dataSet.shape[0]
#原始值減去最小值。np.tile: 重複n次
normDataSet = dataSet - np.tile(minVals, (m, 1))
#除以最大和最小值的差,得到歸一化的數據
#normDataSet值被限定在[0,1]之間
normDataSet = normDataSet/np.tile(ranges, (m, 1))
return normDataSet, ranges, minVals
normDataSet,ranges,minVals=autoNorm(returnMat)
print(normDataSet)
print(ranges)
print(minVals)
[[0.44832535 0.39805139 0.56233353]
[0.15873259 0.34195467 0.98724416]
[0.28542943 0.06892523 0.47449629]
...
[0.29115949 0.50910294 0.51079493]
[0.52711097 0.43665451 0.4290048 ]
[0.47940793 0.3768091 0.78571804]]
[9.1273000e+04 2.0919349e+01 1.6943610e+00]
[0. 0. 0.001156]
定義k近鄰算法
def classify0(inX, dataSet, labels, k): # inX是測試集,dataSet是訓練集,labels是訓練樣本標籤,k是取的最近鄰個數
#訓練數據集行數,訓練樣本個數
dataSetSize = dataSet.shape[0]
#在行向量方向上重複inx共datasetSize次,在列向量方向上重複inx共1次
diffMat = np.tile(inX,(dataSetSize,1)) - dataSet
#二維特徵相減之後的平方
sqDiffMat = diffMat**2
#sum()所有元素相加,sum(0)列相加,sum(1)行相加
sqDistances = sqDiffMat.sum(axis=1)
#開方,計算出距離。distance是inX與dataSet的歐氏距離
distances = sqDistances**0.5
#argsort函數返回的是數組值從小到大的索引值。即返回distances中元素從小到大排序後的索引值。後續sortedDistIndicies[i]要用到。
sortedDistIndicies = distances.argsort()
#定義一個記錄類別(label)次數的字典,字典存儲k近鄰不同label出現的次數
classCount = {}
#遍歷前k個,記錄次數。選擇距離最小的k個點
for i in range(k):
voteIlabel = labels[sortedDistIndicies[i]]
# classCount中若有此key,對應label加1;classCount中若無此key,則默認爲0
classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
# operator.itemgetter 獲取對象的哪個維度的數據
# key=operator.itemgetter(1)根據字典的值進行排序
# key=operator.itemgetter(0)根據字典的鍵進行排序
# reverse=True降序排列
sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
# 返回k近鄰中所屬類別最多的那一類
return sortedClassCount[0][0]
測試算法:作爲完整程序驗證分類器
def datingClassTest():
#整個數據集的10%用來測試
hoRatio = 0.10
#導入數據集。將返回的特徵矩陣和分類向量分別存儲到datingDataMat, datingLabels
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
#數據歸一化,返回歸一化後的矩陣,數據範圍,數據最小值
normMat, ranges, minVals = autoNorm(datingDataMat)
#樣本個數
m = normMat.shape[0]
#測試樣本的個數
numTestVecs = int(m*hoRatio)
#分類錯誤計數
errorCount = 0.0
for i in range(numTestVecs):
#前numTestVecs個數據作爲測試集,後m-numTestVecs個數據作爲訓練集
classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
if (classifierResult != datingLabels[i]):
errorCount += 1.0
print("the total error rate is: %f" % (errorCount / float(numTestVecs))) # 打印錯誤率
print(errorCount) # 打印錯誤個數
datingClassTest()
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.050000
5.0
使用算法:構建完整可用系統
根據用戶輸入,在線判斷類別。
def classifyPerson():
resultList = ['not at all', 'in small doses', 'in large doses']
percentTats = float(input(\
"percentage of time spent playing video games?"))
ffMiles = float(input("frequent flier miles earned per year?"))
iceCream = float(input("liters of ice cream consumed per year?"))
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
normMat, ranges, minVals = autoNorm(datingDataMat)
inArr = np.array([ffMiles, percentTats, iceCream, ])
classifierResult = classify0((inArr - minVals)/ranges, normMat, datingLabels, 5)
print("You will probably like this person: %s" % resultList[classifierResult - 1])
classifyPerson()
percentage of time spent playing video games?10
frequent flier miles earned per year?100
liters of ice cream consumed per year?1
You will probably like this person: in small doses