文章目錄
2 k近鄰算法
2.1 實施kNN算法
代碼清單1:
'''
Author: Solarzhou
Email: [email protected]
'''
from numpy import *
import operator
def createDataSet():
    """Return a tiny hand-made training set for trying out the kNN classifier.

    Returns:
        tuple: ``(group, labels)``. ``group`` is a 4x2 NumPy array with one
        sample point per row; ``labels`` lists the class label of each row,
        in the same order.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean. The vote tally is printed before returning.

    Args:
        inX: Feature vector to classify (array-like).
        dataSet: Training samples, one sample per row (array-like, 2-D).
        labels: Class label of each training row, in row order.
        k: Number of nearest neighbours that vote. If ``k`` is larger than
            the number of training rows, every row votes.

    Returns:
        The label with the most votes. When counts tie, the label that was
        voted for first (i.e. by the nearer neighbour) wins, because
        ``sorted`` is stable.

    Raises:
        ValueError: If ``k`` is less than 1 or ``dataSet`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    dataSet = asarray(dataSet)
    # Number of training vectors in the data set.
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")

    # Broadcasting subtracts every row from inX; no tiled copy is needed.
    diffMat = asarray(inX) - dataSet
    # Sum the squared differences along each row, then take the root.
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5

    # Row indices ordered from the smallest distance (closest) upwards.
    sortedDistIndicies = distances.argsort()

    classCount = {}
    # Slicing (rather than indexing 0..k-1) tolerates k > dataSetSize.
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Sort the (label, votes) pairs by vote count, highest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    # Diagnostic output: shows the full vote tally to the caller's console.
    print(sortedClassCount)
    return sortedClassCount[0][0]
測試,結果:
from LearningSpark import KNN
group, labels = KNN.createDataSet()
KNN.classify0([0, 0], group, labels, 3)
[('B', 2), ('A', 1)]
'B'
KNN.classify0([1, 1], group, labels, 3)
[('A', 2), ('B', 1)]
'A'
2.2使用kNN改進約會網站的配對效果
2.2.1 準備數據,從文本中解析數據
# 將文本記錄轉換爲NumPy的解析程序
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Each non-blank line supplies one sample: its first three fields are the
    numeric features and its last field is an integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)``. ``returnMat`` is a NumPy
        float array with one row of three features per sample;
        ``classLabelVector`` is the list of integer labels in the same order.

    Raises:
        ValueError: If a feature is not numeric or a label is not an integer.
    """
    # Read the file once and close it promptly; blank lines are skipped.
    with open(filename) as fr:
        records = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(records):
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
- 測試結果
注意:書中的源碼在這裏有錯誤。
1)在讀取文本文件時,應該是datingTestSet2.txt
,否則會報錯:
int() invalid literal for int() with base 10: ''
importlib.reload(KNN)
<module 'LearningSpark.KNN' from 'E:\\Users\\Administrator\\PycharmProjects\\TestCases\\LearningSpark\\KNN.py'>
datingDataMat, datingLabels = KNN.file2matrix('LearningSpark/Ch02/datingTestSet2.txt')
datingDataMat
array([[4.0920000e+04, 8.3269760e+00, 9.5395200e-01],
[1.4488000e+04, 7.1534690e+00, 1.6739040e+00],
[2.6052000e+04, 1.4418710e+00, 8.0512400e-01],
...,
[2.6575000e+04, 1.0650102e+01, 8.6662700e-01],
[4.8111000e+04, 9.1345280e+00, 7.2804500e-01],
[4.3757000e+04, 7.8826010e+00, 1.3324460e+00]])
datingLabels[0:20]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]
2.2.2 分析數據: 使用Matplotlib創建散點圖
- 使用
Matplotlib
製作原始數據的散點圖
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:,1], datingDataMat[:,2])
<matplotlib.collections.PathCollection object at 0x00000245657F1C88>
plt.show()
- 利用
datingLabels
中的類標籤屬性,在散點圖上繪製色彩不等的點
ax.scatter(datingDataMat[:,0], datingDataMat[:,1],
15.0*np.array(datingLabels), 15.0*np.array(datingLabels))
<matplotlib.collections.PathCollection object at 0x000002456E563EB8>
plt.show()
測試效果圖:
2.2.3 準備數據:歸一化數值
歸一化特徵值
# 歸一化特徵值
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly into the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)``: the rescaled array, the
        per-column ``max - min`` and the per-column minimum.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # A constant column has range 0; divide by 1 there so it maps to 0
    # instead of producing NaN. The returned ``ranges`` is left untouched.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
運行測試:
normMat, ranges, minVals = KNN.autoNorm(datingDataMat)
normMat
array([[0.44832535, 0.39805139, 0.56233353],
[0.15873259, 0.34195467, 0.98724416],
[0.28542943, 0.06892523, 0.47449629],
...,
[0.29115949, 0.50910294, 0.51079493],
[0.52711097, 0.43665451, 0.4290048 ],
[0.47940793, 0.3768091 , 0.78571804]])
ranges
array([9.1273000e+04, 2.0919349e+01, 1.6943610e+00])
len(ranges)
3
測試算法:作爲完整程序驗證
分類器針對約會網站的測試代碼
# 測試代碼
def datingClassTest(filename='LearningSpark/Ch02/datingTestSet2.txt',
                    hoRatio=0.10, k=5):
    """Hold out the first part of the dating data and report the kNN error rate.

    The file is parsed with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``hoRatio`` fraction of rows is then classified with
    ``classify0`` against the remaining rows. Each prediction and the final
    error rate are printed.

    Args:
        filename: Path of the tab-separated data file.
        hoRatio: Fraction of rows (taken from the top) used as test samples.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If ``hoRatio`` selects no test rows at all.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Number of test vectors; the remaining rows act as the training set.
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        raise ValueError("hoRatio selects no test vectors from %d rows" % m)

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier come back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print('the total error rate is:%f' % (errorCount / float(numTestVecs)))
測試結果:
2.3使用算法:構建完整可用系統
2.3.1 準備數據:將圖像轉換爲測試向量
將圖像轉換爲測試向量
# 手寫識別系統
# 將圖像轉換爲測試向量
def img2vector(filename):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row by row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        NumPy float array of shape ``(1, 1024)``.

    Raises:
        IndexError: If a line has fewer than 32 characters.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
測試結果:
testVector = KNN.img2vector('LearningSpark/Ch02/digits/testDigits/0_13.txt')
testVector[0,0:31]
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
2.3.2 測試算法:使用kNN近鄰算法識別手寫數字
手寫數字識別系統的測試代碼。
首先需要導入listdir
,用於讀取文件目錄;因此我們首先要確保from os import listdir
寫在文件的起始部分。
# 手寫數字識別系統的測試代碼
def _digitFromFileName(fileNameStr):
    """Return the integer before the first '_' in a name such as '0_13.txt'."""
    fileStr = fileNameStr.split('.')[0]  # take off .txt
    return int(fileStr.split('_')[0])


def handwritingClassTest(trainingDir='LearningSpark/Ch02/digits/trainingDigits',
                         testDir='LearningSpark/Ch02/digits/testDigits', k=3):
    """Run the kNN handwritten-digit test and print the error statistics.

    Every file in ``trainingDir`` is loaded with ``img2vector`` and labelled
    with the number that precedes the first underscore in its file name.
    Every file in ``testDir`` is then classified with ``classify0`` and
    compared with the number in its own file name.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the test images.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If ``testDir`` contains no files.
    """
    # Imported locally so the function does not depend on a module-level import.
    from os import listdir

    hwLabels = []
    trainingFileList = listdir(trainingDir)  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    testFileList = listdir(testDir)  # iterate through the test set
    mTest = len(testFileList)
    if mTest == 0:
        raise ValueError("no test files found in %s" % testDir)

    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitFromFileName(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
測試結果:
3 決策樹
3.1 決策樹構造
首先我們需要理解 信息熵, 條件信息熵,信息增益。
信息熵是代表隨機變量的複雜度(不確定性);
條件熵代表在某一個條件下,隨機變量的複雜度(不確定度);
信息增益: 信息熵-條件熵。也就是說,信息增益代表了在一個條件下,信息複雜度(不確定性)減少程度。
參考文檔–知乎專欄
3.1.1信息增益
在劃分數據集之前和之後,信息發生的變化稱爲信息增益。
計算給定數據集的香農熵。
# 香農熵
def calcShannonEnt(dataset):
    """Compute the Shannon entropy (base 2) of the class labels in ``dataset``.

    Args:
        dataset: Sequence of feature vectors; the last element of each vector
            is its class label.

    Returns:
        float: ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. An empty data set yields ``0.0``.
    """
    # Local imports keep the function independent of module-level imports.
    from collections import Counter
    from math import log

    numEntries = len(dataset)
    # Tally how often each class label (last column) occurs.
    labelCounts = Counter(feaVec[-1] for feaVec in dataset)

    shannonEnt = 0.0
    for count in labelCounts.values():
        # Convert before dividing so the ratio is never truncated.
        prob = count / float(numEntries)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
創建自己的數據集
# 創建自己的數據集
def createDataSet():
    """Return a small sample data set for the decision-tree examples.

    Returns:
        tuple: ``(dataSet, labels)``. Each row of ``dataSet`` holds two
        feature values followed by a class label (``'yes'`` or ``'no'``);
        ``labels`` names the two feature columns in order.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
測試結果:
myDat, labels = treeCopy.createDataSet()
myDat
[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
treeCopy.calcShannonEnt(myDat)
0.9709505944546686
myDat[0][-1] = 'maybe'
myDat
[[1, 1, 'maybe'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
treeCopy.calcShannonEnt(myDat)
1.3709505944546687
可以看到分類越多,信息熵越大;另外需要注意,以信息增益作爲劃分標準時,會天生偏向選擇取值(分支)較多的屬性。
3.1.2 劃分數據集
- 按照給定的特徵劃分數據集
def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature ``axis`` equals ``value``, minus that column.

    The input rows are not modified; new rows are built for the result.

    :param dataSet: the data set to split (sequence of feature vectors)
    :param axis: index of the feature to split on
    :param value: the feature value a row must have to be kept
    :return: list of the matching rows, each with column ``axis`` removed
    """
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]
測試結果:
- 選擇最好的數據集劃分方式
# 選擇最好的數據集劃分方式
def chooseBestReatureToSplit(dataSet):
    """Pick the feature whose split yields the largest information gain.

    For every feature column the conditional entropy after splitting on that
    feature is computed and subtracted from the entropy of the whole data
    set. The base entropy, each conditional entropy and each gain are
    printed along the way.

    Args:
        dataSet: Non-empty sequence of feature vectors whose last element is
            the class label.

    Returns:
        int: Index of the best feature, or ``-1`` when no feature has an
        information gain greater than zero.
    """
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntroy = calcShannonEnt(dataSet)
    print('baseEntroy:', baseEntroy)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntroy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prop = len(subDataSet) / float(len(dataSet))
            # Conditional entropy: subset entropy weighted by subset size.
            newEntroy += prop * calcShannonEnt(subDataSet)
        # NOTE(review): assumed to print once per feature (after the inner
        # loop); the original nesting of this line should be confirmed.
        print("newEntroy:", newEntroy)
        infoGain = baseEntroy - newEntroy
        print('infoGain:', infoGain)
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
這裏我們將 baseEntroy
, newEntroy
, infoGain
打印出來,便於查看測試結果:
3.1.3 遞歸構建決策樹
遞歸調用非常複雜,要想明白挺不容易。
之前看過一本外國人寫的 Python書籍,提到遞歸這一塊,給的建議是:相信你寫的是正確的,那就是正確的。
# 創建樹
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    The result has the form ``{feature_label: {feature_value: subtree}}``,
    where a subtree is either another such dictionary or a class label.
    ``labels`` is not modified.

    Args:
        dataSet: Non-empty sequence of feature vectors whose last element is
            the class label.
        labels: Names of the feature columns, in column order.

    Returns:
        A class label when the branch is a leaf, otherwise a nested dict.
    """
    classList = [example[-1] for example in dataSet]
    # Every sample has the same class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left, so there is nothing more to split on.
    # NOTE(review): majorityCnt is not defined in this section; presumably it
    # returns the most frequent class label. Verify against its definition.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestReatureToSplit(dataSet)
    # -1 means no feature improves on the current entropy; using it as an
    # index would split on the class column, so fall back to a majority leaf.
    if bestFeat == -1:
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Build a new label list instead of deleting from the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
3.3 測試和存儲分類器
3.3.1 測試算法:使用決策樹執行分類
構建決策樹分類函數
# 使用決策樹的分類函數
def classify(inputTree, featLabels, testVec):
    """Classify ``testVec`` by walking a nested-dictionary decision tree.

    Args:
        inputTree: Tree of the form
            ``{feature_label: {feature_value: subtree_or_class_label}}``.
        featLabels: Feature names; the position of a name is the index of
            that feature in ``testVec``.
        testVec: Feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that ``testVec`` reaches.

    Raises:
        ValueError: If the tree's feature label is missing from
            ``featLabels`` or the tree has no branch for the sample's value.
    """
    firstStr = list(inputTree.keys())[0]  # the feature tested at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    if featValue not in secondDict:
        raise ValueError("no branch for value %r of feature %r"
                         % (featValue, firstStr))
    branch = secondDict[featValue]
    # A dict is an inner node to descend into; anything else is a leaf label.
    if isinstance(branch, dict):
        return classify(branch, featLabels, testVec)
    return branch
測試結果:
myDat, labels = treeCopy.createDataSet()
labels
['no surfacing', 'flippers']
myTree = treePlotterCopy.retrieveTree(0)
myTree
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
treeCopy.classify(myTree, labels, [1, 0])
'no'
treeCopy.classify(myTree, labels, [1, 1])
'yes'
3.3.2 決策樹存儲
# 使用 pickle 模塊存儲決策樹
def storeTree(inputTree, filename):
    """Serialise ``inputTree`` to ``filename`` with pickle protocol 0.

    Args:
        inputTree: The object (decision tree) to store.
        filename: Path of the file to write; an existing file is overwritten.
    """
    import pickle

    # Pickle data is bytes, so the file must be opened in binary mode. The
    # context manager closes it even if dumping fails.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw, 0)
def grabTree(filename):
    """Load and return an object previously pickled to ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    import pickle

    # WARNING: unpickling can execute arbitrary code. Only load files that
    # come from a trusted source.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
**注意:**按書中的方式寫入文件會報錯,我們這裏指定格式爲:fw = open(filename, 'wb+')
3.4 示例:使用決策樹預測隱形眼鏡類型
fr = open('LearningSpark/Ch03/lenses.txt')
lenses = [inst.strip().split('\t') for inst in fr.readlines()]
len(lenses)
24
lenses
[['young', 'myope', 'no', 'reduced', 'no lenses'], ['young', 'myope', 'no', 'normal', 'soft'], ['young', 'myope', 'yes', 'reduced', 'no lenses'], ['young', 'myope', 'yes', 'normal', 'hard'], ['young', 'hyper', 'no', 'reduced', 'no lenses'], ['young', 'hyper', 'no', 'normal', 'soft'], ['young', 'hyper', 'yes', 'reduced', 'no lenses'], ['young', 'hyper', 'yes', 'normal', 'hard'], ['pre', 'myope', 'no', 'reduced', 'no lenses'], ['pre', 'myope', 'no', 'normal', 'soft'], ['pre', 'myope', 'yes', 'reduced', 'no lenses'], ['pre', 'myope', 'yes', 'normal', 'hard'], ['pre', 'hyper', 'no', 'reduced', 'no lenses'], ['pre', 'hyper', 'no', 'normal', 'soft'], ['pre', 'hyper', 'yes', 'reduced', 'no lenses'], ['pre', 'hyper', 'yes', 'normal', 'no lenses'], ['presbyopic', 'myope', 'no', 'reduced', 'no lenses'], ['presbyopic', 'myope', 'no', 'normal', 'no lenses'], ['presbyopic', 'myope', 'yes', 'reduced', 'no lenses'], ['presbyopic', 'myope', 'yes', 'normal', 'hard'], ['presbyopic', 'hyper', 'no', 'reduced', 'no lenses'], ['presbyopic', 'hyper', 'no', 'normal', 'soft'], ['presbyopic', 'hyper', 'yes', 'reduced', 'no lenses'], ['presbyopic', 'hyper', 'yes', 'normal', 'no lenses']]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = treeCopy.createTree(lenses, lensesLabels)
lensesTree
{'tearRate': {'normal': {'astigmatic': {'no': {'age': {'pre': 'soft', 'presbyopic': {'prescript': {'myope': 'no lenses', 'hyper': 'soft'}}, 'young': 'soft'}}, 'yes': {'prescript': {'myope': 'hard', 'hyper': {'age': {'pre': 'no lenses', 'presbyopic': 'no lenses', 'young': 'hard'}}}}}}, 'reduced': 'no lenses'}}
treePlotterCopy.createPlot(lensesTree)
測試結果: