Machine Learning in Action: Notes

2 The k-Nearest Neighbors (kNN) Algorithm

2.1 Implementing the kNN Algorithm

Code Listing 1:

'''
Author: Solarzhou
Email: [email protected]
'''
from numpy import *
import operator
def createDataSet():
    group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    # Get the number of rows in the dataset, i.e. how many vectors it holds
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX, (dataSetSize,1)) - dataSet
    sqDiffMat = diffMat ** 2
    # Sum across each row (axis=1)
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances**0.5

    # argsort() sorts the distances ascending and returns indices into the original array;
    # the smallest distance means the closest neighbor
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
    # sorted() here sorts in descending order (reverse=True);
    # the key argument sorts the (label, count) pairs by their count values
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    print(sortedClassCount)
    return sortedClassCount[0][0]
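
The tile/subtract/square/sum/sqrt sequence above is just the Euclidean distance from inX to every row of dataSet, computed in one vectorized pass:

$$d(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}$$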

Test run and results:

from LearningSpark import KNN
group, labels = KNN.createDataSet()
KNN.classify0([0, 0], group, labels, 3)
[('B', 2), ('A', 1)]
'B'
KNN.classify0([1, 1], group, labels, 3)
[('A', 2), ('B', 1)]
'A'

2.2 Using kNN to Improve Matches on a Dating Site

2.2.1 Preparing the Data: Parsing Data from a Text File

# Parser that converts text records into a NumPy matrix
def file2matrix(filename):
    fr = open(filename)
    numberOfLines = len(fr.readlines())         #get the number of lines in the file
    returnMat = zeros((numberOfLines,3))        #prepare matrix to return
    classLabelVector = []                       #prepare labels return
    fr = open(filename)                         # reopen: readlines() above consumed the file
    index = 0
    for line in fr.readlines():
        line = line.strip()
        listFromLine = line.split('\t')
        returnMat[index,:] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat,classLabelVector
  • Test results
    Note: the book's source code has an error here.
    1) When reading the text file it should be datingTestSet2.txt (its labels are already integers); otherwise you get the error:

ValueError: invalid literal for int() with base 10: ''
importlib.reload(KNN)
<module 'LearningSpark.KNN' from 'E:\\Users\\Administrator\\PycharmProjects\\TestCases\\LearningSpark\\KNN.py'>
datingDataMat, datingLabels = KNN.file2matrix('LearningSpark/Ch02/datingTestSet2.txt')
datingDataMat
array([[4.0920000e+04, 8.3269760e+00, 9.5395200e-01],
       [1.4488000e+04, 7.1534690e+00, 1.6739040e+00],
       [2.6052000e+04, 1.4418710e+00, 8.0512400e-01],
       ...,
       [2.6575000e+04, 1.0650102e+01, 8.6662700e-01],
       [4.8111000e+04, 9.1345280e+00, 7.2804500e-01],
       [4.3757000e+04, 7.8826010e+00, 1.3324460e+00]])
datingLabels[0:20]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]

2.2.2 Analyzing the Data: Creating Scatter Plots with Matplotlib

  • Plot the raw data as a scatter plot with Matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:,1], datingDataMat[:,2])
<matplotlib.collections.PathCollection object at 0x00000245657F1C88>
plt.show()
  • Use the class labels in datingLabels to draw the points with different colors and sizes
import numpy as np
ax.scatter(datingDataMat[:,0], datingDataMat[:,1],
           15.0*np.array(datingLabels), 15.0*np.array(datingLabels))
<matplotlib.collections.PathCollection object at 0x000002456E563EB8>
plt.show()

Resulting plot: (screenshot lost; the external image link is broken)

2.2.3 Preparing the Data: Normalizing Numeric Values

Normalizing the feature values. The flight-miles column is orders of magnitude larger than the other two features, so without rescaling it would dominate the distance calculation; each feature is therefore mapped into the range [0, 1].
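
The rescaling autoNorm applies below is the standard min-max formula:

$$\text{newValue} = \frac{\text{oldValue} - \text{min}}{\text{max} - \text{min}}$$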

# Normalize feature values into [0, 1]
def autoNorm(dataSet):
    minVals = dataSet.min(0) # min(0) takes the minimum of each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet/tile(ranges, (m, 1))
    return normDataSet, ranges, minVals

Test run:

normMat, ranges, minVals = KNN.autoNorm(datingDataMat)
normMat
array([[0.44832535, 0.39805139, 0.56233353],
       [0.15873259, 0.34195467, 0.98724416],
       [0.28542943, 0.06892523, 0.47449629],
       ...,
       [0.29115949, 0.50910294, 0.51079493],
       [0.52711097, 0.43665451, 0.4290048 ],
       [0.47940793, 0.3768091 , 0.78571804]])
ranges
array([9.1273000e+04, 2.0919349e+01, 1.6943610e+00])
len(ranges)
3

2.2.4 Testing the Algorithm: Verifying the Classifier as a Complete Program

Test code for the dating-site classifier

# Dating-site classifier test code
def datingClassTest():
    hoRatio = 0.10   # hold out 10% of the data as the test set
    datingDataMat, datingLables = file2matrix('LearningSpark/Ch02/datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # number of test vectors; rows from numTestVecs onward form the training set
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i,:], normMat[numTestVecs:m,:],
                                     datingLables[numTestVecs:m], 5)
        print("the classifier come back with: %d, the real answer is: %d"
              %(classifierResult, datingLables[i]))
        if (classifierResult != datingLables[i]):
            errorCount += 1.0

    print('the total error rate is:%f'%(errorCount/float(numTestVecs)))
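
To run the holdout test from the REPL (assuming the KNN module import from the earlier sessions):

KNN.datingClassTest()

Each loop iteration prints the predicted and actual labels, and the total error rate is printed at the end.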

Test results: (screenshot lost; the external image link is broken)

2.3 Using the Algorithm: Building a Complete Working System

2.3.1 Preparing the Data: Converting Images into Test Vectors

Converting an image into a test vector. Each digit is stored as a 32×32 text grid of 0s and 1s, which img2vector flattens into a 1×1024 vector:

# Handwriting recognition system
# Convert a 32x32 text image into a 1x1024 vector
def img2vector(filename):
    returnVect = zeros((1,1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0,32*i+j] = int(lineStr[j])
    return returnVect

Test results:

testVector = KNN.img2vector('LearningSpark/Ch02/digits/testDigits/0_13.txt')
testVector[0,0:31]
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

2.3.2 Testing the Algorithm: Recognizing Handwritten Digits with kNN

Test code for the handwritten-digit recognition system.
It needs listdir to read the directory contents, so make sure from os import listdir appears at the start of the file.

# Test code for the handwritten-digit recognition system
def handwritingClassTest():
    hwLabels = []
    trainingFileList = listdir('LearningSpark/Ch02/digits/trainingDigits')           #load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]     #take off .txt
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i,:] = img2vector('LearningSpark/Ch02/digits/trainingDigits/%s' % fileNameStr)
    testFileList = listdir('LearningSpark/Ch02/digits/testDigits')        #iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]     #take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('LearningSpark/Ch02/digits/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if (classifierResult != classNumStr): errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount/float(mTest)))

Test results: (screenshot lost)

3 Decision Trees

3.1 Constructing a Decision Tree

First we need to understand information entropy, conditional entropy, and information gain.
Information entropy measures the complexity (uncertainty) of a random variable;
conditional entropy measures that complexity (uncertainty) once a particular condition is known;
information gain = entropy − conditional entropy, i.e. how much the uncertainty decreases under a given condition.
Reference: a Zhihu column on the topic.
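
In the standard formulation, if the class label X takes values x_i with probabilities p(x_i):

$$H(X) = -\sum_i p(x_i) \log_2 p(x_i)$$

$$H(X \mid Y) = \sum_j p(y_j)\, H(X \mid Y = y_j)$$

$$IG(X, Y) = H(X) - H(X \mid Y)$$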

3.1.1 Information Gain

The change in information before and after splitting the dataset is called the information gain.
Below we compute the Shannon entropy of a given dataset.

# Shannon entropy of the class labels in dataset
# math.log is required here: numpy's log (pulled in by "from numpy import *") takes no base argument
from math import log

def calcShannonEnt(dataset):
    numEntries = len(dataset)
    labelCounts = {}
    for feaVec in dataset:
        currentLabel = feaVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]/numEntries)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

Create our own sample dataset

# Create our own sample dataset
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

Test results:

myDat, labels = treeCopy.createDataSet()
myDat
[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
treeCopy.calcShannonEnt(myDat)
0.9709505944546686
myDat[0][-1] = 'maybe'
myDat
[[1, 1, 'maybe'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
treeCopy.calcShannonEnt(myDat)
1.3709505944546687

Notice that the more classes there are, the higher the entropy; as a splitting criterion this means information gain is naturally biased toward attributes with many distinct values.
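
Both REPL values can be checked by hand against the entropy formula. With label counts {yes: 2, no: 3}:

$$H = -\tfrac{2}{5}\log_2\tfrac{2}{5} - \tfrac{3}{5}\log_2\tfrac{3}{5} \approx 0.971$$

After relabeling one example as 'maybe', the counts become {maybe: 1, yes: 1, no: 3}:

$$H = -2\cdot\tfrac{1}{5}\log_2\tfrac{1}{5} - \tfrac{3}{5}\log_2\tfrac{3}{5} \approx 1.371$$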

3.1.2 Splitting the Dataset

  • Split the dataset on a given feature
def splitDataSet(dataSet, axis, value):
    '''
    :param dataSet: the dataset to be split
    :param axis: index of the feature to split on
    :param value: the feature value to match
    Returns the matching rows with that feature column removed.
    :return:
    '''
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

Test results: (screenshot lost)
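
The lost screenshot is easy to reproduce. With the sample dataset above, the splits should come out as follows (a reconstruction of the session, not the original screenshot):

myDat, labels = treeCopy.createDataSet()
treeCopy.splitDataSet(myDat, 0, 1)
[[1, 'yes'], [1, 'yes'], [0, 'no']]
treeCopy.splitDataSet(myDat, 0, 0)
[[1, 'no'], [1, 'no']]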

  • Choosing the best way to split the dataset
# Choose the best feature to split on
def chooseBestReatureToSplit(dataSet):
    numFeatures = len(dataSet[0])-1
    baseEntroy = calcShannonEnt(dataSet)
    print('baseEntroy:',baseEntroy)
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntroy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prop = len(subDataSet)/float(len(dataSet))
            # this accumulates the conditional entropy
            newEntroy += prop * calcShannonEnt(subDataSet)
            print("newEntroy:", newEntroy)
        infoGain = baseEntroy - newEntroy
        print('infoGain:', infoGain)
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

Here baseEntroy, newEntroy, and infoGain are printed out so the intermediate values are easy to inspect.
Test results: (screenshot lost)
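
With the sample dataset the choice should come out as feature 0 ('no surfacing'), whose information gain (about 0.420) beats feature 1's (about 0.171). A reconstruction of the call, with the debug prints omitted:

myDat, labels = treeCopy.createDataSet()
treeCopy.chooseBestReatureToSplit(myDat)
0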

3.1.3 Building the Decision Tree Recursively

Recursion is genuinely hard to trace through, and convincing yourself it is right takes effort.
A Python book I once read offered this advice on recursion: believe that what you wrote is correct, and it is correct.

# Build the decision tree recursively
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # stop if every example has the same class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # no features left to split on: fall back to a majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestReatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    bestValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(bestValues)
    for value in uniqueVals:
        # copy labels so the recursive calls cannot mutate the caller's list
        subLables = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLables)
    return myTree
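
createTree falls back to majorityCnt when the features are exhausted, but that helper is not listed in these notes. A minimal sketch, mirroring the vote counting in classify0 (my reconstruction of the book's helper, not code taken from this post):

import operator

# Return the class label that occurs most often in classList
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]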

3.3 Testing and Storing the Classifier

3.3.1 Testing the Algorithm: Classifying with the Decision Tree

Building the decision-tree classification function

# Classification function that walks the decision tree
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # translate the node's feature label into an index into testVec
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                # internal node: recurse into the subtree
                classLable = classify(secondDict[key], featLabels, testVec)
            else:
                # leaf node: this is the predicted label
                classLable = secondDict[key]
    return classLable

測試結果:

myDat, labels = treeCopy.createDataSet()
labels
['no surfacing', 'flippers']
myTree = treePlotterCopy.retrieveTree(0)
myTree
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
treeCopy.classify(myTree, labels, [1, 0])
'no'
treeCopy.classify(myTree, labels, [1, 1])
'yes'

3.3.2 Storing the Decision Tree

# Store the decision tree with the pickle module
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb+')
    pickle.dump(inputTree, fw, 0)
    fw.close()
def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')
    readTree = pickle.load(fr)
    fr.close()
    return readTree

**Note:** writing the file the way the book does raises an error under Python 3, because pickle writes bytes; here we open the file in binary mode instead: fw = open(filename, 'wb+')
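
A quick round-trip check (assuming myTree is the tree from section 3.3.1; the filename is just an example):

treeCopy.storeTree(myTree, 'classifierStorage.txt')
treeCopy.grabTree('classifierStorage.txt')
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}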

3.4 Example: Predicting Contact Lens Type with a Decision Tree

fr = open('LearningSpark/Ch03/lenses.txt')
lenses = [inst.strip().split('\t') for inst in fr.readlines()]
len(lenses)
24
lenses
[['young', 'myope', 'no', 'reduced', 'no lenses'], ['young', 'myope', 'no', 'normal', 'soft'], ['young', 'myope', 'yes', 'reduced', 'no lenses'], ['young', 'myope', 'yes', 'normal', 'hard'], ['young', 'hyper', 'no', 'reduced', 'no lenses'], ['young', 'hyper', 'no', 'normal', 'soft'], ['young', 'hyper', 'yes', 'reduced', 'no lenses'], ['young', 'hyper', 'yes', 'normal', 'hard'], ['pre', 'myope', 'no', 'reduced', 'no lenses'], ['pre', 'myope', 'no', 'normal', 'soft'], ['pre', 'myope', 'yes', 'reduced', 'no lenses'], ['pre', 'myope', 'yes', 'normal', 'hard'], ['pre', 'hyper', 'no', 'reduced', 'no lenses'], ['pre', 'hyper', 'no', 'normal', 'soft'], ['pre', 'hyper', 'yes', 'reduced', 'no lenses'], ['pre', 'hyper', 'yes', 'normal', 'no lenses'], ['presbyopic', 'myope', 'no', 'reduced', 'no lenses'], ['presbyopic', 'myope', 'no', 'normal', 'no lenses'], ['presbyopic', 'myope', 'yes', 'reduced', 'no lenses'], ['presbyopic', 'myope', 'yes', 'normal', 'hard'], ['presbyopic', 'hyper', 'no', 'reduced', 'no lenses'], ['presbyopic', 'hyper', 'no', 'normal', 'soft'], ['presbyopic', 'hyper', 'yes', 'reduced', 'no lenses'], ['presbyopic', 'hyper', 'yes', 'normal', 'no lenses']]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = treeCopy.createTree(lenses, lensesLabels)
lensesTree
{'tearRate': {'normal': {'astigmatic': {'no': {'age': {'pre': 'soft', 'presbyopic': {'prescript': {'myope': 'no lenses', 'hyper': 'soft'}}, 'young': 'soft'}}, 'yes': {'prescript': {'myope': 'hard', 'hyper': {'age': {'pre': 'no lenses', 'presbyopic': 'no lenses', 'young': 'hard'}}}}}}, 'reduced': 'no lenses'}}
treePlotterCopy.createPlot(lensesTree)

Test results: (screenshot lost)

4 Naive Bayes & 5 Logistic Regression

Covered in a separate set of notes: Machine Learning in Action – Naive Bayes & Logistic Regression.
