2 k近鄰算法

2.1 實施kNN算法

代碼清單1：

'''
Author: Solarzhou
Email: [email protected]
'''
from numpy import *
import operator
def createDataSet():
    """Return the four-point toy data set used to try out the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample coordinates and ``labels`` is the list of class names
        (``'A'`` or ``'B'``), one entry per row of ``group``.
    """
    # Each sample is written next to its class so the two stay in step.
    samples = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: The vector to classify.
        dataSet: 2-D NumPy array holding one known sample per row.
        labels: Class labels, indexed in step with the rows of ``dataSet``.
        k: How many of the closest samples take part in the vote.

    Returns:
        The label that collected the most votes. The ranked list of
        ``(label, votes)`` pairs is also printed before returning.
    """
    # Repeat inX once per known sample so it can be subtracted row by row.
    rowCount = dataSet.shape[0]
    offsets = tile(inX, (rowCount, 1)) - dataSet
    # Euclidean distance: square, add up each row, take the square root.
    distances = (offsets ** 2).sum(axis=1) ** 0.5

    # argsort yields the row indices ordered from closest to farthest.
    nearestFirst = distances.argsort()
    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1

    # Order the labels by vote count, highest first. The sort is stable, so
    # tied labels keep the order in which they were first encountered.
    ranking = list(votes.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    print(ranking)
    return ranking[0][0]

測試，結果：

from LearningSpark import KNN
group, labels = KNN.createDataSet()
KNN.classify0([0, 0], group, labels, 3)
[('B', 2), ('A', 1)]
'B'
KNN.classify0([1, 1], group, labels, 3)
[('A', 2), ('B', 1)]
'A'

2.2使用kNN改進約會網站的配對效果

2.2.1 準備數據，從文本中解析數據

# 將文本記錄轉換爲NumPy的解析程序
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Every non-blank line supplies one record: its first three fields are
    stored as floats in one row of the matrix and its last field is parsed
    as the integer class label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` NumPy array and ``classLabelVector`` is a list of ``n``
        ints, ``n`` being the number of non-blank lines.

    Raises:
        ValueError: If a feature field is not numeric or the last field of
            a line is not an integer.
    """
    # Read the file once and let the context manager close it, even when
    # parsing fails further down.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    # A trailing empty line is common in data files; it carries no record.
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

測試結果
注意：書中的源碼在這裏有錯誤。
1）在讀取文本文件時，應該是datingTestSet2.txt，否則會報錯：


int()  invalid literal for int() with base 10: ''

importlib.reload(KNN)
<module 'LearningSpark.KNN' from 'E:\\Users\\Administrator\\PycharmProjects\\TestCases\\LearningSpark\\KNN.py'>
datingDataMat, datingLabels = KNN.file2matrix('LearningSpark/Ch02/datingTestSet2.txt')
datingDataMat
array([[4.0920000e+04, 8.3269760e+00, 9.5395200e-01],
       [1.4488000e+04, 7.1534690e+00, 1.6739040e+00],
       [2.6052000e+04, 1.4418710e+00, 8.0512400e-01],
       ...,
       [2.6575000e+04, 1.0650102e+01, 8.6662700e-01],
       [4.8111000e+04, 9.1345280e+00, 7.2804500e-01],
       [4.3757000e+04, 7.8826010e+00, 1.3324460e+00]])
datingLabels[0:20]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]

2.2.2 分析數據：使用Matplotlib創建散點圖

使用Matplotlib製作原始數據的散點圖

import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:,1], datingDataMat[:,2])
<matplotlib.collections.PathCollection object at 0x00000245657F1C88>
plt.show()

利用datingLabels中的類標籤屬性，在散點圖上繪製色彩不等的點

ax.scatter(datingDataMat[:,0], datingDataMat[:,1],
           15.0*np.array(datingLabels), 15.0*np.array(datingLabels)) 
<matplotlib.collections.PathCollection object at 0x000002456E563EB8>
plt.show()

測試效果圖：

2.2.3 準備數據：歸一化數值

歸一化特徵值

# 歸一化特徵值
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to
    all zeros instead of being divided by zero.

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` has
        the same shape as ``dataSet``, ``ranges`` is the per-column
        ``max - min`` (zero for constant columns) and ``minVals`` is the
        per-column minimum.
    """
    minVals = dataSet.min(0)  # axis 0: minimum of each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN/inf.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

運行測試：

normMat, ranges, minVals = KNN.autoNorm(datingDataMat)
normMat
array([[0.44832535, 0.39805139, 0.56233353],
       [0.15873259, 0.34195467, 0.98724416],
       [0.28542943, 0.06892523, 0.47449629],
       ...,
       [0.29115949, 0.50910294, 0.51079493],
       [0.52711097, 0.43665451, 0.4290048 ],
       [0.47940793, 0.3768091 , 0.78571804]])
ranges
array([9.1273000e+04, 2.0919349e+01, 1.6943610e+00])
len(ranges)
3

測試算法：作爲完整程序驗證

分類器針對約會網站的測試代碼

# 測試代碼
def datingClassTest():
    """Estimate the kNN error rate on the dating data with a 10% hold-out.

    Loads and normalises the data set, uses the first 10% of the rows as
    test samples and the remaining rows as the known samples, classifies
    each test row with ``k = 5`` and prints every prediction followed by
    the overall error rate.
    """
    holdOutFraction = 0.10
    features, targets = file2matrix('LearningSpark/Ch02/datingTestSet2.txt')
    scaled, _ranges, _minVals = autoNorm(features)
    totalRows = scaled.shape[0]
    # Number of leading rows set aside as test vectors.
    testCount = int(totalRows * holdOutFraction)
    # Everything after the test rows serves as the reference samples.
    knownSamples = scaled[testCount:totalRows, :]
    knownTargets = targets[testCount:totalRows]

    mistakes = 0.0
    for row in range(testCount):
        predicted = classify0(scaled[row, :], knownSamples, knownTargets, 5)
        print("the classifier come back with: %d, the real answer is: %d"
              % (predicted, targets[row]))
        if predicted != targets[row]:
            mistakes += 1.0

    print('the total error rate is：%f' % (mistakes / float(testCount)))

測試結果：

2.3使用算法：構建完整可用系統

2.3.1 準備數據：將圖像轉換爲測試向量

將圖像轉換爲測試向量

# 手寫識別系統
# 將圖像轉換爲測試向量
def img2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A NumPy array of shape ``(1, 1024)``.

    Raises:
        IndexError: If a line has fewer than 32 characters or the file has
            fewer than 32 lines.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a conversion fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

測試結果：

testVector = KNN.img2vector('LearningSpark/Ch02/digits/testDigits/0_13.txt')
testVector[0,0:31]
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

2.3.2 測試算法：使用kNN近鄰算法識別手寫數字

手寫數字識別系統的測試代碼。
首先需要導入listdir，用來讀取文件目錄；因此我們要確保 from os import listdir 寫在文件的起始部分。

# 手寫數字識別系統的測試代碼
def handwritingClassTest():
    """Run the kNN digit recogniser over the test directory and report errors.

    Every file in the training directory is loaded as one 1024-element row
    of the training matrix, with the digit taken from the part of the file
    name before the underscore. Each file in the test directory is then
    classified with ``k = 3``; every prediction is printed, followed by the
    number of errors and the error rate.
    """
    def digitOf(fileName):
        # "<digit>_<n>.txt": drop the extension, keep what precedes "_".
        return int(fileName.split('.')[0].split('_')[0])

    # Load the training set.
    trainingNames = listdir('LearningSpark/Ch02/digits/trainingDigits')
    trainingMat = zeros((len(trainingNames), 1024))
    hwLabels = []
    for row, name in enumerate(trainingNames):
        hwLabels.append(digitOf(name))
        trainingMat[row, :] = img2vector('LearningSpark/Ch02/digits/trainingDigits/%s' % name)

    # Walk through the test set.
    testNames = listdir('LearningSpark/Ch02/digits/testDigits')
    mistakes = 0.0
    for name in testNames:
        actual = digitOf(name)
        sample = img2vector('LearningSpark/Ch02/digits/testDigits/%s' % name)
        guess = classify0(sample, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (guess, actual))
        if guess != actual:
            mistakes += 1.0
    print("\nthe total number of errors is: %d" % mistakes)
    print("\nthe total error rate is: %f" % (mistakes / float(len(testNames))))

測試結果：

3 決策樹

3.1 決策樹構造

首先我們需要理解信息熵，條件信息熵，信息增益。
信息熵是代表隨機變量的複雜度（不確定性）；
條件熵代表在某一個條件下，隨機變量的複雜度（不確定度）；
信息增益：信息熵-條件熵。也就是說，信息增益代表了在一個條件下，信息複雜度（不確定性）減少程度。
參考文檔–知乎專欄

3.1.1信息增益

在劃分數據集之前與之後信息發生的變化稱爲信息增益。
計算給定數據集的香農熵。

# 香農熵
def calcShannonEnt(dataset):
    """Compute the base-2 Shannon entropy of the class labels in ``dataset``.

    Args:
        dataset: Sequence of records; the last element of each record is
            taken as its class label.

    Returns:
        float: The entropy of the label distribution; ``0.0`` when
        ``dataset`` is empty.
    """
    total = len(dataset)
    # Tally how many records carry each class label.
    tally = {}
    for record in dataset:
        label = record[-1]
        tally[label] = tally.get(label, 0) + 1
    # H = -sum(p * log2(p)) over the observed labels.
    entropy = 0.0
    for count in tally.values():
        prob = float(count / total)
        entropy -= prob * log(prob, 2)
    return entropy

創建自己的數據集

# 創建自己的數據集
def createDataSet():
    """Return the five-record toy data set used by the decision-tree examples.

    Returns:
        tuple: ``(dataSet, labels)`` where each record in ``dataSet`` is
        ``[feature0, feature1, class]`` and ``labels`` names the two
        feature columns.
    """
    labels = ['no surfacing', 'flippers']
    rows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    # Hand back fresh lists so callers may modify the records freely.
    dataSet = [list(row) for row in rows]
    return dataSet, labels

測試結果：

myDat, labels = treeCopy.createDataSet()
myDat
[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
treeCopy.calcShannonEnt(myDat)
0.9709505944546686
myDat[0][-1] = 'maybe'
myDat
[[1, 1, 'maybe'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
treeCopy.calcShannonEnt(myDat)
1.3709505944546687

可以看到分類越多，信息熵越大；因此以信息增益作爲劃分標準時，天生偏向選擇取值（分支）多的屬性。

3.1.2 劃分數據集

按照給定的特徵劃分數據集

def splitDataSet(dataSet, axis, value):
    """Select the records whose feature ``axis`` equals ``value``.

    The matching records are returned with that feature column removed;
    the input records themselves are not modified.

    :param dataSet: the data set to split, a list of list records
    :param axis: index of the feature to split on
    :param value: the feature value a record must have to be kept
    :return: a new list of the matching records without column ``axis``
    """
    return [
        record[:axis] + record[axis + 1:]
        for record in dataSet
        if record[axis] == value
    ]

測試結果：

選擇最好的數據集劃分方式

#選擇組好的數據集劃分方式
def chooseBestReatureToSplit(dataSet):
    """Pick the feature whose split yields the largest information gain.

    For every feature column the conditional entropy after splitting on
    that column is computed and subtracted from the entropy of the whole
    data set. The base entropy, the running conditional entropy and each
    gain are printed along the way.

    Args:
        dataSet: List of records whose last element is the class label.

    Returns:
        int: Index of the best feature, or ``-1`` when no feature gives a
        gain greater than zero.
    """
    featureCount = len(dataSet[0]) - 1  # the last column is the class label
    baseEntroy = calcShannonEnt(dataSet)
    print('baseEntroy:', baseEntroy)
    bestFeature = -1
    bestInfoGain = 0.0
    totalRows = float(len(dataSet))
    for column in range(featureCount):
        distinctValues = set([record[column] for record in dataSet])
        conditionalEntropy = 0.0
        for value in distinctValues:
            subset = splitDataSet(dataSet, column, value)
            weight = len(subset) / totalRows
            # Accumulate the conditional entropy, weighted by subset size.
            conditionalEntropy += weight * calcShannonEnt(subset)
            print("newEntroy:", conditionalEntropy)
        infoGain = baseEntroy - conditionalEntropy
        print('infoGain:', infoGain)
        if infoGain > bestInfoGain:
            bestInfoGain, bestFeature = infoGain, column
    return bestFeature

這裏我們將 baseEntroy, newEntroy, infoGain打印出來，便於查看測試結果：

3.1.3 遞歸構建決策樹

遞歸調用非常複雜，要想明白並不容易。
之前看過一本外國人寫的 Python書籍，提到遞歸這一塊，給的建議是：相信你寫的是正確的，那就是正確的。

# 創建樹
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: List of records whose last element is the class label.
        labels: Names of the feature columns, in column order. The list is
            not modified.

    Returns:
        A class label when the node is a leaf; otherwise a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # Every record has the same class: this is a pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column remains: no features left to split on.
    # majorityCnt is defined elsewhere; presumably it returns the most
    # common class in the list -- verify against its definition.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestReatureToSplit(dataSet)
    # The chooser returns -1 when no feature has a positive gain. Using -1
    # as a column index would split on the class column itself, so stop
    # here with a leaf instead.
    if bestFeat < 0:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Work on a copy so the caller's label list stays intact and can still
    # be used afterwards (for example to classify with the finished tree).
    remainingLabels = labels[:]
    del remainingLabels[bestFeat]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        # Each branch gets its own copy of the label list.
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), remainingLabels[:])
    return myTree

3.3 測試和存儲分類器

3.3.1 測試算法：使用決策樹執行分類

構建決策樹分類函數

# 使用決策樹的分類函數
def classify(inputTree, featLabels, testVec):
    """Classify ``testVec`` by walking a nested-dict decision tree.

    Args:
        inputTree: Tree of the form
            ``{feature_name: {feature_value: subtree_or_label, ...}}``.
        featLabels: Feature names in the column order used by ``testVec``.
        testVec: Feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that ``testVec`` reaches.

    Raises:
        ValueError: If the tree has no branch for the sample's value of the
            feature being tested, or the feature name is not in
            ``featLabels``.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    for key, branch in secondDict.items():
        if featValue == key:
            # A dict is an inner node: keep descending. Anything else is
            # the class label of a leaf.
            if isinstance(branch, dict):
                return classify(branch, featLabels, testVec)
            return branch
    # Without this, an unseen feature value fell through the loop and
    # failed with an unrelated UnboundLocalError.
    raise ValueError("no branch for %r = %r in the decision tree"
                     % (firstStr, featValue))

測試結果：

myDat, labels = treeCopy.createDataSet()
labels
['no surfacing', 'flippers']
myTree = treePlotterCopy.retrieveTree(0)
myTree
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
treeCopy.classify(myTree, labels, [1, 0])
'no'
treeCopy.classify(myTree, labels, [1, 1])
'yes'

3.3.2 決策樹存儲

# 使用 pickle 模塊存儲決策樹
def storeTree(inputTree, filename):
    """Serialize a decision tree to ``filename`` with pickle (protocol 0).

    Args:
        inputTree: The tree (or any picklable object) to store.
        filename: Path of the file to write; existing content is replaced.
    """
    import pickle
    # Binary mode is required because pickle.dump writes bytes. The context
    # manager closes the file even if pickling fails.
    with open(filename, 'wb+') as fw:
        pickle.dump(inputTree, fw, 0)
def grabTree(filename):
    """Load a pickled decision tree from ``filename``.

    Args:
        filename: Path of a file containing a pickled object.

    Returns:
        The object stored in the file.
    """
    import pickle
    # NOTE(security): unpickling can execute arbitrary code; only load
    # files that come from a trusted source.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

**注意：**按書中的方式寫入文件會報錯：在 Python 3 中 pickle.dump 寫入的是字節數據，文件必須以二進制模式打開，因此我們這裏指定格式爲：fw = open(filename, 'wb+')

3.4 示例：使用決策樹預測隱形眼鏡類型

    fr = open('LearningSpark/Ch03/lenses.txt')
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    len(lenses)
    24
    lenses
    [['young', 'myope', 'no', 'reduced', 'no lenses'], ['young', 'myope', 'no', 'normal', 'soft'], ['young', 'myope', 'yes', 'reduced', 'no lenses'], ['young', 'myope', 'yes', 'normal', 'hard'], ['young', 'hyper', 'no', 'reduced', 'no lenses'], ['young', 'hyper', 'no', 'normal', 'soft'], ['young', 'hyper', 'yes', 'reduced', 'no lenses'], ['young', 'hyper', 'yes', 'normal', 'hard'], ['pre', 'myope', 'no', 'reduced', 'no lenses'], ['pre', 'myope', 'no', 'normal', 'soft'], ['pre', 'myope', 'yes', 'reduced', 'no lenses'], ['pre', 'myope', 'yes', 'normal', 'hard'], ['pre', 'hyper', 'no', 'reduced', 'no lenses'], ['pre', 'hyper', 'no', 'normal', 'soft'], ['pre', 'hyper', 'yes', 'reduced', 'no lenses'], ['pre', 'hyper', 'yes', 'normal', 'no lenses'], ['presbyopic', 'myope', 'no', 'reduced', 'no lenses'], ['presbyopic', 'myope', 'no', 'normal', 'no lenses'], ['presbyopic', 'myope', 'yes', 'reduced', 'no lenses'], ['presbyopic', 'myope', 'yes', 'normal', 'hard'], ['presbyopic', 'hyper', 'no', 'reduced', 'no lenses'], ['presbyopic', 'hyper', 'no', 'normal', 'soft'], ['presbyopic', 'hyper', 'yes', 'reduced', 'no lenses'], ['presbyopic', 'hyper', 'yes', 'normal', 'no lenses']]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = treeCopy.createTree(lenses, lensesLabels)
    lensesTree
    {'tearRate': {'normal': {'astigmatic': {'no': {'age': {'pre': 'soft', 'presbyopic': {'prescript': {'myope': 'no lenses', 'hyper': 'soft'}}, 'young': 'soft'}}, 'yes': {'prescript': {'myope': 'hard', 'hyper': {'age': {'pre': 'no lenses', 'presbyopic': 'no lenses', 'young': 'hard'}}}}}}, 'reduced': 'no lenses'}}
    treePlotterCopy.createPlot(lensesTree)

測試結果：

3 樸素貝葉斯 & 4 Logistic迴歸

機器學習實戰–樸素貝葉斯 & 4 Logistic迴歸

機器學習實戰筆記

文章目錄

2 k近鄰算法

2.1 實施kNN算法

代碼清單1：

測試，結果：

2.2使用kNN改進約會網站的配對效果

2.2.1 準備數據，從文本中解析數據

2.2.2 分析數據：使用Matplotlib創建散點圖

2.2.3 準備數據：歸一化數值

測試算法：作爲完整程序驗證

2.3使用算法：構建完整可用系統

2.3.1 準備數據：將圖像轉換爲測試向量

2.3.2 測試算法：使用kNN近鄰算法識別手寫數字

3 決策樹

3.1 決策樹構造

3.1.1信息增益

3.1.2 劃分數據集

3.1.3 遞歸構建決策樹

3.3 測試和存儲分類器

3.3.1 測試算法：使用決策樹執行分類

3.3.2 決策樹存儲

3.4 示例：使用決策樹預測隱形眼鏡類型

3 樸素貝葉斯 & 4 Logistic迴歸

Caused by: java.lang.ClassNotFoundException: org.apache.avro.generic.GenericRecord

虛擬化技術的優點和缺點

Cloudera Manager_java.lang.ClassNotFoundException: com.mysql.jdbc.Driver

劍指offer(Python3實現)

劍指offer 2

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

機器學習實戰筆記

文章目錄

2 k近鄰算法

2.1 實施kNN算法

代碼清單1：

測試，結果：

2.2使用kNN改進約會網站的配對效果

2.2.1 準備數據，從文本中解析數據

2.2.2 分析數據： 使用Matplotlib創建散點圖

2.2.3 準備數據：歸一化數值

測試算法：作爲完整程序驗證

2.3使用算法：構建完整可用系統

2.3.1 準備數據：將圖像轉換爲測試向量

2.3.2 測試算法：使用kNN近鄰算法識別手寫數字

3 決策樹

3.1 決策樹構造

3.1.1信息增益

3.1.2 劃分數據集

3.1.3 遞歸構建決策樹

3.3 測試和存儲分類器

3.3.1 測試算法：使用決策樹執行分類

3.3.2 決策樹存儲

3.4 示例：使用決策樹預測隱形眼鏡類型

3 樸素貝葉斯 & 4 Logistic迴歸

2.2.2 分析數據：使用Matplotlib創建散點圖