Machine Learning - Decision Tree Pruning (Part 2)

Decision tree pruning comes in two flavors: pre-pruning and post-pruning. Pre-pruning tests each candidate split while the tree is being grown: the split is performed only if it improves generalization performance, measured here as accuracy on a validation set; otherwise the node becomes a leaf. Post-pruning first grows the complete tree, then examines its nodes bottom-up and replaces a subtree with a leaf whenever doing so improves validation accuracy. Because the full tree must be built before pruning starts, post-pruning takes considerably longer to train than pre-pruning.
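
Schematically, the two strategies apply the same validation-accuracy test, just at different times (a sketch only; the concrete implementations follow below):

# Pre-pruning: while growing the tree, at each candidate node
#     if accuracy_as_leaf > accuracy_after_split:
#         stop here and emit a majority-vote leaf
#
# Post-pruning: after growing the full tree, bottom-up at each node
#     if accuracy_as_leaf > accuracy_of_subtree:
#         replace the subtree with a majority-vote leaf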

Training data (columns: 編號, 色澤, 根蒂, 敲聲, 紋理, 臍部, 觸感, 好瓜):

1,青綠,蜷縮,濁響,清晰,凹陷,硬滑,是
2,烏黑,蜷縮,沉悶,清晰,凹陷,硬滑,是
3,烏黑,蜷縮,濁響,清晰,凹陷,硬滑,是
6,青綠,稍蜷,濁響,清晰,稍凹,軟粘,是
7,烏黑,稍蜷,濁響,稍糊,稍凹,軟粘,是
10,青綠,硬挺,清脆,清晰,平坦,軟粘,否
14,淺白,稍蜷,沉悶,稍糊,凹陷,硬滑,否
15,烏黑,稍蜷,濁響,清晰,稍凹,軟粘,否
16,淺白,蜷縮,濁響,模糊,平坦,硬滑,否
17,青綠,蜷縮,沉悶,稍糊,稍凹,硬滑,否

Validation data (same columns; used by the pruning decisions):

4,青綠,蜷縮,沉悶,清晰,凹陷,硬滑,是
5,淺白,蜷縮,濁響,清晰,凹陷,硬滑,是
8,烏黑,稍蜷,濁響,清晰,稍凹,硬滑,是
9,烏黑,稍蜷,沉悶,稍糊,稍凹,硬滑,否
11,淺白,硬挺,清脆,模糊,平坦,硬滑,否
12,淺白,蜷縮,濁響,模糊,平坦,軟粘,否
13,青綠,稍蜷,濁響,稍糊,凹陷,硬滑,否

Pre-pruning code:

from math import log
import treePlotter as tp   # helper module that draws the finished tree

def createDataSet(filename):
    dataSet = []
    with open(filename) as fr:
        for line in fr.readlines():
            lineArr = line.strip().split(',')
            dataSet.append(lineArr[:])      # one sample per line
    labels = ['編號','色澤','根蒂','敲聲','紋理','臍部','觸感','好瓜']
    return dataSet, labels

# Information entropy: Ent(D) = -Σ p_k * log2(p_k)
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)                    # total number of samples
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]               # class label is the last column
        if currentLabel not in labelCounts:      # unseen class: start its count at 0
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1           # existing class: count += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)        # accumulate -p * log2(p)
    return shannonEnt
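
As a quick sanity check: the training set above contains 5 positive (是) and 5 negative (否) samples, so Ent(D) = -(0.5*log2(0.5) + 0.5*log2(0.5)) is exactly 1 bit. Assuming the data file matches the table above:

myData, labels = createDataSet('TrainingData.txt')
print(calcShannonEnt(myData))   # -> 1.0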

# Collect [value, class] pairs for all rows matching a feature value
# dataSet -- the whole data set
# axis    -- column index of the feature
# value   -- the feature value to match
def splitSubDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            retDataSet.append([featVec[axis], featVec[-1]])
    return retDataSet
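
Continuing with myData loaded above: six training samples have 紋理 (column index 4) equal to 清晰, four of them good melons and two not:

print(splitSubDataSet(myData, 4, '清晰'))
# -> [['清晰', '是'], ['清晰', '是'], ['清晰', '是'],
#     ['清晰', '是'], ['清晰', '否'], ['清晰', '否']]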

# Rows matching a feature value, with the consumed column removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]           # columns before axis
            reducedFeatVec.extend(featVec[axis+1:])   # columns after axis
            retDataSet.append(reducedFeatVec)
    return retDataSet
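
Unlike splitSubDataSet, this keeps the full rows (minus the consumed column), which is what the recursive tree construction works on. Only sample 16 has 紋理 = 模糊:

print(splitDataSet(myData, 4, '模糊'))
# -> [['16', '淺白', '蜷縮', '濁響', '平坦', '硬滑', '否']]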

# Split-point selection for continuous attributes would go here;
# every attribute in this data set is discrete, so it is omitted.
# def calcconplot(subDataSet)

# Compute the information gain of every feature and return the best column
# Gain(D, a) = Ent(D) - Σ_v |D^v|/|D| * Ent(D^v)
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1       # minus 1: the last column is the class
    baseEntropy = calcShannonEnt(dataSet)   # Ent(D)
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(1, numFeatures):         # start at 1 to skip the 編號 (ID) column
        featList = [example[i] for example in dataSet]   # this feature's values
        uniqueVals = set(featList)          # its distinct (discrete) values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitSubDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)   # Σ |D^v|/|D| * Ent(D^v)
        infoGain = baseEntropy - newEntropy # information gain of feature i
        if infoGain > bestInfoGain:         # keep the column with the highest gain
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature                      # column index of the best feature
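
On the training split above, 色澤 and 臍部 tie for the highest information gain (both ≈ 0.276 bits). Since the loop only replaces the current best on a strictly greater gain, the earlier column wins:

print(chooseBestFeatureToSplit(myData))   # -> 1 (色澤; 臍部 at index 5 has the same gain)

(Zhou Zhihua's textbook example, which uses this same train/validation split, breaks the tie in favour of 臍部, so the tree built here can differ from the book's.)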

# If the features are exhausted but the node's samples still disagree,
# fall back to a majority vote over the class labels
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    return max(classCount, key=classCount.get)   # the key with the highest count
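
Note that the vote is by count, not by key order; a bare max(classCount) would compare the class strings themselves and always prefer 是 over 否:

print(majorityCnt(['否', '否', '是']))   # -> '否' (two votes to one)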

# Build the decision tree with pre-pruning
def createTree(dataSet, labels, validateData):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]                 # all samples share one class: leaf
    if len(dataSet[0]) == 1:                # no features left: majority-vote leaf
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)   # feature with the highest gain
    bestFeatLabel = labels[bestFeat]               # its column name
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)                   # its distinct values
    # pre-pruning: compare validation accuracy with and without the split
    beforeCorrect = undivideCorrect(classList, validateData)
    afterCorrect = divideCorrect(dataSet, uniqueVals, bestFeat, validateData)
    if beforeCorrect > afterCorrect:
        return majorityCnt(classList)              # splitting does not help: leaf
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])                          # this feature is now consumed
    for value in uniqueVals:
        subLabels = labels[:]                      # copy so sibling branches are unaffected
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),
                                                  subLabels,
                                                  splitDataSet(validateData, bestFeat, value))
    return myTree

# Validation accuracy if the node is NOT split (predict its majority class)
def undivideCorrect(classList, validateData):
    if len(validateData) == 0:
        return 0.0                               # no validation samples reach this node
    good = splitSubDataSet(validateData, len(validateData[0]) - 1, majorityCnt(classList))
    return len(good) / len(validateData)         # fraction classified correctly

# Validation accuracy if the node IS split on feature bestFeat
def divideCorrect(dataSet, uniqueVals, bestFeat, validateData):
    if len(validateData) == 0:
        return 0.0
    good = 0
    for value in uniqueVals:                     # each branch predicts its training majority
        featList = [feat[-1] for feat in splitDataSet(dataSet, bestFeat, value)]
        templList = splitSubDataSet(validateData, bestFeat, value)   # validation rows in this branch
        if len(templList) > 0:
            goodList = splitSubDataSet(templList, len(templList[0]) - 1, majorityCnt(featList))
            good += len(goodList)                # count the correctly classified rows
    return good / len(validateData)
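
At the root of the training set these two functions give 3/7 against 4/7, so pre-pruning keeps the split: treating the root as a leaf predicts 是 (the 5-to-5 tie is broken by insertion order) and classifies 3 of the 7 validation samples correctly, while splitting on 色澤 classifies 4 of 7 correctly. Assuming the data files match the tables above:

myData, labels = createDataSet('TrainingData.txt')
validateData, vlabel = createDataSet('ValidateData.txt')
classList = [row[-1] for row in myData]
print(undivideCorrect(classList, validateData))                        # -> 3/7 ≈ 0.429
print(divideCorrect(myData, {'青綠', '烏黑', '淺白'}, 1, validateData))   # -> 4/7 ≈ 0.571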

# Classify one sample by walking the tree
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]    # feature tested at this node
    secondDict = inputTree[firstStr]        # its branches
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]                # the sample's value for that feature
    valueOfFeat = secondDict[key]           # follow the branch (KeyError for unseen values)
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)   # internal node: recurse
    else:
        classLabel = valueOfFeat            # leaf: a class label
    return classLabel
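
For example, to classify validation sample 4 with the finished tree (classify needs an unmutated label list, e.g. vlabel from the __main__ block below, since createTree deletes entries from the list it is given):

print(classify(mytree, vlabel, ['4', '青綠', '蜷縮', '沉悶', '清晰', '凹陷', '硬滑']))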

if __name__ == '__main__':
    myData,label = createDataSet('TrainingData.txt')
    validateData,vlabel = createDataSet('ValidateData.txt')
    mytree = createTree(myData,label,validateData)
    tp.createPlot(mytree)

Comparison of the unpruned and pre-pruned results:

Post-pruning code:

from math import log
import treePlotter as tp   # helper module that draws the finished tree

# createDataSet, calcShannonEnt, splitSubDataSet, splitDataSet,
# chooseBestFeatureToSplit and majorityCnt are identical to the
# pre-pruning listing above and are omitted here.

# Build the full (unpruned) decision tree
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]                 # all samples share one class: leaf
    if len(dataSet[0]) == 1:                # no features left: majority-vote leaf
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)   # feature with the highest gain
    bestFeatLabel = labels[bestFeat]               # its column name
    myTree = {bestFeatLabel: {}}
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)                   # its distinct values
    del(labels[bestFeat])                          # this feature is now consumed
    for value in uniqueVals:
        subLabels = labels[:]                      # copy so sibling branches are unaffected
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# Post-pruning: recurse into the subtrees first, then decide bottom-up
# whether replacing each subtree with a majority-vote leaf improves
# validation accuracy
def postPruning(inputTree, dataSet, validateData, label):
    firstStr = list(inputTree.keys())[0]    # feature tested at this node
    secondDict = inputTree[firstStr]        # its branches
    classList = [example[-1] for example in dataSet]
    labelIndex = label.index(firstStr)
    del(label[labelIndex])                  # this feature is consumed at this level
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):   # prune the children before this node
            inputTree[firstStr][key] = postPruning(secondDict[key],
                                                   splitDataSet(dataSet, labelIndex, key),
                                                   splitDataSet(validateData, labelIndex, key),
                                                   label.copy())
    # compare validation accuracy of this node as a leaf vs. as a split
    beforeCorrect = undivideCorrect(classList, validateData)
    afterCorrect = divideCorrect(dataSet, secondDict.keys(), labelIndex, validateData)
    if beforeCorrect > afterCorrect:
        return majorityCnt(classList)       # prune: collapse the subtree to a leaf
    return inputTree

# undivideCorrect and divideCorrect are identical to the pre-pruning
# listing above and are omitted here.

if __name__ == '__main__':
    myData, label = createDataSet('TrainingData.txt')
    validateData, vlabel = createDataSet('ValidateData.txt')
    tmplabel = label.copy()
    mytree = createTree(myData, tmplabel)   # grow the full, unpruned tree first
    # re-assign the result: if the root itself is pruned, postPruning
    # returns a plain class label instead of mutating the dict in place
    mytree = postPruning(mytree, myData, validateData, label)
    tp.createPlot(mytree)

Result:
