決策樹裁剪(剪枝)有兩種方式:預裁剪和後裁剪。預裁剪是在決策樹生成過程中,對每個結點在劃分前先進行估計:如果劃分能帶來泛化性能的提升則劃分,否則不劃分,並將當前結點標記爲葉結點。後裁剪是先生成一棵完整的決策樹,然後自底向上對非葉結點進行考察:如果將該結點對應的子樹替換爲葉結點能帶來泛化性能的提升,則將該子樹合併爲葉結點;其訓練時間開銷比預裁剪決策樹要大得多。
訓練數據:
1,青綠,蜷縮,濁響,清晰,凹陷,硬滑,是
2,烏黑,蜷縮,沉悶,清晰,凹陷,硬滑,是
3,烏黑,蜷縮,濁響,清晰,凹陷,硬滑,是
6,青綠,稍蜷,濁響,清晰,稍凹,軟粘,是
7,烏黑,稍蜷,濁響,稍糊,稍凹,軟粘,是
10,青綠,硬挺,清脆,清晰,平坦,軟粘,否
14,淺白,稍蜷,沉悶,稍糊,凹陷,硬滑,否
15,烏黑,稍蜷,濁響,清晰,稍凹,軟粘,否
16,淺白,蜷縮,濁響,模糊,平坦,硬滑,否
17,青綠,蜷縮,沉悶,稍糊,稍凹,硬滑,否
驗證數據(裁剪時用來評估泛化性能,對應代碼中的 ValidateData.txt):
4,青綠,蜷縮,沉悶,清晰,凹陷,硬滑,是
5,淺白,蜷縮,濁響,清晰,凹陷,硬滑,是
8,烏黑,稍蜷,濁響,清晰,稍凹,硬滑,是
9,烏黑,稍蜷,沉悶,稍糊,稍凹,硬滑,否
11,淺白,硬挺,清脆,模糊,平坦,硬滑,否
12,淺白,蜷縮,濁響,模糊,平坦,軟粘,否
13,青綠,稍蜷,濁響,稍糊,凹陷,硬滑,否
預裁剪代碼:
from collections import Counter
from math import log
import operator


def createDataSet(filename):
    """Load a comma-separated sample file.

    Every non-blank line is split on ',' and kept whole, so each row still
    contains the leading ID column and the trailing class column.

    Returns:
        (dataSet, labels): the list of rows and the fixed list of column
        names (one name per column, including the ID and class columns).
    """
    dataSet = []
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # A blank line would otherwise become a bogus one-field row.
                continue
            dataSet.append(line.split(','))
    labels = ['編號','色澤','根蒂','敲聲','紋理','頭部','觸感','好瓜']
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Return the Shannon entropy Ent(D) = -sum(p * log2(p)) of the class
    column (the last element of every row). An empty data set yields 0.0."""
    numEntries = len(dataSet)
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def splitSubDataSet(dataSet, axis, value):
    """Return [feature value, class] pairs for the rows whose column `axis`
    equals `value`."""
    return [[featVec[axis], featVec[-1]]
            for featVec in dataSet if featVec[axis] == value]


def splitDataSet(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that column
    removed (new lists; the input rows are not modified)."""
    return [featVec[:axis] + featVec[axis + 1:]
            for featVec in dataSet if featVec[axis] == value]


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the column with the highest information gain.

    Column 0 (the sample ID) and the last column (the class) are never
    candidates. Returns None when there is no candidate column or when no
    candidate has a strictly positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # every column except the class
    baseEntropy = calcShannonEnt(dataSet)
    total = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = None
    for i in range(1, numFeatures):  # start at 1: column 0 is the ID
        newEntropy = 0.0
        for value in set(example[i] for example in dataSet):
            subDataSet = splitSubDataSet(dataSet, i, value)
            newEntropy += len(subDataSet) / total * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class in `classList`.

    Ties go to the class that appears first. (Calling max() on the labels
    would return the lexicographically largest label, not the majority.)
    """
    return Counter(classList).most_common(1)[0][0]


def createTree(dataSet, labels, validateData):
    """Build a pre-pruned decision tree.

    Args:
        dataSet: training rows laid out as [id, feature..., class].
        labels: column names matching the rows of `dataSet`; not modified.
        validateData: validation rows with the same layout, used to decide
            whether a split is kept.

    Returns:
        A class label (leaf) or a nested dict
        {feature name: {feature value: subtree}}.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all samples share one class
    if len(dataSet[0]) <= 2:
        # Only the ID and class columns remain: no feature left to split on.
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat is None:
        # No feature separates the classes any further.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Sorted so that the branch order of the result is reproducible.
    uniqueVals = sorted(set(example[bestFeat] for example in dataSet))

    # Pre-pruning: keep the split unless it lowers validation accuracy.
    beforeCorrect = undivideCorrect(classList, validateData)
    afterCorrect = divideCorrect(dataSet, uniqueVals, bestFeat, validateData)
    if beforeCorrect > afterCorrect:
        return majorityCnt(classList)

    myTree = {bestFeatLabel: {}}
    # Build the reduced label list without touching the caller's list, so
    # the full list stays usable afterwards (e.g. for classify).
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value),
            subLabels[:],
            splitDataSet(validateData, bestFeat, value))
    return myTree


def undivideCorrect(classList, validateData):
    """Validation accuracy when the node is a leaf predicting the majority
    class of `classList`. Returns 0.0 when there are no validation rows."""
    if not validateData:
        return 0.0
    leafClass = majorityCnt(classList)
    good = sum(1 for row in validateData if row[-1] == leafClass)
    return good / len(validateData)


def divideCorrect(dataSet, uniqueVals, bestFeat, validateData):
    """Validation accuracy when the node is split on column `bestFeat` and
    every branch predicts the majority class of its training subset.

    Validation rows whose feature value is not in `uniqueVals` count as
    misclassified. Returns 0.0 when there are no validation rows.
    """
    if not validateData:
        return 0.0
    good = 0
    for value in uniqueVals:
        branchClasses = [row[-1] for row in dataSet if row[bestFeat] == value]
        if not branchClasses:
            continue  # no training sample on this branch, nothing to predict
        branchClass = majorityCnt(branchClasses)
        good += sum(1 for row in validateData
                    if row[bestFeat] == value and row[-1] == branchClass)
    return good / len(validateData)


def classify(inputTree, featLabels, testVec):
    """Classify `testVec` with a tree produced by createTree.

    `featLabels` must be the full column-name list that matches `testVec`.
    Raises KeyError when the tree has no branch for one of the sample's
    feature values.
    """
    node = inputTree
    while isinstance(node, dict):
        firstStr = list(node.keys())[0]  # feature tested at this node
        featIndex = featLabels.index(firstStr)
        node = node[firstStr][testVec[featIndex]]
    return node


if __name__ == '__main__':
    # Imported here so the tree functions above can be imported and tested
    # without the plotting helper being installed.
    import treePlotter as tp

    myData, label = createDataSet('TrainingData.txt')
    validateData, vlabel = createDataSet('ValidateData.txt')
    mytree = createTree(myData, label, validateData)
    if isinstance(mytree, dict):
        tp.createPlot(mytree)
    else:
        # The root itself was pruned to a leaf.
        # NOTE(review): assumes createPlot expects a dict tree - confirm.
        print(mytree)
未裁剪與預裁剪結果對比:
後裁剪代碼:
from collections import Counter
from math import log
import operator


def createDataSet(filename):
    """Load a comma-separated sample file.

    Every non-blank line is split on ',' and kept whole, so each row still
    contains the leading ID column and the trailing class column.

    Returns:
        (dataSet, labels): the list of rows and the fixed list of column
        names (one name per column, including the ID and class columns).
    """
    dataSet = []
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # A blank line would otherwise become a bogus one-field row.
                continue
            dataSet.append(line.split(','))
    labels = ['編號','色澤','根蒂','敲聲','紋理','頭部','觸感','好瓜']
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Return the Shannon entropy Ent(D) = -sum(p * log2(p)) of the class
    column (the last element of every row). An empty data set yields 0.0."""
    numEntries = len(dataSet)
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def splitSubDataSet(dataSet, axis, value):
    """Return [feature value, class] pairs for the rows whose column `axis`
    equals `value`."""
    return [[featVec[axis], featVec[-1]]
            for featVec in dataSet if featVec[axis] == value]


def splitDataSet(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that column
    removed (new lists; the input rows are not modified)."""
    return [featVec[:axis] + featVec[axis + 1:]
            for featVec in dataSet if featVec[axis] == value]


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the column with the highest information gain.

    Column 0 (the sample ID) and the last column (the class) are never
    candidates. Returns None when there is no candidate column or when no
    candidate has a strictly positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # every column except the class
    baseEntropy = calcShannonEnt(dataSet)
    total = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = None
    for i in range(1, numFeatures):  # start at 1: column 0 is the ID
        newEntropy = 0.0
        for value in set(example[i] for example in dataSet):
            subDataSet = splitSubDataSet(dataSet, i, value)
            newEntropy += len(subDataSet) / total * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class in `classList`.

    Ties go to the class that appears first. (Calling max() on the labels
    would return the lexicographically largest label, not the majority.)
    """
    return Counter(classList).most_common(1)[0][0]


def createTree(dataSet, labels):
    """Build an unpruned decision tree.

    Args:
        dataSet: training rows laid out as [id, feature..., class].
        labels: column names matching the rows of `dataSet`; not modified.

    Returns:
        A class label (leaf) or a nested dict
        {feature name: {feature value: subtree}}.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all samples share one class
    if len(dataSet[0]) <= 2:
        # Only the ID and class columns remain: no feature left to split on.
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat is None:
        # No feature separates the classes any further.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Build the reduced label list without touching the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    # Sorted so that the branch order of the result is reproducible.
    for value in sorted(set(example[bestFeat] for example in dataSet)):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels[:])
    return myTree


def postPruning(inputTree, dataSet, validateData, label):
    """Post-prune `inputTree` bottom-up using validation accuracy.

    Args:
        inputTree: tree built by createTree; kept subtrees are updated in
            place.
        dataSet: the training rows that reach this node.
        validateData: the validation rows that reach this node.
        label: column names matching the rows at this node; not modified.

    Returns:
        The pruned tree: `inputTree` itself, or a class label when the node
        is collapsed into a leaf. Callers must use the return value, because
        the root itself may be replaced by a leaf.
    """
    if not isinstance(inputTree, dict):
        return inputTree  # already a leaf, nothing to prune
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    classList = [example[-1] for example in dataSet]
    labelIndex = label.index(firstStr)
    subLabels = label[:labelIndex] + label[labelIndex + 1:]

    # Prune the children first so the decision for this node sees them in
    # their final form.
    for key in list(secondDict):
        if isinstance(secondDict[key], dict):
            secondDict[key] = postPruning(
                secondDict[key],
                splitDataSet(dataSet, labelIndex, key),
                splitDataSet(validateData, labelIndex, key),
                subLabels[:])

    beforeCorrect = undivideCorrect(classList, validateData)
    # NOTE(review): divideCorrect scores every branch by the majority class
    # of its training subset, so for a branch that is still a subtree this
    # is only an approximation of the subtree's real validation accuracy.
    afterCorrect = divideCorrect(dataSet, secondDict.keys(), labelIndex,
                                 validateData)
    if beforeCorrect > afterCorrect:
        return majorityCnt(classList)
    return inputTree


def undivideCorrect(classList, validateData):
    """Validation accuracy when the node is a leaf predicting the majority
    class of `classList`. Returns 0.0 when there are no validation rows."""
    if not validateData:
        return 0.0
    leafClass = majorityCnt(classList)
    good = sum(1 for row in validateData if row[-1] == leafClass)
    return good / len(validateData)


def divideCorrect(dataSet, uniqueVals, bestFeat, validateData):
    """Validation accuracy when the node is split on column `bestFeat` and
    every branch predicts the majority class of its training subset.

    Validation rows whose feature value is not in `uniqueVals` count as
    misclassified. Returns 0.0 when there are no validation rows.
    """
    if not validateData:
        return 0.0
    good = 0
    for value in uniqueVals:
        branchClasses = [row[-1] for row in dataSet if row[bestFeat] == value]
        if not branchClasses:
            continue  # no training sample on this branch, nothing to predict
        branchClass = majorityCnt(branchClasses)
        good += sum(1 for row in validateData
                    if row[bestFeat] == value and row[-1] == branchClass)
    return good / len(validateData)


if __name__ == '__main__':
    # Imported here so the tree functions above can be imported and tested
    # without the plotting helper being installed.
    import treePlotter as tp

    myData, label = createDataSet('TrainingData.txt')
    validateData, vlabel = createDataSet('ValidateData.txt')
    tmplabel = label.copy()
    mytree = createTree(myData, tmplabel)
    # Keep the return value: the root may have been collapsed into a leaf.
    mytree = postPruning(mytree, myData, validateData, label)
    if isinstance(mytree, dict):
        tp.createPlot(mytree)
    else:
        # NOTE(review): assumes createPlot expects a dict tree - confirm.
        print(mytree)
結果: