# From the book "Machine Learning in Action" (機器學習實戰)
#!/usr/bin/python
# -*- coding: utf-8 -*-
from math import log
import operator
def createDataSet():
    """Return the toy 'fish' dataset and its feature names.

    Returns:
        (dataSet, labels): dataSet is a list of rows whose last column is the
        class label; labels names the two feature columns.
    """
    samples = [
        [1, 1, "yes"],
        [1, 1, "yes"],
        [1, 0, "No"],
        [0, 1, "No"],
        [0, 1, "No"],
    ]
    featureNames = ['no surfacing', 'flippers']
    return samples, featureNames
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class-label column of dataSet.

    Args:
        dataSet: list of rows; the last element of each row is the class label.
    Returns:
        Base-2 Shannon entropy of the label distribution.
    """
    total = len(dataSet)
    tallies = {}
    for row in dataSet:
        label = row[-1]                       # class label lives in the last column
        tallies[label] = tallies.get(label, 0) + 1
    entropy = 0
    for count in tallies.values():
        p = float(count) / total
        entropy -= p * log(p, 2)              # H = -sum(p * log2 p)
    return entropy
def splitDataSet(dataSet, axis, value):
    """Select the rows where feature *axis* equals *value*, dropping that column.

    Args:
        dataSet: list of rows (discrete features).
        axis: index of the feature column to split on.
        value: feature value that selects a row.
    Returns:
        New list of rows matching *value*, each with column *axis* removed.
    """
    # Concatenating the slices around `axis` removes that column without
    # mutating the original rows.
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index whose split yields the highest information gain.

    Args:
        dataSet: list of rows; the last column is the class label.
    Returns:
        Index of the best feature, or -1 when no split improves on the
        base entropy.
    """
    featureCount = len(dataSet[0]) - 1            # last column is the label
    baseEntropy = calcShannonEnt(dataSet)
    bestGain = 0
    bestIndex = -1
    for col in range(featureCount):
        distinctValues = {row[col] for row in dataSet}
        # Entropy of this split, weighted by each subset's share of the rows.
        weightedEntropy = 0
        for val in distinctValues:
            subset = splitDataSet(dataSet, col, val)
            share = len(subset) / float(len(dataSet))
            weightedEntropy += share * calcShannonEnt(subset)
        gain = baseEntropy - weightedEntropy
        if gain > bestGain:
            bestGain = gain
            bestIndex = col
    return bestIndex
def majorityCnt(classList):
    """Return the class label occurring most often in *classList*.

    Args:
        classList: list of class labels (hashable values).
    Returns:
        The most frequent label; ties broken by sort order of counts.
    """
    classCount = {}
    for vote in classList:
        # Original bug: `classCount += 1` tried to increment the dict itself
        # (TypeError). Increment the per-label tally instead.
        classCount[vote] = classCount.get(vote, 0) + 1
    # `iteritems()` was Python-2-only; `items()` works on both.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTrees(dataSet, labels):
    """Recursively build an ID3 decision tree stored as nested dicts.

    Args:
        dataSet: list of rows; the last column is the class label.
        labels: feature names for the columns of dataSet (not mutated).
    Returns:
        A class label (leaf) or a dict of the form
        {featureName: {featureValue: subtree_or_label}}.
    """
    classList = [example[-1] for example in dataSet]
    # Stop when every row carries the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeature = chooseBestFeatureToSplit(dataSet)
    bestFeatureLabel = labels[bestFeature]
    myTree = {bestFeatureLabel: {}}
    # Original bug: `del labels[bestFeature]` mutated the CALLER's list, so
    # labels shrank after training and no longer matched the data columns.
    # Build the reduced name list on a copy instead.
    remainingLabels = labels[:bestFeature] + labels[bestFeature + 1:]
    featValues = {example[bestFeature] for example in dataSet}
    for value in featValues:
        myTree[bestFeatureLabel][value] = createTrees(
            splitDataSet(dataSet, bestFeature, value), remainingLabels[:])
    return myTree
def classify(inputTree, featLabels, testVec):
    """Classify one sample by walking the decision tree recursively.

    Args:
        inputTree: learned tree, a nested dict {featureName: {value: subtree}}.
        featLabels: feature names, in the column order of the training data.
        testVec: one sample, a list of feature values.
    Returns:
        The predicted class label.
    Raises:
        UnboundLocalError if testVec's value for the current feature is not a
        key of the tree node (preserved from the original's behavior).
    """
    # Original used `inputTree.keys()[0]`, which raises TypeError on Python 3
    # (dict views are not indexable); `next(iter(...))` works on both.
    firstStr = next(iter(inputTree))          # root feature name of this subtree
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)    # which column of testVec to inspect
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):   # internal node: recurse
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:                                   # leaf: the label itself
                classLabel = secondDict[key]
    return classLabel
def storeTree(inputTree, filename):
    """Persist a decision tree (a plain dict) to *filename* using pickle.

    Args:
        inputTree: learned decision tree, dict format.
        filename: path of the file to write.
    """
    import pickle
    # Original bugs: pickle.dump was given the filename string instead of the
    # file object, and text mode 'w' breaks binary pickle on Python 3.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """Load a pickled decision tree saved by storeTree.

    Args:
        filename: path of the pickle file to read.
    Returns:
        The unpickled decision tree (dict format).
    """
    import pickle
    # 'rb' is required for binary pickle data on Python 3; `with` guarantees
    # the handle is closed (the original leaked it).
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
# Smoke test. The original was pasted interpreter output: it imported the
# module as `tree`, called the misspelled `tree.creatTrees` (AttributeError),
# and left two bare expression lines. Call the local functions directly and
# guard with __main__ so importing this module has no side effects.
if __name__ == '__main__':
    dataSet, labels = createDataSet()
    myTree = createTrees(dataSet, labels)
    print(myTree)
    # Expected output:
    # {'no surfacing': {0: 'No', 1: {'flippers': {0: 'No', 1: 'yes'}}}}