# 李航《統計學習方法》第五章案例 5.3:決策樹的生成 (Page 77)
# (Li Hang, "Statistical Learning Methods", ch. 5, example 5.3: decision-tree generation)
# 依照書中給出的答案,用 Python 實現並打印生成的決策樹。
# 代碼如下:
import numpy as np
import pandas as pd
class Node:
    """One node of the decision tree.

    Leaves carry a predicted label in ``y_value``; internal nodes carry the
    split feature in ``feature_name`` and one child per feature value.
    """

    def __init__(self, node_id, content=None, complete_featrues=None):
        self.node_id = node_id
        self.isLeaf = None          # True for leaves, False for internal nodes
        self.y_value = None         # predicted label (set on leaves only)
        self.feature_name = None    # split feature (set on internal nodes only)
        self.feature_types = None   # distinct values of the split feature, one child each
        self.children = []          # child Nodes, aligned with feature_types
        self.content = content     # (x_dataframe, y_dataframe) samples reaching this node
        # Full training column list; only populated on the root (used by predict).
        # NOTE: 'featrues' typo is kept — it is part of the public attribute name.
        self.complete_featrues = complete_featrues

    def __repr__(self):
        # Show child ids instead of full children to keep the output one line per node.
        # (The original bound the result to a local named `str`, shadowing the builtin.)
        children_ids = [child.node_id for child in self.children]
        return 'node_id:{}, y_value:{}, feature_name:{}, feature_values:{}, children:{}'.format(
            self.node_id, self.y_value, self.feature_name, self.feature_types, children_ids)
class Tree:
    """Decision-tree classifier for categorical features.

    method: 'info_gain' selects features by information gain (ID3);
            'info_gain_ratio' by information-gain ratio (C4.5).
    """

    def __init__(self, method="info_gain_ratio"):
        self.method = method    # feature-selection criterion
        self.rootNode = None    # set by fit()
        self.node_ids = set()   # bare 'parent-i' ids issued so far, for uniqueness

    def _node_id_generator(self, currentId, feature_name, judge_name):
        """Return a fresh child id of the form '<parent>-<i>:<feature>-<value>'."""
        base = currentId.split(sep=':')[0]  # strip the ':feature-value' suffix
        i = 1
        candidate = '{}-{}'.format(base, i)
        while candidate in self.node_ids:
            i += 1
            candidate = '{}-{}'.format(base, i)
        self.node_ids.add(candidate)
        return '{}:{}-{}'.format(candidate, feature_name, judge_name)

    def _experience_entrophy(self, x_train):
        """Empirical entropy H(D) = -sum_i p_i * log2(p_i) over label frequencies."""
        _, counts = np.unique(x_train, return_counts=True)
        totalCount = np.sum(counts)
        res = 0.0
        for count in counts:
            percent = count / totalCount
            res += -percent * np.log2(percent)
        return res

    def _conditional_entrophy(self, one_x_train, y_train):
        """Conditional entropy H(D|A) = sum_i |D_i|/|D| * H(D_i).

        Fixes vs. the original:
        - each subset entropy is weighted by |D_i|/|D| (the original added
          unweighted entropies and computed the weight with the number of
          distinct feature values as denominator, then never used it);
        - subsets are selected with a 1-D boolean mask (the original's 2-D
          mask element-masked the DataFrame, injecting NaNs that np.unique
          then counted as extra classes).
        """
        if len(one_x_train) != len(y_train):
            print("x len: {},y len:{}".format(len(one_x_train), len(y_train)))
            print("x : {} \n y :{}".format(one_x_train, y_train))
            raise ValueError("feature and label columns have different lengths")
        x_arr = np.asarray(one_x_train)
        y_arr = np.asarray(y_train).ravel()
        values, counts = np.unique(x_arr, return_counts=True)
        total = len(x_arr)
        res = 0.0
        for value, count in zip(values, counts):
            weight = count / total
            subset = y_arr[x_arr == value]
            res += weight * self._experience_entrophy(subset)
        return res

    def _info_gain(self, one_x_train, y_train):
        """Information gain g(D, A) = H(D) - H(D|A)."""
        return self._experience_entrophy(y_train) - self._conditional_entrophy(one_x_train, y_train)

    def _info_gain_ratio(self, one_x_train, y_train):
        """Information-gain ratio g_R(D, A) = g(D, A) / H_A(D)."""
        split_info = self._experience_entrophy(one_x_train)
        if split_info == 0.0:
            # Feature takes a single value here: splitting on it is useless.
            # (The original divided by zero in this case.)
            return 0.0
        return self._info_gain(one_x_train, y_train) / split_info

    def _choose_feature(self, x_dataframe, y_dataframe):
        """Return the column name maximizing the configured criterion."""
        features = x_dataframe.columns
        scores = []  # (the original bound this to `list`, shadowing the builtin)
        for col in features:
            if self.method == 'info_gain':
                scores.append(self._info_gain(x_dataframe[col], y_dataframe[0]))
            elif self.method == 'info_gain_ratio':
                scores.append(self._info_gain_ratio(x_dataframe[col], y_dataframe[0]))
            else:
                raise TypeError
        return features[np.argmax(scores)]

    def _build_tree(self, node):
        """Recursively grow the subtree rooted at `node` (mutated in place)."""
        x_dataframe, y_dataframe = node.content
        featrues_remains = x_dataframe.columns
        y_values, y_counts = np.unique(y_dataframe, return_counts=True)
        # Stop when labels are pure or only one feature column remains;
        # the leaf predicts the majority label.
        if len(featrues_remains) == 1 or len(y_values) == 1:
            node.y_value = y_values[np.argmax(y_counts)]
            node.isLeaf = True
            return node
        node.isLeaf = False
        node.y_value = None
        node.feature_name = self._choose_feature(x_dataframe, y_dataframe)
        node.feature_types = np.unique(x_dataframe[node.feature_name])
        # One child per observed feature value (e.g. "有自己的房子" -> 是 / 否).
        for feature_value in node.feature_types:
            mask = x_dataframe[node.feature_name] == feature_value
            new_x_dataframe = x_dataframe.loc[mask].drop(labels=node.feature_name, axis=1)
            new_y_dataframe = y_dataframe[mask.values]
            new_nodeId = self._node_id_generator(node.node_id, node.feature_name, feature_value)
            child = Node(node_id=new_nodeId, content=(new_x_dataframe, new_y_dataframe))
            self._build_tree(child)
            node.children.append(child)

    def fit(self, x_dataframe, y_dataframe):
        """Build the tree from a feature DataFrame and a single-column label DataFrame."""
        self.rootNode = Node(node_id='1:根節點', content=(x_dataframe, y_dataframe),
                             complete_featrues=x_dataframe.columns)
        self._build_tree(self.rootNode)

    def _getResult(self, node, test_dataframe):
        """Walk the tree for the first row of `test_dataframe`; return the leaf label."""
        if node.isLeaf:
            return node.y_value
        # Only row 0 is classified — predict() handles one sample at a time.
        feature_value = test_dataframe[node.feature_name][0]
        next_node = None
        for i in range(len(node.feature_types)):
            if node.feature_types[i] == feature_value:
                next_node = node.children[i]
        # NOTE(review): a feature value never seen in training leaves next_node
        # as None and the recursion raises AttributeError — unchanged behavior.
        return self._getResult(next_node, test_dataframe)

    def predict(self, x_test):
        """Classify a single sample given as a list of one row of feature values."""
        test_dataframe = pd.DataFrame(x_test, columns=self.rootNode.complete_featrues)
        return self._getResult(self.rootNode, test_dataframe)

    def _print_tree(self, node):
        """Print `node` and all descendants in pre-order."""
        print(node)
        for child in node.children:
            self._print_tree(child)

    def print_tree(self):
        """Print the whole tree from the root."""
        self._print_tree(self.rootNode)
def test(tree,x_testData, y_testData):
err=0
total_count=0
for i in range(len(x_testData)):
x_data =[x_testData[i]]
y_data =y_testData[i]
res = tree.predict(x_data)
if(res!=y_data):
err+=1
print(x_data)
print(y_data)
print('res',res)
else:
total_count+=1
print('err count = {}'.format(err))
print('total_count = {}'.format(total_count))
def getData():
    """Return the loan-application training set from the book's table.

    Returns (x_dataframe, y_dataframe, x_train, y_train): the features as a
    DataFrame, labels as a one-column DataFrame, plus both raw ndarrays.
    """
    features = ["年齡", "有工作", "有自己的房子", "信貸情況"]
    # Each row: (age, has-job, owns-house, credit, label).
    samples = [
        ("青年", "否", "否", "一般", "否"),
        ("青年", "否", "否", "好", "否"),
        ("青年", "是", "否", "好", "是"),
        ("青年", "是", "是", "一般", "是"),
        ("青年", "否", "否", "一般", "否"),
        ("中年", "否", "否", "一般", "否"),
        ("中年", "否", "否", "好", "否"),
        ("中年", "是", "是", "好", "是"),
        ("中年", "否", "是", "非常好", "是"),
        ("中年", "否", "是", "非常好", "是"),
        ("老年", "否", "是", "非常好", "是"),
        ("老年", "否", "是", "好", "是"),
        ("老年", "是", "否", "好", "是"),
        ("老年", "是", "否", "非常好", "是"),
        ("老年", "否", "否", "一般", "否"),
    ]
    x_train = np.array([row[:4] for row in samples])
    y_train = np.array([row[4] for row in samples])
    x_dataframe = pd.DataFrame(x_train, columns=features)
    y_dataframe = pd.DataFrame(y_train)
    return x_dataframe, y_dataframe, x_train, y_train
def run():
    """Fit a tree on the book's data, evaluate it, print it, and classify a new sample."""
    x_dataframe, y_dataframe, x_testData, y_testData = getData()
    t = Tree(method='info_gain_ratio')  # C4.5: information-gain ratio
    # t = Tree(method='info_gain')      # ID3: plain information gain
    t.fit(x_dataframe, y_dataframe)

    print("**測試訓練集樣本依照決策樹的分類結果**")
    test(t, x_testData, y_testData)
    print()

    print("***************打印決策樹***************")
    t.print_tree()
    print()

    print("*測試非訓練集樣本依照決策樹的分類結果*")
    print(t.rootNode.complete_featrues)
    testData2 = [["青年", "否", "否", "非常好"]]
    print(testData2)
    prediction = t.predict(testData2)
    print('結果:', prediction)
if __name__ == '__main__':
    # Script entry point: build, evaluate and print the example tree.
    import sys

    run()
    sys.exit(0)