用信息增益比構建決策樹,實現李航統計學習方法第五章案例

李航<統計學習方法>第五章案例 5.3決策樹的生成 Page77

 

書中給的答案

 

用Python實現並打印生成的決策樹

代碼

import numpy as np
import pandas as pd

class Node():
    """A single node of the decision tree.

    A node carries either a predicted label (leaf) or a splitting feature
    plus one child per observed feature value (internal node).
    """

    def __init__(self, node_id, content=None, complete_featrues=None):
        self.node_id = node_id              # unique id, e.g. '1-2:有工作-是'
        self.isLeaf = None                  # True once the builder marks this a leaf
        self.y_value = None                 # class label (set on leaves only)
        self.feature_name = None            # splitting feature (internal nodes only)
        self.feature_types = None           # distinct values of the splitting feature
        self.children = []                  # children, aligned with feature_types
        self.content = content              # (x_dataframe, y_dataframe) slice for this node
        # Full feature list, stored on the root only. The attribute name keeps
        # the original spelling ('featrues') because callers read it directly.
        self.complete_featrues = complete_featrues

    def __repr__(self):
        child_ids = [child.node_id for child in self.children]
        return 'node_id:{}, y_value:{}, feature_name:{}, feature_values:{}, children:{}'.format(
            self.node_id, self.y_value, self.feature_name, self.feature_types, child_ids)

class Tree():
    """Decision tree for categorical features (Li Hang, ch. 5).

    method: 'info_gain' selects splits by information gain (ID3);
            'info_gain_ratio' selects by information-gain ratio (C4.5).
    """

    def __init__(self, method="info_gain_ratio"):
        self.method = method
        self.rootNode = None
        self.node_ids = set()  # every bare id handed out, to keep ids unique

    def _node_id_generator(self, currentId, feature_name, judge_name):
        """Create a unique child id '<parent>-<k>:<feature>-<value>'."""
        base = currentId.split(sep=':')[0]  # strip the human-readable suffix
        i = 1
        # Renamed local (was 'str'): never shadow the builtin.
        candidate = '{}-{}'.format(base, i)
        while candidate in self.node_ids:
            i += 1
            candidate = '{}-{}'.format(base, i)
        self.node_ids.add(candidate)
        return '{}:{}-{}'.format(candidate, feature_name, judge_name)

    def _experience_entrophy(self, x_train):
        """Empirical entropy H(X) of a 1-D array of labels/values, in bits."""
        values, counts = np.unique(x_train, return_counts=True)
        totalCount = np.sum(counts)
        res = 0.0
        for count in counts:
            percent = count / totalCount
            res += -percent * np.log2(percent)
        return res

    def _conditional_entrophy(self, one_x_train, y_train):
        """Conditional entropy H(Y|X) = sum_i p(X=x_i) * H(Y|X=x_i)."""
        if (len(one_x_train) != len(y_train)):
            print("x len: {},y len:{}".format(len(one_x_train), len(y_train)))
            print("x : {} \n y :{}".format(one_x_train, y_train))
            raise Exception
        x_values, x_counts = np.unique(one_x_train, return_counts=True)
        total = np.sum(x_counts)
        x = pd.DataFrame(one_x_train)
        y = pd.DataFrame(y_train)
        res = 0.0
        for value, count in zip(x_values, x_counts):
            # BUGFIX: the weight is p(X=value) = count/total, and it must
            # multiply the subset entropy. The original divided by the number
            # of distinct values and never applied the weight at all, so it
            # summed unweighted entropies (formula 5.8 in the book).
            percent = count / total
            subset = y[(x == value).values]
            res += percent * self._experience_entrophy(subset)
        return res

    def _info_gain(self, one_x_train, y_train):
        """Information gain g(D, A) = H(D) - H(D|A)."""
        return self._experience_entrophy(y_train) - self._conditional_entrophy(one_x_train, y_train)

    def _info_gain_ratio(self, one_x_train, y_train):
        """Information-gain ratio g_R(D, A) = g(D, A) / H_A(D)."""
        split_info = self._experience_entrophy(one_x_train)
        if split_info == 0.0:
            # Feature takes a single value: the gain is also 0, so return 0
            # instead of raising ZeroDivisionError.
            return 0.0
        return self._info_gain(one_x_train, y_train) / split_info

    def _choose_feature(self, x_dataframe, y_dataframe):
        """Return the column name with the best score under self.method."""
        features = x_dataframe.columns
        scores = []  # renamed local (was 'list'): never shadow the builtin
        for name in features:
            if (self.method == 'info_gain'):
                scores.append(self._info_gain(x_dataframe[name], y_dataframe[0]))
            elif (self.method == 'info_gain_ratio'):
                scores.append(self._info_gain_ratio(x_dataframe[name], y_dataframe[0]))
            else:
                raise TypeError("unknown method: {}".format(self.method))
        return features[np.argmax(scores)]

    def _build_tree(self, node):
        """Recursively grow the tree below `node` (node.content = (X, y))."""
        x_dataframe = node.content[0]
        y_dataframe = node.content[1]
        featrues_remains = x_dataframe.columns
        y_values, y_counts = np.unique(y_dataframe, return_counts=True)
        if (len(featrues_remains) == 1 or len(y_values) == 1):
            # Leaf: out of usable features, or the labels are already pure.
            node.y_value = y_values[np.argmax(y_counts)]  # majority label
            node.isLeaf = True
            return node
        else:
            node.isLeaf = False
            node.y_value = None
            node.feature_name = self._choose_feature(x_dataframe, y_dataframe)
            node.feature_types = np.unique(x_dataframe[node.feature_name])

            # One child per observed value of the chosen feature, e.g. the
            # feature "has own house" yields a "yes" child and a "no" child.
            for feature_value in node.feature_types:
                mask = (x_dataframe[node.feature_name] == feature_value)
                # Drop the used feature so it is not reconsidered below here.
                new_x_dataframe = x_dataframe.loc[mask].drop(labels=node.feature_name, axis=1)
                new_y_dataframe = y_dataframe[mask.values]
                new_nodeId = self._node_id_generator(node.node_id, node.feature_name, feature_value)
                new_node = Node(node_id=new_nodeId, content=(new_x_dataframe, new_y_dataframe))
                self._build_tree(new_node)
                node.children.append(new_node)

    def fit(self, x_dataframe, y_dataframe):
        """Build the tree from a feature DataFrame and a label DataFrame."""
        self.rootNode = Node(node_id='1:根節點', content=(x_dataframe, y_dataframe),
                             complete_featrues=x_dataframe.columns)
        self._build_tree(self.rootNode)

    def _getResult(self, node, test_dataframe):
        """Walk from `node` to a leaf following the sample's feature values."""
        if (node.isLeaf):
            return node.y_value
        else:
            feature_value = test_dataframe[node.feature_name][0]
            next_node = None
            for i in range(len(node.feature_types)):
                if (node.feature_types[i] == feature_value):
                    next_node = node.children[i]
            # NOTE(review): next_node stays None for a feature value never seen
            # during training, which makes the recursive call fail; kept as-is.
            return self._getResult(next_node, test_dataframe)

    def predict(self, x_test):
        """Classify one sample given as [[v1, v2, ...]]; returns its label."""
        test_dataframe = pd.DataFrame(x_test, columns=self.rootNode.complete_featrues)
        return self._getResult(self.rootNode, test_dataframe)

    def _print_tree(self, node):
        # Pre-order traversal printing every node's repr.
        print(node)
        if (len(node.children) > 0):
            for child in node.children:
                self._print_tree(child)

    def print_tree(self):
        """Print the whole tree, root first."""
        self._print_tree(self.rootNode)


def test(tree, x_testData, y_testData):
    """Classify every sample with `tree` and report misclassification counts.

    Prints each misclassified sample, then the error count and the number of
    correctly classified samples.
    """
    err = 0
    total_count = 0
    for sample, expected in zip(x_testData, y_testData):
        x_data = [sample]
        predicted = tree.predict(x_data)
        if predicted != expected:
            err += 1
            print(x_data)
            print(expected)
            print('res', predicted)
        else:
            # Counts correct predictions only (name kept from the original).
            total_count += 1

    print('err count = {}'.format(err))
    print('total_count  = {}'.format(total_count))

def getData():
    """Return the loan-applicant dataset from Li Hang's book (Table 5.1).

    Returns:
        (x_dataframe, y_dataframe, x_train, y_train): the DataFrames wrap the
        raw numpy arrays; labels are "是"/"否" (approve / reject).
    """
    features = ["年齡", "有工作", "有自己的房子", "信貸情況"]
    # Each tuple: (age, has-job, owns-house, credit, label).
    samples = [
        ("青年", "否", "否", "一般", "否"),
        ("青年", "否", "否", "好", "否"),
        ("青年", "是", "否", "好", "是"),
        ("青年", "是", "是", "一般", "是"),
        ("青年", "否", "否", "一般", "否"),
        ("中年", "否", "否", "一般", "否"),
        ("中年", "否", "否", "好", "否"),
        ("中年", "是", "是", "好", "是"),
        ("中年", "否", "是", "非常好", "是"),
        ("中年", "否", "是", "非常好", "是"),
        ("老年", "否", "是", "非常好", "是"),
        ("老年", "否", "是", "好", "是"),
        ("老年", "是", "否", "好", "是"),
        ("老年", "是", "否", "非常好", "是"),
        ("老年", "否", "否", "一般", "否"),
    ]
    x_train = np.array([row[:-1] for row in samples])
    y_train = np.array([row[-1] for row in samples])
    x_dataframe = pd.DataFrame(x_train, columns=features)
    y_dataframe = pd.DataFrame(y_train)
    return x_dataframe, y_dataframe, x_train, y_train

def run():
    """Fit a decision tree on the textbook data, evaluate it, and print it."""
    x_dataframe, y_dataframe, x_testData, y_testData = getData()
    tree = Tree(method='info_gain_ratio')  # information-gain ratio (C4.5 style)
    # tree = Tree(method='info_gain')      # plain information gain (ID3 style)
    tree.fit(x_dataframe, y_dataframe)

    print("**測試訓練集樣本依照決策樹的分類結果**")
    test(tree, x_testData, y_testData)
    print()
    print("***************打印決策樹***************")
    tree.print_tree()
    print()
    print("*測試非訓練集樣本依照決策樹的分類結果*")
    print(tree.rootNode.complete_featrues)
    # A sample that does not appear in the training set.
    testData2 = [["青年", "否", "否", "非常好"]]
    print(testData2)
    res = tree.predict(testData2)

    print('結果:', res)

if __name__ == '__main__':
    import sys
    # Build, evaluate and print the decision tree, then exit with status 0.
    run()
    sys.exit(0)

 

 

 

 

發佈了70 篇原創文章 · 獲贊 26 · 訪問量 15萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章