Task04: Decision Trees

Theory

  • Feature selection: information gain (entropy, joint entropy, conditional entropy), information gain ratio, and the Gini index (a small computational sketch follows this list)
  • Tree generation: ID3, C4.5, and CART (CART classification trees and CART regression trees)
  • Tree pruning
  • A detailed walk-through of the sklearn parameters
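
As a quick refresher on the selection criteria above, here is a minimal sketch that computes entropy, information gain (via conditional entropy), and the Gini index; the helper names and the toy label/feature vectors are made up purely for illustration:

import numpy as np

def entropy(y):
    # H(Y) = -sum_k p_k * log2(p_k) over the class frequencies.
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def gini_index(y):
    # Gini(Y) = 1 - sum_k p_k^2.
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def information_gain(y, x):
    # IG(Y, X) = H(Y) - H(Y|X) for a discrete feature x.
    h_cond = 0.0
    for v in np.unique(x):
        mask = (x == v)
        h_cond += mask.mean() * entropy(y[mask])  # weighted conditional entropy
    return entropy(y) - h_cond

y = np.array([0, 0, 1, 1, 1, 0])              # toy labels
x = np.array(["a", "a", "b", "b", "b", "a"])  # toy discrete feature
print(entropy(y), gini_index(y), information_gain(y, x))
# -> 1.0 0.5 1.0  (the feature separates the two classes perfectly)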

Hands-on Practice

  • Use sklearn for classification and regression prediction (a short usage sketch follows this list).
  • sklearn.tree.DecisionTreeClassifier
  • sklearn.tree.DecisionTreeRegressor
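
Before the from-scratch version, here is a minimal usage sketch of both sklearn estimators; the datasets, `max_depth=3`, and `random_state=0` are illustrative choices, not prescribed settings:

from sklearn import tree
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import train_test_split

# Classification on the iris dataset.
X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = tree.DecisionTreeClassifier(max_depth=3)
clf.fit(X_tr, y_tr)
print("classification accuracy:", clf.score(X_te, y_te))

# Regression on the diabetes dataset.
X, y = load_diabetes(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
reg = tree.DecisionTreeRegressor(max_depth=3)
reg.fit(X_tr, y_tr)
print("regression R^2:", reg.score(X_te, y_te))

The rest of the post implements a CART-style classification tree from scratch using the Gini index: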
import copy

import numpy as np
import pandas as pd


class DecisionTree(object):
    """自定的樹結構,用來保存決策樹.

    Paramters:
    ----------
    col: int, default(-1)
        當前使用的第幾列數據

    val: int or float or str, 分割節點
        分割節點的值,
        int or float : 使用大於進行比較
        str : 使用等於模式

    LeftChild: DecisionTree
        左子樹, <= val

    RightChild: DecisionTree
        右子樹, > val

    results:
    """

    def __init__(self, col=-1, val=None, LeftChild=None, RightChild=None, result=None):
        self.col = col
        self.val = val
        self.LeftChild = LeftChild
        self.RightChild = RightChild
        self.result = result
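
# For example, a depth-1 stump splitting on column 0 at value 2.5 would be
# built as below; internal nodes carry `col`/`val`, leaves carry `result`:
#
#   stump = DecisionTree(col=0, val=2.5,
#                        LeftChild=DecisionTree(result=0),   # x[0] <= 2.5
#                        RightChild=DecisionTree(result=1))  # x[0] > 2.5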


class DecisionTreeClassifier(object):
    """使用基尼指數的分類決策樹接口.

    Paramters:
    ---------
    max_depth : int or None, optional(dafault=None)
        表示決策樹的最大深度. None: 表示不設置深度,可以任意擴展,
        直到葉子節點的個數小於min_samples_split個數.

    min_samples_split : int, optional(default=2)
        表示最小分割樣例數.
        if int, 表示最小分割樣例樹,如果小於這個數字,不在進行分割.

    min_samples_leaf : int, optional (default=1)
        表示葉節點最少有min_samples_leaf個節點樹,如果小於等於這個數,直接返回.
        if int, min_samples_leaf就是最小樣例數.

    min_impurity_decrease : float, optional (default=0.)
        分割之後基尼指數大於這個數,則進行分割.
        N_t / N * (impurity - N_t_R / N_t * right_impurity
                        - N_t_L / N_t * left_impurity)

    min_impurity_split : float, default=1e-7
        停止增長的閾值,小於這個值直接返回.

    Attributes
    ----------
    classes_ : array of shape (n_classes,) or a list of such arrays
        表示所有的類

    feature_importances_ : ndarray of shape (n_features,)
        特徵重要性, 被選擇最優特徵的次數,進行降序.

    tree_ : Tree object
        The underlying Tree object.
    """

    def __init__(self,
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_impurity_decrease=0.,
                 min_impurity_split=1e-7):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.classes_ = None
        self.max_features_ = None
        self.decision_tree = None
        self.all_feats = None

    def fit(self, X, y, check_input=True):
        """使用X和y訓練決策樹的分類模型.

        Parameters
        ----------
        X : {array-like} of shape (n_samples, n_features)
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32``

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels) as integers or strings.

        check_input : bool, (default=True)
            Allows bypassing several input checks.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        if isinstance(X, list):
            X = self.__check_array(X)
        if isinstance(y, list):
            y = self.__check_array(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y have mismatched lengths")

        self.classes_ = list(set(y))
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, pd.DataFrame):
            y = y.values

        # Append the labels as the last column so each row carries its target.
        data_origin = np.c_[X, y]
        self.all_feats = [i for i in range(X.shape[1])]
        self.max_features_ = X.shape[1]

        data = copy.deepcopy(data_origin)
        self.decision_tree = self.__build_tree(data, 0)
        return self

    def __predict_one(self, input_x):
        """預測一個樣例的返回結果.

        Paramters:
        ---------
        input_x : list or np.ndarray
            需要預測輸入數據

        Returns:
        -------
        class : 對應的類
        """

        tree = self.decision_tree

        # ============================= show me your code =======================
        def run(input_x, tree):
            """Walk the tree recursively until a leaf is reached."""
            # Leaf node: return its stored class.
            if tree.result is not None:
                return tree.result
            v = input_x[tree.col]
            if isinstance(v, (int, float)):
                # Numeric feature: <= goes left, > goes right.
                if v <= tree.val:
                    tree = tree.LeftChild
                else:
                    tree = tree.RightChild
            elif isinstance(v, str):
                # Categorical feature: == goes left, != goes right.
                if v == tree.val:
                    tree = tree.LeftChild
                else:
                    tree = tree.RightChild
            return run(input_x, tree)

        pre_y = run(input_x, tree)
        # ============================= show me your code =======================
        return pre_y

    def predict(self, test):
        """預測函數,

        Paramters:
        ---------
        test: {array-like} of shape (n_samples, n_features)

        Returns:
        result : np.array(list)
        """
        result = []
        for i in range(len(test)):
            result.append(self.__predict_one(test[i]))
        return np.array(result)

    def score(self, vali_X, vali_y):
        """驗證模型的特徵,這裏使用準確率.
        Parameters
        ----------
        vali_X : {array-like} of shape (n_samples, n_features)
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32``

        vali_y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels) as integers or strings.

        Returns:
        -------
        score : float, 預測的準確率
        """
        vali_y = np.array(vali_y)
        pre_y = self.predict(vali_X)
        pre_score = 1.0 * sum(vali_y == pre_y) / len(vali_y)
        return pre_score

    def __build_tree(self, data, depth):
        """創建決策樹的主要代碼

        Paramters:
        ---------
        data : {array-like} of shape (n_samples, n_features) + {label}
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32``

        depth: int, 樹的深度

        Returns:
        -------
        DecisionTree

        """
        labels = np.unique(data[:, -1])
        # Only one class remains: stop and return that class.
        if len(labels) == 1:
            return DecisionTree(result=list(labels)[0])

        # All features have been used; return the most frequent class label.
        if not self.all_feats:
            return DecisionTree(result=np.argmax(np.bincount(data[:, -1].astype(int))))

        # Maximum depth exceeded; use the most frequent class for this leaf.
        if self.max_depth and depth > self.max_depth:
            return DecisionTree(result=np.argmax(np.bincount(data[:, -1].astype(int))))

        # If no more than min_samples_split samples remain, stop splitting
        # and return the most frequent class as a leaf.
        if self.min_samples_split >= data.shape[0]:
            return DecisionTree(result=np.argmax(np.bincount(data[:, -1].astype(int))))

        # If the node holds no more than min_samples_leaf samples,
        # return the most frequent class as a leaf.
        if self.min_samples_leaf >= data.shape[0]:
            return DecisionTree(result=np.argmax(np.bincount(data[:, -1].astype(int))))

        # Select the best split feature and value for this node by the Gini index.
        best_idx, best_val, min_gini = self.__getBestFeature(data)
        # If the impurity is already below the threshold, stop and return.
        if min_gini < self.min_impurity_split:
            return DecisionTree(result=np.argmax(np.bincount(data[:, -1].astype(int))))

        leftData, rightData = self.__splitData(data, best_idx, best_val)

        # ============================= show me your code =======================
        leftDecisionTree = self.__build_tree(leftData, depth + 1)
        rightDecisionTree = self.__build_tree(rightData, depth + 1)
        # ============================= show me your code =======================

        return DecisionTree(col=best_idx, val=best_val, LeftChild=leftDecisionTree, RightChild=rightDecisionTree)

    def __getBestFeature(self, data):
        """得到最優特徵對應的列
        Paramters:
        ---------
        data: np.ndarray
            從data中選擇最優特徵

        Returns:
        -------
        bestInx, val, 最優特徵的列的索引和使用的值.
        """
        best_idx = -1
        best_val = None
        min_gini = 1.0
        # Iterate over the feature columns that are still available.
        # ============================= show me your code =======================
        for feat_idx in self.all_feats:
            # Try every observed value of this feature as a split point.
            # Note: numpy upcasts array elements to a common dtype, so the
            # isinstance-based dispatch in __splitData sees floats here.
            for val in data[:, feat_idx]:
                leftData, rightData = self.__splitData(data, feat_idx, val)
                left_gini = self.gini(leftData[:, -1])
                right_gini = self.gini(rightData[:, -1])
                # Weighted Gini index of the candidate split.
                cur_gini = 1.0 * len(leftData) / len(data) * left_gini
                cur_gini += 1.0 * len(rightData) / len(data) * right_gini

                if cur_gini < min_gini:
                    best_idx = feat_idx
                    best_val = val
                    min_gini = cur_gini
        # ============================= show me your code =======================
        # Mark the chosen feature as used (each feature is split on at most once).
        self.all_feats.remove(best_idx)

        return best_idx, best_val, min_gini

    def gini(self, labels):
        """計算基尼指數.

        Paramters:
        ----------
        labels: list or np.ndarray, 數據對應的類目集合.

        Returns:
        -------
        gini : float ``` Gini(p) = \sum_{k=1}^{K}p_k(1-p_k)=1-\sum_{k=1}^{K}p_k^2 ```

        """
        # ============================= show me your code =======================
        labelSet = np.array(labels)
        length = labelSet.shape[0]
        gini = 1.
        classes = np.unique(labelSet)
        for c in classes:
            gini -= (1.0 * np.sum(labelSet == c) / length) ** 2
        # ============================= show me your code =======================
        return gini

    def __splitData(self, data, featColumn, val):
        """Split the data into left and right parts on one feature.

        Parameters
        ----------
        data : np.ndarray, the data to split

        featColumn : int, index of the column to split on

        val : int, float, or str, the split value
            int or float : split by comparison
            str : split by equality

        Returns
        -------
        leftData, rightData
            numeric : leftData has values <= val, rightData has values > val
            str : leftData has values == val, rightData has values != val
        """
        if isinstance(val, str):
            leftData = data[data[:, featColumn] == val]
            rightData = data[data[:, featColumn] != val]
        elif isinstance(val, int) or isinstance(val, float):
            leftData = data[data[:, featColumn] <= val]
            rightData = data[data[:, featColumn] > val]
        return leftData, rightData

    def __check_array(self, X):
        """檢查數據類型
        Parameters:
        ----------
        X : {array-like} of shape (n_samples, n_features)
            The training input samples.

        Retures
        -------
        X: {array-like} of shape (n_samples, n_features)
        """
        if isinstance(X, list):
            X = np.array(X)
        if not isinstance(X, np.ndarray) and not isinstance(X, pd.DataFrame):
            raise ValueError("輸出數據不合法,目前只支持np.ndarray or pd.DataFrame")
        return X

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    # Classification tree demo on the iris dataset.
    X, y = load_iris(return_X_y=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf = DecisionTreeClassifier()

    clf.fit(X_train, y_train)

    print("Classifier Score:", clf.score(X_test, y_test))

 
