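"""Task04: Decision trees.

Theory
- Feature selection: information gain (entropy, joint entropy, conditional
  entropy), information gain ratio, Gini index
- Tree construction: ID3, C4.5 and CART decision trees (CART classification
  tree, CART regression tree)
- Decision tree pruning
- sklearn parameters explained

Practice
- Solve classification and regression problems with sklearn:
  sklearn.tree.DecisionTreeClassifier
  sklearn.tree.DecisionTreeRegressor
"""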
import copy
import numbers
import warnings
from math import ceil

import numpy as np
import pandas as pd
from scipy.sparse import issparse


class DecisionTree(object):
    """Node of a binary decision tree.

    A node is either an internal split (``col``/``val`` plus two children)
    or a leaf (``result`` set).

    Parameters
    ----------
    col : int, default=-1
        Index of the feature column tested at this node.

    val : int or float or str, default=None
        Value the feature is tested against.
        int or float : samples are routed by an ordering comparison.
        str : samples are routed by an equality comparison.

    LeftChild : DecisionTree, default=None
        Left subtree, holding samples with feature ``<= val``.

    RightChild : DecisionTree, default=None
        Right subtree, holding samples with feature ``> val``.

    result : object, default=None
        Value stored at a leaf; ``None`` by default.
    """

    def __init__(self, col=-1, val=None, LeftChild=None, RightChild=None, result=None):
        # Split description.
        self.col, self.val = col, val
        # Subtrees.
        self.LeftChild, self.RightChild = LeftChild, RightChild
        # Leaf payload.
        self.result = result


class DecisionTreeClassifier(object):
    """Binary classification tree grown with the Gini impurity criterion.

    Every internal node tests one feature against one value observed in the
    training data: for numeric values samples with ``x <= val`` go left, for
    string values samples with ``x == val`` go left.  A feature may be used
    again at deeper nodes.

    Parameters
    ----------
    max_depth : int or None, optional (default=None)
        Maximum depth of the tree; the root is at depth 0 and nodes at depth
        ``max_depth`` are always leaves.  ``None`` means no depth limit.

    min_samples_split : int, optional (default=2)
        A node holding fewer samples than this is not split.

    min_samples_leaf : int, optional (default=1)
        A node holding this many samples or fewer is not split, and a split
        is only considered when both children keep at least this many
        samples.

    min_impurity_decrease : float, optional (default=0.)
        A node is split only if the weighted impurity decrease
        ``N_t / N * (impurity - N_t_R / N_t * right_impurity
        - N_t_L / N_t * left_impurity)`` is at least this value.

    min_impurity_split : float, default=1e-7
        A node whose own Gini impurity is below this threshold becomes a
        leaf.

    Attributes
    ----------
    classes_ : list
        Sorted distinct class labels seen by ``fit``.

    max_features_ : int
        Number of feature columns seen by ``fit``.

    all_feats : list of int
        Indices of the feature columns that are searched for splits.

    decision_tree : DecisionTree
        Root node of the fitted tree (``None`` before ``fit``).
    """

    def __init__(self,
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_impurity_decrease=0.,
                 min_impurity_split=1e-7):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.classes_ = None
        self.max_features_ = None
        self.decision_tree = None
        self.all_feats = None
        # Size of the training set; needed by the impurity-decrease formula.
        self._n_samples = 0

    def fit(self, X, y, check_input=True):
        """Grow the tree from the training data.

        Parameters
        ----------
        X : {array-like} of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,) or (n_samples, 1)
            The target values (class labels) as integers or strings.

        check_input : bool, (default=True)
            When True, reject inputs that are not a list, tuple,
            ``np.ndarray``, ``pd.DataFrame`` or ``pd.Series``.  When False
            the inputs are passed straight to ``np.asarray``.

        Returns
        -------
        self : object
            Fitted estimator.

        Raises
        ------
        ValueError
            If the input type is unsupported, ``X`` is not 2-D, ``y`` is not
            1-D, the lengths differ or the training set is empty.
        """
        if check_input:
            X = self.__check_array(X)
            y = self.__check_array(y)
        else:
            X = np.asarray(X)
            y = np.asarray(y)

        # A single-column 2-D target (e.g. from a DataFrame) is flattened.
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.ravel()
        if X.ndim != 2:
            raise ValueError("X必須是二維數據 (n_samples, n_features)")
        if y.ndim != 1:
            raise ValueError("y必須是一維的類別標籤")
        if X.shape[0] != y.shape[0]:
            raise ValueError("輸入的數據X和y長度不匹配")
        if X.shape[0] == 0:
            raise ValueError("訓練數據不能爲空")

        # Labels are stored as integer codes (indices into classes_) so that
        # string or negative labels neither change the dtype of the feature
        # columns nor break the majority vote.
        classes, codes = np.unique(y, return_inverse=True)
        self.classes_ = list(classes)
        self.all_feats = list(range(X.shape[1]))
        self.max_features_ = X.shape[1]
        self._n_samples = X.shape[0]

        # np.c_ builds a fresh array, so the caller's data is never mutated.
        data = np.c_[X, codes]
        self.decision_tree = self.__build_tree(data, 0)
        return self

    def __predict_one(self, input_x):
        """Predict the class of a single sample.

        Parameters
        ----------
        input_x : list or np.ndarray
            Feature values of one sample, indexable by column number.

        Returns
        -------
        class : the label stored in the leaf reached by the sample.
        """
        node = self.decision_tree
        # Walk down iteratively until a leaf (a node carrying a result).
        while node.result is None:
            v = input_x[node.col]
            if isinstance(v, str) or isinstance(node.val, str):
                go_left = v == node.val
            else:
                # Covers Python and NumPy numeric scalars alike.
                go_left = v <= node.val
            node = node.LeftChild if go_left else node.RightChild
        return node.result

    def predict(self, test):
        """Predict the class of every sample in ``test``.

        Parameters
        ----------
        test : {array-like} of shape (n_samples, n_features)

        Returns
        -------
        result : np.ndarray with one predicted label per sample.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        if self.decision_tree is None:
            raise AttributeError("模型尚未訓練,請先調用fit")
        # Iterating a DataFrame yields column names, so use its row array.
        if isinstance(test, (pd.DataFrame, pd.Series)):
            test = test.values
        return np.array([self.__predict_one(row) for row in test])

    def score(self, vali_X, vali_y):
        """Return the accuracy of the model on a validation set.

        Parameters
        ----------
        vali_X : {array-like} of shape (n_samples, n_features)
            The validation input samples.

        vali_y : array-like of shape (n_samples,) or (n_samples, 1)
            The true class labels.

        Returns
        -------
        score : float, fraction of samples predicted correctly.

        Raises
        ------
        ValueError
            If the validation set is empty or the lengths differ.
        """
        vali_y = np.asarray(vali_y)
        if vali_y.ndim == 2 and vali_y.shape[1] == 1:
            vali_y = vali_y.ravel()
        pre_y = self.predict(vali_X)
        if len(vali_y) != len(pre_y):
            raise ValueError("輸入的數據X和y長度不匹配")
        if len(vali_y) == 0:
            raise ValueError("驗證數據不能爲空")
        return float(np.mean(vali_y == pre_y))

    def __build_tree(self, data, depth):
        """Recursively grow the subtree for ``data``.

        Parameters
        ----------
        data : np.ndarray of shape (n_samples, n_features + 1)
            Feature columns followed by one column of integer class codes
            (indices into ``classes_``).

        depth : int
            Depth of the node being built; the root is at depth 0.

        Returns
        -------
        DecisionTree
        """
        codes = data[:, -1].astype(int)
        counts = np.bincount(codes, minlength=len(self.classes_))
        # Majority class of this node; ties go to the smallest label.
        leaf = DecisionTree(result=self.classes_[int(np.argmax(counts))])
        n_node = data.shape[0]

        # Only one class left: nothing more to separate.
        if np.count_nonzero(counts) == 1:
            return leaf

        # No feature columns to split on.
        if not self.all_feats:
            return leaf

        # Depth limit reached.
        if self.max_depth is not None and depth >= self.max_depth:
            return leaf

        # Too few samples to split.
        if n_node < self.min_samples_split:
            return leaf
        if n_node <= self.min_samples_leaf:
            return leaf

        # The node itself is already pure enough.
        node_gini = self.gini(codes)
        if node_gini < self.min_impurity_split:
            return leaf

        best_idx, best_val, min_gini = self.__getBestFeature(data)
        # No split leaves enough samples on both sides.
        if best_idx == -1:
            return leaf

        # Weighted impurity decrease; eps absorbs rounding so that zero-gain
        # splits still pass with the default threshold of 0.
        decrease = 1.0 * n_node / self._n_samples * (node_gini - min_gini)
        if decrease + np.finfo(float).eps < self.min_impurity_decrease:
            return leaf

        leftData, rightData = self.__splitData(data, best_idx, best_val)
        # Both halves are non-empty, so the recursion always terminates.
        leftDecisionTree = self.__build_tree(leftData, depth + 1)
        rightDecisionTree = self.__build_tree(rightData, depth + 1)

        return DecisionTree(col=best_idx, val=best_val,
                            LeftChild=leftDecisionTree,
                            RightChild=rightDecisionTree)

    def __getBestFeature(self, data):
        """Find the split with the lowest weighted Gini impurity.

        Every distinct value of every column in ``all_feats`` is tried.  A
        candidate is skipped when either side would hold fewer than
        ``max(1, min_samples_leaf)`` samples.  Ties keep the first candidate
        (lowest column index, then smallest value).  ``all_feats`` is not
        modified.

        Parameters
        ----------
        data : np.ndarray
            Feature columns followed by one label column.

        Returns
        -------
        best_idx, best_val, min_gini
            Column index, split value and weighted child impurity of the
            best split, or ``(-1, None, inf)`` when no split is admissible.
        """
        n_node = len(data)
        labels = data[:, -1]
        min_leaf = max(1, self.min_samples_leaf)

        best_idx = -1
        best_val = None
        min_gini = float("inf")
        for feat_idx in self.all_feats:
            column = data[:, feat_idx]
            # Duplicate values describe the same split; try each value once.
            for val in np.unique(column):
                go_left = self.__left_mask(column, val)
                n_left = int(np.count_nonzero(go_left))
                n_right = n_node - n_left
                if n_left < min_leaf or n_right < min_leaf:
                    continue

                cur_gini = (n_left * self.gini(labels[go_left])
                            + n_right * self.gini(labels[~go_left])) / n_node
                if cur_gini < min_gini:
                    best_idx = feat_idx
                    best_val = val
                    min_gini = cur_gini

        return best_idx, best_val, min_gini

    def gini(self, labels):
        """Compute the Gini impurity of a collection of labels.

        Parameters
        ----------
        labels : list or np.ndarray
            Class labels of the samples.

        Returns
        -------
        gini : float
            ``Gini(p) = sum_k p_k (1 - p_k) = 1 - sum_k p_k^2``.
            An empty collection yields 1.0.
        """
        labels = np.asarray(labels)
        length = labels.shape[0]
        if length == 0:
            return 1.0
        _, counts = np.unique(labels, return_counts=True)
        proportions = counts / float(length)
        return float(1.0 - np.sum(proportions ** 2))

    def __left_mask(self, column, val):
        """Return a boolean mask of the entries that go to the left child.

        Strings are matched by equality; every other value (including NumPy
        integer and floating scalars) is compared with ``<=``.
        """
        if isinstance(val, str):
            mask = column == val
        else:
            mask = column <= val
        return np.asarray(mask, dtype=bool)

    def __splitData(self, data, featColumn, val):
        """Split ``data`` into a left and a right part on one feature.

        Parameters
        ----------
        data : np.ndarray
            Rows to split.

        featColumn : int
            Index of the column used for the split.

        val : int or float or str
            Split value.
            numeric : left rows satisfy ``<= val``.
            str : left rows satisfy ``== val``.

        Returns
        -------
        leftData, rightData
            Rows satisfying the test and all remaining rows.
        """
        go_left = self.__left_mask(data[:, featColumn], val)
        return data[go_left], data[~go_left]

    def __check_array(self, X):
        """Validate an input container and return it as ``np.ndarray``.

        Parameters
        ----------
        X : list, tuple, np.ndarray, pd.DataFrame or pd.Series

        Returns
        -------
        X : np.ndarray

        Raises
        ------
        ValueError
            If ``X`` is of any other type.
        """
        if isinstance(X, (list, tuple)):
            return np.array(X)
        if isinstance(X, (pd.DataFrame, pd.Series)):
            return X.values
        if isinstance(X, np.ndarray):
            return X
        raise ValueError("輸入數據不合法,目前只支持np.ndarray or pd.DataFrame")


if __name__ == "__main__":
    # Imported here so that importing this module does not need scikit-learn.
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    # Classification tree demo on the iris data set.
    X, y = load_iris(return_X_y=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf = DecisionTreeClassifier()

    clf.fit(X_train, y_train)

    print("Classifier Score:", clf.score(X_test, y_test))