Building and Visualizing a Decision Tree from a CSV Data File with scikit-learn

This post shows how to build, train, and evaluate a decision tree model with sklearn, and how to visualize the tree using both the official API and custom methods shared by experts on sites such as Stack Overflow.

1. Preprocess the data, then train and evaluate the model

from sklearn.model_selection import train_test_split, cross_val_score, KFold
import pandas as pd
import numpy as np

path = "your_file.csv"  # a 5-class classification dataset
data = pd.read_csv(path)


# Shuffle the dataset
from sklearn import utils

data = utils.shuffle(data)
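
Note that train_test_split below already shuffles by default, so this step is mainly illustrative; an equivalent pure-pandas shuffle (with a reproducible seed) would be:

# Equivalent shuffle using pandas alone; random_state pins the permutation
# data = data.sample(frac=1, random_state=0).reset_index(drop=True)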

Y = data["score"].values
X = data.drop("score", axis=1).values

# Split into training and test sets
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=0)

# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Instantiate the decision tree with Shannon entropy as the split criterion
dtc = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=3, max_depth=15)
# clf is the fitted model (fit() returns the estimator itself, so clf and dtc are the same object)
clf = dtc.fit(X=train_X, y=train_Y)

# Predict on the test-set feature matrix
predict_Y = dtc.predict(X=test_X)
print(f"predict_Y={predict_Y}")

# Model evaluation
from sklearn.metrics import classification_report, make_scorer, accuracy_score, f1_score

# print(dtc.score(test_X, test_Y.astype(int)))
print(accuracy_score(y_true=test_Y.astype(int), y_pred=predict_Y))
# print(classification_report(y_true=test_Y.astype(int), y_pred=predict_Y))
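
The imports at the top also bring in cross_val_score and KFold, which the single train/test split above never uses; for a more stable estimate you can cross-validate the same estimator. A minimal sketch, reusing the full X and Y defined earlier:

# 5-fold cross-validation: one accuracy score per fold
kf = KFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = cross_val_score(dtc, X, Y.astype(int), cv=kf, scoring="accuracy")
print(f"fold accuracies={cv_scores}, mean={cv_scores.mean():.3f}")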

2. Plotting the decision tree

from sklearn.tree import export_text, export_graphviz
# Add the Graphviz binaries to the PATH
import os
os.environ["PATH"] += os.pathsep + "G:/24_graphviz_msi/bin"

# Render the tree and export it in DOT format
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=list(data.drop("score", axis=1).columns))  # the first argument can be dtc or clf
import pydotplus

graph = pydotplus.graph_from_dot_data(dot_data)
graph.get_nodes()[7].set_fillcolor("#FFF2DD")  # example: recolor one node (assumes the tree has at least 8 nodes)
if not os.path.exists("out.png"):
    graph.write_png("out.png")  # writes out.png to the current directory
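
If installing Graphviz is inconvenient, scikit-learn 0.21+ also ships a matplotlib-based sklearn.tree.plot_tree that needs no external binaries. A minimal sketch (the output filename is just an example):

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Render the same fitted tree with matplotlib instead of Graphviz
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(clf, feature_names=list(data.drop("score", axis=1).columns), filled=True, ax=ax)
fig.savefig("out_plot_tree.png")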

3. Printing decision paths (official API and custom methods)

(1) Official API: export_text
# Print the decision rules as plain text
r = export_text(dtc, feature_names=list(data.drop("score", axis=1).columns))    # the first argument can be dtc or clf
print(r)

# Print the tree as a DOT-format string
string_data = export_graphviz(clf, out_file=None, feature_names=list(data.drop("score", axis=1).columns))
print(f"string_data={string_data}")

If importing export_text raises an ImportError (the function was only added in scikit-learn 0.21), simply upgrade scikit-learn:

pip install scikit-learn --upgrade
(2) Custom method 1
def get_lineage(tree, feature_names):
    left = tree.tree_.children_left
    print(f"left={left}")
    right = tree.tree_.children_right
    print(f"right={right}")
    threshold = tree.tree_.threshold
    print(f"threshold={threshold}")
    features = [feature_names[i] for i in tree.tree_.feature]
    print(f"features={features}")

    # get the ids of the leaf nodes (children_left == -1 marks a leaf)
    idx = np.argwhere(left == -1)[:, 0]
    print(f"idx={idx}")

    def recurse(left, right, child, lineage=None):
        if lineage is None:
            lineage = [child]
            print(f"當前lineage={lineage}")
        if child in left:
            parent = np.where(left == child)[0].item()
            print(f"當前左,parent={parent}")
            split = 'l'
        else:
            parent = np.where(right == child)[0].item()
            print(f"當前右,parent={parent}")
            split = 'r'

        lineage.append((parent, split, threshold[parent], features[parent]))

        if parent == 0:
            lineage.reverse()
            return lineage
        else:
            return recurse(left, right, parent, lineage)

    for child in idx:
        for node in recurse(left, right, child):
            print(node)

get_lineage(clf, list(data.drop("score", axis=1).columns))  # pass the feature columns only; the "score" label column must be excluded
(3) Custom method 2
def try2(clf):

    # The decision estimator has an attribute called tree_ which stores the
    # entire tree structure and allows access to low-level attributes.
    # The binary tree tree_ is represented as a number of parallel arrays: the
    # i-th element of each array holds information about node `i`.
    # Node 0 is the tree's root.
    # NOTE: some of the arrays only apply to either leaves or split nodes; the
    # values for nodes of the other type are arbitrary!
    #
    # Among those arrays, we have:
    #   - children_left, id of the left child of the node
    #   - children_right, id of the right child of the node
    #   - feature, feature used for splitting the node
    #   - threshold, threshold value at the node

    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    feature = clf.tree_.feature
    threshold = clf.tree_.threshold

    # The tree structure can be traversed to compute various properties, such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # A test (internal) node has different left and right children
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        # otherwise node_id is a leaf
        else:
            is_leaves[node_id] = True

    print(f"The binary tree structure has {n_nodes} nodes and has the following tree structure:")
    for i in range(n_nodes):
        if is_leaves[i]:
            print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
        else:
            print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
                  "node %s."
                  % (node_depth[i] * "\t",
                     i,
                     children_left[i],
                     feature[i],
                     threshold[i],
                     children_right[i],
                     ))
    print()

    # First let's retrieve the decision path of each sample. The decision_path
    # method allows to retrieve the node indicator functions.
    # A non-zero element of the indicator matrix at position (i, j) indicates
    # that sample i goes through node j.

    node_indicator = clf.decision_path(test_X)

    # Similarly, we can also have the leaves ids reached by each sample.

    leave_id = clf.apply(test_X)

    # Now, it's possible to get the tests that were used to predict a sample or
    # a group of samples. First, let's make it for the sample.

    # Extract the node ids along the decision path of a single sample
    sample_id = 0
    node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                        node_indicator.indptr[sample_id + 1]]

    print('Rules used to predict sample %s: ' % sample_id)
    for node_id in node_index:

        if leave_id[sample_id] == node_id:
            # the sample ends at this leaf; no decision is made here
            print("leaf node {} reached, no decision here".format(leave_id[sample_id]))

        else:
            # internal node: report which side of the threshold the sample falls on
            if test_X[sample_id, feature[node_id]] <= threshold[node_id]:
                threshold_sign = "<="
            else:
                threshold_sign = ">"

            # print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
            #       "node %s."
            #       % (node_depth[i] * "\t",
            #          i,
            #          children_left[i],
            #          feature[i],
            #          threshold[i],
            #          children_right[i],
            #          ))
            print(f"decision id node {node_id} : "
                  f"(X[{sample_id}, {feature[node_id]}] (= {test_X[sample_id, feature[node_id]]}) {threshold_sign} {threshold[node_id]})")

try2(clf)
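
The same decision_path indicator matrix also answers questions about a group of samples: a node is shared by the group exactly when its indicator column is set for every sample. A sketch along the lines of the scikit-learn documentation example, reusing clf and test_X from above:

# Find the tree nodes that the decision paths of several test samples have in common
node_indicator = clf.decision_path(test_X)
n_nodes = clf.tree_.node_count
sample_ids = [0, 1]
common_nodes = node_indicator.toarray()[sample_ids].sum(axis=0) == len(sample_ids)
common_node_id = np.arange(n_nodes)[common_nodes]
print(f"samples {sample_ids} share nodes {common_node_id} "
      f"({100 * len(common_node_id) / n_nodes:.1f}% of all nodes)")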

(4) Custom method 3

from sklearn.tree import _tree
def try3(tree, feature_names):
    # Print the fitted tree as a pseudo-Python function of nested if/else rules
    tree_ = tree.tree_
    feature_name = [feature_names[i]
                    if i != _tree.TREE_UNDEFINED else "undefined!"
                    for i in tree_.feature]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print("{}return {}".format(indent, np.argmax(tree_.value[node])))

    recurse(0, 1)

# try3(clf, feature_names=list(data.drop("score", axis=1).columns))
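try3 only prints the rules; if you would rather collect them programmatically, a small variation of the same traversal (a hypothetical helper, not part of sklearn) can return one string per leaf:

def tree_to_rules(tree, feature_names):
    # Hypothetical variant of try3: return the decision rules as a list of strings
    tree_ = tree.tree_
    rules = []

    def recurse(node, conditions):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_names[tree_.feature[node]]
            threshold = tree_.threshold[node]
            recurse(tree_.children_left[node], conditions + [f"{name} <= {threshold:.4f}"])
            recurse(tree_.children_right[node], conditions + [f"{name} > {threshold:.4f}"])
        else:
            rules.append(" and ".join(conditions) + f" -> class {np.argmax(tree_.value[node])}")

    recurse(0, [])
    return rules

# rules = tree_to_rules(clf, list(data.drop("score", axis=1).columns))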

References:
Error when importing export_text from sklearn.tree
How to extract the decision rules from scikit-learn decision-tree? (Stack Overflow)
Source of the try2 method

