本篇內容是如何使用sklearn構建訓練評估決策樹模型,並使用官方API,或stackoverflow等一些網站上的大牛自定義的方法來可視化決策樹。
1、對數據進行處理並訓練評估模型
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import pandas as pd
import numpy as np
# Load the dataset, train a decision-tree classifier and report its accuracy.
# The module-level names defined here (data, clf, dtc, test_X) are reused by
# the later snippets in this file.
path = "你的csv.csv" # path to your CSV; the author's dataset is a 5-class problem
data = pd.read_csv(path)
# Shuffle the rows of the dataset
from sklearn import utils
data = utils.shuffle(data)
# Target vector is the "score" column; every other column is a feature
Y = data["score"].values
X = data.drop("score", axis=1).values
# Split into training and test sets (30% held out for testing)
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=0)
# Decision tree
from sklearn.tree import DecisionTreeClassifier
# Instantiate the decision tree with the entropy criterion; it is trained on the training set below
dtc = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=3, max_depth=15)
# clf is the fitted model (later snippets use dtc and clf interchangeably)
clf = dtc.fit(X=train_X, y=train_Y)
# Predict labels for the test-set feature matrix
predict_Y = dtc.predict(X=test_X)
print(f"predict_Y={predict_Y}")
# Model evaluation
from sklearn.metrics import classification_report, make_scorer, accuracy_score, f1_score
# print(dtc.score(test_X, test_Y.astype(int)))
# Accuracy on the held-out test set; the true labels are cast to int first
print(accuracy_score(y_true=test_Y.astype(int), y_pred=predict_Y))
# NOTE(review): if the line below is re-enabled, check the argument order --
# classification_report is documented as taking y_true first, then y_pred.
# print(classification_report(predict_Y, test_Y.astype(int)))
2、下面繪製決策樹
from sklearn.tree import export_text, export_graphviz
# Append the Graphviz binaries directory to PATH so the PNG renderer can be
# found (machine-specific location; adjust to your own installation).
import os
os.environ["PATH"] += os.pathsep + "G:/24_graphviz_msi/bin"
# Render the fitted tree as DOT source. The first argument may be either dtc
# or clf. Feature names exclude the "score" target column so they line up
# with the columns the model was trained on.
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=list(data.drop("score", axis=1).columns))
import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
# Tint one node. The index is positional within pydotplus's node list, so it
# is guarded: a small tree may have fewer than 8 entries and the unguarded
# lookup would raise IndexError.
# NOTE(review): confirm index 7 is the intended tree node, and that the node
# style is "filled" -- Graphviz only paints fillcolor on filled nodes.
graph_nodes = graph.get_nodes()
if len(graph_nodes) > 7:
    graph_nodes[7].set_fillcolor("#FFF2DD")
# Write out.png into the current directory unless it already exists.
if not os.path.exists("out.png"):
    graph.write_png("out.png")
3、打印決策路徑(官方API與自定義方法)
(1)官方API——export_text
# Print the decision rules of the fitted tree as indented text.
# The first argument may be either dtc or clf.
r = export_text(dtc, feature_names=data.columns.drop("score").tolist())
print(r)
# Print the same tree as a Graphviz DOT string (out_file=None, so the DOT
# source comes back as the return value and is printed here).
string_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=data.columns.drop("score").tolist(),
)
print(f"string_data={string_data}")
如果你導入或使用export_text時遇到報錯(例如 ImportError: cannot import name 'export_text' from 'sklearn.tree'),說明scikit-learn版本過舊(export_text自0.21版起才提供),請直接更新scikit-learn:
pip install scikit-learn --upgrade
(2)自定義方法1
def get_lineage(tree, feature_names):
    """Print, and return, the root-to-leaf decision path of every leaf.

    Args:
        tree: Fitted estimator exposing ``tree_`` with the parallel arrays
            ``children_left``, ``children_right``, ``threshold`` and
            ``feature``.
        feature_names: Sequence indexed by feature number; it must line up
            with the columns the tree was trained on.

    Returns:
        A list with one lineage per leaf, in ascending leaf-id order. Each
        lineage is a list of ``(parent_id, split, threshold, feature_name)``
        tuples ordered from the root downwards (``split`` is ``'l'`` or
        ``'r'``), followed by the leaf id itself. A tree consisting of a
        single root node yields ``[[0]]``.
    """
    left = tree.tree_.children_left
    print(f"left={left}")
    right = tree.tree_.children_right
    print(f"right={right}")
    threshold = tree.tree_.threshold
    print(f"threshold={threshold}")
    # Leaf rows hold a negative placeholder in tree_.feature. Map those to
    # None instead of letting the negative index wrap around to an unrelated
    # (or non-existent) feature name.
    features = [feature_names[i] if i >= 0 else None for i in tree.tree_.feature]
    print(f"features={features}")
    # Leaves are the nodes without a left child (marked with -1).
    idx = np.argwhere(left == -1)[:, 0]
    print(f"idx={idx}")

    # child id -> (parent id, side), built once so each step up the tree is a
    # dictionary lookup rather than a scan over the whole array.
    parent_of = {}
    for node, (left_child, right_child) in enumerate(zip(left, right)):
        if left_child != -1:
            parent_of[int(left_child)] = (node, 'l')
        if right_child != -1:
            parent_of[int(right_child)] = (node, 'r')

    def trace(leaf):
        """Walk from ``leaf`` up to the root and return the path root-first."""
        lineage = [leaf]
        node = int(leaf)
        # The root is the only node without a parent, so a single-node tree
        # simply skips the loop instead of failing on an empty lookup.
        while node in parent_of:
            print(f"當前lineage={lineage}")
            parent, split = parent_of[node]
            if split == 'l':
                print(f"當前左,parent={parent}")
            else:
                print(f"當前右,parent={parent}")
            lineage.append((parent, split, threshold[parent], features[parent]))
            node = parent
        lineage.reverse()
        return lineage

    lineages = []
    for child in idx:
        lineage = trace(child)
        for node in lineage:
            print(node)
        lineages.append(lineage)
    return lineages
# Feature indices stored in the tree refer to the columns of X, which was
# built without the "score" target column, so the names must exclude it too.
get_lineage(clf, list(data.drop("score", axis=1).columns))
(3)自定義方法2
def try2(clf, X=None, sample_id=0):
    """Print the tree structure and the rules applied to one sample.

    Args:
        clf: Fitted estimator exposing ``tree_`` (with ``node_count``,
            ``children_left``, ``children_right``, ``feature`` and
            ``threshold``) plus the ``decision_path`` and ``apply`` methods.
        X: 2-D feature array whose rows are the samples to route through the
            tree. When omitted, the module-level ``test_X`` is used, which is
            what this function originally always did.
        sample_id: Row of ``X`` whose decision rules are printed.

    Returns:
        None. Everything is written to standard output.
    """
    if X is None:
        # Backward-compatible default: the held-out split defined at module level.
        X = test_X

    # tree_ stores the whole tree as parallel arrays: element i of each array
    # describes node i, and node 0 is the root. Some arrays are meaningful
    # only for split nodes or only for leaves; values for the other kind of
    # node are arbitrary.
    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    feature = clf.tree_.feature
    threshold = clf.tree_.threshold

    # Traverse the tree to compute each node's depth and whether it is a leaf.
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed: root node id and the depth of its (absent) parent
    while stack:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1
        # A node whose two child ids differ is a test (split) node.
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    print(f"The binary tree structure has {n_nodes} nodes and has the following tree structure:")
    for i in range(n_nodes):
        if is_leaves[i]:
            print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
        else:
            print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
                  "node %s."
                  % (node_depth[i] * "\t",
                     i,
                     children_left[i],
                     feature[i],
                     threshold[i],
                     children_right[i],
                     ))
    print()

    # decision_path gives the node indicator matrix: a non-zero entry at
    # (i, j) means sample i passes through node j.
    node_indicator = clf.decision_path(X)
    # apply gives the id of the leaf each sample ends up in.
    leave_id = clf.apply(X)

    # Node ids visited by the chosen sample, read from the indicator's
    # indices/indptr arrays.
    node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                        node_indicator.indptr[sample_id + 1]]
    print('Rules used to predict sample %s: ' % sample_id)
    for node_id in node_index:
        if leave_id[sample_id] == node_id:
            # The final node on the path is the leaf; no test is made there.
            print("leaf node {} reached, no decision here".format(leave_id[sample_id]))
        else:
            if X[sample_id, feature[node_id]] <= threshold[node_id]:
                threshold_sign = "<="
            else:
                threshold_sign = ">"
            print(f"decision id node {node_id} : "
                  f"(X[{sample_id}, {feature[node_id]}] (= {X[sample_id, feature[node_id]]}) {threshold_sign} {threshold[node_id]})")
# Print the tree structure, then the rules applied to the first test sample.
try2(clf)
(4)自定義方法3
from sklearn.tree import _tree
def try3(tree, feature_names):
    """Print the fitted tree as a nested if/else pseudo-function.

    Args:
        tree: Fitted estimator exposing ``tree_`` (with ``feature``,
            ``threshold``, ``children_left``, ``children_right`` and
            ``value``). If it also has a 1-D ``classes_`` array, leaves print
            the class label; otherwise they print the argmax index, as the
            original implementation always did.
        feature_names: Sequence indexed by feature number; it must line up
            with the columns the tree was trained on.

    Returns:
        None. The pseudo-code is written to standard output.
    """
    tree_ = tree.tree_
    # np.argmax over tree_.value gives a position, not a label; classes_
    # (when present as a flat array) translates it to the real class label.
    classes = getattr(tree, "classes_", None)
    if not (isinstance(classes, np.ndarray) and classes.ndim == 1):
        classes = None
    # str() each name so non-string column labels do not break the join.
    print("def tree({}):".format(", ".join(map(str, feature_names))))

    def recurse(node, depth):
        indent = " " * depth
        left_child = tree_.children_left[node]
        right_child = tree_.children_right[node]
        # A node whose two child ids differ is a split node; leaves carry the
        # same placeholder id on both sides.
        if left_child != right_child:
            name = feature_names[tree_.feature[node]]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(left_child, depth + 1)
            print("{}else: # if {} > {}".format(indent, name, threshold))
            recurse(right_child, depth + 1)
        else:
            best = np.argmax(tree_.value[node])
            label = best if classes is None else classes[best]
            print("{}return {}".format(indent, label))

    recurse(0, 1)
# try3(clf, feature_names=list(data.drop("score", axis=1).columns))
參考:
導入export_text出錯
How to extract the decision rules from scikit-learn decision-tree?
try2方法