一、準備階段
下面是在jupyter notebook中實現決策樹的可視化,所以需要先在Anaconda Powershell Prompt中使用
pip install graphviz
安裝相應的包
然後實現可視化需要用到graphviz文件,可以點擊下方進入官網
- Graphviz 官網下載頁:https://graphviz.org/download/
下載完成安裝之後需要添加環境變量,將graphviz安裝目錄下的bin文件夾添加到Path環境變量中,步驟如下:
下面驗證是否安裝成功,同樣在Anaconda Powershell Prompt下輸入dot -version
需要注意的是,這裏的命令行是根據自己使用的編輯環境而定,例如:如果使用的是python的IDLE進行編寫的話,就使用windows的命令行。
二、代碼:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# to make this notebook's output stable across runs
np.random.seed(42)
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
# 爲了顯示中文
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
def image_path(fig_id):
return os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id)
def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG under images/.

    fig_id: file name without extension; tight_layout: apply tight_layout()
    before saving (default True).
    """
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    target = image_path(fig_id) + ".png"
    plt.savefig(target, format='png', dpi=300)
# Load the iris data set, keeping only petal length and petal width.
iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

# Hold out 20% of the samples as a test set (fixed seed for reproducibility).
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Fit a decision-tree classifier on the training split.
clf = DecisionTreeClassifier()
clf.fit(train_x, train_y)
# Visualize the fitted decision tree with graphviz.
from sklearn.tree import export_graphviz

# The original wrapped this absolute Windows path in image_path(); on Windows
# os.path.join discards the earlier components when it meets a drive-absolute
# path, so the wrapper was a no-op (and would build a wrong nested path on
# other platforms). Use the path directly, defined once.
DOT_PATH = "F:/人工智能與機器學習/1.dot"

export_graphviz(
    clf,                                   # the fitted decision tree
    out_file=DOT_PATH,                     # where the .dot file is written
    feature_names=iris.feature_names[2:],  # petal length / petal width
    class_names=iris.target_names,         # iris species names
    rounded=True,                          # rounded node boxes
    filled=True,                           # color nodes by class / purity
)

import graphviz

with open(DOT_PATH) as f:
    dot_graph = f.read()
dot = graphviz.Source(dot_graph)
dot.view()  # renders the graph and opens 'Source.gv.pdf' in the viewer
dot         # notebook cell output: displays the graph inline
# Accuracy on the TRAINING set (the original comment wrongly said "test set").
predict = clf.predict(train_x)
# NOTE: `sum` shadows the builtin; the name is kept because the transcript's
# later cells reference this counter.
sum = 0
for i in range(len(train_y)):
    if predict[i] == train_y[i]:
        sum += 1
print("sum:%s,len(train_y):%s" % (sum, len(train_y)))
print("預測率:%s" % (sum / len(train_y)))
# Sample output (previously pasted as bare lines, which breaks parsing):
# sum:119,len(train_y):120
# 預測率:0.9916666666666667
# Accuracy on the test set.
predict2 = clf.predict(test_x)
# Bug fixes versus the original cell:
#  * it compared the TRAINING predictions (`predict[i]`) to the test labels;
#  * it accumulated into a typo'd name `sun` but printed the unrelated
#    training counter `sum`, yielding the impossible "accuracy" 3.9666...
correct = 0
for i in range(len(test_y)):
    if predict2[i] == test_y[i]:
        correct += 1
print("sum:%s,len(test_y):%s" % (correct, len(test_y)))
print("預測率:%s" % (correct / len(test_y)))
# After the fix this agrees with accuracy_score below, e.g.:
# sum:29,len(test_y):30
# 預測率:0.9666666666666667
# Cross-check with sklearn's accuracy_score and the estimator's own score().
from sklearn.metrics import accuracy_score

y_pred = clf.predict(test_x)
print("測試集精確度:%s" % accuracy_score(test_y, y_pred))
# 測試集精確度:0.9666666666666667

y_pred2 = clf.predict(train_x)
print("訓練集精確度:%s" % accuracy_score(train_y, y_pred2))
# 訓練集精確度:0.9916666666666667

# clf.score() computes the same mean accuracy directly.
score = clf.score(test_x, test_y)
print("\n模型測試集準確率爲:", score)
# 模型測試集準確率爲: 0.9666666666666667

score = clf.score(train_x, train_y)
print("\n模型訓練集準確率爲:", score)
# 模型訓練集準確率爲: 0.9916666666666667