sklearn:決策分類樹_紅酒數據集

from sklearn import tree
from sklearn.datasets import load_wine  # 紅酒數據
from sklearn.model_selection import train_test_split
wine = load_wine()  # load the wine dataset bundle (data, target, feature/class names, DESCR)
wine
{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2]),
 'target_names': array(['class_0', 'class_1', 'class_2'], dtype='<U7'),
 'DESCR': '......rics).\n',
 'feature_names': ['alcohol',
  'malic_acid',
  'ash',
  'alcalinity_of_ash',
  'magnesium',
  'total_phenols',
  'flavanoids',
  'nonflavanoid_phenols',
  'proanthocyanins',
  'color_intensity',
  'hue',
  'od280/od315_of_diluted_wines',
  'proline']}
import pandas as pd

# Assemble one DataFrame: the 13 feature columns plus a human-readable target column.
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
# Map the integer class codes (0/1/2) to their class-name strings for display.
code_to_name = {code: name for code, name in enumerate(wine.target_names)}
wine_df['target'] = pd.Series(wine.target).map(code_to_name)
wine_df
alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline target
0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 class_0
1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 class_0
2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 class_0
3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 class_0
4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 class_0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
173 13.71 5.65 2.45 20.5 95.0 1.68 0.61 0.52 1.06 7.70 0.64 1.74 740.0 class_2
174 13.40 3.91 2.48 23.0 102.0 1.80 0.75 0.43 1.41 7.30 0.70 1.56 750.0 class_2
175 13.27 4.28 2.26 20.0 120.0 1.59 0.69 0.43 1.35 10.20 0.59 1.56 835.0 class_2
176 13.17 2.59 2.37 20.0 120.0 1.65 0.68 0.53 1.46 9.30 0.60 1.62 840.0 class_2
177 14.13 4.10 2.74 24.5 96.0 2.05 0.76 0.56 1.35 9.20 0.61 1.60 560.0 class_2

178 rows × 14 columns

# Split the data into a training set and a test set (30% held out for testing)
X_train, X_test, Y_train, Y_test = train_test_split(wine.data, wine.target, test_size=0.3)
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns mean accuracy on the test set
score
0.9444444444444444
import graphviz  # graphviz must be installed separately

# Export the fitted tree in DOT format so graphviz can render it
dot_data = tree.export_graphviz(clf
    ,feature_names = wine.feature_names  # feature names shown in the nodes
    ,class_names = wine.target_names  # class label names
    ,filled = True  # fill nodes with class color
    ,rounded = True  # rounded node corners
)

graph = graphviz.Source(dot_data)
graph

jpg

clf.feature_importances_  # importance of each feature; features never used in a split have importance 0
array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.3918564 , 0.        , 0.        , 0.1160134 ,
       0.02128596, 0.        , 0.47084424])
dict(zip(wine.feature_names, clf.feature_importances_))  # pair each feature name with its importance
{'alcohol': 0.0,
 'malic_acid': 0.0,
 'ash': 0.0,
 'alcalinity_of_ash': 0.0,
 'magnesium': 0.0,
 'total_phenols': 0.0,
 'flavanoids': 0.3918564,
 'nonflavanoid_phenols': 0.0,
 'proanthocyanins': 0.0,
 'color_intensity': 0.1160134,
 'hue': 0.02128596,
 'od280/od315_of_diluted_wines': 0.0,
 'proline': 0.47084424}

增加決策樹隨機性

  • 決策樹的隨機性在高維度的數據集中表現的會比較好
  • 在低維度數據集(比如鳶尾花數據集中),隨機性就表現得不夠好
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=50  # random seed for reproducibility
                                  # splitter: the default is "best" — even with randomness it still
                                  # picks the most important feature at each split.
                                  # "random" makes splits more random; the tree grows bigger and deeper.
                                  ,splitter="random"  
                                 )
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns mean accuracy on the test set
score
0.8888888888888888
import graphviz
# Render the (now more random, deeper) tree with graphviz
dot_data = tree.export_graphviz(clf
    ,feature_names = wine.feature_names  # feature names shown in the nodes
    ,class_names = wine.target_names  # class label names
    ,filled = True  # fill nodes with class color
    ,rounded = True  # rounded node corners
)

graph = graphviz.Source(dot_data)
graph

jpg


剪枝參數:min_samples_leaf & min_samples_split

  • 爲了使決策樹具有更大的泛化能力
  • 限制樹的最大深度,建議從3開始逐漸嘗試
  • 限制葉子節點數量
  • 限制劃分節點數量
import graphviz

# Pruning parameters to improve generalization
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=50
                                  ,splitter="random"
                                  ,max_depth = 3
                                  ,min_samples_leaf=10  # prune leaves that would contain fewer than 10 samples
                                  ,min_samples_split=10  # an internal node needs at least 10 samples to be split
                                 )
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns mean accuracy on the test set
print(score)
dot_data = tree.export_graphviz(clf
    ,feature_names = wine.feature_names  # feature names shown in the nodes
    ,class_names = wine.target_names  # class label names
    ,filled = True  # fill nodes with class color
    ,rounded = True  # rounded node corners
)

graph = graphviz.Source(dot_data)
graph
0.8518518518518519

jpg


max_features & min_impurity_decrease

  • max_features:最大特徵數量限制,超過限制的特徵會被捨棄,是一種降維方式,使用較少
  • min_impurity_decrease:限制信息增益大小,當信息增益小於這個值,就不再進行分支了
import graphviz

clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=50
#                                   ,splitter="random"
                                  ,max_depth = 5
#                                   ,min_samples_leaf=10  # prune leaves with fewer than 10 samples
#                                   ,min_samples_split=10  # internal nodes need at least 10 samples to split
#                                   ,max_features = 2
                                  ,min_impurity_decrease=0.1  # stop splitting when information gain falls below 0.1
                                 )
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns mean accuracy on the test set
print(score)
dot_data = tree.export_graphviz(clf
    ,feature_names = wine.feature_names  # feature names shown in the nodes
    ,class_names = wine.target_names  # class label names
    ,filled = True  # fill nodes with class color
    ,rounded = True  # rounded node corners
)

graph = graphviz.Source(dot_data)
graph
0.9444444444444444

jpg


確認最優參數,畫學習曲線

import matplotlib.pyplot as plt

# Learning curve: test-set accuracy as max_depth grows from 1 to 9.
depth_range = range(1, 10)
depth_scores = []
for max_d in depth_range:
    clf = tree.DecisionTreeClassifier(criterion="entropy"
                                      ,max_depth = max_d
                                     )
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)  # mean accuracy at this depth
    depth_scores.append(score)

plt.plot(depth_range, depth_scores)

png


目標權重參數

  • class_weight & min_weight_fraction_leaf
  • 注意:sklearn不接受一維矩陣
class_weight  # 目標類型的權重,其數據類型爲dict或者列表內的dict,或者爲"balanced"
min_weight_fraction_leaf  # 權重剪枝參數,搭配目標權重使用,比min_samples_leaf更偏向於主導類

其他常用接口

# Return the index of the leaf node each test sample ends up in
clf.apply(X_test)
array([ 5,  5,  5,  4,  3,  5,  5,  5,  5, 10, 10,  5,  5,  3, 10, 10, 10,
        5,  4,  5, 10,  4,  5, 10,  5,  5,  4,  5,  4,  4,  5,  4,  4, 10,
       10,  5,  4,  5,  5,  5,  4, 10, 10, 10,  5,  5, 10,  4, 10, 10,  5,
        5,  5, 10], dtype=int64)
# Return the predicted class label for each test sample.
# Bug fix: the original line was missing the closing parenthesis, a SyntaxError.
clf.predict(X_test)
array([1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 2, 1, 0, 2,
       1, 0, 1, 1, 2, 1, 2, 2, 1, 2, 2, 0, 0, 1, 2, 1, 1, 1, 2, 0, 0, 0,
       1, 1, 0, 2, 0, 0, 1, 1, 1, 0])
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章