sklearn:決策分類樹_紅酒數據集

from sklearn import tree
from sklearn.datasets import load_wine  # 紅酒數據
from sklearn.model_selection import train_test_split
wine = load_wine()  # load the wine dataset bundle (data, target, feature/class names, DESCR)
wine
{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2]),
 'target_names': array(['class_0', 'class_1', 'class_2'], dtype='<U7'),
 'DESCR': '......rics).\n',
 'feature_names': ['alcohol',
  'malic_acid',
  'ash',
  'alcalinity_of_ash',
  'magnesium',
  'total_phenols',
  'flavanoids',
  'nonflavanoid_phenols',
  'proanthocyanins',
  'color_intensity',
  'hue',
  'od280/od315_of_diluted_wines',
  'proline']}
import pandas as pd

# Assemble one DataFrame: the 13 feature columns plus a human-readable target column.
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
# Map the integer class codes (0/1/2) to their class-name strings for display.
code_to_name = {code: name for code, name in enumerate(wine.target_names)}
wine_df['target'] = pd.Series(wine.target).map(code_to_name)
wine_df
alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline target
0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 class_0
1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 class_0
2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 class_0
3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 class_0
4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 class_0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
173 13.71 5.65 2.45 20.5 95.0 1.68 0.61 0.52 1.06 7.70 0.64 1.74 740.0 class_2
174 13.40 3.91 2.48 23.0 102.0 1.80 0.75 0.43 1.41 7.30 0.70 1.56 750.0 class_2
175 13.27 4.28 2.26 20.0 120.0 1.59 0.69 0.43 1.35 10.20 0.59 1.56 835.0 class_2
176 13.17 2.59 2.37 20.0 120.0 1.65 0.68 0.53 1.46 9.30 0.60 1.62 840.0 class_2
177 14.13 4.10 2.74 24.5 96.0 2.05 0.76 0.56 1.35 9.20 0.61 1.60 560.0 class_2

178 rows × 14 columns

# Split the data into a training set and a test set (30% held out for testing)
X_train, X_test, Y_train, Y_test = train_test_split(wine.data, wine.target, test_size=0.3)
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns mean accuracy on the test set
score
0.9444444444444444
import graphviz  # graphviz must be installed separately

# Export the fitted tree in DOT format so graphviz can render it
dot_data = tree.export_graphviz(clf
    ,feature_names = wine.feature_names  # feature names shown in the nodes
    ,class_names = wine.target_names  # class label names
    ,filled = True  # fill nodes with class color
    ,rounded = True  # rounded node corners
)

graph = graphviz.Source(dot_data)
graph

jpg

clf.feature_importances_  # importance of each feature; features never used in a split have importance 0
array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.3918564 , 0.        , 0.        , 0.1160134 ,
       0.02128596, 0.        , 0.47084424])
dict(zip(wine.feature_names, clf.feature_importances_))  # pair each feature name with its importance
{'alcohol': 0.0,
 'malic_acid': 0.0,
 'ash': 0.0,
 'alcalinity_of_ash': 0.0,
 'magnesium': 0.0,
 'total_phenols': 0.0,
 'flavanoids': 0.3918564,
 'nonflavanoid_phenols': 0.0,
 'proanthocyanins': 0.0,
 'color_intensity': 0.1160134,
 'hue': 0.02128596,
 'od280/od315_of_diluted_wines': 0.0,
 'proline': 0.47084424}

增加決策樹隨機性

  • 決策樹的隨機性在高維度的數據集中表現的會比較好
  • 在低維度數據集(比如鳶尾花數據集中),隨機性就表現得不夠好
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=50  # random seed for reproducibility
                                  # splitter: the default is "best" — even with randomness it still
                                  # picks the most important feature at each split.
                                  # "random" makes splits more random; the tree grows bigger and deeper.
                                  ,splitter="random"  
                                 )
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns mean accuracy on the test set
score
0.8888888888888888
import graphviz
# Render the (now more random, deeper) tree with graphviz
dot_data = tree.export_graphviz(clf
    ,feature_names = wine.feature_names  # feature names shown in the nodes
    ,class_names = wine.target_names  # class label names
    ,filled = True  # fill nodes with class color
    ,rounded = True  # rounded node corners
)

graph = graphviz.Source(dot_data)
graph

jpg


剪枝參數:min_samples_leaf & min_samples_split

  • 爲了使決策樹具有更大的泛化能力
  • 限制樹的最大深度,建議從3開始逐漸嘗試
  • 限制葉子節點數量
  • 限制劃分節點數量
import graphviz

# Pruning parameters to improve generalization
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=50
                                  ,splitter="random"
                                  ,max_depth = 3
                                  ,min_samples_leaf=10  # prune leaves that would contain fewer than 10 samples
                                  ,min_samples_split=10  # an internal node needs at least 10 samples to be split
                                 )
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns mean accuracy on the test set
print(score)
dot_data = tree.export_graphviz(clf
    ,feature_names = wine.feature_names  # feature names shown in the nodes
    ,class_names = wine.target_names  # class label names
    ,filled = True  # fill nodes with class color
    ,rounded = True  # rounded node corners
)

graph = graphviz.Source(dot_data)
graph
0.8518518518518519

jpg


max_features & min_impurity_decrease

  • max_features:最大特徵數量限制,超過限制的特徵會被捨棄,是一種降維方式,使用較少
  • min_impurity_decrease:限制信息增益大小,當信息增益小於這個值,就不再進行分支了
import graphviz

clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,random_state=50
#                                   ,splitter="random"
                                  ,max_depth = 5
#                                   ,min_samples_leaf=10  # prune leaves with fewer than 10 samples
#                                   ,min_samples_split=10  # internal nodes need at least 10 samples to split
#                                   ,max_features = 2
                                  ,min_impurity_decrease=0.1  # stop splitting when information gain falls below 0.1
                                 )
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns mean accuracy on the test set
print(score)
dot_data = tree.export_graphviz(clf
    ,feature_names = wine.feature_names  # feature names shown in the nodes
    ,class_names = wine.target_names  # class label names
    ,filled = True  # fill nodes with class color
    ,rounded = True  # rounded node corners
)

graph = graphviz.Source(dot_data)
graph
0.9444444444444444

jpg


確認最優參數,畫學習曲線

import matplotlib.pyplot as plt

# Learning curve: test-set accuracy as max_depth grows from 1 to 9.
depth_range = range(1, 10)
depth_scores = []
for max_d in depth_range:
    clf = tree.DecisionTreeClassifier(criterion="entropy"
                                      ,max_depth = max_d
                                     )
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)  # mean accuracy at this depth
    depth_scores.append(score)

plt.plot(depth_range, depth_scores)

png


目標權重參數

  • class_weight & min_weight_fraction_leaf
  • 注意:sklearn不接受一維矩陣
class_weight  # 目標類型的權重,其數據類型爲dict或者列表內的dict,或者爲"balanced"
min_weight_fraction_leaf  # 權重剪枝參數,搭配目標權重使用,比min_samples_leaf更偏向於主導類

其他常用接口

# Return the index of the leaf node each test sample ends up in
clf.apply(X_test)
array([ 5,  5,  5,  4,  3,  5,  5,  5,  5, 10, 10,  5,  5,  3, 10, 10, 10,
        5,  4,  5, 10,  4,  5, 10,  5,  5,  4,  5,  4,  4,  5,  4,  4, 10,
       10,  5,  4,  5,  5,  5,  4, 10, 10, 10,  5,  5, 10,  4, 10, 10,  5,
        5,  5, 10], dtype=int64)
# Return the predicted class label for each test sample.
# Bug fix: the original line was missing the closing parenthesis, a SyntaxError.
clf.predict(X_test)
array([1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 2, 1, 0, 2,
       1, 0, 1, 1, 2, 1, 2, 2, 1, 2, 2, 0, 0, 1, 2, 1, 1, 1, 2, 0, 0, 0,
       1, 1, 0, 2, 0, 0, 1, 1, 1, 0])
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章