# sklearn: Random Forest Classifier on the Wine Dataset

```python
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
```

#### Comparing a decision tree with a random forest

```python
# Load the wine dataset and make a single 70/30 train/test split
wine = load_wine()
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.3)

# Build one model of each kind for comparison
clf = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=0)

clf = clf.fit(X_train, y_train)
rfc = rfc.fit(X_train, y_train)

score_d = clf.score(X_test, y_test)
score_r = rfc.score(X_test, y_test)

print(score_d, score_r)
```

```
0.9074074074074074 0.9814814814814815
```
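A single random split is a noisy basis for comparison. As a quick sanity check (my addition, not in the original notebook), one can repeat the split with a few different seeds and watch how much the scores move; the cross-validation comparison below does this more rigorously.

```python
# Hypothetical check: repeat the 70/30 split with different seeds to see
# how much single-split accuracy fluctuates for each model
for seed in range(5):
    Xtr, Xte, ytr, yte = train_test_split(
        wine.data, wine.target, test_size=0.3, random_state=seed)
    d = DecisionTreeClassifier(random_state=0).fit(Xtr, ytr).score(Xte, yte)
    r = RandomForestClassifier(random_state=0).fit(Xtr, ytr).score(Xte, yte)
    print(f"seed={seed}: tree={d:.3f}, forest={r:.3f}")
```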

#### Comparison under cross-validation


```python
import matplotlib.pyplot as plt

# Scores from a single run of 10-fold cross-validation

rfc = RandomForestClassifier(n_estimators=25)
rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10)

clf = DecisionTreeClassifier()
clf_s = cross_val_score(clf, wine.data, wine.target, cv=10)

plt.plot(range(1, 11), rfc_s, label='RandomForest')
plt.plot(range(1, 11), clf_s, label='DecisionTree')
plt.legend()
plt.show()
# The plot shows the random forest matching or beating the decision tree on most folds
```

*(figure: per-fold scores from one 10-fold cross-validation run, RandomForest vs. DecisionTree)*

```python
# Repeat 10-fold cross-validation ten times and compare the mean scores

rfc_slist = []
clf_slist = []
for i in range(1, 11):
    rfc = RandomForestClassifier(n_estimators=25)
    clf = DecisionTreeClassifier()
    
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    clf_s = cross_val_score(clf, wine.data, wine.target, cv=10).mean()
    
    rfc_slist.append(rfc_s)
    clf_slist.append(clf_s)

plt.plot(range(1, 11), rfc_slist, label='RandomForest')
plt.plot(range(1, 11), clf_slist, label='DecisionTree')
plt.legend()
plt.show()

```

*(figure: mean 10-fold CV score over ten repetitions, RandomForest vs. DecisionTree)*


```python
# Learning curve over n_estimators
# Slow cell: it fits and cross-validates 200 forests, roughly 3 minutes

estimator_list = []

for i in range(1, 201):
    rfc = RandomForestClassifier(n_estimators=i)
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    estimator_list.append(rfc_s)



plt.plot(range(1, 201), estimator_list, label='RandomForest')
plt.legend()
plt.show()
# The mean score levels off once n_estimators reaches roughly 25
```

*(figure: mean 10-fold CV score as a function of n_estimators, 1 to 200)*
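Rather than eyeballing the plateau, a small follow-up (my addition) reads the best point straight off the curve:

```python
# Locate the highest mean CV score on the curve and the n_estimators
# value that produced it (the list index is 0-based, hence the +1)
best_score = max(estimator_list)
best_n = estimator_list.index(best_score) + 1
print(best_score, best_n)
```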

```python
# Inspect the parameters of the individual trees inside the forest
rfc = RandomForestClassifier(n_estimators=25)
rfc.fit(X_train, y_train)

rfc.estimators_  # each tree carries its own random_state, so no two trees are identical
```
```
[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=1242025449, splitter='best'),
 ... (25 trees in total)
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=392066908, splitter='best')]
```
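To confirm the seeds really differ, one can (my addition) pull `random_state` out of every sub-tree:

```python
# Collect each sub-tree's random_state; 25 distinct values show that
# the forest diversified its trees
seeds = [tree.random_state for tree in rfc.estimators_]
print(len(set(seeds)), "distinct seeds out of", len(seeds))
```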


#### Out-of-bag (OOB) data

- With bootstrap sampling (sampling with replacement), some samples are drawn repeatedly while others may never be drawn at all; the never-drawn samples are the out-of-bag (OOB) data.
- Roughly 37% of the samples are OOB for each tree: the chance that a given sample is never picked in n draws from n samples is (1 - 1/n)^n, which converges to 1/e ≈ 0.37 as n grows (see the numeric sketch below).
- The OOB samples can serve as a built-in test set: pass oob_score=True when instantiating the forest (the default is False).
- When relying on the OOB score, there is no need for a separate train/test split.
- This works best with larger datasets; on a very small dataset, a tree may end up with no OOB samples at all.
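A minimal numeric check of that limit (my addition):

```python
# Probability that a given sample is never drawn in n bootstrap draws:
# (1 - 1/n) ** n, approaching 1/e ≈ 0.3679 as n grows
import math

for n in (10, 100, 1000, 10000):
    print(n, (1 - 1 / n) ** n)
print("1/e =", 1 / math.e)
```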
```python
# With oob_score=True, no train/test split is needed:
# fit on the full dataset and read the OOB accuracy estimate afterwards
rfc = RandomForestClassifier(n_estimators=25, oob_score=True)
rfc = rfc.fit(wine.data, wine.target)

rfc.oob_score_
```

```
0.9662921348314607
```
```python
rfc.estimators_
```
```
[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=425839601, splitter='best'),
 ... ...
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=720743806, splitter='best')]
```
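As a cross-check (my addition), the OOB estimate should land close to the mean of a 10-fold cross-validation on the same data:

```python
# Sanity check, not in the original post: compare the OOB estimate with
# the mean 10-fold cross-validation score on the same data
cv_mean = cross_val_score(RandomForestClassifier(n_estimators=25),
                          wine.data, wine.target, cv=10).mean()
print("oob:", rfc.oob_score_, "cv mean:", cv_mean)
```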
#### Key interfaces
```python
rfc.feature_importances_
```

```
array([0.18197243, 0.03502596, 0.01272023, 0.03099205, 0.03664106,
       0.0383327 , 0.13369377, 0.0025354 , 0.01213011, 0.10785671,
       0.05097174, 0.19950555, 0.15762229])
```
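The bare array is hard to read; a short sketch (my addition, using the `rfc` fitted above) pairs each importance with its feature name and sorts them:

```python
# Pair each importance with its feature name, most important first
pairs = sorted(zip(wine.feature_names, rfc.feature_importances_),
               key=lambda p: p[1], reverse=True)
for name, importance in pairs:
    print(f"{name:30s} {importance:.4f}")
```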
```python
# apply returns, for every sample, the index of the leaf it falls into
# in each of the 25 trees
rfc.apply(X_test)
```

```
array([[ 6,  5,  8, ...,  3,  2, 10],
       [ 6,  5,  8, ...,  3,  2, 10],
       [17, 18, 18, ..., 22, 10, 20],
       ...,
       [17, 18, 18, ..., 22, 10, 20],
       [ 6,  5, 15, ...,  3,  2, 10],
       [23, 18, 18, ..., 25, 10, 20]], dtype=int64)
```
```python
rfc.predict(X_test)
```

```
array([1, 1, 0, 2, 1, 1, 1, 0, 0, 2, 1, 1, 1, 1, 2, 0, 0, 2, 2, 1, 1, 1,
       2, 1, 0, 2, 2, 0, 0, 2, 1, 2, 0, 1, 2, 1, 1, 2, 0, 1, 0, 0, 2, 2,
       0, 0, 2, 0, 2, 0, 1, 0, 1, 0])
```
```python
# Each row gives one sample's predicted probability for each of the three classes
rfc.predict_proba(X_test)
```

```
array([[0.  , 1.  , 0.  ],
       [0.  , 1.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.  , 1.  ],
       ... (one row per sample)
       [0.  , 0.  , 1.  ],
       [1.  , 0.  , 0.  ],
       [0.04, 0.96, 0.  ],
       [0.96, 0.04, 0.  ],
       [0.  , 1.  , 0.  ],
       [0.96, 0.04, 0.  ]])
```
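A quick consistency check (my addition): `predict` returns the class with the highest row probability in `predict_proba`:

```python
import numpy as np

# predict() should agree with the per-row argmax of predict_proba(),
# mapped back to class labels through classes_
proba = rfc.predict_proba(X_test)
labels = rfc.classes_[np.argmax(proba, axis=1)]
print(np.array_equal(labels, rfc.predict(X_test)))  # expected: True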

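Why does averaging many trees help at all? If each of the 25 trees errs independently with probability $\varepsilon$, a majority vote is wrong only when 13 or more trees err simultaneously:

$$E_{\text{forest}} = \sum_{i=13}^{25} \binom{25}{i}\,\varepsilon^{i}\,(1-\varepsilon)^{25-i}$$

The cell below plots this forest error against the single-tree error $\varepsilon$.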
```python
from scipy.special import comb
import numpy as np

x = np.linspace(0, 1, 20)
y = []
# Sweep the single-tree error rate epsilon across [0, 1] and compute the
# majority-vote error of a 25-tree forest at each point
for epsilon in np.linspace(0, 1, 20):
    E = np.array([comb(25, i) * (epsilon**i) * ((1 - epsilon)**(25 - i)) for i in range(13, 26)]).sum()
    y.append(E)

plt.plot(x, y, 'o-', label='estimators are different')
plt.plot(x, x, '--', color='red', label='estimators are same')
plt.xlabel("individual estimator's error")
plt.ylabel("randomForest's error")
plt.legend()
plt.show()
```

In the figure below, the red dashed line marks the break-even case where the forest's error equals the single tree's error, and the blue curve is the forest's error under majority voting. Once an individual tree's error rate climbs above 50%, the forest's error exceeds the tree's, so a random forest only pays off when its base estimators do better than random guessing.

*(figure: majority-vote forest error vs. individual estimator error)*
