sklearn中的投票機制學習筆記

原創

2019-01-16 14:43

投票機制（voting）是集成學習裏面針對分類問題的一種結合策略。基本思想是選擇所有機器學習算法當中輸出最多的那個類。機器學習分類算法的輸出有兩種類型：一種是直接輸出類標籤，另外一種是輸出類概率，使用前者進行投票叫做硬投票(Majority/Hard voting)，使用後者進行投票叫做軟投票(Soft voting)。sklearn中的投票機制使用VotingClassifier來實現。本文以iris數據集爲例做實驗驗證！

1.硬投票是選擇各算法輸出中出現次數最多的標籤；如果出現平票（多個標籤的得票數相等），則按類標籤的升序選擇排在最前面的那個。實例代碼如下:

from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
# Load iris and keep only feature columns 1 and 2.
iris = datasets.load_iris()
X = iris.data[:, 1:3]
y = iris.target

# Base classifiers.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=160)

# Combine the four models by voting. `estimators` takes a list of
# [(name1, clf1), (name2, clf2), ...] pairs, the same input form Pipeline uses;
# voting='hard' selects hard (label-based) voting.
base_estimators = [('lr', clf1), ('rf', clf2), ('gnb', clf3), ('xgb', clf4)]
eclf = VotingClassifier(estimators=base_estimators, voting='hard')

# Report 5-fold cross-validated accuracy (mean and standard deviation) for
# each base model and then for the ensemble.
labelled_models = [
    ('Logistic Regrsssion', clf1),
    ('Random Forest', clf2),
    ('naive Bayes', clf3),
    ('xgboost', clf4),
    ('Ensemble', eclf),
]
for clf_name, clf in labelled_models:
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print('Accuracy: {:.2f} (+/- {:.2f}) [{}]'.format(scores.mean(), scores.std(), clf_name))

Accuracy: 0.90 (+/- 0.05) [Logistic Regrsssion]
Accuracy: 0.93 (+/- 0.05) [Random Forest]
Accuracy: 0.91 (+/- 0.04) [naive Bayes]
Accuracy: 0.95 (+/- 0.05) [xgboost]
Accuracy: 0.95 (+/- 0.05) [Ensemble]

2.軟投票是使用各個算法輸出的類概率來進行類的選擇：若指定了權重（weights），則對各算法輸出的每個類的概率做加權平均，平均概率最大的類會被選擇。

from itertools import product
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X = iris.data[:, [0, 2]]  # keep two feature columns so the result can be drawn in 2-D
y = iris.target

# Instantiate the base classifiers.
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)  # probability=True so the SVC can supply class probabilities
clf4 = RandomForestClassifier(random_state=1)
clf5 = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=160)

# voting='soft' selects soft voting; `weights` controls the weight given to
# each estimator.
eclf = VotingClassifier(
    estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3), ('rf', clf4), ('xgb', clf5)],
    voting='soft',
    weights=[2, 1, 1, 2, 3],
)

# Train every base model and then the ensemble on the full data set.
models = [clf1, clf2, clf3, clf4, clf5, eclf]
for model in models:
    model.fit(X, y)

# Plot window: the range of each feature padded by 1 on both sides.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
# Dense grid (step 0.01) covering the plot window.
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
# One (x, y) row per grid node. This is the same for every classifier, so it
# is built once here instead of once per loop iteration.
grid_points = np.c_[xx.ravel(), yy.ravel()]

# 3x2 panel; x axis shared within a column, y axis shared within a row.
fig, axes = plt.subplots(3, 2, sharex='col', sharey='row', figsize=(10, 8))
titles = ['Decision Tree (depth=4)', 'KNN (k=7)',
          'Kernel SVM', 'RandomForest', 'xgboost', 'Soft Voting']
# product([0, 1, 2], [0, 1]) yields (row, col) indices in the same order as
# the dimensions passed to plt.subplots(3, 2).
for idx, clf, title in zip(product([0, 1, 2], [0, 1]), models, titles):
    ax = axes[idx[0], idx[1]]
    # The prediction is made on the grid nodes created above, not on X; the
    # predicted labels are what the filled regions are drawn from.
    Z = clf.predict(grid_points).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.4)  # filled contours of the predicted class
    ax.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor='k')  # samples, coloured by class
    ax.set_title(title)
plt.show()

#for idx, clf, title in zip(product([0,1, 2],[0,1]),
#                           [clf1, clf2, clf3, clf4,clf5,eclf],
#                           ['Decision Tree (depth=4)', 'KNN (k=7)',
#                            'Kernel SVM','RandomForest','xgboost', 'Soft Voting']):
#    print(idx)
#    print('================')    

# Statements of interest can be pulled out and studied on their own, because
# Python supports procedural code well and also supports object-oriented
# encapsulation well.
# This variant redraws the 3x2 panel with only the filled prediction regions;
# the scatter of the samples is left out.
fig, axes = plt.subplots(3, 2, sharex='col', sharey='row', figsize=(10, 8))  # x shared per column, y per row
# One (x, y) row per grid node. This is the same for every classifier, so it
# is built once here instead of once per loop iteration.
grid_points = np.c_[xx.ravel(), yy.ravel()]
# product([0, 1, 2], [0, 1]) yields (row, col) indices in the same order as
# the dimensions passed to plt.subplots(3, 2).
for idx, clf, title in zip(product([0, 1, 2], [0, 1]),
                           [clf1, clf2, clf3, clf4, clf5, eclf],
                           ['Decision Tree (depth=4)', 'KNN (k=7)',
                            'Kernel SVM', 'RandomForest', 'xgboost', 'Soft Voting']):
    # The prediction is made on the grid nodes, not on X; the predicted labels
    # are what the filled regions are drawn from.
    Z = clf.predict(grid_points).reshape(xx.shape)
    axes[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)  # filled contours of the predicted class
    axes[idx[0], idx[1]].set_title(title)
plt.show()

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

sklearn中的投票機制學習筆記

Python3各種數據結構下的排序及去重彙總

Kaggle競賽中最終成爲0.3%的獲獎經驗

恐怖襲擊等級預測量化與ARMIA時間序列建模的例子

pd.read_excel()練習

Chrome71中HTTP Graph Collector Chrome插件離線安裝方法

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結