Ensemble Learning in Python

For the underlying theory, see:

https://blog.csdn.net/zwqjoy/article/details/80431496

https://sklearn.apachecn.org/docs/0.21.3/12.html

Bagging:

import itertools
# a whole lot of heavy-duty math algorithms
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# control subplot layout (position and size)

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score,train_test_split
# mlxtend plotting helpers:
# plot learning curves
from mlxtend.plotting import plot_learning_curves
# plot a classifier's decision regions
from mlxtend.plotting import plot_decision_regions

np.random.seed(0)

iris = datasets.load_iris()
X,y = iris.data[:,0:2],iris.target

clf1 = DecisionTreeClassifier(criterion='entropy',max_depth=1)
# decision stump: split criterion is information entropy, depth 1
clf2 = KNeighborsClassifier(n_neighbors=1)

bagging1 = BaggingClassifier(base_estimator=clf1,n_estimators=10,max_samples=0.8,max_features=0.8)
bagging2 = BaggingClassifier(base_estimator=clf2,n_estimators=10,max_samples=0.8,max_features=0.8)
label = ['Decision Tree','K-NN','Bagging Tree','Bagging K-NN']

clf_list =[clf1,clf2,bagging1,bagging2]

fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)  # 2 rows x 2 columns of subplots
grid = itertools.product([0,1],repeat=2)  # Cartesian product of [0,1] with itself: (0, 0), (0, 1), (1, 0), (1, 1)
# these tuples are the (row, column) indices of the subplots
for clf, label, grd in zip(clf_list, label, grid):  # see the note on zip() at the end of this listing
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    # cv=3: 3-fold cross-validation; scoring='accuracy' is also the default and could be omitted
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    # mean and standard deviation of the fold scores
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(label)

plt.show()
# The figure shows the decision boundaries of the decision tree and k-NN classifiers,
# together with their bagging ensembles, applied to the Iris data set.
# The decision tree produces axis-parallel boundaries.
# k-NN is largely insensitive to perturbations of the training samples and is therefore called a stable learner,
# which is why the bagged decision trees gain more from bagging and reach higher accuracy than the bagged k-NN ensemble.

#plot learning curves
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
plt.figure()
plot_learning_curves(X_train,y_train,X_test,y_test,bagging1,print_model=False,style='ggplot')
plt.show()
# The figure shows the learning curves of the bagged tree ensemble. We see an average error of about 0.3
# on the training data and a U-shaped error curve for the test data.
# The smallest gap between training and test error occurs at around 80% of the training set size.
""">>>a = [1,2,3]
>>> b = [4,5,6]
>>> c = [4,5,6,7,8]
>>> zipped = zip(a,b)     # returns a zip object
>>> zipped
<zip object at 0x103abc288>
>>> list(zipped)  # use list() to materialise it
[(1, 4), (2, 5), (3, 6)]
>>> list(zip(a,c))              # the result is as long as the shortest input
[(1, 4), (2, 5), (3, 6)]
>>> a1, a2 = zip(*zip(a,b))          # zip(*...) is the inverse ("unzip"): it transposes the pairs back
>>> list(a1)
[1, 2, 3]
>>> list(a2)
[4, 5, 6]
"""

Boosting:

import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions

iris = datasets.load_iris()
X,y = iris.data[:,0:2],iris.target

clf = DecisionTreeClassifier(criterion='entropy',max_depth=1)

num_est = [1,2,3,10]
label = ['AdaBoost(n_est=1)','AdaBoost(n_est=2)','AdaBoost(n_est=3)','AdaBoost(n_est=10)']

fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)
grid = itertools.product([0,1],repeat=2)

for n_est,label,grd in zip(num_est,label,grid):
    boosting = AdaBoostClassifier(base_estimator=clf,n_estimators=n_est)
    # n_estimators: number of boosted weak learners (the scikit-learn default is 50)
    boosting.fit(X,y)
    ax = plt.subplot(gs[grd[0],grd[1]])
    fig = plot_decision_regions(X=X,y=y,clf=boosting,legend=2)
    plt.title(label)
    # Each base learner is a depth-1 decision tree (a decision stump):
    # it classifies the data using a single feature threshold,
    # splitting the feature space into two regions
    # separated by a linear decision surface parallel to one of the axes.
plt.show()
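
# A quick, purely illustrative way to see what such a stump learns is to fit one on its own
# and read off its single split. This is a sketch that relies on sklearn's fitted-tree
# internals (tree_.feature / tree_.threshold) and reuses the X, y loaded above.
stump = DecisionTreeClassifier(criterion='entropy', max_depth=1).fit(X, y)
print("stump splits on feature %d at threshold %.2f"
      % (stump.tree_.feature[0], stump.tree_.threshold[0]))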

#plot learning curves
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
boosting = AdaBoostClassifier(base_estimator=clf,n_estimators=10)
plt.figure()
plot_learning_curves(X_train,y_train,X_test,y_test,boosting,print_model=False,style='ggplot')
plt.show()
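
AdaBoost builds its ensemble sequentially, so the effect of n_estimators seen in the 2x2 grid above can also be read directly from the staged predictions: AdaBoostClassifier.staged_predict yields the ensemble's prediction after each boosting round. A minimal sketch, reusing the stump clf and the X_train/X_test split defined above:

from sklearn.metrics import accuracy_score

boosting = AdaBoostClassifier(base_estimator=clf, n_estimators=10)
boosting.fit(X_train, y_train)
# test accuracy after each boosting round
for i, y_pred in enumerate(boosting.staged_predict(X_test), start=1):
    print("n_estimators=%2d  test accuracy=%.2f" % (i, accuracy_score(y_test, y_pred)))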

Stacking:

import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions

iris = datasets.load_iris()
X,y = iris.data[:,1:3],iris.target

clf1 = KNeighborsClassifier(n_neighbors=1)  # n_neighbors: number of nearest neighbours used for the vote
clf2 = RandomForestClassifier(random_state=1)  # random_state: seed for the random number generator
clf3 = GaussianNB()  # Gaussian naive Bayes
lr = LogisticRegression()  # used as the meta-classifier (meta_classifier) of the stack
sclf = StackingClassifier(classifiers=[clf1,clf2,clf3],meta_classifier=lr)
label = ['KNN','RandomForest','Naive Bayes','Stacking Classifier']
clf_list = [clf1,clf2,clf3,sclf]

fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)
grid = itertools.product([0,1],repeat=2)

clf_cv_mean,clf_cv_std= [],[]
for clf,label,grd in zip(clf_list,label,grid):
    scores = cross_val_score(clf,X,y,cv=3,scoring='accuracy')
    print("Accuracy:均值:%.2f(+/- 標準差:%.2f) [%s]" %(scores.mean(),scores.std(),label))
    #標準差(方差)越小越穩定
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())

    clf.fit(X,y)
    ax = plt.subplot(gs[grd[0],grd[1]])
    fig = plot_decision_regions(X=X,y=y,clf=clf)
    plt.title(label)
plt.show()

#plot learning curves
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
plt.figure()
plot_learning_curves(X_train,y_train,X_test,y_test,sclf,print_model=True,style='ggplot')
plt.show()
# We can see that stacking achieves higher accuracy than the individual classifiers,
# and the learning curves show no sign of overfitting.
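
For comparison, scikit-learn (0.22+) ships its own StackingClassifier. Unlike the mlxtend version used above, it trains the meta-learner on out-of-fold predictions by default, which reduces the risk of the meta-learner overfitting the base learners' training outputs. A minimal sketch, reusing clf1, clf2, clf3, lr and X, y from above:

from sklearn.ensemble import StackingClassifier

sk_stack = StackingClassifier(estimators=[('knn', clf1), ('rf', clf2), ('nb', clf3)],
                              final_estimator=lr, cv=3)
scores = cross_val_score(sk_stack, X, y, cv=3, scoring='accuracy')
print("Accuracy: %.2f (+/- %.2f) [sklearn StackingClassifier]" % (scores.mean(), scores.std()))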

 
