Testing a Stacking Ensemble Model on a Sample Dataset


# Data processing and analysis
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime
import os
import glob

# Classification models (sklearn, XGBoost, LightGBM)
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as gbm
from sklearn.ensemble import GradientBoostingClassifier as gbdt
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier

# sklearn feature engineering, data preparation, and evaluation
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_validate, KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.neural_network import BernoulliRBM
from sklearn.datasets import make_blobs
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, ClassifierMixin
from sklearn import clone

# Keras data preparation
from keras.models import load_model
from keras.utils import to_categorical

# Keras neural network building blocks
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Table display options
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 30)

1. Data Preparation

The data is stored in a CSV file of less than 10 MB, so pandas is the main processing tool.
Basic preprocessing includes standardizing the numerical variables and numerically encoding the categorical variables.

path = r'C:/Users/Administrator/Documents/ls/data/pima-indians-diabetes.data.csv'
data_set = pd.read_csv(filepath_or_buffer=path,encoding='utf-8',sep=',',index_col=False, header=None)
use_data = pd.get_dummies(data_set, columns=[0,7])
use_data.head()
1 2 3 4 5 6 8 0_0 0_1 0_2 0_3 0_4 0_5 0_6 0_7 ... 7_58 7_59 7_60 7_61 7_62 7_63 7_64 7_65 7_66 7_67 7_68 7_69 7_70 7_72 7_81
0 148 72 35 0 33.6 0.627 1 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 85 66 29 0 26.6 0.351 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 183 64 0 0 23.3 0.672 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 89 66 23 94 28.1 0.167 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 137 40 35 168 43.1 2.288 1 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 76 columns

# Standardize the numerical feature columns (fit on train, apply to both)
def normalize_cols(train_df, test_df, transform_cols=list(range(0, 6))):
    # After get_dummies, positions 0-5 of the feature matrix hold the
    # original numeric columns 1-6; the dummy columns come after them.
    stand = StandardScaler()
    train_transform = train_df[:, transform_cols]
    stand.fit(train_transform)
    train_df[:, transform_cols] = stand.transform(train_transform)

    test_df[:, transform_cols] = stand.transform(test_df[:, transform_cols])

    return train_df, test_df

target_col = 8
input_col = [i for i in use_data.columns if i != target_col]
X = use_data.loc[:, input_col].astype(np.float32).values
# Keep the target numeric: Keras' binary_crossentropy expects numeric labels
y = use_data.loc[:, target_col].astype(int).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test = normalize_cols(X_train, X_test)
def acc_cv(models, train_x=X_train, train_y=y_train, n_folds=5, test_x=X_test, test_y=y_test):
    clf = make_pipeline(models)
    # Pass the KFold object itself so shuffle/random_state take effect
    # (get_n_splits only returns the integer n_folds)
    kf = KFold(n_folds, shuffle=True, random_state=42)
    acc_score = cross_val_score(clf, train_x, train_y, scoring="accuracy", cv=kf)
    print('*' * 50)
    print('train_accuracy: %.5f' % (acc_score.mean()))
    clf.fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    print('test_accuracy: %.5f ' % accuracy_score(y_pred=y_pred, y_true=test_y))
    print('*' * 50)

    return acc_score
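
As a quick spot check, the helper above can be run on any single estimator, for example (a hypothetical call, not part of the original run):

# Example: cross-validated and test accuracy of a plain logistic regression
acc_cv(LR(random_state=123))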

class StackingAverageModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=123)

        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)), dtype=np.float64)

        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):  # KFold.split yields train/holdout index arrays
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        # For each base model, majority-vote the predictions of its per-fold
        # instances; the votes form the meta-features fed to the meta-model.
        meta_features = np.column_stack([
            stats.mode(np.column_stack([model.predict(X) for model in base_models]), axis=1)[0]
            for base_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)
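
This class is not exercised in the experiment below, but a minimal usage sketch (hypothetical, reusing the same kinds of sklearn estimators as in section 3) could look like this:

stacked = StackingAverageModels(
    base_models=(DT(max_depth=4, random_state=123), GaussianNB(), KNN(n_neighbors=5)),
    meta_model=LR(random_state=123),
    n_folds=5)
stacked.fit(X_train, y_train)
print(accuracy_score(y_true=y_test, y_pred=stacked.predict(X_test)))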

2. Stacking Class

This section implements the stacking class: the base models are neural networks, and the meta-model is some other classifier. It covers loading the saved base models, building the meta-feature dataset, training the meta-model, and producing the final predictions.

class StackingAverageModels_build2():
    '''
    First-level sub-models: neural networks.
    Second-level (meta) model: any other classifier.
    '''
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.doc_dir = None
        self.members = None
        self.n_models = None
        self.meta_model = None
        
    def load_all_models(self, n_models, doc_dir):
        all_models = list()
        for i in range(n_models):
            filename = os.path.join(doc_dir, 'model_' + str(i + 1) + '.h5')
            model = load_model(filename)
            all_models.append(model)
            print('>loaded %s' %(filename))
        
        self.members = all_models
        self.doc_dir = doc_dir
        self.n_models = n_models
        return all_models
    
    def stacked_dataset(self, inputX):
        '''
        First level: collect each base model's predictions as meta-features.
        '''
        stackX = None
        for model in self.members:
            # Predict with each base model
            y_pred = model.predict(inputX, verbose=0)
            # Stack the predictions into shape [rows, probabilities, members]
            if stackX is None:
                stackX = y_pred
            else:
                stackX = np.dstack((stackX, y_pred))
        # Flatten the predictions to [rows, members * probabilities]
        stackX = stackX.reshape((stackX.shape[0], stackX.shape[1] * stackX.shape[2]))

        return stackX

    def fit_stacked_model(self, meta_model):
        '''
        Second level: train the meta-model on the first level's predictions.
        return: the fitted meta-model
        '''
        inputX = self.X_test
        inputy = self.y_test

        # Build the meta-feature training set from the base models' predictions
        stackedX = self.stacked_dataset(inputX)
        # Train the second-level (meta) model
        meta_model.fit(stackedX, inputy)
        self.meta_model = meta_model

        return meta_model

    def stacked_prediction(self):
        '''
        Final predictions from the second-level (meta) model.
        '''
        # Build the meta-feature set
        stackedX = self.stacked_dataset(self.X_test)
        # Predict with the meta-model
        model = self.meta_model
        y_pred = model.predict(stackedX)
        return y_pred

# DNN built with Keras' Sequential API

def DNN_base_v1(X_train, y_train):
    
    model = models.Sequential()
    model.add(layers.Dense(96, activation='elu',kernel_regularizer=regularizers.l2(0.005), input_shape=(X_train.shape[1], )))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64, activation='elu',kernel_regularizer=regularizers.l2(0.005)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='elu',kernel_regularizer=regularizers.l2(0.005)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='elu',kernel_regularizer=regularizers.l2(0.005)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer=optimizers.Adadelta(), loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=1200, batch_size=50, validation_split=0.2, verbose=0, shuffle=True)
    results_train = model.evaluate(X_train, y_train)  # evaluate returns [loss, accuracy]
    
    print('accuracy: %s' %(results_train))
    return model

def DNN_fit_and_save(X_train, y_train, doc_dir, model_numbers):
    if not os.path.exists(doc_dir):
        os.makedirs(doc_dir)
    for i in range(model_numbers):
        model = DNN_base_v1(X_train, y_train)
        filename = os.path.join(doc_dir, 'model_' + str(i + 1) + '.h5')
        model.save(filename)
        print('>save %s' %(filename))

3. Base Models

doc_dir = r'C:\Users\Administrator\Documents\ls\tmp_models'
DNN_fit_and_save(X_train, y_train, doc_dir, 5)
614/614 [==============================] - 0s 86us/step
accuracy: [0.50532664577036801, 0.78175895784887506]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_1.h5
614/614 [==============================] - 0s 80us/step
accuracy: [0.50161608306126793, 0.78664495133421708]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_2.h5
614/614 [==============================] - 0s 86us/step
accuracy: [0.49014993210957181, 0.78827361582933109]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_3.h5
614/614 [==============================] - 0s 124us/step
accuracy: [0.50730565588326715, 0.77850162885864704]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_4.h5
614/614 [==============================] - 0s 83us/step
accuracy: [0.5090954220256122, 0.77198697087819101]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_5.h5
lr = LR(random_state=123, verbose=0)
svm_clf2 = SVC(kernel='rbf',class_weight='balanced',random_state=123)
dt = DT(max_depth=4,random_state=123)
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
rdf = RandomForestClassifier(random_state=123)
gbm_sklearn_model = gbdt(random_state=123)
xgb_model = xgb.XGBClassifier(seed=123)
gbm_model = gbm.LGBMClassifier(random_state=123)

4. Stacking Models

Different meta-models are tried: logistic regression, SVM, decision tree, XGBoost, GBM, and so on. Each one is combined with the neural-network base models into a stacking ensemble, and with every meta-model left at its default parameters the variants are compared; a compact scripted version of the runs is sketched below.
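
A minimal sketch (using only the objects already defined above) that would run the whole comparison in one loop:

# Sketch: push every meta-model through the same stacking pipeline
meta_models = {'LR': lr, 'DT': dt, 'SVM': svm_clf2, 'Xgb': xgb_model,
               'RDF': rdf, 'Lgbm': gbm_model, 'GBDT': gbm_sklearn_model}
stacker = StackingAverageModels_build2(X_train, y_train, X_test, y_test)
stacker.load_all_models(n_models=5, doc_dir=doc_dir)
for name, meta in meta_models.items():
    stacker.fit_stacked_model(meta_model=meta)
    y_pred = stacker.stacked_prediction()
    print('%s: %.5f' % (name, accuracy_score(y_pred=y_pred, y_true=y_test)))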

DNN+LR

aa = StackingAverageModels_build2(X_train, y_train, X_test, y_test)
aa.load_all_models(doc_dir=r'C:\Users\Administrator\Documents\ls\tmp_models', n_models=5)
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_1.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_2.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_3.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_4.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_5.h5

[<keras.engine.sequential.Sequential at 0x2b472320>,
 <keras.engine.sequential.Sequential at 0x2b472128>,
 <keras.engine.sequential.Sequential at 0x2b44aa90>,
 <keras.engine.sequential.Sequential at 0x2b44a470>,
 <keras.engine.sequential.Sequential at 0x2b44acc0>]
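As a quick sanity check (hypothetical, not in the original run), the meta-feature matrix built from the five loaded members has one probability column per member:

print(aa.stacked_dataset(X_test).shape)  # one column per member, e.g. (154, 5)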
aa.fit_stacked_model(meta_model=lr)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.80519480519480524

DNN+Decision Tree

aa.fit_stacked_model(meta_model=dt)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.85064935064935066

DNN+SVM

aa.fit_stacked_model(meta_model=svm_clf2)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.7857142857142857

DNN+Xgboost

aa.fit_stacked_model(meta_model=xgb_model)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.9285714285714286

DNN+RandomForest

aa.fit_stacked_model(meta_model=rdf)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.96753246753246758

DNN+LightGBM

aa.fit_stacked_model(meta_model=gbm_model)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.88961038961038963

DNN+GBDT_sklearn

aa.fit_stacked_model(meta_model=gbm_sklearn_model)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
1.0

Summary

Conclusion: combining tree models with neural networks works well here, and is worth trying more often in classification tasks. One caveat: fit_stacked_model trains the meta-model on the same test set that the accuracies are computed on, so the near-perfect scores (e.g. 1.0 for the sklearn GBDT) are optimistic; scoring the meta-model on data it has never seen gives a fairer comparison, as sketched below.
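
A minimal sketch of that fairer check (reusing the classes and models defined above; the extra split is hypothetical and not part of the original run):

# Sketch: hold out half of the test set purely for meta-model evaluation
X_meta_train, X_meta_eval, y_meta_train, y_meta_eval = train_test_split(
    X_test, y_test, test_size=0.5, random_state=123)
bb = StackingAverageModels_build2(X_train, y_train, X_meta_train, y_meta_train)
bb.load_all_models(n_models=5, doc_dir=doc_dir)
bb.fit_stacked_model(meta_model=gbm_sklearn_model)
# Score on the untouched half of the test data
stackedX_eval = bb.stacked_dataset(X_meta_eval)
print(accuracy_score(y_true=y_meta_eval, y_pred=bb.meta_model.predict(stackedX_eval)))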

results = [0.80519480519480524, 0.85064935064935066, 0.7857142857142857, 0.9285714285714286, 0.96753246753246758, 0.88961038961038963, 1.0]
results = [float('%.4f' % (i)) for i in results]
model_name = ['LR', 'DT', 'SVM', 'Xgb', 'RDF', 'Lgbm', 'GBDT']
df_plot = pd.DataFrame({'model': model_name, 'accuracy': results})

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
sns.barplot(x='model', y='accuracy', data=df_plot, ax=ax)
# Set the labels after plotting so seaborn's defaults don't overwrite them
ax.set_xlabel('model', fontsize=20)
ax.set_ylabel('accuracy', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

[Figure: bar chart of test accuracy for each stacking variant, by meta-model]


References:

  1. How to Develop a Stacking Ensemble for Deep Learning Neural Networks in Python With Keras