Table of Contents
# 數據處理、分析
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime
import os
import glob
# sklearn模型
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as gbm
from sklearn.ensemble import GradientBoostingClassifier as gbdt
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier
# sklearn特徵工程、數據準備和評估
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_validate, KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.neural_network import BernoulliRBM
from sklearn.datasets.samples_generator import make_blobs
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin,ClassifierMixin
from sklearn import clone
# keras數據準備
from keras.models import load_model
from keras.utils import to_categorical
# keras神經網絡
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers
# 圖形顯示
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# 表格顯示
# Pandas display settings for notebook table rendering.
# Use the full 'display.*' option names: the bare 'max_colwidth' form relies
# on pandas' deprecated abbreviated-option matching.
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 30)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
Using TensorFlow backend.
1. 數據準備
數據存儲在csv當中,文件小於10M,以pandas爲主要處理方法。
基本的處理包括:數值型變量歸一化,類別型變量數值化
# Load the Pima Indians diabetes CSV (no header row, so columns are integers 0-8),
# then one-hot encode columns 0 and 7 as categorical variables.
path = r'C:/Users/Administrator/Documents/ls/data/pima-indians-diabetes.data.csv'
data_set = pd.read_csv(path, sep=',', header=None, index_col=False, encoding='utf-8')
use_data = pd.get_dummies(data_set, columns=[0, 7])
use_data.head()
1 | 2 | 3 | 4 | 5 | 6 | 8 | 0_0 | 0_1 | 0_2 | 0_3 | 0_4 | 0_5 | 0_6 | 0_7 | ... | 7_58 | 7_59 | 7_60 | 7_61 | 7_62 | 7_63 | 7_64 | 7_65 | 7_66 | 7_67 | 7_68 | 7_69 | 7_70 | 7_72 | 7_81 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 76 columns
# Standard-scale the numeric feature columns (fit on train only to avoid leakage).
def normolized_col(train_df, test_df, tranform_colname=None):
    """Standardize selected columns of the train/test feature matrices.

    Parameters
    ----------
    train_df, test_df : 2-D numpy arrays; the scaler is fit on ``train_df``
        and applied to both.
    tranform_colname : list of column indices to scale. Defaults to columns
        1-6 (the original hard-coded default, preserved for compatibility).
        NOTE(review): after get_dummies the six numeric features appear to sit
        at positions 0-5 of X, so the 1-6 default may be off by one — confirm.

    Returns
    -------
    (train_df, test_df) with the selected columns standardized in place.
    """
    # A None sentinel replaces the original mutable-list default argument.
    if tranform_colname is None:
        tranform_colname = list(range(1, 7))
    stand = StandardScaler()
    train_transform = train_df[:, tranform_colname]
    # Fit on the training split only, then apply the same scaling to both.
    stand.fit(train_transform)
    train_df[:, tranform_colname] = stand.transform(train_transform)
    test_df[:, tranform_colname] = stand.transform(test_df[:, tranform_colname])
    return train_df, test_df
# Split features/target, convert to numpy, and standardize the numeric columns.
target_col = 8
input_col = [i for i in use_data.columns if i != target_col]
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# builtin str replaces the np.str alias, which was removed in numpy 1.24.
X = use_data.loc[:, input_col].astype(np.float32).values
y = use_data.loc[:, target_col].astype(str).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test = normolized_col(X_train, X_test)
def acc_cv(models, train_x=X_train, train_y=y_train, n_folds = 5, test_x=X_test, test_y=y_test):
    """Cross-validate a model on the training split, then report test accuracy.

    Bug fixes vs. the original:
    - ``KFold(...).get_n_splits(...)`` returns a plain int, so passing it as
      ``cv`` silently discarded ``shuffle``/``random_state``; the KFold object
      itself is now passed.
    - ``fit``/``predict`` used the module-level globals ``X_train``/``X_test``
      instead of the ``train_x``/``test_x`` arguments, ignoring any
      non-default data passed in.
    """
    clf = make_pipeline(models)
    # Pass the splitter object so shuffle and random_state take effect.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    acc_score = cross_val_score(clf, train_x, train_y, scoring="accuracy", cv=kf)
    print('*' * 50)
    print('train_accuracy: %.5f' % (acc_score.mean()))
    # Use the function's own arguments, not the globals.
    clf.fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    print('test_accuracy: %.5f ' % accuracy_score(y_pred=y_pred, y_true=test_y))
    print('*' * 50)
    return acc_score
class StackingAverageModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Stacking ensemble: out-of-fold base-model predictions feed a meta model."""
    def __init__(self, base_models, meta_model, n_folds):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
    def fit(self, X, y):
        """Fit a clone of every base model per fold, then fit the meta model
        on the out-of-fold predictions (so meta-features never leak labels)."""
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=123)
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)), dtype=np.float64)
        for i, model in enumerate(self.base_models):
            # KFold.split yields (train, holdout) index arrays for each fold.
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
    def predict(self, X):
        """Majority-vote each base model's fold clones, then let the meta model decide."""
        # Fix: np.column_stack needs a sequence — the original passed a bare
        # generator expression, which modern numpy rejects.
        meta_features = np.column_stack([
            stats.mode(np.column_stack([model.predict(X) for model in base_models]), axis=1)[0]
            for base_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)
2. stack類方法
編寫實現stack的類方法,實現基模型是神經網絡,元模型是其他。具體包括載入模型、預測集構建、元模型訓練、最終預測結果。
class StackingAverageModels_build2():
    '''
    Stacking where the first-level (base) models are neural networks loaded
    from .h5 files and the second-level (meta) model is any sklearn-style
    classifier trained on the stacked base-model predictions.
    '''
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.doc_dir = None      # directory the .h5 models were loaded from
        self.members = None      # list of loaded keras models
        self.n_models = None     # how many base models were loaded
        self.meta_model = None   # fitted second-level model
    def load_all_models(self, n_models, doc_dir):
        '''Load model_1.h5 ... model_<n_models>.h5 from doc_dir into self.members.'''
        all_models = list()
        for i in range(n_models):
            # Fix: the np.str alias was removed from numpy; use the builtin str.
            filename = os.path.join(doc_dir, 'model_' + str(i + 1) + '.h5')
            model = load_model(filename)
            all_models.append(model)
            print('>loaded %s' % (filename))
        self.members = all_models
        self.doc_dir = doc_dir
        self.n_models = n_models
        return all_models
    def stacked_dataset(self, inputX):
        '''
        First level: run every base model on inputX and return their
        predictions stacked side by side.
        '''
        stackX = None
        for model in self.members:
            # Predict with each base model.
            y_pred = model.predict(inputX, verbose=0)
            # Accumulate predictions as [rows, probabilities, members].
            if stackX is None:
                stackX = y_pred
            else:
                stackX = np.dstack((stackX, y_pred))
        # Flatten to [rows, members * probabilities] for the meta model.
        stackX = stackX.reshape((stackX.shape[0], stackX.shape[1] * stackX.shape[2]))
        return stackX
    def fit_stacked_model(self, meta_model):
        '''
        Second level: train meta_model on the base models' predictions.
        NOTE(review): this fits on X_test/y_test, so downstream accuracies
        computed on the same test split are optimistic — the meta model has
        already seen those labels.
        return: the fitted meta model
        '''
        inputX = self.X_test
        inputy = self.y_test
        # Build the meta-feature matrix from the base models.
        stackedX = self.stacked_dataset(inputX)
        # Train the second-level model.
        meta_model.fit(stackedX, inputy)
        self.meta_model = meta_model
        return meta_model
    def stacked_prediction(self):
        '''
        Predict with the fitted meta model on the stacked base-model outputs.
        '''
        stackedX = self.stacked_dataset(self.X_test)
        model = self.meta_model
        y_pred = model.predict(stackedX)
        return y_pred
# A Sequential DNN built with keras and trained on the given data.
def DNN_base_v1(X_train, y_train, epochs=1200, batch_size=50):
    """Build, train and return a 4-hidden-layer elu DNN binary classifier.

    epochs and batch_size were hard-coded (1200 / 50); they are now keyword
    parameters with the original values as defaults, so existing callers are
    unaffected while new callers can tune training length.
    """
    model = models.Sequential()
    model.add(layers.Dense(96, activation='elu', kernel_regularizer=regularizers.l2(0.005), input_shape=(X_train.shape[1], )))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64, activation='elu', kernel_regularizer=regularizers.l2(0.005)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='elu', kernel_regularizer=regularizers.l2(0.005)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(32, activation='elu', kernel_regularizer=regularizers.l2(0.005)))
    model.add(layers.Dropout(0.5))
    # Single sigmoid unit for binary classification.
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizers.Adadelta(), loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0, shuffle=True)
    results_train = model.evaluate(X_train, y_train)
    print('accuracy: %s' % (results_train))
    return model
def DNN_fit_and_save(X_train, y_train, doc_dir, model_numbers):
    """Train model_numbers independent DNNs and save them as model_<k>.h5 in doc_dir."""
    # Idiomatic replacement for the exists()/pass/else-makedirs dance; also
    # avoids the race between the existence check and the directory creation.
    os.makedirs(doc_dir, exist_ok=True)
    for i in range(model_numbers):
        model = DNN_base_v1(X_train, y_train)
        # Fix: the np.str alias was removed from numpy; use the builtin str.
        filename = os.path.join(doc_dir, 'model_' + str(i + 1) + '.h5')
        model.save(filename)
        print('>save %s' % (filename))
3. 基模型
# Train and persist five base DNNs into this directory.
doc_dir = r'C:\Users\Administrator\Documents\ls\tmp_models'
# NOTE(review): DNN_fit_and_save returns None, so `dnn` is always None here.
dnn = DNN_fit_and_save(X_train, y_train, doc_dir, 5)
614/614 [==============================] - 0s 86us/step
accuracy: [0.50532664577036801, 0.78175895784887506]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_1.h5
614/614 [==============================] - 0s 80us/step
accuracy: [0.50161608306126793, 0.78664495133421708]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_2.h5
614/614 [==============================] - 0s 86us/step
accuracy: [0.49014993210957181, 0.78827361582933109]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_3.h5
614/614 [==============================] - 0s 124us/step
accuracy: [0.50730565588326715, 0.77850162885864704]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_4.h5
614/614 [==============================] - 0s 83us/step
accuracy: [0.5090954220256122, 0.77198697087819101]
>save C:\Users\Administrator\Documents\ls\tmp_models\model_5.h5
# Candidate second-level (meta) models, mostly with default hyper-parameters
# and a fixed random_state/seed where the estimator supports one, so the
# stacking comparisons below are reproducible.
lr = LR(random_state=123, verbose=0)
svm_clf2 = SVC(kernel='rbf',class_weight='balanced',random_state=123)
dt = DT(max_depth=4,random_state=123)
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
rdf = RandomForestClassifier(random_state=123)
gbm_sklearn_model = gbdt(random_state=123)
xgb_model = xgb.XGBClassifier(seed=123)
gbm_model = gbm.LGBMClassifier(random_state=123)
4. stack模型
不同的元模型:logistic、SVM、Decision Tree、Xgboost、GBM等。將其與神經網絡模型組合成集成學習模型stack方法,在元模型採用基本的默認參數情況下,比較各stack方法的效果
DNN+LR
aa = StackingAverageModels_build2(X_train, y_train, X_test, y_test)
aa.load_all_models(doc_dir=r'C:\Users\Administrator\Documents\ls\tmp_models', n_models=5)
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_1.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_2.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_3.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_4.h5
>loaded C:\Users\Administrator\Documents\ls\tmp_models\model_5.h5
[<keras.engine.sequential.Sequential at 0x2b472320>,
<keras.engine.sequential.Sequential at 0x2b472128>,
<keras.engine.sequential.Sequential at 0x2b44aa90>,
<keras.engine.sequential.Sequential at 0x2b44a470>,
<keras.engine.sequential.Sequential at 0x2b44acc0>]
aa.fit_stacked_model(meta_model=lr)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.80519480519480524
DNN+Decision Tree
aa.fit_stacked_model(meta_model=dt)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.85064935064935066
DNN+SVM
aa.fit_stacked_model(meta_model=svm_clf2)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.7857142857142857
DNN+Xgboost
aa.fit_stacked_model(meta_model=xgb_model)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.9285714285714286
DNN+RandomForest
aa.fit_stacked_model(meta_model=rdf)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.96753246753246758
DNN+LightGBM
aa.fit_stacked_model(meta_model=gbm_model)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
0.88961038961038963
DNN+GBDT_sklearn
aa.fit_stacked_model(meta_model=gbm_sklearn_model)
y_pred = aa.stacked_prediction()
accuracy_score(y_pred=y_pred, y_true=y_test)
1.0
小結
結論:樹模型與神經網絡模型的結合,效果不錯。在分類任務可以考慮多嘗試使用。
# Bar chart of each stacked meta model's test accuracy.
results = [0.80519480519480524, 0.85064935064935066, 0.7857142857142857, 0.9285714285714286, 0.96753246753246758, 0.88961038961038963, 1.0]
# round() replaces the string round-trip float('%.4f' % i).
results = [round(i, 4) for i in results]
model_name = ['LR', 'DT', 'SVM', 'Xgb', 'RDF', 'Lgbm', 'GBDT']
df_plot = pd.DataFrame({'model': model_name, 'accuracy': results})
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
# Fix: the original passed the Ellipsis literal (...) as the label argument,
# which renders the word "Ellipsis" on the axes instead of a real label.
ax.set_xlabel('model', fontsize=20)
ax.set_ylabel('accuracy', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
pl = sns.barplot(x='model', y='accuracy', data=df_plot)
plt.show()
參考文章: