import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier

def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
    # random forest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' + str(rf_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_rf = pd.DataFrame({'feature': list(titanic_train_data_X),
                                          'importance': rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print('Sample 10 Features from RF Classifier:')
    print(str(features_top_n_rf[:10]))

    # AdaBoost
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' + str(ada_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_ada = pd.DataFrame({'feature': list(titanic_train_data_X),
                                           'importance': ada_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print('Sample 10 Features from Ada Classifier:')
    print(str(features_top_n_ada[:10]))

    # ExtraTrees
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param_grid, n_jobs=25, cv=10, verbose=1)
    et_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' + str(et_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_et = pd.DataFrame({'feature': list(titanic_train_data_X),
                                          'importance': et_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:10]))

    # GradientBoosting
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param_grid, n_jobs=25, cv=10, verbose=1)
    gb_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' + str(gb_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_gb = pd.DataFrame({'feature': list(titanic_train_data_X),
                                          'importance': gb_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)['feature']
    print('Sample 10 Features from GB Classifier:')
    print(str(features_top_n_gb[:10]))

    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param_grid, n_jobs=25, cv=10, verbose=1)
    dt_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' + str(dt_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_dt = pd.DataFrame({'feature': list(titanic_train_data_X),
                                          'importance': dt_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)['feature']
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:10]))

    # merge the five models' selections and importances
    features_top_n = pd.concat([features_top_n_rf, features_top_n_ada, features_top_n_et,
                                features_top_n_gb, features_top_n_dt],
                               ignore_index=True).drop_duplicates()
    features_importance = pd.concat([feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et,
                                     feature_imp_sorted_gb, feature_imp_sorted_dt], ignore_index=True)
    return features_top_n, features_importance
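
A minimal sketch of how this helper might be invoked to produce the output below; the value of feature_to_pick and the final column selection are illustrative assumptions, not settings confirmed by the original notebook:

# Hypothetical usage sketch -- feature_to_pick = 30 is an example value only.
feature_to_pick = 30
feature_top_n, feature_importance = get_top_n_features(titanic_train_data_X,
                                                       titanic_train_data_Y,
                                                       feature_to_pick)
# Restrict the training matrix to the union of features the five models ranked highly.
titanic_train_data_X = pd.DataFrame(titanic_train_data_X[feature_top_n])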
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 18.9s remaining: 10.2s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 19.0s finished
Top N Features Best RF Params:{'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 500}
Top N Features Best RF Score:0.8271604938271605
Top N Features RF Train Score:0.9820426487093153
Sample 10 Features from RF Classifier:
0 Age
15 Name_length
2 Fare
9 Title_0
7 Sex_0
8 Sex_1
27 Family_Size
3 Pclass
31 Ticket_Letter
11 Title_2
Name: feature, dtype: object
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 18.5s remaining: 9.9s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 18.6s finished
Top N Features Best Ada Params:{'learning_rate': 0.01, 'n_estimators': 500}
Top N Features Best Ada Score:0.8148148148148148
Top N Features Ada Train Score:0.8215488215488216
Sample 10 Features from Ada Classifier:
9 Title_0
2 Fare
27 Family_Size
28 Family_Size_Category_0
7 Sex_0
3 Pclass
1 Cabin
8 Sex_1
0 Age
15 Name_length
Name: feature, dtype: object
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 27.9s remaining: 15.0s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 28.3s finished
Top N Features Best ET Params:{'max_depth': 20, 'min_samples_split': 4, 'n_estimators': 500}
Top N Features Best ET Score:0.8226711560044894
Top N Features ET Train Score:0.9730639730639731
Sample 10 Features from ET Classifier:
9 Title_0
7 Sex_0
8 Sex_1
15 Name_length
0 Age
2 Fare
1 Cabin
31 Ticket_Letter
3 Pclass
10 Title_1
Name: feature, dtype: object
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 36.8s remaining: 19.8s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 37.5s finished
Top N Features Best GB Params:{'learning_rate': 0.01, 'max_depth': 20, 'n_estimators': 500}
Top N Features Best GB Score:0.7833894500561167
Top N Features GB Train Score:1.0
Sample 10 Features from GB Classifier:
9 Title_0
0 Age
2 Fare
15 Name_length
27 Family_Size
28 Family_Size_Category_0
14 Title_5
3 Pclass
26 Pclass_5
31 Ticket_Letter
Name: feature, dtype: object
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 16.1s remaining: 8.6s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 16.3s finished
Top N Features Best DT Params:{'max_depth': 20, 'min_samples_split': 4}
Top N Features Best DT Score:0.7901234567901234
Top N Features DT Train Score:0.9663299663299664
Sample 10 Features from DT Classifier:
9 Title_0
0 Age
2 Fare
15 Name_length
27 Family_Size
14 Title_5
26 Pclass_5
3 Pclass
31 Ticket_Letter
29 Family_Size_Category_1
Name: feature, dtype: object
grid_scores_: the evaluation results for each parameter setting tried during the search.
best_params_: the parameter combination that achieved the best result.
best_score_: the best score observed during the optimization process.
scoring: the evaluation metric. Defaults to None, in which case the estimator's score function is used; it can also be a string such as scoring='roc_auc' (the appropriate metric depends on the chosen model), or a callable with the signature scorer(estimator, X, y). If None, the estimator's own error estimate is used.
refit: defaults to True. After the search, the best parameters found by cross-validation are used to refit the model on the entire training (and development) set, giving the final model used for performance evaluation. In other words, once the parameter search finishes, the whole dataset is fit once more with the best parameters.
iid: defaults to True. When True, the samples are assumed to be identically distributed across folds, and the error is estimated as the sum over all samples rather than the average over folds.
verbose: log verbosity (int). 0 prints nothing during training, 1 prints occasional progress, and >1 prints output for every sub-model.
n_jobs: number of parallel jobs (int). -1 uses as many jobs as there are CPU cores; 1 is the default.
pre_dispatch: the total number of parallel tasks dispatched. When n_jobs is greater than 1, the data is copied for each dispatched job, which can cause out-of-memory errors; setting pre_dispatch caps the number of pre-dispatched jobs so that the data is copied at most pre_dispatch times.
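
As a quick illustration of these options, here is a small sketch of a GridSearchCV configured with the parameters described above; the estimator, grid, and values are illustrative assumptions, not the settings used earlier in this post:

from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

# Illustrative configuration only -- values are examples, not the ones used above.
est = RandomForestClassifier(random_state=0)
param_grid = {'n_estimators': [100, 500], 'max_depth': [10, 20]}
grid = model_selection.GridSearchCV(
    est, param_grid,
    scoring='roc_auc',        # string name or callable scorer(estimator, X, y); None -> estimator's score
    refit=True,               # refit the best parameter combination on the whole training set
    cv=10,                    # 10-fold cross-validation
    n_jobs=-1,                # -1: use all CPU cores
    pre_dispatch='2*n_jobs',  # cap dispatched jobs to limit data copies (and memory use)
    verbose=1)                # 0: silent, 1: occasional progress, >1: output per sub-model
# grid.fit(X, y)              # after fitting, best_params_, best_score_ and best_estimator_ are available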