1. Requirements
pip3 install bayesian-optimization
pip3 install xgboost
2. XGBoost建模框架
import pandas as pd
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
# 1. Load the training data
df = pd.read_csv('./data/train.csv')
# 2. Split off the prediction target column ('flag') from the features
y_train = df.pop('flag')
X_train = df
# 3. Build the stratified k-fold cross-validation splitter
from sklearn.model_selection import StratifiedKFold

nfold = 5
skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=3)
# 4.構建模型
def train_my_model(learning_rate, max_depth, subsample, colsample_bytree,
                   colsample_bylevel, stopping_tolerance, min_rows,
                   max_delta_step, max_bins, min_sum_hessian_in_leaf,
                   min_data_in_leaf, reg_lambda, reg_alpha,
                   min_child_weight, max_leaves, gamma):
    """Train an XGBoost classifier with stratified k-fold CV; return mean AUC.

    The parameter names and order match the keys of ``pbounds`` so this
    function can be passed directly as the target of ``BayesianOptimization``.
    The optimizer samples every parameter as a float, so integer-valued
    hyper-parameters are cast with ``int()`` before being handed to XGBoost.

    Relies on module-level globals: ``X_train``, ``y_train``, ``skf``, ``nfold``.

    Returns:
        float: mean validation AUC over the ``nfold`` folds — the quantity
        maximized by the Bayesian hyper-parameter search.
    """
    auc_total = 0.0
    for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
        X_trn, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_trn, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
        # Parameters for the tree booster.
        # NOTE(review): "stopping_tolerance", "min_rows", "max_bins",
        # "min_sum_hessian_in_leaf" and "min_data_in_leaf" look like H2O
        # aliases, not native XGBoost parameters — XGBClassifier will likely
        # ignore them (possibly with a warning). Kept for interface
        # compatibility with ``pbounds``; confirm against the XGBoost docs.
        params = {
            "n_estimators": 50,
            "booster": "gbtree",             # default: gbtree
            "objective": "binary:logistic",
            "learning_rate": learning_rate,
            "max_depth": int(max_depth),     # integer parameter: force cast
            "subsample": subsample,
            "colsample_bytree": colsample_bytree,
            "colsample_bylevel": colsample_bylevel,
            "stopping_tolerance": stopping_tolerance,
            "min_rows": int(min_rows),
            "max_delta_step": max_delta_step,
            "max_bins": int(max_bins),
            "min_sum_hessian_in_leaf": min_sum_hessian_in_leaf,
            "min_data_in_leaf": min_data_in_leaf,
            "reg_lambda": reg_lambda,
            "reg_alpha": reg_alpha,
            "min_child_weight": min_child_weight,
            "max_leaves": int(max_leaves),
            "gamma": gamma,
        }
        model = XGBClassifier(**params)
        # verbose=0 silences per-round logging (verbose=10 would print
        # the eval metric every 10 boosting rounds).
        model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)],
                  early_stopping_rounds=10, eval_metric="auc", verbose=0)
        y_pred = model.predict_proba(X_val)[:, 1]
        auc_total += roc_auc_score(y_val, y_pred)
    # BUG FIX: average over the actual fold count ``nfold`` instead of the
    # hard-coded 5, and drop the unused ``ks_result`` accumulator.
    auc_mean = auc_total / nfold
    return auc_mean
3. 貝葉斯超參搜索 (Bayesian hyper-parameter search)
import time
import datetime

# Timestamp (YYMMDDHHMMSS) so that every search run writes its own log file.
timestamp = datetime.datetime.fromtimestamp(time.time()).strftime("%y%m%d%H%M%S")
print(timestamp)

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

# JSON logger that records every optimization step of this run.
logger = JSONLogger(path="./logs" + '_' + timestamp + ".json")

# Bayesian hyper-parameter search.
# Bounded region of parameter space — keys must match the signature of
# ``train_my_model`` exactly, since the optimizer calls it by keyword.
pbounds = {"learning_rate": (0.05, 0.3),   # default: 0.3
           "max_depth": (5, 10),
           "subsample": (0.8, 1),
           "colsample_bytree": (0.9, 1),   # alias: col_sample_rate_per_tree
           "colsample_bylevel": (0.9, 1),  # alias: col_sample_rate
           "stopping_tolerance": (0.0001, 0.001),
           "min_rows": (2, 10),            # alias: min_child_weight
           "max_delta_step": (0, 10),      # alias: max_abs_leafnode_pred
           # For tree_method=hist only:
           "max_bins": (269, 500),
           "min_sum_hessian_in_leaf": (0, 100),  # default: 100
           "min_data_in_leaf": (0, 200),         # default: 0
           # For booster=dart only [default: gbtree]:
           # rate_drop
           # one_drop
           # skip_drop
           "reg_lambda": (0, 10),          # alias: lambda, L2
           "reg_alpha": (0, 10),           # alias: alpha, L1
           "min_child_weight": (0.9, 1),
           "max_leaves": (0, 10),          # default: 0
           "gamma": (0, 10)}               # alias: min_split_improvement, min_split_loss

new_optimizer = BayesianOptimization(
    f=train_my_model,
    pbounds=pbounds,
    random_state=1,
)
# Warm-start the search from a previous run's saved results
# (the file "./logs.json" must exist for load_logs to succeed).
load_logs(new_optimizer, logs=["./logs.json"])
# BUG FIX: the original subscribed/maximized on an undefined name
# ``optimizer``; use the ``new_optimizer`` constructed above.
new_optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)
new_optimizer.maximize(
    init_points=10,
    n_iter=100,
)
The output looks like this:
| iter | target | col_sa... | colsam... | learni... | max_bins | max_depth | min_rows | stoppi... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| 1 | 0.6311 | 0.417 | 0.972 | 0.05001 | 338.8 | 10.61 | 2.739 | 0.000267 | 0.9346 |
| 2 | 0.6251 | 0.3968 | 0.9539 | 0.07096 | 427.3 | 11.25 | 9.025 | 0.000124 | 0.967 |
| 3 | 0.6122 | 0.9897 | 0.9252 | 0.09825 | 269.1 | 19.93 | 9.672 | 0.000140 | 0.9204 |
| 4 | 0.6292 | 0.2034 | 0.9866 | 0.08754 | 498.9 | 9.141 | 2.162 | 0.000430 | 0.9462 |
| 5 | 0.607 | 0.8085 | 0.9723 | 0.09671 | 418.7 | 19.94 | 2.037 | 0.000378 | 0.9868 |
# BUG FIX: ``optimizer`` is undefined — print the best result found by
# the ``new_optimizer`` instance used for the search.
print(new_optimizer.max)
The output looks like this:
{'target': 0.6311, 'params': {'col_sample_rate_per_tree': 0.417, 'colsample_bytree': 0.972, 'learning_rate': 0.05001, 'max_bins': 338.8, 'max_depth': 10.61, 'min_rows': 2.739, 'stopping_tolerance': 0.000267, 'subsample': 0.9346}}
Reference
- 貝葉斯超參搜索 GitHub:
  https://github.com/fmfn/BayesianOptimization
- XGBoost參數調優:
  https://snaildove.github.io/2018/12/18/get_started_feature-engineering/