XGBoost Modeling and Bayesian Hyperparameter Search Workflow: Reference Framework Code

1. Requirements

pip3 install bayesian-optimization
pip3 install xgboost
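
To confirm both packages are importable after installation, a quick check can be run (a minimal sketch; importlib.metadata requires Python 3.8+):

import xgboost
from importlib.metadata import version

print("xgboost:", xgboost.__version__)
print("bayesian-optimization:", version("bayesian-optimization"))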

2. XGBoost Modeling Framework

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score  # needed below to score each validation fold
import pandas as pd

# 1. Read the training data
df=pd.read_csv('./data/train.csv')

# 2. Specify the target column and split off the features
y_train=df['flag']
del df['flag']
X_train=df

# 3. Build stratified k-fold cross-validation splits
from sklearn.model_selection import StratifiedKFold
nfold=5
skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=3)

# 4. Define the training function (the objective for the Bayesian search)
def train_my_model(learning_rate, max_depth, subsample, colsample_bytree, colsample_bylevel,
                   stopping_tolerance, min_rows, max_delta_step, max_bins,
                   min_sum_hessian_in_leaf, min_data_in_leaf, reg_lambda, reg_alpha,
                   min_child_weight, max_leaves, gamma):
    auc_result=0  # accumulates the per-fold AUC
    for i, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):

        # for quick testing, uncomment to train on the first fold only:
#         if i > 0:
#             break

        X_trn, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_trn, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

        # parameters for the tree booster
        params = {"n_estimators": 50,
                  "booster": "gbtree",  # default: gbtree
                  "objective": "binary:logistic"}

        params["learning_rate"]=learning_rate
        params["max_depth"]=int(max_depth) # cast to int: the optimizer proposes floats
        params["subsample"]=subsample
        params["colsample_bytree"]=colsample_bytree
        params["colsample_bylevel"]=colsample_bylevel
        # Note: stopping_tolerance, min_rows, max_bins, min_sum_hessian_in_leaf and
        # min_data_in_leaf are H2O/LightGBM-style names rather than native XGBClassifier
        # parameters; xgboost passes unknown kwargs through and, depending on the
        # version, warns that they are not used.
        params["stopping_tolerance"]=stopping_tolerance
        params["min_rows"]=int(min_rows)
        params["max_delta_step"]=max_delta_step
        params["max_bins"]=int(max_bins)
        params["min_sum_hessian_in_leaf"]=min_sum_hessian_in_leaf
        params["min_data_in_leaf"]=min_data_in_leaf
        params["reg_lambda"]=reg_lambda
        params["reg_alpha"]=reg_alpha
        params["min_child_weight"]=min_child_weight
        params["max_leaves"]=int(max_leaves)
        params["gamma"]=gamma
        
        model = XGBClassifier(**params)
        
        model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)],
                  early_stopping_rounds=10, eval_metric="auc", verbose=0)
        # verbose=10 would print the eval metric every 10 boosting iterations
        # (in recent xgboost releases, early_stopping_rounds and eval_metric are
        # constructor arguments rather than fit() arguments)

        y_pred = model.predict_proba(X_val)[:,1]
        auc = roc_auc_score(y_val, y_pred)
        auc_result+=auc

    auc_mean=auc_result/nfold  # average over all folds, not a hardcoded 5
    # print('AUC:{:.4f}'.format(auc_mean))

    # return the mean AUC as the metric for the Bayesian search to maximize
    return auc_mean
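
Before wiring this objective into the optimizer, it can be smoke-tested with one manual call; the values below are arbitrary mid-range placeholders, not tuned defaults:

# one manual call to the objective with arbitrary mid-range values
auc = train_my_model(learning_rate=0.1, max_depth=6, subsample=0.9,
                     colsample_bytree=0.95, colsample_bylevel=0.95,
                     stopping_tolerance=0.0005, min_rows=5,
                     max_delta_step=1, max_bins=300,
                     min_sum_hessian_in_leaf=50, min_data_in_leaf=100,
                     reg_lambda=1, reg_alpha=0,
                     min_child_weight=1, max_leaves=0, gamma=0)
print('mean AUC over {} folds: {:.4f}'.format(nfold, auc))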

3. Bayesian Hyperparameter Search

import time
import datetime
timestamp = datetime.datetime.fromtimestamp(time.time()).strftime("%y%m%d%H%M%S") # yyMMddHHmmss
print(timestamp)

from bayes_opt import BayesianOptimization

from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

from bayes_opt.util import load_logs


# log every optimization step to a fresh, timestamped JSON file
logger = JSONLogger(path="./logs"+'_'+timestamp+".json")


# Bayesian hyperparameter search
# Bounded region of parameter space
pbounds = {"learning_rate": (0.05,0.3), # default:0.3
           "max_depth":(5,10),
           "subsample": (0.8, 1),
           "colsample_bytree":(0.9,1), # alias:col_sample_rate_per_tree
           "colsample_bylevel":(0.9,1), # alias:col_sample_rate
           "stopping_tolerance":(0.0001,0.001),
           "min_rows":(2,10), # alias:min_child_weight
           "max_delta_step":(0,10), # alias:max_abs_leafnode_pred
           
           # For tree_method=hist only:
           "max_bins":(269,500),
           "min_sum_hessian_in_leaf":(0,100), #default:100
           "min_data_in_leaf":(0,200), #default:0
           
           # For booster=dart only: [default:gbtree]
           # rate_drop
           # one_drop
           # skip_drop
           
           "reg_lambda":(0,10), # alias:lambda, L2 
           "reg_alpha":(0,10), # alias:alpha, L1 
        
           "min_child_weight":(0.9,1),
           "max_leaves":(0,10), #default:0
           
           "gamma":(0,10)} # alias:min_split_improvement,min_split_loss

new_optimizer = BayesianOptimization(
    f=train_my_model,
    pbounds=pbounds,
    random_state=1,
)

# load results from a previous search, if an earlier log file exists
load_logs(new_optimizer, logs=["./logs.json"])


new_optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

new_optimizer.maximize(
    init_points=10,
    n_iter=100,
)

Output looks like this:

|   iter    |  target   | col_sa... | colsam... | learni... | max_bins  | max_depth | min_rows  | stoppi... | subsample |
-------------------------------------------------------------------------------------------------------------------------
|  1        |  0.6311   |  0.417    |  0.972    |  0.05001  |  338.8    |  10.61    |  2.739    |  0.000267 |  0.9346   |
|  2        |  0.6251   |  0.3968   |  0.9539   |  0.07096  |  427.3    |  11.25    |  9.025    |  0.000124 |  0.967    |
|  3        |  0.6122   |  0.9897   |  0.9252   |  0.09825  |  269.1    |  19.93    |  9.672    |  0.000140 |  0.9204   |
|  4        |  0.6292   |  0.2034   |  0.9866   |  0.08754  |  498.9    |  9.141    |  2.162    |  0.000430 |  0.9462   |
|  5        |  0.607    |  0.8085   |  0.9723   |  0.09671  |  418.7    |  19.94    |  2.037    |  0.000378 |  0.9868   |

print(new_optimizer.max)

Output looks like this:

{'target': 0.6311, 'params': {'col_sample_rate_per_tree': 0.417, 'colsample_bytree': 0.972, 'learning_rate': 0.05001, 'max_bins': 338.8, 'max_depth': 10.61, 'min_rows': 2.739, 'stopping_tolerance': 0.000267, 'subsample': 0.9346}}
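
To refit a final model on the full training set, the best parameters can be pulled out of new_optimizer.max and the integer-valued ones cast back, just as inside train_my_model (a sketch; INT_PARAMS is a helper name introduced here, not part of either library):

# refit on the full training set with the best parameters found
best = dict(new_optimizer.max['params'])

# the optimizer proposes floats; cast the integer-valued parameters back
INT_PARAMS = ('max_depth', 'min_rows', 'max_bins', 'max_leaves')
for k in INT_PARAMS:
    if k in best:
        best[k] = int(best[k])

final_model = XGBClassifier(n_estimators=50, booster='gbtree',
                            objective='binary:logistic', **best)
final_model.fit(X_train, y_train)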

References

  1. BayesianOptimization on GitHub:
    https://github.com/fmfn/BayesianOptimization
  2. XGBoost parameter tuning:
    https://snaildove.github.io/2018/12/18/get_started_feature-engineering/