实现多模型统一调参
解决问题:在复现GBDT+LR的经典结构的时候,发现需要对两个模型一起进行调参,网上找不到相关代码,研究之后实现LGB + LR的统一调参
需写3个自定义管道流的类来完成: 两个分别封装用于预测的模型(LGB与LR), 另一个负责把GBDT输出的叶子节点索引转换成one-hot矩阵, 作为下一步LR的输入特征
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
class LgbmPipeline(BaseEstimator, TransformerMixin):
    """Pipeline step that fits a LightGBM classifier and emits leaf predictions.

    Parameters
    ----------
    other_params : dict
        Keyword arguments forwarded unchanged to ``LGBMClassifier``.
    """

    def __init__(self, other_params):
        self.other_params = other_params

    def fit(self, X, y=None):
        """Train a fresh ``LGBMClassifier`` on ``(X, y)`` and keep it on ``self.lgb``.

        Returns ``self`` so the step can be chained inside a pipeline.
        """
        classifier = LGBMClassifier(**self.other_params)
        classifier.fit(X, y)
        # Only publish the model once training has succeeded.
        self.lgb = classifier
        return self

    def transform(self, X):
        """Return the fitted model's ``predict(X, pred_leaf=True)`` output.

        NOTE(review): per the LightGBM docs this should be the leaf index of
        every sample in every tree, shaped (n_samples, n_trees) -- confirm for
        the installed LightGBM version.
        """
        return self.lgb.predict(X, pred_leaf=True)
class GbdtMatrixPipeline(BaseEstimator, TransformerMixin):
    """One-hot encode per-tree leaf indices into one flat feature matrix.

    Each tree owns a contiguous band of ``num_leaf`` columns; within a row the
    column ``tree * num_leaf + leaf_index`` is set to 1 for every tree.

    Parameters
    ----------
    num_leaf : int
        Number of columns reserved per tree. Every leaf index in the input
        must be in ``[0, num_leaf)``.
    """

    def __init__(self, num_leaf):
        self.num_leaf = num_leaf

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Build the one-hot matrix for the leaf indices in ``X``.

        Parameters
        ----------
        X : array-like of int, shape (n_samples, n_trees)
            Leaf index of each sample in each tree. A 1-D input is treated as
            the output of a single tree, i.e. shape (n_samples, 1).

        Returns
        -------
        numpy.ndarray of int8, shape (n_samples, n_trees * num_leaf)

        Raises
        ------
        ValueError
            If any leaf index falls outside ``[0, num_leaf)``; such a value
            would otherwise be written into another tree's column band.
        """
        leaf_idx = np.asarray(X)
        if leaf_idx.ndim == 1:
            leaf_idx = leaf_idx.reshape(-1, 1)
        n_samples, n_trees = leaf_idx.shape

        encoded = np.zeros((n_samples, n_trees * self.num_leaf), dtype=np.int8)
        if leaf_idx.size == 0:
            # No samples (or no trees): nothing to mark.
            return encoded

        if leaf_idx.min() < 0 or leaf_idx.max() >= self.num_leaf:
            raise ValueError(
                "leaf indices must lie in [0, %d); got values in [%s, %s]"
                % (self.num_leaf, leaf_idx.min(), leaf_idx.max())
            )

        # Column of the hot bit for every (sample, tree) pair: the start of
        # the tree's band plus the leaf index inside that band.
        columns = np.arange(n_trees) * self.num_leaf + leaf_idx
        rows = np.arange(n_samples)[:, None]
        encoded[rows, columns] = 1
        return encoded
class LrPipeline(BaseEstimator, TransformerMixin):
    """Pipeline step wrapping a ``LogisticRegression`` model.

    ``transform`` keeps returning class probabilities, as before. The class
    additionally exposes ``predict_proba`` and ``predict`` so that, when this
    is the last step of a scikit-learn ``Pipeline``, the pipeline's own
    ``predict_proba`` / ``predict`` become available.

    Parameters
    ----------
    other_params : dict
        Keyword arguments forwarded unchanged to ``LogisticRegression``.
    """

    def __init__(self, other_params):
        self.other_params = other_params

    def fit(self, X, y=None):
        """Train a fresh ``LogisticRegression`` on ``(X, y)``; returns ``self``."""
        lr = LogisticRegression(**self.other_params)
        lr.fit(X, y)
        # Only publish the model once training has succeeded.
        self.lr = lr
        return self

    def transform(self, X):
        """Return the fitted model's ``predict_proba(X)`` output."""
        return self.predict_proba(X)

    def predict_proba(self, X):
        """Return the fitted model's class probabilities for ``X``."""
        return self.lr.predict_proba(X)

    def predict(self, X):
        """Return the fitted model's predicted class labels for ``X``."""
        return self.lr.predict(X)
管道流实现fit与调用, X_train_tr为ndarray格式的数据
from sklearn.pipeline import Pipeline
# Keyword arguments handed to LGBMClassifier by the "lgb" step.
other_params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # use GBDT as the boosting algorithm
    'objective': 'binary',
    'metric': 'auc',  # evaluation metric
    'max_bin': 255,  # larger is more accurate but slower
    'learning_rate': 0.1,  # learning rate
    'num_leaves': 32,  # larger is more accurate but may overfit
    # Capping the depth can prevent overfitting on small datasets;
    # a value below 0 means no limit.
    'max_depth': -1,
    'feature_fraction': 0.8,  # guards against overfitting
    'bagging_freq': 5,  # guards against overfitting
    'bagging_fraction': 0.8,  # guards against overfitting
    'min_data_in_leaf': 21,  # guards against overfitting
    'min_sum_hessian_in_leaf': 3.0,  # guards against overfitting
    'min_child_weight': 0.1,
    'lambda_l1': 0.2,
    'lambda_l2': 20,
    'is_unbalance': True,
    'n_estimators': 100
}

# Keyword arguments handed to LogisticRegression by the "lr" step.
Lr_params = {
    'class_weight': {0: 1, 1: 8},
    'penalty': 'l2'
}

# LightGBM leaf indices -> one-hot matrix -> logistic regression.
# The encoder reserves one column per possible leaf of each tree, so it is
# sized from the same 'num_leaves' value the LightGBM step is given.
model_pipeline = Pipeline(
    steps=[
        ("lgb", LgbmPipeline(other_params)),
        ("matrix", GbdtMatrixPipeline(other_params['num_leaves'])),
        ("lr", LrPipeline(Lr_params)),
    ]
)

# X_train_tr / X_test_tr are expected to be ndarray data.
model_pipeline.fit(X_train_tr, y_train)
model_pipeline.transform(X_test_tr)
对pipeline进行网格调参步骤详见:https://sklearn.apachecn.org/docs/master/38.html