Unified hyperparameter tuning of multiple models with an sklearn Pipeline


Problem: while reproducing the classic GBDT+LR architecture, I found that the two models need to be tuned together. I could not find related code online, so after some research I implemented unified tuning for LGB + LR.

Three custom pipeline classes are needed: two wrap the models for prediction, and one transforms the GBDT leaf predictions into features for the downstream LR.

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

class LgbmPipeline(BaseEstimator, TransformerMixin):
    # Takes the LGBM model parameters as a dict; see the usage below for details
    def __init__(self, other_params):
        self.other_params = other_params

    def fit(self, X, y=None):
        lgb = LGBMClassifier(**self.other_params)
        lgb.fit(X, y)
        self.lgb = lgb
        return self

    def transform(self, X):
        lgb_feature = self.lgb.predict(X, pred_leaf=True)
        return lgb_feature
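
A quick sanity check of what this step produces (the toy data below is an assumption, not from the original post): with pred_leaf=True, LightGBM returns the index of the leaf each sample falls into for every tree, i.e. an array of shape (n_samples, n_estimators) for binary classification.

import numpy as np

# Hypothetical toy data, only to inspect the output shape
X_toy = np.random.rand(500, 10)
y_toy = np.random.randint(0, 2, 500)

step = LgbmPipeline({'objective': 'binary', 'n_estimators': 100, 'num_leaves': 32})
step.fit(X_toy, y_toy)
leaves = step.transform(X_toy)
print(leaves.shape)  # (500, 100): one leaf index per tree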

class GbdtMatrixPipeline(BaseEstimator, TransformerMixin):
    # Takes the number of leaves per tree (num_leaves of the LGBM model)
    def __init__(self, num_leaf):
        self.num_leaf = num_leaf

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X is an (n_samples, n_trees) array of leaf indices from the LGBM step
        transformed_matrix = np.zeros([len(X), len(X[0]) * self.num_leaf],
                            dtype=np.int8)  # n_samples * (num_trees * num_leaf)

        for i in range(0, len(X)):
            # Position of each one-hot entry within the num_trees * num_leaf columns:
            # arange(num_trees) * num_leaf + the leaf index of each tree for sample i
            temp = np.arange(len(X[0])) * self.num_leaf + np.array(X[i])
            transformed_matrix[i][temp] += 1

        return transformed_matrix
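
A minimal example of what this transform does, using assumed toy numbers (2 samples, 3 trees, 4 leaves per tree) rather than the real data:

import numpy as np

# Each row holds the leaf index each tree routed the sample to (assumed values)
leaf_idx = np.array([[0, 2, 3],
                     [1, 1, 0]])

encoder = GbdtMatrixPipeline(num_leaf=4)
onehot = encoder.transform(leaf_idx)
print(onehot.shape)  # (2, 12): n_samples x (num_trees * num_leaf)
print(onehot[0])     # [1 0 0 0 0 0 1 0 0 0 0 1]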

class LrPipeline(BaseEstimator, TransformerMixin):
    def __init__(self, other_params):
        self.other_params = other_params

    def fit(self, X, y=None):
        lr = LogisticRegression(**self.other_params)
        lr.fit(X, y)
        self.lr = lr
        return self

    def transform(self, X):
        # Returning predict_proba here lets the whole pipeline emit class
        # probabilities via model_pipeline.transform()
        y_pred_scores = self.lr.predict_proba(X)
        return y_pred_scores

Fitting and calling the pipeline (X_train_tr is data in ndarray format):

from sklearn.pipeline import Pipeline
other_params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # GBDT as the base algorithm
    'objective': 'binary',
    'metric': 'auc',  # evaluation metric
    'max_bin': 255,  # larger values are more accurate but slower
    'learning_rate': 0.1,  # learning rate
    'num_leaves': 32,  # larger values are more accurate but may overfit
    'max_depth': -1,  # limiting depth on small datasets helps avoid overfitting; < 0 means no limit
    'feature_fraction': 0.8,  # helps prevent overfitting
    'bagging_freq': 5,  # helps prevent overfitting
    'bagging_fraction': 0.8,  # helps prevent overfitting
    'min_data_in_leaf': 21,  # helps prevent overfitting
    'min_sum_hessian_in_leaf': 3.0,  # helps prevent overfitting
    'min_child_weight': 0.1,
    'lambda_l1': 0.2,
    'lambda_l2': 20,
    'is_unbalance': True,
    'n_estimators': 100
}

Lr_params = {
    'class_weight': {0: 1, 1: 8},
    'penalty': 'l2'
}
model_pipeline = Pipeline([
    ("lgb", LgbmPipeline(other_params)),
    ("matrix", GbdtMatrixPipeline(other_params['num_leaves'])),
    ("lr", LrPipeline(Lr_params))
])
# X_train_tr is data in ndarray format
model_pipeline.fit(X_train_tr, y_train)
model_pipeline.transform(X_test_tr)

For the steps to grid-search the pipeline, see: https://sklearn.apachecn.org/docs/master/38.html
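
As a sketch of how the unified tuning can then be done (the parameter values and the AUC scorer below are my own assumptions, not from the original post): because every step inherits get_params/set_params from BaseEstimator, GridSearchCV can swap the whole other_params dict of both the LGB step and the LR step in a single grid. Since the final step is a transformer that returns predict_proba scores, scoring goes through transform() via a custom scorer.

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Custom scorer: the pipeline ends in a transformer, so take the probabilities
# from transform() and score them with AUC
def auc_scorer(estimator, X, y):
    y_pred_scores = estimator.transform(X)
    return roc_auc_score(y, y_pred_scores[:, 1])

# Candidate parameter dicts for both models (illustrative values only);
# num_leaves is kept fixed so it stays consistent with the "matrix" step
param_grid = {
    'lgb__other_params': [
        {**other_params, 'learning_rate': 0.05},
        {**other_params, 'learning_rate': 0.1},
    ],
    'lr__other_params': [
        {'penalty': 'l2', 'C': 1.0, 'class_weight': {0: 1, 1: 8}},
        {'penalty': 'l2', 'C': 0.1, 'class_weight': {0: 1, 1: 8}},
    ],
}

grid_search = GridSearchCV(model_pipeline, param_grid, scoring=auc_scorer, cv=3)
grid_search.fit(X_train_tr, y_train)
print(grid_search.best_params_)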
