半自動構造新特徵
主要原理是通過 groupby(C1)[N1].agg(func) 的方式來創造新特徵(C1 爲分組列,N1 爲統計列,func 爲統計函數),實現特徵交叉
還需配合特徵篩選使用有更好效果
需要指定 gby_cols(分組列)、stati_cols(統計列)、func_list(統計函數)
prefix_list的特徵前綴可自行修改
簡單易懂,無多餘功能,網上沒看到類似函數,自行寫一個
# Semi-automatic feature construction
class FeatureCombination(object):
    """Derive group-statistic features that cross grouping and value columns.

    For every combination of grouping columns, every statistic column and
    every aggregation list, ``transform`` computes
    ``X.groupby(keys)[col].agg(funcs)`` and left-merges the per-group result
    back onto ``X``. Each new column is named
    ``<func>_by_<key1>-<key2>-...-<col>``.

    The grouping-column combinations are: each column on its own, every
    unordered pair, and the full set of grouping columns.

    Attributes:
        gby_cols: grouping column names.
        stati_cols: statistic columns, each wrapped in a one-element list.
        func_list: aggregation lists; each inner list holds function names
            accepted by ``DataFrameGroupBy.agg`` (e.g. ``['mean']``).
        prefix_list: names of the derived columns, in creation order.
    """

    def __init__(self, gby_cols=None, stati_cols=None, func_list=None):
        """Store the configuration; ``None`` selects the built-in defaults.

        Args:
            gby_cols: iterable of grouping column names.
            stati_cols: iterable of one-element lists (or plain names) of the
                columns to aggregate.
            func_list: iterable of lists (or plain names) of aggregation
                function names.
        """
        if gby_cols is None:
            gby_cols = [
                'channel_1_name', 'customer_province', 'city',
                'online_work_status', 'if_introduced', 'prepare_lessons',
            ]
        if stati_cols is None:
            stati_cols = [
                ['if_pay'], ['teacher_join_days'], ['pay_student_nums'],
                ['age'], ['course_nums'], ['mean_score'], ['mean_price'],
            ]
        if func_list is None:
            func_list = [['mean']]

        self.gby_cols = self._as_list(gby_cols)
        self.stati_cols = [self._as_list(cols) for cols in stati_cols]
        self.func_list = [self._as_list(funcs) for funcs in func_list]
        self.prefix_list = []

    @staticmethod
    def _as_list(value):
        """Return ``value`` as a new list, wrapping a bare string."""
        return [value] if isinstance(value, str) else list(value)

    @staticmethod
    def _key_combinations(gby_cols):
        """Return the sorted, de-duplicated grouping-key combinations."""
        from itertools import combinations

        cols = sorted(set(gby_cols))
        combos = {(col,) for col in cols}
        combos.update(combinations(cols, 2))
        if len(cols) > 1:
            # The original loop appended one growing list by reference, so
            # after de-duplication it contributed only the full column set.
            # Keep that combination so the generated columns are unchanged.
            combos.add(tuple(cols))
        # Tuples sort lexicographically, the same order np.unique produced
        # for the sorted lists on NumPy versions that accepted ragged input.
        return [list(combo) for combo in sorted(combos)]

    def transform(self, X):
        """Return ``X`` with every configured derived feature merged in.

        Args:
            X: input DataFrame containing all grouping and statistic columns.

        Returns:
            A new DataFrame: the columns of ``X`` followed by one derived
            column per (key combination, statistic column, function).
        """
        for gby_col in self._key_combinations(self.gby_cols):
            for stati_col in self.stati_cols:
                for func in self.func_list:
                    X = self.feature_combination(X, gby_col, stati_col, func)
        return X

    def feature_combination(self, X, gby_cols, stati_col, func_list):
        """Aggregate one statistic column per group and merge it onto ``X``.

        Args:
            X: input DataFrame.
            gby_cols: list of grouping column names.
            stati_col: one-element list holding the column to aggregate.
            func_list: list of aggregation function names.

        Returns:
            ``X`` left-merged on ``gby_cols`` with one new column per
            function, named ``<func>_by_<keys>-<stati_col>``.
        """
        gby_cols = self._as_list(gby_cols)
        stati_col = self._as_list(stati_col)
        func_list = self._as_list(func_list)

        grouped = (
            X[gby_cols + stati_col]
            .groupby(gby_cols)[stati_col]
            .agg(func_list)
            .reset_index()
        )
        # After agg() the columns are two-level (column, function); selecting
        # the statistic column leaves one sub-column per function.
        stat_values = grouped[','.join(stati_col)]

        prefix = '-'.join(gby_cols + stati_col)
        new_names = [func + '_by_' + prefix for func in func_list]
        # Record every derived name once, so multi-function aggregations are
        # tracked in full and repeated calls do not pile up duplicates.
        for name in new_names:
            if name not in self.prefix_list:
                self.prefix_list.append(name)

        # ignore_index flattens the two-level columns before renaming.
        res = pd.concat([grouped[gby_cols], stat_values], axis=1, ignore_index=True)
        res.columns = gby_cols + new_names
        return pd.merge(X, res, how='left', on=gby_cols)