半自动构造新特征
主要原理是通过 groupby(C1)[N1].agg(func) 的方式来创造新特征(C1 为分组列,N1 为统计列,func 为统计函数),实现特征交叉
配合特征筛选一起使用,效果更好
需要指定三项配置:gby_cols(分组列)、stati_cols(统计列)、func_list(统计函数)
prefix_list的特征前缀可自行修改
简单易懂,无多余功能,网上没看到类似函数,自行写一个
# Semi-automatic construction of new features.
class FeatureCombination(object):
    """Derive group-wise aggregate features (feature crossing) for a DataFrame.

    For every group-by column combination, every statistic column and every
    aggregation function, the class computes
    ``X.groupby(group_cols)[stat_col].agg(func)`` and left-merges the result
    back onto ``X`` as a new column named
    ``<func>_by_<g1>-<g2>-...-<stat_col>``.

    Attributes:
        gby_cols: Columns from which the group-by combinations are built.
        stati_cols: Columns to aggregate. Each entry is a column name or a
            one-element list holding a column name.
        func_list: Aggregations to apply. Each entry is a function
            name/callable or a list of them.
        prefix_list: Names of the derived feature columns. It is appended to
            on every call and never cleared by this class.
    """

    # Defaults reproduce the configuration that used to be hard-coded in
    # transform(), so FeatureCombination() behaves as before.
    DEFAULT_GBY_COLS = (
        'channel_1_name',
        'customer_province',
        'city',
        'online_work_status',
        'if_introduced',
        'prepare_lessons',
    )
    DEFAULT_STATI_COLS = (
        'if_pay',
        'teacher_join_days',
        'pay_student_nums',
        'age',
        'course_nums',
        'mean_score',
        'mean_price',
    )
    DEFAULT_FUNC_LIST = ('mean',)

    def __init__(self, gby_cols=None, stati_cols=None, func_list=None):
        """Store the configuration; ``None`` selects the class-level default.

        Args:
            gby_cols: Group-by column names (a single name is also accepted).
            stati_cols: Statistic columns; entries may be names or
                one-element lists of names.
            func_list: Aggregation functions; entries may be names/callables
                or lists of them.
        """
        self.gby_cols = self._as_list(
            self.DEFAULT_GBY_COLS if gby_cols is None else gby_cols)
        self.stati_cols = self._as_list(
            self.DEFAULT_STATI_COLS if stati_cols is None else stati_cols)
        self.func_list = self._as_list(
            self.DEFAULT_FUNC_LIST if func_list is None else func_list)
        self.prefix_list = []

    @staticmethod
    def _as_list(value):
        """Wrap a lone string/callable in a list; copy any other iterable."""
        if isinstance(value, str) or callable(value):
            return [value]
        return list(value)

    def _group_combinations(self):
        """Return the sorted, de-duplicated group-by column combinations.

        The result contains every single column, every unordered pair of
        columns and, when there is more than one column, the combination of
        all columns together. Each combination is sorted internally and the
        combinations are returned in lexicographic order.
        """
        cols = sorted(set(self.gby_cols))
        # Tuples are hashable, so a set removes duplicates without going
        # through NumPy (which rejects ragged nested lists).
        combos = {(col,) for col in cols}
        combos.update(
            (first, second)
            for pos, first in enumerate(cols)
            for second in cols[pos + 1:]
        )
        if len(cols) > 1:
            combos.add(tuple(cols))
        return [list(combo) for combo in sorted(combos)]

    def transform(self, X):
        """Add every configured group-wise aggregate feature to ``X``.

        Args:
            X: Input DataFrame containing the configured group-by and
                statistic columns.

        Returns:
            A new DataFrame: ``X`` plus one column per
            (combination, statistic column, function). It is produced by
            successive ``pd.merge`` calls on columns, so the original index
            of ``X`` is not carried over.
        """
        for gby_col in self._group_combinations():
            for stati_col in self.stati_cols:
                for func in self.func_list:
                    X = self.feature_combination(
                        X, gby_col, self._as_list(stati_col),
                        self._as_list(func))
        return X

    def feature_combination(self, X, gby_cols, stati_col, func_list):
        """Aggregate one statistic column per group and merge it onto ``X``.

        Args:
            X: Input DataFrame.
            gby_cols: List of columns to group by.
            stati_col: The statistic column, as a name or a one-element list.
            func_list: Aggregation function(s) to apply, as a name/callable
                or a list of them.

        Returns:
            ``X`` left-merged with one new column per aggregation function,
            named ``<func>_by_<gby_cols joined by '-'>-<stati_col>``.

        Raises:
            ValueError: If ``stati_col`` does not hold exactly one column.
        """
        gby_cols = self._as_list(gby_cols)
        stati_col = self._as_list(stati_col)
        func_list = self._as_list(func_list)
        if len(stati_col) != 1:
            raise ValueError(
                'stati_col must contain exactly one column name, got %r'
                % (stati_col,))

        prefix = "-".join(gby_cols + stati_col)
        feature_names = [
            (func if isinstance(func, str)
             else getattr(func, '__name__', str(func))) + '_by_' + prefix
            for func in func_list
        ]

        # Aggregating the selected Series with a list of functions yields one
        # output column per function, in the same order as func_list.
        stats = X.groupby(gby_cols)[stati_col[0]].agg(func_list)
        stats.columns = feature_names
        stats = stats.reset_index()

        # Remember every derived feature name, not only the first one.
        self.prefix_list.extend(feature_names)
        return pd.merge(X, stats, how='left', on=gby_cols)