# FREQUENCY ENCODE TOGETHER
def encode_FE(df1, df2, cols):
    """Add a frequency-encoded ``<col>_FE`` column to both frames, in place.

    For every name in ``cols`` the relative frequency of each value is
    computed over ``df1[col]`` and ``df2[col]`` stacked together, so both
    frames share one mapping. Missing values are not counted
    (``dropna=True``) and therefore stay NaN in the new column.

    Args:
        df1: First DataFrame; modified in place.
        df2: Second DataFrame; modified in place.
        cols: Iterable of column names present in both frames.

    Returns:
        None. The name of each new column is printed as progress output.
    """
    for column in cols:
        stacked = pd.concat([df1[column], df2[column]])
        frequencies = stacked.value_counts(dropna=True, normalize=True).to_dict()
        # The value -1 is pinned to -1 instead of its real frequency
        # (presumably -1 is the dataset's missing-value sentinel).
        frequencies[-1] = -1
        feature = column + '_FE'
        for frame in (df1, df2):
            frame[feature] = frame[column].map(frequencies).astype('float32')
        print(feature, ', ', end='')
# LABEL ENCODE
def encode_LE(col, train=X_train, test=X_test, verbose=True):
    """Label-encode ``col`` jointly over ``train`` and ``test``, in place.

    The column from both frames is stacked and factorized with sorted
    uniques, so equal values receive the same integer code in both frames.
    The original column is overwritten with the codes.

    Args:
        col: Name of the column to encode; must exist in both frames.
        train: Frame whose rows come first in the stacked data.
        test: Frame whose rows come second in the stacked data.
        verbose: When true, print the column name as progress output.

    Returns:
        None.
    """
    n_train = len(train)
    codes, _ = pd.concat([train[col], test[col]], axis=0).factorize(sort=True)
    # Use the narrow integer type unless the largest code exceeds 32000,
    # a threshold safely below the int16 maximum.
    code_dtype = 'int32' if codes.max() > 32000 else 'int16'
    train[col] = codes[:n_train].astype(code_dtype)
    test[col] = codes[n_train:].astype(code_dtype)
    # Drop the stacked codes and collect right away to release the memory.
    del codes
    gc.collect()
    if verbose:
        print(col, ', ', end='')
# GROUP AGGREGATION MEAN AND STD
# https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda
def encode_AG(main_columns, uids, aggregations=['mean'], train_df=X_train, test_df=X_test,
              fillna=True, usena=False):
    """Add group-aggregation features to both frames, in place.

    For every combination of ``main_column``, ``col`` (the grouping key from
    ``uids``) and ``agg_type``, the statistic of ``main_column`` per ``col``
    group is computed over train and test stacked together. It is then
    mapped back onto each frame as a float32 column named
    ``<main_column>_<col>_<agg_type>``.

    Args:
        main_columns: Iterable of column names to aggregate.
        uids: Iterable of column names to group by.
        aggregations: Aggregation names accepted by pandas ``agg`` (strings,
            because they are embedded in the new column name). The default
            list is only read, never mutated.
        train_df: First frame; modified in place.
        test_df: Second frame; modified in place.
        fillna: When true, NaN results in the new column are replaced by -1.
        usena: When true, values equal to -1 in ``main_column`` are treated
            as missing (NaN) before aggregating.

    Returns:
        None. The name of each new column is printed as progress output.
    """
    for main_column in main_columns:
        for col in uids:
            # The stacked data depends only on (col, main_column), so build
            # it once and reuse it for every requested statistic.
            combined = pd.concat([train_df[[col, main_column]],
                                  test_df[[col, main_column]]])
            if usena:
                # mask() upcasts integer columns to float as needed, whereas
                # assigning NaN through .loc fails on integer dtypes in
                # recent pandas versions.
                combined[main_column] = combined[main_column].mask(
                    combined[main_column] == -1)
            grouped = combined.groupby(col)[main_column]

            for agg_type in aggregations:
                new_col_name = main_column + '_' + col + '_' + agg_type
                mapping = grouped.agg(agg_type).to_dict()

                for frame in (train_df, test_df):
                    encoded = frame[col].map(mapping).astype('float32')
                    if fillna:
                        # Fill before assigning: a chained
                        # frame[name].fillna(..., inplace=True) does not
                        # update the frame under pandas Copy-on-Write.
                        encoded = encoded.fillna(-1)
                    frame[new_col_name] = encoded

                print("'"+new_col_name+"'", ', ', end='')
# COMBINE FEATURES
def encode_CB(col1, col2, df1=X_train, df2=X_test):
    """Combine two columns into one label-encoded feature, in place.

    A new column named ``<col1>_<col2>`` is created in both frames by joining
    the string forms of the two source columns with an underscore. It is then
    label-encoded jointly across both frames via ``encode_LE``.

    Args:
        col1: Name of the first source column.
        col2: Name of the second source column.
        df1: First frame; modified in place.
        df2: Second frame; modified in place.

    Returns:
        None. The name of the new column is printed as progress output.
    """
    nm = col1 + '_' + col2
    for frame in (df1, df2):
        frame[nm] = frame[col1].astype(str) + '_' + frame[col2].astype(str)
    # Forward the frames explicitly: otherwise encode_LE falls back to its own
    # defaults and encodes the wrong data whenever the caller passes df1/df2.
    encode_LE(nm, train=df1, test=df2, verbose=False)
    print(nm, ', ', end='')
# GROUP AGGREGATION NUNIQUE
def encode_AG2(main_columns, uids, train_df=X_train, test_df=X_test):
    """Add per-group distinct-value counts to both frames, in place.

    For every pair of ``main_column`` and grouping key ``col`` (from
    ``uids``), the number of unique ``main_column`` values per ``col`` group
    is computed over train and test stacked together. The count is then
    mapped back onto each frame as a float32 column named
    ``<col>_<main_column>_ct``.

    Args:
        main_columns: Iterable of column names whose distinct values are counted.
        uids: Iterable of column names to group by.
        train_df: First frame; modified in place.
        test_df: Second frame; modified in place.

    Returns:
        None. The name of each new column is printed as progress output.
    """
    for main_column in main_columns:
        for col in uids:
            pair = [col, main_column]
            stacked = pd.concat([train_df[pair], test_df[pair]], axis=0)
            distinct_counts = stacked.groupby(col)[main_column].nunique().to_dict()
            feature = col + '_' + main_column + '_ct'
            for frame in (train_df, test_df):
                frame[feature] = frame[col].map(distinct_counts).astype('float32')
            print(col + '_' + main_column + '_ct, ', end='')
# https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600