在 SQL 中比較容易處理類似「近 n 個月金額之和/最大值/最小值/平均值」這樣的變量,使用 sum(case when date then amount else 0 end) 即可。但如果出差在外、只能處理離線數據而不能使用數據庫,這個時候就要用 Python 去構造時間切片類的特徵。整理了自己之前寫過的代碼後,發現以前的寫法往往太笨拙、太重複。
import pandas as pd
import numpy as np
import time as time
# Build a small demo data set with five rows and two distinct ids.
# `billdate` and `dt` are 'YYYY-MM-DD' strings (presumably the bill date and
# the observation date -- confirm); `amount` and `interest` are random values.
data = pd.DataFrame(
    dict(
        id=['a', 'a', 'b', 'b', 'a'],
        billdate=['2018-09-01', '2018-08-01', '2018-08-01', '2018-01-01', '2018-11-01'],
        amount=list(np.random.randint(1, 100, 5)),
        interest=np.random.rand(5),
        dt=['2018-10-01', '2018-10-01', '2018-11-01', '2018-11-01', '2018-10-01'],
    )
)
# Notebook-style echo of the frame; has no effect when run as a script.
data
# First, compute the time difference.
def month_sub(d1, d2):
    """Return the number of calendar months from ``d2`` to ``d1``.

    Both arguments are sliced as text: characters 0-3 are read as the year
    and characters 5-6 as the month (e.g. ``'YYYY-MM-DD'``). Anything after
    the month is ignored, so only the year and month take part in the result.
    The result is negative when ``d1`` falls in an earlier month than ``d2``.

    Raises:
        ValueError: If a year or month slice is not an integer literal.
    """
    year_1, year_2 = int(d1[:4]), int(d2[:4])
    month_1, month_2 = int(d1[5:7]), int(d2[5:7])
    return 12 * (year_1 - year_2) + (month_1 - month_2)
# Month gap between each row's `dt` and its `billdate`, stored as a new column.
data['month_diff'] = [
    month_sub(dt_value, bill_value)
    for dt_value, bill_value in zip(data['dt'], data['billdate'])
]
def feature(data, cols: list, months: list, only_mark, merge_ori=True):
    """Build per-key aggregate features over month windows.

    For every window ``m`` in ``months`` and every column ``c`` in ``cols``,
    the rows of ``data`` whose ``month_diff`` is ``<= m`` are grouped by
    ``only_mark`` and summarised with count, sum, max, min, mean, var, std,
    median and skew. Each statistic is stored in a column named
    ``last_<m>_<c>_<stat>``.

    Args:
        data: DataFrame containing ``only_mark``, every column listed in
            ``cols`` and a ``month_diff`` column.
        cols: Names of the columns to aggregate.
        months: Window sizes that ``month_diff`` is compared against.
        only_mark: Name of the key column to group by.
        merge_ori: If truthy, right-merge the features onto ``data`` so the
            result has one row per input row; otherwise the result has one
            row per distinct key.

    Returns:
        DataFrame with the feature columns; missing values are replaced by 0.
    """
    stat_names = ["count", "sum", "max", "min", "mean", "var", "std", "median", "skew"]
    start = time.perf_counter()

    # One row per distinct key, in first-appearance order, so the output row
    # order is reproducible (a set of strings iterates in arbitrary order).
    df = data[[only_mark]].drop_duplicates().reset_index(drop=True)

    for month in months:
        # Select the rows inside the window once per window, not once per column.
        # NOTE(review): rows with a negative month_diff also satisfy this
        # filter -- confirm that including them is intended.
        grouped = data[data["month_diff"] <= month].groupby(only_mark)
        for col in cols:
            # Passing a {new_name: func} dict to SeriesGroupBy.agg is rejected
            # by pandas >= 1.0, so aggregate with a plain list of functions
            # and prefix the resulting columns instead.
            stats = grouped[col].agg(stat_names).add_prefix("last_%s_%s_" % (month, col))
            df = df.merge(stats.reset_index(), how="left", on=only_mark)

    # Optionally attach the features to the original rows.
    if merge_ori:
        print("merge the original data")
        # Whether filling with 0 is appropriate depends on the use case; note
        # that it also fills missing values in the original columns.
        df = df.merge(data, how="right", on=only_mark).fillna(0)
    else:
        df = df.fillna(0)

    cost = time.perf_counter() - start
    print("cost time %.2f s" % (cost))
    return df
if __name__ == "__main__":
    # Demo run: aggregate both numeric columns over the 1- and 3-month
    # windows, keyed by "id", without merging back onto the original rows.
    cols = ['amount', 'interest']
    months = [1, 3]
    result = feature(data, cols, months, "id", merge_ori=False)