Sklearn:天池新人實戰賽o2o優惠券使用預測 part2

日萌社

人工智能AI:Keras PyTorch MXNet TensorFlow PaddlePaddle 深度學習實戰(不定時更新)


阿里雲官網:天池新人實戰賽o2o優惠券使用預測

數據集下載鏈接:https://pan.baidu.com/s/13OtaUv6j4x8dD7cgD4sL5g 
提取碼:7tze 


Sklearn:天池新人實戰賽o2o優惠券使用預測 part1

Sklearn:天池新人實戰賽o2o優惠券使用預測 part2

Sklearn:天池新人實戰賽o2o優惠券使用預測 part3


特徵處理

In [1]:

import pandas as pd
import numpy as np
from datetime import date
import datetime as dt

獲取數據

In [2]:

off_train = pd.read_csv('./data/ccf_offline_stage1_train.csv', keep_default_na=False, header=0)
off_test = pd.read_csv('./data/ccf_offline_stage1_test_revised.csv',keep_default_na=False, header=0)
on_train = pd.read_csv('./data/ccf_online_stage1_train.csv', keep_default_na=False, header=0)

In [3]:

# 更換特徵名稱
off_train.columns=['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']
off_test.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received']
on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date']

數據分批處理

#將數據分爲3個數據集 利用滑窗法

#將2016年1月1日到4月13日的數據提取特徵,利用4月14日的到5月14日的作爲測試集

#將2月1日到5月14日的作爲數據集提取特徵,利用5月15日6月15日的作爲測試集

#將3月15日到6月30日作爲數據集提取特徵,再測試7月1日到7月31日的數據

In [4]:

# 將2016年1月1日到4月13日的數據提取特徵
feature1 = off_train[(off_train.date >= '20160101') & (off_train.date <= '20160413') | (
    (off_train.date == 'null') & (off_train.date_received >= '20160101') & (off_train.date_received <= '20160413'))]
# 利用4月14日的到5月14日的作爲測試集
dataset1 = off_train[(off_train.date_received >= '201604014')
                     & (off_train.date_received <= '20160514')]

In [5]:

# 在2月1日到5月14日之間使用了券,只要領取時間在2月1日到5月14日之間,幷包括沒有數據中沒有領取券的
feature2 = off_train[(off_train.date >= '20160201') & (off_train.date <= '20160514') | (
    (off_train.date == 'null') & (off_train.date_received >= '20160201') & (off_train.date_received <= '20160514'))]
# 提取數據集2的測試集
dataset2 = off_train[(off_train.date_received >= '20160515')
                     & (off_train.date_received <= '20160615')]

In [6]:

# 數據集3的特徵爲 取數據中領券和用券日期大於3月15日和小於6月30日的
feature3 = off_train[((off_train.date >= '20160315') & (off_train.date <= '20160630')) | (
    (off_train.date == 'null') & (off_train.date_received >= '20160315') & (off_train.date_received <= '20160630'))]
# 使數據集3等於test集 沒有label標籤
dataset3 = off_test

特徵工程

用戶特徵處理

In [7]:

def is_firstlastone(x):
    if x == 0:
        return 1
    elif x > 0:
        return 0
    else:
        return -1  # 表明這個優惠券只接受了一次
def get_day_gap_before(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        # 將時間差轉化爲天數
        this_gap = (dt.date(int(date_received[0:4]), int(date_received[4:6]), int(
            date_received[6:8]))-dt.date(int(d[0:4]), int(d[4:6]), int(d[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)
def get_day_gap_after(s):
    date_received, dates = s.split('-')
    dates = dates.split(':')
    gaps = []
    for d in dates:
        this_gap = (dt.datetime(int(d[0:4]), int(d[4:6]), int(d[6:8]))-dt.datetime(
            int(date_received[0:4]), int(date_received[4:6]), int(date_received[6:8]))).days
        if this_gap > 0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)

In [8]:

def GetOtherFeature(dataset):
    # 對於測試集,提取用戶的Id
    dataset3 = dataset
    t = dataset3[['user_id']]
    # 相當於給原有數據加上一列,這個月用戶收取的所有優惠券數目,並初始化爲1
    t['this_month_user_receive_all_coupon_count'] = 1
    # 將t按照用戶id進行分組,然後統計所有用戶收取的優惠券數目,並初始化一個索引值
    t = t.groupby('user_id').agg('sum').reset_index()
    # 提取數據集的優惠券Id和用戶Id
    t1 = dataset3[['user_id', 'coupon_id']]
    # 提取這個月用戶收到的相同的優惠券的數量
    t1['this_month_user_receive_same_coupn_count'] = 1
    t1 = t1.groupby(['user_id', 'coupon_id']).agg('sum').reset_index()
    # 提取數據集的用戶id,優惠券id以及優惠券接收的時間
    t2 = dataset3[['user_id', 'coupon_id', 'date_received']]
    # 將數據轉換爲str類型
    t2.date_received = t2.date_received.astype('str')
    # 如果出現相同的用戶接收相同的優惠券在接收時間上用‘:’連接上第n次接受優惠券的時間
    t2 = t2.groupby(['user_id', 'coupon_id'])['date_received'].agg(
        lambda x: ':'.join(x)).reset_index()
    # 將接收時間的一組按着':'分開,這樣就可以計算接受了優惠券的數量,apply是合併
    t2['receive_number'] = t2.date_received.apply(lambda s: len(s.split(':')))
    t2 = t2[t2.receive_number > 1]
    # 最大接受的日期
    t2['max_date_received'] = t2.date_received.apply(
        lambda s: max([int(d) for d in s.split(':')]))
    # 最小的接收日期
    t2['min_date_received'] = t2.date_received.apply(
        lambda s: min([int(d) for d in s.split(':')]))
    t2 = t2[['user_id', 'coupon_id', 'max_date_received', 'min_date_received']]
    t3 = dataset3[['user_id', 'coupon_id', 'date_received']]
    # 將兩表融合只保留左表數據,這樣得到的表,相當於保留了最近接收時間和最遠接受時間
    t3 = pd.merge(t3, t2, on=['user_id', 'coupon_id'], how='left')
    # 這個優惠券最近接受時間
    t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - \
        t3.date_received.astype(int)
    # 這個優惠券最遠接受時間
    t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype(
        int)-t3.min_date_received
    t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(
        is_firstlastone)
    t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(
        is_firstlastone)
    t3 = t3[['user_id', 'coupon_id', 'date_received', 'this_month_user_receive_same_coupon_lastone',
             'this_month_user_receive_same_coupon_firstone']]
    # 將表格中接收優惠券日期中爲最近和最遠的日期時置爲1其餘爲0,若只接受了一次優惠券爲-1
    # 提取第四個特徵,一個用戶所接收到的所有優惠券的數量
    t4 = dataset3[['user_id', 'date_received']]
    t4['this_day_receive_all_coupon_count'] = 1
    t4 = t4.groupby(['user_id', 'date_received']).agg('sum').reset_index()
    # 提取第五個特徵,一個用戶不同時間所接收到不同優惠券的數量
    t5 = dataset3[['user_id', 'coupon_id', 'date_received']]
    t5['this_day_user_receive_same_coupon_count'] = 1
    t5 = t5.groupby(['user_id', 'coupon_id', 'date_received']
                    ).agg('sum').reset_index()
    # 一個用戶不同優惠券 的接受時間
    t6 = dataset3[['user_id', 'coupon_id', 'date_received']]
    t6.date_received = t6.date_received.astype('str')
    t6 = t6.groupby(['user_id', 'coupon_id'])['date_received'].agg(
        lambda x: ':'.join(x)).reset_index()
    # 重命名inplace代表深拷貝
    t6.rename(columns={'date_received': 'dates'}, inplace=True)
    t7 = dataset3[['user_id', 'coupon_id', 'date_received']]
    # 將t6和t7融合
    t7 = pd.merge(t7, t6, on=['user_id', 'coupon_id'], how='left')
    # 注意這裏所有的時間格式都已經是'str'格式
    t7['date_received_date'] = t7.date_received.astype('str')+'-'+t7.dates
    # print(t7)
    t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
    t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
    t7 = t7[['user_id', 'coupon_id', 'date_received',
             'day_gap_before', 'day_gap_after']]
    # 將所有特徵融合在一張表中
    other_feature3 = pd.merge(t1, t, on='user_id')
    other_feature3 = pd.merge(other_feature3, t3, on=['user_id', 'coupon_id'])
    other_feature3 = pd.merge(other_feature3, t4, on=['user_id', 'date_received'])
    other_feature3 = pd.merge(other_feature3, t5, on=[
                              'user_id', 'coupon_id', 'date_received'])
    other_feature3 = pd.merge(other_feature3, t7, on=[
                              'user_id', 'coupon_id', 'date_received'])
    return other_feature3

提取優惠券的相關特徵

In [9]:

def calc_discount_rate(s):
    s = str(s)
    s = s.split(':')
    if len(s) == 1:
        return float(s[0])
    else:
        return 1.0-float(s[1])/float(s[0])
def get_discount_man(s):
    s = str(s)
    s = s.split(':')
    if len(s)==1:
        return 'null'
    else:
        return int(s[0])
def get_discount_jian(s):
    s = str(s)
    s = s.split(':')
    if len(s) == 1:
        return 'null'
    else:
        return int(s[1])
def is_man_jian(s):
    s = str(s)
    s = s.split(':')
    if len(s)==1:
        return 0
    else:
        return 1 

In [10]:

def GetCouponFeature(dataset, feature):
    # 對於數據集
    # 將時間轉化爲第幾周
    # 顯示時間是第幾周
    # tt是獲取到的特徵中消費的最大時間
    dataset3 = dataset
    tt = feature[feature.date != 'null'].date.unique().max()
    dataset3['day_of_week'] = dataset3.date_received.astype('str').apply(
        lambda x: date(int(x[0:4]), int(x[4:6]), int(x[6:8])).weekday()+1)
    # 顯示時間是幾月
    dataset3['day_of_month'] = dataset3.date_received.astype(
        'str').apply(lambda x: int(x[6:8]))
    # 顯示時期和截止日之間的天數
    dataset3['days_distance'] = dataset3.date_received.astype('str').apply(
        lambda x: (date(int(x[0:4]), int(x[4:6]), int(x[6:8]))-date(int(tt[0:4]), int(tt[4:6]), int(tt[6:8]))).days)
    # 顯示滿了多少錢後開始減
    dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
    # 顯示滿減的減少的錢
    dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian)
    # 返回優惠券是否是滿減券
    dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian)
    # 顯示打折力度
    dataset3['discount_rate'] = dataset3.discount_rate.apply(
        calc_discount_rate)
    d = dataset3[['coupon_id']]
    d['coupon_count'] = 1
    # 顯示每一種優惠券的數量
    d = d.groupby('coupon_id').agg('sum').reset_index()
    dataset3 = pd.merge(dataset3, d, on='coupon_id', how='left')
    return dataset3

提取商品的特徵

In [11]:

def GetMerchantFeature(feature):
    #提取商品的特徵
    #對於數據集
    feature3 = feature
    merchant3 = feature3[['merchant_id','coupon_id','distance','date_received','date']]
    t = merchant3[['merchant_id']]
    #刪除重複行數據
    t.drop_duplicates(inplace=True)
    #顯示賣出的商品
    t1 = merchant3[merchant3.date!='null'][['merchant_id']]
    t1['total_sales'] = 1
    #顯示每個商品的銷售數量
    t1 = t1.groupby('merchant_id').agg('sum').reset_index()
    #顯示使用了優惠券消費的商品,正樣本
    t2 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id']]
    t2['sales_use_coupon'] = 1
    t2 = t2.groupby('merchant_id').agg('sum').reset_index()
    #顯示了商品的優惠券的總數量
    t3 = merchant3[merchant3.coupon_id != 'null'][['merchant_id']]
    t3 ['total_coupon'] = 1
    t3 = t3.groupby('merchant_id').agg('sum').reset_index()
    #顯示商品銷量和距離的關係
    t4 = merchant3[(merchant3.date != 'null')&(merchant3.coupon_id != 'null')][['merchant_id','distance']]
    #把數據中的null值全部替換爲-1
    t4.replace('null',-1,inplace=True)
    t4.distance = t4.distance.astype('int')
    #再把數據中的-1全部替換爲NaN
    t4.replace(-1,np.nan,inplace=True)
    #返回用戶離商品的距離最小值
    t5 = t4.groupby('merchant_id').agg('min').reset_index()
    t5.rename(columns={'distance':'merchant_min_distance'},inplace = True)
    #返回用戶離商品的距離最大值
    t6 = t4.groupby('merchant_id').agg('max').reset_index()
    t6.rename(columns={'distance':'merchant_max_distance'},inplace = True)
    #print(t6)
    #返回距離的平均值
    t7 = t4.groupby('merchant_id').agg('mean').reset_index()
    t7.rename(columns = {'distance':'merchant_mean_distance'},inplace= True)
    #返回距離的中位值
    t8 = t4.groupby('merchant_id').agg('median').reset_index()
    t8.rename(columns={'distance':'merchant_median_distance'},inplace = True)
    merchant3_feature = pd.merge(t,t1,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t2,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t3,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t5,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t6,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t7,on='merchant_id',how='left')
    merchant3_feature = pd.merge(merchant3_feature,t8,on='merchant_id',how='left')
    #將數據中的NaN用0來替換
    merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(np.nan,0)
    #即優惠券的使用率
    merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon.astype('float')/merchant3_feature.total_coupon
    #即賣出商品中使用優惠券的佔比
    merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales
    #將數據中的NaN用0來替換
    merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(np.nan,0)
    return merchant3_feature

用戶的相關信息

In [12]:

def get_user_date_datereceived_gap(s):
    s = s.split(':')
    return (date(int(s[0][0:4]), int(s[0][4:6]), int(s[0][6:8])) - date(int(s[1][0:4]), int(s[1][4:6]), int(s[1][6:8]))).days

In [13]:

def GetUserRelateInfo(feature):
    feature3 = feature
    user3 = feature3[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
    t = user3[['user_id']]
    #去掉數據中重複的用戶Id
    t.drop_duplicates(inplace=True)
    #用戶購買商品的種類數
    t1 = user3[user3.date!='null'][['user_id','merchant_id']]
    #同樣去掉重複用的用戶id和商品id
    t1.drop_duplicates(inplace=True)
    t1.merchant_id = 1
    t1 = t1.groupby('user_id').agg('sum').reset_index()
    t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
    #使用了優惠券購買商品的用戶id和距離
    t2 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id','distance']]
    #將null值替換爲-1
    t2.replace('null',-1,inplace=True)
    t2.distance = t2.distance.astype('int')#轉換數據類型爲int
    t2.replace(-1,np.nan,inplace=True)
    #得到使用優惠券購買商品的用戶離店鋪的最短距離
    t3 = t2.groupby('user_id').agg('min').reset_index()
    t3.rename(columns={'distance':'user_min_distance'},inplace=True)
    #得到最大距離
    t4 = t2.groupby('user_id').agg('max').reset_index()
    t4.rename(columns={'distance':'user_max_distance'},inplace=True)
    #得到平均距離
    t5 = t2.groupby('user_id').agg('mean').reset_index()
    t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
    #得到中間距離
    t6 = t2.groupby('user_id').agg('median').reset_index()
    t6.rename(columns={'distance':'user_median_distance'},inplace=True)
    #每個用戶使用優惠券購買的物品數量
    t7 = user3[(user3.date != 'null')&(user3.coupon_id != 'null')][['user_id']]
    t7['buy_use_coupon'] = 1
    t7 = t7.groupby('user_id').agg('sum').reset_index()
    #購買物品的總數
    t8 = user3[user3.date != 'null'][['user_id']]
    t8['buy_total'] = 1
    t8 = t8.groupby('user_id').agg('sum').reset_index()
    #接受的優惠券的總數
    t9 = user3[user3.coupon_id != 'null'][['user_id']]
    t9['coupon_received'] = 1
    t9 = t9.groupby('user_id').agg('sum').reset_index()
    #接受到優惠券的日期和使用之間的間隔
    t10 = user3[(user3.date_received != 'null')&(user3.date != 'null')][['user_id','date_received','date']]
    t10['user_date_datereceived_gap'] = t10.date + ':'+ t10.date_received
    t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
    t10 = t10[['user_id','user_date_datereceived_gap']]
    #將用戶優惠券使用時間的間隔取平均數
    t11 = t10.groupby('user_id').agg('mean').reset_index()
    t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
    #間隔天數的最小值
    t12 = t10.groupby('user_id').agg('min').reset_index()
    t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
    #間隔天數的最大值
    t13 = t10.groupby('user_id').agg('max').reset_index()
    t13.rename(columns={'user_date_datareceived_gap':'max_user_date_datereceived_gap'},inplace=True)
    #將提取的特徵合併
    user3_feature = pd.merge(t,t1,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t3,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t4,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t5,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t6,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t7,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t8,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t9,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t11,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t12,on='user_id',how='left')
    user3_feature = pd.merge(user3_feature,t13,on='user_id',how='left')
    user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan,0)
    user3_feature.buy_user_coupon = user3_feature.buy_use_coupon.replace(np.nan,0)
    user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.buy_total.astype('float')#使用優惠券購買的商品佔總數的多少
    user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.coupon_received.astype('float')
    user3_feature.buy_total = user3_feature.buy_total.replace(np.nan,0)#將數據中的NaN值轉爲0
    user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan,0)
    return user3_feature

用戶和商家之間的特徵關係

In [14]:

def GetUserMerchantRelateInfo(feature):
    #4.user_merchant:
    #times_user_buy_merchant_before. 
    feature3 = feature
    all_user_merchant = feature3[['user_id','merchant_id']]
    all_user_merchant.drop_duplicates(inplace=True)
    #只保留銷售了商品的商戶id
    t = feature3[['user_id','merchant_id','date']]
    t = t[t.date!='null'][['user_id','merchant_id']]
    #用戶一共買了這家商戶的多少商品
    t['user_merchant_buy_total'] = 1
    t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t.drop_duplicates(inplace=True)
    t1 = feature3[['user_id','merchant_id','coupon_id']]
    t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
    #用戶一共收到一個商戶的多少優惠券
    t1['user_merchant_received'] = 1
    t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t1.drop_duplicates(inplace = True)
    t2 = feature3[['user_id','merchant_id','date','date_received']]
    t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
    #用戶在一家商戶中使用優惠券購買的商品的數目
    t2['user_merchant_buy_use_coupon'] = 1
    t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t2.drop_duplicates(inplace = True)
    #用戶在一家商家的所有記錄總數
    t3 = feature3[['user_id','merchant_id']]
    t3['user_merchant_any'] = 1
    t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t3.drop_duplicates(inplace = True)
    t4 = feature3[['user_id','merchant_id','date','coupon_id']]
    t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
    #用戶沒有使用優惠券購買的商品的數目
    t4['user_merchant_buy_common'] = 1
    t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
    t4.drop_duplicates(inplace = True)
    user_merchant3 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t1,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t2,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t3,on=['user_id','merchant_id'],how='left')
    user_merchant3 = pd.merge(user_merchant3,t4,on=['user_id','merchant_id'],how='left')
    #都是針對一家商戶和一個用戶
    user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(np.nan,0)
    user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(np.nan,0)
    #y優惠券的轉換率,用戶使用了的優惠券/一共收到的優惠券
    user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_received.astype('float')
    #用戶使用優惠券的概率,在一家商戶使用優惠券購買的商品/在一家商戶購買商品的總數
    user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
    #用戶在商戶消費的概率 用戶在商戶購買的總數/在一家商戶瀏覽的總次數
    user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total.astype('float') / user_merchant3.user_merchant_any.astype('float')
    #用戶在一家商戶不適用優惠券購買的概率 普通購買的商品數/購買商品的總數
    user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
    return user_merchant3

構建訓練集和測試集

In [15]:

def get_label(s):
    s = s.split(':')
    if s[0]=='null':
        return 0
    elif (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8]))-date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days<=15:
        return 1
    else:
        return -1 

In [16]:

def GenerateData(dataset, feature, label=True):
    # 獲取各個特徵處理後的結果
    coupon_feature = GetCouponFeature(dataset, feature)
    merchant_feature = GetMerchantFeature(feature)
    user_feature = GetUserRelateInfo(feature)
    user_merchant = GetUserMerchantRelateInfo(feature)
    other_feature = GetOtherFeature(dataset)
    dataset = pd.merge(coupon_feature, merchant_feature,
                       on='merchant_id', how='left')
    dataset = pd.merge(dataset, user_feature, on='user_id', how='left')
    dataset = pd.merge(dataset, user_merchant, on=[
                       'user_id', 'merchant_id'], how='left')
    dataset = pd.merge(dataset, other_feature, on=[
                       'user_id', 'coupon_id', 'date_received'], how='left')
    dataset.drop_duplicates(inplace=True)
    dataset.user_merchant_buy_total = dataset.user_merchant_buy_total.replace(
        np.nan, 0)
    dataset.user_merchant_any = dataset.user_merchant_any.replace(np.nan, 0)
    dataset.user_merchant_received = dataset.user_merchant_received.replace(
        np.nan, 0)
    dataset['is_weekend'] = dataset.day_of_week.apply(
        lambda x: 1 if x in (6, 7) else 0)
    weekday_dummies = pd.get_dummies(dataset.day_of_week)
    weekday_dummies.columns = [
        'weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
    dataset = pd.concat([dataset, weekday_dummies], axis=1)
    # 如果是訓練集要記得處理label標籤值  但是在測試集中不用處理label標籤 注意off_train和off_test字段
    if label:
        dataset['label'] = dataset.date.astype(
            'str') + ':' + dataset.date_received.astype('str')
        dataset.label = dataset.label.apply(get_label)
        dataset.drop(['merchant_id', 'day_of_week', 'date', 'date_received',
                     'coupon_count'], axis=1, inplace=True)
    else:
        dataset.drop(['merchant_id', 'day_of_week', 'coupon_count'],
                 axis=1, inplace=True)
    # 所有的表都要一起處理null
    dataset = dataset.replace('null', np.nan)
    return dataset

In [17]:

 

GenerateData1 = GenerateData(dataset1, feature1)
GenerateData2 = GenerateData(dataset2, feature2)
GenerateData3 = GenerateData(dataset3, feature3, False)

保存處理好的特徵值, 以便後續使用

In [18]:

GenerateData1.to_csv('./GenerateData1.csv', index=None)
GenerateData2.to_csv('./GenerateData2.csv', index=None)
GenerateData3.to_csv('./GenerateData3.csv', index=None)

模型融合

In [1]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from datetime import date
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.linear_model import SGDClassifier, LogisticRegression
from xgboost import XGBClassifier

In [2]

def get_processed_data():
    dataset1 = pd.read_csv('./GenerateData1.csv')
    dataset2 = pd.read_csv('./GenerateData2.csv')
    dataset3 = pd.read_csv('./GenerateData3.csv')
    dataset1.label.replace(-1, 0, inplace=True)
    dataset2.label.replace(-1, 0, inplace=True)
    dataset1.drop_duplicates(inplace=True)
    dataset2.drop_duplicates(inplace=True)
    dataset3.drop_duplicates(inplace=True) 
    # 按照行或列進行合併,axis=0爲列索引,axis=1爲行索引 因爲特徵處理都一樣, 所以按照列索引
    dataset12 = pd.concat([dataset1, dataset2], axis=0)
    dataset12.fillna(-1, inplace=True)
#     dataset3.fillna(0, inplace=True)
    return dataset12, dataset3

In [3]:

dataset12, dataset3 = get_processed_data()

In [4]:

predict_dataset = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset12_label = dataset12.label
# 降低維度, 把沒有必要的字段刪除
dataset12_x = dataset12.drop(['user_id','label','coupon_id','day_gap_before','day_gap_after'],axis=1)
dataset3.fillna(-1, inplace=True)
dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1)

In [5]:

x_train, x_test, y_train, y_test = train_test_split(dataset12_x, dataset12_label, test_size=0.25, random_state=88)
X_train, X_train_lr, y_train, y_train_lr = train_test_split(x_train, y_train, test_size=0.5)

GBDT

In [6]:

gbdt_model =GradientBoostingClassifier(learning_rate=0.1,
                                 n_estimators=190,
                                 min_samples_split=5,
                                 min_samples_leaf=5,
                                 max_depth=15,
                                 random_state=0,
                                 max_features=24,)

XGB

In [7]:

XGB_model = XGBClassifier(max_depth=15, learning_rate=0.01,eta=1, gamma=0, n_jobs=-1)
# XGB_model.fit(x_train, y_train)

In [8]:

from sklearn.preprocessing import  OneHotEncoder

In [9]:

grd_enc = OneHotEncoder()

In [10]:

gbdt_model.fit(X_train, y_train)

Out[10]:

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=15,
                           max_features=24, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=5, min_samples_split=5,
                           min_weight_fraction_leaf=0.0, n_estimators=190,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [11]:

# 訓練one-hot編碼器
grd_enc.fit(gbdt_model.apply(X_train)[:, :, 0])

Out[11]:

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [27]:

# gbdt_model.apply(X_train)

In [24]:

gbdt_model.apply(X_train)[:, :, 0] # 將三維的以二維的形式表示

Out[24]:

array([[1407., 1448., 1478., ..., 1232., 1428., 2841.],
       [1495., 1500., 1807., ..., 1232., 1428., 2841.],
       [ 284.,  377.,  614., ..., 1232.,  756.,  647.],
       ...,
       [ 275.,  362.,  599., ..., 1232.,  772., 1505.],
       [1409., 1456., 1690., ..., 1251., 1450., 2885.],
       [ 139.,  160.,  371., ..., 1126.,  253., 2742.]])

In [12]:

XGB_model.fit(grd_enc.transform(gbdt_model.apply(X_train_lr)[:, :, 0]), y_train_lr)

Out[12]:

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [13]:

# 用訓練好的XGB模型多X_test做預測
y_pred_grd_lm = XGB_model.predict_proba(grd_enc.transform(gbdt_model.apply(x_test)[:, :, 0]))[:, 1]

In [14]:

# 根據預測結果輸出
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)

In [15]:

fpr_grd_lm

Out[15]:

array([0.00000000e+00, 1.97818067e-05, 3.95636133e-05, ...,
       7.50620654e-01, 7.51045963e-01, 1.00000000e+00])

In [16]:

tpr_grd_lm

Out[16]:

array([0.        , 0.0058958 , 0.01106967, ..., 0.98856937, 0.98856937,
       1.        ])

In [17]:

_ 

Out[17]:

array([1.7995733 , 0.7995733 , 0.7982012 , ..., 0.18624917, 0.1855642 ,
       0.18549208], dtype=float32)

In [18]:

print("AUC準確率:", roc_auc_score(y_test,y_pred_grd_lm))
AUC準確率: 0.867398106049461

模型融合 0.7441

In [19]:

dataset_preds = dataset3[['user_id','coupon_id','date_received']]
y_pred_grd_lm = XGB_model.predict_proba(grd_enc.transform(gbdt_model.apply(dataset3_x)[:, :, 0]))[:, 1]
dataset_preds['label'] = y_pred_grd_lm
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("GBDT_XGB_preds1.csv",index=None,header=None)

線上線下特徵合併

In [ ]:

import datetime
import os
from concurrent.futures import ProcessPoolExecutor
from math import ceil
import pandas as pd

讀入源數據

In [ ]:

# In[] 讀入源數據
def get_source_data():
    # 源數據路徑
    DataPath = 'dataset'
    # 讀入源數據
    off_train = pd.read_csv(os.path.join(DataPath, 'ccf_offline_stage1_train.csv'),
                            parse_dates=['Date_received', 'Date'])
    off_train.columns = ['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received', 'Date']
    on_train = pd.read_csv(os.path.join(DataPath, 'ccf_online_stage1_train.csv'), parse_dates=['Date_received', 'Date'])
    on_train.columns = ['User_id', 'Merchant_id', 'Action', 'Coupon_id', 'Discount_rate', 'Date_received', 'Date']
    off_test = pd.read_csv(os.path.join(DataPath, 'ccf_offline_stage1_test_revised.csv'), parse_dates=['Date_received'])
    off_test.columns = ['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received']
    print(off_train.info())
    print(off_train.head(5))
    return off_train, on_train, off_test

數據基本處理

null,na 特殊處理

In [ ]:

# In[] null,na 特殊處理
def null_process_offline(dataset, predict=False):
    dataset.Distance.fillna(11, inplace=True)
    dataset.Distance = dataset.Distance.astype(int)
    dataset.Coupon_id.fillna(0, inplace=True)
    dataset.Coupon_id = dataset.Coupon_id.astype(int)
    dataset.Date_received.fillna(date_null, inplace=True)
    dataset[['discount_rate_x', 'discount_rate_y']] = dataset[dataset.Discount_rate.str.contains(':') == True][
        'Discount_rate'].str.split(':', expand=True).astype(int)
    dataset['discount_rate'] = 1 - dataset.discount_rate_y / dataset.discount_rate_x
    dataset.discount_rate = dataset.discount_rate.fillna(dataset.Discount_rate).astype(float)
    if predict:
        return dataset
    else:
        dataset.Date.fillna(date_null, inplace=True)
        return dataset
def null_process_online(dataset):
    dataset.Coupon_id.fillna(0, inplace=True)
    # online.Coupon_id = online.Coupon_id.astype(int)
    dataset.Date_received.fillna(date_null, inplace=True)
    dataset.Date.fillna(date_null, inplace=True)
    return dataset

生成交叉訓練集

In [ ]:

def data_process(off_train, on_train, off_test):
    # train feature split
    # 交叉訓練集一:收到券的日期大於4月14日和小於5月14日
    time_range = ['2016-04-16', '2016-05-15']
    dataset1 = off_train[(off_train.Date_received >= time_range[0]) & (off_train.Date_received <= time_range[1])].copy()
    dataset1['label'] = 0
    dataset1.loc[
        (dataset1.Date != date_null) & (dataset1.Date - dataset1.Date_received <= datetime.timedelta(15)), 'label'] = 1
    # 交叉訓練集一特徵offline:線下數據中領券和用券日期大於1月1日和小於4月13日
    time_range_date_received = ['2016-01-01', '2016-03-31']
    time_range_date = ['2016-01-01', '2016-04-15']
    feature1_off = off_train[(off_train.Date >= time_range_date[0]) & (off_train.Date <= time_range_date[1]) | (
            (off_train.Coupon_id == 0) & (off_train.Date_received >= time_range_date_received[0]) & (
            off_train.Date_received <= time_range_date_received[1]))]
    # 交叉訓練集一特徵online:線上數據中領券和用券日期大於1月1日和小於4月13日[on_train.date == 'null' to on_train.coupon_id == 0]
    feature1_on = on_train[(on_train.Date >= time_range_date[0]) & (on_train.Date <= time_range_date[1]) | (
            (on_train.Coupon_id == 0) & (on_train.Date_received >= time_range_date_received[0]) & (
            on_train.Date_received <= time_range_date_received[1]))]
    # 交叉訓練集二:收到券的日期大於5月15日和小於6月15日
    time_range = ['2016-05-16', '2016-06-15']
    dataset2 = off_train[(off_train.Date_received >= time_range[0]) & (off_train.Date_received <= time_range[1])]
    dataset2['label'] = 0
    dataset2.loc[
        (dataset2.Date != date_null) & (dataset2.Date - dataset2.Date_received <= datetime.timedelta(15)), 'label'] = 1
    # 交叉訓練集二特徵offline:線下數據中領券和用券日期大於2月1日和小於5月14日
    time_range_date_received = ['2016-02-01', '2016-04-30']
    time_range_date = ['2016-02-01', '2016-05-15']
    feature2_off = off_train[(off_train.Date >= time_range_date[0]) & (off_train.Date <= time_range_date[1]) | (
            (off_train.Coupon_id == 0) & (off_train.Date_received >= time_range_date_received[0]) & (
            off_train.Date_received <= time_range_date_received[1]))]
    # 交叉訓練集二特徵online:線上數據中領券和用券日期大於2月1日和小於5月14日
    feature2_on = on_train[(on_train.Date >= time_range_date[0]) & (on_train.Date <= time_range_date[1]) | (
            (on_train.Coupon_id == 0) & (on_train.Date_received >= time_range_date_received[0]) & (
            on_train.Date_received <= time_range_date_received[1]))]
    # 測試集
    dataset3 = off_test
    # 測試集特徵offline :線下數據中領券和用券日期大於3月15日和小於6月30日的
    time_range = ['2016-03-16', '2016-06-30']
    feature3_off = off_train[((off_train.Date >= time_range[0]) & (off_train.Date <= time_range[1])) | (
            (off_train.Coupon_id == 0) & (off_train.Date_received >= time_range[0]) & (
            off_train.Date_received <= time_range[1]))]
    # 測試集特徵online :線上數據中領券和用券日期大於3月15日和小於6月30日的
    feature3_on = on_train[((on_train.Date >= time_range[0]) & (on_train.Date <= time_range[1])) | (
            (on_train.Coupon_id == 0) & (on_train.Date_received >= time_range[0]) & (
            on_train.Date_received <= time_range[1]))]
    # get train feature
    ProcessDataSet1 = get_features(dataset1, feature1_off, feature1_on)
    ProcessDataSet2 = get_features(dataset2, feature2_off, feature2_on)
    ProcessDataSet3 = get_features(dataset3, feature3_off, feature3_on)
    return ProcessDataSet1, ProcessDataSet2, ProcessDataSet3
def get_features(dataset, feature_off, feature_on):
    dataset = get_offline_features(dataset, feature_off)
    return get_online_features(feature_on, dataset)

特徵工程

對線下數據處理

In [ ]:

def get_offline_features(X, offline):
    # X = X[:1000]
    print(len(X), len(X.columns))
    temp = offline[offline.Coupon_id != 0]
    coupon_consume = temp[temp.Date != date_null]
    coupon_no_consume = temp[temp.Date == date_null]
    user_coupon_consume = coupon_consume.groupby('User_id')
    X['weekday'] = X.Date_received.dt.weekday
    X['day'] = X.Date_received.dt.day
    '''user features'''
    # 優惠券消費次數
    temp = user_coupon_consume.size().reset_index(name='u2')
    X = pd.merge(X, temp, how='left', on='User_id')
    # X.u2.fillna(0, inplace=True)
    # X.u2 = X.u2.astype(int)
    # 優惠券不消費次數
    temp = coupon_no_consume.groupby('User_id').size().reset_index(name='u3')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 使用優惠券次數與沒使用優惠券次數比值
    X['u19'] = X.u2 / X.u3
    # 領取優惠券次數
    X['u1'] = X.u2.fillna(0) + X.u3.fillna(0)
    # 優惠券覈銷率
    X['u4'] = X.u2 / X.u1
    # 普通消費次數
    temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
    temp1 = temp.groupby('User_id').size().reset_index(name='u5')
    X = pd.merge(X, temp1, how='left', on='User_id')
    # 一共消費多少次
    X['u25'] = X.u2 + X.u5
    # 用戶使用優惠券消費佔比
    X['u20'] = X.u2 / X.u25
    # 正常消費平均間隔
    temp = pd.merge(temp, temp.groupby('User_id').Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
    temp['u6'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('User_id')
    X = pd.merge(X, temp[['User_id', 'u6']], how='left', on='User_id')
    # 優惠券消費平均間隔
    temp = pd.merge(coupon_consume, user_coupon_consume.Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('User_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
    temp['u7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('User_id')
    X = pd.merge(X, temp[['User_id', 'u7']], how='left', on='User_id')
    # 15天內平均會普通消費幾次
    X['u8'] = X.u6 / 15
    # 15天內平均會優惠券消費幾次
    X['u9'] = X.u7 / 15
    # 領取優惠券到使用優惠券的平均間隔時間
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = (temp.groupby('User_id').days.sum() / temp.groupby('User_id').size()).reset_index(name='u10')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 在15天內使用掉優惠券的值大小
    X['u11'] = X.u10 / 15
    # 領取優惠券到使用優惠券間隔小於15天的次數
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = temp[temp.days <= 15]
    temp = temp.groupby('User_id').size().reset_index(name='u21')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶15天使用掉優惠券的次數除以使用優惠券的次數
    X['u22'] = X.u21 / X.u2
    # 用戶15天使用掉優惠券的次數除以領取優惠券未消費的次數
    X['u23'] = X.u21 / X.u3
    # 用戶15天使用掉優惠券的次數除以領取優惠券的總次數
    X['u24'] = X.u21 / X.u1
    # 消費優惠券的平均折率
    temp = user_coupon_consume.discount_rate.mean().reset_index(name='u45')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶覈銷優惠券的最低消費折率
    temp = user_coupon_consume.discount_rate.min().reset_index(name='u27')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶覈銷優惠券的最高消費折率
    temp = user_coupon_consume.discount_rate.max().reset_index(name='u28')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶覈銷過的不同優惠券數量
    temp = coupon_consume.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='u32')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶領取所有不同優惠券數量
    temp = offline[offline.Date_received != date_null]
    temp = temp.groupby(['User_id', 'Coupon_id']).size().reset_index(name='u47')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
    # 用戶覈銷過的不同優惠券數量佔所有不同優惠券的比重
    X['u33'] = X.u32 / X.u47
    # 用戶平均每種優惠券覈銷多少張
    X['u34'] = X.u2 / X.u47
    # 覈銷優惠券用戶-商家平均距離
    temp = offline[(offline.Coupon_id != 0) & (offline.Date != date_null) & (offline.Distance != 11)]
    temp = temp.groupby('User_id').Distance
    temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='User_id')
    temp['u35'] = temp.y / temp.x
    temp = temp[['User_id', 'u35']]
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶覈銷優惠券中的最小用戶-商家距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('User_id').Distance.min().reset_index(name='u36')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶覈銷優惠券中的最大用戶-商家距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('User_id').Distance.max().reset_index(name='u37')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 優惠券類型
    discount_types = [
        '0.2', '0.5', '0.6', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95', '30:20', '50:30', '10:5',
        '20:10', '100:50', '200:100', '50:20', '30:10', '150:50', '100:30', '20:5', '200:50', '5:1',
        '50:10', '100:20', '150:30', '30:5', '300:50', '200:30', '150:20', '10:1', '50:5', '100:10',
        '200:20', '300:30', '150:10', '300:20', '500:30', '20:1', '100:5', '200:10', '30:1', '150:5',
        '300:10', '200:5', '50:1', '100:1',
    ]
    X['discount_type'] = -1
    for k, v in enumerate(discount_types):
        X.loc[X.Discount_rate == v, 'discount_type'] = k
    # 不同優惠券領取次數
    temp = offline.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u41')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
    # 不同優惠券使用次數
    temp = coupon_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u42')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
    # 不同優惠券不使用次數
    temp = coupon_no_consume.groupby(['User_id', 'Discount_rate']).size().reset_index(name='u43')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
    # 不同打折優惠券使用率
    X['u44'] = X.u42 / X.u41
    # 滿減類型優惠券領取次數
    temp = offline[offline.Discount_rate.str.contains(':') == True]
    temp = temp.groupby('User_id').size().reset_index(name='u48')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 打折類型優惠券領取次數
    temp = offline[offline.Discount_rate.str.contains('\.') == True]
    temp = temp.groupby('User_id').size().reset_index(name='u49')
    X = pd.merge(X, temp, how='left', on='User_id')
    '''offline merchant features'''
    # 商戶消費次數
    temp = offline[offline.Date != date_null].groupby('Merchant_id').size().reset_index(name='m0')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家優惠券被領取後覈銷次數
    temp = coupon_consume.groupby('Merchant_id').size().reset_index(name='m1')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商戶正常消費筆數
    X['m2'] = X.m0.fillna(0) - X.m1.fillna(0)
    # 商家優惠券被領取次數
    temp = offline[offline.Date_received != date_null].groupby('Merchant_id').size().reset_index(name='m3')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家優惠券被領取後覈銷率
    X['m4'] = X.m1 / X.m3
    # 商家優惠券被領取後不覈銷次數
    temp = coupon_no_consume.groupby('Merchant_id').size().reset_index(name='m7')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商戶當天優惠券領取次數
    temp = X[X.Date_received != date_null]
    temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m5')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
    # 商戶當天優惠券領取人數
    temp = X[X.Date_received != date_null]
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index()
    temp = temp.groupby(['Merchant_id', 'Date_received']).size().reset_index(name='m6')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
    # 商家優惠券覈銷的平均消費折率
    temp = coupon_consume.groupby('Merchant_id').discount_rate.mean().reset_index(name='m8')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家優惠券覈銷的最小消費折率
    temp = coupon_consume.groupby('Merchant_id').discount_rate.max().reset_index(name='m9')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家優惠券覈銷的最大消費折率
    temp = coupon_consume.groupby('Merchant_id').discount_rate.min().reset_index(name='m10')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家優惠券覈銷不同的用戶數量
    temp = coupon_consume.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='m11')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家優惠券領取不同的用戶數量
    temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='m12')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 覈銷商家優惠券的不同用戶數量其佔領取不同的用戶比重
    X['m13'] = X.m11 / X.m12
    # 商家優惠券平均每個用戶覈銷多少張
    X['m14'] = X.m1 / X.m12
    # 商家被覈銷過的不同優惠券數量
    temp = coupon_consume.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='m15')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家領取過的不同優惠券數量的比重
    temp = offline[offline.Date_received != date_null].groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').count().reset_index(name='m18')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家被覈銷過的不同優惠券數量佔所有領取過的不同優惠券數量的比重
    X['m19'] = X.m15 / X.m18
    # 商家被覈銷優惠券的平均時間
    temp = pd.merge(coupon_consume, coupon_consume.groupby('Merchant_id').Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('Merchant_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('Merchant_id').size().reset_index(name='len'))
    temp['m20'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('Merchant_id')
    X = pd.merge(X, temp[['Merchant_id', 'm20']], how='left', on='Merchant_id')
    # 商家被覈銷優惠券中的用戶-商家平均距離
    temp = coupon_consume[coupon_consume.Distance != 11].groupby('Merchant_id').Distance
    temp = pd.merge(temp.count().reset_index(name='x'), temp.sum().reset_index(name='y'), on='Merchant_id')
    temp['m21'] = temp.y / temp.x
    temp = temp[['Merchant_id', 'm21']]
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家被覈銷優惠券中的用戶-商家最小距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('Merchant_id').Distance.min().reset_index(name='m22')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家被覈銷優惠券中的用戶-商家最大距離
    temp = coupon_consume[coupon_consume.Distance != 11]
    temp = temp.groupby('Merchant_id').Distance.max().reset_index(name='m23')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    """offline coupon features"""
    # 此優惠券一共發行多少張
    temp = offline[offline.Coupon_id != 0].groupby('Coupon_id').size().reset_index(name='c1')
    X = pd.merge(X, temp, how='left', on='Coupon_id')
    # 此優惠券一共被使用多少張
    temp = coupon_consume.groupby('Coupon_id').size().reset_index(name='c2')
    X = pd.merge(X, temp, how='left', on='Coupon_id')
    # 優惠券使用率
    X['c3'] = X.c2 / X.c1
    # 沒有使用的數目
    X['c4'] = X.c1 - X.c2
    # 此優惠券在當天發行了多少張
    temp = X.groupby(['Coupon_id', 'Date_received']).size().reset_index(name='c5')
    X = pd.merge(X, temp, how='left', on=['Coupon_id', 'Date_received'])
    # 優惠券類型(直接優惠爲0, 滿減爲1)
    X['c6'] = 0
    X.loc[X.Discount_rate.str.contains(':') == True, 'c6'] = 1
    # 不同打折優惠券領取次數
    temp = offline.groupby('Discount_rate').size().reset_index(name='c8')
    X = pd.merge(X, temp, how='left', on='Discount_rate')
    # 不同打折優惠券使用次數
    temp = coupon_consume.groupby('Discount_rate').size().reset_index(name='c9')
    X = pd.merge(X, temp, how='left', on='Discount_rate')
    # 不同打折優惠券不使用次數
    temp = coupon_no_consume.groupby('Discount_rate').size().reset_index(name='c10')
    X = pd.merge(X, temp, how='left', on='Discount_rate')
    # 不同打折優惠券使用率
    X['c11'] = X.c9 / X.c8
    # 優惠券覈銷平均時間
    temp = pd.merge(coupon_consume, coupon_consume.groupby('Coupon_id').Date.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('Coupon_id').Date.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('Coupon_id').size().reset_index(name='count'))
    temp['c12'] = ((temp['max'] - temp['min']).dt.days / (temp['count'] - 1))
    temp = temp.drop_duplicates('Coupon_id')
    X = pd.merge(X, temp[['Coupon_id', 'c12']], how='left', on='Coupon_id')
    '''user merchant feature'''
    # 用戶領取商家的優惠券次數
    temp = offline[offline.Coupon_id != 0]
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um1')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # 用戶領取商家的優惠券後不覈銷次數
    temp = coupon_no_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um2')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # 用戶領取商家的優惠券後覈銷次數
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um3')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # 用戶領取商家的優惠券後覈銷率
    X['um4'] = X.um3 / X.um1
    # 用戶對每個商家的不覈銷次數佔用戶總的不覈銷次數的比重
    temp = coupon_no_consume.groupby('User_id').size().reset_index(name='temp')
    X = pd.merge(X, temp, how='left', on='User_id')
    X['um5'] = X.um2 / X.temp
    X.drop(columns='temp', inplace=True)
    # 用戶在商店總共消費過幾次
    temp = offline[offline.Date != date_null].groupby(['User_id', 'Merchant_id']).size().reset_index(name='um6')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # 用戶在商店普通消費次數
    temp = offline[(offline.Coupon_id == 0) & (offline.Date != date_null)]
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index(name='um7')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # 用戶當天在此商店領取的優惠券數目
    temp = offline[offline.Date_received != date_null]
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().reset_index(name='um8')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id', 'Date_received'])
    # 用戶領取優惠券不同商家數量
    temp = offline[offline.Coupon_id == offline.Coupon_id]
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index()
    temp = temp.groupby('User_id').size().reset_index(name='um9')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶覈銷優惠券不同商家數量
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='um10')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶覈銷過優惠券的不同商家數量佔所有不同商家的比重
    X['um11'] = X.um10 / X.um9
    # 用戶平均覈銷每個商家多少張優惠券
    X['um12'] = X.u2 / X.um9
    '''other feature'''
    # 用戶領取的所有優惠券數目
    temp = X.groupby('User_id').size().reset_index(name='o1')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶領取的特定優惠券數目
    temp = X.groupby(['User_id', 'Coupon_id']).size().reset_index(name='o2')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
    # multiple threads
    # data split
    stop = len(X)
    step = int(ceil(stop / cpu_jobs))
    X_chunks = [X[i:i + step] for i in range(0, stop, step)]
    X_list = [X] * cpu_jobs
    counters = [i for i in range(cpu_jobs)]
    start = datetime.datetime.now()
    with ProcessPoolExecutor() as e:
        X = pd.concat(e.map(task, X_chunks, X_list, counters))
        print('time:', str(datetime.datetime.now() - start).split('.')[0])
    # multiple threads
    # 用戶領取優惠券平均時間間隔
    temp = pd.merge(X, X.groupby('User_id').Date_received.max().reset_index(name='max'))
    temp = pd.merge(temp, temp.groupby('User_id').Date_received.min().reset_index(name='min'))
    temp = pd.merge(temp, temp.groupby('User_id').size().reset_index(name='len'))
    temp['o7'] = ((temp['max'] - temp['min']).dt.days / (temp['len'] - 1))
    temp = temp.drop_duplicates('User_id')
    X = pd.merge(X, temp[['User_id', 'o7']], how='left', on='User_id')
    # 用戶領取特定商家的優惠券數目
    temp = X.groupby(['User_id', 'Merchant_id']).size().reset_index(name='o8')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # 用戶領取的不同商家數目
    temp = X.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='o9')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶當天領取的優惠券數目
    temp = X.groupby(['User_id', 'Date_received']).size().reset_index(name='o10')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Date_received'])
    # 用戶當天領取的特定優惠券數目
    temp = X.groupby(['User_id', 'Coupon_id', 'Date_received']).size().reset_index(name='o11')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id', 'Date_received'])
    # 用戶領取的所有優惠券種類數目
    temp = X.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby('User_id').size().reset_index(name='o12')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 商家被領取的優惠券數目
    temp = X.groupby('Merchant_id').size().reset_index(name='o13')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家被領取的特定優惠券數目
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size().reset_index(name='o14')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Coupon_id'])
    # 商家被多少不同用戶領取的數目
    temp = X.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='o15')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # 商家發行的所有優惠券種類數目
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').size().reset_index(name='o16')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    print(len(X), len(X.columns))
    return X

對線上數據處理

In [ ]:

def get_online_features(online, X):
    # temp = online[online.Coupon_id == online.Coupon_id]
    # coupon_consume = temp[temp.Date == temp.Date]
    # coupon_no_consume = temp[temp.Date != temp.Date]
    # 用戶線上操作次數
    temp = online.groupby('User_id').size().reset_index(name='on_u1')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶線上點擊次數
    temp = online[online.Action == 0].groupby('User_id').size().reset_index(name='on_u2')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶線上點擊率
    X['on_u3'] = X.on_u2 / X.on_u1
    # 用戶線上購買次數
    temp = online[online.Action == 1].groupby('User_id').size().reset_index(name='on_u4')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶線上購買率
    X['on_u5'] = X.on_u4 / X.on_u1
    # 用戶線上領取次數
    temp = online[online.Coupon_id != 0].groupby('User_id').size().reset_index(name='on_u6')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶線上領取率
    X['on_u7'] = X.on_u6 / X.on_u1
    # 用戶線上不消費次數
    temp = online[(online.Date == date_null) & (online.Coupon_id != 0)]
    temp = temp.groupby('User_id').size().reset_index(name='on_u8')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶線上優惠券覈銷次數
    temp = online[(online.Date != date_null) & (online.Coupon_id != 0)]
    temp = temp.groupby('User_id').size().reset_index(name='on_u9')
    X = pd.merge(X, temp, how='left', on='User_id')
    # 用戶線上優惠券覈銷率
    X['on_u10'] = X.on_u9 / X.on_u6
    # 用戶線下不消費次數佔線上線下總的不消費次數的比重
    X['on_u11'] = X.u3 / (X.on_u8 + X.u3)
    # 用戶線下的優惠券覈銷次數佔線上線下總的優惠券覈銷次數的比重
    X['on_u12'] = X.u2 / (X.on_u9 + X.u2)
    # 用戶線下領取的記錄數量佔總的記錄數量的比重
    X['on_u13'] = X.u1 / (X.on_u6 + X.u1)
    print(len(X), len(X.columns))
    print('----------')
    return X

In [ ]:

def task(X_chunk, X, counter):
    print(counter, end=',', flush=True)
    X_chunk = X_chunk.copy()
    X_chunk['o17'] = -1
    X_chunk['o18'] = -1
    for i, user in X_chunk.iterrows():
        temp = X[X.User_id == user.User_id]
        temp1 = temp[temp.Date_received < user.Date_received]
        temp2 = temp[temp.Date_received > user.Date_received]
        # 用戶此次之後/前領取的所有優惠券數目
        X_chunk.loc[i, 'o3'] = len(temp1)
        X_chunk.loc[i, 'o4'] = len(temp2)
        # 用戶此次之後/前領取的特定優惠券數目
        X_chunk.loc[i, 'o5'] = len(temp1[temp1.Coupon_id == user.Coupon_id])
        X_chunk.loc[i, 'o6'] = len(temp2[temp2.Coupon_id == user.Coupon_id])
        # 用戶上/下一次領取的時間間隔
        temp1 = temp1.sort_values(by='Date_received', ascending=False)
        if len(temp1):
            X_chunk.loc[i, 'o17'] = (user.Date_received - temp1.iloc[0].Date_received).days
        temp2 = temp2.sort_values(by='Date_received')
        if len(temp2):
            X_chunk.loc[i, 'o18'] = (temp2.iloc[0].Date_received - user.Date_received).days
    return X_chunk

數據處理保存

In [ ]:

# 程序開始時間打印
    start = datetime.datetime.now()
    print(start.strftime('%Y-%m-%d %H:%M:%S'))
    cpu_jobs = os.cpu_count() - 1
    date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')
    pd.set_option('expand_frame_repr', False)
    pd.set_option('display.max_rows', 200)
    pd.set_option('display.max_columns', 200)
    # 預處理後數據存放路徑
    FeaturePath = 'data_preprocessed_2'
    # 讀入源數據
    off_train, on_train, off_test = get_source_data()
    # 源數據null處理
    off_train = null_process_offline(off_train, predict=False)
    on_train = null_process_online(on_train)
    off_test = null_process_offline(off_test, predict=True)
    # 獲取訓練特徵集,測試特徵集
    ProcessDataSet1, ProcessDataSet2, ProcessDataSet3 = data_process(off_train, on_train, off_test)
    # 源數據處理後的數據保存爲文件
    ProcessDataSet1.to_csv(os.path.join(FeaturePath, 'ProcessDataSet1.csv'), index=None)
    ProcessDataSet2.to_csv(os.path.join(FeaturePath, 'ProcessDataSet2.csv'), index=None)
    ProcessDataSet3.to_csv(os.path.join(FeaturePath, 'ProcessDataSet3.csv'), index=None)
    # 花費時間打印
    print((datetime.datetime.now() - start).seconds)

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章