Sklearn: Tianchi Beginner Competition, O2O Coupon Usage Prediction, Part 3

日萌社

AI: hands-on deep learning with Keras, PyTorch, MXNet, TensorFlow, and PaddlePaddle (updated from time to time)


Alibaba Cloud official site: Tianchi Beginner Competition, O2O Coupon Usage Prediction

Dataset download link: https://pan.baidu.com/s/13OtaUv6j4x8dD7cgD4sL5g
Extraction code: 7tze


Sklearn: Tianchi Beginner Competition, O2O Coupon Usage Prediction, Part 1

Sklearn: Tianchi Beginner Competition, O2O Coupon Usage Prediction, Part 2

Sklearn: Tianchi Beginner Competition, O2O Coupon Usage Prediction, Part 3


New feature engineering with XGBoost cv tuning

In [1]:

import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
import datetime
import os

Load the data

Missing-value handling

In [2]:

def get_processed_data():
    dataset1 = pd.read_csv('data_preprocessed_2/ProcessDataSet1.csv')
    dataset2 = pd.read_csv('data_preprocessed_2/ProcessDataSet2.csv')
    dataset3 = pd.read_csv('data_preprocessed_2/ProcessDataSet3.csv')
    dataset1.drop_duplicates(inplace=True)
    dataset2.drop_duplicates(inplace=True)
    dataset3.drop_duplicates(inplace=True)
    dataset12 = pd.concat([dataset1, dataset2], axis=0)
    dataset12.fillna(0, inplace=True)
    dataset3.fillna(0, inplace=True)
    return dataset12, dataset3

Model training

Preparing the training set

In [3]:

def train_xgb(dataset12, dataset3):
    predict_dataset = dataset3[['User_id', 'Coupon_id', 'Date_received']].copy()
    predict_dataset.Date_received = pd.to_datetime(predict_dataset.Date_received, format='%Y-%m-%d')
    predict_dataset.Date_received = predict_dataset.Date_received.dt.strftime('%Y%m%d')
    # Convert the data to DMatrix format
    dataset12_x = dataset12.drop(
        columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
                 'Date', 'Coupon_id', 'label'])
    dataset3_x = dataset3.drop(
        columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
                 'Coupon_id'])
    train_dmatrix = xgb.DMatrix(dataset12_x, label=dataset12.label)
    predict_dmatrix = xgb.DMatrix(dataset3_x)

    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'gamma': 0.1,
              'min_child_weight': 1.1,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.01,
              'tree_method': 'gpu_hist',
              'seed': 0,
              'nthread': cpu_jobs,  # cpu_jobs is set globally before this function is called
              'predictor': 'cpu_predictor'
              }
    # Use xgb.cv to tune num_boost_round
    cvresult = xgb.cv(params, train_dmatrix, num_boost_round=10000, nfold=2, metrics='auc', seed=0, callbacks=[
        xgb.callback.print_evaluation(show_stdv=False),
        xgb.callback.early_stop(50)
    ])
    # early_stop truncates the cv results at the best iteration, so its index is shape[0] - 1
    num_round_best = cvresult.shape[0] - 1
    print('Best round num: ', num_round_best)
    # Retrain the model with the tuned num_boost_round
    watchlist = [(train_dmatrix, 'train')]
    model = xgb.train(params, train_dmatrix, num_boost_round=num_round_best, evals=watchlist)
    model.save_model('train_dir_2/xgbmodel_cv_new')
    # Reload the saved model with a CPU predictor for inference
    params['predictor'] = 'cpu_predictor'
    model_cv = xgb.Booster(params)
    model_cv.load_model('train_dir_2/xgbmodel_cv_new')
    # predict test set
    dataset3_predict = predict_dataset.copy()
    dataset3_predict['label'] = model_cv.predict(predict_dmatrix)
    # Normalize the predicted scores to [0, 1]
    dataset3_predict.label = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
        dataset3_predict.label.values.reshape(-1, 1))
    dataset3_predict.sort_values(by=['Coupon_id', 'label'], inplace=True)
    dataset3_predict.to_csv("train_dir_2/xgb_cv_preds.csv", index=None, header=None)
    print(dataset3_predict.describe())
    # Compute the coupon-averaged AUC on dataset12
    temp = dataset12[['Coupon_id', 'label']].copy()
    temp['pred'] = model.predict(xgb.DMatrix(dataset12_x))
    temp.pred = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(temp['pred'].values.reshape(-1, 1))
    print(myauc(temp))

Evaluation function

In [4]:

# Evaluation function: average the AUC over coupons, as in the competition metric
def myauc(test):
    testgroup = test.groupby(['Coupon_id'])
    aucs = []
    for _, tmpdf in testgroup:
        # AUC is undefined when a coupon's samples contain only one class
        if len(tmpdf['label'].unique()) != 2:
            continue
        fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    return np.average(aucs)
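
As a quick sanity check, here is a minimal, hypothetical use of myauc on a hand-made frame (the coupon ids, labels, and scores below are invented for illustration):

toy = pd.DataFrame({
    'Coupon_id': [1, 1, 1, 1, 2, 2],
    'label':     [0, 1, 0, 1, 0, 0],   # coupon 2 has only one class, so it is skipped
    'pred':      [0.2, 0.9, 0.4, 0.7, 0.1, 0.3],
})
print(myauc(toy))  # both positives of coupon 1 outrank its negatives, so the average AUC is 1.0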

Train and save the model (score: 0.7983)

In [ ]:

# Load the data
dataset12, dataset3 = get_processed_data()

In [6]:

dataset12.head()

Out[6]:

  User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date discount_rate_x discount_rate_y discount_rate ... on_u4 on_u5 on_u6 on_u7 on_u8 on_u9 on_u10 on_u11 on_u12 on_u13
0 1832624 3381 7610 200:20 0 2016-04-29 1970-01-01 200.0 20.0 0.900000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 163606 1569 5054 200:30 10 2016-04-21 1970-01-01 200.0 30.0 0.850000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1113008 1361 11166 20:1 0 2016-05-15 2016-05-21 20.0 1.0 0.950000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 4061024 3381 7610 200:20 10 2016-04-26 1970-01-01 200.0 20.0 0.900000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 106443 450 3732 30:5 11 2016-04-29 1970-01-01 30.0 5.0 0.833333 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 124 columns

In [8]:

dataset12.shape

Out[8]:

(383386, 124)

In [9]:

dataset12.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 383386 entries, 0 to 252585
Columns: 124 entries, User_id to on_u13
dtypes: float64(96), int64(25), object(3)
memory usage: 375.6+ MB

In [11]:

dataset12.describe()

Out[11]:

  User_id Merchant_id Coupon_id Distance discount_rate_x discount_rate_y discount_rate label weekday day ... on_u4 on_u5 on_u6 on_u7 on_u8 on_u9 on_u10 on_u11 on_u12 on_u13
count 3.833860e+05 383386.000000 383386.000000 383386.000000 383386.000000 383386.000000 383386.000000 383386.000000 383386.000000 383386.000000 ... 383386.000000 383386.000000 383386.000000 383386.000000 383386.0 383386.000000 383386.000000 383386.0 383386.000000 383386.000000
mean 3.683603e+06 3653.920730 6287.911630 3.216938 58.047151 8.606796 0.838597 0.082546 3.086717 17.428612 ... 0.538374 0.165142 0.078388 0.019834 0.0 0.078388 0.041178 0.0 0.001983 0.001983
std 2.123219e+06 2577.836469 3938.971496 4.154925 59.771475 8.860719 0.092783 0.275195 1.984455 8.455349 ... 1.979329 0.355102 0.502537 0.119940 0.0 0.502537 0.198702 0.0 0.032386 0.032386
min 4.000000e+00 2.000000 1.000000 0.000000 0.000000 0.000000 0.333333 0.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0.000000
25% 1.843305e+06 1244.000000 2418.000000 0.000000 20.000000 5.000000 0.800000 0.000000 1.000000 11.000000 ... 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0.000000
50% 3.685242e+06 3381.000000 5584.000000 1.000000 30.000000 5.000000 0.833333 0.000000 3.000000 19.000000 ... 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0.000000
75% 5.522414e+06 5803.000000 9566.000000 6.000000 100.000000 10.000000 0.900000 0.000000 5.000000 24.000000 ... 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0.000000
max 7.360961e+06 8856.000000 14045.000000 11.000000 300.000000 100.000000 0.990000 1.000000 6.000000 31.000000 ... 319.000000 1.000000 64.000000 1.000000 0.0 64.000000 1.000000 0.0 0.985915 0.985915

8 rows × 121 columns

In [12]:

print([column for column in dataset12])
['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received', 'Date', 'discount_rate_x', 'discount_rate_y', 'discount_rate', 'label', 'weekday', 'day', 'u2', 'u3', 'u19', 'u1', 'u4', 'u5', 'u25', 'u20', 'u6', 'u7', 'u8', 'u9', 'u10', 'u11', 'u21', 'u22', 'u23', 'u24', 'u45', 'u27', 'u28', 'u32', 'u47', 'u33', 'u34', 'u35', 'u36', 'u37', 'discount_type', 'u41', 'u42', 'u43', 'u44', 'u48', 'u49', 'm0', 'm1', 'm2', 'm3', 'm4', 'm7', 'm5', 'm6', 'm8', 'm9', 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm18', 'm19', 'm20', 'm21', 'm22', 'm23', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c8', 'c9', 'c10', 'c11', 'c12', 'um1', 'um2', 'um3', 'um4', 'um5', 'um6', 'um7', 'um8', 'um9', 'um10', 'um11', 'um12', 'o1', 'o2', 'o17', 'o18', 'o3', 'o4', 'o5', 'o6', 'o7', 'o8', 'o9', 'o10', 'o11', 'o12', 'o13', 'o14', 'o15', 'o16', 'on_u1', 'on_u2', 'on_u3', 'on_u4', 'on_u5', 'on_u6', 'on_u7', 'on_u8', 'on_u9', 'on_u10', 'on_u11', 'on_u12', 'on_u13']

In [10]:

dataset3.shape

Out[10]:

(112803, 122)

In [5]:

start = datetime.datetime.now()
print(start.strftime('%Y-%m-%d %H:%M:%S'))
cpu_jobs = os.cpu_count() - 1
date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')
dataset12, dataset3 = get_processed_data()
train_xgb(dataset12, dataset3)
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print('time cost is: %s s' % (datetime.datetime.now() - start).seconds)
2020-03-06 13:07:29
[0]	train-auc:0.82972	test-auc:0.82835
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
[1]	train-auc:0.83615	test-auc:0.83513
[2]	train-auc:0.84745	test-auc:0.84609
[3]	train-auc:0.85015	test-auc:0.84850
[4]	train-auc:0.85415	test-auc:0.85298
[5]	train-auc:0.85444	test-auc:0.85324
[6]	train-auc:0.85859	test-auc:0.85735
[7]	train-auc:0.86065	test-auc:0.85951
[8]	train-auc:0.86068	test-auc:0.85961
[9]	train-auc:0.86198	test-auc:0.86090
[10]	train-auc:0.86290	test-auc:0.86175
... (output truncated)
[6190]	train-auc:0.93079	test-auc:0.90374
[6191]	train-auc:0.93079	test-auc:0.90374
[6192]	train-auc:0.93079	test-auc:0.90374
[6193]	train-auc:0.93080	test-auc:0.90374
[6194]	train-auc:0.93080	test-auc:0.90374
[6195]	train-auc:0.93081	test-auc:0.90374
[6196]	train-auc:0.93081	test-auc:0.90374
[6197]	train-auc:0.93081	test-auc:0.90374
[6198]	train-auc:0.93082	test-auc:0.90374
[6199]	train-auc:0.93082	test-auc:0.90374
[6200]	train-auc:0.93083	test-auc:0.90375
[6201]	train-auc:0.93083	test-auc:0.90375
[6202]	train-auc:0.93083	test-auc:0.90375
[6203]	train-auc:0.93084	test-auc:0.90375
[6204]	train-auc:0.93084	test-auc:0.90375
[6205]	train-auc:0.93084	test-auc:0.90375
[6206]	train-auc:0.93085	test-auc:0.90375
[6207]	train-auc:0.93085	test-auc:0.90375
Stopping. Best iteration:
[6157]	train-auc:0.93068+0.00010	test-auc:0.90375+0.00058

Best round num:  6157
[0]	train-auc:0.84011
[1]	train-auc:0.84200
[2]	train-auc:0.85052
[3]	train-auc:0.85268
[4]	train-auc:0.85981
[5]	train-auc:0.85955
[6]	train-auc:0.86228
[7]	train-auc:0.86348
[8]	train-auc:0.86390
[9]	train-auc:0.86610
[10]	train-auc:0.86655

... (output truncated)

[6120]	train-auc:0.92180
[6121]	train-auc:0.92180
[6122]	train-auc:0.92180
[6123]	train-auc:0.92181
[6124]	train-auc:0.92181
[6125]	train-auc:0.92181
[6126]	train-auc:0.92181
[6127]	train-auc:0.92181
[6128]	train-auc:0.92182
[6129]	train-auc:0.92182
[6130]	train-auc:0.92182
[6131]	train-auc:0.92182
[6132]	train-auc:0.92183
[6133]	train-auc:0.92183
[6134]	train-auc:0.92183
[6135]	train-auc:0.92183
[6136]	train-auc:0.92184
[6137]	train-auc:0.92184
[6138]	train-auc:0.92184
[6139]	train-auc:0.92184
[6140]	train-auc:0.92185
[6141]	train-auc:0.92185
[6142]	train-auc:0.92185
[6143]	train-auc:0.92185
[6144]	train-auc:0.92186
[6145]	train-auc:0.92186
[6146]	train-auc:0.92186
[6147]	train-auc:0.92187
[6148]	train-auc:0.92187
[6149]	train-auc:0.92187
[6150]	train-auc:0.92187
[6151]	train-auc:0.92188
[6152]	train-auc:0.92188
[6153]	train-auc:0.92188
[6154]	train-auc:0.92188
[6155]	train-auc:0.92189
[6156]	train-auc:0.92189
            User_id      Coupon_id          label
count  1.128030e+05  112803.000000  112803.000000
mean   3.684618e+06    9064.658006       0.085931
std    2.126358e+06    4147.283515       0.165224
min    2.090000e+02       3.000000       0.000000
25%    1.843824e+06    5035.000000       0.009209
50%    3.683073e+06    9983.000000       0.025507
75%    5.525176e+06   13602.000000       0.064142
max    7.361024e+06   14045.000000       1.000000
0.8090085946857051
2020-03-06 15:02:05
time cost is: 6875 s

xgboost_demo

In [4]:

import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.model_selection import train_test_split

Read the processed features

In [5]:

dataset1 = pd.read_csv('./GenerateData1.csv')
dataset2 = pd.read_csv('./GenerateData2.csv')
dataset3 = pd.read_csv('./GenerateData3.csv') 

Replace label values of -1 with 0

In [6]:

dataset1.label.replace(-1, 0, inplace=True)
dataset2.label.replace(-1, 0, inplace=True)

Deduplicate, concatenate the tables, and drop unneeded columns

In [7]:

dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset12 = pd.concat([dataset1, dataset2], axis=0)
dataset12_y = dataset12.label
dataset12_x = dataset12.drop(['user_id', 'label', 'day_gap_before', 'coupon_id', 'day_gap_after'], axis=1)

In [8]:

dataset3.drop_duplicates(inplace=True)
dataset3_preds = dataset3[['user_id', 'coupon_id', 'date_received']]
dataset3_x = dataset3.drop(['user_id', 'coupon_id', 'date_received', 'day_gap_before', 'day_gap_after'], axis=1)
# Convert to DMatrix format
dataTrain = xgb.DMatrix(dataset12_x, label=dataset12_y)
dataTest = xgb.DMatrix(dataset3_x)

In [9]:

def myauc(test):
    testgroup = test.groupby(['coupon_id'])
    aucs = []
    for _, tmpdf in testgroup:
        # AUC is undefined when a coupon's samples contain only one class
        if len(tmpdf['label'].unique()) != 2:
            continue
        fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    return np.average(aucs)

XGBoost model score: 0.7885

In [11]:

params = {'booster': 'gbtree',
          'objective': 'rank:pairwise',
          'eval_metric': 'auc',
          'gamma': 0.1,
          'min_child_weight': 1.1,
          'max_depth': 5,
          'lambda': 10,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'colsample_bylevel': 0.7,
          'eta': 0.01,
          'tree_method': 'exact',
          'seed': 0,
          'nthread': 12
          }

In [12]:

watchlist = [(dataTrain, 'train')]
model = xgb.train(params, dataTrain, num_boost_round=3500, evals=watchlist)
[0]	train-auc:0.84293
[1]	train-auc:0.84883
[2]	train-auc:0.85255
[3]	train-auc:0.85333
[4]	train-auc:0.85568
[5]	train-auc:0.85745
[6]	train-auc:0.85878
[7]	train-auc:0.85870
[8]	train-auc:0.85880
[9]	train-auc:0.85984
[10]	train-auc:0.85987
... (output truncated)
[3487]	train-auc:0.90753
[3488]	train-auc:0.90754
[3489]	train-auc:0.90754
[3490]	train-auc:0.90755
[3491]	train-auc:0.90755
[3492]	train-auc:0.90755
[3493]	train-auc:0.90755
[3494]	train-auc:0.90756
[3495]	train-auc:0.90756
[3496]	train-auc:0.90756
[3497]	train-auc:0.90757
[3498]	train-auc:0.90757
[3499]	train-auc:0.90757

In [13]:

model.save_model('./xgbmodel')

In [12]:

model = xgb.Booster(params)

In [13]:

model.load_model('./xgbmodel')

In [17]:

dataset3_preds1 = dataset3_preds.copy()
dataset3_preds1['label'] = model.predict(dataTest)

In [19]:

dataset3_preds1.label.head()

Out[19]:

0   -1.927854
1    0.834743
2   -2.466245
3   -1.992080
4   -0.544283
Name: label, dtype: float32
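
Note that rank:pairwise outputs unbounded ranking margins rather than probabilities, which is why some of the raw predictions above are negative; the MinMaxScaler step below rescales them into [0, 1] before the submission file is written.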

In [21]:

dataset3_preds1.label = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
    dataset3_preds1.label.values.reshape(-1, 1))
dataset3_preds1.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset3_preds1.to_csv("./xgb_preds.csv", index=None, header=None)
print(dataset3_preds1.describe())
            user_id      coupon_id  date_received          label
count  1.128030e+05  112803.000000   1.128030e+05  112803.000000
mean   3.684618e+06    9064.658006   2.016072e+07       0.374507
std    2.126358e+06    4147.283515   9.017693e+00       0.130249
min    2.090000e+02       3.000000   2.016070e+07       0.000000
25%    1.843824e+06    5035.000000   2.016071e+07       0.292860
50%    3.683073e+06    9983.000000   2.016072e+07       0.355278
75%    5.525176e+06   13602.000000   2.016072e+07       0.443395
max    7.361024e+06   14045.000000   2.016073e+07       1.000000

In [22]:

dataset3_preds1.label.head()

Out[22]:

88774    0.201625
58111    0.210430
25100    0.218126
79286    0.224153
59129    0.241302
Name: label, dtype: float32

In [17]:

model = xgb.Booster()
model.load_model('./xgbmodel')

In [18]:

temp = dataset12[['coupon_id', 'label']].copy()
temp['pred'] = model.predict(xgb.DMatrix(dataset12_x))
temp.pred = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(temp['pred'].values.reshape(-1, 1))
print(myauc(temp))
0.7733047598560868


Training the individual models

In [33]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from datetime import date
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.linear_model import SGDClassifier, LogisticRegression
import lightgbm as lgb

Load the data

In [3]:

def get_processed_data():
    dataset1 = pd.read_csv('./GenerateData1.csv')
    dataset2 = pd.read_csv('./GenerateData2.csv')
    dataset3 = pd.read_csv('./GenerateData3.csv') 
    dataset1.label.replace(-1, 0, inplace=True)
    dataset2.label.replace(-1, 0, inplace=True)
    dataset1.drop_duplicates(inplace=True)
    dataset2.drop_duplicates(inplace=True)
    dataset3.drop_duplicates(inplace=True) 
    # Concatenate vertically (axis=0): both datasets share the same feature pipeline, so their rows can simply be stacked
    dataset12 = pd.concat([dataset1, dataset2], axis=0)
    dataset12.fillna(-1, inplace=True)
#     dataset3.fillna(0, inplace=True)
    return dataset12, dataset3

In [4]:

dataset12, dataset3 = get_processed_data()

In [5]:

predict_dataset = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset12_label = dataset12.label
# Reduce dimensionality: drop columns that are not useful as features
dataset12_x = dataset12.drop(['user_id','label','coupon_id','day_gap_before','day_gap_after'],axis=1)
dataset3.fillna(-1, inplace=True)
dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1)

Train/test split

In [6]:

x_train, x_test, y_train, y_test = train_test_split(dataset12_x, dataset12_label, test_size=0.25, random_state=88)

In [7]:

x_train.shape, x_test.shape, y_train.shape, y_test.shape

Out[7]:

((328240, 52), (109414, 52), (328240,), (109414,))

Model training

Random forest score: 0.7790

In [10]:

model = RandomForestClassifier(n_estimators=190, 
                               criterion='gini', 
                               bootstrap=True,  
                               max_depth=15, 
                               max_features=24, 
                               min_samples_leaf=5, 
                               oob_score=True, 
                               random_state=0, 
                               n_jobs=-1)

In [11]:

model.fit(x_train, y_train)

Out[11]:

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features=24,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=190,
                       n_jobs=-1, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)
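
Since oob_score=True was passed above, an out-of-bag estimate of the generalization accuracy is also available without a separate validation set:

print(model.oob_score_)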

In [13]:

model.score(x_test, y_test)

Out[13]:

0.9399071416820517

In [14]:

y_predict_proba = model.predict_proba(x_test)

In [17]:

y_predict_proba[:, 1].itemsize

Out[17]:

8
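
(itemsize is just the size in bytes of one array element; 8 means the predicted probabilities are stored as float64.)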

In [20]:

print("AUC", roc_auc_score(y_test, y_predict_proba[:, 1]))
AUC 0.8979076720483452

In [21]:

dataset_preds = dataset3[['user_id','coupon_id','date_received']].copy()  # copy to avoid SettingWithCopyWarning
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("rdf_preds1.csv",index=None,header=None)

GBDT score: 0.7297

In [24]:

model = GradientBoostingClassifier(learning_rate=0.1,
                                   n_estimators=190,
                                   min_samples_split=5,
                                   min_samples_leaf=5,
                                   max_depth=15,
                                   random_state=0,
                                   max_features=24)
model.fit(x_train, y_train)

Out[24]:

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=15,
                           max_features=24, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=5, min_samples_split=5,
                           min_weight_fraction_leaf=0.0, n_estimators=190,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [25]:

model.score(x_test, y_test)

Out[25]:

0.9373754729742081

In [32]:

x_test.shape

Out[32]:

(109414, 52)

In [28]:

y_predict_proba = model.predict_proba(x_test)
print("AUC:", roc_auc_score(y_test, y_predict_proba[:, 1]))
AUC: 0.8692195079846718

In [30]:

y_predict_proba

Out[30]:

array([[4.16554643e-01, 5.83445357e-01],
       [9.98395049e-01, 1.60495139e-03],
       [9.87593646e-01, 1.24063544e-02],
       ...,
       [9.46224733e-01, 5.37752665e-02],
       [8.65366794e-01, 1.34633206e-01],
       [9.99404371e-01, 5.95628598e-04]])

In [31]:

y_predict_proba.itemsize

Out[31]:

8

In [29]:

dataset_preds = dataset3[['user_id','coupon_id','date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("gbdt_preds2.csv",index=None,header=None)

LightGBM score: 0.7869

In [34]:

# LGBMClassifier parameter notes:
# 1. boosting_type='gbdt': boosting type (gbdt, dart, goss, or rf)
# 2. num_leaves=32: maximum number of leaves per tree; versus xgboost, roughly 2^(max_depth)
# 3. max_depth=-1: maximum tree depth (-1 means unlimited)
# 4. learning_rate: learning rate
# 5. n_estimators=10: number of trees to fit, i.e. the number of boosting rounds
# 6. subsample=1.0: row (training sample) subsampling ratio
# 7. colsample_bytree=1.0: column (feature) subsampling ratio
# 8. subsample_freq=1: frequency of row subsampling
# 9. reg_alpha=0.0: L1 regularization coefficient
# 10. reg_lambda=0.0: L2 regularization coefficient
# 11. random_state=None: random seed
# 12. n_jobs=-1: number of parallel threads
# 13. silent=True: whether to suppress training log output
# 14. min_split_gain=0.0: minimum gain required to perform a split
# 15. min_child_weight=0.001: minimum sum of instance weight needed in a child
# 16. sub_feature=0.7: LightGBM randomly selects 70% of the features in each iteration (tree)
model = lgb.LGBMClassifier(
                    learning_rate = 0.01,
                    boosting_type = 'gbdt',
                    objective = 'binary',
                    metric = 'logloss',
                    max_depth = 5,
                    sub_feature = 0.7,
                    num_leaves = 3,
                    colsample_bytree = 0.7,
                    n_estimators = 5000,
                    early_stop = 50)  # note: not a standard LGBMClassifier argument; without an eval_set it has no effect
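
If early stopping is actually wanted here, the sklearn API of LightGBM (in the releases current when this was written) takes it at fit time rather than in the constructor. A minimal sketch, assuming a held-out evaluation set such as the x_test/y_test from the split above:

model.fit(x_train, y_train,
          eval_set=[(x_test, y_test)],   # validation data to monitor
          eval_metric='auc',             # metric used to decide when to stop
          early_stopping_rounds=50)      # stop after 50 rounds without improvement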

In [35]:

model.fit(x_train, y_train)

Out[35]:

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               early_stop=50, importance_type='split', learning_rate=0.01,
               max_depth=5, metric='logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=5000,
               n_jobs=-1, num_leaves=3, objective='binary', random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, sub_feature=0.7,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
               verbose=-1)

In [36]:

model.score(x_test, y_test)

Out[36]:

0.9350448754272762

In [37]:

y_predict_proba = model.predict_proba(x_test)

In [38]:

print("AUC:", roc_auc_score(y_test, y_predict_proba[:, 1]))
AUC: 0.8819782907036887

In [39]:

dataset_preds = dataset3[['user_id','coupon_id','date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("lightGBM_preds.csv",index=None,header=None)

Logistic regression score: 0.6932

In [49]:

# note: l1_ratio only takes effect with penalty='elasticnet' (solver='saga'); with the default penalty='l2' it is ignored here
model = LogisticRegression(max_iter=1000, n_jobs=-1, l1_ratio=0.01, random_state=22)

In [50]:

model.fit(x_train, y_train)

Out[50]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.01, max_iter=1000,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=22,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [51]:

model.score(x_test, y_test)

Out[51]:

0.9256036704626465

In [52]:

y_predict_proba = model.predict_proba(x_test)

In [56]:

print("AUC:", roc_auc_score(y_test, y_predict_proba[:, 1]))
AUC: 0.8190074583111724

In [57]:

dataset_preds = dataset3[['user_id','coupon_id','date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("LOG_preds1.csv",index=None,header=None)

Logistic regression via SGDClassifier score: 0.6119

In [58]:

# fit_intercept: whether to fit the intercept (bias) term
model = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha = 0.01,
    l1_ratio = 0.01,
    n_jobs=1)
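
With loss='log', SGDClassifier fits a logistic-regression model by stochastic gradient descent, which is what makes predict_proba available below.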

In [59]:

model.fit(x_train, y_train)

Out[59]:

SGDClassifier(alpha=0.01, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.01, learning_rate='optimal', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=1, penalty='elasticnet', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [60]:

model.score(x_test, y_test)

Out[60]:

0.8991719524009725

In [61]:

y_predict_proba = model.predict_proba(x_test)

In [62]:

print("AUC:", roc_auc_score(y_test, y_predict_proba[:, 1]))
AUC: 0.7653896044259063

In [63]:

dataset_preds = dataset3[['user_id','coupon_id','date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("SGD_preds1.csv",index=None,header=None)

XGBoost XGBClassifier score: 0.7551

In [64]:

from xgboost import XGBClassifier

In [65]:

# Tune one parameter at a time while keeping the others fixed, as with the random forest above; see the sketch after this cell
# note: eta is an alias of learning_rate, so passing both (eta=1, learning_rate=0.01) is contradictory and best avoided
model = XGBClassifier(max_depth=15, learning_rate=0.01, eta=1, gamma=0, n_jobs=-1)
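
A minimal sketch of that one-parameter-at-a-time idea using GridSearchCV (the grid values here are hypothetical, not taken from the original run):

from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [5, 9, 12, 15]}   # vary one parameter, hold the rest fixed
search = GridSearchCV(XGBClassifier(learning_rate=0.01, n_jobs=-1),
                      param_grid, scoring='roc_auc', cv=3)
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)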

In [66]:

model.fit(x_train, y_train)

Out[66]:

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [67]:

model.score(x_test, y_test)

Out[67]:

0.939221671815307

In [68]:

y_predict_proba = model.predict_proba(x_test)

In [69]:

print("AUC:", roc_auc_score(y_test, y_predict_proba[:, 1]))
AUC: 0.892734335681119

In [70]:

dataset_preds = dataset3[['user_id','coupon_id','date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("XGBC_preds1.csv",index=None,header=None)
  