Alibaba Cloud official site: Tianchi Beginner Competition, O2O Coupon Usage Prediction
Dataset download link: https://pan.baidu.com/s/13OtaUv6j4x8dD7cgD4sL5g
Extraction code: 7tze
Sklearn: Tianchi Beginner Competition, O2O Coupon Usage Prediction, part 1
Sklearn: Tianchi Beginner Competition, O2O Coupon Usage Prediction, part 2
Sklearn: Tianchi Beginner Competition, O2O Coupon Usage Prediction, part 3
New feature engineering: tuning XGBoost with xgb.cv
In [1]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
import datetime
import os
Load the data
Missing-value handling
In [2]:
def get_processed_data():
    dataset1 = pd.read_csv('data_preprocessed_2/ProcessDataSet1.csv')
    dataset2 = pd.read_csv('data_preprocessed_2/ProcessDataSet2.csv')
    dataset3 = pd.read_csv('data_preprocessed_2/ProcessDataSet3.csv')

    # drop exact duplicate rows
    dataset1.drop_duplicates(inplace=True)
    dataset2.drop_duplicates(inplace=True)
    dataset3.drop_duplicates(inplace=True)

    # stack the two training windows vertically and fill missing feature values with 0
    dataset12 = pd.concat([dataset1, dataset2], axis=0)
    dataset12.fillna(0, inplace=True)
    dataset3.fillna(0, inplace=True)

    return dataset12, dataset3
Model training
Preparing the training set
In [3]:
def train_xgb(dataset12, dataset3):
    predict_dataset = dataset3[['User_id', 'Coupon_id', 'Date_received']].copy()
    predict_dataset.Date_received = pd.to_datetime(predict_dataset.Date_received, format='%Y-%m-%d')
    predict_dataset.Date_received = predict_dataset.Date_received.dt.strftime('%Y%m%d')

    # Convert the data to DMatrix format
    dataset12_x = dataset12.drop(
        columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
                 'Date', 'Coupon_id', 'label'])
    dataset3_x = dataset3.drop(
        columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
                 'Coupon_id'])
    train_dmatrix = xgb.DMatrix(dataset12_x, label=dataset12.label)
    predict_dmatrix = xgb.DMatrix(dataset3_x)

    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'gamma': 0.1,
              'min_child_weight': 1.1,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.01,
              'tree_method': 'gpu_hist',
              'seed': 0,
              'nthread': cpu_jobs,  # cpu_jobs is a global set before train_xgb is called (see the run cell below)
              'predictor': 'cpu_predictor'
              }

    # Use xgb.cv to tune num_boost_round (early stopping on the cross-validated AUC)
    cvresult = xgb.cv(params, train_dmatrix, num_boost_round=10000, nfold=2, metrics='auc', seed=0, callbacks=[
        xgb.callback.print_evaluation(show_stdv=False),
        xgb.callback.early_stop(50)
    ])
    num_round_best = cvresult.shape[0] - 1
    print('Best round num: ', num_round_best)

    # Retrain the model with the tuned num_boost_round
    watchlist = [(train_dmatrix, 'train')]
    model = xgb.train(params, train_dmatrix, num_boost_round=num_round_best, evals=watchlist)
    model.save_model('train_dir_2/xgbmodel_cv_new')

    params['predictor'] = 'cpu_predictor'
    model_cv = xgb.Booster(params)
    model_cv.load_model('train_dir_2/xgbmodel_cv_new')

    # Predict the test set
    dataset3_predict = predict_dataset.copy()
    dataset3_predict['label'] = model_cv.predict(predict_dmatrix)

    # Normalize the predicted scores to [0, 1]
    dataset3_predict.label = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
        dataset3_predict.label.values.reshape(-1, 1))
    dataset3_predict.sort_values(by=['Coupon_id', 'label'], inplace=True)
    dataset3_predict.to_csv("train_dir_2/xgb_cv_preds.csv", index=None, header=None)
    print(dataset3_predict.describe())

    # Compute the per-coupon averaged AUC on dataset12
    # model = xgb.Booster()
    # model.load_model('train_dir_2/xgbmodel')
    temp = dataset12[['Coupon_id', 'label']].copy()
    temp['pred'] = model.predict(xgb.DMatrix(dataset12_x))
    temp.pred = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(temp['pred'].values.reshape(-1, 1))
    print(myauc(temp))
Performance evaluation function
In [4]:
# Performance evaluation function: average of the per-coupon AUCs
def myauc(test):
    testgroup = test.groupby(['Coupon_id'])
    aucs = []
    for i in testgroup:
        tmpdf = i[1]
        # coupons whose labels are all 0 or all 1 have no defined AUC, so skip them
        if len(tmpdf['label'].unique()) != 2:
            continue
        fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    return np.average(aucs)
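A minimal usage sketch with toy values (hypothetical data, only to show the expected Coupon_id / label / pred columns):

toy = pd.DataFrame({'Coupon_id': [1, 1, 1, 2, 2],
                    'label': [0, 1, 0, 1, 0],
                    'pred': [0.2, 0.9, 0.4, 0.8, 0.3]})
print(myauc(toy))  # mean of the per-coupon AUCs; single-class coupons are skipped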
Train and save the model (score: 0.7983)
In [ ]:
# Load the data
dataset12, dataset3 = get_processed_data()
In [6]:
dataset12.head()
Out[6]:
User_id | Merchant_id | Coupon_id | Discount_rate | Distance | Date_received | Date | discount_rate_x | discount_rate_y | discount_rate | ... | on_u4 | on_u5 | on_u6 | on_u7 | on_u8 | on_u9 | on_u10 | on_u11 | on_u12 | on_u13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1832624 | 3381 | 7610 | 200:20 | 0 | 2016-04-29 | 1970-01-01 | 200.0 | 20.0 | 0.900000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 163606 | 1569 | 5054 | 200:30 | 10 | 2016-04-21 | 1970-01-01 | 200.0 | 30.0 | 0.850000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 1113008 | 1361 | 11166 | 20:1 | 0 | 2016-05-15 | 2016-05-21 | 20.0 | 1.0 | 0.950000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 4061024 | 3381 | 7610 | 200:20 | 10 | 2016-04-26 | 1970-01-01 | 200.0 | 20.0 | 0.900000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 106443 | 450 | 3732 | 30:5 | 11 | 2016-04-29 | 1970-01-01 | 30.0 | 5.0 | 0.833333 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 124 columns
In [8]:
dataset12.shape
Out[8]:
(383386, 124)
In [9]:
dataset12.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 383386 entries, 0 to 252585
Columns: 124 entries, User_id to on_u13
dtypes: float64(96), int64(25), object(3)
memory usage: 375.6+ MB
In [11]:
dataset12.describe()
Out[11]:
User_id | Merchant_id | Coupon_id | Distance | discount_rate_x | discount_rate_y | discount_rate | label | weekday | day | ... | on_u4 | on_u5 | on_u6 | on_u7 | on_u8 | on_u9 | on_u10 | on_u11 | on_u12 | on_u13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3.833860e+05 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | ... | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.0 | 383386.000000 | 383386.000000 | 383386.0 | 383386.000000 | 383386.000000 |
mean | 3.683603e+06 | 3653.920730 | 6287.911630 | 3.216938 | 58.047151 | 8.606796 | 0.838597 | 0.082546 | 3.086717 | 17.428612 | ... | 0.538374 | 0.165142 | 0.078388 | 0.019834 | 0.0 | 0.078388 | 0.041178 | 0.0 | 0.001983 | 0.001983 |
std | 2.123219e+06 | 2577.836469 | 3938.971496 | 4.154925 | 59.771475 | 8.860719 | 0.092783 | 0.275195 | 1.984455 | 8.455349 | ... | 1.979329 | 0.355102 | 0.502537 | 0.119940 | 0.0 | 0.502537 | 0.198702 | 0.0 | 0.032386 | 0.032386 |
min | 4.000000e+00 | 2.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.333333 | 0.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
25% | 1.843305e+06 | 1244.000000 | 2418.000000 | 0.000000 | 20.000000 | 5.000000 | 0.800000 | 0.000000 | 1.000000 | 11.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
50% | 3.685242e+06 | 3381.000000 | 5584.000000 | 1.000000 | 30.000000 | 5.000000 | 0.833333 | 0.000000 | 3.000000 | 19.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
75% | 5.522414e+06 | 5803.000000 | 9566.000000 | 6.000000 | 100.000000 | 10.000000 | 0.900000 | 0.000000 | 5.000000 | 24.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
max | 7.360961e+06 | 8856.000000 | 14045.000000 | 11.000000 | 300.000000 | 100.000000 | 0.990000 | 1.000000 | 6.000000 | 31.000000 | ... | 319.000000 | 1.000000 | 64.000000 | 1.000000 | 0.0 | 64.000000 | 1.000000 | 0.0 | 0.985915 | 0.985915 |
8 rows × 121 columns
In [12]:
print([column for column in dataset12])
['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received', 'Date', 'discount_rate_x', 'discount_rate_y', 'discount_rate', 'label', 'weekday', 'day', 'u2', 'u3', 'u19', 'u1', 'u4', 'u5', 'u25', 'u20', 'u6', 'u7', 'u8', 'u9', 'u10', 'u11', 'u21', 'u22', 'u23', 'u24', 'u45', 'u27', 'u28', 'u32', 'u47', 'u33', 'u34', 'u35', 'u36', 'u37', 'discount_type', 'u41', 'u42', 'u43', 'u44', 'u48', 'u49', 'm0', 'm1', 'm2', 'm3', 'm4', 'm7', 'm5', 'm6', 'm8', 'm9', 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm18', 'm19', 'm20', 'm21', 'm22', 'm23', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c8', 'c9', 'c10', 'c11', 'c12', 'um1', 'um2', 'um3', 'um4', 'um5', 'um6', 'um7', 'um8', 'um9', 'um10', 'um11', 'um12', 'o1', 'o2', 'o17', 'o18', 'o3', 'o4', 'o5', 'o6', 'o7', 'o8', 'o9', 'o10', 'o11', 'o12', 'o13', 'o14', 'o15', 'o16', 'on_u1', 'on_u2', 'on_u3', 'on_u4', 'on_u5', 'on_u6', 'on_u7', 'on_u8', 'on_u9', 'on_u10', 'on_u11', 'on_u12', 'on_u13']
In [10]:
dataset3.shape
Out[10]:
(112803, 122)
In [5]:
start = datetime.datetime.now()
print(start.strftime('%Y-%m-%d %H:%M:%S'))
cpu_jobs = os.cpu_count() - 1
date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')
dataset12, dataset3 = get_processed_data()
train_xgb(dataset12, dataset3)
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print('time costed is: %s s' % (datetime.datetime.now() - start).seconds)
2020-03-06 13:07:29
[0] train-auc:0.82972 test-auc:0.82835
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.
Will train until test-auc hasn't improved in 50 rounds.
[1] train-auc:0.83615 test-auc:0.83513
[2] train-auc:0.84745 test-auc:0.84609
[3] train-auc:0.85015 test-auc:0.84850
[4] train-auc:0.85415 test-auc:0.85298
[5] train-auc:0.85444 test-auc:0.85324
[6] train-auc:0.85859 test-auc:0.85735
[7] train-auc:0.86065 test-auc:0.85951
[8] train-auc:0.86068 test-auc:0.85961
[9] train-auc:0.86198 test-auc:0.86090
[10] train-auc:0.86290 test-auc:0.86175
...
[6190] train-auc:0.93079 test-auc:0.90374
[6191] train-auc:0.93079 test-auc:0.90374
[6192] train-auc:0.93079 test-auc:0.90374
[6193] train-auc:0.93080 test-auc:0.90374
[6194] train-auc:0.93080 test-auc:0.90374
[6195] train-auc:0.93081 test-auc:0.90374
[6196] train-auc:0.93081 test-auc:0.90374
[6197] train-auc:0.93081 test-auc:0.90374
[6198] train-auc:0.93082 test-auc:0.90374
[6199] train-auc:0.93082 test-auc:0.90374
[6200] train-auc:0.93083 test-auc:0.90375
[6201] train-auc:0.93083 test-auc:0.90375
[6202] train-auc:0.93083 test-auc:0.90375
[6203] train-auc:0.93084 test-auc:0.90375
[6204] train-auc:0.93084 test-auc:0.90375
[6205] train-auc:0.93084 test-auc:0.90375
[6206] train-auc:0.93085 test-auc:0.90375
[6207] train-auc:0.93085 test-auc:0.90375
Stopping. Best iteration:
[6157] train-auc:0.93068+0.00010 test-auc:0.90375+0.00058
Best round num: 6157
[0] train-auc:0.84011
[1] train-auc:0.84200
[2] train-auc:0.85052
[3] train-auc:0.85268
[4] train-auc:0.85981
[5] train-auc:0.85955
[6] train-auc:0.86228
[7] train-auc:0.86348
[8] train-auc:0.86390
[9] train-auc:0.86610
[10] train-auc:0.86655
...
[6120] train-auc:0.92180
[6121] train-auc:0.92180
[6122] train-auc:0.92180
[6123] train-auc:0.92181
[6124] train-auc:0.92181
[6125] train-auc:0.92181
[6126] train-auc:0.92181
[6127] train-auc:0.92181
[6128] train-auc:0.92182
[6129] train-auc:0.92182
[6130] train-auc:0.92182
[6131] train-auc:0.92182
[6132] train-auc:0.92183
[6133] train-auc:0.92183
[6134] train-auc:0.92183
[6135] train-auc:0.92183
[6136] train-auc:0.92184
[6137] train-auc:0.92184
[6138] train-auc:0.92184
[6139] train-auc:0.92184
[6140] train-auc:0.92185
[6141] train-auc:0.92185
[6142] train-auc:0.92185
[6143] train-auc:0.92185
[6144] train-auc:0.92186
[6145] train-auc:0.92186
[6146] train-auc:0.92186
[6147] train-auc:0.92187
[6148] train-auc:0.92187
[6149] train-auc:0.92187
[6150] train-auc:0.92187
[6151] train-auc:0.92188
[6152] train-auc:0.92188
[6153] train-auc:0.92188
[6154] train-auc:0.92188
[6155] train-auc:0.92189
[6156] train-auc:0.92189
User_id Coupon_id label
count 1.128030e+05 112803.000000 112803.000000
mean 3.684618e+06 9064.658006 0.085931
std 2.126358e+06 4147.283515 0.165224
min 2.090000e+02 3.000000 0.000000
25% 1.843824e+06 5035.000000 0.009209
50% 3.683073e+06 9983.000000 0.025507
75% 5.525176e+06 13602.000000 0.064142
max 7.361024e+06 14045.000000 1.000000
0.8090085946857051
2020-03-06 15:02:05
time costed is: 6875 s
xgboost_demo
In [4]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.model_selection import train_test_split
Read the processed feature files
In [5]:
dataset1 = pd.read_csv('./GenerateData1.csv')
dataset2 = pd.read_csv('./GenerateData2.csv')
dataset3 = pd.read_csv('./GenerateData3.csv')
Replace -1 labels with 0
In [6]:
dataset1.label.replace(-1, 0, inplace=True)
dataset2.label.replace(-1, 0, inplace=True)
Deduplicate, merge the tables, and drop unneeded columns
In [7]:
dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset12 = pd.concat([dataset1, dataset2], axis=0)
dataset12_y = dataset12.label
dataset12_x = dataset12.drop(['user_id', 'label', 'day_gap_before', 'coupon_id', 'day_gap_after'], axis=1)
In [8]:
dataset3.drop_duplicates(inplace=True)
dataset3_preds = dataset3[['user_id', 'coupon_id', 'date_received']]
dataset3_x = dataset3.drop(['user_id', 'coupon_id', 'date_received', 'day_gap_before', 'day_gap_after'], axis=1)
# Convert the data to DMatrix format
dataTrain = xgb.DMatrix(dataset12_x, label=dataset12_y)
dataTest = xgb.DMatrix(dataset3_x)
In [9]:
def myauc(test):
    testgroup = test.groupby(['coupon_id'])
    aucs = []
    for i in testgroup:
        tmpdf = i[1]
        if len(tmpdf['label'].unique()) != 2:
            continue
        fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    return np.average(aucs)
XGBoost model, score: 0.7885
In [11]:
params = {'booster': 'gbtree',
'objective': 'rank:pairwise',
'eval_metric': 'auc',
'gamma': 0.1,
'min_child_weight': 1.1,
'max_depth': 5,
'lambda': 10,
'subsample': 0.7,
'colsample_bytree': 0.7,
'colsample_bylevel': 0.7,
'eta': 0.01,
'tree_method': 'exact',
'seed': 0,
'nthread': 12
}
In [12]:
watchlist = [(dataTrain, 'train')]
model = xgb.train(params, dataTrain, num_boost_round=3500, evals=watchlist)
[0] train-auc:0.84293
[1] train-auc:0.84883
[2] train-auc:0.85255
[3] train-auc:0.85333
[4] train-auc:0.85568
[5] train-auc:0.85745
[6] train-auc:0.85878
[7] train-auc:0.85870
[8] train-auc:0.85880
[9] train-auc:0.85984
[10] train-auc:0.85987
...
[3487] train-auc:0.90753
[3488] train-auc:0.90754
[3489] train-auc:0.90754
[3490] train-auc:0.90755
[3491] train-auc:0.90755
[3492] train-auc:0.90755
[3493] train-auc:0.90755
[3494] train-auc:0.90756
[3495] train-auc:0.90756
[3496] train-auc:0.90756
[3497] train-auc:0.90757
[3498] train-auc:0.90757
[3499] train-auc:0.90757
In [13]:
model.save_model('./xgbmodel')
In [12]:
model = xgb.Booster(params)
In [13]:
model.load_model('./xgbmodel')
In [17]:
dataset3_preds1 = dataset3_preds.copy()
dataset3_preds1['label'] = model.predict(dataTest)
In [19]:
dataset3_preds1.label.head()
Out[19]:
0 -1.927854
1 0.834743
2 -2.466245
3 -1.992080
4 -0.544283
Name: label, dtype: float32
In [21]:
dataset3_preds1.label = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
dataset3_preds1.label.values.reshape(-1, 1))
dataset3_preds1.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset3_preds1.to_csv("./xgb_preds.csv", index=None, header=None)
print(dataset3_preds1.describe())
user_id coupon_id date_received label
count 1.128030e+05 112803.000000 1.128030e+05 112803.000000
mean 3.684618e+06 9064.658006 2.016072e+07 0.374507
std 2.126358e+06 4147.283515 9.017693e+00 0.130249
min 2.090000e+02 3.000000 2.016070e+07 0.000000
25% 1.843824e+06 5035.000000 2.016071e+07 0.292860
50% 3.683073e+06 9983.000000 2.016072e+07 0.355278
75% 5.525176e+06 13602.000000 2.016072e+07 0.443395
max 7.361024e+06 14045.000000 2.016073e+07 1.000000
In [22]:
dataset3_preds1.label.head()
Out[22]:
88774 0.201625
58111 0.210430
25100 0.218126
79286 0.224153
59129 0.241302
Name: label, dtype: float32
In [17]:
model = xgb.Booster()
model.load_model('./xgbmodel')
In [18]:
temp = dataset12[['coupon_id', 'label']].copy()
temp['pred'] = model.predict(xgb.DMatrix(dataset12_x))
temp.pred = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(temp['pred'].values.reshape(-1, 1))
print(myauc(temp))
0.7733047598560868
Training the individual models
In [33]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from datetime import date
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.linear_model import SGDClassifier, LogisticRegression
import lightgbm as lgb
Load the data
In [3]:
def get_processed_data():
    dataset1 = pd.read_csv('./GenerateData1.csv')
    dataset2 = pd.read_csv('./GenerateData2.csv')
    dataset3 = pd.read_csv('./GenerateData3.csv')

    dataset1.label.replace(-1, 0, inplace=True)
    dataset2.label.replace(-1, 0, inplace=True)

    dataset1.drop_duplicates(inplace=True)
    dataset2.drop_duplicates(inplace=True)
    dataset3.drop_duplicates(inplace=True)

    # Concatenate along rows (axis=0): both datasets share the same feature columns,
    # so stacking them vertically just adds more training samples.
    dataset12 = pd.concat([dataset1, dataset2], axis=0)
    dataset12.fillna(-1, inplace=True)
    # dataset3.fillna(0, inplace=True)

    return dataset12, dataset3
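A quick illustration of the concat direction, using two toy frames (assumed only for demonstration):

a = pd.DataFrame({'f1': [1, 2], 'f2': [3, 4]})
b = pd.DataFrame({'f1': [5], 'f2': [6]})
print(pd.concat([a, b], axis=0).shape)  # (3, 2): rows stacked vertically, same columns
print(pd.concat([a, b], axis=1).shape)  # (2, 4): columns placed side by side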
In [4]:
dataset12, dataset3 = get_processed_data()
In [5]:
predict_dataset = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset12_label = dataset12.label
# Reduce dimensionality: drop columns that are not needed as features
dataset12_x = dataset12.drop(['user_id','label','coupon_id','day_gap_before','day_gap_after'],axis=1)
dataset3.fillna(-1, inplace=True)
dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1)
Train/test split
In [6]:
x_train, x_test, y_train, y_test = train_test_split(dataset12_x, dataset12_label, test_size=0.25, random_state=88)
In [7]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape
Out[7]:
((328240, 52), (109414, 52), (328240,), (109414,))
Model training
Random forest, score: 0.7790
In [10]:
model = RandomForestClassifier(n_estimators=190,
criterion='gini',
bootstrap=True,
max_depth=15,
max_features=24,
min_samples_leaf=5,
oob_score=True,
random_state=0,
n_jobs=-1)
In [11]:
model.fit(x_train, y_train)
Out[11]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=15, max_features=24,
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=5, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=190,
n_jobs=-1, oob_score=True, random_state=0, verbose=0,
warm_start=False)
In [13]:
model.score(x_test, y_test)
Out[13]:
0.9399071416820517
In [14]:
y_predict_proba = model.predict_proba(x_test)
In [17]:
y_predict_proba[:, 1].itemsize
Out[17]:
8
In [20]:
print("AUC",roc_auc_score(y_test,y_predict_proba[:,1]))
AUC 0.8979076720483452
In [21]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("rdf_preds1.csv",index=None,header=None)
GBDT, score: 0.7297
In [24]:
model =GradientBoostingClassifier(learning_rate=0.1,
n_estimators=190,
min_samples_split=5,
min_samples_leaf=5,
max_depth=15,
random_state=0,
max_features=24,)
model.fit(x_train, y_train)
Out[24]:
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=15,
max_features=24, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=5, min_samples_split=5,
min_weight_fraction_leaf=0.0, n_estimators=190,
n_iter_no_change=None, presort='deprecated',
random_state=0, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False)
In [25]:
model.score(x_test, y_test)
Out[25]:
0.9373754729742081
In [32]:
x_test.shape
Out[32]:
(109414, 52)
In [28]:
y_predict_proba = model.predict_proba(x_test)
print("AUC準確率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC準確率: 0.8692195079846718
In [30]:
y_predict_proba
Out[30]:
array([[4.16554643e-01, 5.83445357e-01],
[9.98395049e-01, 1.60495139e-03],
[9.87593646e-01, 1.24063544e-02],
...,
[9.46224733e-01, 5.37752665e-02],
[8.65366794e-01, 1.34633206e-01],
[9.99404371e-01, 5.95628598e-04]])
In [31]:
y_predict_proba.itemsize
Out[31]:
8
In [29]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("gbdt_preds2.csv",index=None,header=None)
LightGBM, score: 0.7869
In [34]:
# 1. boosting_type='gbdt'     # type of boosting: gbdt, dart, goss, rf
# 2. num_leaves=32            # maximum number of leaves per tree; roughly 2^max_depth compared with xgboost
# 3. max_depth=-1             # maximum tree depth (-1 means no limit)
# 4. learning_rate            # learning rate
# 5. n_estimators=10          # number of boosted trees to fit, i.e. the number of training rounds
# 6. subsample=1.0            # row (sample) subsampling ratio
# 7. colsample_bytree=1.0     # column (feature) subsampling ratio per tree
# 8. subsample_freq=1         # subsampling frequency
# 9. reg_alpha=0.0            # L1 regularization coefficient
# 10. reg_lambda=0.0          # L2 regularization coefficient
# 11. random_state=None       # random seed
# 12. n_jobs=-1               # number of parallel threads
# 13. silent=True             # whether to suppress log output during training
# 14. min_split_gain=0.0      # minimum gain required to make a split
# 15. min_child_weight=0.001  # minimum weight of a child (leaf) node
# 16. sub_feature             # LightGBM picks a random subset of features for each tree, here 70% of them
model = lgb.LGBMClassifier(
    learning_rate=0.01,
    boosting_type='gbdt',
    objective='binary',
    metric='logloss',
    max_depth=5,
    sub_feature=0.7,
    num_leaves=3,
    colsample_bytree=0.7,
    n_estimators=5000,
    early_stop=50)
In [35]:
model.fit(x_train, y_train)
Out[35]:
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
early_stop=50, importance_type='split', learning_rate=0.01,
max_depth=5, metric='logloss', min_child_samples=20,
min_child_weight=0.001, min_split_gain=0.0, n_estimators=5000,
n_jobs=-1, num_leaves=3, objective='binary', random_state=None,
reg_alpha=0.0, reg_lambda=0.0, silent=True, sub_feature=0.7,
subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
verbose=-1)
In [36]:
model.score(x_test, y_test)
Out[36]:
0.9350448754272762
In [37]:
y_predict_proba = model.predict_proba(x_test)
In [38]:
print("AUC準確率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC準確率: 0.8819782907036887
In [39]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("lightGBM_preds.csv",index=None,header=None)
Logistic regression, score: 0.6932
In [49]:
model = LogisticRegression(max_iter=1000, n_jobs=-1, l1_ratio=0.01, random_state=22)
In [50]:
model.fit(x_train, y_train)
Out[50]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=0.01, max_iter=1000,
multi_class='auto', n_jobs=-1, penalty='l2', random_state=22,
solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
In [51]:
model.score(x_test, y_test)
Out[51]:
0.9256036704626465
In [52]:
y_predict_proba = model.predict_proba(x_test)
In [56]:
print("AUC準確率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC準確率: 0.8190074583111724
In [57]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("LOG_preds1.csv",index=None,header=None)
Logistic regression with SGDClassifier, score: 0.6119
In [58]:
# fit_intercept: whether to fit an intercept (bias) term
model = SGDClassifier(
loss='log',
penalty='elasticnet',
fit_intercept=True,
max_iter=100,
shuffle=True,
alpha = 0.01,
l1_ratio = 0.01,
n_jobs=1)
In [59]:
model.fit(x_train, y_train)
Out[59]:
SGDClassifier(alpha=0.01, average=False, class_weight=None,
early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
l1_ratio=0.01, learning_rate='optimal', loss='log', max_iter=100,
n_iter_no_change=5, n_jobs=1, penalty='elasticnet', power_t=0.5,
random_state=None, shuffle=True, tol=0.001,
validation_fraction=0.1, verbose=0, warm_start=False)
In [60]:
model.score(x_test, y_test)
Out[60]:
0.8991719524009725
In [61]:
y_predict_proba = model.predict_proba(x_test)
In [62]:
print("AUC準確率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC準確率: 0.7653896044259063
In [63]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("SGD_preds1.csv",index=None,header=None)
XGBoost model with XGBClassifier, score: 0.7551
In [64]:
from xgboost import XGBClassifier
In [65]:
# Tune one parameter at a time while keeping the others fixed, similar to how the random forest model was tuned
# (a small GridSearchCV sketch follows below).
model = XGBClassifier(max_depth=15, learning_rate=0.01,eta=1, gamma=0, n_jobs=-1)
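A minimal sketch of that one-parameter-at-a-time idea with sklearn's GridSearchCV (the max_depth grid below is hypothetical; the other hyperparameters stay fixed):

from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [5, 10, 15]}   # vary a single parameter, hold the rest fixed
search = GridSearchCV(XGBClassifier(learning_rate=0.01, n_jobs=-1),
                      param_grid, scoring='roc_auc', cv=3)
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)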
In [66]:
model.fit(x_train, y_train)
Out[66]:
XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eta=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints=None,
learning_rate=0.01, max_delta_step=0, max_depth=15,
min_child_weight=1, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1, num_parallel_tree=1,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
validate_parameters=False, verbosity=None)
In [67]:
model.score(x_test, y_test)
Out[67]:
0.939221671815307
In [68]:
y_predict_proba = model.predict_proba(x_test)
In [69]:
print("AUC準確率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC準確率: 0.892734335681119
In [70]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("XGBC_preds1.csv",index=None,header=None)