# -*- coding: utf-8 -*-
# Produced by wt
import pandas as pd
from sklearn.metrics import roc_auc_score,roc_curve,auc
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import numpy as np
import random
import math
import time
import lightgbm as lgb
data = pd.read_csv('../A.txt')
data.head()
# Check the month distribution; we use the last month as the out-of-time validation set
data.obs_mth.unique()
df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()
# These are all of our features: columns ending in info are outputs of our in-house
# unsupervised system; columns ending in score are paid external credit-bureau scores
lst = ['person_info','finance_info','credit_info','act_info','td_score','jxl_score','mj_score','rh_score']
df_train = df_train.sort_values(by = 'obs_mth',ascending = False)
df_train.head()
# Rank rows by time (newest first, from the sort above) and convert the rank
# to a percentile in (0, 1]
df_train['rank'] = np.arange(1, len(df_train) + 1) / len(df_train)
# Bucket the percentile rank into 5 equal-size, time-ordered folds
# (1 = most recent 20% of observations, 5 = oldest 20%)
pct_lst = []
for x in df_train['rank']:
    if x <= 0.2:
        x = 1
    elif x <= 0.4:
        x = 2
    elif x <= 0.6:
        x = 3
    elif x <= 0.8:
        x = 4
    else:
        x = 5
    pct_lst.append(x)
df_train['rank'] = pct_lst
df_train.head()
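# The ranking-plus-binning above can be collapsed into one pd.qcut call
# (a sketch for reference, not used below):
# df_train['rank'] = pd.qcut(np.arange(len(df_train)), 5, labels=False) + 1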
"""
trick-1 篩選特徵重要性方法
"""
def LGB_test(train_x, train_y, test_x, test_y):
    from multiprocessing import cpu_count
    # max_features is not a LightGBM parameter and num_iterations duplicated
    # n_estimators, so both are dropped
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=2, n_estimators=800, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=None,
        n_jobs=cpu_count() - 1
    )
    # (lightgbm >= 4.0 expects callbacks=[lgb.early_stopping(100)] instead)
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (test_x, test_y)],
            eval_metric='auc', early_stopping_rounds=100)
    print(clf.n_features_)
    return clf, clf.best_score_['valid_1']['auc']
feature_lst = {}
ks_train_lst = []
ks_test_lst = []
for rk in set(df_train['rank']):
    # Hold out one time-ordered fold as test, train on the other four
    ttest = df_train[df_train['rank'] == rk]
    ttrain = df_train[df_train['rank'] != rk]
    train = ttrain[lst]
    train_y = ttrain.bad_ind
    test = ttest[lst]
    test_y = ttest.bad_ind
    start = time.time()
    model, fold_auc = LGB_test(train, train_y, test, test_y)  # avoid shadowing sklearn's auc
    end = time.time()
    feature = pd.DataFrame(
        {'name': model.booster_.feature_name(),
         'importance': model.feature_importances_
         }).sort_values(by=['importance'], ascending=False)
    y_pred_train_lgb = model.predict_proba(train)[:, 1]
    y_pred_test_lgb = model.predict_proba(test)[:, 1]
    train_fpr_lgb, train_tpr_lgb, _ = roc_curve(train_y, y_pred_train_lgb)
    test_fpr_lgb, test_tpr_lgb, _ = roc_curve(test_y, y_pred_test_lgb)
    # KS statistic: maximum gap between the TPR and FPR curves
    train_ks = abs(train_fpr_lgb - train_tpr_lgb).max()
    test_ks = abs(test_fpr_lgb - test_tpr_lgb).max()
    train_auc = metrics.auc(train_fpr_lgb, train_tpr_lgb)
    test_auc = metrics.auc(test_fpr_lgb, test_tpr_lgb)
    ks_train_lst.append(train_ks)
    ks_test_lst.append(test_ks)
    # Keep only features with importance >= 20 for this fold
    feature_lst[str(rk)] = feature[feature.importance >= 20].name
train_ks = np.mean(ks_train_lst)
test_ks = np.mean(ks_test_lst)
ft_lst = {}
for i in range(1, 6):
    ft_lst[str(i)] = feature_lst[str(i)]
# Final feature set: only features that were important in every one of the 5 folds
fn_lst = list(set(ft_lst['1']) & set(ft_lst['2'])
              & set(ft_lst['3']) & set(ft_lst['4']) & set(ft_lst['5']))
print('train_ks: ', train_ks)
print('test_ks: ', test_ks)
print('ft_lst: ', fn_lst)
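# The KS computation above (abs(fpr - tpr).max()) recurs throughout this
# script; a small helper consolidating the pattern (a sketch, not wired into
# the existing code, which is left as-is):
def ks_stat(y_true, y_prob):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return abs(tpr - fpr).max()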
"""
trick-2 可以加載之前的模型繼續訓練 pre train
"""
# Resume training:
# initialize from the model saved at ./model/lgb_model.txt
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='./model/lgb_model.txt',
                valid_sets=lgb_eval)
print('Initialized from the saved model, finished rounds 10-20...')
# Hyper-parameters can be adjusted while training continues,
# e.g. decaying the learning rate per iteration
# (learning_rates was removed in lightgbm 4.0; newer versions use
# callbacks=[lgb.reset_parameter(learning_rate=...)] instead)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)
print('Finished rounds 20-30 with a gradually decaying learning rate...')
# Other hyper-parameters can be adjusted mid-training via callbacks
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finished rounds 30-40 with a stepped bagging fraction...')
"""
trick-3 設置閾值、自動調參數、gridSearch 比較慢且容易過擬合
調參方法 * offks + 0.8(offks - devks)最大化
"""
train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()
feature_lst = ['person_info','finance_info','credit_info','act_info']
x = train[feature_lst]
y = train['bad_ind']
val_x = val[feature_lst]
val_y = val['bad_ind']
train_x,test_x,train_y,test_y = train_test_split(x,y,random_state=0,test_size=0.2)
# Set the parameter we want to tune to `value` and define the search range
min_value = 40
max_value = 60
# Track the best result across the whole sweep
best_omd = -1
best_value = -1
best_ks = []
for value in range(min_value, max_value + 1):
    def lgb_test(train_x, train_y, test_x, test_y):
        clf = lgb.LGBMClassifier(boosting_type='gbdt',
                                 objective='binary',
                                 metric='auc',
                                 learning_rate=0.1,
                                 n_estimators=value,
                                 max_depth=5,
                                 num_leaves=20,
                                 max_bin=45,
                                 min_data_in_leaf=6,
                                 bagging_fraction=0.6,
                                 bagging_freq=0,
                                 feature_fraction=0.8)
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (test_x, test_y)],
                eval_metric='auc')
        return clf, clf.best_score_['valid_1']['auc']
    lgb_model, lgb_auc = lgb_test(train_x, train_y, test_x, test_y)
    y_pred = lgb_model.predict_proba(x)[:, 1]
    fpr_lgb_train, tpr_lgb_train, _ = roc_curve(y, y_pred)
    train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()
    y_pred = lgb_model.predict_proba(val_x)[:, 1]
    fpr_lgb, tpr_lgb, _ = roc_curve(val_y, y_pred)
    val_ks = abs(fpr_lgb - tpr_lgb).max()
    # Objective: reward off-time KS, penalize the dev/off-time gap
    Omd = val_ks + 0.8 * (val_ks - train_ks)
    if Omd > best_omd:
        best_omd = Omd
        best_value = value
        best_ks = [train_ks, val_ks]
print('best_value:', best_value)
print('best_ks:', best_ks)
"""不均衡學習"""
from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
smote = SMOTE(k_neighbors=15, kind='borderline1', m_neighbors=4, n_jobs=1,
out_step='deprecated', random_state=0, ratio=None,
svm_estimator='deprecated')
rex,rey = smote.fit_resample(train_x_osvp,train_y_osvp)
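# A minimal sketch of using the resampled data: refit on (rex, rey) and
# re-check off-time KS (commented out because train_x_osvp / train_y_osvp
# are defined elsewhere):
# clf_bal = lgb.LGBMClassifier(objective='binary', n_estimators=100)
# clf_bal.fit(rex, rey)
# fpr_b, tpr_b, _ = roc_curve(val_y, clf_bal.predict_proba(val_x)[:, 1])
# print('balanced val_ks:', abs(fpr_b - tpr_b).max())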
"""模型融合"""
train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()
feature_lst = ['person_info','finance_info','credit_info','act_info']
x = train[feature_lst]
y = train['bad_ind']
val_x = val[feature_lst]
val_y = val['bad_ind']
lr_model = LogisticRegression(C=0.1,class_weight='balanced',solver='liblinear')
lr_model.fit(x,y)
y_pred = lr_model.predict_proba(x)[:,1]
fpr_lr_train,tpr_lr_train,_ = roc_curve(y,y_pred)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ',train_ks)
y_pred = lr_model.predict_proba(val_x)[:,1]
fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ',val_ks)
from matplotlib import pyplot as plt
plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')
plt.plot(fpr_lr,tpr_lr,label = 'evl LR')
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc = 'best')
plt.show()
df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
df_test = data[data.obs_mth == '2018-11-30'].reset_index().copy()
NUMERIC_COLS = ['person_info','finance_info','credit_info','act_info']
from sklearn.preprocessing import OneHotEncoder
lgb_train = lgb.Dataset(df_train[NUMERIC_COLS], df_train['bad_ind'], free_raw_data=False)
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_leaves': 2,
    'metric': 'auc',
    'max_depth': 1,
    'feature_fraction': 1,
    'bagging_fraction': 1,
}
# Depth-1 trees: each tree is a single split, so its leaf index is a learned
# binary feature; one-hot encoding the leaves feeds these crossings to the LR
model = lgb.train(params, lgb_train, num_boost_round=50)
leaf = model.predict(df_train[NUMERIC_COLS], pred_leaf=True)
# handle_unknown='ignore' lets the same encoder transform test-set leaves later
lgb_enc = OneHotEncoder(handle_unknown='ignore')
lgb_enc.fit(leaf)
data_leaf = np.hstack((lgb_enc.transform(leaf).toarray(),df_train[NUMERIC_COLS]))
train, val, train_y, val_y = train_test_split(data_leaf,df_train['bad_ind'],test_size=0.2, random_state=random.choice(range(10000)))
lgb_lm = LogisticRegression(penalty='l1',C = 0.3,solver='liblinear')
lgb_lm.fit(train, train_y)
y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1]
fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y, y_pred_lgb_lm_train)
y_pred_lgb_lm = lgb_lm.predict_proba(val)[:, 1]
fpr_lgb_lm, tpr_lgb_lm, _ = roc_curve(val_y, y_pred_lgb_lm)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_lgb_lm_train, tpr_lgb_lm_train, label='LGB + LR train')
plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
print('LGB+LR train ks:',abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(),'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train))
print('LGB+LR test ks:',abs(fpr_lgb_lm - tpr_lgb_lm).max(),'LGB+LR AUC:', metrics.auc(fpr_lgb_lm, tpr_lgb_lm))
leaf_test = model.predict(df_test[NUMERIC_COLS], pred_leaf=True)
# Reuse the encoder fitted on the training leaves; refitting on test leaves
# would produce columns that no longer align with the training matrix
data_leaf_test = np.hstack((lgb_enc.transform(leaf_test).toarray(), df_test[NUMERIC_COLS]))
train = data_leaf.copy()
train_y = df_train['bad_ind'].copy()
val = data_leaf_test.copy()
val_y = df_test['bad_ind'].copy()
lgb_lm = LogisticRegression(penalty='l2',C = 0.2,class_weight='balanced',solver='liblinear')
lgb_lm.fit(train, train_y)
y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1]
fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y, y_pred_lgb_lm_train)
y_pred_lgb_lm = lgb_lm.predict_proba(val)[:, 1]
fpr_lgb_lm, tpr_lgb_lm, _ = roc_curve(val_y, y_pred_lgb_lm)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_lgb_lm_train, tpr_lgb_lm_train, label='LGB + LR train')
plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
print('LGB+LR train ks:',abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(),'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train))
print('LGB+LR test ks:',abs(fpr_lgb_lm - tpr_lgb_lm).max(),'LGB+LR AUC:', metrics.auc(fpr_lgb_lm, tpr_lgb_lm))
"""異常檢測"""
from pyod.models.lof import LOF
clf = LOF(n_neighbors=20, algorithm='auto', leaf_size=30, metric='minkowski', p=2,
metric_params=None, contamination=0.1, n_jobs=1)
clf.fit(x)
from pyod.models.iforest import IForest
clf = IForest(behaviour='new', bootstrap=False, contamination=0.1, max_features=1.0,
max_samples='auto', n_estimators=500, n_jobs=-1, random_state=None,verbose=0)
clf.fit(x)
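# After fitting, pyod exposes binary outlier labels via clf.labels_; a minimal
# sketch of dropping suspected outliers before modelling (x and y are the
# training features/labels defined above):
outlier_mask = clf.labels_ == 1   # 1 = outlier under contamination=0.1
x_clean = x[~outlier_mask]
y_clean = y[~outlier_mask]
print('kept %d of %d rows after outlier removal' % (len(x_clean), len(x)))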