Machine Learning: A First Attempt at a Classification Prediction Project

This project is the complete code for Problem 2 of the 2019 China Merchants Bank competition: judging borrowing intent from income and expenditure records.
The general workflow for this kind of problem is:
data analysis
data cleaning
feature extraction
model selection
modeling
model training and parameter tuning
prediction

# Import the required libraries
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn import metrics    # sklearn metrics
import matplotlib.pylab as plt
from datetime import datetime
# Read the csv files
train = pd.read_csv(r'data/FT_Camp_2/train.csv')
cust_bas_inf = pd.read_csv(r'data/FT_Camp_2/cust_bas_inf.csv')
g2 = pd.read_csv(r'data/FT_Camp_2/g2.csv')
pred_users = pd.read_csv(r'data/FT_Camp_2/pred_users.csv')
sz_detail = pd.read_csv(r'data/FT_Camp_2/sz_detail.csv')
trx_cod = pd.read_csv(r'data/FT_Camp_2/trx_cod.csv')
# Drop duplicate rows
cust_bas_inf = cust_bas_inf.drop_duplicates()
# Fill missing values ('\N' marks a missing entry in the raw export)
cust_bas_inf.loc[cust_bas_inf['gender'] == '\\N', 'gender'] = 'M'
age = cust_bas_inf['age'].copy()
aum227 = cust_bas_inf['aum227'].copy()
aum306 = cust_bas_inf['aum306'].copy()
aum227[aum227 == '\\N'] = '0'
aum306[aum306 == '\\N'] = '0'
age[age == '\\N'] = '72'
# Convert to numeric types
float_aum227 = aum227.astype('float64')
float_aum306 = aum306.astype('float64')
int_age = age.astype('int')
# Merge the converted columns back into the source dataset (pandas suffixes the new columns with _y)
cust_bas_inf = pd.merge(cust_bas_inf,float_aum227,right_index=True,left_index=True)
cust_bas_inf = pd.merge(cust_bas_inf,float_aum306,right_index=True,left_index=True)
cust_bas_inf = pd.merge(cust_bas_inf,int_age,right_index=True,left_index=True)

# Keep only the usable columns
cust_bas_inf=cust_bas_inf[['id','gender','aum227_y','aum306_y','age_y']]
# One-hot encode gender
cust_bas_inf=pd.get_dummies(cust_bas_inf,columns=['gender'])
# Rename columns
cust_bas_inf.rename(columns={'age_y': 'age', 'gender_F': 'F', 'gender_M': 'M'}, inplace=True)
# Min-max normalize age
agemin = np.min(cust_bas_inf['age'])
agemax = np.max(cust_bas_inf['age'])
cust_bas_inf['age'] = (cust_bas_inf['age']-agemin)/(agemax-agemin)
# Z-score standardization (keeps the sign)
cust_bas_inf['aum227_y'] = (cust_bas_inf['aum227_y'] - cust_bas_inf['aum227_y'].mean())/cust_bas_inf['aum227_y'].std()

cust_bas_inf['aum306_y'] = (cust_bas_inf['aum306_y'] - cust_bas_inf['aum306_y'].mean())/cust_bas_inf['aum306_y'].std()
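The same scaling can also be done with sklearn's StandardScaler, which handles several columns at once. A minimal alternative sketch, shown on a separate variable (aum_scaled is illustrative and not used by the rest of the pipeline); note StandardScaler divides by the population standard deviation, so values differ very slightly from pandas' .std():
from sklearn.preprocessing import StandardScaler
# Illustrative only: standardize both aum columns in one call
aum_scaled = StandardScaler().fit_transform(cust_bas_inf[['aum227_y', 'aum306_y']])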

# Split into t (training-period) and p (prediction-period) feature sets and rename
cust_t = cust_bas_inf[['id','F','M','aum227_y','age']].copy()
cust_p = cust_bas_inf[['id','F','M','aum306_y','age']].copy()
cust_t.rename(columns={'aum227_y': 'aum'}, inplace=True)
cust_p.rename(columns={'aum306_y': 'aum'}, inplace=True)
cust_p.head()


sz_detail.head()


# Split the transactions into two date windows: one up to 2019-02-28 for the training features, one up to 2019-03-07 for the prediction features
sz_detail['prt_dt'] = sz_detail['prt_dt'].apply(lambda x:datetime.strptime(x, '%Y-%m-%d'))
data_before = sz_detail[(sz_detail['prt_dt']>= datetime.strptime('2019-01-01','%Y-%m-%d')) &
                (sz_detail['prt_dt']<datetime.strptime('2019-02-28','%Y-%m-%d'))]
data_after = sz_detail[(sz_detail['prt_dt']>= datetime.strptime('2019-01-01','%Y-%m-%d')) &
                (sz_detail['prt_dt']<datetime.strptime('2019-03-07','%Y-%m-%d'))]
data_before.head()


# Sum rmb_amt per id and standardize the totals
data_before = data_before.groupby('id')['rmb_amt'].sum()
data_before = data_before.to_frame().reset_index()
data_before.columns = ['id', 'rmb_sum']
data_before['rmb_sum'] = (data_before['rmb_sum'] - data_before['rmb_sum'].mean())/data_before['rmb_sum'].std()

data_after = data_after.groupby('id')['rmb_amt'].sum()
data_after = data_after.to_frame().reset_index()
data_after.columns = ['id', 'rmb_sum']
data_after['rmb_sum'] = (data_after['rmb_sum'] - data_after['rmb_sum'].mean())/data_after['rmb_sum'].std()

data_after.head()


# Merge with trx_cod on the sz_id key
colNameDict = {'g2_cod':'g2_id'}
sz_detail.rename(columns = colNameDict,inplace=True)
sz_detail = pd.merge(sz_detail,trx_cod,on='sz_id')
sz_detail = sz_detail.drop(['sz_id','rmb_amt','g2_id','prt_dt'],axis=1)
sz_detail = sz_detail.drop_duplicates()
sz_detail.head()


# One-hot encode the cat1 and cat2 categories
sz_detail=pd.get_dummies(sz_detail,columns=['cat1','cat2'])
sz_detail.info()
# Merge the processed data sets
tdata = pd.merge(cust_t,sz_detail,on='id')
pdata = pd.merge(cust_p,sz_detail,on='id')
data_train = pd.merge(tdata,train,on='id')
data_pre   = pd.merge(pdata,pred_users,on='id')
data_train = pd.merge(data_train,data_before,on='id')
data_pre   = pd.merge(data_pre,data_after,on='id')
tdata.info()
data_pre.head()


# Check the number of positive and negative samples
train_postive = data_train[data_train['click_w228'] == 1]
train_negative = data_train[data_train['click_w228'] == 0]
len(train_postive),len(train_negative)
# If the classes are badly imbalanced, rebalance as follows:
# draw a random sample of negatives (half the negative count, with replacement)
# and stack the positives twice to upweight them
sampler = np.random.randint(0,len(train_negative),size = int(len(train_negative)/2))
part_negative = train_negative.take(sampler)
data_train = pd.concat([train_postive,part_negative,train_postive],axis=0,ignore_index=True)
len(part_negative)
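An alternative to resampling is to keep every row and let xgboost reweight the minority class through its scale_pos_weight parameter (it appears in the params dict below). A minimal sketch of the usual rule of thumb; pos_weight is an illustrative name, not part of the original pipeline:
# Rule of thumb: scale_pos_weight ≈ (#negative samples) / (#positive samples)
pos_weight = len(train_negative) / max(len(train_postive), 1)
print('suggested scale_pos_weight:', pos_weight)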
# Start training: build the label vector and the feature matrix
train_label = data_train['click_w228']
train = data_train.drop(['id','click_w228'],axis =1)
train.info()
dtrain = xgb.DMatrix(train, train_label)
del train, train_label
# For tuning these parameters, refer to the various xgboost hyperparameter-tuning blog posts (a small GridSearchCV sketch follows the training call below)
params = {        
            'max_depth':3,
            'min_child_weight':3,
            'eta':0.3,
            'subsample':1,
            'colsample_bytree':1,
            'scale_pos_weight':1,
            'max_delta_step': 0,
            'eval_metric':'auc',
            'lambda' :0,
            'alpha': 0,
            'gamma': 0,
            'seed': 1,
            'objective':'binary:logistic',
}
#t0 = time.time()
model1 = xgb.train(params,dtrain,num_boost_round=17)
#print(time.time() - t0)
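As a rough illustration of that tuning, the GridSearchCV and XGBClassifier imported at the top (but otherwise unused) can search a small grid on the training frame; the grid values below are illustrative assumptions, not the settings behind the final model:
# Hypothetical tuning sketch: cross-validated search over tree depth and child weight
tune_X = data_train.drop(['id', 'click_w228'], axis=1)
tune_y = data_train['click_w228']
param_grid = {'max_depth': [3, 4, 5], 'min_child_weight': [1, 3, 5]}
grid = GridSearchCV(
    XGBClassifier(learning_rate=0.3, n_estimators=17, objective='binary:logistic', random_state=1),
    param_grid, scoring='roc_auc', cv=3)
grid.fit(tune_X, tune_y)
print(grid.best_params_, grid.best_score_)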
%matplotlib inline
import seaborn as sns
sns.set(font_scale = 0.5)
xgb.plot_importance(model1)
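Since eval_metric is set to 'auc', it is also worth estimating that score locally before predicting on the target users. A minimal hold-out sketch, assuming data_train is still in memory (and keeping in mind that the duplicated positive rows above can leak across the split, so the estimate will be optimistic):
# Hold out 20% of the training rows and score them with the same parameters
X_all = data_train.drop(['id', 'click_w228'], axis=1)
y_all = data_train['click_w228']
X_tr, X_va, y_tr, y_va = train_test_split(X_all, y_all, test_size=0.2, random_state=1, stratify=y_all)
model_val = xgb.train(params, xgb.DMatrix(X_tr, y_tr), num_boost_round=17)
print('hold-out AUC:', metrics.roc_auc_score(y_va, model_val.predict(xgb.DMatrix(X_va))))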


data_pre = data_pre.drop(['id'],axis =1)
x_feat = xgb.DMatrix(data_pre)
p1 = model1.predict(x_feat)
print(p1)
# Assemble the results and save them to csv
name_attribute = ['score']
writerCSV=pd.DataFrame(columns=name_attribute,data=p1)
writer_CSV = pd.merge(pred_users,writerCSV,right_index=True,left_index=True)
writer_CSV.to_csv('./result512v3.csv',encoding='utf-8')