金融貸款預測

本數據來源於https://tianchi.aliyun.com/course/courseConsole?spm=5176.12282070.0.0.764c290a2RIpBY&courseId=192&chapterIndex=10&sectionIndex=1

本是天池的課堂,不過講的不是太清楚,所以後面的變量控制,清洗數據都是按照自己的想法

# Load the LendingClub 2016Q3 loan data for default prediction.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Jupyter/IPython magic: render plots inline (not valid outside a notebook).
%matplotlib inline
# skiprows=1: the first line of the raw CSV is a banner, not the header.
# low_memory=False: read the whole file at once so dtypes are inferred consistently.
df=pd.read_csv(r'LoanStats_2016Q3.csv',skiprows=1,low_memory=False)
df.info()

先刪除一些無關數據 ,數據格式都統一

# Drop identifier columns with no predictive signal, and rows that are
# entirely empty (the CSV tail contains blank/summary lines).
# pandas >= 2.0 removed the positional `axis` argument, so use `columns=`.
df.drop(columns=['id', 'member_id'], inplace=True)
df.dropna(axis=0, how='all', inplace=True)  # drop rows where every field is NaN
# emp_title (job title) has far too many distinct categories to one-hot
# encode; drop it for now (could be bucketed and re-added later).
# NOTE: the original lower-cased emp_title first, then dropped the column --
# dead work removed here; the original also ran the all-NaN dropna twice.
df.drop(columns=['emp_title'], inplace=True)

 統一年份信息(emp_length)的格式

# Normalize emp_length ("10+ years", "< 1 year", ...) to bare digit strings.
df['emp_length'].fillna(value=0, inplace=True)
# Strip every non-digit character, leaving e.g. "10", "1", "".
# (The original ran this identical replace twice; the second pass was a no-op.)
df['emp_length'].replace(to_replace="[^0-9]+", value='', inplace=True, regex=True)

查看缺失值的比例

# Inspect the missing-value ratio of each column alongside its summary stats.
# NOTE: dtype 'O' means *object* (typically strings), not float.
df.select_dtypes(include=['O']).describe().T.assign(missing_pct=df.apply(lambda x: (len(x)-x.count())/float(len(x))))

刪除缺失值太多的列

# Drop columns that are mostly missing or not usable in raw form
# (free text, high-cardinality geography, unparsed date/percent strings).
# Collapsed from nine separate drop calls; pandas >= 2.0 removed the
# positional `axis` argument, so use `columns=`.
df.drop(columns=['desc', 'verification_status_joint', 'zip_code',
                 'addr_state', 'earliest_cr_line', 'revol_util',
                 'title', 'term', 'issue_d'],
        inplace=True)

刪除貸款後的信息

# Drop post-origination fields: they describe what happened *after* the loan
# was issued and would leak outcome information at prediction time.
df.drop(columns=['out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
                 'total_rec_prncp', 'grade', 'sub_grade',
                 'total_rec_int', 'total_rec_late_fee', 'recoveries',
                 'collection_recovery_fee',
                 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
                 'last_credit_pull_d',
                 'policy_code', 'annual_inc_joint', 'dti_joint'],
        inplace=True)

是否還款按是否違約分類

# Map loan_status to a binary default label (1 = repaying/paid, 0 = bad,
# NaN = indeterminate, dropped later).
# BUG FIX: `lo` was never defined in the original (NameError); derive it as
# the distinct status values ordered by frequency.
# NOTE(review): the 1/0/NaN positions assume value_counts() order is
# (Current, Fully Paid, Charged Off, ...) for this dataset -- confirm
# against the actual data; mapping explicit status strings would be safer.
lo = df.loan_status.value_counts().index
status_map = {lo[0]: 1, lo[1]: 1, lo[2]: 0, lo[3]: np.nan,
              lo[4]: 0, lo[5]: np.nan, lo[6]: np.nan}  # renamed: don't shadow builtin `dict`
df.loan_status.replace(status_map, inplace=True)

 剩下一些變量修改

# Keep only labelled rows and coerce loan_status / int_rate to numeric.
# (The original also called Series.dropna(..., inplace=True) on the column
# first, which does not remove rows from the frame -- dropped as ineffective.)
df.dropna(subset=['loan_status'], inplace=True)  # drop rows with an unmapped status
# int(str(x)) can choke on "1.0"-style values; go through float first.
df.loan_status = df.loan_status.astype('float').astype('int')

def _parse_int_rate(x):
    """Convert '13.5%'-style strings to a 0-1 fraction; NaN otherwise.

    BUG FIX: the original lambda evaluated `'%' in x` unconditionally, which
    raises TypeError when x is a float NaN; guard on the type first.
    """
    if isinstance(x, str) and '%' in x:
        return float(x[:-1]) / 100
    return np.nan

df.int_rate = df.int_rate.apply(_parse_int_rate)

刪除相關性。corr太高的數據

df.drop(['funded_amnt','funded_amnt_inv','installment'],axis=1,inplace=True)

get_dummies,處理特徵

# One-hot encode the low-cardinality categorical columns, appending the
# dummy columns and dropping each original string column afterwards.
for cat_col in ('home_ownership', 'verification_status', 'application_type'):
    df[cat_col].value_counts()  # notebook-style inspection of category frequencies
    cat_dummies = pd.get_dummies(df[cat_col])
    df = pd.concat([df, cat_dummies], axis=1)
    df.drop([cat_col], 1, inplace=True)

# One-hot encode pymnt_plan and initial_list_status.
# BUG FIX: the original encoded pymnt_plan twice (copy-pasted cell); the
# second pass would raise KeyError because the column was already dropped.
df.pymnt_plan.value_counts()
pyplan_dumm = pd.get_dummies(df.pymnt_plan)
df = pd.concat([df, pyplan_dumm], axis=1)
df.drop(columns=['pymnt_plan'], inplace=True)

df.initial_list_status.value_counts()
inls_dumm = pd.get_dummies(df.initial_list_status)
df = pd.concat([df, inls_dumm], axis=1)
df.drop(columns=['initial_list_status'], inplace=True)

還沒有做特徵工程,先嚐試丟進模型裏面試試

# Separate the label from the features and hold out 30% for validation.
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

Y = df.loan_status
df.drop(['loan_status'], 1, inplace=True)
X = df.copy()
# Fixed random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=123)

 簡單的調試了一下模型,達到auc=0.68,不算很高,一般般的效果

# Train a LightGBM binary classifier for 30 rounds and score AUC on the
# held-out split (reported result in the log below: ~0.686).
lgb_train = lgb.Dataset(x_train,y_train) # LightGBM's binary dataset format loads faster
lgb_eval = lgb.Dataset(x_test,y_test,reference=lgb_train)
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # gradient boosted decision trees
    'objective': 'binary', # binary classification objective
    'metric': {'l2', 'auc'},  # evaluation metrics reported each round
    'num_leaves': 30,   # max leaves per tree
    'learning_rate': 0.1,  # shrinkage rate
    'feature_fraction': 1, # feature subsampling ratio (LightGBM default is 1.0, i.e. use all)
    'bagging_fraction': 1, # row subsampling ratio
    'bagging_freq': 5,  # perform bagging every k iterations
    'verbose': 1, # <0 fatal only, =0 errors/warnings, =1 info, >1 debug
    'min_data_in_leaf':30 # guards against overfitting; use hundreds/thousands on large data
}
params['is_unbalance']='true' # reweight classes: defaults are a small minority here
#bst=lgb.cv(params,lgb_train,nfold=3,early_stopping_rounds=5)
est=lgb.train(params,lgb_train,num_boost_round=30,valid_sets=lgb_eval)# valid_sets makes each iteration print eval metrics
y_pre=est.predict(x_test)
roc_auc_score(y_test,y_pre)
[1]	valid_0's auc: 0.671759	valid_0's l2: 0.0389263
[2]	valid_0's auc: 0.672499	valid_0's l2: 0.0479194
[3]	valid_0's auc: 0.678703	valid_0's l2: 0.0570938
[4]	valid_0's auc: 0.679827	valid_0's l2: 0.0665111
[5]	valid_0's auc: 0.681003	valid_0's l2: 0.0758469
[6]	valid_0's auc: 0.683287	valid_0's l2: 0.0848328
[7]	valid_0's auc: 0.683924	valid_0's l2: 0.0933776
[8]	valid_0's auc: 0.682981	valid_0's l2: 0.101624
[9]	valid_0's auc: 0.684177	valid_0's l2: 0.108998
[10]	valid_0's auc: 0.68466	valid_0's l2: 0.116012
[11]	valid_0's auc: 0.685856	valid_0's l2: 0.122292
[12]	valid_0's auc: 0.687436	valid_0's l2: 0.128198
[13]	valid_0's auc: 0.688577	valid_0's l2: 0.133389
[14]	valid_0's auc: 0.689055	valid_0's l2: 0.138128
[15]	valid_0's auc: 0.688259	valid_0's l2: 0.142351
[16]	valid_0's auc: 0.68847	valid_0's l2: 0.14615
[17]	valid_0's auc: 0.689827	valid_0's l2: 0.149469
[18]	valid_0's auc: 0.690383	valid_0's l2: 0.152582
[19]	valid_0's auc: 0.690423	valid_0's l2: 0.155276
[20]	valid_0's auc: 0.689452	valid_0's l2: 0.157742
[21]	valid_0's auc: 0.689922	valid_0's l2: 0.159834
[22]	valid_0's auc: 0.689969	valid_0's l2: 0.161778
[23]	valid_0's auc: 0.690007	valid_0's l2: 0.163318
[24]	valid_0's auc: 0.689451	valid_0's l2: 0.164765
[25]	valid_0's auc: 0.689326	valid_0's l2: 0.165952
[26]	valid_0's auc: 0.690091	valid_0's l2: 0.166698
[27]	valid_0's auc: 0.689723	valid_0's l2: 0.167523
[28]	valid_0's auc: 0.689428	valid_0's l2: 0.168271
[29]	valid_0's auc: 0.687731	valid_0's l2: 0.168675
[30]	valid_0's auc: 0.686303	valid_0's l2: 0.169073
0.6863027484698025

第二個模型,使用邏輯迴歸。原代碼有誤(StandardScaler 直接把數據傳給了構造函數而沒有 fit_transform,且 fit 時引用了未定義的變量 b),導致 auc=0.5,暫時將案例放上了,之後有時間再仔細調整

# Second model: logistic regression baseline.
# BUG FIXES vs. the original (which reported AUC = 0.5):
#  * StandardScaler(x_train) passed the data to the *constructor* instead of
#    calling fit_transform(), so nothing was ever scaled;
#  * lr.fit(b, y_train) referenced an undefined name `b` (NameError);
#  * roc_auc_score was fed hard predict() labels instead of scores;
#  * dual=True is only valid for the liblinear solver -- removed.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler   # min-max normalisation (unused here)
from sklearn.preprocessing import StandardScaler # z-score standardisation

# Impute missing values before scaling so the scaler sees no NaNs.
x_train = x_train.fillna(0)
x_test = x_test.fillna(0)
scaler = StandardScaler()
x_train3 = scaler.fit_transform(x_train)  # fit statistics on training data only
x_test3 = scaler.transform(x_test)        # reuse training mean/std to avoid leakage

lr = LogisticRegression(C=190, random_state=123)
lr.fit(x_train3, y_train)
# AUC ranks by score, so use the positive-class probability, not the label.
y_pre1 = lr.predict_proba(x_test3)[:, 1]

roc_auc_score(y_test, y_pre1)

0.5

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章