本是天池的課堂,不過講得不是太清楚,所以後面的變量控制、清洗數據都是按照自己的想法來做的
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# BUG FIX: '%matplotlib inline' is an IPython magic command and is a
# SyntaxError in a plain .py script; keep it commented unless running
# inside a Jupyter notebook.
# %matplotlib inline

# skiprows=1: the LendingClub export has a one-line banner above the header.
# low_memory=False: read the whole file in one pass so dtypes are inferred
# consistently across chunks.
df = pd.read_csv(r'LoanStats_2016Q3.csv', skiprows=1, low_memory=False)
df.info()
先刪除一些無關數據 ,數據格式都統一
# Drop identifier columns that carry no predictive signal.
# Use the keyword `columns=` instead of the positional axis argument
# (`df.drop('id', 1)`), which is deprecated since pandas 1.0 and removed in 2.0.
df.drop(columns=['id', 'member_id'], inplace=True)
# Drop rows where every value is missing. (The original ran this twice;
# the operation is idempotent, so one pass suffices.)
df.dropna(axis=0, how='all', inplace=True)
# NOTE(review): the original lower-cased emp_title and then immediately
# dropped the column, so the lower-casing was dead code and is removed.
# emp_title (job title) has too many distinct categories to use directly;
# it could be re-added later with a proper encoding.
df.drop(columns=['emp_title'], inplace=True)
將年限信息的格式修整統一
# Normalize employment length: missing -> 0, then strip every non-digit
# character (e.g. "10+ years" -> "10", "< 1 year" -> "1") so only numeric
# text remains. (The original applied the same regex twice; the substitution
# is idempotent, so once is enough.)
df['emp_length'].fillna(value=0, inplace=True)
df['emp_length'].replace(to_replace="[^0-9]+", value='', inplace=True, regex=True)
查看缺失值的比例
# Missing-value report for object-dtype columns. Note: dtype code 'O' means
# "object" (typically strings), NOT float as the original comment claimed.
# missing_pct is computed over every column of df and aligned onto the
# describe() rows by column name.
df.select_dtypes(include=['O']).describe().T.assign(missing_pct=df.apply(lambda x: (len(x)-x.count())/float(len(x))))
刪除缺失值太多的列
# Drop columns that are mostly missing or hard to use directly (free text,
# dates, location codes). One `columns=` call replaces nine separate drops
# that used the deprecated positional-axis form `df.drop(name, 1)`.
df.drop(columns=['desc', 'verification_status_joint', 'zip_code',
                 'addr_state', 'earliest_cr_line', 'revol_util',
                 'title', 'term', 'issue_d'],
        inplace=True)
刪除貸款後的信息
# Remove post-origination information (payments, recoveries, grades, joint
# fields): these are only known after the loan is issued and would leak the
# outcome into the features. Merged the six deprecated positional-axis
# drops into one keyword call.
df.drop(columns=['out_prncp', 'out_prncp_inv', 'total_pymnt',
                 'total_pymnt_inv', 'total_rec_prncp', 'grade', 'sub_grade',
                 'total_rec_int', 'total_rec_late_fee', 'recoveries',
                 'collection_recovery_fee',
                 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
                 'last_credit_pull_d',
                 'policy_code', 'annual_inc_joint', 'dti_joint'],
        inplace=True)
是否還款按是否違約分類
# Encode loan_status as the binary target (1 = good, 0 = default-ish,
# NaN = ambiguous, dropped later).
# BUG FIX: `lo` was never defined in the original (NameError at runtime);
# rebuild it as the status labels ordered by frequency, which is what the
# positional indexing (lo[0] ... lo[6]) assumes. Also renamed the mapping so
# it no longer shadows the builtin `dict`.
# NOTE(review): ranks 0-1 by frequency -> 1, ranks 2 and 4 -> 0, the rest
# -> NaN. Confirm this frequency order matches the intended status names
# (Current / Fully Paid / Charged Off / Late ...) for the 2016Q3 file.
lo = df.loan_status.value_counts().index
status_map = {lo[0]: 1, lo[1]: 1, lo[2]: 0, lo[3]: np.nan,
              lo[4]: 0, lo[5]: np.nan, lo[6]: np.nan}
df.loan_status.replace(status_map, inplace=True)
剩下一些變量修改
# Drop rows whose target is NaN (the ambiguous statuses mapped above).
# BUG FIX: the original also called df.loan_status.dropna(inplace=True),
# which operates on the Series alone and cannot shrink the frame; the
# frame-level dropna below is the correct (and originally duplicated) way.
df.dropna(subset=['loan_status'], inplace=True)
# Cast via float first: int(str(x)) fails on values like "1.0".
df.loan_status = df.loan_status.astype('float').astype('int')
# Interest rate "13.56%" -> 0.1356; values without '%' become NaN.
df.int_rate = df.int_rate.apply(lambda x: float(x[:-1]) / 100 if '%' in x else (np.nan))
刪除相關性。corr太高的數據
# These amount columns are near-duplicates of the loan amount (very high
# pairwise correlation), so keep a single representative.
df = df.drop(['funded_amnt', 'funded_amnt_inv', 'installment'], axis=1)
get_dummies,處理特徵
# One-hot encode the categorical features.
# BUG FIX: the original repeated the pymnt_plan section twice (it appeared
# at both positions); the second df.drop(['pymnt_plan'], 1) would raise
# KeyError because the column had already been dropped. The loop below
# encodes each column exactly once.
# (The standalone .value_counts() calls were display-only no-ops outside a
# notebook and have been removed.)
for cat_col in ['home_ownership', 'verification_status', 'application_type',
                'pymnt_plan', 'initial_list_status']:
    # Dummy columns are named after the category values, matching the
    # original prefix-less pd.get_dummies calls.
    dummies = pd.get_dummies(df[cat_col])
    df = pd.concat([df, dummies], axis=1)
    df.drop(columns=[cat_col], inplace=True)
還沒有做特徵工程,先嘗試丟進模型裏面試試
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Separate the target from the features, then split 70/30 with a fixed seed
# for reproducibility. `columns=` replaces the deprecated positional-axis
# form `df.drop([...], 1)`.
Y = df.loan_status
df.drop(columns=['loan_status'], inplace=True)
X = df.copy()
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)
簡單的調試了一下模型,達到auc=0.68,不算很高,一般般的效果
# Train a LightGBM binary classifier and score it with AUC on the held-out set.
lgb_train = lgb.Dataset(x_train,y_train) # wrapping the data in a Dataset (LightGBM binary format) makes loading faster
lgb_eval = lgb.Dataset(x_test,y_test,reference=lgb_train)
params = {
'task': 'train',
'boosting_type': 'gbdt', # boosting type
'objective': 'binary', # objective function
'metric': {'l2', 'auc'}, # evaluation metrics reported on the validation set
'num_leaves': 30, # number of leaves per tree*
'learning_rate': 0.1, # learning rate
'feature_fraction': 1, # fraction of features sampled per tree (LightGBM's default is 1.0, not 0 as the original comment claimed)
'bagging_fraction': 1, # fraction of rows sampled for each tree
'bagging_freq': 5, # k means bagging is performed every k iterations
'verbose': 1, # <0 fatal only, =0 errors (warnings), =1 info, >1 debug
'min_data_in_leaf':30 # guards against overfitting; use hundreds/thousands for large datasets*
}
params['is_unbalance']='true' # let LightGBM reweight the minority class (defaulted loans are rare)
#bst=lgb.cv(params,lgb_train,nfold=3,early_stopping_rounds=5)
est=lgb.train(params,lgb_train,num_boost_round=30,valid_sets=lgb_eval)#valid_sets=lgb_eval prints the metrics (incl. AUC) at every iteration
y_pre=est.predict(x_test)
roc_auc_score(y_test,y_pre)
[1] valid_0's auc: 0.671759 valid_0's l2: 0.0389263 [2] valid_0's auc: 0.672499 valid_0's l2: 0.0479194 [3] valid_0's auc: 0.678703 valid_0's l2: 0.0570938 [4] valid_0's auc: 0.679827 valid_0's l2: 0.0665111 [5] valid_0's auc: 0.681003 valid_0's l2: 0.0758469 [6] valid_0's auc: 0.683287 valid_0's l2: 0.0848328 [7] valid_0's auc: 0.683924 valid_0's l2: 0.0933776 [8] valid_0's auc: 0.682981 valid_0's l2: 0.101624 [9] valid_0's auc: 0.684177 valid_0's l2: 0.108998 [10] valid_0's auc: 0.68466 valid_0's l2: 0.116012 [11] valid_0's auc: 0.685856 valid_0's l2: 0.122292 [12] valid_0's auc: 0.687436 valid_0's l2: 0.128198 [13] valid_0's auc: 0.688577 valid_0's l2: 0.133389 [14] valid_0's auc: 0.689055 valid_0's l2: 0.138128 [15] valid_0's auc: 0.688259 valid_0's l2: 0.142351 [16] valid_0's auc: 0.68847 valid_0's l2: 0.14615 [17] valid_0's auc: 0.689827 valid_0's l2: 0.149469 [18] valid_0's auc: 0.690383 valid_0's l2: 0.152582 [19] valid_0's auc: 0.690423 valid_0's l2: 0.155276 [20] valid_0's auc: 0.689452 valid_0's l2: 0.157742 [21] valid_0's auc: 0.689922 valid_0's l2: 0.159834 [22] valid_0's auc: 0.689969 valid_0's l2: 0.161778 [23] valid_0's auc: 0.690007 valid_0's l2: 0.163318 [24] valid_0's auc: 0.689451 valid_0's l2: 0.164765 [25] valid_0's auc: 0.689326 valid_0's l2: 0.165952 [26] valid_0's auc: 0.690091 valid_0's l2: 0.166698 [27] valid_0's auc: 0.689723 valid_0's l2: 0.167523 [28] valid_0's auc: 0.689428 valid_0's l2: 0.168271 [29] valid_0's auc: 0.687731 valid_0's l2: 0.168675 [30] valid_0's auc: 0.686303 valid_0's l2: 0.169073
0.6863027484698025
第二個模型,使用邏輯迴歸,可能哪裏出錯了,導致auc=0.5,暫時將案例放上了,之後有時間再仔細調整
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler    # min-max normalization: (x - min) / (max - min)
from sklearn.preprocessing import StandardScaler  # standardization: (x - mean) / std

# BUG FIXES vs. the original (which reported AUC = 0.5):
#  * StandardScaler(x_train) passed the data to the constructor instead of
#    calling fit_transform, so the "scaled" variables were unusable and unused.
#  * lr.fit(b, y_train) referenced an undefined name `b` (NameError).
#  * roc_auc_score was fed hard 0/1 labels from lr.predict; AUC needs ranked
#    scores, so use predict_proba of the positive class.
#  * dual=True is only supported by the liblinear solver and is recommended
#    only when n_features > n_samples, which is not the case here; removed.
x_train = x_train.fillna(0)
x_test = x_test.fillna(0)
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)
x_test_std = scaler.transform(x_test)  # reuse training statistics; never refit on test data
lr = LogisticRegression(C=190, random_state=123)
lr.fit(x_train_std, y_train)
y_pre1 = lr.predict_proba(x_test_std)[:, 1]
roc_auc_score(y_test, y_pre1)
0.5