20200317_決策樹預測貸款申請

使用決策樹,預測貸款申請

import pandas as pd
#  忽略彈出的warnings
import warnings
warnings.filterwarnings('ignore')  
text=pd.read_excel('data/LoanStats_securev1_2019Q4.xlsx')
text.head()
id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title ... num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit
0 164027473 20000 20000 20000 36 months 0.1240 668.12 B B4 NaN ... 0 2 100.0 50.0 1 0 60800 42566 5200 40000.0
1 163984413 16500 16500 16500 60 months 0.1033 353.27 B B1 NaN ... 0 0 100.0 0.0 0 0 223390 40913 40500 39890.0
2 164193225 7500 7500 7500 36 months 0.1240 250.55 B B4 Rn ... 0 7 54.5 16.7 0 0 138468 102122 47700 90768.0
3 162948736 19000 19000 18975 36 months 0.0646 581.99 A A1 Tech Ops Analyst ... 0 0 100.0 40.0 0 0 184034 28461 38400 35000.0
4 164161686 10000 10000 10000 36 months 0.2055 374.45 D D2 Planner ... 0 2 100.0 16.7 0 0 639373 161516 24600 172818.0

5 rows × 114 columns

目標變量

text['loan_status'].value_counts()
Current               122625
Fully Paid              3539
In Grace Period         1079
Late (31-120 days)       509
Late (16-30 days)        304
Charged Off               80
n                          1
Name: loan_status, dtype: int64
# Label 0 marks loans treated as fine (still current or fully repaid); 1 marks everything else.
def function(x):
    """Convert a raw ``loan_status`` value into a binary target.

    Returns 0 when ``x`` contains 'Current' or 'Fully Paid', and 1 for
    any other value.
    """
    healthy_markers = ('Current', 'Fully Paid')
    return 0 if any(marker in x for marker in healthy_markers) else 1
# Binarize the target column. Applying `function` to the column itself gives the
# same values as the row-wise DataFrame.apply(axis=1) form, without building a
# full row Series for every record just to read one field.
text['loan_status'] = text['loan_status'].apply(function)
text['loan_status'].value_counts()
0    126164
1      1973
Name: loan_status, dtype: int64
pos_trainDf = text[text['loan_status'] == 1]
neg_trainDf = text[text['loan_status'] == 0].sample(n=4000, random_state=2018)
text = pd.concat([pos_trainDf, neg_trainDf], axis=0).sample(frac=1.0,random_state=2018)
text.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5973 entries, 18821 to 92872
Columns: 114 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(36), int64(50), object(27)
memory usage: 5.2+ MB

缺失值查看

check_null = text.isnull().sum(axis=0).sort_values(ascending=False)/float(len(text)) #查看缺失值比例
print(check_null[check_null >0.2]) # 查看缺失比例大於20%的屬性。
desc                              0.999833
mths_since_last_record            0.899046
verification_status_joint         0.880629
annual_inc_joint                  0.864055
dti_joint                         0.864055
mths_since_recent_bc_dlq          0.794408
mths_since_last_major_derog       0.769965
mths_since_recent_revol_delinq    0.703164
mths_since_last_delinq            0.548468
dtype: float64
thresh_count = len(text)*0.4 # minimum number of non-null values a column needs in order to be kept (40% of the rows)
data = text.dropna(thresh=thresh_count, axis=1 ) # drop columns with fewer than thresh_count non-null values, i.e. columns that are more than 60% missing
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5973 entries, 18821 to 92872
Columns: 106 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(30), int64(50), object(25)
memory usage: 4.9+ MB

刪除無意義的列

sub_grade:與Grade的信息重複

emp_title :缺失值較多,同時不能反映借款人收入或資產的真實情況

zip_code:地址郵編,郵編顯示不全,沒有意義

addr_state:申請地址所屬州,不能反映借款人的償債能力

last_credit_pull_d :LendingClub平臺最近一次調取借款人信用報告的月份,沒有意義

policy_code : 變量信息全爲1

pymnt_plan : 取值基本全爲n,幾乎不含區分信息

title: title與purpose的信息重複,同時title的分類信息更加離散

next_pymnt_d : 下一個付款時間,沒有意義

policy_code : 沒有意義

collection_recovery_fee: 全爲0,沒有意義

earliest_cr_line : 記錄的是借款人發生第一筆借款的時間

issue_d : 貸款發行時間,這裏提前向模型泄露了信息

last_pymnt_d、collection_recovery_fee、last_pymnt_amnt: 預測貸款違約模型是貸款前的風險控制手段,這些貸後信息都會影響我們訓練模型的效果,在此將這些信息刪除

# Columns removed before modelling: redundant, uninformative, or only known
# after the loan has been issued. Each name appears once ('title' was
# previously listed twice).
drop_list = [
    'sub_grade', 'emp_title', 'title', 'zip_code', 'addr_state',
    'mths_since_last_delinq', 'initial_list_status', 'issue_d',
    'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
    'policy_code', 'collection_recovery_fee', 'earliest_cr_line',
]
# Rebind rather than mutate in place, so the frame produced by dropna is left untouched.
data = data.drop(columns=drop_list)
data.head()
id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade emp_length home_ownership ... num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit
18821 163425898 4500 4500 4500 36 months 0.1612 158.48 C NaN RENT ... 0 2 100.0 28.6 0 0 44700 10872 32800 0.0
61234 161908366 20000 20000 20000 60 months 0.2305 564.39 D NaN OWN ... 0 0 100.0 33.3 0 0 54349 19572 10400 22349.0
119781 159901427 10000 10000 10000 60 months 0.1862 257.32 D 6 years OWN ... 0 3 100.0 0.0 0 0 69077 48184 9600 49477.0
49201 162292591 21000 21000 21000 60 months 0.1430 491.91 C < 1 year RENT ... 0 0 100.0 0.0 0 0 109894 66662 33800 67194.0
53727 162154208 40000 40000 40000 60 months 0.0819 814.70 A 10+ years RENT ... 0 0 100.0 50.0 0 0 207370 160985 98000 61725.0

5 rows × 91 columns

分類變量

objectColumns = data.select_dtypes(include=["object"]).columns
data[objectColumns].isnull().sum().sort_values(ascending=False)
emp_length             572
application_type         1
url                      1
total_acc                0
delinq_2yrs              0
purpose                  0
pymnt_plan               0
verification_status      0
annual_inc               0
home_ownership           0
grade                    0
term                     0
dtype: int64
# data['int_rate'] = data['int_rate'].str.rstrip('%').astype('float')
# data['revol_util'] = data['revol_util'].str.rstrip('%').astype('float')
# data['annual_inc'] = data['annual_inc'].str.replace(",","").astype('float')
import numpy as np
objectColumns = data.select_dtypes(include=["object"]).columns # 篩選數據類型爲object的數據
data[objectColumns] = data[objectColumns].fillna("Unknown") #以分類“Unknown”填充缺失值
import missingno as msno
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False
%matplotlib inline
msno.bar(data[objectColumns]) #可視化
<matplotlib.axes._subplots.AxesSubplot at 0x2cacc08aa20>

在這裏插入圖片描述

# Per-column value mappings that turn the two ordered categorical columns
# (emp_length, grade) into integers.
# NOTE(review): missing values in object columns were filled with "Unknown"
# earlier in this notebook, and "Unknown" has no entry under "emp_length" —
# confirm whether emp_length is expected to end up fully numeric.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    },
    "grade":{
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 6,
        "G": 7
    }
}
data = data.replace(mapping_dict) # apply the per-column value mappings

數值類型缺失值

data.select_dtypes(include=[np.number]).isnull().sum().sort_values(ascending=False)
il_util                  883
mths_since_recent_inq    655
mo_sin_old_il_acct       203
mths_since_rcnt_il       203
bc_util                  109
                        ... 
total_cu_tl                0
inq_fi                     0
total_rev_hi_lim           0
total_bc_limit             0
id                         0
Length: 80, dtype: int64
numColumns = data.select_dtypes(include=[np.number]).columns
msno.matrix(data[numColumns]) #缺失值可視化
<matplotlib.axes._subplots.AxesSubplot at 0x2caecfe1160>

在這裏插入圖片描述

data.select_dtypes(include=[np.number])
id loan_amnt funded_amnt funded_amnt_inv int_rate installment grade loan_status dti fico_range_low ... num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit
18821 163425898 4500 4500 4500 0.1612 158.48 3 1 16.13 705 ... 0 2 100.0 28.6 0 0 44700 10872 32800 0.0
61234 161908366 20000 20000 20000 0.2305 564.39 4 0 34.14 735 ... 0 0 100.0 33.3 0 0 54349 19572 10400 22349.0
119781 159901427 10000 10000 10000 0.1862 257.32 4 0 27.84 680 ... 0 3 100.0 0.0 0 0 69077 48184 9600 49477.0
49201 162292591 21000 21000 21000 0.1430 491.91 3 1 21.82 740 ... 0 0 100.0 0.0 0 0 109894 66662 33800 67194.0
53727 162154208 40000 40000 40000 0.0819 814.70 1 0 27.52 700 ... 0 0 100.0 50.0 0 0 207370 160985 98000 61725.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
86547 160719957 30000 30000 30000 0.0819 611.03 1 0 5.68 740 ... 0 2 100.0 40.0 0 0 361548 46148 94500 0.0
69734 161401437 16000 16000 16000 0.1430 549.18 3 1 13.73 660 ... 0 0 90.9 66.7 0 0 21300 15022 7800 6000.0
30947 162968064 1600 1600 1600 0.1102 52.40 2 0 17.32 715 ... 0 1 100.0 50.0 0 0 63659 41808 27200 30259.0
29039 163064608 10000 10000 10000 0.1240 334.06 2 0 22.91 680 ... 0 2 66.7 0.0 0 0 230024 36479 2900 60846.0
92872 160838177 23000 23000 23000 0.1774 580.81 3 1 0.00 800 ... 0 0 100.0 0.0 0 0 85255 0 600 0.0

5973 rows × 80 columns

# Total number of missing cells remaining before imputation.
data.isnull().sum().sum()
# Fill missing numeric values with the column mean. numeric_only=True is needed
# because `data` still contains object columns, and pandas >= 2.0 raises a
# TypeError instead of silently skipping them.
mean_cols = data.mean(numeric_only=True)
data = data.fillna(mean_cols)

目標變量

y=data['loan_status']
x=data.drop(['loan_status'],axis=1)
#使用pandas庫將類別變量編碼
x =pd.get_dummies(x)
n_sample = y.shape[0]
n_pos_sample = y[y == 0].shape[0]
n_neg_sample = y[y == 1].shape[0]
print('樣本個數:{}; 正樣本佔{:.2%}; 負樣本佔{:.2%}'.format(n_sample,
                                                   n_pos_sample / n_sample,
                                                   n_neg_sample / n_sample))
print('特徵維數:', x.shape[1])
樣本個數:5973; 正樣本佔66.97%; 負樣本佔33.03%
特徵維數: 7167

特徵工程

#數據進行分割(訓練數據和測試數據)
from sklearn.model_selection  import train_test_split#測試集和訓練集
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, train_size=0.8, random_state=14)
x_train, x_test, y_train, y_test = x_train1, x_test1, y_train1, y_test1
print ("訓練數據集樣本數目:%d, 測試數據集樣本數目:%d" % (x_train.shape[0], x_test.shape[0]))
# Cast the labels to plain integers. The builtin int is used because the
# np.int alias was deprecated in NumPy 1.20 and removed in 1.24.
y_train = y_train.astype(int)
y_test = y_test.astype(int)
訓練數據集樣本數目:4778, 測試數據集樣本數目:1195
# Hyper-parameter search setup for the random forest.
from sklearn.pipeline import Pipeline  # chains preprocessing steps and the estimator
from sklearn.model_selection import GridSearchCV  # cross-validated grid search for the best parameters
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
# Imported here because the pipeline below instantiates the classifier; without
# it this cell raises NameError when the notebook is run top to bottom.
from sklearn.ensemble import RandomForestClassifier
pipes = Pipeline([
            ('mms', MinMaxScaler()),  # min-max normalisation
            ('pca', PCA()),  # dimensionality reduction
            ('RandomForestClassifier', RandomForestClassifier(criterion='gini'))
        ])
# 參數
#
# estimators = [1,50,100,500]
# depth = [1,2,3,7,15]
parameters = [
    {
    "pca__n_components": [1,2,3,4],
    "RandomForestClassifier__n_estimators":[1,50,100,500],
    "RandomForestClassifier__max_depth":[1,2,3,7,15]
    }
]
#獲取數據
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
gscv = GridSearchCV(pipes, param_grid=parameters)
gscv.fit(x_train2, y_train2)
print ("score值:",gscv.best_score_,"最優參數列表:", gscv.best_params_)
score值: 0.6720405704396591 最優參數列表: {'RandomForestClassifier__max_depth': 7, 'RandomForestClassifier__n_estimators': 500, 'pca__n_components': 4}
#標準化
ss = MinMaxScaler()#分類模型,經常使用的是minmaxscaler歸一化,迴歸模型經常用standardscaler
x_train = ss.fit_transform(x_train, y_train)
x_test = ss.transform(x_test)
x_train.shape
(4778, 7167)
#降維
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
x_train.shape
print(pca.explained_variance_ratio_)
[0.08187674 0.05705152 0.05380546 0.04683824]
#隨機森林模型
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=2000, criterion='gini', max_depth=7, random_state=0)
forest.fit(x_train, y_train)#max_depth一般不宜設置過大,把每個模型作爲一個弱分類器
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
#模型效果評估
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
score = forest.score(x_test, y_test)
print ("準確率:%.2f%%" % (score * 100))
#模型預測
y_score = forest.predict(x_test)# prodict_proba輸出概率
準確率:66.78%
# Compute ROC curve and ROC area for each class
import matplotlib.pyplot as plt
fpr,tpr,threshold = roc_curve(y_test, y_score) ###計算真正率和假正率
roc_auc = auc(fpr,tpr) ###計算auc的值
print('auc:%.2f'%(roc_auc))
auc:0.51
# Plot the ROC curve. Only one figure is created: a bare plt.figure() followed by
# plt.figure(figsize=...) leaves an extra empty figure in the output.
lw = 2
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)  # false positive rate on x, true positive rate on y
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
<Figure size 432x288 with 0 Axes>

在這裏插入圖片描述

決策樹

#參數優化
from sklearn.tree import DecisionTreeClassifier
pipe = Pipeline([
            ('mms', MinMaxScaler()),
            ('pca', PCA()),
            ('decision', DecisionTreeClassifier(random_state=0))
        ])

# 參數
parameters = {
    "pca__n_components": [0.5,0.99],#設置爲浮點數代表主成分方差所佔最小比例的閾值
    "decision__criterion": ["gini", "entropy"],
    "decision__max_depth": [1,2,3,4,5,6,7,8,9,10]
}
#數據
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
#模型構建:通過網格交叉驗證,尋找最優參數列表, param_grid可選參數列表,cv:進行幾折交叉驗證
gscv = GridSearchCV(pipe, param_grid=parameters,cv=3)
#模型訓練
gscv.fit(x_train2, y_train2)
#算法的最優解
print("最優參數列表:", gscv.best_params_)
print("score值:",gscv.best_score_)
最優參數列表: {'decision__criterion': 'gini', 'decision__max_depth': 4, 'pca__n_components': 0.99}
score值: 0.6917121178186392
# Dimensionality reduction for the decision tree.
# x_train / x_test were already scaled and reduced to 4 principal components for
# the random forest above, so running PCA on them again would make
# n_components=0.99 act on a 4-dimensional input. Rebuild the features from the
# untouched split (x_train1 / x_test1), following the same mms -> pca steps as
# the tuned pipeline.
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
tree_scaler = MinMaxScaler()
pca = PCA(n_components=0.99)
x_train = pca.fit_transform(tree_scaler.fit_transform(x_train1))
x_test = pca.transform(tree_scaler.transform(x_test1))
x_train.shape
print(pca.explained_variance_ratio_)
[0.34176263 0.23813938 0.22458996 0.19550803]
tree = DecisionTreeClassifier(criterion='gini', max_depth=4)
tree.fit(x_train, y_train) # fit模型訓練
# 模型相關的指標輸出
# print("訓練集上的準確率:%.3f" % tree.score(x_train, y_train))
y_hat = tree.predict(x_test) # 獲取預測值
print("準確率:%.3f" % (np.mean(y_hat == y_test)))
準確率:0.671
# Compute the ROC curve and AUC for the decision tree.
import matplotlib.pyplot as plt
# Rank the test samples by the predicted probability of label 1. Passing the hard
# predictions (y_hat) would give roc_curve only a single operating point.
tree_score = tree.predict_proba(x_test)[:, 1]
fpr,tpr,threshold = roc_curve(y_test, tree_score)  # false/true positive rates per threshold
roc_auc = auc(fpr,tpr)  # area under the ROC curve
print('auc:%.2f'%(roc_auc))
auc:0.51
# Plot the ROC curve. Only one figure is created: a bare plt.figure() followed by
# plt.figure(figsize=...) leaves an extra empty figure in the output.
lw = 2
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)  # false positive rate on x, true positive rate on y
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
<Figure size 432x288 with 0 Axes>

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章