import pandas as pd
# 忽略彈出的warnings
import warnings
warnings.filterwarnings('ignore')
text=pd.read_excel('data/LoanStats_securev1_2019Q4.xlsx')
text.head()
id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | ... | num_tl_90g_dpd_24m | num_tl_op_past_12m | pct_tl_nvr_dlq | percent_bc_gt_75 | pub_rec_bankruptcies | tax_liens | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 164027473 | 20000 | 20000 | 20000 | 36 months | 0.1240 | 668.12 | B | B4 | NaN | ... | 0 | 2 | 100.0 | 50.0 | 1 | 0 | 60800 | 42566 | 5200 | 40000.0 |
1 | 163984413 | 16500 | 16500 | 16500 | 60 months | 0.1033 | 353.27 | B | B1 | NaN | ... | 0 | 0 | 100.0 | 0.0 | 0 | 0 | 223390 | 40913 | 40500 | 39890.0 |
2 | 164193225 | 7500 | 7500 | 7500 | 36 months | 0.1240 | 250.55 | B | B4 | Rn | ... | 0 | 7 | 54.5 | 16.7 | 0 | 0 | 138468 | 102122 | 47700 | 90768.0 |
3 | 162948736 | 19000 | 19000 | 18975 | 36 months | 0.0646 | 581.99 | A | A1 | Tech Ops Analyst | ... | 0 | 0 | 100.0 | 40.0 | 0 | 0 | 184034 | 28461 | 38400 | 35000.0 |
4 | 164161686 | 10000 | 10000 | 10000 | 36 months | 0.2055 | 374.45 | D | D2 | Planner | ... | 0 | 2 | 100.0 | 16.7 | 0 | 0 | 639373 | 161516 | 24600 | 172818.0 |
5 rows × 114 columns
text['loan_status'].value_counts()
Current 122625
Fully Paid 3539
In Grace Period 1079
Late (31-120 days) 509
Late (16-30 days) 304
Charged Off 80
n 1
Name: loan_status, dtype: int64
#0爲已經完成的
def function(x):
if 'Current' in x:
return 0
elif 'Fully Paid' in x:
return 0
else:
return 1
text['loan_status']=text.apply(lambda x:function(x['loan_status']),axis=1)
text['loan_status'].value_counts()
0 126164
1 1973
Name: loan_status, dtype: int64
pos_trainDf = text[text['loan_status'] == 1]
neg_trainDf = text[text['loan_status'] == 0].sample(n=5000, random_state=2018)
text = pd.concat([pos_trainDf, neg_trainDf], axis=0).sample(frac=1.0,random_state=2018)
text.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6973 entries, 110105 to 92872
Columns: 114 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(36), int64(50), object(27)
memory usage: 6.1+ MB
缺失值查看
check_null = text.isnull().sum(axis=0).sort_values(ascending=False)/float(len(text)) #查看缺失值比例
print(check_null[check_null >0.2]) # 查看缺失比例大於20%的屬性。
desc 0.999857
mths_since_last_record 0.899613
verification_status_joint 0.882977
annual_inc_joint 0.864334
dti_joint 0.864334
mths_since_recent_bc_dlq 0.794206
mths_since_last_major_derog 0.771691
mths_since_recent_revol_delinq 0.704145
mths_since_last_delinq 0.551556
dtype: float64
thresh_count = len(text)*0.4 # 設定閥值
data = text.dropna(thresh=thresh_count, axis=1 ) #若某一列數據缺失的數量超過閥值就會被刪除
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6973 entries, 110105 to 92872
Columns: 106 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(30), int64(50), object(25)
memory usage: 5.7+ MB
刪除無意義的列
sub_grade:與Grade的信息重複
emp_title :缺失值較多,同時不能反映借款人收入或資產的真實情況
zip_code:地址郵編,郵編顯示不全,沒有意義
addr_state:申請地址所屬州,不能反映借款人的償債能力
last_credit_pull_d :LendingClub平臺最近一個提供貸款的時間,沒有意義
policy_code : 變量信息全爲1
pymnt_plan 基本是n
title: title與purpose的信息重複,同時title的分類信息更加離散
next_pymnt_d : 下一個付款時間,沒有意義
policy_code : 沒有意義
collection_recovery_fee: 全爲0,沒有意義
earliest_cr_line : 記錄的是借款人發生第一筆借款的時間
issue_d : 貸款發行時間,這裏提前向模型泄露了信息
last_pymnt_d、collection_recovery_fee、last_pymnt_amnt: 預測貸款違約模型是貸款前的風險控制手段,這些貸後信息都會影響我們訓練模型的效果,在此將這些信息刪除
drop_list = ['sub_grade', 'emp_title', 'title', 'zip_code', 'addr_state',
'mths_since_last_delinq' ,'initial_list_status','title','issue_d','last_pymnt_d','last_pymnt_amnt',
'next_pymnt_d','last_credit_pull_d','policy_code','collection_recovery_fee', 'earliest_cr_line']
data.drop(drop_list, axis=1, inplace = True)
data.head()
id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | emp_length | home_ownership | ... | num_tl_90g_dpd_24m | num_tl_op_past_12m | pct_tl_nvr_dlq | percent_bc_gt_75 | pub_rec_bankruptcies | tax_liens | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
110105 | 160041532 | 5000 | 5000 | 5000 | 36 months | 0.1102 | 163.75 | B | NaN | RENT | ... | 0 | 0 | 66.7 | 100.0 | 0 | 0 | 4900 | 3313 | 2800 | 0.0 |
36272 | 162224912 | 9000 | 9000 | 9000 | 36 months | 0.0819 | 282.82 | A | 10+ years | MORTGAGE | ... | 1 | 1 | 89.7 | 0.0 | 0 | 0 | 443788 | 77239 | 45400 | 84288.0 |
66408 | 161687267 | 15950 | 15950 | 15950 | 36 months | 0.1171 | 527.57 | B | 10+ years | MORTGAGE | ... | 0 | 2 | 100.0 | 0.0 | 0 | 0 | 233421 | 115483 | 62100 | 90541.0 |
121280 | 159844563 | 5600 | 5600 | 5600 | 36 months | 0.1171 | 185.23 | B | 10+ years | RENT | ... | 0 | 7 | 100.0 | 0.0 | 0 | 0 | 64866 | 5486 | 51000 | 4166.0 |
20195 | 163365512 | 28000 | 28000 | 28000 | 60 months | 0.0819 | 570.29 | A | < 1 year | MORTGAGE | ... | 0 | 4 | 100.0 | 10.0 | 0 | 0 | 115641 | 66401 | 33800 | 75041.0 |
5 rows × 91 columns
分類變量
objectColumns = data.select_dtypes(include=["object"]).columns
data[objectColumns].isnull().sum().sort_values(ascending=False)
emp_length 651
application_type 1
url 1
total_acc 0
delinq_2yrs 0
purpose 0
pymnt_plan 0
verification_status 0
annual_inc 0
home_ownership 0
grade 0
term 0
dtype: int64
# data['int_rate'] = data['int_rate'].str.rstrip('%').astype('float')
# data['revol_util'] = data['revol_util'].str.rstrip('%').astype('float')
# data['annual_inc'] = data['annual_inc'].str.replace(",","").astype('float')
import numpy as np
objectColumns = data.select_dtypes(include=["object"]).columns # 篩選數據類型爲object的數據
data[objectColumns] = data[objectColumns].fillna("Unknown") #以分類“Unknown”填充缺失值
import missingno as msno
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False
%matplotlib inline
msno.bar(data[objectColumns]) #可視化
<matplotlib.axes._subplots.AxesSubplot at 0x238b1e15a58>
mapping_dict = {
"emp_length": {
"10+ years": 10,
"9 years": 9,
"8 years": 8,
"7 years": 7,
"6 years": 6,
"5 years": 5,
"4 years": 4,
"3 years": 3,
"2 years": 2,
"1 year": 1,
"< 1 year": 0,
"n/a": 0
},
"grade":{
"A": 1,
"B": 2,
"C": 3,
"D": 4,
"E": 5,
"F": 6,
"G": 7
}
}
data = data.replace(mapping_dict) #變量映射
數值類型缺失值
data.select_dtypes(include=[np.number]).isnull().sum().sort_values(ascending=False)
il_util 1011
mths_since_recent_inq 773
mo_sin_old_il_acct 234
mths_since_rcnt_il 234
bc_util 116
...
total_cu_tl 0
inq_fi 0
total_rev_hi_lim 0
total_bc_limit 0
id 0
Length: 80, dtype: int64
numColumns = data.select_dtypes(include=[np.number]).columns
msno.matrix(data[numColumns]) #缺失值可視化
<matplotlib.axes._subplots.AxesSubplot at 0x238d13c17f0>
data.isnull().sum().sum()
mean_cols=data.mean()
data= data.fillna(mean_cols)
目標變量
y=data['int_rate']
x=data.drop(['int_rate'],axis=1)
#使用pandas庫將類別變量編碼
x=pd.get_dummies(x)
特徵工程
#數據進行分割(訓練數據和測試數據)
from sklearn.model_selection import train_test_split#測試集和訓練集
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, train_size=0.8, random_state=14)
x_train, x_test, y_train, y_test = x_train1, x_test1, y_train1, y_test1
print ("訓練數據集樣本數目:%d, 測試數據集樣本數目:%d" % (x_train.shape[0], x_test.shape[0]))
y_train = y_train.astype(np.int)
y_test = y_test.astype(np.int)
訓練數據集樣本數目:5578, 測試數據集樣本數目:1395
#標準化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))
x_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5578 entries, 0 to 5577
Columns: 8267 entries, 0 to 8266
dtypes: float64(8267)
memory usage: 351.8 MB
#降維
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
x_train.shape
print(pca.explained_variance_ratio_)
[0.00212175 0.00138494]
y_train.shape
(5578,)
神經網絡
import keras
from keras.models import Sequential
from keras.layers import Dense
classifier = Sequential()
Using Theano backend.
WARNING (theano.configdefaults): g++ not available, if using conda: `conda install m2w64-toolchain`
D:\sofewore\anaconda\lib\site-packages\theano\configdefaults.py:560: UserWarning: DeprecationWarning: there is no c++ compiler.This is deprecated and with Theano 0.11 a c++ compiler will be mandatory
warnings.warn("DeprecationWarning: there is no c++ compiler."
WARNING (theano.configdefaults): g++ not detected ! Theano will be unable to execute optimized C-implementations (for both CPU and GPU) and will default to Python implementations. Performance will be severely degraded. To remove this warning, set Theano flags cxx to an empty string.
WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=2))
model.add(Dense(1, activation='relu'))
model.compile(optimizer='rmsprop',
loss='mse',
metrics=['accuracy'])
model.fit(x_train, y_train, batch_size =1000,epochs=1)
y_true=model.predict(x_test)
Epoch 1/1
5578/5578 [==============================] - 13s 2ms/step - loss: 0.2186 - accuracy: 1.0000