%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
warnings.filterwarnings('ignore')
# Global plotting configuration.
# NOTE(review): Matplotlib 3.6 deprecated the bare "seaborn" style name - confirm the installed version still accepts it.
plt.style.use("seaborn")
plt.rc('font', family='SimHei', size=13) # display Chinese characters
plt.rcParams['axes.unicode_minus'] = False # display the minus sign correctly
# Load the data
data = pd.read_csv(r"./cs_training.csv",encoding='gbk')
# Inspect the dataset
# data.head(10)
# Rename the feature columns to Chinese names
column={'SeriousDlqin2yrs':'好壞客戶',
'RevolvingUtilizationOfUnsecuredLines':'可用額度比值',
'age':'年齡',
'NumberOfTime30-59DaysPastDueNotWorse':'逾期30-59天筆數',
'DebtRatio':'負債率',
'MonthlyIncome':'月收入',
'NumberOfOpenCreditLinesAndLoans':'信貸數量',
'NumberOfTimes90DaysLate':'逾期90天筆數',
'NumberRealEstateLoansOrLines':'固定資產貸款量',
'NumberOfTime60-89DaysPastDueNotWorse':'逾期60-89天筆數',
'NumberOfDependents':'家屬數量'}
data.rename(columns=column,inplace=True)
data.head()
好壞客戶 | 可用額度比值 | 年齡 | 逾期30-59天筆數 | 負債率 | 月收入 | 信貸數量 | 逾期90天筆數 | 固定資產貸款量 | 逾期60-89天筆數 | 家屬數量 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.766127 | 45 | 2 | 0.802982 | 9120.0 | 13 | 0 | 6 | 0 | 2.0 |
1 | 0 | 0.957151 | 40 | 0 | 0.121876 | 2600.0 | 4 | 0 | 0 | 0 | 1.0 |
2 | 0 | 0.658180 | 38 | 1 | 0.085113 | 3042.0 | 2 | 1 | 0 | 0 | 0.0 |
3 | 0 | 0.233810 | 30 | 0 | 0.036050 | 3300.0 | 5 | 0 | 0 | 0 | 0.0 |
4 | 0 | 0.907239 | 49 | 1 | 0.024926 | 63588.0 | 7 | 0 | 1 | 0 | 0.0 |
from sklearn.ensemble import RandomForestRegressor
# Fill missing monthly-income values with a random-forest prediction
def set_missing(df):
    """Impute missing '月收入' values in place and return ``df``.

    A ``RandomForestRegressor`` is fitted on the rows whose '月收入' is
    present, using the columns at positions 0-4 and 6-9 as features. Its
    predictions, rounded to whole numbers, are written into the rows where
    '月收入' is missing.

    Args:
        df: DataFrame that is modified in place. The code takes column
            position 5 as the regression target, so '月收入' is assumed to
            sit at that position.

    Returns:
        The same ``df`` object. It is returned unchanged when no '月收入'
        value is missing.
    """
    missing = df['月收入'].isnull()
    # Nothing to impute: predicting on an empty array would raise.
    if not missing.any():
        return df

    # Target column first, then the feature columns (position 10 is left out).
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    # .values turns each part into a plain array.
    known = process_df[~missing].values
    unknown = process_df[missing].values

    # X: features of the rows with a known income; y: the income itself.
    X = known[:, 1:]
    y = known[:, 0]

    rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)

    # Predict the unknown incomes and write them back into the original frame.
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    df.loc[missing, '月收入'] = predicted
    return df
# Impute the frequently-missing income column with the random forest, then
# drop the few rows that still contain missing values and any duplicate rows.
data = set_missing(data).dropna().drop_duplicates()
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 145563 entries, 0 to 149999
Data columns (total 11 columns):
好壞客戶 145563 non-null int64
可用額度比值 145563 non-null float64
年齡 145563 non-null int64
逾期30-59天筆數 145563 non-null int64
負債率 145563 non-null float64
月收入 145563 non-null float64
信貸數量 145563 non-null int64
逾期90天筆數 145563 non-null int64
固定資產貸款量 145563 non-null int64
逾期60-89天筆數 145563 non-null int64
家屬數量 145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.3 MB
# Keep only the rows whose three past-due counters are all below 80 and
# whose age is positive.
keep_rows = (
    (data['逾期30-59天筆數'] < 80)
    & (data['逾期90天筆數'] < 80)
    & (data['逾期60-89天筆數'] < 80)
    & (data['年齡'] > 0)
)
data = data[keep_rows]
col_list = data.columns.values
col_list
array(['好壞客戶', '可用額度比值', '年齡', '逾期30-59天筆數', '負債率', '月收入', '信貸數量',
'逾期90天筆數', '固定資產貸款量', '逾期60-89天筆數', '家屬數量'], dtype=object)
# Columns that are not trimmed: the target and the three past-due counters.
# They are selected by name rather than by position, so a change in column
# order cannot silently trim the wrong columns.
untrimmed_cols = {'好壞客戶', '逾期30-59天筆數', '逾期90天筆數', '逾期60-89天筆數'}
new_col_list = [name for name in col_list if name not in untrimmed_cols]
# One-sided trimming: keep only rows strictly below the 99% quantile of each
# remaining column. Columns are processed one after another, so each quantile
# is computed on the rows that survived the previous columns.
for item in new_col_list:
    data = data[data[item] < data[item].quantile(0.99)]
import woe.feature_process as fp
import woe.eval as eval
data.columns
Index(['好壞客戶', '可用額度比值', '年齡', '逾期30-59天筆數', '負債率', '月收入', '信貸數量', '逾期90天筆數',
'固定資產貸款量', '逾期60-89天筆數', '家屬數量'],
dtype='object')
data.rename(columns={'好壞客戶': 'target'}, inplace=True)
# WOE binning, IV computation and WOE transformation
# Work on a copy: a plain `data_woe = data` would make both names point to the
# same frame, and the WOE values written below would overwrite the raw data.
data_woe = data.copy()  # holds the WOE value of every feature
civ_list = []
n_positive = data['target'].sum()
n_negtive = len(data) - n_positive
min_sample = 0.05 * len(data)
# `feature` (not `column`) so the rename dict defined earlier is not shadowed.
for feature in list(data.columns[1:]):
    if data[feature].dtypes == 'object':
        civ = fp.proc_woe_discrete(data, feature, n_positive, n_negtive, min_sample, alpha=0.05)
    else:
        civ = fp.proc_woe_continuous(data, feature, n_positive, n_negtive, min_sample, alpha=0.05)
    civ_list.append(civ)
    data_woe[feature] = fp.woe_trans(data[feature], civ)
civ_df = eval.eval_feature_detail(civ_list,'output_feature_detail_0315.csv')
# Select the variables whose IV exceeds the threshold
iv_thre = 0.001
iv = civ_df[['var_name','iv']].drop_duplicates()
x_columns = iv.var_name[iv.iv > iv_thre]
-------------process continuous variable:可用額度比值-------------
---------------process continuous variable:年齡---------------
-----------process continuous variable:逾期30-59天筆數-----------
--------------process continuous variable:負債率---------------
--------------process continuous variable:月收入---------------
--------------process continuous variable:信貸數量--------------
------------process continuous variable:逾期90天筆數-------------
------------process continuous variable:固定資產貸款量-------------
-----------process continuous variable:逾期60-89天筆數-----------
--------------process continuous variable:家屬數量--------------
可用額度比值
年齡
逾期30-59天筆數
負債率
月收入
信貸數量
逾期90天筆數
固定資產貸款量
逾期60-89天筆數
家屬數量
civ_df
var_name | split_list | sub_total_sample_num | positive_sample_num | negative_sample_num | sub_total_num_percentage | positive_rate_in_sub_total | woe_list | iv_list | iv | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 可用額度比值 | (-INF,0.0] | 9352 | 281 | 9071 | 0.071213 | 0.030047 | -0.757463 | 0.029626 | 1.097527 |
1 | 可用額度比值 | (0.0,0.04215617400000002] | 29188 | 365 | 28823 | 0.222259 | 0.012505 | -1.652011 | 0.312454 | 1.097527 |
2 | 可用額度比值 | (0.04215617400000002,0.0596119858] | 6962 | 111 | 6851 | 0.053014 | 0.015944 | -1.405599 | 0.059004 | 1.097527 |
3 | 可用額度比值 | (0.0596119858,0.13857709429999995] | 17901 | 375 | 17526 | 0.136312 | 0.020949 | -1.127495 | 0.108464 | 1.097527 |
4 | 可用額度比值 | (0.13857709429999995,0.21535932080000003] | 10113 | 295 | 9818 | 0.077008 | 0.029170 | -0.787977 | 0.034242 | 1.097527 |
5 | 可用額度比值 | (0.21535932080000003,0.30067412204] | 8274 | 296 | 7978 | 0.063004 | 0.035775 | -0.577063 | 0.016386 | 1.097527 |
6 | 可用額度比值 | (0.30067412204,0.3974544458] | 7510 | 371 | 7139 | 0.057187 | 0.049401 | -0.240106 | 0.002970 | 1.097527 |
7 | 可用額度比值 | (0.3974544458,0.5331554074] | 8506 | 586 | 7920 | 0.064771 | 0.068893 | 0.113193 | 0.000872 | 1.097527 |
8 | 可用額度比值 | (0.5331554074,0.74050784496] | 9985 | 996 | 8989 | 0.076033 | 0.099750 | 0.517010 | 0.025541 | 1.097527 |
9 | 可用額度比值 | (0.74050784496,0.90349439404] | 7295 | 1103 | 6192 | 0.055550 | 0.151199 | 0.991796 | 0.084555 | 1.097527 |
10 | 可用額度比值 | (0.90349439404,+INF) | 16238 | 3360 | 12878 | 0.123648 | 0.206922 | 1.373441 | 0.423411 | 1.097527 |
0 | 年齡 | (-INF,32.0] | 13531 | 1392 | 12139 | 0.103035 | 0.102875 | 0.551338 | 0.039964 | 0.046040 |
1 | 年齡 | (32.0,+INF) | 117793 | 6747 | 111046 | 0.896965 | 0.057278 | -0.083827 | 0.006076 | 0.046040 |
0 | 逾期30-59天筆數 | (-INF,0.0] | 111119 | 4279 | 106840 | 0.846144 | 0.038508 | -0.500593 | 0.170989 | 0.606073 |
1 | 逾期30-59天筆數 | (0.0,+INF) | 20205 | 3860 | 16345 | 0.153856 | 0.191042 | 1.273765 | 0.435084 | 0.606073 |
0 | 負債率 | (-INF,0.018495376] | 10665 | 466 | 10199 | 0.081211 | 0.043694 | -0.368839 | 0.009420 | 0.088009 |
1 | 負債率 | (0.018495376,0.087064379] | 8843 | 577 | 8266 | 0.067337 | 0.065249 | 0.054956 | 0.000208 | 0.088009 |
2 | 負債率 | (0.087064379,0.138218834] | 7533 | 449 | 7084 | 0.057362 | 0.059604 | -0.041551 | 0.000097 | 0.088009 |
3 | 負債率 | (0.138218834,0.191269577] | 9148 | 493 | 8655 | 0.069660 | 0.053892 | -0.148363 | 0.001437 | 0.088009 |
4 | 負債率 | (0.191269577,0.229044637] | 6898 | 383 | 6515 | 0.052527 | 0.055523 | -0.116807 | 0.000681 | 0.088009 |
5 | 負債率 | (0.229044637,0.26480176767999997] | 6780 | 313 | 6467 | 0.051628 | 0.046165 | -0.311244 | 0.004370 | 0.088009 |
6 | 負債率 | (0.26480176767999997,0.33095571454] | 12054 | 618 | 11436 | 0.091788 | 0.051269 | -0.201013 | 0.003398 | 0.088009 |
7 | 負債率 | (0.33095571454,0.37664756308] | 7551 | 440 | 7111 | 0.057499 | 0.058270 | -0.065603 | 0.000240 | 0.088009 |
8 | 負債率 | (0.37664756308,0.4237495164599999] | 6696 | 406 | 6290 | 0.050988 | 0.060633 | -0.023343 | 0.000028 | 0.088009 |
9 | 負債率 | (0.4237495164599999,0.54743575044] | 12664 | 918 | 11746 | 0.096433 | 0.072489 | 0.167949 | 0.002929 | 0.088009 |
10 | 負債率 | (0.54743575044,0.7263413320000001] | 9112 | 825 | 8287 | 0.069386 | 0.090540 | 0.409960 | 0.013976 | 0.088009 |
11 | 負債率 | (0.7263413320000001,2.6823588614000204] | 9111 | 1026 | 8085 | 0.069378 | 0.112611 | 0.652677 | 0.039439 | 0.088009 |
12 | 負債率 | (2.6823588614000204,1009.0] | 10925 | 639 | 10286 | 0.083191 | 0.058490 | -0.061614 | 0.000307 | 0.088009 |
13 | 負債率 | (1009.0,+INF) | 13344 | 586 | 12758 | 0.101611 | 0.043915 | -0.363574 | 0.011478 | 0.088009 |
0 | 月收入 | (-INF,1159.0] | 13281 | 877 | 12404 | 0.101132 | 0.066034 | 0.067753 | 0.000478 | 0.114078 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7 | 月收入 | (4831.0,5332.68] | 7201 | 470 | 6731 | 0.054834 | 0.065269 | 0.055274 | 0.000172 | 0.114078 |
8 | 月收入 | (5332.68,5917.0] | 7381 | 432 | 6949 | 0.056205 | 0.058529 | -0.060907 | 0.000203 | 0.114078 |
9 | 月收入 | (5917.0,6667.0] | 8647 | 533 | 8114 | 0.065845 | 0.061640 | -0.005805 | 0.000002 | 0.114078 |
10 | 月收入 | (6667.0,7916.0] | 10306 | 519 | 9787 | 0.078478 | 0.050359 | -0.219886 | 0.003448 | 0.114078 |
11 | 月收入 | (7916.0,8333.0] | 7967 | 327 | 7640 | 0.060667 | 0.041044 | -0.434172 | 0.009484 | 0.114078 |
12 | 月收入 | (8333.0,10300.0] | 10340 | 489 | 9851 | 0.078737 | 0.047292 | -0.285946 | 0.005687 | 0.114078 |
13 | 月收入 | (10300.0,+INF) | 13361 | 474 | 12887 | 0.101741 | 0.035476 | -0.585747 | 0.027165 | 0.114078 |
0 | 信貸數量 | (-INF,3.0] | 18632 | 1864 | 16768 | 0.141878 | 0.100043 | 0.520272 | 0.048333 | 0.067247 |
1 | 信貸數量 | (3.0,4.0] | 10396 | 620 | 9776 | 0.079163 | 0.059638 | -0.040946 | 0.000130 | 0.067247 |
2 | 信貸數量 | (4.0,5.0] | 11689 | 671 | 11018 | 0.089009 | 0.057404 | -0.081496 | 0.000570 | 0.067247 |
3 | 信貸數量 | (5.0,6.0] | 12373 | 651 | 11722 | 0.094217 | 0.052615 | -0.173693 | 0.002635 | 0.067247 |
4 | 信貸數量 | (6.0,7.0] | 12102 | 629 | 11473 | 0.092154 | 0.051975 | -0.186600 | 0.002958 | 0.067247 |
5 | 信貸數量 | (7.0,8.0] | 11422 | 518 | 10904 | 0.086976 | 0.045351 | -0.329890 | 0.008205 | 0.067247 |
6 | 信貸數量 | (8.0,9.0] | 10219 | 568 | 9651 | 0.077815 | 0.055583 | -0.115675 | 0.000990 | 0.067247 |
7 | 信貸數量 | (9.0,10.0] | 8745 | 488 | 8257 | 0.066591 | 0.055803 | -0.111481 | 0.000788 | 0.067247 |
8 | 信貸數量 | (10.0,11.0] | 7431 | 405 | 7026 | 0.056585 | 0.054501 | -0.136466 | 0.000993 | 0.067247 |
9 | 信貸數量 | (11.0,13.0] | 11199 | 615 | 10584 | 0.085278 | 0.054916 | -0.128456 | 0.001330 | 0.067247 |
10 | 信貸數量 | (13.0,+INF) | 17116 | 1110 | 16006 | 0.130334 | 0.064852 | 0.048416 | 0.000312 | 0.067247 |
0 | 逾期90天筆數 | (-INF,0.0] | 124488 | 5426 | 119062 | 0.947946 | 0.043587 | -0.371422 | 0.111376 | 0.800610 |
1 | 逾期90天筆數 | (0.0,+INF) | 6836 | 2713 | 4123 | 0.052054 | 0.396870 | 2.298494 | 0.689234 | 0.800610 |
0 | 固定資產貸款量 | (-INF,0.0] | 49471 | 3805 | 45666 | 0.376710 | 0.076914 | 0.231982 | 0.022454 | 0.043142 |
1 | 固定資產貸款量 | (0.0,1.0] | 48153 | 2429 | 45724 | 0.366673 | 0.050443 | -0.218124 | 0.015867 | 0.043142 |
2 | 固定資產貸款量 | (1.0,2.0] | 28413 | 1538 | 26875 | 0.216358 | 0.054130 | -0.143694 | 0.004196 | 0.043142 |
3 | 固定資產貸款量 | (2.0,+INF) | 5287 | 367 | 4920 | 0.040259 | 0.069416 | 0.121318 | 0.000625 | 0.043142 |
0 | 逾期60-89天筆數 | (-INF,0.0] | 125162 | 6053 | 119109 | 0.953078 | 0.048361 | -0.262465 | 0.058584 | 0.515526 |
1 | 逾期60-89天筆數 | (0.0,+INF) | 6162 | 2086 | 4076 | 0.046922 | 0.338526 | 2.047152 | 0.456942 | 0.515526 |
0 | 家屬數量 | (-INF,0.0] | 79954 | 4351 | 75603 | 0.608830 | 0.054419 | -0.138070 | 0.010928 | 0.028199 |
1 | 家屬數量 | (0.0,1.0] | 24473 | 1683 | 22790 | 0.186356 | 0.068770 | 0.111276 | 0.002423 | 0.028199 |
2 | 家屬數量 | (1.0,2.0] | 18117 | 1377 | 16740 | 0.137957 | 0.076006 | 0.219126 | 0.007295 | 0.028199 |
3 | 家屬數量 | (2.0,3.0] | 8780 | 728 | 8052 | 0.066858 | 0.082916 | 0.313645 | 0.007553 | 0.028199 |
66 rows × 10 columns
x_columns
0 可用額度比值
0 年齡
0 逾期30-59天筆數
0 負債率
0 月收入
0 信貸數量
0 逾期90天筆數
0 固定資產貸款量
0 逾期60-89天筆數
0 家屬數量
Name: var_name, dtype: object
iv
var_name | iv | |
---|---|---|
0 | 可用額度比值 | 1.097527 |
0 | 年齡 | 0.046040 |
0 | 逾期30-59天筆數 | 0.606073 |
0 | 負債率 | 0.088009 |
0 | 月收入 | 0.114078 |
0 | 信貸數量 | 0.067247 |
0 | 逾期90天筆數 | 0.800610 |
0 | 固定資產貸款量 | 0.043142 |
0 | 逾期60-89天筆數 | 0.515526 |
0 | 家屬數量 | 0.028199 |
data_woe.head()
target | 可用額度比值 | 年齡 | 逾期30-59天筆數 | 負債率 | 月收入 | 信貸數量 | 逾期90天筆數 | 固定資產貸款量 | 逾期60-89天筆數 | 家屬數量 | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0 | 1.373441 | -0.083827 | -0.500593 | -0.041551 | 0.461028 | -0.040946 | -0.371422 | 0.231982 | -0.262465 | 0.111276 |
2 | 0 | 0.517010 | -0.083827 | 1.273765 | 0.054956 | 0.461028 | 0.520272 | 2.298494 | 0.231982 | -0.262465 | -0.138070 |
3 | 0 | -0.577063 | 0.551338 | -0.500593 | 0.054956 | 0.461028 | -0.081496 | -0.371422 | 0.231982 | -0.262465 | -0.138070 |
5 | 0 | -0.787977 | -0.083827 | -0.500593 | -0.065603 | 0.243014 | 0.520272 | -0.371422 | -0.218124 | -0.262465 | 0.111276 |
7 | 0 | 0.991796 | -0.083827 | -0.500593 | -0.116807 | 0.243014 | -0.329890 | -0.371422 | 0.231982 | -0.262465 | -0.138070 |
模型建立
信用評分卡模型在國外是一種成熟的預測方法,尤其在信用風險評估以及金融風險控制領域得到了比較廣泛的使用。其原理是將模型變量以WOE編碼方式離散化之後,運用logistic迴歸進行建模,屬於針對二分類變量的廣義線性模型。下面將目標變量爲1記爲違約用戶,目標變量爲0記爲正常用戶,採用sklearn中的LogisticRegression進行建模
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# 模型評估
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import auc
# Separate the label (first column) from the features (all other columns),
# then hold out 30% of the rows as a test set.
col_names = data_woe.columns.values
label_name, feature_names = col_names[0], col_names[1:]
X = data_woe[feature_names]
y = data_woe[label_name]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
# Fit the logistic regression on the training part.
lr = LogisticRegression(C=1000.0, random_state=0)
result = lr.fit(X_train, y_train)
result
LogisticRegression(C=1000.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=0, solver='warn', tol=0.0001, verbose=0,
warm_start=False)
# Predict the class label of every test row
y_pred = lr.predict(X_test)
y_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Predicted probability of being a bad customer (second column of
# predict_proba), rounded to 5 decimals
bad_proba = lr.predict_proba(X_test)[:, 1]
prob_pred = [round(p, 5) for p in bad_proba]
# Accuracy of the predicted labels
accuracy_score(y_test, y_pred)
0.9387532362048835
# The classes are imbalanced; the author considers a PR curve a poor fit here and evaluates with the ROC curve instead
FPR, TPR, thresholds = metrics.roc_curve(y_test, prob_pred, pos_label=1)
metrics.auc(FPR, TPR)
0.8499778184241903
# Draw the ROC curve together with the diagonal reference line
roc_auc = metrics.auc(FPR, TPR)
plt.plot(FPR, TPR, 'b', label='AUC = %0.2f' % roc_auc)  # ROC curve
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
# Restrict both axes to the unit interval
plt.axis([0, 1, 0, 1])
plt.xlabel('假正率')
plt.ylabel('真正率')
plt.show()
[外鏈圖片轉存失敗,源站可能有防盜鏈機制,建議將圖片保存下來直接上傳(img-ZuGQDu7G-1577332256208)(output_33_0.png)]
從上圖可知,AUC值爲0.85,說明該模型的預測效果還是不錯的,正確率較高
評分卡計算方法
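odds爲bad(違約)用戶概率 $p$ 與good(正常)用戶概率 $1-p$ 的比值:$odds = \frac{p}{1-p}$
評分卡設定的分值刻度可以通過將分值表示爲比率對數的線性表達式來定義。公式如下:
$$Score = A - B\ln(odds)$$
常數 A 和 B 通常被稱爲補償和刻度,它們的值可以通過將兩個已知或者假設的分值帶入 $Score = A - B\ln(odds)$ 中得到。通常,需要兩個假設:
- 在某個特定的比率設定特定的預期分值
- 指定比率翻番的分數(PDO,Point-to-Double Odds)
首先,設定比率爲 $\theta_0$ 的特定點的分值爲 $P_0$。然後,比率爲 $2\theta_0$ 的點分值爲 $P_0 - PDO$,帶入可以得到
$$P_0 = A - B\ln(\theta_0), \qquad P_0 - PDO = A - B\ln(2\theta_0)$$
解得 $B = \frac{PDO}{\ln 2}$,$A = P_0 + B\ln(\theta_0)$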
import math
# PDO: points needed to double the odds; P0: expected score at the reference
# odds; B: the scale factor derived from PDO
PDO, P0 = 20, 600
B = PDO / math.log(2)
B
28.85390081777927
# A is the offset, computed from P0, B and the odds value 1/60
A = P0 + B * math.log(1 / 60)
A
481.8621880878296
基於Logistic的評分卡構建
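最終,評分卡的分值可以寫成下列形式:
$$Score = A - B\,(\beta_0 + \beta_1 x_1 + \cdots + \beta_n x_n)$$
變量 $x_1,\cdots,x_n$ 爲自變量對應的WOE,$\beta_0,\beta_1,\cdots,\beta_n$ 爲邏輯斯蒂迴歸方程的係數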
# Coefficient list of the logistic regression: the intercept is placed first,
# followed by the fitted coefficients
coef_list = list(result.coef_[0])
coef_list.insert(0, result.intercept_[0])
# Compute the credit score of every row
def credit_socre(data, coef, offset=None, factor=None):
    """Return the scorecard score of every row of ``data``.

    The score of row ``i`` is
    ``offset - factor * (coef[0] + sum_j data[i, j] * coef[j + 1])``.

    Args:
        data: DataFrame of numeric values, one column per coefficient.
        coef: Sequence holding the intercept first, followed by one
            coefficient for each column of ``data``.
        offset: Additive constant of the score. Defaults to the
            module-level ``A``.
        factor: Multiplier of the linear term. Defaults to the
            module-level ``B``.

    Returns:
        A list with one float per row, in row order.

    Raises:
        ValueError: If ``coef`` does not hold exactly one intercept plus
            one coefficient per column.
    """
    if offset is None:
        offset = A
    if factor is None:
        factor = B

    weights = np.asarray(coef, dtype=float)
    if weights.shape[0] != data.shape[1] + 1:
        raise ValueError(
            "coef must hold 1 intercept + %d coefficients, got %d values"
            % (data.shape[1], weights.shape[0])
        )

    # One matrix-vector product instead of a Python loop over every cell.
    linear_term = weights[0] + np.asarray(data, dtype=float) @ weights[1:]
    return (offset - factor * linear_term).tolist()
# Score every row from its WOE features (all columns except the first).
score_list = credit_socre(data_woe.iloc[:, 1:], coef_list)
data_woe.insert(11, 'credit_score', score_list)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same
# head-plus-tail preview on old and new versions alike.
pd.concat([data_woe.head(), data_woe.tail()])
# Insert the credit score into `data` as well.
# NOTE(review): the column name 'credit_socre' is kept as is; it differs in
# spelling from 'credit_score' used above - confirm before renaming.
data.insert(11, 'credit_socre', score_list)
pd.concat([data.head(), data.tail()])