信用評分卡模型

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed the bundled "seaborn" style to "seaborn-v0_8" and
# later releases dropped the old name, so fall back when it is unavailable.
try:
    plt.style.use("seaborn")
except OSError:
    plt.style.use("seaborn-v0_8")
plt.rc('font', family='SimHei', size=13)  # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

# Load the data
data = pd.read_csv(r"./cs_training.csv",encoding='gbk')
# Inspect the dataset
# data.head(10)

# Rename the feature columns to Chinese names; every later cell refers to the
# columns by these names.
column={'SeriousDlqin2yrs':'好壞客戶',
        'RevolvingUtilizationOfUnsecuredLines':'可用額度比值',
        'age':'年齡',
        'NumberOfTime30-59DaysPastDueNotWorse':'逾期30-59天筆數',
        'DebtRatio':'負債率',
        'MonthlyIncome':'月收入',
        'NumberOfOpenCreditLinesAndLoans':'信貸數量',
        'NumberOfTimes90DaysLate':'逾期90天筆數',
        'NumberRealEstateLoansOrLines':'固定資產貸款量',
        'NumberOfTime60-89DaysPastDueNotWorse':'逾期60-89天筆數',
        'NumberOfDependents':'家屬數量'}
data.rename(columns=column,inplace=True)
data.head()

	好壞客戶	可用額度比值	年齡	逾期30-59天筆數	負債率	月收入	信貸數量	逾期90天筆數	固定資產貸款量	家屬數量
0	1	0.766127	45	2	0.802982	9120.0	13	0	6	2.0
1	0	0.957151	40	0	0.121876	2600.0	4	0	0	1.0
2	0	0.658180	38	1	0.085113	3042.0	2	1	0	0.0
3	0	0.233810	30	0	0.036050	3300.0	5	0	0	0.0
4	0	0.907239	49	1	0.024926	63588.0	7	0	1	0.0

from sklearn.ensemble import RandomForestRegressor

# 用隨機森林對缺失值預測填充函數
def set_missing(df):
    """Fill missing '月收入' values with random-forest predictions.

    A ``RandomForestRegressor`` is fitted on the rows whose '月收入' is
    present and its rounded predictions are written into the rows where it
    is missing.

    Args:
        df: DataFrame containing a '月收入' column. When values are missing,
            the column at position 5 is used as the regression target and
            the columns at positions 0-4 and 6-9 as features, so the frame
            needs at least ten columns with '月收入' among them.

    Returns:
        The same ``df`` object; it is modified in place.
    """
    missing = df['月收入'].isnull()
    # Nothing to impute: return early instead of asking the forest to predict
    # on an empty array, which raises.
    if not missing.any():
        return df

    # Target column first, then the features, so the slices below are positional.
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    # .values turns each part into a plain ndarray.
    known = process_df[~missing].values
    unknown = process_df[missing].values

    # X: features of the rows with a known income; y: the known income.
    X = known[:, 1:]
    y = known[:, 0]
    rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)

    # Predict the unknown incomes and write them back into the original frame.
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    df.loc[missing, '月收入'] = predicted
    return df

# Impute the column with many missing values via the random forest, then drop
# the few rows that still contain NaN and any duplicated rows.
data = set_missing(data).dropna().drop_duplicates()

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145563 entries, 0 to 149999
Data columns (total 11 columns):
好壞客戶          145563 non-null int64
可用額度比值        145563 non-null float64
年齡            145563 non-null int64
逾期30-59天筆數    145563 non-null int64
負債率           145563 non-null float64
月收入           145563 non-null float64
信貸數量          145563 non-null int64
逾期90天筆數       145563 non-null int64
固定資產貸款量       145563 non-null int64
逾期60-89天筆數    145563 non-null int64
家屬數量          145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.3 MB

# Keep only the rows whose three past-due counters are all below 80 and whose
# age is positive.
overdue_columns = ['逾期30-59天筆數', '逾期90天筆數', '逾期60-89天筆數']
data = data[(data[overdue_columns] < 80).all(axis=1) & (data['年齡'] > 0)]

col_list = data.columns.values
col_list

array(['好壞客戶', '可用額度比值', '年齡', '逾期30-59天筆數', '負債率', '月收入', '信貸數量',
       '逾期90天筆數', '固定資產貸款量', '逾期60-89天筆數', '家屬數量'], dtype=object)

# Columns at positions 0, 3, 7 and 9 are excluded from the quantile trimming.
skipped_positions = {0, 3, 7, 9}
new_col_list = [name for idx, name in enumerate(col_list) if idx not in skipped_positions]

# Drop the one-sided upper tail: for each remaining column keep only the rows
# strictly below its 99% quantile. The quantile is recomputed on the already
# trimmed data, so the columns are processed one after another.
for item in new_col_list:
    upper_bound = data[item].quantile(0.99)
    data = data[data[item] < upper_bound]

from sklearn.tree import DecisionTreeClassifier


def _optimal_binning_boundary(x, y):
    """
    Derive bin edges for a single feature from a small decision tree.

    A gini-based ``DecisionTreeClassifier`` (at most 6 leaves, at least 5
    samples per leaf) is fitted on ``x`` against ``y``; the thresholds of its
    split nodes, sorted ascending, become the inner edges.

    :param x: pandas Series holding the feature values
    :param y: pandas Series holding the labels
    :return: list ``[min(x) - 0.0001, *sorted thresholds, max(x) + 0.1]``
    """
    features = x.values.reshape(-1, 1)
    labels = y.values

    tree_model = DecisionTreeClassifier(criterion='gini',
                                        max_leaf_nodes=6,
                                        min_samples_leaf=5)
    tree_model.fit(features, labels)
    fitted = tree_model.tree_

    # Nodes whose left and right children differ are the split nodes; only
    # those carry a meaningful threshold.
    is_split = fitted.children_left != fitted.children_right
    inner_edges = sorted(fitted.threshold[is_split])

    # Pad the outer edges so the observed minimum and maximum lie strictly
    # inside the overall range.
    lower = features.min() - 0.0001
    upper = features.max() + 0.1
    return [lower, *inner_edges, upper]

# Feature columns (everything after the first column) and the good/bad label.
x = data.iloc[:, 1:]
y = data['好壞客戶']

def cut_func(data):
    """
    Bin every feature column and collect the results in a dict.

    The first column of ``data`` is used as the label; each remaining column
    is cut at the edges returned by ``_optimal_binning_boundary``.

    :param data: DataFrame with the label in the first column
    :return: dict mapping column name -> Series of integer bin codes
    """
    target = data[data.columns[0]]
    binned = {}
    for position, name in enumerate(data.columns[1:], start=1):
        edges = _optimal_binning_boundary(data.iloc[:, position], target)
        binned[name] = pd.cut(data[name], edges, labels=False)
    return binned

# Bin every feature column of the cleaned data; the dict is keyed by column name.
cut_dict = cut_func(data)

# WOE值計算
def get_woe_data(cut, data):
    """
    Compute the WOE of every bin.

    WOE is taken as ``ln((Bi / Gi) * (GT / BT))``, i.e. bad over good.
    Assumes ``data`` holds 0/1 labels with 1 marking a bad customer, so the
    second unstacked column is the bad count and the first the good count.

    :param cut: Series of bin codes used as the grouping key
    :param data: Series of labels aligned with ``cut``
    :return: Series of WOE values indexed by bin
    """
    bad_total = data.sum()  # all bad customers
    good_total = data.count() - bad_total  # all good customers
    counts = data.groupby(cut, as_index=True).value_counts().unstack()
    bad_per_bin = counts.iloc[:, 1]
    good_per_bin = counts.iloc[:, 0]
    odds = (bad_per_bin / good_per_bin) * (good_total / bad_total)
    return np.log(odds)

def cut_woe_func(src_dict, src_data):
    """
    Compute the per-bin WOE of every binned feature.

    The bins are taken from the ``src_dict`` argument itself rather than from
    a module-level variable, so the function works for any dict passed in.

    :param src_dict: dict mapping feature name -> Series of bin codes
    :param src_data: DataFrame containing the '好壞客戶' label column
    :return: dict mapping feature name -> Series of WOE values per bin
    """
    labels = src_data["好壞客戶"]
    return {key: get_woe_data(cut, labels) for key, cut in src_dict.items()}

# Per-bin WOE of every binned feature, keyed by column name.
cut_woe_dict = cut_woe_func(cut_dict, data)

# IV值計算
def get_IV_data(cut, cut_woe, data):
    """
    Compute the information value (IV) of one binned feature.

    IV is the sum over bins of ``(Bi / BT - Gi / GT) * WOE_i``. Assumes
    ``data`` holds 0/1 labels with 1 marking a bad customer.

    :param cut: Series of bin codes used as the grouping key
    :param cut_woe: Series of WOE values indexed by bin
    :param data: Series of labels aligned with ``cut``
    :return: the IV as a scalar
    """
    counts = data.groupby(cut, as_index=True).value_counts().unstack()
    bad_total = data.sum()
    good_total = data.count() - bad_total
    bad_share = counts.iloc[:, 1] / bad_total
    good_share = counts.iloc[:, 0] / good_total
    return ((bad_share - good_share) * cut_woe).sum()

def cut_IV_func(src_dict, src_cut_woe_dict, src_data):
    """
    Compute the IV of every binned feature.

    :param src_dict: dict mapping feature name -> Series of bin codes
    :param src_cut_woe_dict: dict mapping feature name -> per-bin WOE Series
    :param src_data: DataFrame containing the '好壞客戶' label column
    :return: dict mapping feature name -> IV
    """
    return {
        name: get_IV_data(binned, src_cut_woe_dict[name], src_data['好壞客戶'])
        for name, binned in src_dict.items()
    }

# IV of every binned feature, keyed by column name.
cut_IV_dict = cut_IV_func(cut_dict, cut_woe_dict, data)

cut_IV_dict

{'可用額度比值': 1.0496267788824982,
 '年齡': 0.2283880128708045,
 '逾期30-59天筆數': 0.6890352866527477,
 '負債率': 0.0797191570453572,
 '月收入': 0.1018015502249017,
 '信貸數量': 0.09975008041753788,
 '逾期90天筆數': 0.8367128665446881,
 '固定資產貸款量': 0.0431415986925943,
 '逾期60-89天筆數': 0.5379534841785022,
 '家屬數量': 0.028199260527371775}

# One-row DataFrame: one column per feature, holding that feature's IV.
IV_df = pd.DataFrame([cut_IV_dict])
IV_df

	可用額度比值	年齡	逾期30-59天筆數	負債率	月收入	信貸數量	逾期90天筆數	固定資產貸款量	逾期60-89天筆數	家屬數量
0	1.049627	0.228388	0.689035	0.079719	0.101802	0.09975	0.836713	0.043142	0.537953	0.028199

# Bar chart of the IV of each feature.
label_size = 15
iv = IV_df.plot.bar(rot=90, figsize=(10, 5), fontsize=10)
iv.set_title('特徵變量與IV值分佈圖', fontsize=label_size)
iv.set_xlabel('特徵變量', fontsize=label_size)
iv.set_ylabel('IV', fontsize=label_size)
plt.show()

![特徵變量與IV值分佈圖](output_33_0.png)

# Create woe_df, an empty DataFrame that will hold the WOE-encoded data
woe_df = pd.DataFrame()

# 轉換woe
def replace_data(cut, cut_woe):
    """
    Replace every bin code in ``cut`` with the WOE value of that bin.

    The distinct non-null codes of ``cut`` are sorted ascending and paired by
    position with ``cut_woe.values``. The whole code -> WOE table is built
    first and applied in a single pass, so a WOE value that happens to equal
    another bin code is never replaced a second time.

    :param cut: Series of bin codes
    :param cut_woe: Series of WOE values ordered by ascending bin code
    :return: a new Series of WOE values; ``cut`` itself is left unmodified
        and null entries stay null
    :raises IndexError: if ``cut`` has more distinct codes than ``cut_woe``
        has values
    """
    codes = sorted(cut.dropna().unique())
    woe_values = cut_woe.values
    mapping = {code: woe_values[pos] for pos, code in enumerate(codes)}
    return cut.map(mapping)

def gen_data_func(src_data, src_cut_dict, src_cut_woe_dict):
    """
    Add one WOE-encoded column per binned feature to ``src_data``.

    Each new column is named ``<feature name> + "WOE"`` and is produced by
    ``replace_data`` from the feature's bin codes and per-bin WOE values.

    :param src_data: DataFrame that receives the new columns (modified in place)
    :param src_cut_dict: dict mapping feature name -> Series of bin codes
    :param src_cut_woe_dict: dict mapping feature name -> per-bin WOE Series
    :return: ``src_data`` itself
    """
    for name, binned in src_cut_dict.items():
        src_data[name + "WOE"] = replace_data(binned, src_cut_woe_dict[name])
    return src_data

# Add one "<feature>WOE" column per binned feature.
woe_df = gen_data_func(woe_df, cut_dict, cut_woe_dict)

# Put the good/bad label in front as the first column.
woe_df.insert(0, '好壞客戶', data["好壞客戶"])

woe_df.head()

	可用額度比值WOE	年齡WOE	逾期30-59天筆數WOE	負債率WOE	月收入WOE	信貸數量WOE	逾期90天筆數WOE	固定資產貸款量WOE	逾期60-89天筆數WOE	家屬數量WOE
1	1.257482	0.265965	-0.500593	-0.104119	0.470780	-0.145441	-0.371422	0.231982	-0.262465	0.111276
2	0.404043	0.265965	0.897932	-0.104119	0.470780	0.362270	1.996894	0.231982	-0.262465	-0.138070
3	-1.122039	0.455702	-0.500593	-0.104119	0.470780	-0.145441	-0.371422	0.231982	-0.262465	-0.138070
5	-1.122039	-0.920630	-0.500593	-0.104119	0.112917	0.135016	-0.371422	-0.218124	-0.262465	0.111276
7	0.853912	0.265965	-0.500593	-0.104119	0.112917	-0.145441	-0.371422	0.231982	-0.262465	-0.138070

模型建立

信用評分卡模型在國外是一種成熟的預測方法，尤其在信用風險評估以及金融風險控制領域得到了比較廣泛的使用。其原理是先將模型變量以WOE編碼方式離散化，再運用logistic迴歸模型（一種用於二分類變量的廣義線性模型）進行建模。下面將目標變量爲1記爲違約用戶，目標變量爲0記爲正常用戶，採用sklearn中的LogisticRegression進行建模

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# 模型評估
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import auc

# Extract features and label, then split into train (70%) and test (30%) sets
col_names = woe_df.columns.values
X = woe_df[col_names[1:]]  # feature columns
y = woe_df[col_names[0]]  # label column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=0)

# Fit the logistic regression on the training set only
lr = LogisticRegression(C=1000.0, random_state=0)
result = lr.fit(X_train, y_train)

result

LogisticRegression(C=1000.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Predict class labels for the test set
y_pred = lr.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# Predicted probability of being a bad customer (second predict_proba column),
# rounded to 5 decimals
prob_pred = [round(u[1], 5) for u in lr.predict_proba(X_test)]

模型評估

# Accuracy of the predicted labels on the test set
accuracy_score(y_test, y_pred)

0.9404538301436621

# The classes are imbalanced, so the author evaluates with the ROC curve
# rather than PR
FPR, TPR, thresholds = metrics.roc_curve(y_test, prob_pred, pos_label=1)
metrics.auc(FPR, TPR)

0.8537992492862908

# Plot the ROC curve together with the diagonal of a random classifier
plt.plot(FPR, TPR, 'b', label='AUC = %0.2f' % metrics.auc(FPR, TPR)) # draw the ROC curve
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('真正率')
plt.xlabel('假正率')
plt.show()

![ROC曲線](output_51_0.png)

從上圖可知，AUC值爲0.85，說明該模型對好壞客戶的區分（排序）能力還是不錯的。需要注意的是，由於樣本類別不平衡，前面約0.94的準確率並不能單獨說明模型效果，應以AUC爲主要依據

信用評分

我們已經基本完成了建模相關的工作，並用ROC曲線驗證了模型的預測能力。接下來的步驟，就是將Logistic模型轉換爲標準評分卡的形式

評分卡計算方法

odds爲違約（bad）用戶概率（p）與正常（good）用戶概率（1-p）的比值

$\operatorname{odds}=\frac{p}{1-p}$

評分卡設定的分值刻度可以通過將分值表示爲比率對數的線性表達式來定義。由於odds越大違約風險越高、分值應越低，公式如下：

$score_{總}=A-B{*}\ln(odds)$

常數 A 和 B 通常被稱爲補償和刻度，它們的值可以通過將兩個已知或者假設的分值帶入 $score_{總}=A-B{*}\ln(odds)$ 中得到。通常，需要兩個假設：

在某個特定的比率設定特定的預期分值 $P_{0}$
指定比率翻番的分數（PDO，Point-to-Double Odds）

首先，設定比率爲odds的特定點的分值爲 $P_{0}$ 。然後，比率爲 2odds的點分值爲 $P_{0}-PDO$ ，帶入可以得到
$B=\frac{PDO}{\log (2)}$

$A=P_{0}+B \log \left(odds\right)$

$P_{0}$ 和PDO的值都是已知常數，我們可以設定評分卡刻度使得比率爲 1：60（違約與正常）時的分值爲600分，PDO = 20，從而計算出A和B

import math

# PDO: points needed to double the odds; P0: expected score at the reference
# odds; B: scale factor
PDO = 20
P0 = 600
B = PDO / math.log(2)
B

28.85390081777927

# A is the offset, computed for reference odds of 1/60
A = P0 + B * math.log(1 / 60)
A

481.8621880878296

基於Logistic的評分卡構建

最終，評分卡的分值可以寫成下列形式：

Score $=A-B\left(\beta_{0}+\beta_{1} x_{1}+\cdots+\beta_{p} x_{p}\right)$

變量 $x_{1}$ ,⋯, $x_{p}$ 爲自變量對應WOE, $\beta_{0}$ ,⋯, $\beta_{p}$ 爲邏輯斯蒂迴歸方程的係數

# Coefficient list of the logistic regression: intercept first, followed by
# the coefficients of the fitted model in column order.
coef_list = [result.intercept_[0], *result.coef_[0]]

# 計算信用評分
def credit_socre(data, coef, offset=None, factor=None):
    """
    Compute the credit score of every row.

    The score of a row is ``offset - factor * (coef[0] + sum_j x_j * coef[j + 1])``.
    The linear term is accumulated one column at a time over whole columns,
    which replaces a per-cell Python loop while keeping the same order of
    additions.

    :param data: DataFrame of numeric feature values, one row per customer
    :param coef: sequence with the intercept first, then one coefficient per
        column of ``data``
    :param offset: scorecard offset; defaults to the module-level ``A``
    :param factor: scorecard scale factor; defaults to the module-level ``B``
    :return: list of scores, one per row of ``data``
    :raises IndexError: if ``coef`` has fewer entries than columns + 1
    """
    if offset is None:
        offset = A
    if factor is None:
        factor = B

    values = np.asarray(data, dtype=float)
    # Start every row at the intercept, then add one weighted column at a time.
    linear = np.full(values.shape[0], coef[0], dtype=float)
    for j in range(values.shape[1]):
        linear += values[:, j] * coef[j + 1]
    return (offset - factor * linear).tolist()

# Score every row from its WOE columns (everything after the label column).
score_list = credit_socre(woe_df.iloc[:, 1:], coef_list)

# Append the score as the last column, whatever the current column count is.
woe_df.insert(len(woe_df.columns), 'credit_score', score_list)

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same
# head-plus-tail preview on old and new versions alike.
pd.concat([woe_df.head(), woe_df.tail()])

	可用額度比值WOE	年齡WOE	逾期30-59天筆數WOE	負債率WOE	月收入WOE	信貸數量WOE	逾期90天筆數WOE	固定資產貸款量WOE	逾期60-89天筆數WOE	家屬數量WOE	credit_score
1	1.257482	0.265965	-0.500593	-0.104119	0.470780	-0.145441	-0.371422	0.231982	-0.262465	0.111276	548.295866
2	0.404043	0.265965	0.897932	-0.104119	0.470780	0.362270	1.996894	0.231982	-0.262465	-0.138070	499.787457
3	-1.122039	0.455702	-0.500593	-0.104119	0.470780	-0.145441	-0.371422	0.231982	-0.262465	-0.138070	588.316050
5	-1.122039	-0.920630	-0.500593	-0.104119	0.112917	0.135016	-0.371422	-0.218124	-0.262465	0.111276	608.484060
7	0.853912	0.265965	-0.500593	-0.104119	0.112917	-0.145441	-0.371422	0.231982	-0.262465	-0.138070	559.845105
149995	-1.122039	-0.920630	-0.500593	-0.104119	0.323795	-0.145441	-0.371422	-0.218124	-0.262465	-0.138070	610.921871
149996	-1.122039	0.265965	-0.500593	0.408699	0.112917	-0.145441	-0.371422	-0.218124	-0.262465	0.219126	584.601393
149997	-1.122039	-0.293016	-0.500593	-0.218218	-0.393628	0.048416	-0.371422	-0.218124	-0.262465	-0.138070	609.993977
149998	-1.122039	0.455702	-0.500593	-0.376448	0.112917	-0.145441	-0.371422	0.231982	-0.262465	-0.138070	597.332740
149999	0.853912	-0.920630	-0.500593	-0.104119	-0.380625	-0.145441	-0.371422	-0.143694	-0.262465	-0.138070	582.329013

# Row and column count of the cleaned data
data.shape

(131324, 11)

# Insert the credit score into the original data as the last column
data.insert(len(data.columns), 'credit_socre', score_list)

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same
# head-plus-tail preview on old and new versions alike.
pd.concat([data.head(), data.tail()])

	可用額度比值	年齡	逾期30-59天筆數	負債率	月收入	信貸數量	逾期90天筆數	固定資產貸款量	家屬數量	credit_socre
1	0.957151	40	0	0.121876	2600.0	4	0	0	1.0	548.295866
2	0.658180	38	1	0.085113	3042.0	2	1	0	0.0	499.787457
3	0.233810	30	0	0.036050	3300.0	5	0	0	0.0	588.316050
5	0.213179	74	0	0.375607	3500.0	3	0	1	1.0	608.484060
7	0.754464	39	0	0.209940	3500.0	8	0	0	0.0	559.845105
149995	0.040674	74	0	0.225131	2100.0	4	0	1	0.0	610.921871
149996	0.299745	44	0	0.716562	5584.0	4	0	1	2.0	584.601393
149997	0.246044	58	0	3870.000000	2554.0	18	0	1	0.0	609.993977
149998	0.000000	30	0	0.000000	5716.0	4	0	0	0.0	597.332740
149999	0.850283	64	0	0.249908	8158.0	8	0	2	0.0	582.329013

信用評分卡模型

模型建立

模型評估

信用評分

評分卡計算方法

基於Logistic的評分卡構建

Git服務器搭建（gogs服務）

面向對象的補充（slots、tracemalloc、運算符重載中的反向方法）

大數據挖掘——數據預處理

大數據挖掘——認識數據

集成學習：隨機森林和GBDT

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結