使用信用卡數據開發信貸評分卡

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
pd.options.display.max_columns = None

# 拆分列
def split_column(df, y="y"):
    """Separate a frame into its feature columns and one target column.

    Args:
        df: Source DataFrame.
        y: Name of the target column.

    Returns:
        A ``(X, y)`` tuple: ``X`` is ``df`` without the target column and
        ``y`` is a one-column DataFrame holding the target.

    Raises:
        KeyError: If the target column is not present in ``df``.
    """
    target_name = y
    try:
        features = df.drop(columns=target_name)
    except KeyError:
        raise KeyError("請在拆分列的參數中選擇數據中有的字段")
    target = pd.DataFrame(df[target_name], columns=[target_name])
    return features, target

# 自定義函數
def check_nan(df_var):
    """Print and return the percentage of missing values in each column.

    The two entries of ``df_var.shape`` are printed first, followed by one
    ``name: percent%`` line per column.

    Args:
        df_var: DataFrame to inspect.

    Returns:
        Dict mapping each column name to its share of null entries, as a
        percentage rounded to two decimals.
    """
    print("列數：{}，行數：{}".format(*df_var.shape))

    row_total = df_var.shape[0]
    null_counts = list(df_var.isnull().sum(axis=0))
    counts_by_column = dict(zip(df_var.columns.values, null_counts))

    nan_dict = {}
    for column, missing in counts_by_column.items():
        percent = round((missing / row_total) * 100, 2)
        print("{}: {}%".format(column, percent))
        nan_dict[column] = percent

    return nan_dict

# Load the training data and preview the first rows.
df = pd.read_csv('zh/cs-training.csv')
df.head(15)

# Column counts and dtypes.
df.info()

# Percentage of missing values per column.
_ = check_nan(df)

# Basic descriptive statistics.
df.describe()

# Impute missing monthly income ('月收入') with the column mean.
df=df.fillna({'月收入':df['月收入'].mean()})
# Drop every row that still has a missing value. Per the original author the
# dependents column is missing in about 2.62% of rows, so those rows are
# simply removed.
df=df.dropna()
# Drop columns that carry no signal for training (row counter and ID).
df = df.drop(["Unnamed: 0", "ID"], axis=1)
df.shape

# Inspect the data after imputation and row removal.
# NOTE: `df1 = df` binds a second name to the same DataFrame; it is not a copy.
df1 = df
df.head(15)

# Confirm that no missing values remain.
_ = check_nan(df)

# Outlier analysis: box plots of the numeric variables.
# (`df` and `df1` refer to the same DataFrame at this point.)
x1=df['可用額度比值']
x2=df['負債率']
x3=df1["年齡"]
x4=df1["逾期30-59天筆數"]
x5=df1["逾期60-89天筆數"]
x6=df1["逾期90天筆數"]
x7=df1["信貸數量"]
x8=df1["固定資產貸款量"]
fig=plt.figure(figsize=(20,15))
ax1=fig.add_subplot(221)
ax2=fig.add_subplot(222)
ax3=fig.add_subplot(223)
ax4=fig.add_subplot(224)
ax1.boxplot([x1,x2])
ax1.set_xticklabels(["可用額度比值","負債率"], fontsize=20)
ax2.boxplot(x3)
# The labels argument must be a sequence with one entry per tick. A bare
# string is iterated character by character, which does not match the single
# tick of this box plot.
ax2.set_xticklabels(["年齡"], fontsize=20)
ax3.boxplot([x4,x5,x6])
ax3.set_xticklabels(["逾期30-59天筆數","逾期60-89天筆數","逾期90天筆數"], fontsize=20)
ax4.boxplot([x7,x8])
ax4.set_xticklabels(["信貸數量","固定資產貸款量"], fontsize=20)
plt.show()
# Outlier handling: remove illogical records and extreme outliers.
# - the credit utilisation ratio should be below 1,
# - an age of 0 is invalid,
# - past-due counts above 80 are extreme outliers,
# - more than 50 real-estate loans is an extreme outlier.

# Keep only the rows that pass every filter.
df1=df1[df1['可用額度比值']<1]
df1=df1[df1['年齡']>0]
df1=df1[df1['逾期30-59天筆數']<80]
df1=df1[df1['逾期60-89天筆數']<80]
df1=df1[df1['逾期90天筆數']<80]
df1=df1[df1['固定資產貸款量']<50]
df1.shape

# Pairwise correlation between the variables.
# A coefficient above 0.6 indicates a strong positive correlation between two
# variables; training on both can distort the model, so one of the pair may
# be dropped.
corr = df1.corr()
xticks = list(corr.index) # x-axis tick labels
yticks = list(corr.index) # y-axis tick labels
fig = plt.figure(figsize=(15,10))
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap="rainbow",ax=ax1,linewidths=.5, annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})
ax1.set_xticklabels(xticks, rotation=35, fontsize=15)
ax1.set_yticklabels(yticks, rotation=0, fontsize=15)
plt.show()
# Author's conclusion: no pair of variables is strongly correlated here.

def get_bins(cut_bins):
    """Collect the sorted, distinct bin edges of an interval-indexed object.

    The resulting list is also printed.

    Args:
        cut_bins: Object whose ``index`` holds ``pandas.Interval`` entries,
            e.g. ``pd.qcut(series, q).value_counts()``.

    Returns:
        Ascending list of the distinct interval end points, as floats.
    """
    edges = set()
    for interval in cut_bins.index:
        # Read the end points from the Interval itself instead of parsing its
        # string form, so intervals with any closed side are handled.
        edges.add(float(interval.left))
        edges.add(float(interval.right))
    bin_list = sorted(edges)
    print(bin_list)
    return bin_list

# Binning: equal-frequency bins (pd.qcut) for some variables, hand-picked
# edges (pd.cut) for the others. For every variable, `cutN` holds the integer
# bin code of each row and `binsN` the list of bin edges.
cut1=pd.qcut(df1["可用額度比值"],4,labels=False)
cut_bins1 = pd.qcut(df1["可用額度比值"], 4).value_counts()
bins1 = get_bins(cut_bins1)

cut2=pd.qcut(df1["年齡"],8,labels=False)
cut_bins2=pd.qcut(df1["年齡"],8).value_counts()
bins2 = get_bins(cut_bins2)

bins3=[-1,0,1,3,5,13]
cut3=pd.cut(df1["逾期30-59天筆數"],bins3,labels=False)

cut4=pd.qcut(df1["負債率"],3,labels=False)
cut_bins4=pd.qcut(df1["負債率"],3).value_counts()
bins4 = get_bins(cut_bins4)

cut5=pd.qcut(df1["月收入"],4,labels=False)
cut_bins5=pd.qcut(df1["月收入"],4).value_counts()
bins5 = get_bins(cut_bins5)

cut6=pd.qcut(df1["信貸數量"],4,labels=False)
cut_bins6=pd.qcut(df1["信貸數量"],4).value_counts()
bins6 = get_bins(cut_bins6)

bins7=[-1, 0, 1, 3,5, 20]
cut7=pd.cut(df1["逾期90天筆數"],bins7,labels=False)

bins8=[-1, 0,1,2, 3, 33]
cut8=pd.cut(df1["固定資產貸款量"],bins8,labels=False)

bins9=[-1, 0, 1, 3, 12]
cut9=pd.cut(df1["逾期60-89天筆數"],bins9,labels=False)

bins10=[-1, 0, 1, 2, 3, 5, 21]
cut10=pd.cut(df1["家屬數量"],bins10,labels=False)

# NOTE(review): the hand-picked top edges (13, 20, 33, 12, 21) are below the
# earlier outlier cut-offs (80 / 50); pd.cut yields NaN for any value above
# the last edge. Presumably the data never exceeds these edges; confirm.

# Variable names and their bin edges, in matching order (cut1 ... cut10).
key_list = ["可用額度比值", "年齡", "逾期30-59天筆數", "負債率", "月收入",
            "信貸數量", "逾期90天筆數", "固定資產貸款量", "逾期60-89天筆數", "家屬數量"]
key_bin_list = [bins1, bins2, bins3, bins4, bins5, bins6, bins7, bins8, bins9, bins10]

# One score-card row per variable and per pair of neighbouring bin edges.
items = []
for position, key in enumerate(key_list):
    edges = key_bin_list[position]
    for lower, upper in zip(edges, edges[1:]):
        items.append({
            "變量名稱": key,
            "區間": "[{},{}]".format(lower, upper),
        })

score_card = pd.DataFrame(items, columns=["變量名稱", "區間"])
score_card.head(10)

# Weight of evidence (WOE) of every bin of every variable.

# Overall ratio: sum of the label column divided by (row count - that sum).
rate = df1["好壞客戶"].sum() / (df1["好壞客戶"].count() - df1["好壞客戶"].sum())


def get_woe_data(cut):
    """Return the WOE of each bin of one binned variable.

    The label column of the global ``df1`` is grouped by ``cut`` and the
    label values are counted per group. For every group the count in the
    second label column is divided by the count in the first, then by the
    global ``rate``, and the natural log of that quotient is returned.

    Args:
        cut: Per-row bin codes, aligned with ``df1``.

    Returns:
        Series of WOE values indexed by bin code.
    """
    counts = df1["好壞客戶"].groupby(cut, as_index=True).value_counts().unstack()
    second_label = counts.iloc[:, 1]
    first_label = counts.iloc[:, 0]
    return np.log(second_label / first_label / rate)

# WOE series for each binned variable (indexed by bin code).
woe_list = []
cut1_woe=get_woe_data(cut1)
cut2_woe=get_woe_data(cut2)
cut3_woe=get_woe_data(cut3)
cut4_woe=get_woe_data(cut4)
cut5_woe=get_woe_data(cut5)
cut6_woe=get_woe_data(cut6)
cut7_woe=get_woe_data(cut7)
cut8_woe=get_woe_data(cut8)
cut9_woe=get_woe_data(cut9)
cut10_woe=get_woe_data(cut10)

# Flatten in the same variable order as key_list so the values line up with
# the rows of score_card.
woe_list = list(cut1_woe) + list(cut2_woe) + list(cut3_woe) + list(cut4_woe) + list(cut5_woe) + list(cut6_woe) + list(cut7_woe) + list(cut8_woe) + list(cut9_woe) + list(cut10_woe)

# NOTE(review): this assignment requires one WOE value per score_card row,
# i.e. every bin of every variable must occur in df1 -- confirm for new data.
score_card["WOE"] = woe_list

score_card.head(10)

def get_IV_data(cut, cut_woe):
    """Return the information value (IV) of one binned variable.

    For every bin, the share of the second label column (bin count divided
    by the label sum over ``df1``) minus the share of the first label column
    (bin count divided by row count minus label sum) is multiplied by the
    bin's WOE; the products are summed.

    Args:
        cut: Per-row bin codes, aligned with the global ``df1``.
        cut_woe: WOE values per bin for the same variable.

    Returns:
        The summed IV as a scalar.
    """
    labels = df1["好壞客戶"]
    counts = labels.groupby(cut, as_index=True).value_counts().unstack()
    share_gap = (
        counts.iloc[:, 1] / labels.sum()
        - counts.iloc[:, 0] / (labels.count() - labels.sum())
    )
    return (share_gap * cut_woe).sum()
# Information value (IV) of each variable. As a rule of thumb, variables with
# an IV above 0.02 are kept for training.
cut1_IV=get_IV_data(cut1,cut1_woe)
cut2_IV=get_IV_data(cut2,cut2_woe)
cut3_IV=get_IV_data(cut3,cut3_woe)
cut4_IV=get_IV_data(cut4,cut4_woe)
cut5_IV=get_IV_data(cut5,cut5_woe)
cut6_IV=get_IV_data(cut6,cut6_woe)
cut7_IV=get_IV_data(cut7,cut7_woe)
cut8_IV=get_IV_data(cut8,cut8_woe)
cut9_IV=get_IV_data(cut9,cut9_woe)
cut10_IV=get_IV_data(cut10,cut10_woe)
IV=pd.DataFrame([cut1_IV,cut2_IV,cut3_IV,cut4_IV,cut5_IV,cut6_IV,cut7_IV,cut8_IV,cut9_IV,cut10_IV],index=['可用額度比值','年齡','逾期30-59天筆數','負債率','月收入','信貸數量','逾期90天筆數','固定資產貸款量','逾期60-89天筆數','家屬數量'],columns=['IV'])
# Sort by the IV column, largest first. `sort_index(by=...)` is no longer
# available in pandas; `sort_values` is the supported way to sort by a column.
IV = IV.sort_values(by="IV", ascending=False)
iv=IV.plot.bar(color='b',alpha=0.3,rot=30,figsize=(10,5),fontsize=10)
iv.set_title('特徵變量與IV值分佈圖',fontsize=15)
iv.set_xlabel('特徵變量',fontsize=15)
iv.set_ylabel('IV',fontsize=15)

# Display the sorted table.
IV.sort_values(by="IV", ascending=False)

# Replace every variable's value by the WOE of the bin it falls into.
df_new=pd.DataFrame()   # holds the WOE-encoded data


def replace_data(cut,cut_woe):
    """Map each bin code in ``cut`` to the WOE value of that bin.

    The lookup goes through the index of ``cut_woe`` in a single pass. The
    earlier implementation replaced one code after another in place, so a
    WOE value that happened to equal a not-yet-processed bin code was
    replaced a second time, and the caller's ``cut`` series was overwritten.

    Args:
        cut: Series of per-row bin codes.
        cut_woe: Series of WOE values indexed by bin code.

    Returns:
        A new Series aligned with ``cut`` holding the WOE values; codes
        without an entry in ``cut_woe`` (including NaN) become NaN. ``cut``
        itself is left unchanged.
    """
    return cut.map(cut_woe)
# Target column first, then one WOE-encoded column per variable.
df_new["好壞客戶"] = df1["好壞客戶"]

# (column name, bin codes, WOE per bin), in the column order of df_new.
_woe_columns = [
    ("可用額度比值", cut1, cut1_woe),
    ("年齡", cut2, cut2_woe),
    ("逾期30-59天筆數", cut3, cut3_woe),
    ("負債率", cut4, cut4_woe),
    ("月收入", cut5, cut5_woe),
    ("信貸數量", cut6, cut6_woe),
    ("逾期90天筆數", cut7, cut7_woe),
    ("固定資產貸款量", cut8, cut8_woe),
    ("逾期60-89天筆數", cut9, cut9_woe),
    ("家屬數量", cut10, cut10_woe),
]
for _column, _codes, _woe in _woe_columns:
    df_new[_column] = replace_data(_codes, _woe)
df_new.head()

# Fit a logistic regression to obtain the weight of every variable.
x, y = split_column(df_new, "好壞客戶")  # split features / target

# test_size=0.6: 60% of the rows are held out for evaluation, 40% are used
# for fitting.
# NOTE(review): `y` is a one-column DataFrame (see split_column); sklearn
# presumably expects a 1-d target here -- confirm whether it should be flattened.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.6,random_state=0)
model=LogisticRegression()
clf=model.fit(x_train,y_train)
# Mean accuracy on the held-out rows.
print('模型準確率：{}'.format(clf.score(x_test,y_test)))

模型準確率：0.9418841189674523

# AUC on the held-out rows.
y_prob = model.predict_proba(x_test)

# Column 1 of predict_proba is used as the score for the ROC curve.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_prob[:, 1])
auc_value = metrics.auc(fpr, tpr)  # area under the ROC curve
print(auc_value)

# ROC curve with the diagonal as reference line.
plt.plot(fpr, tpr, color='darkorange',label='ROC curve (area = %0.2f)' % auc_value)
plt.plot([0, 1], [0, 1], color='navy',  linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC_curve')
plt.legend(loc="lower right")
plt.show()

# Coefficients of the fitted model; they are reused further down when the
# WOE values are converted into score points.
coe = clf.coef_

# Pair every coefficient with the variable name at the same position.
items = [
    {"變量名稱": key_list[position], "模型權重": weight}
    for position, weight in enumerate(coe[0])
]

coef_df = pd.DataFrame(items, columns=["變量名稱", "模型權重"])
coef_df

# KS curve.
# The figure size is passed to plt.subplots: a separate plt.figure(...) call
# after plotting would open a second, empty figure instead of resizing this one.
fig, ax = plt.subplots(figsize=(20, 20))
# The KS curve is drawn against descending predicted probability, hence the
# mirrored x-axis 1 - threshold.
ax.plot(1 - threshold, tpr, label='tpr')
ax.plot(1 - threshold, fpr, label='fpr')
ax.plot(1 - threshold, tpr-fpr,label='KS')
ax.set_xlabel('score')
ax.set_title('KS Curve')
ax.set_ylim([0.0, 1.0])
legend = ax.legend(loc='upper left')
plt.show()

# KS statistic: the largest gap between TPR and FPR.
max(tpr-fpr)

0.5274346008328302

# Scaling assumption: a good/bad odds ratio of 20 corresponds to 600 points,
# and every additional 20 points doubles the odds.
# From this the score contribution of each WOE value is derived:
factor = 20 / np.log(2)
# NOTE(review): `offset` is not referenced by the visible code below; the
# total score computed further down adds the literal 600 instead -- confirm
# which base value is intended.
offset = 600 - 20 * np.log(20) / np.log(2)
def get_score(coe,woe,factor):
    """Convert the WOE values of one variable into rounded score points.

    Args:
        coe: Model coefficient of the variable.
        woe: Iterable of WOE values, one per bin.
        factor: Scaling factor applied to every contribution.

    Returns:
        List with ``coe * w * factor`` rounded to zero decimals for every
        ``w`` in ``woe``, in the same order.
    """
    return [round(coe * bin_woe * factor, 0) for bin_woe in woe]
# Score points per bin for every variable; coe[0][k] is the coefficient of
# the k-th variable in key_list order.
# NOTE: x1 ... x8 rebind names that earlier held the box-plot input series.
x1 = get_score(coe[0][0], cut1_woe, factor)
x2 = get_score(coe[0][1], cut2_woe, factor)
x3 = get_score(coe[0][2], cut3_woe, factor)
x4 = get_score(coe[0][3], cut4_woe, factor)
x5 = get_score(coe[0][4], cut5_woe, factor)
x6 = get_score(coe[0][5], cut6_woe, factor)
x7 = get_score(coe[0][6], cut7_woe, factor)
x8 = get_score(coe[0][7], cut8_woe, factor)
x9 = get_score(coe[0][8], cut9_woe, factor)
x10 = get_score(coe[0][9], cut10_woe, factor)
print("可用額度比值對應的分數:{}".format(x1))
print("年齡對應的分數:{}".format(x2))
print("逾期30-59天筆數對應的分數:{}".format(x3))
print("負債率對應的分數:{}".format(x4))
print("月收入對應的分數:{}".format(x5))
print("信貸數量對應的分數:{}".format(x6))
print("逾期90天筆數對應的分數:{}".format(x7))
print("固定資產貸款量對應的分數:{}".format(x8))
print("逾期60-89天筆數對應的分數:{}".format(x9))
print("家屬數量對應的分數:{}".format(x10))

# Concatenate in key_list order so the points line up with score_card rows.
x_all = x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10
score_card["評分刻度"] = x_all

# Show the resulting scoring table.
score_card.head(20)

# 計算測試集中每個用戶的最終得分
def compute_score(series,bins,score):
    """Look up the score points of every value according to its bin.

    Bins are right-closed, matching ``pd.cut`` / ``pd.qcut``: bin ``k``
    covers ``(bins[k], bins[k + 1]]`` and is worth ``score[k]``. The earlier
    implementation compared with ``value >= edge``, which put a value lying
    exactly on an edge into the next higher bin than the one used when the
    WOE values were computed.

    Args:
        series: Iterable of raw values (a pandas Series with any index, or a
            plain sequence).
        bins: Ascending list of bin edges.
        score: Score points per bin; expected to hold ``len(bins) - 1``
            entries.

    Returns:
        List with one score per input value, in input order. Values at or
        below the first edge get the first bin's score, values above the
        last edge get the last bin's score, and NaN inputs yield NaN.
    """
    from bisect import bisect_left

    top_bin = len(bins) - 2
    points = []
    # Iterate by position rather than by label so a non-default index works.
    for value in series:
        if value != value:  # NaN is the only value unequal to itself
            # A missing value belongs to no bin; do not guess a score.
            points.append(float("nan"))
            continue
        # bisect_left returns the number of edges strictly below `value`;
        # one less is the index of the right-closed bin that contains it.
        bin_index = bisect_left(bins, value) - 1
        # Clamp out-of-range values to the outermost bins.
        bin_index = min(max(bin_index, 0), top_bin)
        points.append(score[bin_index])
    return points

# Load the test set.
path2='zh/cs-test.csv'
test1 = pd.read_csv(path2)

# Split off the ID column, then drop the label and the row-counter column.
test1, t_ID = split_column(test1, "ID")
test1 = test1.drop(["好壞客戶", "Unnamed: 0"], axis=1)

# Score points of every test row, one column per variable (x1 ... x10).
test1['x1'] = pd.Series(compute_score(test1['可用額度比值'], bins1, x1))
test1['x2'] = pd.Series(compute_score(test1['年齡'], bins2, x2))
test1['x3'] = pd.Series(compute_score(test1['逾期30-59天筆數'], bins3, x3))
test1['x4'] = pd.Series(compute_score(test1['負債率'], bins4, x4))
test1['x5'] = pd.Series(compute_score(test1['月收入'], bins5, x5))
test1['x6'] = pd.Series(compute_score(test1['信貸數量'], bins6, x6))
test1['x7'] = pd.Series(compute_score(test1['逾期90天筆數'], bins7, x7))
test1['x8'] = pd.Series(compute_score(test1['固定資產貸款量'], bins8, x8))
test1['x9'] = pd.Series(compute_score(test1['逾期60-89天筆數'], bins9, x9))
test1['x10'] = pd.Series(compute_score(test1['家屬數量'], bins10, x10))
# Final score: sum of the ten contributions plus a base of 600.
# NOTE(review): the base is the literal 600, not the `offset` computed
# earlier -- confirm which one is intended.
# NOTE(review): unlike the training data, the test set is not imputed before
# scoring -- confirm how missing values should be treated.
test1['Score'] = test1['x1']+test1['x2']+test1['x3']+test1['x4']+test1['x5']+test1['x6']+test1['x7']+test1['x8']+test1['x9']+test1['x10']+600

test1.head(10)

# Persist the scored test set.
test1.to_csv("score.csv")

使用信用卡數據開發信貸評分卡

「Pygors跨平臺GUI」1：Pygors跨平臺GUI應用研究

[轉帖]

python列出centos7內存使用前50的進程信息

Garnet：微軟官方基於.NET開源的高性能分佈式緩存存儲數據庫

Flink執行圖

Java響應式編程

評估統計算法在銀行僞造鈔票檢測中的價值

基於django開發下載excel文件的接口

使用python&pandas讀取hive數據

使用信用卡數據開發信貸評分卡

使用pyspark SQL處理MySQL中的數據

Ubuntu中Matplotlib繪圖的中文亂碼

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結