[Risk Modeling] Building a Standard Credit Scorecard Model with Logistic Regression

If you need the dataset, please leave a comment to contact the author.
Without further ado, here is the code.

Main program

# -*- coding: utf-8 -*-
import pandas as pd
import datetime
import collections
import numpy as np
import numbers
import random
import sys
_path = r'C:\Users\A3\Desktop\LR_scorecard'
sys.path.append(_path)
import pickle
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
from matplotlib import pyplot as plt
import scorecard_functions as sf
#from scorecard_functions_V3 import *
from sklearn.linear_model import LogisticRegressionCV

################################
######## UDF: user-defined functions ########
################################
### For each time window, compute the cumulative count of records ###
def TimeWindowSelection(df, daysCol, time_windows):
    '''
    :param df: the dataset containing the variable of days
    :param daysCol: the column of days
    :param time_windows: the list of time windows
    :return: a dict mapping each time window to the number of records with days <= that window
    '''
    freq_tw = {}
    for tw in time_windows:
        freq = sum(df[daysCol].apply(lambda x: int(x<=tw)))
        freq_tw[tw] = freq
    return freq_tw


def DeivdedByZero(nominator, denominator):
    '''
    Return 0 when the denominator is 0; otherwise return the ordinary quotient.
    '''
    if denominator == 0:
        return 0
    else:
        return nominator*1.0/denominator


# Normalize certain field names to a unified form (e.g. _MOBILEPHONE -> _PHONE)
def ChangeContent(x):
    y = x.upper()
    if y == '_MOBILEPHONE':
        y = '_PHONE'
    return y

def MissingCategorial(df,x):
    missing_vals = df[x].map(lambda x: int(x!=x))
    return sum(missing_vals)*1.0/df.shape[0]

def MissingContinuous(df,x):
    missing_vals = df[x].map(lambda x: int(np.isnan(x)))
    return sum(missing_vals) * 1.0 / df.shape[0]

def MakeupRandom(x, sampledList):
    if x==x:
        return x
    else:
        randIndex = random.randint(0, len(sampledList)-1)
        return sampledList[randIndex]



############################################################
# Step 0: Initial steps of the analysis, including reading the data files and checking the consistency of the user Idx #
############################################################

folderOfData = 'C:/Users/A3/Desktop/scorecard/'
data1 = pd.read_csv(folderOfData+'PPD_LogInfo_3_1_Training_Set.csv', header = 0)
data2 = pd.read_csv(folderOfData+'PPD_Training_Master_GBK_3_1_Training_Set.csv', header = 0,encoding = 'gbk')
data3 = pd.read_csv(folderOfData+'PPD_Userupdate_Info_3_1_Training_Set.csv', header = 0)

#############################################################################################
# Step 1: Derive features from PPD_LogInfo_3_1_Training_Set & PPD_Userupdate_Info_3_1_Training_Set #
#############################################################################################
# compare whether the four city variables match
data2['city_match'] = data2.apply(lambda x: int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20),axis = 1)
del data2['UserInfo_2']
del data2['UserInfo_4']
del data2['UserInfo_8']
del data2['UserInfo_20']

### Extract the listing (application) date, compute the gap in days, and inspect its distribution
data1['logInfo'] = data1['LogInfo3'].map(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
data1['Listinginfo'] = data1['Listinginfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
data1['ListingGap'] = data1[['logInfo','Listinginfo']].apply(lambda x: (x[1]-x[0]).days,axis = 1)
plt.hist(data1['ListingGap'],bins=200)
plt.title('Days between login date and listing date')
ListingGap2 = data1['ListingGap'].map(lambda x: min(x,365))
plt.hist(ListingGap2,bins=200)

timeWindows = TimeWindowSelection(data1, 'ListingGap', range(30,361,30))

'''
Use 180 days as the maximum time window to compute the new features.
The available time windows are 7, 30, 60, 90, 120, 150 and 180 days.
Within each time window, compute the total number of logins, the number of distinct login types, and the average count per login type.
'''
time_window = [7, 30, 60, 90, 120, 150, 180]
var_list = ['LogInfo1','LogInfo2']
data1GroupbyIdx = pd.DataFrame({'Idx':data1['Idx'].drop_duplicates()})

for tw in time_window:
    data1['TruncatedLogInfo'] = data1['Listinginfo'].map(lambda x: x + datetime.timedelta(-tw))
    temp = data1.loc[data1['logInfo'] >= data1['TruncatedLogInfo']]
    for var in var_list:
        # count the frequencies of LogInfo1 and LogInfo2
        count_stats = temp.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[str(var)+'_'+str(tw)+'_count'] = data1GroupbyIdx['Idx'].map(lambda x: count_stats.get(x,0))

        # count the distinct value of LogInfo1 and LogInfo2
        Idx_UserupdateInfo1 = temp[['Idx', var]].drop_duplicates()
        uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[str(var) + '_' + str(tw) + '_unique'] = data1GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x,0))

        # calculate the average count of each value in LogInfo1 and LogInfo2
        data1GroupbyIdx[str(var) + '_' + str(tw) + '_avg_count'] = data1GroupbyIdx[[str(var)+'_'+str(tw)+'_count',str(var) + '_' + str(tw) + '_unique']].\
            apply(lambda x: DeivdedByZero(x[0],x[1]), axis=1)


data3['ListingInfo'] = data3['ListingInfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
data3['UserupdateInfo'] = data3['UserupdateInfo2'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
data3['ListingGap'] = data3[['UserupdateInfo','ListingInfo']].apply(lambda x: (x[1]-x[0]).days,axis = 1)
collections.Counter(data3['ListingGap'])
hist_ListingGap = np.histogram(data3['ListingGap'])
hist_ListingGap = pd.DataFrame({'Freq':hist_ListingGap[0],'gap':hist_ListingGap[1][1:]})
hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
hist_ListingGap['CumPercent'] = hist_ListingGap['CumFreq'].map(lambda x: x*1.0/hist_ListingGap.iloc[-1]['CumFreq'])

'''
Unify QQ and qQ, Idnumber and idNumber, MOBILEPHONE and PHONE.
Within each time slice, compute:
 (1) the frequency of updates
 (2) the number of distinct types of updated items
 (3) whether important items such as IDNUMBER, HASBUYCAR, MARRIAGESTATUSID and PHONE were updated
'''
data3['UserupdateInfo1'] = data3['UserupdateInfo1'].map(ChangeContent)
data3GroupbyIdx = pd.DataFrame({'Idx':data3['Idx'].drop_duplicates()})

time_window = [7, 30, 60, 90, 120, 150, 180]
for tw in time_window:
    data3['TruncatedLogInfo'] = data3['ListingInfo'].map(lambda x: x + datetime.timedelta(-tw))
    temp = data3.loc[data3['UserupdateInfo'] >= data3['TruncatedLogInfo']]

    #frequency of updating
    freq_stats = temp.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    data3GroupbyIdx['UserupdateInfo_'+str(tw)+'_freq'] = data3GroupbyIdx['Idx'].map(lambda x: freq_stats.get(x,0))

    # number of updated types
    Idx_UserupdateInfo1 = temp[['Idx','UserupdateInfo1']].drop_duplicates()
    uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    data3GroupbyIdx['UserupdateInfo_' + str(tw) + '_unique'] = data3GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x, 0))

    #average count of each type
    data3GroupbyIdx['UserupdateInfo_' + str(tw) + '_avg_count'] = data3GroupbyIdx[['UserupdateInfo_'+str(tw)+'_freq', 'UserupdateInfo_' + str(tw) + '_unique']]. \
        apply(lambda x: DeivdedByZero(x[0], x[1]), axis=1)  # avoid division by zero when an Idx has no updates in the window

    #whether the applicant changed items like IDNUMBER,HASBUYCAR, MARRIAGESTATUSID, PHONE
    Idx_UserupdateInfo1['UserupdateInfo1'] = Idx_UserupdateInfo1['UserupdateInfo1'].map(lambda x: [x])
    Idx_UserupdateInfo1_V2 = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].sum()
    for item in ['_IDNUMBER','_HASBUYCAR','_MARRIAGESTATUSID','_PHONE']:
        item_dict = Idx_UserupdateInfo1_V2.map(lambda x: int(item in x)).to_dict()
        data3GroupbyIdx['UserupdateInfo_' + str(tw) + str(item)] = data3GroupbyIdx['Idx'].map(lambda x: item_dict.get(x, 0))

# Combine the above features with raw features in PPD_Training_Master_GBK_3_1_Training_Set
allData = pd.concat([data2.set_index('Idx'), data3GroupbyIdx.set_index('Idx'), data1GroupbyIdx.set_index('Idx')],axis= 1)
allData.to_csv(folderOfData+'allData_0.csv',encoding = 'gbk')




#######################################
# Step 2: Impute missing values for categorical and numerical variables #
######################################
allData = pd.read_csv(folderOfData+'allData_0.csv',header = 0,encoding = 'gbk')
allFeatures = list(allData.columns)
allFeatures.remove('target')
if 'Idx' in allFeatures:
    allFeatures.remove('Idx')
allFeatures.remove('ListingInfo')

# Check for constant variables, and determine whether each remaining variable is categorical or numerical
numerical_var = []
for col in list(allFeatures):  # iterate over a copy because constant columns are removed from allFeatures inside the loop
    if len(set(allData[col])) == 1:
        print('delete {} from the dataset because it is a constant'.format(col))
        del allData[col]
        allFeatures.remove(col)
    else:
        uniq_valid_vals = [i for i in allData[col] if i == i]
        uniq_valid_vals = list(set(uniq_valid_vals))
        if len(uniq_valid_vals) >= 10 and isinstance(uniq_valid_vals[0], numbers.Real):
            numerical_var.append(col)

categorical_var = [i for i in allFeatures if i not in numerical_var]


# For each variable, check the proportion taken by its most frequent value, and record that value
records_count = allData.shape[0]
col_most_values,col_large_value = {},{}
for col in allFeatures:
    value_count = allData[col].groupby(allData[col]).count()
    col_most_values[col] = max(value_count)/records_count
    large_value = value_count[value_count== max(value_count)].index[0]
    col_large_value[col] = large_value
col_most_values_df = pd.DataFrame.from_dict(col_most_values, orient = 'index')
col_most_values_df.columns = ['max percent']
col_most_values_df = col_most_values_df.sort_values(by = 'max percent', ascending = False)
pcnt = list(col_most_values_df[:500]['max percent'])
vars = list(col_most_values_df[:500].index)
plt.bar(range(len(pcnt)), height = pcnt)
plt.title('Largest Percentage of Single Value in Each Variable')

# For fields where the majority value covers more than 90% of samples, check whether the bad rate of the minority values is significantly higher than that of the majority value
large_percent_cols = list(col_most_values_df[col_most_values_df['max percent']>=0.9].index)
bad_rate_diff = {}
for col in large_percent_cols:
    large_value = col_large_value[col]
    temp = allData[[col, 'target']].copy()
    temp[col] = temp[col].map(lambda x: int(x == large_value))
    bad_rate = temp.groupby(col).mean()
    if bad_rate.iloc[0]['target'] == 0:
        bad_rate_diff[col] = 0
        continue
    bad_rate_diff[col] = np.log(bad_rate.iloc[0]['target']/bad_rate.iloc[1]['target'])
bad_rate_diff_sorted = sorted(bad_rate_diff.items(),key=lambda x: x[1], reverse=True)
bad_rate_diff_sorted_values = [x[1] for x in bad_rate_diff_sorted]
plt.bar(x = range(len(bad_rate_diff_sorted_values)), height = bad_rate_diff_sorted_values)

# Since the bad rate of the minority values is not significantly higher than that of the majority value, these variables can simply be dropped
for col in large_percent_cols:
    if col in numerical_var:
        numerical_var.remove(col)
    else:
        categorical_var.remove(col)
    del allData[col]

'''
For categorical variables: if more than 80% of the values are missing, drop the variable; otherwise treat missing as a special state.
'''
missing_pcnt_threshould_1 = 0.8
for col in list(categorical_var):  # iterate over a copy because variables may be removed from categorical_var inside the loop
    missingRate = MissingCategorial(allData,col)
    print('{0} has missing rate as {1}'.format(col,missingRate))
    if missingRate > missing_pcnt_threshould_1:
        categorical_var.remove(col)
        del allData[col]
    if 0 < missingRate < missing_pcnt_threshould_1:
        uniq_valid_vals = [i for i in allData[col] if i == i]
        uniq_valid_vals = list(set(uniq_valid_vals))
        if isinstance(uniq_valid_vals[0], numbers.Real):
            missing_position = allData.loc[allData[col] != allData[col]][col].index
            not_missing_sample = [-1]*len(missing_position)
            allData.loc[missing_position, col] = not_missing_sample
        else:
            # In this way we convert NaN to NAN, which is a string instead of np.nan
            allData[col] = allData[col].map(lambda x: str(x).upper())

allData_bk = allData.copy()
'''
Check the numerical variables: drop those with more than 80% missing; impute the rest by random sampling from the non-missing values.
'''
missing_pcnt_threshould_2 = 0.8
deleted_var = []
for col in numerical_var:
    missingRate = MissingContinuous(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_2:
        deleted_var.append(col)
        print('we delete variable {} because of its high missing rate'.format(col))
    else:
        if missingRate > 0:
            not_missing = allData.loc[allData[col] == allData[col]][col]
            #makeuped = allData[col].map(lambda x: MakeupRandom(x, list(not_missing)))
            missing_position = allData.loc[allData[col] != allData[col]][col].index
            not_missing_sample = random.sample(list(not_missing), len(missing_position))
            allData.loc[missing_position,col] = not_missing_sample
            #del allData[col]
            #allData[col] = makeuped
            missingRate2 = MissingContinuous(allData, col)
            print('missing rate after making up is:{}'.format(str(missingRate2)))

if deleted_var != []:
    for col in deleted_var:
        numerical_var.remove(col)
        del allData[col]


allData.to_csv(folderOfData+'allData_1.csv', header=True,encoding='gbk', columns = allData.columns, index=False)

allData = pd.read_csv(folderOfData+'allData_1.csv', header=0,encoding='gbk')




###################################
# Step 3: Bin the variables with chi-square (ChiMerge) binning #
###################################
'''
Binning is handled differently for different variable types:
(1) Numerical variables can be binned directly.
(2) Categorical variables with many distinct values are first converted into numerical variables via bad-rate encoding, and then binned.
(3) Categorical variables with few distinct values are not binned, but we must check whether every category contains both good and bad samples. Categories containing only good or only bad samples need to be merged.
'''
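# As a quick illustration of (2) above: "bad rate encoding" simply replaces each category with its
# empirical bad rate, turning a high-cardinality categorical variable into a numeric one that can
# then be binned with ChiMerge. A minimal, self-contained sketch on toy data (illustrative only,
# not part of the original dataset; sf.BadRateEncoding below does the same via BinBadRate):
#   toy = pd.DataFrame({'city': ['A', 'A', 'B', 'B', 'B', 'C'], 'target': [1, 0, 0, 0, 1, 0]})
#   br_map = toy.groupby('city')['target'].mean().to_dict()   # {'A': 0.5, 'B': 0.33..., 'C': 0.0}
#   toy['city_encoding'] = toy['city'].map(br_map)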

#for each categorical variable, if it has distinct values more than 5, we use the ChiMerge to merge it

trainData = pd.read_csv(folderOfData+'allData_1.csv',header = 0, encoding='gbk')
#trainData = pd.read_csv(folderOfData+'allData_1.csv',header = 0, encoding='gbk',dtype=object)
allFeatures = list(trainData.columns)
allFeatures.remove('ListingInfo')
allFeatures.remove('target')
#allFeatures.remove('Idx')

# Split the features into numerical and categorical
numerical_var = []
for var in allFeatures:
    uniq_vals = list(set(trainData[var]))
    if np.nan in uniq_vals:
        uniq_vals.remove( np.nan)
    if len(uniq_vals) >= 10 and isinstance(uniq_vals[0],numbers.Real):
        numerical_var.append(var)

categorical_var = [i for i in allFeatures if i not in numerical_var]

for col in categorical_var:
    #for Chinese character, upper() is not valid
    if col not in ['UserInfo_7','UserInfo_9','UserInfo_19']:
        trainData[col] = trainData[col].map(lambda x: str(x).upper())


'''
Categorical variables are handled as follows:
1. If a variable has more than 5 distinct values, encode it with the bad rate.
2. Otherwise, if any value of the variable corresponds to samples that are all bad or all good, merge that value with others.
'''
deleted_features = []   # variables that have been processed are recorded here so they do not interfere with later modeling
encoded_features = {}   # store the bad-rate encoding so it can be reused in testing and production
merged_features = {}    # store the merging scheme of the categorical variables
var_IV = {}  # store the IV values of the binned variables (the WOE values are stored in var_WOE below)
var_WOE = {}
for col in list(categorical_var):  # iterate over a copy because variables may be removed from categorical_var inside the loop
    print('we are processing {}'.format(col))
# =============================================================================
#     if len(set(trainData[col]))>1000:
#         continue
# =============================================================================
    if len(set(trainData[col]))>5:
        print('{} is encoded with bad rate'.format(col))
        col0 = str(col)+'_encoding'

        #(1) compute the bad rate and encode the variable with it
        encoding_result = sf.BadRateEncoding(trainData, col, 'target')
        trainData[col0], br_encoding = encoding_result['encoding'],encoding_result['bad_rate']

        #(2) add the encoded variable to the numerical list so it can be chi-square binned later
        numerical_var.append(col0)

        #(3) save the encoding result
        encoded_features[col] = [col0, br_encoding]

        #(4) mark the original variable for deletion

        deleted_features.append(col)
    else:
        bad_bin = trainData.groupby([col])['target'].sum()
        # variables with at most 5 categories but with a category containing 0 bad samples need special handling
        if min(bad_bin) == 0:
            print('{} has 0 bad sample!'.format(col))
            col1 = str(col) + '_mergeByBadRate'
            #(1) find the optimal merging scheme so that every bin contains both good and bad samples
            mergeBin = sf.MergeBad0(trainData, col, 'target')
            #(2) merge the values according to the scheme from (1)
            trainData[col1] = trainData[col].map(mergeBin)
            maxPcnt = sf.MaximumBinPcnt(trainData, col1)
            # if after merging a single bin covers more than 90% of samples, drop the variable
            if maxPcnt > 0.9:
                print('{} is deleted because of large percentage of single bin'.format(col))
                deleted_features.append(col)
                categorical_var.remove(col)
                del trainData[col]
                continue
            #(3) keep the merged variable if it meets the requirements
            merged_features[col] = [col1, mergeBin]
            WOE_IV = sf.CalcWOE(trainData, col1, 'target')
            var_WOE[col1] = WOE_IV['WOE']
            var_IV[col1] = WOE_IV['IV']
            #del trainData[col]
            deleted_features.append(col)
        else:
            WOE_IV = sf.CalcWOE(trainData, col, 'target')
            var_WOE[col] = WOE_IV['WOE']
            var_IV[col] = WOE_IV['IV']


'''
Continuous variables are handled as follows:
1. Use chi-square (ChiMerge) binning to split the variable into 5 bins.
2. Check the monotonicity of the bad rate; if it is not monotone, keep merging bins until it is.
'''
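# To make the monotonicity requirement in (2) concrete: FeatureMonotone in the utility functions
# flags bins whose bad rate is a local extremum. An illustrative call (numbers made up):
#   sf.FeatureMonotone([0.02, 0.05, 0.03, 0.08, 0.10])
#   # -> {'count_of_nonmonotone': 2, 'index_of_nonmonotone': [1, 2]}
# Monotone_Merge then merges such bins with an adjacent bin until the bad-rate sequence is monotone.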
var_cutoff = {}
for col in list(numerical_var):  # iterate over a copy because variables may be removed from numerical_var inside the loop
    print("{} is in processing".format(col))
    col1 = str(col) + '_Bin'

    #(1) Bin the variable with ChiMerge and save the cut-off points. For example, cut-offs = [10,20,30] split the variable into x<=10, 10<x<=20, 20<x<=30 and x>30.
    # In particular, the missing-value placeholder -1 does not take part in binning.
    if -1 in set(trainData[col]):
        special_attribute = [-1]
    else:
        special_attribute = []
    cutOffPoints = sf.ChiMerge(trainData, col, 'target',special_attribute=special_attribute)
    var_cutoff[col] = cutOffPoints
    trainData[col1] = trainData[col].map(lambda x: sf.AssignBin(x, cutOffPoints,special_attribute=special_attribute))

    #(2), check whether the bad rate is monotone
    BRM = sf.BadRateMonotone(trainData, col1, 'target',special_attribute=special_attribute)
    if not BRM:
        if special_attribute == []:
            bin_merged = sf.Monotone_Merge(trainData, 'target', col1)
            removed_index = []
            for bin in bin_merged:
                if len(bin)>1:
                    indices = [int(b.replace('Bin ','')) for b in bin]
                    removed_index = removed_index+indices[0:-1]
            removed_point = [cutOffPoints[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints.remove(p)
            var_cutoff[col] = cutOffPoints
            trainData[col1] = trainData[col].map(lambda x: sf.AssignBin(x, cutOffPoints, special_attribute=special_attribute))
        else:
            cutOffPoints2 = [i for i in cutOffPoints if i not in special_attribute]
            temp = trainData.loc[~trainData[col].isin(special_attribute)]
            bin_merged = sf.Monotone_Merge(temp, 'target', col1)
            removed_index = []
            for bin in bin_merged:
                if len(bin) > 1:
                    indices = [int(b.replace('Bin ', '')) for b in bin]
                    removed_index = removed_index + indices[0:-1]
            removed_point = [cutOffPoints2[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints2.remove(p)
            cutOffPoints2 = cutOffPoints2 + special_attribute
            var_cutoff[col] = cutOffPoints2
            trainData[col1] = trainData[col].map(lambda x: sf.AssignBin(x, cutOffPoints2, special_attribute=special_attribute))

    #(3) After binning, check again whether a single bin covers more than 90% of samples; if so, drop the variable
    maxPcnt = sf.MaximumBinPcnt(trainData, col1)
    if maxPcnt > 0.9:
        # del trainData[col1]
        deleted_features.append(col)
        numerical_var.remove(col)
        print('we delete {} because the maximum bin occupies more than 90%'.format(col))
        continue

    WOE_IV = sf.CalcWOE(trainData, col1, 'target')
    var_IV[col] = WOE_IV['IV']
    var_WOE[col] = WOE_IV['WOE']
    #del trainData[col]



trainData.to_csv(folderOfData+'allData_2.csv', header=True,encoding='gbk', columns = trainData.columns, index=False)



with open(folderOfData+'var_WOE.pkl',"wb") as f:
    f.write(pickle.dumps(var_WOE))

with open(folderOfData+'var_IV.pkl',"wb") as f:
    f.write(pickle.dumps(var_IV))


with open(folderOfData+'var_cutoff.pkl',"wb") as f:
    f.write(pickle.dumps(var_cutoff))


with open(folderOfData+'merged_features.pkl',"wb") as f:
    f.write(pickle.dumps(merged_features))

########################################
# Step 4: Univariate and multivariate analysis after WOE encoding #
########################################
trainData = pd.read_csv(folderOfData+'allData_2.csv', header=0, encoding='gbk')


with open(folderOfData+'var_WOE.pkl',"rb") as f:
    var_WOE = pickle.load(f)

with open(folderOfData+'var_IV.pkl',"rb") as f:
    var_IV = pickle.load(f)


with open(folderOfData+'var_cutoff.pkl',"rb") as f:
    var_cutoff = pickle.load(f)


with open(folderOfData+'merged_features.pkl',"rb") as f:
    merged_features = pickle.load(f)

# Convert fields that look numerical but are actually categorical into strings
num2str = ['SocialNetwork_13','SocialNetwork_12','UserInfo_6','UserInfo_5','UserInfo_10','UserInfo_17']
for col in num2str:
    trainData[col] = trainData[col].map(lambda x: str(x))


for col in var_WOE.keys():
    print(col)
    col2 = str(col)+"_WOE"
    if col in var_cutoff.keys():
        cutOffPoints = var_cutoff[col]
        special_attribute = []
        if -1 in cutOffPoints:
            special_attribute = [-1]
        binValue = trainData[col].map(lambda x: sf.AssignBin(x, cutOffPoints,special_attribute=special_attribute))
        trainData[col2] = binValue.map(lambda x: var_WOE[col][x])
    else:
        print('********************************************************************************************')
        print(col)
        if -1 in set(trainData[col]):
            trainData[col2] = trainData[col].map(lambda x: var_WOE[col][str(x*1.0)])
        else:
            trainData[col2] = trainData[col].map(lambda x: var_WOE[col][x])

trainData.to_csv(folderOfData+'allData_3.csv', header=True,encoding='gbk', columns = trainData.columns, index=False)



### (i) Select the variables whose IV is above the threshold
trainData = pd.read_csv(folderOfData+'allData_3.csv', header=0,encoding='gbk')
all_IV = list(var_IV.values())
all_IV = sorted(all_IV, reverse=True)
plt.bar(x=range(len(all_IV)), height = all_IV)
iv_threshould = 0.02
varByIV = [k for k, v in var_IV.items() if v > iv_threshould]



### (ii) Check the pairwise linear correlation between the WOE-encoded variables

var_IV_selected = {k:var_IV[k] for k in varByIV}
var_IV_sorted = sorted(var_IV_selected.items(), key=lambda d:d[1], reverse = True)
var_IV_sorted = [i[0] for i in var_IV_sorted]

removed_var  = []
roh_thresould = 0.6
for i in range(len(var_IV_sorted)-1):
    if var_IV_sorted[i] not in removed_var:
        x1 = var_IV_sorted[i]+"_WOE"
        for j in range(i+1,len(var_IV_sorted)):
            if var_IV_sorted[j] not in removed_var:
                x2 = var_IV_sorted[j] + "_WOE"
                roh = np.corrcoef([trainData[x1], trainData[x2]])[0, 1]
                if abs(roh) >= roh_thresould:
                    print('the correlation coefficient between {0} and {1} is {2}'.format(x1, x2, str(roh)))
                    if var_IV[var_IV_sorted[i]] > var_IV[var_IV_sorted[j]]:
                        removed_var.append(var_IV_sorted[j])
                    else:
                        removed_var.append(var_IV_sorted[i])

var_IV_sortet_2 = [i for i in var_IV_sorted if i not in removed_var]

### (iii) Check whether any variable has a VIF above 10 against all the other variables
for i in range(len(var_IV_sortet_2)):
    x0 = trainData[var_IV_sortet_2[i]+'_WOE']
    x0 = np.array(x0)
    X_Col = [k+'_WOE' for k in var_IV_sortet_2 if k != var_IV_sortet_2[i]]
    X = trainData[X_Col]
    X = np.matrix(X)
    regr = LinearRegression()
    clr= regr.fit(X, x0)
    x_pred = clr.predict(X)
    R2 = 1 - ((x_pred - x0) ** 2).sum() / ((x0 - x0.mean()) ** 2).sum()
    vif = 1/(1-R2)
    if vif > 10:
        print("Warning: the vif for {0} is {1}".format(var_IV_sortet_2[i], vif))



#########################
# Step 5: Fit the logistic regression model #
#########################
multi_analysis = [i+'_WOE' for i in var_IV_sortet_2]
y = trainData['target']
X = trainData[multi_analysis].copy()
X['intercept'] = [1]*X.shape[0]


LR = sm.Logit(y, X).fit()
summary = LR.summary2()
pvals = LR.pvalues.to_dict()
params = LR.params.to_dict()

# Some variables are not significant in the multivariate model, so check their significance individually in univariate regressions
varLargeP = {k: v for k,v in pvals.items() if v >= 0.1}
varLargeP = sorted(varLargeP.items(), key=lambda d:d[1], reverse = True)
varLargeP = [i[0] for i in varLargeP]
p_value_list = {}
for var in varLargeP:
    X_temp = trainData[var].copy().to_frame()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    p_value_list[var] = LR.pvalues[var]
for k,v in p_value_list.items():
    print("{0} has p-value of {1} in univariate regression".format(k,v))


# Some variables have positive coefficients, so check their signs individually in univariate regressions
varPositive = [k for k,v in params.items() if v >= 0]
coef_list = {}
for var in varPositive:
    X_temp = trainData[var].copy().to_frame()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    coef_list[var] = LR.params[var]
for k,v in coef_list.items():
    print("{0} has coefficient of {1} in univariate regression".format(k,v))


selected_var = [multi_analysis[0]]
for var in multi_analysis[1:]:
    try_vars = selected_var+[var]
    X_temp = trainData[try_vars].copy()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    #summary = LR.summary2()
    pvals, params = LR.pvalues, LR.params
    del params['intercept']
    if max(pvals)<0.1 and max(params)<0:
        selected_var.append(var)

# refit the final model on the selected variables only
X_final = trainData[selected_var].copy()
X_final['intercept'] = [1] * X_final.shape[0]
LR = sm.Logit(y, X_final).fit()
LR.summary2()

y_pred = LR.predict(X_final)
y_result = pd.DataFrame({'y_pred':y_pred, 'y_real':list(trainData['target'])})
sf.KS(y_result,'y_pred','y_real')

roc_auc_score(trainData['target'], y_pred)
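# LogisticRegressionCV is imported above but not used in this walkthrough. As an optional
# cross-check (a sketch, not part of the original pipeline), the same WOE features could be fit
# with an L2-regularized, cross-validated logistic regression from sklearn:
#   lr_cv = LogisticRegressionCV(Cs=10, cv=5, penalty='l2', scoring='roc_auc', max_iter=1000)
#   lr_cv.fit(trainData[selected_var], y)
#   print(roc_auc_score(y, lr_cv.predict_proba(trainData[selected_var])[:, 1]))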



################
# Step 6: Scaling: convert predicted probabilities into scores #
################
scores = sf.Prob2Score(y_pred,200,100)
plt.hist(scores,bins=100)
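# How the scaling above works (see Prob2Score in the utility functions): with log-odds
# y = ln(p / (1 - p)), the score is
#     score = basePoint + PDO / ln(2) * (-y) = 200 + (100 / ln 2) * ln((1 - p) / p)
# so an applicant with p = 0.5 scores exactly basePoint = 200, and every halving of the odds of
# being bad adds PDO = 100 points.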

Utility functions (saved as scorecard_functions.py and imported above as sf)

import numpy as np
import pandas as pd

def SplitData(df, col, numOfSplit, special_attribute=[]):
    '''
    :param df: the dataset, sorted by col
    :param col: the variable to be binned
    :param numOfSplit: the number of groups to split into
    :param special_attribute: special values to exclude when splitting the dataset
    :return: the split points that recode the original fine-grained values of col into coarser groups, to ease the merging step of binning
    '''
    df2 = df.copy()
    if special_attribute != []:
        df2 = df.loc[~df[col].isin(special_attribute)]
    N = df2.shape[0]
    n = int(N/numOfSplit)
    splitPointIndex = [i*n for i in range(1,numOfSplit)]
    rawValues = sorted(list(df2[col]))
    splitPoint = [rawValues[i] for i in splitPointIndex]
    splitPoint = sorted(list(set(splitPoint)))
    return splitPoint

def MaximumBinPcnt(df,col):
    '''
    :return: the largest proportion taken by a single value of col in dataset df
    '''
    N = df.shape[0]
    total = df.groupby([col])[col].count()
    pcnt = total*1.0/N
    return max(pcnt)



def Chi2(df, total_col, bad_col):
    '''
    :param df: dataframe containing the total sample count and the bad sample count of each bin
    :param total_col: column with the total number of samples
    :param bad_col: column with the number of bad samples
    :return: the chi-square value
    '''
    df2 = df.copy()
    # compute the overall bad rate and good rate in df
    badRate = sum(df2[bad_col])*1.0/sum(df2[total_col])
    # when all samples are good or all are bad, the chi-square value is 0
    if badRate in [0,1]:
        return 0
    df2['good'] = df2.apply(lambda x: x[total_col] - x[bad_col], axis = 1)
    goodRate = sum(df2['good']) * 1.0 / sum(df2[total_col])
    # expected bad (good) count of a bin = total count of the bin * overall bad (good) rate
    df2['badExpected'] = df[total_col].apply(lambda x: x*badRate)
    df2['goodExpected'] = df[total_col].apply(lambda x: x * goodRate)
    badCombined = zip(df2['badExpected'], df2[bad_col])
    goodCombined = zip(df2['goodExpected'], df2['good'])
    badChi = [(i[0]-i[1])**2/i[0] for i in badCombined]
    goodChi = [(i[0] - i[1]) ** 2 / i[0] for i in goodCombined]
    chi2 = sum(badChi) + sum(goodChi)
    return chi2
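# For reference, Chi2 above is the ordinary Pearson chi-square statistic over the bins passed in,
# with expected counts taken from the pooled bad rate r of those bins:
#     chi2 = sum_i (B_i - n_i*r)^2 / (n_i*r) + sum_i (G_i - n_i*(1-r))^2 / (n_i*(1-r))
# where n_i, B_i, G_i are the total/bad/good counts of bin i. A small value means the bins have
# similar bad rates, which is why ChiMerge merges the pair with the smallest chi-square first.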



def BinBadRate(df, col, target, grantRateIndicator=0):
    '''
    :param df: the dataset on which the bad rate is computed
    :param col: the feature used for grouping
    :param target: the good/bad label
    :param grantRateIndicator: 1 to also return the overall bad rate, 0 otherwise
    :return: the bad rate of each group, the regrouped dataframe, and the overall bad rate (when grantRateIndicator == 1)
    '''
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    regroup['bad_rate'] = regroup.apply(lambda x: x.bad * 1.0 / x.total, axis=1)
    dicts = dict(zip(regroup[col],regroup['bad_rate']))
    if grantRateIndicator==0:
        return (dicts, regroup)
    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    overallRate = B * 1.0 / N
    return (dicts, regroup, overallRate)



def AssignGroup(x, bin):
    '''
    :return: the value x mapped onto the grid bin. For example, with x=2 and bin=[0,3,5], since 0<x<=3, x is mapped to 3
    '''
    N = len(bin)
    if x<=min(bin):
        return min(bin)
    elif x>max(bin):
        return 10e10
    else:
        for i in range(N-1):
            if bin[i] < x <= bin[i+1]:
                return bin[i+1]


def ChiMerge(df, col, target, max_interval=5,special_attribute=[],minBinPcnt=0):
    '''
    :param df: dataframe containing the target variable and the attribute to be binned
    :param col: the attribute to be binned
    :param target: the target variable, taking values 0 or 1
    :param max_interval: the maximum number of bins; if the attribute has no more distinct values than this, the merging is skipped
    :param special_attribute: attribute values that do not take part in binning
    :param minBinPcnt: the minimum proportion per bin, default 0
    :return: the cut-off points of the bins
    '''
    colLevels = sorted(list(set(df[col])))
    N_distinct = len(colLevels)
    if N_distinct <= max_interval:  # skip merging if the number of distinct values is already no more than max_interval
        print("The number of original levels for {} is less than or equal to max intervals".format(col))
        return colLevels[:-1]
    else:
        if len(special_attribute)>=1:
            df1 = df.loc[df[col].isin(special_attribute)]
            df2 = df.loc[~df[col].isin(special_attribute)]
        else:
            df2 = df.copy()
        N_distinct = len(list(set(df2[col])))

        # Step 1: group the dataset by col and compute the total and bad counts of each group
        if N_distinct > 100:
            split_x = SplitData(df2, col, 100)
            df2['temp'] = df2[col].map(lambda x: AssignGroup(x, split_x))
        else:
            df2['temp'] = df2[col]
        # the overall bad rate will be used to compute the expected bad counts
        (binBadRate, regroup, overallRate) = BinBadRate(df2, 'temp', target, grantRateIndicator=1)

        # first, each distinct attribute value forms its own group
        # sort the attribute values, then merge adjacent groups pair by pair
        colLevels = sorted(list(set(df2['temp'])))
        groupIntervals = [[i] for i in colLevels]

        # Step 2: loop and keep merging the best pair of adjacent groups, until:
        # 1. the number of bins is <= the preset maximum number of bins
        # 2. the proportion of each bin is not below the preset minimum (optional)
        # 3. every bin contains both good and bad samples
        # If there are special attribute values, the final number of bins = preset maximum - number of special values
        split_intervals = max_interval - len(special_attribute)
        while (len(groupIntervals) > split_intervals):  # stop merging once the current number of bins reaches the preset number
            # in each iteration, compute the chi-square of every candidate merge of adjacent groups; the pair with the smallest chi-square is the best merge
            chisqList = []
            for k in range(len(groupIntervals)-1):
                temp_group = groupIntervals[k] + groupIntervals[k+1]
                df2b = regroup.loc[regroup['temp'].isin(temp_group)]
                chisq = Chi2(df2b, 'total', 'bad')
                chisqList.append(chisq)
            best_comnbined = chisqList.index(min(chisqList))
            groupIntervals[best_comnbined] = groupIntervals[best_comnbined] + groupIntervals[best_comnbined+1]
            # after merging the best pair of adjacent groups, remove the old groups from the list; e.g. after merging [3,4,5] and [6,7] into [3,4,5,6,7], keep only [3,4,5,6,7]
            groupIntervals.remove(groupIntervals[best_comnbined+1])
        groupIntervals = [sorted(i) for i in groupIntervals]
        cutOffPoints = [max(i) for i in groupIntervals[:-1]]

        # check whether any bin contains no good or no bad samples; if so, merge it with an adjacent bin until every bin contains both
        groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
        df2['temp_Bin'] = groupedvalues
        (binBadRate,regroup) = BinBadRate(df2, 'temp_Bin', target)
        [minBadRate, maxBadRate] = [min(binBadRate.values()),max(binBadRate.values())]
        while minBadRate ==0 or maxBadRate == 1:
            # find the bins that are entirely good or entirely bad
            indexForBad01 = regroup[regroup['bad_rate'].isin([0,1])].temp_Bin.tolist()
            bin=indexForBad01[0]
            # if it is the last bin, merge it with the previous bin, i.e. drop the last cut-off point
            if bin == max(regroup.temp_Bin):
                cutOffPoints = cutOffPoints[:-1]
            # if it is the first bin, merge it with the next bin, i.e. drop the first cut-off point
            elif bin == min(regroup.temp_Bin):
                cutOffPoints = cutOffPoints[1:]
            # if it is a middle bin, merge it with the neighbour that gives the smaller chi-square
            else:
                # merge with the previous bin and compute the chi-square
                currentIndex = list(regroup.temp_Bin).index(bin)
                prevIndex = list(regroup.temp_Bin)[currentIndex - 1]
                df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, bin])]
                (binBadRate, df2b) = BinBadRate(df3, 'temp_Bin', target)
                chisq1 = Chi2(df2b, 'total', 'bad')
                # merge with the next bin and compute the chi-square
                laterIndex = list(regroup.temp_Bin)[currentIndex + 1]
                df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, bin])]
                (binBadRate, df2b) = BinBadRate(df3b, 'temp_Bin', target)
                chisq2 = Chi2(df2b, 'total', 'bad')
                if chisq1 < chisq2:
                    cutOffPoints.remove(cutOffPoints[currentIndex - 1])
                else:
                    cutOffPoints.remove(cutOffPoints[currentIndex])
            # after merging, recompute whether every bin under the new cut-offs contains both good and bad samples
            groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
            df2['temp_Bin'] = groupedvalues
            (binBadRate, regroup) = BinBadRate(df2, 'temp_Bin', target)
            [minBadRate, maxBadRate] = [min(binBadRate.values()), max(binBadRate.values())]
        # check the minimum bin proportion after binning
        if minBinPcnt > 0:
            groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
            df2['temp_Bin'] = groupedvalues
            valueCounts = groupedvalues.value_counts().to_frame()
            N = sum(valueCounts['temp'])
            valueCounts['pcnt'] = valueCounts['temp'].apply(lambda x: x * 1.0 / N)
            valueCounts = valueCounts.sort_index()
            minPcnt = min(valueCounts['pcnt'])
            while minPcnt < minBinPcnt and len(cutOffPoints) > 2:
                # find the bin with the smallest proportion
                indexForMinPcnt = valueCounts[valueCounts['pcnt'] == minPcnt].index.tolist()[0]
                # if it is the last bin, merge it with the previous bin, i.e. drop the last cut-off point
                if indexForMinPcnt == max(valueCounts.index):
                    cutOffPoints = cutOffPoints[:-1]
                # if it is the first bin, merge it with the next bin, i.e. drop the first cut-off point
                elif indexForMinPcnt == min(valueCounts.index):
                    cutOffPoints = cutOffPoints[1:]
                # if it is a middle bin, merge it with the neighbour that gives the smaller chi-square
                else:
                    # merge with the previous bin and compute the chi-square
                    currentIndex = list(valueCounts.index).index(indexForMinPcnt)
                    prevIndex = list(valueCounts.index)[currentIndex - 1]
                    df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, indexForMinPcnt])]
                    (binBadRate, df2b) = BinBadRate(df3, 'temp_Bin', target)
                    chisq1 = Chi2(df2b, 'total', 'bad')
                    # merge with the next bin and compute the chi-square
                    laterIndex = list(valueCounts.index)[currentIndex + 1]
                    df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, indexForMinPcnt])]
                    (binBadRate, df2b) = BinBadRate(df3b, 'temp_Bin', target)
                    chisq2 = Chi2(df2b, 'total', 'bad')
                    if chisq1 < chisq2:
                        cutOffPoints.remove(cutOffPoints[currentIndex - 1])
                    else:
                        cutOffPoints.remove(cutOffPoints[currentIndex])
                groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
                df2['temp_Bin'] = groupedvalues
                valueCounts = groupedvalues.value_counts().to_frame()
                valueCounts['pcnt'] = valueCounts['temp'].apply(lambda x: x * 1.0 / N)
                valueCounts = valueCounts.sort_index()
                minPcnt = min(valueCounts['pcnt'])
        cutOffPoints = special_attribute + cutOffPoints
        return cutOffPoints



def BadRateEncoding(df, col, target):
    '''
    :return: encode col in dataset df by its bad rate; target is the bad-sample label
    '''
    regroup = BinBadRate(df, col, target, grantRateIndicator=0)[1]
    br_dict = regroup[[col,'bad_rate']].set_index([col]).to_dict(orient='index')
    for k, v in br_dict.items():
        br_dict[k] = v['bad_rate']
    badRateEnconding = df[col].map(lambda x: br_dict[x])
    return {'encoding':badRateEnconding, 'bad_rate':br_dict}


def AssignBin(x, cutOffPoints,special_attribute=[]):
    '''
    :param x: a value of the variable
    :param cutOffPoints: the cut-off points of the variable's bins
    :param special_attribute: special values that do not take part in binning
    :return: the index of the bin that x falls into, counted from 0.
    For example, with cutOffPoints = [10,20,30]: x = 7 returns Bin 0, x = 23 returns Bin 2, x = 35 returns Bin 3.
    Special values are returned with a negative index (a "-" in front of the bin number).
    '''
    cutOffPoints2 = [i for i in cutOffPoints if i not in special_attribute]
    numBin = len(cutOffPoints2)
    if x in special_attribute:
        i = special_attribute.index(x)+1
        return 'Bin {}'.format(0-i)
    if x<=cutOffPoints2[0]:
        return 'Bin 0'
    elif x > cutOffPoints2[-1]:
        return 'Bin {}'.format(numBin)
    else:
        for i in range(0,numBin):
            if cutOffPoints2[i] < x <=  cutOffPoints2[i+1]:
                return 'Bin {}'.format(i+1)



def CalcWOE(df, col, target):
    '''
    :param df: dataframe containing the variable whose WOE is computed and the target variable
    :param col: the variable for which WOE and IV are computed; it must be a binned variable or a categorical variable that needs no binning
    :param target: the target variable, 0 = good, 1 = bad
    :return: the WOE and IV of the variable
    '''
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    regroup['good'] = regroup['total'] - regroup['bad']
    G = N - B
    regroup['bad_pcnt'] = regroup['bad'].map(lambda x: x*1.0/B)
    regroup['good_pcnt'] = regroup['good'].map(lambda x: x * 1.0 / G)
    regroup['WOE'] = regroup.apply(lambda x: np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
    WOE_dict = regroup[[col,'WOE']].set_index(col).to_dict(orient='index')
    for k, v in WOE_dict.items():
        WOE_dict[k] = v['WOE']
    IV = regroup.apply(lambda x: (x.good_pcnt-x.bad_pcnt)*np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
    IV = sum(IV)
    return {"WOE": WOE_dict, 'IV':IV}


def FeatureMonotone(x):
    '''
    :return: the number of elements in the sequence x that violate monotonicity, and their positions.
    For example, in x=[1,3,2,5], element 3 is larger than both neighbours and element 2 is smaller than both neighbours, so neither satisfies monotonicity.
    Hence 2 elements violate monotonicity, at positions 1 and 2.
    '''
    monotone = [x[i]<x[i+1] and x[i] < x[i-1] or x[i]>x[i+1] and x[i] > x[i-1] for i in range(1,len(x)-1)]
    index_of_nonmonotone = [i+1 for i in range(len(monotone)) if monotone[i]]
    return {'count_of_nonmonotone':monotone.count(True), 'index_of_nonmonotone':index_of_nonmonotone}

## Check whether the bad rate of a variable is monotone
def BadRateMonotone(df, sortByVar, target,special_attribute = []):
    '''
    :param df: dataframe containing the variable to check and the target variable
    :param sortByVar: the variable whose bad rate is checked
    :param target: the target variable, 0 = good, 1 = bad
    :param special_attribute: special values excluded from the check
    :return: whether the bad rate is monotone
    '''
    df2 = df.loc[~df[sortByVar].isin(special_attribute)]
    if len(set(df2[sortByVar])) <= 2:
        return True
    regroup = BinBadRate(df2, sortByVar, target)[1]
    combined = zip(regroup['total'],regroup['bad'])
    badRate = [x[1]*1.0/x[0] for x in combined]
    badRateNotMonotone = FeatureMonotone(badRate)['count_of_nonmonotone']
    if badRateNotMonotone > 0:
        return False
    else:
        return True

def MergeBad0(df,col,target, direction='bad'):
    '''
     :param df: dataframe in which we check for groups with a 0% or 100% bad rate
     :param col: a binned variable or a categorical variable; check whether any of its groups contains no bad samples or no good samples, and merge such groups if so
     :param target: the target variable, 0 = good, 1 = bad
     :return: a merging scheme such that every group contains both good and bad samples
     '''
    regroup = BinBadRate(df, col, target)[1]
    if direction == 'bad':
        # when merging groups with a 0% bad rate, merge them towards the group with the smallest non-zero bad rate
        regroup = regroup.sort_values(by  = 'bad_rate')
    else:
        # when merging groups with a 0% good rate, merge them towards the group with the smallest non-zero good rate
        regroup = regroup.sort_values(by='bad_rate',ascending=False)
    regroup.index = range(regroup.shape[0])
    col_regroup = [[i] for i in regroup[col]]
    del_index = []
    for i in range(regroup.shape[0]-1):
        col_regroup[i+1] = col_regroup[i] + col_regroup[i+1]
        del_index.append(i)
        if direction == 'bad':
            if regroup['bad_rate'][i+1] > 0:
                break
        else:
            if regroup['bad_rate'][i+1] < 1:
                break
    col_regroup2 = [col_regroup[i] for i in range(len(col_regroup)) if i not in del_index]
    newGroup = {}
    for i in range(len(col_regroup2)):
        for g2 in col_regroup2[i]:
            newGroup[g2] = 'Bin '+str(i)
    return newGroup


def Monotone_Merge(df, target, col):
    '''
    :return: merge the bins of variable col in dataset df that violate bad-rate monotonicity, so that the bad rate of the merged variable is monotone, and output the merging scheme.
    For example, col = [Bin 0, Bin 1, Bin 2, Bin 3, Bin 4] violates bad-rate monotonicity; after merging, col may become
    [Bin 0&Bin 1, Bin 2, Bin 3, Bin 4].
    Merging is only allowed between adjacent bins.
    The optimal merging scheme is searched iteratively: in each iteration, every non-monotone bin is tentatively merged with its previous and its next bin, and the candidates are compared.
    '''
    def MergeMatrix(m, i,j,k):
        '''
        :param m: the matrix whose rows are to be merged
        :param i,j: merge rows i and j
        :param k: delete row k
        :return: the merged matrix
        '''
        m[i, :] = m[i, :] + m[j, :]
        m = np.delete(m, k, axis=0)
        return m

    def Merge_adjacent_Rows(i, bad_by_bin_current, bins_list_current, not_monotone_count_current):
        '''
        :param i: merge row i with the previous row and with the next row respectively, and compare which merge is better. The criterion: the merge should reduce the degree of non-monotonicity and make the bins more balanced
        :param bad_by_bin_current: the binning matrix before merging, containing each bin's total count, bad count and bad rate
        :param bins_list_current: the binning scheme before merging
        :param not_monotone_count_current: the number of non-monotone elements before merging
        :return: the binning matrix, the binning scheme, the number of non-monotone elements after merging, and the balance measure of bin sizes
        '''
        i_prev = i - 1
        i_next = i + 1
        bins_list = bins_list_current.copy()
        bad_by_bin = bad_by_bin_current.copy()
        not_monotone_count = not_monotone_count_current
        # scheme a: merge bin i with the previous bin
        bad_by_bin2a = MergeMatrix(bad_by_bin.copy(), i_prev, i, i)
        bad_by_bin2a[i_prev, -1] = bad_by_bin2a[i_prev, -2] / bad_by_bin2a[i_prev, -3]
        not_monotone_count2a = FeatureMonotone(bad_by_bin2a[:, -1])['count_of_nonmonotone']
        # scheme b: merge bin i with the next bin
        bad_by_bin2b = MergeMatrix(bad_by_bin.copy(), i, i_next, i_next)
        bad_by_bin2b[i, -1] = bad_by_bin2b[i, -2] / bad_by_bin2b[i, -3]
        not_monotone_count2b = FeatureMonotone(bad_by_bin2b[:, -1])['count_of_nonmonotone']
        balance = ((bad_by_bin[:, 1] / N).T * (bad_by_bin[:, 1] / N))[0, 0]
        balance_a = ((bad_by_bin2a[:, 1] / N).T * (bad_by_bin2a[:, 1] / N))[0, 0]
        balance_b = ((bad_by_bin2b[:, 1] / N).T * (bad_by_bin2b[:, 1] / N))[0, 0]
        # return scheme a in two cases: (1) scheme a reduces the non-monotonicity while scheme b does not; (2) both reduce it, but scheme a gives more balanced bins than scheme b
        if not_monotone_count2a < not_monotone_count_current and not_monotone_count2b >= not_monotone_count_current or \
                                        not_monotone_count2a < not_monotone_count_current and not_monotone_count2b < not_monotone_count_current and balance_a < balance_b:
            bins_list[i_prev] = bins_list[i_prev] + bins_list[i]
            bins_list.remove(bins_list[i])
            bad_by_bin = bad_by_bin2a
            not_monotone_count = not_monotone_count2a
            balance = balance_a
        # likewise, return scheme b in two cases: (1) scheme b reduces the non-monotonicity while scheme a does not; (2) both reduce it, but scheme b gives more balanced bins than scheme a
        elif not_monotone_count2a >= not_monotone_count_current and not_monotone_count2b < not_monotone_count_current or \
                                        not_monotone_count2a < not_monotone_count_current and not_monotone_count2b < not_monotone_count_current and balance_a > balance_b:
            bins_list[i] = bins_list[i] + bins_list[i_next]
            bins_list.remove(bins_list[i_next])
            bad_by_bin = bad_by_bin2b
            not_monotone_count = not_monotone_count2b
            balance = balance_b
        # if neither scheme reduces the non-monotonicity, return the merge that gives more balanced bins
        else:
            if balance_a < balance_b:
                bins_list[i_prev] = bins_list[i_prev] + bins_list[i]
                bins_list.remove(bins_list[i])
                bad_by_bin = bad_by_bin2a
                not_monotone_count = not_monotone_count2a
                balance = balance_a
            else:
                bins_list[i] = bins_list[i] + bins_list[i_next]
                bins_list.remove(bins_list[i_next])
                bad_by_bin = bad_by_bin2b
                not_monotone_count = not_monotone_count2b
                balance = balance_b
        return {'bins_list': bins_list, 'bad_by_bin': bad_by_bin, 'not_monotone_count': not_monotone_count,
                'balance': balance}


    N = df.shape[0]
    [badrate_bin, bad_by_bin] = BinBadRate(df, col, target)
    bins = list(bad_by_bin[col])
    bins_list = [[i] for i in bins]
    badRate = sorted(badrate_bin.items(), key=lambda x: x[0])
    badRate = [i[1] for i in badRate]
    not_monotone_count, not_monotone_position = FeatureMonotone(badRate)['count_of_nonmonotone'], FeatureMonotone(badRate)['index_of_nonmonotone']
    # iteratively search for the best merging scheme; stop when the bad rate is already monotone or only 2 bins remain
    while (not_monotone_count > 0 and len(bins_list)>2):
        # in each iteration, try the best merge for every non-monotone bin
        all_possible_merging = []
        for i in not_monotone_position:
            merge_adjacent_rows = Merge_adjacent_Rows(i, np.mat(bad_by_bin), bins_list, not_monotone_count)
            all_possible_merging.append(merge_adjacent_rows)
        balance_list = [i['balance'] for i in all_possible_merging]
        not_monotone_count_new = [i['not_monotone_count'] for i in all_possible_merging]
        # if none of the candidate merges reduces the current non-monotonicity, choose the one that gives more balanced bins
        if min(not_monotone_count_new) >= not_monotone_count:
            best_merging_position = balance_list.index(min(balance_list))
        # if several candidate merges reduce the current non-monotonicity, also choose the most balanced among them
        else:
            better_merging_index = [i for i in range(len(not_monotone_count_new)) if not_monotone_count_new[i] < not_monotone_count]
            better_balance = [balance_list[i] for i in better_merging_index]
            best_balance_index = better_balance.index(min(better_balance))
            best_merging_position = better_merging_index[best_balance_index]
        bins_list = all_possible_merging[best_merging_position]['bins_list']
        bad_by_bin = all_possible_merging[best_merging_position]['bad_by_bin']
        not_monotone_count = all_possible_merging[best_merging_position]['not_monotone_count']
        not_monotone_position = FeatureMonotone(bad_by_bin[:, 3])['index_of_nonmonotone']
    return bins_list





def Prob2Score(prob, basePoint, PDO):
    # convert a probability into a score, returned as an integer
    y = np.log(prob/(1-prob))
    return (basePoint+PDO/np.log(2)*(-y)).map(lambda x: int(x))



### Compute the KS statistic
def KS(df, score, target):
    '''
    :param df: dataset containing the target variable and the predicted values
    :param score: the score or probability column
    :param target: the target variable
    :return: the KS value
    '''
    total = df.groupby([score])[target].count()
    bad = df.groupby([score])[target].sum()
    all = pd.DataFrame({'total':total, 'bad':bad})
    all['good'] = all['total'] - all['bad']
    all[score] = all.index
    all = all.sort_values(by=score,ascending=False)
    all.index = range(len(all))
    all['badCumRate'] = all['bad'].cumsum() / all['bad'].sum()
    all['goodCumRate'] = all['good'].cumsum() / all['good'].sum()
    KS = all.apply(lambda x: x.badCumRate - x.goodCumRate, axis=1)
    return max(KS)
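# Equivalently, the value returned above is
#     KS = max_s ( F_bad(s) - F_good(s) )
# where F_bad and F_good are the cumulative shares of bad and good samples accumulated in
# descending order of the score/probability, exactly the badCumRate and goodCumRate columns.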

