# 【風控建模】XGBoost算法構建信用評分卡模型

### 主程序

``````python
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
import datetime
import collections
import numpy as np
import numbers
import random
import sys
_path = r'C:\Users\A3\Desktop\LR_scorecard'
sys.path.append(_path)
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
#sys.setdefaultencoding( "utf-8")
# -*- coding: utf-8 -*-

### For each time window, count the records that fall inside it (cumulative frequency) ###
def TimeWindowSelection(df, daysCol, time_windows):
    '''
    Count, for every time window, how many rows have a day value within it.

    :param df: the dataset containing the variable of days
    :param daysCol: the column of days
    :param time_windows: the list of time windows (inclusive upper bounds)
    :return: dict mapping each time window to the number of rows whose
             daysCol value is <= that window
    '''
    days = df[daysCol]
    # Vectorised comparison; a NaN compares as False, exactly like int(x <= tw).
    return {tw: int((days <= tw).sum()) for tw in time_windows}

def DeivdedByZero(nominator, denominator):
    '''
    Safe division.

    :param nominator: the dividend
    :param denominator: the divisor
    :return: 0 when the denominator is 0, otherwise the floating-point
             quotient nominator / denominator
    '''
    if denominator == 0:
        return 0
    return nominator * 1.0 / denominator

# Normalise field names that refer to the same item
def ChangeContent(x):
    '''
    Upper-case a field name and fold '_MOBILEPHONE' into '_PHONE'.

    :param x: the raw field name (a string)
    :return: the normalised field name
    '''
    y = x.upper()
    if y == '_MOBILEPHONE':
        y = '_PHONE'
    return y

def MissingCategorial(df,x):
    '''
    Share of missing values in a categorical column.

    A value counts as missing when it is not equal to itself (NaN).

    :param df: the dataset
    :param x: name of the column to inspect
    :return: number of missing values divided by the number of rows
    '''
    # The lambda argument is called v so that it does not shadow the
    # column-name parameter x.
    missing_vals = df[x].map(lambda v: int(v != v))
    return sum(missing_vals) * 1.0 / df.shape[0]

def MissingContinuous(df,x):
    '''
    Share of missing values in a numerical column.

    A value counts as missing when np.isnan() is true for it.

    :param df: the dataset
    :param x: name of the numerical column to inspect
    :return: number of NaN values divided by the number of rows
    '''
    # The lambda argument is called v so that it does not shadow the
    # column-name parameter x.
    missing_vals = df[x].map(lambda v: int(np.isnan(v)))
    return sum(missing_vals) * 1.0 / df.shape[0]

def MakeupRandom(x, sampledList):
    '''
    Fill a missing value with a random draw from a list of candidates.

    :param x: the value to check; it is treated as missing when it is not
              equal to itself (NaN)
    :param sampledList: non-empty list of values to draw the replacement from
    :return: x itself when it is not missing, otherwise a randomly chosen
             element of sampledList
    '''
    if x == x:
        return x
    randIndex = random.randint(0, len(sampledList) - 1)
    return sampledList[randIndex]

def Outlier_Dectection(df,x):
    '''
    Truncate a numerical column to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    :param df: the dataset
    :param x: name of the numerical column to truncate
    :return: a Series with the values of df[x], where anything above the
             upper bound is replaced by the upper bound and anything below
             the lower bound is replaced by the lower bound
    '''
    # Both quartiles in one pass over the column.
    p25, p75 = np.percentile(df[x], [25, 75])
    d = p75 - p25
    upper, lower = p75 + 1.5 * d, p25 - 1.5 * d
    # The lambda argument is called v so that it does not shadow parameter x.
    truncation = df[x].map(lambda v: max(min(upper, v), lower))
    return truncation

###########################################################################################
# Step 0: Initial work of the analysis: read the data files, check user-Id consistency... #
###########################################################################################

folderOfData = 'C:/Users/A3/Desktop/XGBoost_scorecard/'

# NOTE(review): data1, data2 and data3 are used below but are not assigned anywhere in the
# visible listing -- the statements that load them appear to be missing; confirm.

# Split the data set into a training part and a test part, by Idx
all_ids = data2['Idx']
train_ids, test_ids = train_test_split(all_ids, test_size=0.3)
train_ids, test_ids = pd.DataFrame(train_ids), pd.DataFrame(test_ids)


def _rows_for(ids, frame):
    '''Keep the rows of frame whose Idx appears in ids (inner join on Idx).'''
    return pd.merge(left=ids, right=frame, on='Idx', how='inner')


data1_train = _rows_for(train_ids, data1)
data2_train = _rows_for(train_ids, data2)
data3_train = _rows_for(train_ids, data3)

data1_test = _rows_for(test_ids, data1)
data2_test = _rows_for(test_ids, data2)
data3_test = _rows_for(test_ids, data3)

#############################################################################################
# Step 1: Derive features from PPD_LogInfo_3_1_Training_Set & PPD_Userupdate_Info_3_1_Training_Set #
#############################################################################################
# compare whether the four city variables match
data2_train['city_match'] = data2_train.apply(lambda x: int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20),axis = 1)
del data2_train['UserInfo_2']
del data2_train['UserInfo_4']
del data2_train['UserInfo_8']
del data2_train['UserInfo_20']

### Extract the application date, compute the day gap, and look at the distribution of the gap
# NOTE(review): column 'logInfo' is read two lines below but is not created anywhere in the
# visible code -- the statement that builds it appears to be missing; confirm.
data1_train['Listinginfo'] = data1_train['Listinginfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
data1_train['ListingGap'] = data1_train[['logInfo','Listinginfo']].apply(lambda x: (x[1]-x[0]).days,axis = 1)

### Extract the application date, compute the day gap, and look at the distribution of the gap
'''

'''
time_window = [7, 30, 60, 90, 120, 150, 180]
# One row per distinct Idx; the per-window features are added to it as columns
data1GroupbyIdx = pd.DataFrame({'Idx':data1_train['Idx'].drop_duplicates()})

# NOTE(review): the loop bodies below have lost their indentation, and `var_list` and `temp`
# are used without being assigned in the visible code -- lines appear to be missing.
for tw in time_window:
# Listing date shifted back by tw days
data1_train['TruncatedLogInfo'] = data1_train['Listinginfo'].map(lambda x: x + datetime.timedelta(-tw))
for var in var_list:
# number of records of var per Idx (0 when the Idx has none)
count_stats = temp.groupby(['Idx'])[var].count().to_dict()
data1GroupbyIdx[str(var)+'_'+str(tw)+'_count'] = data1GroupbyIdx['Idx'].map(lambda x: count_stats.get(x,0))

# number of distinct values of var per Idx (0 when the Idx has none)
Idx_UserupdateInfo1 = temp[['Idx', var]].drop_duplicates()
uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
data1GroupbyIdx[str(var) + '_' + str(tw) + '_unique'] = data1GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x,0))

# calculate the average count of each value in LogInfo1 and LogInfo2
data1GroupbyIdx[str(var) + '_' + str(tw) + '_avg_count'] = data1GroupbyIdx[[str(var)+'_'+str(tw)+'_count',str(var) + '_' + str(tw) + '_unique']].\
apply(lambda x: DeivdedByZero(x[0],x[1]), axis=1)

# Parse the listing date and the update date, then compute the gap between them in days
data3_train['ListingInfo'] = data3_train['ListingInfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
data3_train['UserupdateInfo'] = data3_train['UserupdateInfo2'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
# Column-wise subtraction replaces the row-wise lambda, which indexed each row by
# position (x[0], x[1]) even though the row is labelled by column name.
data3_train['ListingGap'] = (data3_train['ListingInfo'] - data3_train['UserupdateInfo']).dt.days
# Frequency of each gap value; kept in a name so that the result is not thrown away
ListingGap_counter = collections.Counter(data3_train['ListingGap'])

# Histogram of the gap with cumulative frequency and cumulative share per bin
hist_ListingGap = np.histogram(data3_train['ListingGap'])
# 'gap' holds the right edge of each histogram bin
hist_ListingGap = pd.DataFrame({'Freq':hist_ListingGap[0],'gap':hist_ListingGap[1][1:]})
hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
# The last cumulative frequency is the total number of records
hist_ListingGap['CumPercent'] = hist_ListingGap['CumFreq'] * 1.0 / hist_ListingGap['CumFreq'].iloc[-1]

# Features derived from the user-update records. For every time window:
# (1) the frequency of updates
# (2) the number of distinct kinds of updated items
data3_train['UserupdateInfo1'] = data3_train['UserupdateInfo1'].map(ChangeContent)
data3GroupbyIdx = pd.DataFrame({'Idx':data3_train['Idx'].drop_duplicates()})

time_window = [7, 30, 60, 90, 120, 150, 180]
# Items whose change is flagged individually.
# NOTE(review): the statement that iterates over the items was missing from the listing.
# The names come from the "IDNUMBER, HASBUYCAR, MARRIAGESTATUSID, PHONE" comment, with the
# leading underscore used by ChangeContent ('_PHONE') -- confirm against the data.
key_items = ['_IDNUMBER', '_HASBUYCAR', '_MARRIAGESTATUSID', '_PHONE']
for tw in time_window:
    data3_train['TruncatedLogInfo'] = data3_train['ListingInfo'].map(lambda x: x + datetime.timedelta(-tw))
    # NOTE(review): the assignment of `temp` was missing from the listing. It is taken to be
    # the updates made within tw days before the listing date -- confirm.
    temp = data3_train.loc[data3_train['UserupdateInfo'] >= data3_train['TruncatedLogInfo']]

    freq_col = 'UserupdateInfo_' + str(tw) + '_freq'
    uniq_col = 'UserupdateInfo_' + str(tw) + '_unique'
    avg_col = 'UserupdateInfo_' + str(tw) + '_avg_count'

    #frequency of updating
    freq_stats = temp.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    data3GroupbyIdx[freq_col] = data3GroupbyIdx['Idx'].map(lambda x: freq_stats.get(x,0))

    # number of updated types
    Idx_UserupdateInfo1 = temp[['Idx','UserupdateInfo1']].drop_duplicates()
    uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    # An Idx with no update inside the window gets 0; the previous default was the Idx
    # value itself, which wrote user ids into a count column.
    data3GroupbyIdx[uniq_col] = data3GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x, 0))

    #average count of each type; DeivdedByZero guards the Idx values with no update at all
    data3GroupbyIdx[avg_col] = data3GroupbyIdx[[freq_col, uniq_col]]. \
        apply(lambda row: DeivdedByZero(row[freq_col], row[uniq_col]), axis=1)

    #whether the applicant changed items like IDNUMBER,HASBUYCAR, MARRIAGESTATUSID, PHONE
    # Set of updated item names per Idx
    Idx_UserupdateInfo1_V2 = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].apply(set)
    for item in key_items:
        item_dict = Idx_UserupdateInfo1_V2.map(lambda x: int(item in x)).to_dict()
        # 0 (not changed) for an Idx with no update inside the window
        data3GroupbyIdx['UserupdateInfo_' + str(tw) + str(item)] = data3GroupbyIdx['Idx'].map(lambda x: item_dict.get(x, 0))

# Put the derived features next to the raw features of
# PPD_Training_Master_GBK_3_1_Training_Set, aligned on Idx, and save the result
feature_frames = [data2_train, data3GroupbyIdx, data1GroupbyIdx]
allData = pd.concat([frame.set_index('Idx') for frame in feature_frames], axis=1)
allData.to_csv(folderOfData + 'allData_0.csv', encoding='gbk')

#######################################################################
# Step 2: Pre-process the categorical and the numerical variables      #
#######################################################################
allFeatures = list(allData.columns)
allFeatures.remove('target')
if 'Idx' in allFeatures:
    allFeatures.remove('Idx')
allFeatures.remove('ListingInfo')

# Drop constant variables and decide whether each remaining variable is numerical or categorical
numerical_var = []
# Iterate over a snapshot: removing from allFeatures while looping over allFeatures itself
# makes the loop skip the column that follows every removed one.
for col in list(allFeatures):
    if len(set(allData[col])) == 1:
        print('delete {} from the dataset because it is a constant'.format(col))
        del allData[col]
        allFeatures.remove(col)
    else:
        # distinct non-missing values (a missing value is not equal to itself)
        uniq_valid_vals = list({i for i in allData[col] if i == i})
        # numerical: at least 10 distinct values and the values are real numbers
        if len(uniq_valid_vals) >= 10 and isinstance(uniq_valid_vals[0], numbers.Real):
            numerical_var.append(col)

categorical_var = [i for i in allFeatures if i not in numerical_var]

# For every variable, find its most frequent value and the share of records taking that value
records_count = allData.shape[0]
col_most_values,col_large_value = {},{}
for col in allFeatures:
    value_count = allData[col].groupby(allData[col]).count()
    col_most_values[col] = value_count.max()/records_count
    # idxmax returns the first value that reaches the maximum count
    large_value = value_count.idxmax()
    col_large_value[col] = large_value
col_most_values_df = pd.DataFrame.from_dict(col_most_values, orient = 'index')
col_most_values_df.columns = ['max percent']
col_most_values_df = col_most_values_df.sort_values(by = 'max percent', ascending = False)
# Bar chart of the share for the (at most) 500 most dominated variables
pcnt = list(col_most_values_df[:500]['max percent'])
# NOTE(review): `vars` shadows the builtin of the same name; it is kept because later
# code outside this section may read it.
vars = list(col_most_values_df[:500].index)
plt.bar(range(len(pcnt)), height = pcnt)
plt.title('Largest Percentage of Single Value in Each Variable')

# For the fields whose most frequent value covers at least 90% of the records, check whether the
# bad rate of the minority values is significantly higher than that of the majority value
large_percent_cols = list(col_most_values_df[col_most_values_df['max percent']>=0.9].index)
# NOTE(review): the loop body has lost its indentation, and nothing is done with `temp` before
# `continue` -- the statements that compare the bad rates appear to be missing from the listing.
for col in large_percent_cols:
large_value = col_large_value[col]
temp = allData[[col,'target']]
# 1 when the record takes the majority value, 0 otherwise
temp[col] = temp.apply(lambda x: int(x[col]==large_value),axis=1)
continue

# The bad rate of the minority values was not found to be significantly higher than that of
# the majority value, so these variables are dropped
for col in large_percent_cols:
    if col in numerical_var:
        numerical_var.remove(col)
    else:
        categorical_var.remove(col)
    del allData[col]

# Drop the categorical variables whose missing rate exceeds the threshold
missing_pcnt_threshould_1 = 0.8
# Iterate over a snapshot: removing from categorical_var while looping over it makes the
# loop skip the variable that follows every removed one.
for col in list(categorical_var):
    missingRate = MissingCategorial(allData,col)
    print('{0} has missing rate as {1}'.format(col,missingRate))
    if missingRate > missing_pcnt_threshould_1:
        categorical_var.remove(col)
        del allData[col]
# Backup taken before the categorical variables are one-hot encoded
allData_bk = allData.copy()

# One-hot encode the categorical variables. dummy_map keeps, per raw variable, the distinct
# (raw value, dummy columns) rows; dummy_columns keeps the names of all dummy columns.
dummy_map = {}
dummy_columns = []
for raw_col in categorical_var:
    dummies = pd.get_dummies(allData.loc[:, raw_col], prefix=raw_col)
    col_onehot = pd.concat([allData[raw_col], dummies], axis=1).drop_duplicates()
    # Replace the raw column by its dummy columns
    allData = pd.concat([allData, dummies], axis=1)
    del allData[raw_col]
    dummy_map[raw_col] = col_onehot
    # extend() appends in place instead of building a new list on every iteration
    dummy_columns.extend(list(dummies))

with open(folderOfData+'dummy_map.pkl',"wb") as f:
    pickle.dump(dummy_map, f)

with open(folderOfData+'dummy_columns.pkl',"wb") as f:
    pickle.dump(dummy_columns, f)

# Numerical variables: drop those with a high missing rate, fill the others by random sampling
missing_pcnt_threshould_2 = 0.8
deleted_var = []
for col in numerical_var:
    missingRate = MissingContinuous(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_2:
        # Only recorded here; the removal happens after the loop so that numerical_var
        # is not modified while it is being iterated.
        deleted_var.append(col)
        print('we delete variable {} because of its high missing rate'.format(col))
    elif missingRate > 0:
        # A value is missing when it is not equal to itself
        not_missing = allData.loc[allData[col] == allData[col]][col]
        missing_position = allData.loc[allData[col] != allData[col]][col].index
        # Draw WITH replacement: random.sample raises ValueError as soon as there are more
        # missing than non-missing values, which the 0.8 threshold allows.
        not_missing_sample = random.choices(list(not_missing), k=len(missing_position))
        allData.loc[missing_position,col] = not_missing_sample
        missingRate2 = MissingContinuous(allData, col)
        print('missing rate after making up is:{}'.format(str(missingRate2)))

if deleted_var != []:
    for col in deleted_var:
        numerical_var.remove(col)
        del allData[col]

# Truncate the outliers of every numerical variable, then scale it with its max and min
max_min_standardized = {}
# Iterate over a snapshot: removing from numerical_var while looping over it makes the
# loop skip the variable that follows every removed one.
for col in list(numerical_var):
    truncation = Outlier_Dectection(allData, col)
    upper, lower = max(truncation), min(truncation)
    d = upper - lower
    if d == 0:
        print("{} is almost a constant".format(col))
        numerical_var.remove(col)
        continue
    # (upper - x)/d maps the upper bound to 0 and the lower bound to 1
    allData[col] = truncation.map(lambda x: (upper - x)/d)
    max_min_standardized[col] = [lower, upper]

with open(folderOfData+'max_min_standardized.pkl',"wb") as f:
    pickle.dump(max_min_standardized, f)

allData.to_csv(folderOfData+'allData_1_XGBoost.csv', header=True,encoding='gbk', columns = allData.columns, index=False)

################################################################
# Step 3: Build the XGBoost model used to predict default      #
################################################################
all_features = list(allData.columns)
all_features.remove('target')
#all_features.remove('ListingInfo')
X_train, y_train = allData[all_features], allData['target']


def _tune(fixed_params, param_grid):
    '''
    Grid-search param_grid with 5-fold cross-validated AUC on the training data.

    :param fixed_params: XGBClassifier keyword arguments held fixed; a key that also
                         appears in param_grid is overridden by the grid
    :param param_grid: the parameter grid to search
    :return: the fitted GridSearchCV object
    '''
    # `iid` is no longer passed: the argument was removed from GridSearchCV.
    search = GridSearchCV(estimator=XGBClassifier(**fixed_params), param_grid=param_grid,
                          scoring='roc_auc', n_jobs=4, cv=5)
    search.fit(X_train, y_train)
    return search


# NOTE(review): every XGBClassifier(...) call in the listing was cut off before its closing
# parenthesis, so further keyword arguments may have been lost; only the visible ones are
# used here -- confirm against the original script.
# Parameters are tuned stage by stage; the best values of a stage are fixed for the next ones.
xgb_params = {'learning_rate': 0.1, 'n_estimators': 100, 'gamma': 0,
              'subsample': 0.8, 'colsample_bytree': 0.8}

param_test1 = {'max_depth':range(3,10,2), 'min_child_weight':range(1,6,2)}
gsearch1 = _tune(xgb_params, param_test1)
best_max_depth, best_min_child_weight = gsearch1.best_params_['max_depth'],gsearch1.best_params_['min_child_weight']   # author's run: 9,3
# best_min_child_weight was computed but never passed on; it is now kept for the later stages
xgb_params.update(max_depth=best_max_depth, min_child_weight=best_min_child_weight)

param_test2 = {'gamma':[i/10.0 for i in range(0,5)]}
gsearch2 = _tune(xgb_params, param_test2)
best_gamma = gsearch2.best_params_['gamma']  # author's run: 0
xgb_params.update(gamma=best_gamma)

param_test3 = {'subsample':[i/10.0 for i in range(6,10)],'colsample_bytree':[i/10.0 for i in range(6,10)]}
gsearch3 = _tune(xgb_params, param_test3)
best_colsample_bytree, best_subsample = gsearch3.best_params_['colsample_bytree'], gsearch3.best_params_['subsample']  # author's run: 0.8, 0.6
xgb_params.update(colsample_bytree=best_colsample_bytree, subsample=best_subsample)

param_test4 = {'reg_alpha':[0.01,0.1,1,10,50,100,200,500]}
gsearch4 = _tune(xgb_params, param_test4)
best_reg_alpha = gsearch4.best_params_['reg_alpha']   # author's run: 50
xgb_params.update(reg_alpha=best_reg_alpha)

param_test5 = {'n_estimators':range(100,401,10)}
gsearch5 = _tune(xgb_params, param_test5)
# Take the value, not the whole best_params_ dict
best_n_estimators = gsearch5.best_params_['n_estimators']     # author's run: 390
xgb_params.update(n_estimators=best_n_estimators)

# Train the model once more with the best parameters found above
# (the tuned n_estimators is used here instead of the fixed 100)
best_xgb = XGBClassifier(**xgb_params)
best_xgb.fit(X_train,y_train)
y_pred = best_xgb.predict_proba(X_train)[:,1]
# Training AUC; the value is only shown in an interactive session (author's run: 0.8)
roc_auc_score(y_train, y_pred)
feature_importance = best_xgb.feature_importances_

# Use the feature importance to drop some useless variables: refit until every remaining
# feature has an importance of at least 0.00001
X_train_temp = X_train.copy()
features_in_model = all_features
while min(feature_importance) < 0.00001:
    # feature_importance is in the column order of the data the model was last fitted on
    features_in_model = [name for name, importance in zip(features_in_model, feature_importance)
                         if importance > 0.00001]
    X_train_temp = X_train_temp[features_in_model]
    best_xgb.fit(X_train_temp, y_train)
    feature_importance = best_xgb.feature_importances_

y_pred = best_xgb.predict_proba(X_train_temp)[:,1]
# Training AUC of the reduced model; only shown in an interactive session (author's run: 0.8)
roc_auc_score(y_train, y_pred)
print('There are {} features in the raw data'.format(X_train.shape[1]))  # author's run: 400
print('There are {} features in the reduced data'.format(X_train_temp.shape[1]))  # author's run: 158

``````

### 功能模塊

``````python
import numpy as np
import pandas as pd

def SplitData(df, col, numOfSplit, special_attribute=None):
    '''
    Find split points that cut the values of col into roughly equal-sized groups.

    :param df: the dataset
    :param col: the variable to be binned
    :param numOfSplit: the number of groups to cut into
    :param special_attribute: values of col that are left out when the split
                              points are computed (None or empty: none)
    :return: sorted list of distinct split points taken from the values of col;
             an empty list when no row is left to split
    '''
    values = df[col]
    if special_attribute:
        values = values[~values.isin(special_attribute)]
    rawValues = sorted(values)
    if not rawValues:
        # Nothing to split (previously an IndexError)
        return []
    # size of one group; the split points sit at every n-th sorted value
    n = int(len(rawValues) / numOfSplit)
    splitPoint = {rawValues[i * n] for i in range(1, numOfSplit)}
    return sorted(splitPoint)

def MaximumBinPcnt(df,col):
    '''
    Share of rows taken by the most frequent value of col.

    :param df: the dataset
    :param col: the variable to inspect
    :return: the largest per-value row count of col divided by the number of
             rows of df
    '''
    N = df.shape[0]
    total = df.groupby([col])[col].count()
    return total.max() * 1.0 / N

# NOTE(review): the `def` line and the statements for the bad-sample side were missing from
# the listing. The name `Chi2` is reconstructed (the body returns `chi2`); the parameters
# come from the docstring and from the use of `bad_col` in the body -- confirm.
def Chi2(df, total_col, bad_col):
    '''
    Chi-square statistic of the bad/good split over the rows of df.

    :param df: data frame holding, per group, the total count and the bad count
    :param total_col: column with the number of all samples in the group
    :param bad_col: column with the number of bad samples in the group
    :return: the chi-square value; 0 when the groups contain only good or only
             bad samples
    '''
    df2 = df.copy()
    # Overall bad rate and good rate of df
    total = sum(df2[total_col])
    badRate = sum(df2[bad_col]) * 1.0 / total
    # When all samples are good, or all are bad, the chi-square value is 0
    if badRate in [0, 1]:
        return 0
    df2['good'] = df2[total_col] - df2[bad_col]
    goodRate = sum(df2['good']) * 1.0 / total
    # Expected bad (good) count = total count of the group * overall bad (good) rate
    df2['badExpected'] = df2[total_col] * badRate
    df2['goodExpected'] = df2[total_col] * goodRate
    badChi = [(expected - observed) ** 2 / expected
              for expected, observed in zip(df2['badExpected'], df2[bad_col])]
    goodChi = [(expected - observed) ** 2 / expected
               for expected, observed in zip(df2['goodExpected'], df2['good'])]
    chi2 = sum(badChi) + sum(goodChi)
    return chi2

# NOTE(review): the `def` line and the statements that build `bad`, `dicts` and `B` were
# missing from the listing. The signature follows the docstring and the call
# BinBadRate(df, col, target, grantRateIndicator=0) made elsewhere in this file; the
# 'bad_rate' column name is reconstructed -- confirm.
def BinBadRate(df, col, target, grantRateIndicator=0):
    '''
    Bad rate of every value (bin) of col.

    :param df: the dataset for which the bad rate is computed
    :param col: the feature whose values define the groups
    :param target: the good/bad label (1 = bad)
    :param grantRateIndicator: 1 to also return the overall bad rate, 0 not to
    :return: (dicts, regroup), or (dicts, regroup, overallRate) when
             grantRateIndicator is not 0. dicts maps each value of col to its
             bad rate; regroup is a data frame with the columns col, 'total',
             'bad' and 'bad_rate'
    '''
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    regroup['bad_rate'] = regroup['bad'] * 1.0 / regroup['total']
    dicts = dict(zip(regroup[col], regroup['bad_rate']))
    if grantRateIndicator==0:
        return (dicts, regroup)
    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    overallRate = B * 1.0 / N
    return (dicts, regroup, overallRate)

def AssignGroup(x, bin):
    '''
    Map a number to the split point that closes the interval it falls into.

    Example: x=2, bin=[0,3,5]; since 0 < x <= 3, x is mapped to 3.

    :param x: the number to map
    :param bin: list of split points in ascending order
    :return: min(bin) when x <= min(bin); 10e10 when x > max(bin); otherwise
             the first split point bin[i+1] with bin[i] < x <= bin[i+1].
             None when no interval matches (e.g. x is NaN).
    '''
    lowest, highest = min(bin), max(bin)
    if x <= lowest:
        return lowest
    if x > highest:
        return 10e10
    # Walk over the consecutive pairs of split points
    for left, right in zip(bin, bin[1:]):
        if left < x <= right:
            return right

def ChiMerge(df, col, target, max_interval=5,special_attribute=[],minBinPcnt=0):
'''
Bin a variable by repeatedly merging adjacent groups.

NOTE(review): this listing has lost its indentation and several statements. `regroup`,
`chisq`, `chisq1`, `chisq2` and `bin` are read below without being assigned in the visible
code, so the function cannot run as shown.

:param df: data frame containing the target variable and the attribute to bin
:param col: the attribute to bin
:param target: the target variable, taking the value 0 or 1
:param max_interval: maximum number of bins. If the attribute has no more distinct values than this, the merging is not performed
:param special_attribute: attribute values that do not take part in the binning
:param minBinPcnt: minimum share of a bin, 0 by default
:return: the binning result
'''
colLevels = sorted(list(set(df[col])))
N_distinct = len(colLevels)
if N_distinct <= max_interval:  # if the attribute has no more distinct values than max_interval, the merging is not performed
print("The number of original levels for {} is less than or equal to max intervals".format(col))
return colLevels[:-1]
else:
if len(special_attribute)>=1:
df1 = df.loc[df[col].isin(special_attribute)]
df2 = df.loc[~df[col].isin(special_attribute)]
else:
df2 = df.copy()
N_distinct = len(list(set(df2[col])))

# Step 1: group the data set by col and get the total count and the bad count of every group
if N_distinct > 100:
split_x = SplitData(df2, col, 100)
df2['temp'] = df2[col].map(lambda x: AssignGroup(x, split_x))
else:
df2['temp'] = df2[col]

# First, every single attribute value forms a group of its own
# The attribute values are sorted, then adjacent groups are merged pairwise
colLevels = sorted(list(set(df2['temp'])))
groupIntervals = [[i] for i in colLevels]

# Step 2: loop, merging the best pair of adjacent groups each time, until:
# 1. the number of bins is <= the preset maximum number of bins
# 2. the share of every bin is not below the preset value (optional)
# 3. every bin contains both good and bad samples
# With special attribute values, the final number of bins = preset maximum number of bins - number of special values
split_intervals = max_interval - len(special_attribute)
while (len(groupIntervals) > split_intervals):  # stop condition: current number of bins = preset number of bins
# In every iteration, compute the chi-square value of merging each pair of adjacent groups. The merge with the smallest chi-square value is the best one
chisqList = []
for k in range(len(groupIntervals)-1):
temp_group = groupIntervals[k] + groupIntervals[k+1]
# NOTE(review): `regroup` and `chisq` are not assigned in the visible code
df2b = regroup.loc[regroup['temp'].isin(temp_group)]
chisqList.append(chisq)
best_comnbined = chisqList.index(min(chisqList))
groupIntervals[best_comnbined] = groupIntervals[best_comnbined] + groupIntervals[best_comnbined+1]
# After the best pair of adjacent groups has been merged, the second one must be removed from the list. E.g. after merging [3,4,5] and [6,7] into [3,4,5,6,7], only [3,4,5,6,7] is kept
groupIntervals.remove(groupIntervals[best_comnbined+1])
groupIntervals = [sorted(i) for i in groupIntervals]
cutOffPoints = [max(i) for i in groupIntervals[:-1]]

# Check whether a bin has no good or no bad samples. If so, it is merged with a neighbouring bin until every bin contains both
groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
df2['temp_Bin'] = groupedvalues
# Find the bins that contain only good or only bad samples
# NOTE(review): the statements that find such a bin, and the loop around the branch below, are missing; `bin` is not assigned here
# If it is the last bin, it is merged with the previous bin, i.e. the last cut-off point is removed
if bin == max(regroup.temp_Bin):
cutOffPoints = cutOffPoints[:-1]
# If it is the first bin, it is merged with the next bin, i.e. the first cut-off point is removed
elif bin == min(regroup.temp_Bin):
cutOffPoints = cutOffPoints[1:]
# If it is a bin in the middle, it is merged with the previous or the next bin, whichever gives the smaller chi-square value
else:
# Merge with the previous bin and compute the chi-square value
currentIndex = list(regroup.temp_Bin).index(bin)
prevIndex = list(regroup.temp_Bin)[currentIndex - 1]
df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, bin])]
# Merge with the next bin and compute the chi-square value
laterIndex = list(regroup.temp_Bin)[currentIndex + 1]
df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, bin])]
# NOTE(review): `chisq1` and `chisq2` are not assigned in the visible code
if chisq1 < chisq2:
cutOffPoints.remove(cutOffPoints[currentIndex - 1])
else:
cutOffPoints.remove(cutOffPoints[currentIndex])
# After the merge, check again under the new cut-off points whether every bin contains both good and bad samples
groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
df2['temp_Bin'] = groupedvalues
# The minimum share of the bins has to be checked
if minBinPcnt > 0:
groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
df2['temp_Bin'] = groupedvalues
valueCounts = groupedvalues.value_counts().to_frame()
N = sum(valueCounts['temp'])
valueCounts['pcnt'] = valueCounts['temp'].apply(lambda x: x * 1.0 / N)
valueCounts = valueCounts.sort_index()
minPcnt = min(valueCounts['pcnt'])
while minPcnt < minBinPcnt and len(cutOffPoints) > 2:
# Find the bin with the smallest share
indexForMinPcnt = valueCounts[valueCounts['pcnt'] == minPcnt].index.tolist()[0]
# If the smallest bin is the last bin, it is merged with the previous bin, i.e. the last cut-off point is removed
if indexForMinPcnt == max(valueCounts.index):
cutOffPoints = cutOffPoints[:-1]
# If the smallest bin is the first bin, it is merged with the next bin, i.e. the first cut-off point is removed
elif indexForMinPcnt == min(valueCounts.index):
cutOffPoints = cutOffPoints[1:]
# If the smallest bin is a bin in the middle, it is merged with the previous or the next bin, whichever gives the smaller chi-square value
else:
# Merge with the previous bin and compute the chi-square value
currentIndex = list(valueCounts.index).index(indexForMinPcnt)
prevIndex = list(valueCounts.index)[currentIndex - 1]
df3 = df2.loc[df2['temp_Bin'].isin([prevIndex, indexForMinPcnt])]
# Merge with the next bin and compute the chi-square value
laterIndex = list(valueCounts.index)[currentIndex + 1]
df3b = df2.loc[df2['temp_Bin'].isin([laterIndex, indexForMinPcnt])]
# NOTE(review): `chisq1` and `chisq2` are not assigned in the visible code
if chisq1 < chisq2:
cutOffPoints.remove(cutOffPoints[currentIndex - 1])
else:
cutOffPoints.remove(cutOffPoints[currentIndex])
# Recompute the share of every bin under the new cut-off points
groupedvalues = df2['temp'].apply(lambda x: AssignBin(x, cutOffPoints))
df2['temp_Bin'] = groupedvalues
valueCounts = groupedvalues.value_counts().to_frame()
valueCounts['pcnt'] = valueCounts['temp'].apply(lambda x: x * 1.0 / N)
valueCounts = valueCounts.sort_index()
minPcnt = min(valueCounts['pcnt'])
# The special values are put in front of the cut-off points
cutOffPoints = special_attribute + cutOffPoints
return cutOffPoints

# NOTE(review): orphaned fragment -- the `def` line, the assignment of `br_dict`, the body of
# the loop and the return statement are missing from the listing.
# The text below reads: ":return: in data set df, encode col with the bad rate. target is the
# bad-sample label".
'''
:return: 在數據集df中，用壞樣本率給col進行編碼。target表示壞樣本標籤
'''
# Per-bin counts of col, taken from the second element returned by BinBadRate
regroup = BinBadRate(df, col, target, grantRateIndicator=0)[1]
for k, v in br_dict.items():

def AssignBin(x, cutOffPoints,special_attribute=None):
    '''
    Return the label of the bin that a value falls into.

    Example: cutOffPoints = [10,20,30]; x = 7 gives 'Bin 0', x = 23 gives
    'Bin 2', x = 35 gives 'Bin 3'.

    :param x: one value of a variable
    :param cutOffPoints: the binning of that variable, given as cut-off points
                         in ascending order
    :param special_attribute: special values that do not take part in the
                              binning (None or empty: none)
    :return: 'Bin k' with k counted from 0. A special value gets a negative
             number: 'Bin -1' for the first special value, 'Bin -2' for the
             second, and so on. None when x matches no bin (e.g. x is NaN).
    '''
    special = special_attribute if special_attribute is not None else []
    if x in special:
        return 'Bin {}'.format(-(special.index(x) + 1))
    cutOffPoints2 = [i for i in cutOffPoints if i not in special]
    numBin = len(cutOffPoints2)
    if numBin == 0:
        # No cut-off point left: everything is in one bin
        return 'Bin 0'
    if x <= cutOffPoints2[0]:
        return 'Bin 0'
    if x > cutOffPoints2[-1]:
        return 'Bin {}'.format(numBin)
    # Only the numBin-1 inner intervals are scanned; scanning numBin of them read
    # past the end of the list when nothing matched.
    for i in range(numBin - 1):
        if cutOffPoints2[i] < x <= cutOffPoints2[i + 1]:
            return 'Bin {}'.format(i + 1)

def CalcWOE(df, col, target):
    '''
    Compute the WOE of every value of col and the IV of col.

    WOE of a bin = ln(share of all good samples in the bin / share of all bad
    samples in the bin); IV = sum over the bins of
    (good share - bad share) * WOE.

    NOTE(review): the statements that build `bad`, `B`, the 'good' and
    'bad_pcnt' columns and the per-bin IV were missing from the listing; they
    are restored here from the formulas that the remaining lines use -- confirm.

    :param df: data frame containing the variable and the target
    :param col: the variable to compute WOE and IV for; it must already be
                binned, or be a categorical variable that needs no binning
    :param target: the target variable, 0 = good and 1 = bad
    :return: {'WOE': dict mapping each value of col to its WOE, 'IV': the IV}.
             Every bin must contain both good and bad samples, otherwise the
             logarithm is infinite or undefined.
    '''
    total = df.groupby([col])[target].count()
    total = pd.DataFrame({'total': total})
    bad = df.groupby([col])[target].sum()
    bad = pd.DataFrame({'bad': bad})
    regroup = total.merge(bad, left_index=True, right_index=True, how='left')
    regroup.reset_index(level=0, inplace=True)
    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    regroup['good'] = regroup['total'] - regroup['bad']
    G = N - B
    # Share of all bad (good) samples that falls into each bin
    regroup['bad_pcnt'] = regroup['bad'].map(lambda x: x * 1.0 / B)
    regroup['good_pcnt'] = regroup['good'].map(lambda x: x * 1.0 / G)
    regroup['WOE'] = regroup.apply(lambda x: np.log(x.good_pcnt*1.0/x.bad_pcnt),axis = 1)
    WOE_dict = dict(zip(regroup[col], regroup['WOE']))
    IV = regroup.apply(lambda x: (x.good_pcnt - x.bad_pcnt) * np.log(x.good_pcnt*1.0/x.bad_pcnt), axis = 1)
    IV = sum(IV)
    return {"WOE": WOE_dict, 'IV':IV}

def FeatureMonotone(x):
    '''
    Count the elements of a sequence that break monotonicity.

    An inner element breaks monotonicity when it is strictly smaller than both
    of its neighbours or strictly larger than both of them.
    Example: for x=[1,3,2,5], 3 is larger than both neighbours and 2 is smaller
    than both neighbours, so the count is 2 and the positions are 1 and 2.

    :param x: the sequence to check
    :return: {'count_of_nonmonotone': number of such elements,
              'index_of_nonmonotone': list of their positions in x}
    '''
    index_of_nonmonotone = [
        i for i in range(1, len(x) - 1)
        if (x[i] < x[i + 1] and x[i] < x[i - 1]) or (x[i] > x[i + 1] and x[i] > x[i - 1])
    ]
    return {'count_of_nonmonotone': len(index_of_nonmonotone), 'index_of_nonmonotone': index_of_nonmonotone}

## Check whether the bad rate of a variable is monotone
def BadRateMonotone(df, sortByVar, target,special_attribute = None):
    '''
    Check whether the bad rate is monotone over the sorted values of a variable.

    NOTE(review): the statements that build the per-bin counts and that test the
    bad rates were missing from the listing; they are restored here as
    "bad rate per value, no value strictly above or strictly below both of its
    neighbours" -- confirm.

    :param df: data frame containing the variable to check and the target
    :param sortByVar: the variable whose bad rate is checked
    :param target: the target variable, 0 = good and 1 = bad
    :param special_attribute: special values left out of the check (None or
                              empty: none)
    :return: True when the bad rate is monotone, or when at most two distinct
             values are left; False otherwise
    '''
    df2 = df.loc[~df[sortByVar].isin(special_attribute or [])]
    if len(set(df2[sortByVar])) <= 2:
        return True
    # Total and bad count per value; groupby returns them in sorted order of the values
    grouped = df2.groupby([sortByVar])[target]
    combined = zip(grouped.count(), grouped.sum())
    badRate = [x[1]*1.0/x[0] for x in combined]
    # Same rule as FeatureMonotone: an inner element breaks monotonicity when it is
    # strictly below or strictly above both of its neighbours
    badRateNotMonotone = sum(
        1 for prev, cur, nxt in zip(badRate, badRate[1:], badRate[2:])
        if (cur < prev and cur < nxt) or (cur > prev and cur > nxt)
    )
    return badRateNotMonotone == 0

# NOTE(review): orphaned fragment -- the `def` line, the assignment of `regroup` and the
# conditions around `else:` / `break` are missing from the listing.
# The text below reads: ":param df: data set in which 0% or 100% bad rates are checked;
# :param col: binned or categorical variable; checks whether one or more groups have no bad
# samples or no good samples, in which case they are merged; :param target: target variable,
# 0 = good and 1 = bad; :return: a merging plan in which every group contains both good and
# bad samples".
'''
:param df: 包含檢驗0％或者100%壞樣本率
:param col: 分箱後的變量或者類別型變量。檢驗其中是否有一組或者多組沒有壞樣本或者沒有好樣本。如果是，則需要進行合併
:param target: 目標變量，0、1表示好、壞
:return: 合併方案，使得每個組裏同時包含好壞樣本
'''
# When merging groups with a 0 bad rate, merge them with the group of smallest non-zero bad rate
else:
# When merging groups with a 0 good rate, merge them with the group of smallest non-zero good rate
regroup.index = range(regroup.shape[0])
# Every value of col starts as a group of its own
col_regroup = [[i] for i in regroup[col]]
del_index = []
for i in range(regroup.shape[0]-1):
# Fold group i into group i+1 and mark group i for removal
col_regroup[i+1] = col_regroup[i] + col_regroup[i+1]
del_index.append(i)
break
else:
break
col_regroup2 = [col_regroup[i] for i in range(len(col_regroup)) if i not in del_index]
# Map every original value to the label 'Bin <position of its group>'
newGroup = {}
for i in range(len(col_regroup2)):
for g2 in col_regroup2[i]:
newGroup[g2] = 'Bin '+str(i)
return newGroup

def Monotone_Merge(df, target, col):
'''
NOTE(review): this listing has lost its indentation and several statements (the header of the
second inner helper, the assignments of `bins`, `bad_by_bin`, `bad_by_bin2a/2b`,
`not_monotone_count2a/2b`, `not_monotone_position`, and the body of one loop), so the function
cannot run as shown.

:return: merges the bins of variable col in data set df, whose bad rate is not monotone, so that the bad rate of the merged variable is monotone, and returns the merging plan.
For example, col=[Bin 0, Bin 1, Bin 2, Bin 3, Bin 4] does not have a monotone bad rate. After merging, col is:
[Bin 0&Bin 1, Bin 2, Bin 3, Bin 4].
Only adjacent bins can be merged.
The best merging plan is searched iteratively. In every iteration, a merge is tried for every non-monotone bin, and every tried merge compares merging with the previous bin against merging with the next bin
'''
def MergeMatrix(m, i,j,k):
'''
:param m: the matrix whose rows are merged
:param i,j: rows i and j are merged
:param k: row k is deleted
:return: the merged matrix
'''
m[i, :] = m[i, :] + m[j, :]
m = np.delete(m, k, axis=0)
return m

# NOTE(review): the `def` line of the helper described by the text below is missing.
# The text reads: ":param i: row i is merged with the previous and with the next row, and the
# better of the two merges is chosen; a merge is better when it reduces non-monotonicity and
# is more balanced; :param bins_list_current: the binning before the merge;
# :param not_monotone_count_current: number of non-monotone elements before the merge;
# :return: the bin matrix after the merge, the binning, the number of non-monotone elements
# and the balance measure".
'''
:param i: 需要將第i行與前、後的行分別進行合併，比較哪種合併方案最佳。判斷準則是，合併後非單調性程度減輕，且更加均勻
:param bins_list_current: 合併前的分箱方案
:param not_monotone_count_current:合併前的非單調性元素個數
:return:分箱後的分箱矩陣、分箱方案、非單調性元素個數和衡量均勻性的指標balance
'''
i_prev = i - 1
i_next = i + 1
bins_list = bins_list_current.copy()
not_monotone_count = not_monotone_count_current
# Option a: merge bin i with the previous bin
# Option b: merge row i with the next row
# Balance measure of each candidate, computed from column 1 of the bin matrix divided by N
balance = ((bad_by_bin[:, 1] / N).T * (bad_by_bin[:, 1] / N))[0, 0]
balance_a = ((bad_by_bin2a[:, 1] / N).T * (bad_by_bin2a[:, 1] / N))[0, 0]
balance_b = ((bad_by_bin2b[:, 1] / N).T * (bad_by_bin2b[:, 1] / N))[0, 0]
# Option a is returned in 2 cases: (1) option a reduces non-monotonicity and option b does not; (2) both reduce it, but option a is more balanced than option b
if not_monotone_count2a < not_monotone_count_current and not_monotone_count2b >= not_monotone_count_current or \
not_monotone_count2a < not_monotone_count_current and not_monotone_count2b < not_monotone_count_current and balance_a < balance_b:
bins_list[i_prev] = bins_list[i_prev] + bins_list[i]
bins_list.remove(bins_list[i])
not_monotone_count = not_monotone_count2a
balance = balance_a
# Likewise, option b is returned in 2 cases: (1) option b reduces non-monotonicity and option a does not; (2) both reduce it, but option b is more balanced than option a
elif not_monotone_count2a >= not_monotone_count_current and not_monotone_count2b < not_monotone_count_current or \
not_monotone_count2a < not_monotone_count_current and not_monotone_count2b < not_monotone_count_current and balance_a > balance_b:
bins_list[i] = bins_list[i] + bins_list[i_next]
bins_list.remove(bins_list[i_next])
not_monotone_count = not_monotone_count2b
balance = balance_b
# If neither option a nor option b reduces non-monotonicity, the more balanced merge is returned
else:
# NOTE(review): the two branches below are identical as written -- both apply option b,
# although the comment above says the more balanced option should be chosen; confirm.
if balance_a< balance_b:
bins_list[i] = bins_list[i] + bins_list[i_next]
bins_list.remove(bins_list[i_next])
not_monotone_count = not_monotone_count2b
balance = balance_b
else:
bins_list[i] = bins_list[i] + bins_list[i_next]
bins_list.remove(bins_list[i_next])
not_monotone_count = not_monotone_count2b
balance = balance_b
# NOTE(review): tail of a truncated statement; the beginning of this dict literal is missing
'balance': balance}

N = df.shape[0]
# NOTE(review): `bins` and `not_monotone_count` are not assigned before their use below
bins_list = [[i] for i in bins]
# Search iteratively for the best merging plan; stop when the bad rate is monotone or only 2 bins are left
while (not_monotone_count > 0 and len(bins_list)>2):
# When more than one bin is non-monotone, the best merge is tried for each of them in every iteration
all_possible_merging = []
# NOTE(review): the body of this loop is missing; `not_monotone_position` is not assigned
for i in not_monotone_position:
balance_list = [i['balance'] for i in all_possible_merging]
not_monotone_count_new = [i['not_monotone_count'] for i in all_possible_merging]
# If no merge reduces the current non-monotonicity, the most balanced merge is chosen
if min(not_monotone_count_new) >= not_monotone_count:
best_merging_position = balance_list.index(min(balance_list))
# If several merges reduce the current non-monotonicity, the most balanced of them is chosen
else:
better_merging_index = [i for i in range(len(not_monotone_count_new)) if not_monotone_count_new[i] < not_monotone_count]
better_balance = [balance_list[i] for i in better_merging_index]
best_balance_index = better_balance.index(min(better_balance))
best_merging_position = better_merging_index[best_balance_index]
bins_list = all_possible_merging[best_merging_position]['bins_list']
not_monotone_count = all_possible_merging[best_merging_position]['not_monotone_count']
return bins_list

def Prob2Score(prob, basePoint, PDO):
    '''
    Convert probabilities into integer scores.

    score = basePoint - PDO / ln(2) * ln(prob / (1 - prob)), truncated to an
    integer. The score is not clamped, so it is only positive when basePoint
    is large enough for the given probabilities.

    :param prob: pandas Series of probabilities strictly between 0 and 1
    :param basePoint: the score given to a probability of 0.5
    :param PDO: the number of points by which the score drops each time the
                odds prob / (1 - prob) double
    :return: pandas Series of integer scores
    '''
    log_odds = np.log(prob / (1 - prob))
    score = basePoint + PDO / np.log(2) * (-log_odds)
    return score.map(int)

### 計算KS值
def KS(df, score, target):
'''
:param df: 包含目標變量與預測值的數據集
:param score: 得分或者概率
:param target: 目標變量
:return: KS值
'''
total = df.groupby([score])[target].count()
all[score] = all.index
all = all.sort_values(by=score,ascending=False)
all.index = range(len(all))