Data Mining in Practice: Credit Card Default Rate Analysis

This article builds a classifier to analyze credit card default rates, using a dataset of credit card records from a Taiwanese bank covering April to September 2005.

Data source: https://github.com/cystanford/credit_default

1. Data loading and exploration:

The data is complete, with no missing values.
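The loading step itself is not shown above; a minimal sketch follows, assuming the CSV from the repository linked earlier has been saved locally as UCI_Credit_Card.csv (the filename is an assumption) and read with pandas:

# Minimal loading/exploration sketch; the filename below is an assumption, not from the original post
import pandas as pd

data = pd.read_csv('UCI_Credit_Card.csv')  # assumed local copy of the repository's CSV
print(data.shape)                          # number of rows and columns
print(data.isnull().sum().sum())           # 0 confirms there are no missing values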

# Check the distribution of next month's default status
default = data['default.payment.next.month'].value_counts()
default

df = pd.DataFrame({'default.payment.next.month':default.index,'values':default.values})  # barplot's data parameter expects a DataFrame or an array
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei']  # needed so the Chinese labels in the plot render correctly
plt.figure(figsize = (6,6))
plt.title('信用卡違約率客戶\n(違約:1,守約:0)')
sns.set_color_codes('pastel')
sns.barplot(x = 'default.payment.next.month',y = 'values',data = df)
locs,labels = plt.xticks()

# Feature selection
data.drop(['ID'],inplace = True,axis = 1)
target = data['default.payment.next.month'].values
columns = data.columns.tolist()  # data.columns returns an Index; convert it to a Python list with tolist() (or list()), tolist() is generally faster
columns.remove('default.payment.next.month')
features = data[columns].values
# Use 30% of the data as the test set and the rest as the training set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated and removed in current scikit-learn
train_x,test_x,train_y,test_y = train_test_split(features,target,test_size = 0.30,stratify = target,random_state = 1)
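As a quick sanity check (not part of the original post), the default ratio in the two splits can be compared to confirm that stratify=target kept the class balance:

# Verify that stratification preserved the default ratio in both splits
# (default is coded 1, non-default 0, so the mean is the default ratio)
print('train default ratio: %.4f' % train_y.mean())
print('test default ratio:  %.4f' % test_y.mean())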

2. Classification stage: since it is unclear in advance which classifier will perform best, SVM, decision tree, random forest, KNN, and AdaBoost classifiers are all constructed.

GridSearchCV is then used to find the optimal parameters and best score for each classifier, and ultimately to identify the classifier (and its parameter settings) best suited to this project.

# Construct the classifiers
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
classifiers = [
    SVC(random_state = 1,kernel = 'rbf'),
    DecisionTreeClassifier(random_state = 1,criterion = 'gini'),
    RandomForestClassifier(random_state = 1,criterion = 'gini'),
    KNeighborsClassifier(metric = 'minkowski'),
    AdaBoostClassifier(random_state = 1)]


# Classifier names (used as pipeline step names)
classifiers_names = ['svc',
                     'decisionTreeClassifier',
                     'randomForestClassifier',
                     'kneighborsClassifier',
                     'adaBoostClassifier']


# Classifier parameter grids
classifiers_param_grid = [{'svc__C':[1],'svc__gamma':[0.01]},
                         {'decisionTreeClassifier__max_depth':[6,9,11]},
                         {'randomForestClassifier__n_estimators':[3,5,6]},
                         {'kneighborsClassifier__n_neighbors':[4,6,8]},
                         {'adaBoostClassifier__n_estimators':[10,50,100]}]
# Tune each classifier's hyperparameters with GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import learning_curve
def GridSearchCV_work(pipeline,train_x,train_y,test_x,test_y,param_grid,score = 'accuracy'):
    response = {}
    gridsearch = GridSearchCV(estimator = pipeline,param_grid = param_grid,scoring = score)
    search = gridsearch.fit(train_x,train_y)
    print('GridSearch best parameters:', search.best_params_)
    print('GridSearch best score: %0.4f' % search.best_score_)
    predict_y = gridsearch.predict(test_x)
    print('Test set accuracy: %0.4f' % accuracy_score(test_y, predict_y))
    response['predict_y'] = predict_y
    response['accuracy_score'] = accuracy_score(test_y,predict_y)
    return response


for model,model_name,model_param_grid in zip(classifiers,classifiers_names,classifiers_param_grid):
    pipeline = Pipeline([('scaler',StandardScaler()),
                        (model_name,model)])
    result = GridSearchCV_work(pipeline,train_x,train_y,test_x,test_y,model_param_grid,score = 'accuracy')

As the output shows, the SVM classifier achieves the highest test accuracy, 0.8172, with a best GridSearch score of 0.8174.
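The winner above is read off the printed output; a hypothetical extension of the loop could collect each model's test accuracy from the response dictionary returned by GridSearchCV_work and select the best classifier automatically:

# Hypothetical extension: collect test accuracies and report the best classifier
results = {}
for model, model_name, model_param_grid in zip(classifiers, classifiers_names, classifiers_param_grid):
    pipeline = Pipeline([('scaler', StandardScaler()),
                         (model_name, model)])
    response = GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, model_param_grid, score='accuracy')
    results[model_name] = response['accuracy_score']

best_name = max(results, key=results.get)
print('Best classifier: %s (test accuracy %.4f)' % (best_name, results[best_name]))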
