數據挖掘案例: 泰坦尼克號

*. 數據挖掘流程

（一）數據讀取：

讀取數據，並進行展示
統計數據各項指標
明確數據規模與要完成任務

（二）特徵理解分析

單特徵分析，逐個變量分析其對結果的影響
多變量統計分析，綜合考慮多種情況影響
統計繪圖得出結論

（三）數據清洗與預處理

對缺失值進行填充
特徵標準化/歸一化
篩選有價值的特徵
分析特徵之間的相關性

（四）建立模型

特徵數據與標籤準備
數據集切分
多種建模算法對比
集成策略等方案改進

一. 數據讀取與統計分析

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Load the Titanic training set; the path points at the author's local copy.
train_csv = r'F:\51學習\study\數據挖掘案例\泰坦尼克號\train.csv'
data = pd.read_csv(train_csv)
print(data.shape)
data.head()

# Other quick overviews that can be swapped in here: data.info(), data.dtypes,
# data.describe(), or pandas_profiling.ProfileReport(data).
# Number of missing values in every column.
data.isna().sum()

統計獲救情況

# Overall outcome: share of each Survived value (pie) next to the raw counts (bars).
plt.figure(figsize = (14, 6))
plt.subplot(1, 2, 1)
data['Survived'].value_counts().plot.pie(explode = [0, 0.08], autopct = '%.2f%%', shadow = True)
plt.title('Survived')
plt.ylabel('')
plt.subplot(1, 2, 2)
# The column is passed as keyword `x`: newer seaborn releases accept only
# `data` positionally, so a positional column name would be read as the data.
sns.countplot(x = 'Survived', data = data)
plt.title('Survived')

二. 特徵分析 & 缺失值填充

2.1 性別與獲救

# Passenger count for every (Sex, Survived) combination.
data.groupby(['Sex','Survived'])['Survived'].count()

# Left: mean of Survived per sex. Right: counts per sex split by outcome.
fig, ax = plt.subplots(1, 2, figsize = (14, 6))
data[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar(ax = ax[0])
ax[0].set_title('Survived vs Sex')
# Keyword `x` is required by newer seaborn releases (only `data` may be positional).
sns.countplot(x = 'Sex', hue = 'Survived', data = data, ax = ax[1])
ax[1].set_title('Sex: Survived vs Dead')

2.2 船艙等級與獲救

# Contingency table of ticket class against outcome, with row/column totals.
pd.crosstab(data.Pclass, data.Survived, margins = True).style.background_gradient(cmap = 'autumn')

# Left: passengers per class. Right: the same counts split by outcome.
fig, ax = plt.subplots(1, 2, figsize = (14, 6))
data['Pclass'].value_counts().plot.bar(ax = ax[0])
ax[0].set(title = 'Number Of Passengers By Pclass', ylabel = 'Count')

# Keyword `x` is required by newer seaborn releases (only `data` may be positional).
sns.countplot(x = 'Pclass', hue = 'Survived', data = data, ax = ax[1])
ax[1].set_title('Pclass: Survived vs Dead')

船艙等級和性別對結果的影響

# Outcome counts for every (Pclass, Sex) pair, with totals.
pd.crosstab([data.Pclass, data.Sex], data.Survived, margins = True).style.background_gradient(cmap = 'autumn')

# Mean of Survived for each class (rows) and sex (columns).
pd.pivot_table(data, index = 'Pclass', columns = 'Sex', values = 'Survived')

# `factorplot` was renamed `catplot` and later removed from seaborn;
# kind = 'point' reproduces the point plot that factorplot drew by default.
sns.catplot(x = 'Pclass', y = 'Survived', hue = 'Sex', kind = 'point', data = data)

# Axes-level alternative:
# sns.pointplot(x = 'Pclass', y = 'Survived', hue = 'Sex', data = data)

2.3 年齡與獲救

# Basic statistics of the Age column.
print('Oldest Passenger was of:',data['Age'].max(),'Years')
print('Youngest Passenger was of:',data['Age'].min(),'Years')
print('Average Age on the ship:',data['Age'].mean(),'Years')

# Age distributions split by outcome, grouped by ticket class (left) and by sex (right).
plt.figure(figsize = (14, 6))

plt.subplot(1, 2, 1)
# x / y are passed by keyword: newer seaborn releases accept only `data` positionally.
sns.violinplot(x = 'Pclass', y = 'Age', hue = 'Survived', data = data, split = True)
plt.title('Pclass and Age vs Survived')
plt.yticks(range(0, 110, 10));

plt.subplot(1, 2, 2)
sns.violinplot(x = 'Sex', y = 'Age', hue = 'Survived', data = data, split = True)
plt.title('Sex and Age vs Survived')
plt.yticks(range(0, 110, 10));

Oldest Passenger was of: 80.0 Years
Youngest Passenger was of: 0.42 Years
Average Age on the ship: 29.69911764705882 Years

2.4 姓名(稱謂) 與獲救

# Extract the title from each name: the run of letters directly before a period.
# The pattern is a raw string so `\.` is not treated as a string escape, and
# expand = False makes str.extract return a Series rather than a one-column frame.
data['Initial'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand = False)
pd.crosstab(data.Initial, data.Sex).T.style.background_gradient(cmap = 'autumn')

# Fold the rarer titles into the groups Miss / Mr / Mrs / Other.
title_groups = {'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
                'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
                'Lady': 'Mrs', 'Countess': 'Mrs',
                'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other'}
# Assign the result back instead of calling replace(inplace = True) on the
# column selection, which is not guaranteed to write through to `data`.
data['Initial'] = data['Initial'].replace(title_groups)
pd.crosstab(data.Initial, data.Sex).style.background_gradient(cmap = 'summer')

# Mean age per title group.
data.groupby('Initial')['Age'].mean()

2.5 填充缺失值

# Fill missing ages with a fixed age per title group (chosen from the group means).
age_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}
for title, fill_age in age_by_title.items():
    missing_for_title = data['Age'].isnull() & (data['Initial'] == title)
    data.loc[missing_for_title, 'Age'] = fill_age
# True if any age is still missing.
data.Age.isnull().any()

False

# Newer matplotlib releases renamed the seaborn style sheets with a
# 'seaborn-v0_8-' prefix; use whichever name this installation provides.
darkgrid_style = 'seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available else 'seaborn-darkgrid'
plt.style.use(darkgrid_style)
# Overlaid age histograms for the two outcomes.
plt.figure(figsize = (8, 6))
data[data['Survived'] == 0].Age.plot.hist(bins = 30, color = 'red', alpha = 0.5, label = 'Not Survived', edgecolor = 'k')
data[data['Survived'] == 1].Age.plot.hist(bins = 30, color = 'green', alpha = 0.5, label = 'Survived', edgecolor = 'k')
plt.legend()

# One point plot of Survived against Pclass per title group.
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only leave an empty figure behind).
g = sns.FacetGrid(data, col = 'Initial')
# Fix the category order so every facet places the classes at the same x
# positions, even when a title group does not contain all classes.
g.map(sns.pointplot, 'Pclass', 'Survived', order = sorted(data['Pclass'].unique()))

2.6 登船地點與獲救

# Counts for (Embarked, Pclass) rows against (Sex, Survived) columns, with totals.
pd.crosstab([data.Embarked, data.Pclass], [data.Sex, data.Survived], margins = True).style.background_gradient(cmap = 'autumn')

# `factorplot` was renamed `catplot` and later removed from seaborn;
# kind = 'point' reproduces the point plot that factorplot drew by default.
sns.catplot(x = 'Embarked', y = 'Survived', kind = 'point', data = data)
# Shrink the figure that catplot just created.
fig = plt.gcf()
fig.set_size_inches(5,3)

fig, ax = plt.subplots(2, 2, figsize = (16, 14))

# (hue column, title) for each panel in row-major order; hue = None draws plain counts.
embarked_panels = [(None, 'No. Of Passengers Boarded'),
                   ('Sex', 'Male-Female Split For Embarked'),
                   ('Survived', 'Embarked vs Survived'),
                   ('Pclass', 'Embarked vs Pclass')]
for axis, (hue_col, panel_title) in zip(ax.ravel(), embarked_panels):
    # Keyword `x` is required by newer seaborn releases (only `data` may be positional).
    sns.countplot(x = 'Embarked', hue = hue_col, data = data, ax = axis)
    axis.set_title(panel_title)

plt.tight_layout()

# Survived against Pclass per sex, one facet per port of embarkation.
# `factorplot` was renamed `catplot`; kind = 'point' keeps the same plot type.
sns.catplot(x = 'Pclass', y = 'Survived', hue = 'Sex', col = 'Embarked', kind = 'point', data = data)

# Fill the missing ports with 'S'. The result is assigned back instead of using
# fillna(inplace = True) on the column selection, which is not guaranteed to
# write through to `data`.
data['Embarked'] = data['Embarked'].fillna('S')
data.Embarked.isnull().any()

False

2.7 兄弟姐妹的數量

# Outcome counts per number of siblings/spouses aboard.
pd.crosstab([data.SibSp], data.Survived).style.background_gradient(cmap = 'autumn')

fig, ax = plt.subplots(1, 2, figsize = (16, 6))
# x / y are passed by keyword: newer seaborn releases accept only `data` positionally.
sns.barplot(x = 'SibSp', y = 'Survived', data = data, ax = ax[0])
ax[0].set_title('SibSp vs Survived')

# pointplot is the axes-level plot that factorplot drew by default. Unlike the
# figure-level function it draws into ax[1], so no extra figure is created and
# no plt.close() is needed to discard one.
sns.pointplot(x = 'SibSp', y = 'Survived', data = data, ax = ax[1])
ax[1].set_title('SibSp vs Survived')

# How SibSp values are distributed over the ticket classes.
pd.crosstab(data.SibSp, data.Pclass).style.background_gradient(cmap = 'summer')

2.8 父母和孩子的數量

# How Parch values are distributed over the ticket classes.
pd.crosstab(data.Parch, data.Pclass).style.background_gradient(cmap = 'autumn')

fig, ax = plt.subplots(1, 2, figsize = (16, 6))
# x / y are passed by keyword: newer seaborn releases accept only `data` positionally.
sns.barplot(x = 'Parch', y = 'Survived', data = data, ax = ax[0])
ax[0].set_title('Parch vs Survived')
# Axes-level pointplot replaces factorplot(ax = ...): the figure-level function
# cannot draw into an existing axis and opened a stray figure that then had to
# be closed with plt.close().
sns.pointplot(x = 'Parch', y = 'Survived', data = data, ax = ax[1])
ax[1].set_title('Parch vs Survived')

2.9 船票的價格

# Basic statistics of the Fare column.
fare = data['Fare']
print('Highest Fare was:',fare.max())
print('Lowest Fare was:',fare.min())
print('Average Fare was:',fare.mean())

# Fare distribution within each ticket class, one panel per class.
fig, ax = plt.subplots(1, 3, figsize = (21, 6))
for axis, pclass in zip(ax, (1, 2, 3)):
    class_fares = data[data['Pclass'] == pclass].Fare
    sns.distplot(class_fares, ax = axis)
    axis.set_title('Fare in Pclass {}'.format(pclass))

Highest Fare was: 512.3292
Lowest Fare was: 0.0
Average Fare was: 32.2042079685746

三. 特徵相關性

3.1 相關性熱度圖

plt.figure(figsize = (8, 6))
# Correlate the numeric columns only. `data` still holds text columns at this
# point, and recent pandas releases raise when corr() meets non-numeric data.
sns.heatmap(data.select_dtypes(include = 'number').corr(), annot = True, cmap = 'RdYlGn', linewidths = 0.5)

3.2 熱度圖下三角

# Correlation matrix of the numeric columns (text columns would make corr()
# raise on recent pandas releases).
datacorr = data.select_dtypes(include = 'number').corr()
# Boolean mask hiding the cells strictly above the diagonal, so only the lower
# triangle (including the diagonal) is drawn. An explicit boolean mask does not
# depend on the correlation values themselves being non-zero.
mask = np.triu(np.ones(datacorr.shape, dtype = bool), k = 1)
plt.figure(figsize = (8, 6))
sns.heatmap(datacorr, mask = mask, annot = True, cmap = 'autumn', linewidths = 0.2)

四. 構建特徵

4.1 年齡特徵

# Bucket Age into five ordinal bands:
# 0: <= 16, 1: (16, 32], 2: (32, 48], 3: (48, 64], 4: > 64.
data['Age_band'] = 0
age = data['Age']
for band, (lower, upper) in enumerate([(16, 32), (32, 48), (48, 64)], start = 1):
    data.loc[(age > lower) & (age <= upper), 'Age_band'] = band
data.loc[age > 64, 'Age_band'] = 4

# Number of passengers in each age band.
data['Age_band'].value_counts().to_frame().style.background_gradient(cmap = 'summer')

# `factorplot` was renamed `catplot` and later removed from seaborn;
# kind = 'point' reproduces the point plot that factorplot drew by default.
sns.catplot(x = 'Age_band', y = 'Survived', col = 'Pclass', kind = 'point', data = data)

4.2 家庭總人口

# Family_Size: relatives aboard (parents/children plus siblings/spouses).
data['Family_Size'] = data['Parch'] + data['SibSp']
# Alone: 1 when the passenger has no relatives aboard, otherwise 0.
data['Alone'] = (data['Family_Size'] == 0).astype('int64')

fig, ax = plt.subplots(1, 2, figsize = ( 16, 6))
# Axes-level pointplot replaces factorplot(ax = ...): the figure-level function
# cannot draw into an existing axis and opened a stray figure that then had to
# be closed with plt.close().
sns.pointplot(x = 'Family_Size', y = 'Survived', data = data, ax = ax[0])
ax[0].set_title('Family_Size vs Survived')
# x / y are passed by keyword: newer seaborn releases accept only `data` positionally.
sns.pointplot(x = 'Alone', y = 'Survived', data = data, ax = ax[1])
ax[1].set_title('Alone vs Survived')

# `factorplot` was renamed `catplot`; kind = 'point' keeps the same plot type.
sns.catplot(x = 'Alone', y = 'Survived', hue = 'Sex', col = 'Pclass', kind = 'point', data = data)

4.3 船票價格

from sklearn.preprocessing import LabelEncoder
# Split Fare into quartiles. labels = False makes qcut return the ordinal bin
# index directly, so the interval labels need no separate integer encoding.
data['Fare_Range'] = pd.qcut(data['Fare'], 4, labels = False)
# Mean of Survived within each fare quartile.
data.groupby(['Fare_Range'])['Survived'].mean().to_frame().style.background_gradient(cmap = 'autumn')

# `factorplot` was renamed `catplot` and later removed from seaborn;
# kind = 'point' reproduces the point plot that factorplot drew by default.
sns.catplot(x = 'Fare_Range', y = 'Survived', hue = 'Sex', kind = 'point', data = data)

4.4 類型轉換與特徵清洗

# Encode the text columns as integers. The result is assigned back instead of
# using replace(inplace = True) on the column selection, which is not guaranteed
# to write through to `data`; the cast keeps the column numeric.
data['Sex'] = data['Sex'].replace(['male', 'female'], [0, 1]).astype('int64')
data['Embarked'] = LabelEncoder().fit_transform(data['Embarked'])
data['Initial'] = LabelEncoder().fit_transform(data['Initial'])
# Drop identifiers, free-text columns and the raw columns that were binned.
# NOTE(review): Fare_Range is dropped together with the raw Fare, so no fare
# information reaches the models - confirm this is intended.
data.drop(['Name','Age','Ticket','Fare','Cabin','Fare_Range','PassengerId'], axis = 1, inplace = True)

# Correlation between the remaining (now all numeric) features.
plt.figure(figsize = (10, 8))
sns.heatmap(data.corr(), annot = True, cmap = 'RdYlGn', linewidths = 0.2)

五. 機器學習建模

5.1 切分訓練集與測試集

from sklearn.model_selection import train_test_split

# Select the label by name rather than by column position, so the split does
# not silently break if the column order of `data` ever changes.
target_col = 'Survived'

# 70/30 split, stratified on the label so both parts keep the same class balance.
train, test = train_test_split(data, test_size = 0.3, random_state = 42, stratify = data[target_col])
train_X = train.drop(target_col, axis = 1)
# The label parts are kept as single-column frames, matching the (n, 1) shapes printed below.
train_Y = train[[target_col]]
test_X = test.drop(target_col, axis = 1)
test_Y = test[[target_col]]
# Full feature matrix and label vector, used for cross-validation.
X = data.drop(target_col, axis = 1)
Y = data[target_col]

print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)

(623, 9)
(623, 1)
(268, 9)
(268, 1)

5.2 邏輯迴歸 LogisticRegression

from sklearn import metrics

def acc_score(pre_y, y):
    """Return the accuracy of the predictions ``pre_y`` against the true labels ``y``.

    Thin wrapper around ``metrics.accuracy_score``. The arguments are forwarded
    in the ``(y_true, y_pred)`` order that function documents; the value is the
    same either way because accuracy is symmetric.
    """
    return metrics.accuracy_score(y, pre_y)

from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, scored on the test split.
# (logistic.score(test_X, test_Y) is an alternative way to compute the score.)
logistic = LogisticRegression().fit(train_X, train_Y)
pre_logistic = logistic.predict(test_X)
logistic_accuracy = acc_score(pre_logistic, test_Y)
print('Accuracy for logistic regression is ', logistic_accuracy)

Accuracy for logistic regression is 0.7947761194029851

5.3 支持向量機 SVM

from sklearn import svm

# Support vector classifier with a linear kernel.
svc_linear = svm.SVC(kernel = 'linear', C = 0.1, gamma = 0.1).fit(train_X, train_Y)
pre_svc_linear = svc_linear.predict(test_X)
linear_accuracy = acc_score(pre_svc_linear, test_Y)
print('Accuracy for linear SVM is ', linear_accuracy)

# Support vector classifier with an RBF kernel.
svc_rbf = svm.SVC(kernel = 'rbf', C = 1, gamma = 0.1).fit(train_X, train_Y)
pre_svc_rbf = svc_rbf.predict(test_X)
rbf_accuracy = acc_score(pre_svc_rbf, test_Y)
print('Accuracy for rbf SVM is ', rbf_accuracy)

Accuracy for linear SVM is 0.7761194029850746
Accuracy for rbf SVM is 0.8171641791044776

5.4 決策樹 DecisionTree

from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, scored on the test split.
dtc = DecisionTreeClassifier().fit(train_X, train_Y)
pre_dtc = dtc.predict(test_X)
dtc_accuracy = acc_score(pre_dtc, test_Y)
print('Accuracy for decision tree classifier is ', dtc_accuracy)

Accuracy for decision tree classifier is 0.7835820895522388

5.5 隨機森林 RandomForest

from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees, scored on the test split.
rfc = RandomForestClassifier(n_estimators = 200).fit(train_X, train_Y)
pre_rfc = rfc.predict(test_X)
rfc_accuracy = acc_score(pre_rfc, test_Y)
print('Accuracy for random forest classifier is', rfc_accuracy)

Accuracy for random forest classifier is 0.7873134328358209

5.6 K近鄰 KNeighbors

from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with default settings, scored on the test split.
knn = KNeighborsClassifier().fit(train_X, train_Y)
pre_knn = knn.predict(test_X)
knn_accuracy = acc_score(pre_knn, test_Y)
print('Accuracy for KNN classifier is', knn_accuracy)

Accuracy for KNN classifier is 0.7873134328358209

# Test-set accuracy of KNN for n_neighbors = 1..10.
a_index = list(range(1, 11))
# Scores are collected in a plain list and converted once at the end:
# Series.append was removed from pandas, and growing a Series element by
# element copies it on every step.
knn_scores = []

for i in a_index:
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(train_X, train_Y)
    pre_knn = knn.predict(test_X)
    knn_scores.append(acc_score(pre_knn, test_Y))
a = pd.Series(knn_scores)

print('Accuracies for different values of n are:', a.values, 'with the max value is ', a.max())

# Accuracy as a function of n_neighbors.
plt.figure(figsize = (8, 6))
plt.plot(a_index, a)

Accuracies for different values of n are: [0.72761194 0.78731343 0.78358209 0.79477612 0.78731343 0.77985075
0.77238806 0.7761194 0.7761194 0.77238806]
with the max value is 0.7947761194029851

5.7 高斯貝葉斯 Naive Bayes

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes, scored on the test split.
bayes = GaussianNB().fit(train_X, train_Y)
pre_bayes = bayes.predict(test_X)
bayes_accuracy = acc_score(pre_bayes, test_Y)
print("The accuracy of the NaiveBayes is", bayes_accuracy)

The accuracy of the NaiveBayes is 0.7985074626865671

5.8 梯度提升樹 GradientBoosting

from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 50 boosting stages, scored on the test split.
gbc = GradientBoostingClassifier(n_estimators = 50).fit(train_X, train_Y)
pre_gbc = gbc.predict(test_X)
gbc_accuracy = acc_score(pre_gbc, test_Y)
print("The accuracy of the gradient boosting is", gbc_accuracy)

The accuracy of the gradient boosting is 0.8246268656716418

六. 交叉驗證

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# 10-fold cross-validation shared by all models. random_state only has an
# effect when shuffling is enabled, and recent scikit-learn releases raise an
# error when it is set without shuffle = True.
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
cv_mean = []    # mean accuracy per model
accuracy = []   # per-fold accuracies per model
std = []        # standard deviation of the fold accuracies per model
classifiers = ['Logistic Regression', 'Linear Svm', 'Radial Svm', 'KNN',
               'Decision Tree', 'Random Forest', 'Naive Bayes', 'Gradient Boosting']
# Listed in the same order as `classifiers`.
models = [LogisticRegression(), svm.SVC(kernel = 'linear'), svm.SVC(kernel = 'rbf'),
         KNeighborsClassifier(n_neighbors = 4), DecisionTreeClassifier(),
         RandomForestClassifier(n_estimators = 200), GaussianNB(), GradientBoostingClassifier(n_estimators = 50)]
for model in models:
    cv_result = cross_val_score(model, X, Y, cv = kfold, scoring = 'accuracy')
    cv_mean.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
new_models_dataframe = pd.DataFrame({'CV Mean': cv_mean, 'Std':std}, index = classifiers)
new_models_dataframe

# One row of fold accuracies per classifier.
accuracy_dataframe = pd.DataFrame(accuracy, index = classifiers)

# Box plot of the fold accuracies, one box per classifier.
plt.figure(figsize = (16, 6))
folds_by_model = accuracy_dataframe.transpose()
folds_by_model.boxplot()

# Mean accuracy and standard deviation, best model first.
ranked_models = new_models_dataframe.sort_values(by = 'CV Mean', ascending = False)
ranked_models.plot()
fig = plt.gcf()
fig.set_size_inches(8, 6)
plt.title("Average CV Mean Accuracy & Std")
plt.xticks(rotation = 45)

七. 混淆矩陣

from sklearn.metrics import confusion_matrix

# (title suffix, estimator) for each panel, in row-major order.
cm_models = [('Logistic Regression', LogisticRegression()),
             ('Linear SVM', svm.SVC(kernel = 'linear')),
             ('rbf SVM', svm.SVC(kernel = 'rbf')),
             ('KNN', KNeighborsClassifier(n_neighbors = 4)),
             ('Decision Tree', DecisionTreeClassifier()),
             ('Random Forest', RandomForestClassifier(n_estimators = 200)),
             ('Naive Bayes', GaussianNB()),
             ('Gradient Boosting', GradientBoostingClassifier(n_estimators = 50))]

fig, ax = plt.subplots(3, 3, figsize = (12, 10))
axes = ax.ravel()
cv_predictions = []
for axis, (model_name, estimator) in zip(axes, cm_models):
    # Out-of-fold predictions from 10-fold cross-validation.
    cv_pred = cross_val_predict(estimator, X, Y, cv = 10)
    cv_predictions.append(cv_pred)
    sns.heatmap(confusion_matrix(Y, cv_pred), ax = axis, annot = True, fmt = '.0f')
    axis.set_title('Matrix for ' + model_name)

# The 3 x 3 grid has one panel more than there are models; hide the empty one.
for axis in axes[len(cm_models):]:
    axis.axis('off')

# Keep the per-model prediction arrays available under their individual names.
pre_lr, pre_linear, pre_rbf, pre_knn, pre_dtc, pre_rfc, pre_bayes, pre_GBDT = cv_predictions

八. 超參數調整 GridSearchCV

from sklearn.model_selection import GridSearchCV

# Exhaustive search over kernel, C and gamma for the SVC; C and gamma share
# the same candidate values.
C = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
gamma = list(C)
kernel = ['rbf', 'linear']
hyper = dict(kernel = kernel, C = C, gamma = gamma)
gridsearch = GridSearchCV(svm.SVC(), hyper, verbose = True)
gridsearch.fit(X, Y)

# Best cross-validated score and the parameters that achieved it.
print(gridsearch.best_score_)
print(gridsearch.best_params_)

0.8327721661054994
{'C': 0.8, 'gamma': 0.1, 'kernel': 'rbf'}

# Grid search over the main random forest settings.
# max_features lists only 'sqrt': for classifiers 'auto' was an alias of
# 'sqrt' (so it doubled the search for identical candidates), and recent
# scikit-learn releases no longer accept it.
hyper = {'n_estimators': [10, 50, 100, 200, 300, 500, 800, 1000],
        'max_depth': [None, 5, 8], 'max_features': ['sqrt'],
        'min_samples_leaf': [1, 2, 3], 'min_samples_split': [2, 3]}
grid_search = GridSearchCV(estimator = RandomForestClassifier(random_state = 42),
                          param_grid = hyper, verbose = True )
grid_search.fit(X, Y)
print(grid_search.best_score_)
grid_search.best_estimator_

0.8338945005611672

九. 集成模塊

9.1 投票分類器

from sklearn.ensemble import VotingClassifier

# Members of the soft-voting ensemble, as (name, estimator) pairs. The SVCs
# are created with probability = True.
voting_members = [
    ('LR', LogisticRegression(C = 0.05)),
    ('SVM', svm.SVC(kernel = 'linear', probability = True)),
    ('SVC', svm.SVC(probability = True, kernel = 'rbf', C = 0.8, gamma = 0.1)),
    ('KNN', KNeighborsClassifier(n_neighbors = 4)),
    ('DTC', DecisionTreeClassifier(random_state = 42)),
    ('RFC', RandomForestClassifier(n_estimators = 500, max_depth = 5, min_samples_leaf = 3, min_samples_split = 2, random_state = 42)),
    ('NB', GaussianNB()),
    ('GBDT', GradientBoostingClassifier(n_estimators = 50)),
]
ensemble_lin_rbf = VotingClassifier(estimators = voting_members, voting = 'soft')
ensemble_lin_rbf.fit(train_X, train_Y)
# Accuracy on the test split, then 10-fold cross-validated accuracy on all data.
ensemble_accuracy = ensemble_lin_rbf.score(test_X, test_Y)
print('The accuracy for ensembled model is: ', ensemble_accuracy)
cross = cross_val_score(ensemble_lin_rbf, X, Y, cv = 10, scoring = 'accuracy')
print('The score validated score is ', cross.mean())

The accuracy for ensembled model is: 0.8171641791044776
The score validated score is 0.8282984337759618

9.2 引導聚集算法 Bagging

from sklearn.ensemble import BaggingClassifier

# Bagging of 500 KNN classifiers. The wrapped estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases; the positional form works with both names.
model = BaggingClassifier(KNeighborsClassifier(n_neighbors = 4), random_state = 42, n_estimators = 500)
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
print('The accuracy for bagged KNN is: ', acc_score(prediction, test_Y))
# 10-fold cross-validated accuracy on the full data set.
result = cross_val_score(model, X, Y, cv = 10, scoring = 'accuracy')
print('The cross validated score for bagged KNN is:', result.mean())

The accuracy for bagged KNN is: 0.7947761194029851
The cross validated score for bagged KNN is: 0.7957496311428895

# Bagging of 100 decision trees. The wrapped estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases; the positional form works with both names.
model = BaggingClassifier(DecisionTreeClassifier(), random_state = 42, n_estimators = 100)
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
print('The accuracy for bagged Decision Tree is: ', acc_score(prediction, test_Y))
# 10-fold cross-validated accuracy on the full data set.
result = cross_val_score(model, X, Y, cv = 10, scoring = 'accuracy')
print('The cross validated score for bagged Decision Tree is: ', result.mean())

The accuracy for bagged Decision Tree is: 0.7761194029850746
The cross validated score for bagged Decision Tree is: 0.8070369424582908

9.3 自適應增強 AdaBoost

from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 200 estimators: mean 10-fold cross-validated accuracy.
ada = AdaBoostClassifier(learning_rate = 0.05, n_estimators = 200, random_state = 42)
result = cross_val_score(ada, X, Y, scoring = 'accuracy', cv = 10)
result.mean()

0.8327553626149132

# Grid search over the number of estimators and the learning rate of AdaBoost.
hyper = {
    'n_estimators': list(range(100, 1100, 100)),
    'learning_rate': [0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
}
ada_base = AdaBoostClassifier(random_state = 42)
grid_search = GridSearchCV(ada_base, hyper, verbose = True)
grid_search.fit(X, Y)
print(grid_search.best_score_)
grid_search.best_estimator_

0.8327721661054994
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
learning_rate=0.05, n_estimators=200, random_state=42)

# Confusion matrix of AdaBoost built from 10-fold out-of-fold predictions.
ada = AdaBoostClassifier(learning_rate = 0.05, n_estimators = 200, random_state = 42)
result = cross_val_predict(ada, X, Y, cv = 10)
ada_confusion = confusion_matrix(Y, result)
sns.heatmap(ada_confusion, cmap = 'autumn', annot = True, fmt = '2.0f')

9.4 梯度提升 GradientBoosting

# Gradient boosting with 500 stages: mean 10-fold cross-validated accuracy.
grad = GradientBoostingClassifier(learning_rate = 0.05, n_estimators = 500, random_state = 42)
result = cross_val_score(grad, X, Y, scoring = 'accuracy', cv = 10)
result.mean()

0.8160265577119509

# Grid search over stage count, tree depth and learning rate of gradient boosting.
hyper = {
    'n_estimators': range(100, 1100, 100),
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}
grad_base = GradientBoostingClassifier(random_state = 42)
grid_search = GridSearchCV(grad_base, hyper, verbose = True)
grid_search.fit(X, Y)
print(grid_search.best_score_)
grid_search.best_estimator_

0.8282828282828283

# Confusion matrix of the best estimator found by the preceding grid search,
# built from 10-fold out-of-fold predictions.
gbc = grid_search.best_estimator_
result = cross_val_predict(gbc, X, Y, cv = 10)
gbc_confusion = confusion_matrix(Y, result)
sns.heatmap(gbc_confusion, cmap = 'summer', annot = True, fmt = '.0f')

十. 特徵重要性

fig, ax = plt.subplots(2, 2, figsize = (16, 12))


def _plot_importances(estimator, axis, color, title):
    """Fit ``estimator`` on (X, Y) and draw its feature importances, smallest first, on ``axis``."""
    estimator.fit(X, Y)
    importances = pd.Series(estimator.feature_importances_, X.columns)
    importances.sort_values(ascending = True).plot.barh(width = 0.8, ax = axis, color = color)
    axis.set_title(title)


model = RandomForestClassifier(n_estimators = 500, max_depth = 5, min_samples_leaf = 3,
                               min_samples_split=2, random_state = 42)
_plot_importances(model, ax[0, 0], 'green', 'Feature Importance in Random Forests')

# The support vector machine is left out: it has no feature_importances_ attribute.

model = AdaBoostClassifier(n_estimators = 200, learning_rate = 0.05, random_state = 42)
_plot_importances(model, ax[0, 1], '#ddff11', 'Feature Importance in AdaBoost')

model = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.05, max_depth = 3, random_state = 42)
_plot_importances(model, ax[1, 0], 'blue', 'Feature Importance in Gradient Boosting')

import xgboost as xgb
model = xgb.XGBClassifier(n_estimators = 900, learning_rate = 0.1)
_plot_importances(model, ax[1, 1], 'red', 'Feature Importance in XgBoost')