目錄:
*. 數據挖掘流程
(一)數據讀取:
- 讀取數據,並進行展示
- 統計數據各項指標
- 明確數據規模與要完成任務
(二)特徵理解分析
- 單特徵分析,逐個變量分析其對結果的影響
- 多變量統計分析,綜合考慮多種情況影響
- 統計繪圖得出結論
(三)數據清洗與預處理
- 對缺失值進行填充
- 特徵標準化/歸一化
- 篩選有價值的特徵
- 分析特徵之間的相關性
(四)建立模型
- 特徵數據與標籤準備
- 數據集切分
- 多種建模算法對比
- 集成策略等方案改進
一. 數據讀取與統計分析
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Load the training CSV from a hard-coded local path (adjust it for your machine).
data = pd.read_csv(r'F:\51學習\study\數據挖掘案例\泰坦尼克號\train.csv')
# Report (rows, columns), then preview the first rows.
print(data.shape)
data.head()
# Optional deeper inspection, left disabled:
# data.info()
# data.dtypes
# data.describe()
# import pandas_profiling
# pandas_profiling.ProfileReport(data)
# Count the missing values in every column.
data.isnull().sum()
- 統計獲救情況
# Survival overview: share of each outcome (pie) next to the absolute counts (bars).
plt.figure(figsize = (14, 6))
plt.subplot(1, 2, 1)
# Pull the second wedge out slightly so the two outcomes are easy to tell apart.
data['Survived'].value_counts().plot.pie(explode = [0, 0.08], autopct = '%.2f%%', shadow = True)
plt.title('Survived')
plt.ylabel('')
plt.subplot(1, 2, 2)
# Name the column with x=: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x = 'Survived', data = data)
plt.title('Survived')
二. 特徵分析 & 缺失值填充
2.1 性別與獲救
# Passenger counts for every (Sex, Survived) combination.
data.groupby(['Sex','Survived'])['Survived'].count()
fig, ax = plt.subplots(1, 2, figsize = (14, 6))
# Left: mean of the 0/1 Survived column per sex, i.e. the survival rate.
data[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar(ax = ax[0])
ax[0].set_title('Survived vs Sex')
# Right: counts per sex split by outcome. x= is passed by keyword because
# seaborn >= 0.12 no longer accepts the column name positionally.
sns.countplot(x = 'Sex', hue = 'Survived', data = data, ax = ax[1])
ax[1].set_title('Sex: Survived vs Dead')
2.2 船艙等級與獲救
# Contingency table of ticket class against outcome, with row/column totals.
pd.crosstab(data.Pclass, data.Survived, margins = True).style.background_gradient(cmap = 'autumn')
fig, ax = plt.subplots(1, 2, figsize = (14, 6))
# Left: how many passengers travelled in each class.
data['Pclass'].value_counts().plot.bar(ax = ax[0])
ax[0].set(title = 'Number Of Passengers By Pclass', ylabel = 'Count')
# Right: per-class counts split by outcome. x= is passed by keyword because
# seaborn >= 0.12 no longer accepts the column name positionally.
sns.countplot(x = 'Pclass', hue = 'Survived', data = data, ax = ax[1])
ax[1].set_title('Pclass: Survived vs Dead')
- 船艙等級和性別對結果的影響
# Outcome counts for every (Pclass, Sex) pair, with totals.
pd.crosstab([data.Pclass, data.Sex], data.Survived, margins = True).style.background_gradient(cmap = 'autumn')
# Mean of Survived (pivot_table's default aggregation) per class and sex.
pd.pivot_table(data, index = 'Pclass', columns = 'Sex', values = 'Survived')
# factorplot was renamed catplot in seaborn 0.9 and later removed; kind='point'
# reproduces factorplot's default point plot.
sns.catplot(x = 'Pclass', y = 'Survived', hue = 'Sex', kind = 'point', data = data)
2.3 年齡與獲救
# Basic age statistics (missing ages are skipped by pandas' reductions).
print('Oldest Passenger was of:',data['Age'].max(),'Years')
print('Youngest Passenger was of:',data['Age'].min(),'Years')
print('Average Age on the ship:',data['Age'].mean(),'Years')
plt.figure(figsize = (14, 6))
plt.subplot(1, 2, 1)
# Split violins put both outcomes side by side within each class.
# x=/y= are passed by keyword: seaborn >= 0.12 rejects positional data variables.
sns.violinplot(x = 'Pclass', y = 'Age', hue = 'Survived', data = data, split = True)
plt.title('Pclass and Age vs Survived')
plt.yticks(range(0, 110, 10));
plt.subplot(1, 2, 2)
sns.violinplot(x = 'Sex', y = 'Age', hue = 'Survived', data = data, split = True)
plt.title('Sex and Age vs Survived')
plt.yticks(range(0, 110, 10));
Oldest Passenger was of: 80.0 Years
Youngest Passenger was of: 0.42 Years
Average Age on the ship: 29.69911764705882 Years
2.4 姓名(稱謂) 與獲救
# Pull the honorific out of each name: the run of letters directly before a period.
# The raw string keeps "\." a regex escape instead of an invalid string escape,
# and expand=False makes extract() return a Series for the single capture group.
data['Initial'] = data.Name.str.extract(r'([A-Za-z]+)\.', expand = False)
pd.crosstab(data.Initial, data.Sex).T.style.background_gradient(cmap = 'autumn')
# Fold variant titles into a handful of groups. The result is assigned back
# because an inplace replace on a column selection may act on a temporary copy.
title_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}
data['Initial'] = data['Initial'].replace(title_groups)
pd.crosstab(data.Initial, data.Sex).style.background_gradient(cmap = 'summer')
# Mean age per title group.
data.groupby('Initial')['Age'].mean()
2.5 填充缺失值
# Fill missing ages with one fixed value per title group
# (presumably the rounded group means computed earlier; verify if the data changes).
age_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}
missing_age = data.Age.isnull()
for title, fill_age in age_by_title.items():
    data.loc[missing_age & (data.Initial == title), 'Age'] = fill_age
# True here means some passenger's title matched none of the groups above.
data.Age.isnull().any()
False
# Matplotlib 3.6 renamed 'seaborn-darkgrid' to 'seaborn-v0_8-darkgrid';
# apply whichever name this installation provides.
for style_name in ('seaborn-darkgrid', 'seaborn-v0_8-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
plt.figure(figsize = (8, 6))
# Overlaid age histograms, one per outcome.
data[data['Survived'] == 0].Age.plot.hist(bins = 30, color = 'red', alpha = 0.5, label = 'Not Survived', edgecolor = 'k')
data[data['Survived'] == 1].Age.plot.hist(bins = 30, color = 'green', alpha = 0.5, label = 'Survived', edgecolor = 'k')
plt.legend()
# FacetGrid creates its own figure, so no plt.figure() call precedes it
# (one would only leave an empty figure behind).
g = sns.FacetGrid(data, col = 'Initial')
# A fixed order gives every facet the same x positions even when a title
# group has no passengers in one of the classes.
g.map(sns.pointplot, 'Pclass', 'Survived', order = sorted(data['Pclass'].unique()))
2.6 登船地點與獲救
# Outcome counts by port and class (rows) against sex and outcome (columns).
pd.crosstab([data.Embarked, data.Pclass], [data.Sex, data.Survived], margins = True).style.background_gradient(cmap = 'autumn')
# factorplot was renamed catplot in seaborn 0.9 and later removed; kind='point'
# reproduces factorplot's default point plot.
sns.catplot(x = 'Embarked', y = 'Survived', kind = 'point', data = data)
fig = plt.gcf()
fig.set_size_inches(5,3)
fig, ax = plt.subplots(2, 2, figsize = (16, 14))
# Data variables are passed by keyword: seaborn >= 0.12 rejects them positionally.
sns.countplot(x = 'Embarked', data = data, ax = ax[0, 0])
ax[0, 0].set_title('No. Of Passengers Boarded')
sns.countplot(x = 'Embarked', hue = 'Sex', data = data, ax = ax[0, 1])
ax[0, 1].set_title('Male-Female Split For Embarked')
sns.countplot(x = 'Embarked', hue = 'Survived', data = data, ax = ax[1, 0])
ax[1, 0].set_title('Embarked vs Survived')
sns.countplot(x = 'Embarked', hue = 'Pclass', data = data, ax = ax[1, 1])
ax[1, 1].set_title('Embarked vs Pclass')
plt.tight_layout()
sns.catplot(x = 'Pclass', y = 'Survived', hue = 'Sex', col = 'Embarked', kind = 'point', data = data)
# Fill missing ports with 'S' (presumably the most common port; see the counts
# plotted above). Assigned back because an inplace fillna on a column selection
# may act on a temporary copy.
data['Embarked'] = data['Embarked'].fillna('S')
data.Embarked.isnull().any()
False
2.7 兄弟姐妹與配偶的數量 (SibSp)
# Outcome counts for each SibSp value.
pd.crosstab([data.SibSp], data.Survived).style.background_gradient(cmap = 'autumn')
fig, ax = plt.subplots(1, 2, figsize = (16, 6))
# x=/y= are passed by keyword: seaborn >= 0.12 rejects positional data variables.
sns.barplot(x = 'SibSp', y = 'Survived', data = data, ax = ax[0])
ax[0].set_title('SibSp vs Survived')
# pointplot is the axes-level function behind factorplot's default view. It draws
# directly on ax[1], so no stray figure is created and no plt.close() is needed.
sns.pointplot(x = 'SibSp', y = 'Survived', data = data, ax = ax[1])
ax[1].set_title('SibSp vs Survived')
# How SibSp values are distributed across ticket classes.
pd.crosstab(data.SibSp, data.Pclass).style.background_gradient(cmap = 'summer')
2.8 父母和孩子的數量
# How Parch values are distributed across ticket classes.
pd.crosstab(data.Parch, data.Pclass).style.background_gradient(cmap = 'autumn')
fig, ax = plt.subplots(1, 2, figsize = (16, 6))
# x=/y= are passed by keyword: seaborn >= 0.12 rejects positional data variables.
sns.barplot(x = 'Parch', y = 'Survived', data = data, ax = ax[0])
ax[0].set_title('Parch vs Survived')
# pointplot is the axes-level function behind factorplot's default view. It draws
# directly on ax[1], so no stray figure is created and no plt.close() is needed.
sns.pointplot(x = 'Parch', y = 'Survived', data = data, ax = ax[1])
ax[1].set_title('Parch vs Survived')
2.9 船票的價格
# Headline fare statistics.
fare = data['Fare']
print('Highest Fare was:', fare.max())
print('Lowest Fare was:', fare.min())
print('Average Fare was:', fare.mean())
# One fare distribution per ticket class, drawn left to right.
fig, ax = plt.subplots(1, 3, figsize = (21, 6))
for axis, pclass in zip(ax, (1, 2, 3)):
    sns.distplot(data.loc[data['Pclass'] == pclass, 'Fare'], ax = axis)
    axis.set_title('Fare in Pclass {}'.format(pclass))
Highest Fare was: 512.3292
Lowest Fare was: 0.0
Average Fare was: 32.2042079685746
三. 特徵相關性
3.1 相關性熱度圖
plt.figure(figsize = (8, 6))
# Correlate the numeric columns only: pandas >= 2.0 raises on text columns
# instead of silently skipping them.
sns.heatmap(data.select_dtypes(include = 'number').corr(), annot = True, cmap = 'RdYlGn', linewidths = 0.5)
3.2 熱度圖下三角
# Correlate the numeric columns only: pandas >= 2.0 raises on text columns.
datacorr = data.select_dtypes(include = 'number').corr()
# Boolean mask that hides the strict upper triangle, leaving the diagonal and
# lower triangle visible. Building it from the correlation values themselves
# would leave any cell whose correlation is exactly 0 unmasked.
mask = np.triu(np.ones(datacorr.shape, dtype = bool), k = 1)
plt.figure(figsize = (8, 6))
sns.heatmap(datacorr, mask = mask, annot = True, cmap = 'autumn', linewidths = 0.2)
四. 構建特徵
4.1 年齡特徵
# Bin Age into five ordinal bands with right-inclusive edges:
# <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
# A missing Age falls into band 0, as it did with the explicit per-band assignments.
data['Age_band'] = pd.cut(data['Age'], bins = age_edges, labels = False).fillna(0).astype(int)
data['Age_band'].value_counts().to_frame().style.background_gradient(cmap = 'summer')
# factorplot was renamed catplot in seaborn 0.9 and later removed; kind='point'
# reproduces factorplot's default point plot.
sns.catplot(x = 'Age_band', y = 'Survived', col = 'Pclass', kind = 'point', data = data)
4.2 家庭總人口
# Relatives aboard: parents/children plus siblings/spouses.
data['Family_Size'] = data['Parch'] + data['SibSp']
# Alone is 1 when the passenger has no relatives aboard, else 0.
data['Alone'] = (data['Family_Size'] == 0).astype(int)
fig, ax = plt.subplots(1, 2, figsize = ( 16, 6))
# pointplot is the axes-level function behind factorplot's default view. It draws
# directly on the given axes, so no stray figure is created and no plt.close() is needed.
sns.pointplot(x = 'Family_Size', y = 'Survived', data = data, ax = ax[0])
ax[0].set_title('Family_Size vs Survived')
sns.pointplot(x = 'Alone', y = 'Survived', data = data, ax = ax[1])
ax[1].set_title('Alone vs Survived')
# factorplot was renamed catplot in seaborn 0.9 and later removed.
sns.catplot(x = 'Alone', y = 'Survived', hue = 'Sex', col = 'Pclass', kind = 'point', data = data)
4.3 船票價格
from sklearn.preprocessing import LabelEncoder
# Quartile-bin Fare and keep the integer bin index (0 = cheapest quarter).
# labels=False yields the codes directly, so the Interval categories never
# have to pass through LabelEncoder.
data['Fare_Range'] = pd.qcut(data['Fare'], 4, labels = False)
# Survival rate per fare quartile.
data.groupby(['Fare_Range'])['Survived'].mean().to_frame().style.background_gradient(cmap = 'autumn')
# factorplot was renamed catplot in seaborn 0.9 and later removed; kind='point'
# reproduces factorplot's default point plot.
sns.catplot(x = 'Fare_Range', y = 'Survived', hue = 'Sex', kind = 'point', data = data)
4.4 類型轉換與特徵清洗
# Encode Sex as 0 (male) / 1 (female). Assigned back because an inplace replace
# on a column selection may act on a temporary copy. Any other value becomes NaN.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
# Integer-encode the remaining categorical columns.
data['Embarked'] = LabelEncoder().fit_transform(data['Embarked'])
data['Initial'] = LabelEncoder().fit_transform(data['Initial'])
# Drop identifiers, free text and the raw columns that were binned above.
# NOTE(review): Fare_Range is dropped as well, so no fare information reaches
# the models - confirm that this is intended.
data.drop(['Name','Age','Ticket','Fare','Cabin','Fare_Range','PassengerId'], axis = 1, inplace = True)
plt.figure(figsize = (10, 8))
sns.heatmap(data.corr(), annot = True, cmap = 'RdYlGn', linewidths = 0.2)
五. 機器學習建模
5.1 切分訓練集與測試集
from sklearn.model_selection import train_test_split
# 70/30 split, stratified so both parts keep the same survival ratio.
train, test = train_test_split(data, test_size = 0.3, random_state = 42, stratify = data['Survived'])
# Select the label by name rather than by column position, so a change in
# column order cannot silently swap features and target.
train_X = train.drop(columns = 'Survived')
train_Y = train[['Survived']]
test_X = test.drop(columns = 'Survived')
test_Y = test[['Survived']]
# Full data set, used later for cross-validation.
X = data.drop(columns = 'Survived')
Y = data['Survived']
for part in (train_X, train_Y, test_X, test_Y):
    print(part.shape)
(623, 9)
(623, 1)
(268, 9)
(268, 1)
5.2 邏輯迴歸 LogisticRegression
from sklearn import metrics
def acc_score(pre_y, y):
    """Return the accuracy of predictions against the true labels.

    Args:
        pre_y: Predicted labels.
        y: True labels, aligned with ``pre_y``.

    Returns:
        Whatever ``metrics.accuracy_score`` returns for the pair.
    """
    # scikit-learn documents the order as (y_true, y_pred); accuracy is
    # symmetric in the two, so the result is unchanged.
    return metrics.accuracy_score(y, pre_y)
from sklearn.linear_model import LogisticRegression
# Baseline: default logistic regression, scored on the hold-out part.
# fit() returns the estimator itself, so construction and training are chained.
logistic = LogisticRegression().fit(train_X, train_Y)
pre_logistic = logistic.predict(test_X)
print('Accuracy for logistic regression is ', acc_score(pre_logistic, test_Y))
Accuracy for logistic regression is 0.7947761194029851
5.3 支持向量機 SVM
from sklearn import svm
# Linear-kernel SVM.
# NOTE(review): gamma is a coefficient of the rbf/poly/sigmoid kernels, so it
# should have no effect here - confirm and drop it.
svc_linear = svm.SVC(kernel = 'linear', C = 0.1, gamma = 0.1).fit(train_X, train_Y)
pre_svc_linear = svc_linear.predict(test_X)
print('Accuracy for linear SVM is ', acc_score(pre_svc_linear, test_Y))
# RBF-kernel SVM.
svc_rbf = svm.SVC(kernel = 'rbf', C = 1, gamma = 0.1).fit(train_X, train_Y)
pre_svc_rbf = svc_rbf.predict(test_X)
print('Accuracy for rbf SVM is ', acc_score(pre_svc_rbf, test_Y))
Accuracy for linear SVM is 0.7761194029850746
Accuracy for rbf SVM is 0.8171641791044776
5.4 決策樹 DecisionTree
from sklearn.tree import DecisionTreeClassifier
# Default decision tree; no random_state is set, so scores can vary between runs.
dtc = DecisionTreeClassifier().fit(train_X, train_Y)
pre_dtc = dtc.predict(test_X)
print('Accuracy for decision tree classifier is ', acc_score(pre_dtc, test_Y))
Accuracy for decision tree classifier is 0.7835820895522388
5.5 隨機森林 RandomForest
from sklearn.ensemble import RandomForestClassifier
# Forest of 200 trees; no random_state is set, so scores can vary between runs.
rfc = RandomForestClassifier(n_estimators = 200).fit(train_X, train_Y)
pre_rfc = rfc.predict(test_X)
print('Accuracy for random forest classifier is', acc_score(pre_rfc, test_Y))
Accuracy for random forest classifier is 0.7873134328358209
5.6 K近鄰 KNeighbors
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with the library's default neighbour count.
knn = KNeighborsClassifier().fit(train_X, train_Y)
pre_knn = knn.predict(test_X)
print('Accuracy for KNN classifier is', acc_score(pre_knn, test_Y))
Accuracy for KNN classifier is 0.7873134328358209
# Hold-out accuracy for n_neighbors = 1..10.
a_index = list(range(1, 11))
scores = []
for i in a_index:
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(train_X, train_Y)
    pre_knn = knn.predict(test_X)
    scores.append(acc_score(pre_knn, test_Y))
# Build the Series once from a list: Series.append was removed in pandas 2.0
# and copied the whole Series on every iteration.
a = pd.Series(scores, index = a_index)
print('Accuracies for different values of n are:', a.values, 'with the max value is ', a.max())
plt.figure(figsize = (8, 6))
plt.plot(a_index, a)
Accuracies for different values of n are: [0.72761194 0.78731343 0.78358209 0.79477612 0.78731343 0.77985075
0.77238806 0.7761194 0.7761194 0.77238806]
with the max value is 0.7947761194029851
5.7 高斯貝葉斯 Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings.
bayes = GaussianNB().fit(train_X, train_Y)
pre_bayes = bayes.predict(test_X)
print("The accuracy of the NaiveBayes is", acc_score(pre_bayes, test_Y))
The accuracy of the NaiveBayes is 0.7985074626865671
5.8 梯度提升樹 GradientBoosting
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees with 50 boosting stages.
gbc = GradientBoostingClassifier(n_estimators = 50).fit(train_X, train_Y)
pre_gbc = gbc.predict(test_X)
print("The accuracy of the gradient boosting is", acc_score(pre_gbc, test_Y))
The accuracy of the gradient boosting is 0.8246268656716418
六. 交叉驗證
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
# Ten consecutive folds. random_state is omitted: without shuffle=True it never
# influenced the split, and scikit-learn >= 0.24 raises ValueError for that combination.
kfold = KFold(n_splits = 10)
cv_mean = []
accuracy = []
std = []
# classifiers[i] is the display name of models[i]; keep the two lists aligned.
classifiers = ['Logistic Regression', 'Linear Svm', 'Radial Svm', 'KNN',
               'Decision Tree', 'Random Forest', 'Naive Bayes', 'Gradient Boosting']
models = [LogisticRegression(), svm.SVC(kernel = 'linear'), svm.SVC(kernel = 'rbf'),
          KNeighborsClassifier(n_neighbors = 4), DecisionTreeClassifier(),
          RandomForestClassifier(n_estimators = 200), GaussianNB(), GradientBoostingClassifier(n_estimators = 50)]
for model in models:
    cv_result = cross_val_score(model, X, Y, cv = kfold, scoring = 'accuracy')
    cv_mean.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
# Summary table: mean and spread of the per-fold accuracies of each model.
new_models_dataframe = pd.DataFrame({'CV Mean': cv_mean, 'Std':std}, index = classifiers)
new_models_dataframe
# One row of per-fold accuracies per model; transposed so each model gets its own box.
accuracy_dataframe = pd.DataFrame(accuracy, index = classifiers)
plt.figure(figsize = (16, 6))
accuracy_dataframe.T.boxplot()
new_models_dataframe.sort_values('CV Mean', ascending = False).plot()
fig = plt.gcf()
fig.set_size_inches(8, 6)
plt.title("Average CV Mean Accuracy & Std")
plt.xticks(rotation = 45)
七. 混淆矩陣
from sklearn.metrics import confusion_matrix
# 3x3 grid of confusion matrices built from 10-fold cross-validated predictions.
# Every panel uses the same number format so the cells read consistently.
fig, ax = plt.subplots(3, 3, figsize = (12, 10))
pre_lr = cross_val_predict(LogisticRegression(), X, Y, cv = 10)
sns.heatmap(confusion_matrix(Y, pre_lr), ax = ax[0, 0], annot = True, fmt = '.0f')
ax[0, 0].set_title('Matrix for Logistic Regression')
pre_linear = cross_val_predict(svm.SVC(kernel = 'linear'), X, Y, cv = 10)
sns.heatmap(confusion_matrix(Y, pre_linear), ax = ax[0, 1], annot = True, fmt = '.0f')
ax[0, 1].set_title('Matrix for Linear SVM')
pre_rbf = cross_val_predict(svm.SVC(kernel = 'rbf'), X, Y, cv = 10)
sns.heatmap(confusion_matrix(Y, pre_rbf), ax = ax[0, 2], annot = True, fmt = '.0f')
ax[0, 2].set_title('Matrix for rbf SVM')
pre_knn = cross_val_predict(KNeighborsClassifier(n_neighbors = 4), X, Y, cv = 10)
sns.heatmap(confusion_matrix(Y, pre_knn), ax = ax[1, 0], annot = True, fmt = '.0f')
ax[1, 0].set_title('Matrix for KNN')
pre_dtc = cross_val_predict(DecisionTreeClassifier(), X, Y, cv = 10)
sns.heatmap(confusion_matrix(Y, pre_dtc), ax = ax[1, 1], annot = True, fmt = '.0f')
ax[1, 1].set_title('Matrix for Decision Tree')
pre_rfc = cross_val_predict(RandomForestClassifier(n_estimators = 200), X, Y, cv = 10)
sns.heatmap(confusion_matrix(Y, pre_rfc), ax = ax[1, 2], annot = True, fmt = '.0f')
ax[1, 2].set_title('Matrix for Random Forest')
pre_bayes = cross_val_predict(GaussianNB(), X, Y, cv = 10)
sns.heatmap(confusion_matrix(Y, pre_bayes), ax = ax[2, 0], annot = True, fmt = '.0f')
ax[2, 0].set_title('Matrix for Naive Bayes')
pre_GBDT = cross_val_predict(GradientBoostingClassifier(n_estimators = 50), X, Y, cv = 10)
sns.heatmap(confusion_matrix(Y, pre_GBDT), ax = ax[2, 1], annot = True, fmt = '.0f')
ax[2, 1].set_title('Matrix for Gradient Boosting')
# Only eight models are compared; hide the unused ninth panel.
ax[2, 2].axis('off')
八. 超參數調整 GridSearchCV
from sklearn.model_selection import GridSearchCV
C = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
gamma = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
kernel = ['rbf', 'linear']
# gamma only parameterizes the rbf kernel. Searching it for the linear kernel
# would refit ten identical models per C, so the linear kernel is tuned over C alone.
hyper = [{'kernel': ['rbf'], 'C': C, 'gamma': gamma},
         {'kernel': ['linear'], 'C': C}]
gridsearch = GridSearchCV(estimator = svm.SVC(), param_grid = hyper, verbose = True)
gridsearch.fit(X, Y)
print(gridsearch.best_score_)
print(gridsearch.best_params_)
0.8327721661054994
{'C': 0.8, 'gamma': 0.1, 'kernel': 'rbf'}
# Random-forest search space. max_features lists only 'sqrt': for classifiers
# 'auto' was an alias of 'sqrt' (so it doubled the search for identical models)
# and it was removed in scikit-learn 1.3.
hyper = {'n_estimators': [10, 50, 100, 200, 300, 500, 800, 1000],
         'max_depth': [None, 5, 8], 'max_features': ['sqrt'],
         'min_samples_leaf': [1, 2, 3], 'min_samples_split': [2, 3]}
grid_search = GridSearchCV(estimator = RandomForestClassifier(random_state = 42),
                           param_grid = hyper, verbose = True )
grid_search.fit(X, Y)
print(grid_search.best_score_)
grid_search.best_estimator_
0.8338945005611672
九. 集成模塊
9.1 投票分類器
from sklearn.ensemble import VotingClassifier
# Members of the soft-voting ensemble. The SVMs need probability=True because
# soft voting averages predicted class probabilities.
voters = [
    ('LR', LogisticRegression(C = 0.05)),
    ('SVM', svm.SVC(kernel = 'linear', probability = True)),
    ('SVC', svm.SVC(probability = True, kernel = 'rbf', C = 0.8, gamma = 0.1)),
    ('KNN', KNeighborsClassifier(n_neighbors = 4)),
    ('DTC', DecisionTreeClassifier(random_state = 42)),
    ('RFC', RandomForestClassifier(n_estimators = 500, max_depth = 5, min_samples_leaf = 3,
                                   min_samples_split = 2, random_state = 42)),
    ('NB', GaussianNB()),
    ('GBDT', GradientBoostingClassifier(n_estimators = 50)),
]
ensemble_lin_rbf = VotingClassifier(estimators = voters, voting = 'soft')
ensemble_lin_rbf.fit(train_X, train_Y)
print('The accuracy for ensembled model is: ', ensemble_lin_rbf.score(test_X, test_Y))
# 10-fold cross-validated accuracy of the same ensemble on the full data.
cross = cross_val_score(ensemble_lin_rbf, X, Y, cv = 10, scoring = 'accuracy')
print('The score validated score is ', cross.mean())
The accuracy for ensembled model is: 0.8171641791044776
The score validated score is 0.8282984337759618
9.2 引導聚集算法 Bagging
from sklearn.ensemble import BaggingClassifier
# Bag 500 four-neighbour KNN models. The base model is passed positionally
# because its keyword was renamed from base_estimator to estimator
# (scikit-learn 1.2) and the old name was removed in 1.4.
model = BaggingClassifier(KNeighborsClassifier(n_neighbors = 4), random_state = 42, n_estimators = 500)
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
print('The accuracy for bagged KNN is: ', acc_score(prediction, test_Y))
result = cross_val_score(model, X, Y, cv = 10, scoring = 'accuracy')
print('The cross validated score for bagged KNN is:', result.mean())
The accuracy for bagged KNN is: 0.7947761194029851
The cross validated score for bagged KNN is: 0.7957496311428895
# Bag 100 decision trees. The base model is passed positionally because its
# keyword was renamed from base_estimator to estimator (scikit-learn 1.2) and
# the old name was removed in 1.4.
model = BaggingClassifier(DecisionTreeClassifier(), random_state = 42, n_estimators = 100)
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
print('The accuracy for bagged Decision Tree is: ', acc_score(prediction, test_Y))
result = cross_val_score(model, X, Y, cv = 10, scoring = 'accuracy')
print('The cross validated score for bagged Decision Tree is: ', result.mean())
The accuracy for bagged Decision Tree is: 0.7761194029850746
The cross validated score for bagged Decision Tree is: 0.8070369424582908
9.3 自適應增強 AdaBoost
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with a small learning rate, scored by 10-fold cross-validation.
ada = AdaBoostClassifier(
    n_estimators = 200,
    learning_rate = 0.05,
    random_state = 42,
)
result = cross_val_score(ada, X, Y, cv = 10, scoring = 'accuracy')
result.mean()
0.8327553626149132
# AdaBoost search space: 100..1000 estimators in steps of 100, crossed with learning rates.
hyper = {
    'n_estimators': list(range(100, 1100, 100)),
    'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}
grid_search = GridSearchCV(
    estimator = AdaBoostClassifier(random_state = 42),
    param_grid = hyper,
    verbose = True,
)
grid_search.fit(X, Y)
print(grid_search.best_score_)
grid_search.best_estimator_
0.8327721661054994
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
learning_rate=0.05, n_estimators=200, random_state=42)
# Confusion matrix of the AdaBoost model from its 10-fold cross-validated predictions.
ada = AdaBoostClassifier(
    n_estimators = 200,
    learning_rate = 0.05,
    random_state = 42,
)
result = cross_val_predict(ada, X, Y, cv = 10)
ada_matrix = confusion_matrix(Y, result)
sns.heatmap(ada_matrix, cmap = 'autumn', annot = True, fmt = '2.0f')
9.4 梯度提升 GradientBoosting
# Gradient boosting with 500 stages, scored by 10-fold cross-validation.
grad = GradientBoostingClassifier(
    n_estimators = 500,
    learning_rate = 0.05,
    random_state = 42,
)
result = cross_val_score(grad, X, Y, cv = 10, scoring = 'accuracy')
result.mean()
0.8160265577119509
# Gradient-boosting search space: number of stages x tree depth x learning rate.
hyper = {
    'n_estimators': range(100, 1100, 100),
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}
grid_search = GridSearchCV(
    estimator = GradientBoostingClassifier(random_state = 42),
    param_grid = hyper,
    verbose = True,
)
grid_search.fit(X, Y)
print(grid_search.best_score_)
grid_search.best_estimator_
0.8282828282828283
# Confusion matrix of the best estimator found by the preceding grid search,
# from its 10-fold cross-validated predictions.
gbc = grid_search.best_estimator_
result = cross_val_predict(gbc, X, Y, cv = 10)
gbc_matrix = confusion_matrix(Y, result)
sns.heatmap(gbc_matrix, cmap = 'summer', annot = True, fmt = '.0f')
十. 特徵重要性
def _plot_importances(estimator, axis, color, title):
    """Fit ``estimator`` on X/Y, draw its sorted feature importances on ``axis`` and return it."""
    estimator.fit(X, Y)
    ranked = pd.Series(estimator.feature_importances_, X.columns).sort_values(ascending = True)
    ranked.plot.barh(width = 0.8, ax = axis, color = color)
    axis.set_title(title)
    return estimator

# One panel per tree ensemble. SVMs are left out because they expose no
# feature_importances_ attribute.
fig, ax = plt.subplots(2, 2, figsize = (16, 12))
model = _plot_importances(
    RandomForestClassifier(n_estimators = 500, max_depth = 5, min_samples_leaf = 3,
                           min_samples_split=2, random_state = 42),
    ax[0, 0], 'green', 'Feature Importance in Random Forests')
model = _plot_importances(
    AdaBoostClassifier(n_estimators = 200, learning_rate = 0.05, random_state = 42),
    ax[0, 1], '#ddff11', 'Feature Importance in AdaBoost')
model = _plot_importances(
    GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.05, max_depth = 3, random_state = 42),
    ax[1, 0], 'blue', 'Feature Importance in Gradient Boosting')
import xgboost as xgb
model = _plot_importances(
    xgb.XGBClassifier(n_estimators = 900, learning_rate = 0.1),
    ax[1, 1], 'red', 'Feature Importance in XgBoost')