1 需求
1.1 驗證曲線:
缺點:每次只能針對單一超參數進行調參,無法同時評估多個超參數的組合
# The best value of each random-forest hyperparameter is not known up front,
# so a validation curve is used to choose it.
model = se.RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    random_state=7,
)
1.2 學習曲線
功能:觀察模型在不同訓練集大小下的表現,以確定最佳的訓練集大小(即訓練集與測試集的劃分比例)
1.3 網格搜索
功能:驗證曲線只能每次獲取一個最優超參數。如果多個超參數有很多排列組合的話,就可以使用網格搜索尋求最優超參數組合。
2. 所需api
2.1 驗證曲線
import sklearn.model_selection as ms  # cross-validation utilities
# Score the model once per candidate value of a single hyperparameter.
# `param_name` and `param_range` must be passed by keyword: current
# scikit-learn releases reject them as positional arguments.
train_scores, test_scores = ms.validation_curve(
    model,  # estimator
    輸入集, 輸出集,  # input set, output set
    param_name='n_estimators',  # hyperparameter to vary
    param_range=np.arange(50, 550, 50),  # candidate values
    cv=5,  # number of folds
)
2.2 學習曲線
import sklearn.model_selection as ms  # cross-validation utilities
# Score the model once per training-set size.
# `train_sizes` must be passed by keyword: the fourth positional slot was
# `groups` in older scikit-learn releases, and positional use is rejected in
# current ones. The first return value (the training-set sizes actually used)
# is discarded here.
_, train_scores, test_scores = ms.learning_curve(
    model,  # estimator
    輸入集, 輸出集,  # input set, output set
    # Fractions of the available training data, listed in ascending order to
    # match the order in which the scores are reported.
    train_sizes=[0.7, 0.8, 0.9],
    cv=5,  # number of folds
)
2.3 網格搜索
import sklearn.model_selection as ms
# Wrap the estimator so that fitting cross-validates every hyperparameter
# combination in the grid.
model = ms.GridSearchCV(estimator=模型, param_grid=超參數組合列表, cv=摺疊數)
model.fit(輸入集, 輸出集)
# Every hyperparameter combination that was evaluated
model.cv_results_['params']
# Mean test score of each combination, in the same order as 'params'
model.cv_results_['mean_test_score']
# Best hyperparameter combination
model.best_params_
# Score achieved by the best combination
model.best_score_
# Estimator configured with the best combination
model.best_estimator_
3. 舉例
3.1 驗證曲線
import numpy as np
import sklearn.ensemble as se
import sklearn.model_selection as ms  # cross-validation utilities
import matplotlib.pyplot as mp

# Build the random forest model.
model = se.RandomForestClassifier(max_depth=9, n_estimators=144, random_state=7)

# Use a validation curve to choose the best n_estimators.
# The candidate range is named once so the scored values and the plotted
# x-axis cannot drift apart.
n_estimators_range = np.arange(140, 150, 1)
# param_name / param_range are keyword-only in current scikit-learn.
train_score, test_score = ms.validation_curve(
    model, train_x, train_y,
    param_name='n_estimators',
    param_range=n_estimators_range,
    cv=5)
# Average over the folds: one mean score per candidate value.
mean_test_score = test_score.mean(axis=1)
print(mean_test_score)

# Plot the validation curve.
mp.grid(linestyle=':')
mp.plot(n_estimators_range, mean_test_score, 'o-',
        color='dodgerblue', label='n_estimators')
mp.legend()
mp.show()
# Use a validation curve to choose the best max_depth.
# The candidate range is named once so the scored values and the plotted
# x-axis cannot drift apart.
max_depth_range = np.arange(1, 11, 1)
# param_name / param_range are keyword-only in current scikit-learn.
train_score, test_score = ms.validation_curve(
    model, train_x, train_y,
    param_name='max_depth',
    param_range=max_depth_range,
    cv=5)
# Average over the folds: one mean score per candidate value.
mean_test_score = test_score.mean(axis=1)
print(mean_test_score)

# Plot the validation curve.
import matplotlib.pyplot as mp
mp.grid(linestyle=':')
mp.plot(max_depth_range, mean_test_score, 'o-',
        color='dodgerblue', label='max_depth')
mp.legend()
mp.show()
3.2 學習曲線
import numpy as np
import sklearn.ensemble as se
import sklearn.model_selection as ms  # cross-validation utilities
import matplotlib.pyplot as mp

model = se.RandomForestClassifier(max_depth=9, n_estimators=144, random_state=7)

# Learning curve: score the model at training-set fractions 0.1 .. 0.9.
train_size = np.arange(0.1, 1.0, 0.1)
# The first return value (the training-set sizes actually used) is discarded;
# the curve is plotted against the requested fractions instead.
_, train_score, test_score = ms.learning_curve(
    model, train_x, train_y,
    train_sizes=train_size, cv=5
)
# Average over the folds: one mean score per training-set size.
test_mean = test_score.mean(axis=1)

# Plot the learning curve. The line needs a label; without one mp.legend()
# has nothing to show and only emits a warning.
mp.grid(linestyle=':')
mp.plot(train_size, test_mean, 'o-',
        color='dodgerblue', label='mean test score')
mp.legend()
mp.show()
3.3 網格搜索
import sklearn.model_selection as ms
import sklearn.svm as svm

# Support vector machine classifier tuned by grid search. The grid covers
# three kernels (linear, polynomial and RBF), each with only the
# hyperparameters listed for it.
params = [
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['poly'], 'C': [1], 'degree': [2, 3]},
    {'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001]},
]
model = ms.GridSearchCV(svm.SVC(probability=True), params, cv=5)
model.fit(train_x, train_y)

# Print every evaluated combination next to its mean test score.
for p, s in zip(model.cv_results_['params'],
                model.cv_results_['mean_test_score']):
    print(p, s)

# Hyperparameters of the best-scoring combination
print(model.best_params_)
# Best score
print(model.best_score_)
# Details of the best model
print(model.best_estimator_)