二、模型複雜度與訓練集大小
模型訓練經常出現兩類現象:過擬合(訓練誤差遠小於泛化誤差)和欠擬合(訓練誤差較高),導致這兩類現象的兩個重要因素是:模型複雜度和訓練集大小。機器學習模型應關注降低泛化誤差。
1. 訓練集大小
如果訓練集過小,特別是比模型參數數量(按元素計)更小時,過擬合更容易發生。另外,泛化誤差不會隨訓練集的增大而增大,所以我們通常希望訓練集大一些。
2. 模型複雜度
如果模型參數過多(少),則模型的複雜度會較高(低),從而導致過擬合(欠擬合)。模型複雜度對過擬合和欠擬合的影響,如下圖所示:
3.編程實驗(多項式函數擬合實驗)
3.1 模型複雜度
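設數據集中輸入特徵向量爲 $X$,輸出目標值爲 $y$,且 $X$ 和 $y$(按元素)滿足下面的數學函數關係:
$$y = 2X^3 + 3X^2 - 12X + 1 + \epsilon$$
其中噪聲項 $\epsilon$ 服從標準正態分佈 $\mathcal{N}(0, 1)$。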
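過擬合模型(五次多項式):$\hat{y} = w_1 x^5 + w_2 x^4 + w_3 x^3 + w_4 x^2 + w_5 x + b$
正常模型(三次多項式):$\hat{y} = w_1 x^3 + w_2 x^2 + w_3 x + b$
欠擬合模型(線性函數):$\hat{y} = w_1 x + w_2 + b$(其中 $w_2$ 與 $b$ 同爲常數項)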
代碼實現:
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 12 10:51:57 2020
@author: chengang
"""
import numpy as np
import matplotlib.pyplot as plt
import torch
# Seeds NumPy's global RNG only; torch.* random calls are not affected by this line.
np.random.seed(53)
batch_size = 10    # mini-batch size handed to the DataLoader
lr = 0.01          # base step size of the hand-written SGD update
num_epochs = 300   # number of passes over the training split
sample_num = 1000  # total number of generated (x, y) samples
train_num = 700    # the first train_num samples train the models; the rest are the test split
def GenerateDataset(X, y, n_train=None, n_batch=None):
    """Split (X, y) into a shuffled training loader and a held-out test set.

    Args:
        X: Feature tensor; samples are indexed along the first dimension.
        y: Target tensor with the same first dimension as X.
        n_train: Number of leading samples used for training. Defaults to the
            module-level ``train_num``.
        n_batch: Mini-batch size of the training loader. Defaults to the
            module-level ``batch_size``.

    Returns:
        A tuple ``(train_loader, X_test, y_test)``: a ``DataLoader`` over the
        first ``n_train`` samples (reshuffled on every pass), followed by the
        remaining features and targets as plain tensors.
    """
    # Resolve the defaults lazily so that editing the module constants
    # (e.g. train_num = 20 for the small-training-set experiment) still works.
    if n_train is None:
        n_train = train_num
    if n_batch is None:
        n_batch = batch_size

    train_set = torch.utils.data.TensorDataset(X[:n_train], y[:n_train])
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=n_batch, shuffle=True)
    return train_loader, X[n_train:], y[n_train:]
# overfitting model
def net1(x, w, b):
    """Evaluate the degree-5 polynomial used as the over-parameterised model.

    Args:
        x: Input values; every term is applied element-wise.
        w: Indexable holding five coefficients, highest power first
            (w[0] multiplies x**5, w[4] multiplies x).
        b: Constant (bias) term.

    Returns:
        w[0]*x**5 + w[1]*x**4 + w[2]*x**3 + w[3]*x**2 + w[4]*x + b.
    """
    return w[0] * x**5 + w[1] * x**4 + w[2] * x**3 + w[3] * x**2 + w[4] * x + b
# normal model
def net2(x, w, b):
    """Evaluate the cubic polynomial whose degree matches the generated data.

    Args:
        x: Input values; every term is applied element-wise.
        w: Indexable holding three coefficients, highest power first
            (w[0] multiplies x**3, w[2] multiplies x).
        b: Constant (bias) term.

    Returns:
        w[0]*x**3 + w[1]*x**2 + w[2]*x + b.
    """
    return w[0] * x**3 + w[1] * x**2 + w[2] * x + b
# underfitting model
def net3(x, w, b):
    """Evaluate the linear (degree-1) model used as the under-parameterised model.

    Args:
        x: Input values; the slope is applied element-wise.
        w: Indexable holding two values: w[0] is the slope, w[1] an additive
            constant.
        b: Constant (bias) term.

    Returns:
        w[0]*x + w[1] + b.
    """
    # NOTE: w[1] and b are both constant offsets, so one of them is redundant;
    # both are kept so callers can keep passing a two-element weight vector.
    return w[0] * x + w[1] + b
def _half_mse(pred, target):
    """Return sum((pred - target)**2) / (2 * n), where n = target.shape[0]."""
    return 1 / (2 * target.shape[0]) * torch.sum((pred - target) ** 2)


def _plot_loss_curves(epochs, train_curve, test_curve, title):
    """Draw train/test loss per epoch on a log-scaled y axis in a new figure."""
    _, axes = plt.subplots()
    axes.semilogy(epochs, train_curve, 'r-', label='train loss')
    axes.semilogy(epochs, test_curve, 'g:', label='test loss')
    axes.legend()
    axes.set_title(title)
    axes.set_xlabel('epoch')
    axes.set_ylabel('log(loss)')


if __name__ == '__main__':
    # np.random.seed() only seeds NumPy. The label noise and the initial
    # parameters come from torch.randn, so torch needs its own seed for the
    # experiment to be repeatable.
    torch.manual_seed(53)

    # Synthetic data: y = 2x^3 + 3x^2 - 12x + 1 + standard-normal noise.
    X_data = torch.from_numpy(np.random.randn(sample_num, 1).astype(np.float32))
    y_data = (2 * torch.pow(X_data, 3) + 3 * torch.pow(X_data, 2)
              - 12 * X_data + 1 + torch.randn(X_data.shape))
    train_dataset, X_test, y_test = GenerateDataset(X_data, y_data)

    # (title, model function, weights, bias) for the overfitting, normal and
    # underfitting models. The weight length matches what each net indexes.
    models = [
        ('overfitting model', net1,
         torch.randn(5, requires_grad=True), torch.randn(1, requires_grad=True)),
        ('normal model', net2,
         torch.randn(3, requires_grad=True), torch.randn(1, requires_grad=True)),
        ('underfitting model', net3,
         torch.randn(2, requires_grad=True), torch.randn(1, requires_grad=True)),
    ]
    train_curves = [[] for _ in models]
    test_curves = [[] for _ in models]

    for _ in range(num_epochs):
        epoch_sums = [0.0] * len(models)
        num_batches = 0
        for X, y in train_dataset:
            n = y.shape[0]
            num_batches += 1
            for k, (_, net, w, b) in enumerate(models):
                batch_loss = _half_mse(net(X, w, b), y)
                epoch_sums[k] += batch_loss.item()
                batch_loss.backward()
                # Plain SGD step. The loss is already averaged over the batch,
                # so dividing by n again shrinks the effective step to lr / n.
                # This is kept as in the original script to leave the training
                # dynamics unchanged.
                with torch.no_grad():
                    w -= lr / n * w.grad
                    b -= lr / n * b.grad
                # Gradients accumulate across backward() calls; reset them.
                w.grad.zero_()
                b.grad.zero_()

        # Per-epoch bookkeeping: mean training loss over the batches and the
        # loss on the held-out split (no graph needed for evaluation).
        with torch.no_grad():
            for k, (_, net, w, b) in enumerate(models):
                train_curves[k].append(epoch_sums[k] / num_batches)
                test_curves[k].append(_half_mse(net(X_test, w, b), y_test).item())

    t = np.arange(1, num_epochs + 1)
    for (title, _, _, _), train_curve, test_curve in zip(models, train_curves, test_curves):
        _plot_loss_curves(t, train_curve, test_curve, title)
    plt.show()
運行結果:
過擬合模型:
正常擬合模型:
欠擬合模型:
3.2 訓練集大小
當訓練集過小時,容易產生過擬合現象,將上面代碼中train_num = 700改爲train_num = 20,可以看到本來正常擬合的模型變成了過擬合,如圖2.5所示:
4. 解決過擬合方法
4.1 範數懲罰項
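$L_2$ 範數懲罰項指的是模型中所有權重參數的平方的和與一個正的常數的乘積。在模型原損失函數的基礎上添加範數懲罰項,可以有效地抑制過擬合現象,即
$$\ell(\mathbf{w}, b) + \frac{\lambda}{2n} \|\mathbf{w}\|^2$$
其中 $\ell(\mathbf{w}, b)$ 爲原損失函數,$\lambda > 0$ 爲懲罰係數,$n$ 爲樣本數。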
範數懲罰項通過懲罰絕對值較大的模型參數爲需要學習的模型增加了限制,範數懲罰項又叫做權重衰減。
代碼驗證如下:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 14 12:01:25 2020
@author: chengang
"""
import torch
import numpy as np
import matplotlib.pyplot as plt
num_feature = 100  # input dimension: columns of X and rows of every weight vector
num_epoch = 100    # number of passes over the training split
lr = 0.01          # base step size of the hand-written SGD update
batch_size = 1     # mini-batch size handed to the DataLoader
num_train = 20     # training samples (deliberately fewer than num_feature)
num_test = 50      # held-out samples used for the test loss
# Seeds NumPy's global RNG only; torch.* random calls are not affected by this line.
np.random.seed(53)
def L2_norm(params, k, n):
    """Return the squared-L2 penalty k / (2 * n) * sum(params ** 2).

    Args:
        params: Parameters to penalise; must support ``** 2`` and ``.sum()``.
        k: Penalty coefficient.
        n: Number of samples the penalty is normalised by.

    Returns:
        The scalar penalty term to add to the data loss.
    """
    return k / (2 * n) * (params**2).sum()
def net(X, w, b):
    """Linear model: return ``X @ w + b``.

    Args:
        X: Input tensor whose last dimension matches the first dimension of w.
        w: Weight tensor.
        b: Bias, broadcast against the matrix product.

    Returns:
        ``torch.matmul(X, w) + b``.
    """
    return torch.matmul(X, w) + b
def loss(y_pred, y):
    """Halved mean squared error: sum((y_pred - y)**2) / (2 * n).

    Args:
        y_pred: Predictions.
        y: Targets with the same number of elements as ``y_pred``. The layout
            may differ, e.g. shape (n,) against predictions of shape (n, 1).

    Returns:
        Scalar tensor, where n is the size of the first dimension.

    Raises:
        RuntimeError: If ``y`` cannot be reshaped to ``y_pred.shape``.
    """
    # Align the target with the prediction first. Otherwise an (n, 1)
    # prediction minus an (n,) target broadcasts to an (n, n) matrix and the
    # loss silently sums n*n cross terms instead of n residuals.
    y = y.reshape(y_pred.shape)
    return 1 / (2 * y.shape[0]) * ((y_pred - y)**2).sum()
if __name__ == '__main__':
    # np.random.seed() only seeds NumPy; X below is drawn with torch.randn,
    # so torch needs its own seed for the experiment to be repeatable.
    torch.manual_seed(53)

    def _random_param(rows, cols):
        """Return a trainable float32 tensor filled with NumPy normal samples."""
        values = np.random.randn(rows, cols).astype(np.float32)
        return torch.from_numpy(values).requires_grad_(True)

    # Model 1 is trained without a penalty, model 2 with the L2 penalty.
    w1 = _random_param(num_feature, 1)
    b1 = _random_param(1, 1)
    w2 = _random_param(num_feature, 1)
    b2 = _random_param(1, 1)
    weight_decay = 0.2  # penalty coefficient passed to L2_norm for model 2

    # Ground truth: y = X @ (0.1 * 1) + 0.5 + N(0, 0.1^2) noise.
    num_sample = num_train + num_test
    true_w = 0.1 * torch.ones((num_feature, 1))
    true_b = 0.5 * torch.ones((1, 1))
    X = torch.randn((num_sample, num_feature), dtype=torch.float32)
    noise = torch.from_numpy(
        np.random.normal(0.0, 0.1, size=(num_sample, 1)).astype(np.float32))
    y = torch.matmul(X, true_w) + true_b + noise

    # Keep the targets as (n, 1) columns so they line up with the (n, 1)
    # predictions. Slicing them to 1-D made `y_pred - y` broadcast to (n, n)
    # on the test split.
    X_train, y_train = X[:num_train], y[:num_train]
    X_test, y_test = X[num_train:num_sample], y[num_train:num_sample]

    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_iter = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    train_loss1 = []
    train_loss2 = []
    test_loss1 = []
    test_loss2 = []
    for _ in range(num_epoch):
        train_loss_sum1 = 0.0
        train_loss_sum2 = 0.0
        cnt = 0
        for X_batch, y_batch in train_iter:
            n = y_batch.shape[0]
            y_pred1 = net(X_batch, w1, b1)
            y_pred2 = net(X_batch, w2, b2)
            l1 = loss(y_pred1, y_batch)
            # The recorded training loss of model 2 includes the penalty term.
            l2 = loss(y_pred2, y_batch) + L2_norm(w2, weight_decay, y_pred2.shape[0])
            train_loss_sum1 += l1.item()
            train_loss_sum2 += l2.item()
            cnt += 1

            l1.backward()
            l2.backward()
            # SGD step scaled by the batch size n. The bias step previously
            # divided by y_batch[0], i.e. by the target value itself.
            with torch.no_grad():
                for param in (w1, b1, w2, b2):
                    param -= lr / n * param.grad
                    param.grad.zero_()

        train_loss1.append(train_loss_sum1 / cnt)
        train_loss2.append(train_loss_sum2 / cnt)
        # Evaluate without building a graph and store plain floats so the
        # histories can be handed to NumPy for plotting.
        with torch.no_grad():
            test_loss1.append(loss(net(X_test, w1, b1), y_test).item())
            test_loss2.append(loss(net(X_test, w2, b2), y_test).item())

    fig, axes = plt.subplots(1, 2)
    t = np.arange(1, num_epoch + 1)
    histories = (
        (train_loss1, test_loss1, 'Not Add L2 Regular Penalty'),
        (train_loss2, test_loss2, 'Add L2 Regular Penalty'),
    )
    for ax, (train_hist, test_hist, title) in zip(axes, histories):
        ax.plot(t, np.log(train_hist), 'r-', label='train loss')
        ax.plot(t, np.log(test_hist), 'b:', label='test loss')
        ax.legend()
        ax.set_title(title)
    plt.show()
運行結果:
4.2 丟棄法(dropout)
對於如下圖所示多層感知機,對於隱藏層中的隱藏單元 $h_i$,有計算公式:
$$h_i = \phi\left(\sum_{j} x_j w_{ji} + b_i\right)$$
其中,$\phi$ 爲激活函數,$x_j$ 爲輸入,$w_{ji}$ 爲隱藏單元 $i$ 的權重參數,$b_i$ 爲偏置參數。
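現在對該隱藏層使用丟棄法(dropout),該層的隱藏單元將有一定概率被丟棄掉。設丟棄概率爲 $p$,則有 $p$ 的概率使得 $h_i$ 被清零,有 $1-p$ 的概率使得 $h_i$ 變爲 $h_i/(1-p)$。事實上,我們設隨機變量 $\xi_i$ 爲 0 和 1 的概率分別爲 $p$ 和 $1-p$,則有
$$h_i' = \frac{\xi_i}{1-p} h_i$$
因爲 $E(\xi_i) = 1-p$,所以
$$E(h_i') = \frac{E(\xi_i)}{1-p} h_i = h_i$$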
可知丟棄法不改變輸入的期望值。由於在訓練中隱藏層神經元的丟棄是隨機的,即 $h_1, \ldots, h_n$ 都有可能被清零,輸出層的計算無法過度依賴 $h_1, \ldots, h_n$ 中的任一個,從而在訓練模型時起到了正則化的作用,可以用來應對過擬合。丟棄法只在訓練時使用,測試時不做丟棄。
代碼驗證:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 14 16:32:22 2020
@author: chengang
"""
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
num_input = 30    # input dimension: in_features of the first nn.Linear
num_hidden = 50   # width of the hidden layer
num_output = 1    # output dimension: out_features of the last nn.Linear
num_sample = 50   # total generated samples (the script trains on the first 20)
batch_size = 1    # mini-batch size handed to the DataLoader
num_epoch = 100   # number of passes over the training split
# Seeds NumPy's global RNG only; torch.* random calls are not affected by this line.
np.random.seed(123)
if __name__ == '__main__':
    # np.random.seed() only seeds NumPy. The label noise, the nn.Linear
    # initialisation and the dropout masks all use torch's RNG, so seed it too.
    torch.manual_seed(123)

    num_train = 20  # leading samples used for training; the rest are held out

    X = torch.from_numpy(np.random.randn(num_sample, num_input).astype(np.float32))
    # Per-sample target: 0.1 * (sum of that row's features) + 0.5 + noise.
    # A bare X.sum() collapses the whole matrix to one scalar, which made every
    # target the same constant plus noise.
    y = 0.1 * X.sum(dim=1, keepdim=True) + 0.5 + 0.05 * torch.randn((num_sample, 1)).float()

    train_dataset = torch.utils.data.TensorDataset(X[:num_train], y[:num_train])
    train_iter = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    X_test = X[num_train:]
    y_test = y[num_train:]

    # Two identical two-layer networks; only net2 drops hidden units.
    # NOTE(review): there is no activation between the two Linear layers, so
    # both networks are still linear in the input. Confirm this is intended.
    net1 = nn.Sequential(
        nn.Linear(num_input, num_hidden),
        nn.Linear(num_hidden, num_output),
    )
    net2 = nn.Sequential(
        nn.Linear(num_input, num_hidden),
        nn.Dropout(p=0.5),
        nn.Linear(num_hidden, num_output),
    )
    optimizer1 = torch.optim.SGD(net1.parameters(), lr=0.01)
    optimizer2 = torch.optim.SGD(net2.parameters(), lr=0.01)
    loss1 = torch.nn.MSELoss()
    loss2 = torch.nn.MSELoss()

    train_loss1 = []
    train_loss2 = []
    test_loss1 = []
    test_loss2 = []
    for _ in range(num_epoch):
        # Training mode: Dropout randomly zeroes hidden units.
        net1.train()
        net2.train()
        train_loss_sum1 = 0.0
        train_loss_sum2 = 0.0
        cnt = 0
        for X_batch, y_batch in train_iter:
            l1 = loss1(net1(X_batch), y_batch)
            l2 = loss2(net2(X_batch), y_batch)
            train_loss_sum1 += l1.item()
            train_loss_sum2 += l2.item()
            cnt += 1

            optimizer1.zero_grad()
            optimizer2.zero_grad()
            l1.backward()
            l2.backward()
            optimizer1.step()
            optimizer2.step()
        train_loss1.append(train_loss_sum1 / cnt)
        train_loss2.append(train_loss_sum2 / cnt)

        # Evaluation mode turns Dropout into the identity. Without it the test
        # loss of net2 is measured on a randomly thinned network.
        net1.eval()
        net2.eval()
        with torch.no_grad():
            test_loss1.append(loss1(net1(X_test), y_test).item())
            test_loss2.append(loss2(net2(X_test), y_test).item())

    fig, axes = plt.subplots(1, 2)
    t = np.arange(1, num_epoch + 1)
    histories = (
        (train_loss1, test_loss1, 'Not Add Dropout'),
        (train_loss2, test_loss2, 'Add Dropout'),
    )
    for ax, (train_hist, test_hist, title) in zip(axes, histories):
        ax.plot(t, train_hist, 'r-', label='train loss')
        ax.plot(t, test_hist, 'b:', label='test loss')
        ax.legend()
        ax.set_title(title)
    plt.show()
運行結果: