二、模型复杂度与训练集大小
模型训练经常出现两类现象:过拟合(训练误差远小于泛化误差)和欠拟合(训练误差较高),导致这两类现象的两个重要因素是:模型复杂度和训练集大小。机器学习模型应关注降低泛化误差。
1. 训练集大小
如果训练集过小,特别是比模型参数数量(按元素计)更小时,过拟合更容易发生。另外,泛化误差不会随训练集的增大而增大,所以我们通常希望训练集大一些。
2. 模型复杂度
如果模型参数过多(少),则模型的复杂度会较高(低),从而导致过拟合(欠拟合)。模型复杂度对过拟合和欠拟合的影响,如下图所示:
3.编程实验(多项式函数拟合实验)
3.1 模型复杂度
设数据集中输入特征向量为 $X$,输出目标值为 $y$,且 $X$ 和 $y$ 满足下面数学函数关系(与下方代码中的数据生成一致):$y = 2X^3 + 3X^2 - 12X + 1 + \epsilon$,其中噪声 $\epsilon \sim N(0, 1)$。
过拟合模型:
正常模型:
欠拟合模型:
代码实现:
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 12 10:51:57 2020
@author: chengang
"""
import numpy as np
import matplotlib.pyplot as plt
import torch
np.random.seed(53)
batch_size = 10
lr = 0.01
num_epochs = 300
sample_num = 1000
train_num = 700
def GenerateDataset(X, y):
train_dataset = torch.utils.data.TensorDataset(X[:train_num], y[:train_num])
train_dataset = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
X_test, y_test = X[train_num:], y[train_num:]
return train_dataset, X_test, y_test
# overfitting model
def net1(x, w, b):
return w[0] * x**5 + w[1] * x**4 + w[2] * x**3 + w[3] * x**2 + w[4] * x + b
# normal model
def net2(x, w, b):
return w[0] * x**3 + w[1] * x**2 + w[2] * x + b
# underfitting model
def net3(x, w, b):
return w[0] * x + w[1] + b
if __name__ == '__main__':
X_data = torch.from_numpy(np.random.randn(sample_num, 1).astype(np.float32))
y_data = 2 * torch.pow(X_data, 3) + 3 * torch.pow(X_data, 2) - 12 * X_data + 1 + torch.randn(X_data.shape)
train_dataset, X_test, y_test = GenerateDataset(X_data, y_data)
# parameters for overfitting model, normal model, underfitting model
w1 = torch.randn(5, requires_grad = True)
b1 = torch.randn(1, requires_grad = True)
w2 = torch.randn(3, requires_grad = True)
b2 = torch.randn(1, requires_grad = True)
w3 = torch.randn(2, requires_grad = True)
b3 = torch.randn(1, requires_grad = True)
trainloss1 = []
trainloss2 = []
trainloss3 = []
testloss1 = []
testloss2 = []
testloss3 = []
for epoch in range(num_epochs):
cur_train_loss1 = 0.0
cur_train_loss2 = 0.0
cur_train_loss3 = 0.0
cnt = 0
for X, y in train_dataset:
y_pred1 = net1(X, w1, b1)
y_pred2 = net2(X, w2, b2)
y_pred3 = net3(X, w3, b3)
loss1 = 1 / (2 * y.shape[0]) * torch.sum((y_pred1 - y)**2)
loss2 = 1 / (2 * y.shape[0]) * torch.sum((y_pred2 - y)**2)
loss3 = 1 / (2 * y.shape[0]) * torch.sum((y_pred3 - y)**2)
cur_train_loss1 += loss1.item()
cur_train_loss2 += loss2.item()
cur_train_loss3 += loss3.item()
cnt += 1
loss1.backward()
loss2.backward()
loss3.backward()
w1.data -= lr / y.shape[0] * w1.grad
b1.data -= lr / y.shape[0] * b1.grad
w2.data -= lr / y.shape[0] * w2.grad
b2.data -= lr / y.shape[0] * b2.grad
w3.data -= lr / y.shape[0] * w3.grad
b3.data -= lr / y.shape[0] * b3.grad
w1.grad.data.zero_()
b1.grad.data.zero_()
w2.grad.data.zero_()
b2.grad.data.zero_()
w3.grad.data.zero_()
b3.grad.data.zero_()
trainloss1.append(cur_train_loss1 / cnt)
trainloss2.append(cur_train_loss2 / cnt)
trainloss3.append(cur_train_loss3 / cnt)
with torch.no_grad():
y_pred1 = net1(X_test, w1, b1)
y_pred2 = net2(X_test, w2, b2)
y_pred3 = net3(X_test, w3, b3)
loss_test1 = 1/(2 * y_test.shape[0]) * torch.sum((y_pred1 - y_test)**2)
loss_test2 = 1/(2 * y_test.shape[0]) * torch.sum((y_pred2 - y_test)**2)
loss_test3 = 1/(2 * y_test.shape[0]) * torch.sum((y_pred3 - y_test)**2)
testloss1.append(loss_test1.item())
testloss2.append(loss_test2.item())
testloss3.append(loss_test3.item())
_, axes = plt.subplots()
t = np.arange(1, num_epochs + 1)
axes.semilogy(t, trainloss1, 'r-', t, testloss1, 'g:')
axes.set_title('overfitting model')
axes.set_xlabel('epoch')
axes.set_ylabel('log(loss)')
_, axes = plt.subplots()
axes.semilogy(t, trainloss2, 'r-', t, testloss2, 'g:')
axes.set_title('normal model')
axes.set_xlabel('epoch')
axes.set_ylabel('log(loss)')
_, axes = plt.subplots()
axes.semilogy(t, trainloss3, 'r-', t, testloss3, 'g:')
axes.set_title('underfitting model')
axes.set_xlabel('epoch')
axes.set_ylabel('log(loss)')
plt.show()
运行结果:
过拟合模型:
正常拟合模型:
欠拟合模型:
3.2 训练集大小
当训练集过小时,容易产生过拟合现象,将上面代码中train_num = 700改为train_num = 20,可以看到本来正常拟合的模型变成了过拟合,如图2.5所示:
4. 解决过拟合方法
4.1 范数惩罚项
L2范数惩罚项指的是模型中所有权重参数的平方的和与一个正的常数的乘积。在模型原损失函数的基础上添加L2范数惩罚项,可以有效地抑制过拟合现象,即
L2范数惩罚项通过惩罚绝对值较大的模型参数,为需要学习的模型增加了限制;这种方法又叫做权重衰减(weight decay)。
代码验证如下:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 14 12:01:25 2020
@author: chengang
"""
import torch
import numpy as np
import matplotlib.pyplot as plt
num_feature = 100
num_epoch = 100
lr = 0.01
batch_size = 1
num_train = 20
num_test = 50
np.random.seed(53)
def L2_norm(params, k, n):
return k / (2 * n) * (params**2).sum()
def net(X, w, b):
return torch.matmul(X, w) + b
def loss(y_pred, y):
return 1 / (2 * y.shape[0]) * ((y_pred - y)**2).sum()
if __name__ == '__main__':
w1 = torch.from_numpy(np.random.randn(num_feature, 1).astype(np.float32))
b1 = torch.from_numpy(np.random.randn(1, 1).astype(np.float32))
w1.requires_grad_(True)
b1.requires_grad_(True)
w2 = torch.from_numpy(np.random.randn(num_feature, 1).astype(np.float32))
b2 = torch.from_numpy(np.random.randn(1, 1).astype(np.float32))
w2.requires_grad_(True)
b2.requires_grad_(True)
true_w = 0.1 * torch.ones((num_feature, 1))
true_b = 0.5 * torch.ones((1, 1))
X = torch.randn((num_train + num_test, num_feature), dtype = torch.float32)
y = torch.matmul(X, true_w) + true_b + torch.from_numpy(np.random.normal(0.0, 0.1, size = (num_train + num_test, 1)).astype(np.float32))
X_train = X[ : num_train, 0 : num_feature]
y_train = y[ : num_train, 0]
X_test = X[num_train : (num_train + num_test), 0 : num_feature]
y_test = y[num_train : (num_train + num_test), 0]
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_iter = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
train_loss1 = []
train_loss2 = []
test_loss1 = []
test_loss2 = []
for epoch in range(num_epoch):
train_loss_sum1 = 0.0
train_loss_sum2 = 0.0
cnt = 0
for X_batch, y_batch in train_iter:
y_pred1 = net(X_batch, w1, b1)
y_pred2 = net(X_batch, w2, b2)
l1 = loss(y_pred1, y_batch)
l2 = loss(y_pred2, y_batch) + L2_norm(w2, 0.2, y_pred2.shape[0])
train_loss_sum1 += l1.item()
train_loss_sum2 += l2.item()
cnt += 1
l1.backward()
l2.backward()
w1.data -= lr / y_batch.shape[0] * w1.grad
b1.data -= lr / y_batch[0] * b1.grad
w1.grad.data.zero_()
b1.grad.data.zero_()
w2.data -= lr / y_batch.shape[0] * w2.grad
b2.data -= lr / y_batch[0] * b2.grad
w2.grad.data.zero_()
b2.grad.data.zero_()
train_loss1.append(train_loss_sum1 / cnt)
train_loss2.append(train_loss_sum2 / cnt)
test_loss1.append(loss(net(X_test, w1, b1), y_test))
test_loss2.append(loss(net(X_test, w2, b2), y_test))
fig, axes = plt.subplots(1, 2)
t = np.arange(1, num_epoch + 1)
axes[0].plot(t, np.log(train_loss1), 'r-', label = 'train loss')
axes[0].plot(t, np.log(test_loss1), 'b:', label = 'test loss')
axes[0].legend()
axes[0].set_title('Not Add L2 Regular Penalty')
axes[1].plot(t, np.log(train_loss2), 'r-', label = 'train loss')
axes[1].plot(t, np.log(test_loss2), 'b:', label = 'test loss')
axes[1].legend()
axes[1].set_title('Add L2 Regular Penalty')
plt.show()
运行结果:
4.2 丢弃法(dropout)
对于如下图所示多层感知机,对于隐藏层中的隐藏单元,有计算公式:
其中,$\phi$ 为激活函数,$x_i$ 为输入,$w_{ij}$ 为隐藏单元的权重参数,$b_j$ 为偏置参数。
现在对该隐藏层使用丢弃法(dropout),该层的隐藏单元将有一定概率被丢弃掉。设丢弃概率为 $p$,则有 $p$ 的概率使得 $h_i$ 变为 $0$,有 $1-p$ 的概率使得 $h_i$ 变为 $\frac{h_i}{1-p}$。事实上,我们设随机变量 $\xi_i$ 取 $0$ 和 $1$ 的概率分别为 $p$ 和 $1-p$,则丢弃后的隐藏单元可写作 $h_i' = \frac{\xi_i}{1-p} h_i$。
因为 $E(\xi_i) = 1-p$,所以 $E(h_i') = \frac{E(\xi_i)}{1-p} h_i = h_i$。
可知丢弃法不改变输入的期望值。由于在训练中隐藏层神经元的丢弃是随机的,即任一隐藏单元 $h_i$ 都有可能被清零,输出层的计算无法过度依赖其中的任一个,从而在训练模型时起到了正则化的作用,可以用来应对过拟合。
代码验证:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 14 16:32:22 2020
@author: chengang
"""
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
num_input = 30
num_hidden = 50
num_output = 1
num_sample = 50
batch_size = 1
num_epoch = 100
np.random.seed(123)
if __name__ == '__main__':
X = torch.from_numpy(np.random.randn(num_sample, num_input).astype(np.float32))
y = 0.1 * X.sum() + 0.5 + 0.05 * torch.randn((num_sample, 1)).float()
train_dataset = torch.utils.data.TensorDataset(X[ : 20], y[ : 20])
train_iter = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
X_test = X[20 : ]
y_test = y[20 : ]
net1 = nn.Sequential(
nn.Linear(num_input, num_hidden),
nn.Linear(num_hidden, num_output)
)
net2 = nn.Sequential(
nn.Linear(num_input, num_hidden),
nn.Dropout(p = 0.5),
nn.Linear(num_hidden, num_output)
)
optimizer1 = torch.optim.SGD(net1.parameters(), lr = 0.01)
optimizer2 = torch.optim.SGD(net2.parameters(), lr = 0.01)
loss1 = torch.nn.MSELoss()
loss2 = torch.nn.MSELoss()
train_loss1 = []
train_loss2 = []
test_loss1 = []
test_loss2 = []
for epoch in range(num_epoch):
train_loss_sum1 = 0.0
train_loss_sum2 = 0.0
cnt = 0
for X_batch, y_batch in train_iter:
y_pred1 = net1(X_batch)
y_pred2 = net2(X_batch)
l1 = loss1(y_pred1, y_batch)
l2 = loss2(y_pred2, y_batch)
train_loss_sum1 += l1.item()
train_loss_sum2 += l2.item()
cnt += 1
optimizer1.zero_grad()
optimizer2.zero_grad()
l1.backward()
l2.backward()
optimizer1.step()
optimizer2.step()
train_loss1.append(train_loss_sum1 / cnt)
train_loss2.append(train_loss_sum2 / cnt)
with torch.no_grad():
test_loss1.append(loss1(net1(X_test), y_test).item())
test_loss2.append(loss2(net2(X_test), y_test).item())
fig, axes = plt.subplots(1, 2)
t = np.arange(1, num_epoch + 1)
axes[0].plot(t, train_loss1, 'r-', label = 'train loss')
axes[0].plot(t, test_loss1, 'b:', label = 'test loss')
axes[0].legend()
axes[0].set_title('Not Add Dropout')
axes[1].plot(t, train_loss2, 'r-', label = 'train loss')
axes[1].plot(t, test_loss2, 'b:', label = 'test loss')
axes[1].legend()
axes[1].set_title('Add Dropout')
plt.show()
运行结果: