# 《動手學習深度學習》筆記之模型複雜度與訓練集大小

## 二、模型複雜度與訓練集大小

### 3.編程實驗（多項式函數擬合實驗）

#### 3.1 模型複雜度

$y = 2x^3 + 3x^2 - 12x + 1 + \epsilon$，其中噪音 $\epsilon \sim N(0,1)$

$y = w_0 x^5 + w_1 x^4 + w_2 x^3 + w_3 x^2 + w_4 x + b$

$y = w_0 x^3 + w_1 x^2 + w_2 x + b$

$y = w_0 x + b$

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import torch

# Seed NumPy's global RNG so the NumPy-sampled inputs repeat across runs.
np.random.seed(53)

# Optimisation settings.
lr = 0.01
num_epochs = 300
batch_size = 10

# Dataset size and the number of leading samples used for training.
sample_num = 1000
train_num = 700

def GenerateDataset(X, y):
    """Split the samples into a shuffled training loader and a test set.

    The first ``train_num`` rows become the training set, wrapped in a
    ``DataLoader`` that yields shuffled mini-batches of ``batch_size``.
    Every remaining row is returned untouched as the test set.

    Args:
        X: Input samples, indexed along the first dimension.
        y: Targets aligned with ``X``.

    Returns:
        A tuple ``(train_loader, X_test, y_test)``.
    """
    train_pairs = torch.utils.data.TensorDataset(X[:train_num], y[:train_num])
    train_loader = torch.utils.data.DataLoader(
        train_pairs, batch_size=batch_size, shuffle=True
    )
    return train_loader, X[train_num:], y[train_num:]

# overfitting model
def net1(x, w, b):
    """Degree-5 polynomial model (more capacity than the data needs).

    Args:
        x: Input value(s).
        w: Indexable holding five coefficients, highest power first.
        b: Constant term.

    Returns:
        ``w[0]*x**5 + w[1]*x**4 + w[2]*x**3 + w[3]*x**2 + w[4]*x + b``.
    """
    quintic = w[0] * x**5
    quartic = w[1] * x**4
    cubic = w[2] * x**3
    quadratic = w[3] * x**2
    linear = w[4] * x
    return quintic + quartic + cubic + quadratic + linear + b

# normal model
def net2(x, w, b):
    """Cubic polynomial model.

    Args:
        x: Input value(s).
        w: Indexable holding three coefficients, highest power first.
        b: Constant term.

    Returns:
        ``w[0]*x**3 + w[1]*x**2 + w[2]*x + b``.
    """
    cubic = w[0] * x**3
    quadratic = w[1] * x**2
    linear = w[2] * x
    return cubic + quadratic + linear + b

# underfitting model
def net3(x, w, b):
    """Straight-line model (too little capacity for cubic data).

    Note that ``w[1]`` is not multiplied by ``x``: it is simply a second
    constant offset added next to ``b``.

    Args:
        x: Input value(s).
        w: Indexable with two entries: the slope and an extra offset.
        b: Constant term.

    Returns:
        ``w[0]*x + w[1] + b``.
    """
    slope_term = w[0] * x
    extra_offset = w[1]
    return slope_term + extra_offset + b

if __name__ == '__main__':
    # Synthetic data: y = 2x^3 + 3x^2 - 12x + 1 + noise, where x and the
    # noise are both drawn from a standard normal distribution.
    X_data = torch.from_numpy(np.random.randn(sample_num, 1).astype(np.float32))
    y_data = (2 * torch.pow(X_data, 3) + 3 * torch.pow(X_data, 2)
              - 12 * X_data + 1 + torch.randn(X_data.shape))
    train_dataset, X_test, y_test = GenerateDataset(X_data, y_data)

    # parameters for overfitting model, normal model, underfitting model
    w1 = torch.randn(5, requires_grad=True)
    b1 = torch.randn(1, requires_grad=True)
    w2 = torch.randn(3, requires_grad=True)
    b2 = torch.randn(1, requires_grad=True)
    w3 = torch.randn(2, requires_grad=True)
    b3 = torch.randn(1, requires_grad=True)

    # One (forward function, weights, bias) triple per model; the three
    # models are trained side by side on exactly the same mini-batches.
    models = ((net1, w1, b1), (net2, w2, b2), (net3, w3, b3))
    titles = ('overfitting model', 'normal model', 'underfitting model')
    train_curves = [[] for _ in models]
    test_curves = [[] for _ in models]

    for epoch in range(num_epochs):
        epoch_loss = [0.0 for _ in models]
        cnt = 0
        for X, y in train_dataset:
            n = y.shape[0]
            for i, (forward, w, b) in enumerate(models):
                # Halved mean squared error over the mini-batch.
                batch_loss = 1 / (2 * n) * torch.sum((forward(X, w, b) - y) ** 2)
                epoch_loss[i] += batch_loss.item()
                batch_loss.backward()

                # Plain SGD step. The loss is already a batch mean, so the
                # extra division by n only shrinks the effective step size;
                # it is kept so the original hyper-parameters still apply.
                with torch.no_grad():
                    w -= lr / n * w.grad
                    b -= lr / n * b.grad

                # backward() adds into .grad, so the gradients must be
                # cleared after every step; otherwise each update would use
                # the running sum of all previous gradients.
                w.grad.zero_()
                b.grad.zero_()
            cnt += 1

        # Record the mean training loss of this epoch and the loss on the
        # held-out test set. No graph is needed for evaluation.
        with torch.no_grad():
            n_test = y_test.shape[0]
            for i, (forward, w, b) in enumerate(models):
                train_curves[i].append(epoch_loss[i] / cnt)
                test_loss = 1 / (2 * n_test) * torch.sum(
                    (forward(X_test, w, b) - y_test) ** 2)
                test_curves[i].append(test_loss.item())

    # One figure per model: training loss (red, solid) against test loss
    # (green, dotted) on a logarithmic y axis.
    t = np.arange(1, num_epochs + 1)
    for title, train_curve, test_curve in zip(titles, train_curves, test_curves):
        _, axes = plt.subplots()
        axes.semilogy(t, train_curve, 'r-', t, test_curve, 'g:')
        axes.set_title(title)
        axes.set_xlabel('epoch')
        axes.set_ylabel('log(loss)')
    plt.show()


#### 4.1 $L_{2}$範數懲罰項

$L_{2}$範數懲罰項指的是模型中所有權重參數的平方的和與一個正的常數的乘積。在模型原損失函數的基礎上添加$L_{2}$範數懲罰項，可以有效地抑制過擬合現象，即
$loss2 = loss1 + \lambda||W||^{2}$
$L_{2}$範數懲罰項通過懲罰絕對值較大的模型參數爲需要學習的模型增加了限制，$L_{2}$範數懲罰項又叫做權重衰減。

# -*- coding: utf-8 -*-
import torch
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so the NumPy-sampled values repeat across runs.
np.random.seed(53)

# Problem size: input dimensionality and number of training / test samples.
num_feature = 100
num_train = 20
num_test = 50

# Optimisation settings.
num_epoch = 100
batch_size = 1
lr = 0.01

def L2_norm(params, k, n):
    """Return the L2 penalty ``k / (2 * n) * sum(params ** 2)``.

    Args:
        params: Array-like supporting ``** 2`` and ``.sum()``.
        k: Penalty strength.
        n: Number of samples the penalty is averaged over.

    Returns:
        The scaled sum of squared entries of ``params``.
    """
    scale = k / (2 * n)
    squared_total = (params**2).sum()
    return scale * squared_total

def net(X, w, b):
    """Linear regression model: return ``X @ w + b``.

    Args:
        X: Input matrix with one sample per row.
        w: Weight column with one row per input feature.
        b: Bias, broadcast over the rows of the result.

    Returns:
        The matrix product of ``X`` and ``w`` with ``b`` added.
    """
    return torch.matmul(X, w) + b

def loss(y_pred, y):
    """Return the halved mean squared error between prediction and target.

    The squared differences are summed over every element and divided by
    ``2 * y.shape[0]``.

    Args:
        y_pred: Predicted values.
        y: Target values; its first dimension is used as the sample count.

    Returns:
        ``sum((y_pred - y) ** 2) / (2 * y.shape[0])``.
    """
    scale = 1 / (2 * y.shape[0])
    squared_error = (y_pred - y)**2
    return scale * squared_error.sum()

if __name__ == '__main__':
    # Two independently initialised linear models: model 1 is trained on the
    # plain loss, model 2 on the loss plus an L2 penalty on its weights.
    # requires_grad_() is essential: tensors created by from_numpy do not
    # track gradients, so backward() would fail without it.
    w1 = torch.from_numpy(np.random.randn(num_feature, 1).astype(np.float32)).requires_grad_()
    b1 = torch.from_numpy(np.random.randn(1, 1).astype(np.float32)).requires_grad_()
    w2 = torch.from_numpy(np.random.randn(num_feature, 1).astype(np.float32)).requires_grad_()
    b2 = torch.from_numpy(np.random.randn(1, 1).astype(np.float32)).requires_grad_()
    true_w = 0.1 * torch.ones((num_feature, 1))
    true_b = 0.5 * torch.ones((1, 1))

    # Synthetic data: y = X @ true_w + true_b + N(0, 0.1) noise. With far
    # fewer training samples than features the plain model can overfit.
    X = torch.randn((num_train + num_test, num_feature), dtype=torch.float32)
    noise = torch.from_numpy(
        np.random.normal(0.0, 0.1, size=(num_train + num_test, 1)).astype(np.float32))
    y = torch.matmul(X, true_w) + true_b + noise

    # Keep the labels as (n, 1) columns. Slicing them down to 1-D would make
    # "prediction - label" broadcast an (n, 1) prediction against (n,) labels
    # into an (n, n) matrix and silently corrupt the loss.
    # NOTE(review): assumes net returns an (n, 1) column - confirm.
    X_train = X[:num_train]
    y_train = y[:num_train]
    X_test = X[num_train:num_train + num_test]
    y_test = y[num_train:num_train + num_test]
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_iter = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    train_loss1 = []
    train_loss2 = []
    test_loss1 = []
    test_loss2 = []
    for epoch in range(num_epoch):
        train_loss_sum1 = 0.0
        train_loss_sum2 = 0.0
        cnt = 0
        for X_batch, y_batch in train_iter:
            n = y_batch.shape[0]
            y_pred1 = net(X_batch, w1, b1)
            y_pred2 = net(X_batch, w2, b2)
            l1 = loss(y_pred1, y_batch)
            # The recorded training loss of model 2 includes the penalty.
            l2 = loss(y_pred2, y_batch) + L2_norm(w2, 0.2, y_pred2.shape[0])

            train_loss_sum1 += l1.item()
            train_loss_sum2 += l2.item()
            cnt += 1

            l1.backward()
            l2.backward()

            # SGD step, scaled by the batch size for weights and biases
            # alike (the biases used to be divided by the first label value).
            # Gradients are cleared afterwards because backward() adds
            # into .grad instead of overwriting it.
            with torch.no_grad():
                for param in (w1, b1, w2, b2):
                    param -= lr / n * param.grad
                    param.grad.zero_()

        train_loss1.append(train_loss_sum1 / cnt)
        train_loss2.append(train_loss_sum2 / cnt)
        # Store plain floats so np.log below receives numbers, not tensors
        # that are still attached to the autograd graph.
        with torch.no_grad():
            test_loss1.append(loss(net(X_test, w1, b1), y_test).item())
            test_loss2.append(loss(net(X_test, w2, b2), y_test).item())

    # Left panel: plain model. Right panel: model with the L2 penalty.
    fig, axes = plt.subplots(1, 2)
    t = np.arange(1, num_epoch + 1)
    axes[0].plot(t, np.log(train_loss1), 'r-', label='train loss')
    axes[0].plot(t, np.log(test_loss1), 'b:', label='test loss')
    axes[0].set_title('without L2 penalty')
    axes[0].legend()
    axes[1].plot(t, np.log(train_loss2), 'r-', label='train loss')
    axes[1].plot(t, np.log(test_loss2), 'b:', label='test loss')
    axes[1].set_title('with L2 penalty')
    axes[1].legend()
    plt.show()


#### 4.2 丟棄法（dropout）

$h_{i}=\phi(x_{1}w_{1i}+...+x_{5}w_{5i}+b_{i})$

$h_{i}^{'}=\frac{\xi_{i}}{1-p}h_{i}$

$E(h_{i}^{'})=\frac{E(\xi_{i})}{1-p}h_{i}=h_{i}$

# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so the NumPy-sampled inputs repeat across runs.
np.random.seed(123)

# Network layout: input width, hidden width, output width.
num_input = 30
num_hidden = 50
num_output = 1

# Data and optimisation settings.
num_sample = 50
batch_size = 1
num_epoch = 100

if __name__ == '__main__':
X = torch.from_numpy(np.random.randn(num_sample, num_input).astype(np.float32))
y = 0.1 * X.sum() + 0.5 + 0.05 * torch.randn((num_sample, 1)).float()
train_dataset = torch.utils.data.TensorDataset(X[ : 20], y[ : 20])
train_iter = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
X_test = X[20 : ]
y_test = y[20 : ]

net1 = nn.Sequential(
nn.Linear(num_input, num_hidden),
nn.Linear(num_hidden, num_output)
)
net2 = nn.Sequential(
nn.Linear(num_input, num_hidden),
nn.Dropout(p = 0.5),
nn.Linear(num_hidden, num_output)
)
optimizer1 = torch.optim.SGD(net1.parameters(), lr = 0.01)
optimizer2 = torch.optim.SGD(net2.parameters(), lr = 0.01)

loss1 = torch.nn.MSELoss()
loss2 = torch.nn.MSELoss()

train_loss1 = []
train_loss2 = []
test_loss1 = []
test_loss2 = []
for epoch in range(num_epoch):
train_loss_sum1 = 0.0
train_loss_sum2 = 0.0
cnt = 0
for X_batch, y_batch in train_iter:
y_pred1 = net1(X_batch)
y_pred2 = net2(X_batch)

l1 = loss1(y_pred1, y_batch)
l2 = loss2(y_pred2, y_batch)
train_loss_sum1 += l1.item()
train_loss_sum2 += l2.item()
cnt += 1

l1.backward()
l2.backward()
optimizer1.step()
optimizer2.step()
train_loss1.append(train_loss_sum1 / cnt)
train_loss2.append(train_loss_sum2 / cnt)
test_loss1.append(loss1(net1(X_test), y_test).item())
test_loss2.append(loss2(net2(X_test), y_test).item())

fig, axes = plt.subplots(1, 2)
t = np.arange(1, num_epoch + 1)
axes[0].plot(t, train_loss1, 'r-', label = 'train loss')
axes[0].plot(t, test_loss1, 'b:', label = 'test loss')
axes[0].legend()