CIFAR10數據庫
CIFAR10是一套含有60000張大小爲32×32彩色RGB圖像的10分類圖像數據庫,其中的50000張圖像爲訓練數據,10000張圖像爲測試數據,另外驗證集的數據是從訓練集中取出的。可以在下列網站下載CIFAR10數據庫:
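CIFAR10數據庫下載地址:https://www.cs.toronto.edu/~kriz/cifar.html (請下載「CIFAR-10 python version」,解壓後得到 cifar-10-batches-py 文件夾)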
深度全連接神經網絡
在本質上,深度全連接神經網絡就是添加了多個隱含層的神經網絡,互聯網上的相關介紹已經足夠多,在此不多贅述,只介紹本網絡使用的超參數:
隱含層使用的激活函數:ReLU函數
輸出層使用的損失函數:Softmax交叉熵損失函數
訓練集數據特徵數量(即維度):32×32×3,3表示有RGB三個色彩通道。
爲了提高訓練速度,並故意地給出過擬合結果以體現該代碼是正確的,在此減少訓練集數量,訓練集數量越少,訓練準確度會越高,即出現過擬合現象
訓練集數據量:500個
驗證集數據量:1000個
由於該案例並不專注於模型的泛化能力,而是專注於如何使用代碼實現該網絡,所以不需要測試集。
隱含層層數:5層
隱含層神經元數量:從第一層到第五層隱含層 100 100 100 100 100
初始權重矩陣的標準差:0.1
訓練迭代次數:500次
每個批次的訓練集數量:500個
輸出層神經元數量:10,即輸出10個類別
學習率:0.001
正則化係數:0.0,無正則化
學習率衰減率:無衰減
訓練結果
訓練損失:
訓練準確率:
驗證準確率:
可以看到,由於每次迭代都使用全部500個訓練樣本(即全批量梯度下降,而不是小批量隨機梯度下降),損失函數下降曲線十分光滑。而且由於未添加正則化,訓練準確率在最後達到了100%;但是由於訓練集太小,只有500個樣本,模型出現了嚴重的過擬合,所以驗證準確率只有不到30%。
代碼!
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import os
import _pickle as pickle
# Load the CIFAR-10 image data set
def get_CIFAR10_data(num_training=50, num_validation=1000, num_test=0,
                     cifar10_dir=None):
    """
    Load CIFAR-10 from the pickled "python version" batch files and split it.

    Inputs:
    num_training - number of images taken from the start of the training batches
    num_validation - number of images taken right after the training images
    num_test - number of images taken from the start of the test batch
    cifar10_dir - folder holding data_batch_1 .. data_batch_5 and test_batch,
                  e.g. 'E:/cifar-10-batches-py'. When None, the CIFAR10_DIR
                  environment variable is used, falling back to
                  'cifar-10-batches-py' in the current working directory.
    Returns:
    dict with the keys 'X_train', 'y_train', 'X_val', 'y_val', 'X_test' and
    'y_test'. Image arrays are float arrays of shape (count, 3, 32, 32) with
    the per-pixel mean of the training images subtracted; label arrays are 1-D.
    Raises:
    FileNotFoundError - cifar10_dir is not an existing folder
    ValueError - more images are requested than the batch files contain
    """
    if cifar10_dir is None:
        cifar10_dir = os.environ.get('CIFAR10_DIR', 'cifar-10-batches-py')
    if not os.path.isdir(cifar10_dir):
        raise FileNotFoundError(
            "CIFAR-10 folder not found: %r (pass cifar10_dir or set CIFAR10_DIR)"
            % (cifar10_dir,))

    def load_batch(path):
        # NOTE: unpickling can execute arbitrary code - only load batch files
        # obtained from a trusted source.
        with open(path, 'rb') as handle:
            datadict = pickle.load(handle, encoding='latin1')
        # Each row is one flattened image; rows are kept in their stored dtype
        # and only the selected subset is converted to float further down.
        images = np.asarray(datadict['data']).reshape(-1, 3, 32, 32)
        labels = np.array(datadict['labels'])
        return images, labels

    train_parts = [load_batch(os.path.join(cifar10_dir, 'data_batch_%d' % (b, )))
                   for b in range(1, 6)]
    X_all = np.concatenate([images for images, _ in train_parts])
    y_all = np.concatenate([labels for _, labels in train_parts])
    del train_parts
    X_test_all, y_test_all = load_batch(os.path.join(cifar10_dir, 'test_batch'))

    val_end = num_training + num_validation
    if val_end > X_all.shape[0]:
        raise ValueError(
            "num_training + num_validation = %d exceeds the %d available training images"
            % (val_end, X_all.shape[0]))
    if num_test > X_test_all.shape[0]:
        raise ValueError(
            "num_test = %d exceeds the %d available test images"
            % (num_test, X_test_all.shape[0]))

    # Training images come first; the validation images are the num_validation
    # images that immediately follow them in the training batches.
    X_train = X_all[:num_training].astype(float)
    y_train = y_all[:num_training].copy()
    X_val = X_all[num_training:val_end].astype(float)
    y_val = y_all[num_training:val_end].copy()
    # The test split is the first num_test images of the test batch.
    X_test = X_test_all[:num_test].astype(float)
    y_test = y_test_all[:num_test].copy()
    del X_all, y_all, X_test_all, y_test_all

    # Center every split with the mean image of the training split.
    # The arrays stay in (count, 3, 32, 32) layout throughout: converting to
    # channel-last before the element-wise mean subtraction and back afterwards
    # would produce exactly the same values.
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    return {
        'X_train': X_train, 'y_train': y_train,
        'X_val': X_val, 'y_val': y_val,
        'X_test': X_test, 'y_test': y_test,
    }
# Forward pass of a fully connected layer
def affine_forward(x, w, b):
    """
    Compute the forward pass of one affine (fully connected) layer.

    Inputs:
    x - input data of shape (N, d1, ..., dk); N is the number of samples and
        every sample is flattened into a row of D = d1 * ... * dk features
    w - weight matrix of shape (D, M), M being the number of units of the
        next layer
    b - bias with M entries, given as shape (M,) or (M, 1)
    Returns:
    out - array of shape (N, M) equal to x_flat.dot(w) + b
    """
    num_samples = x.shape[0]
    # Flatten every sample into one row: (N, D)
    x_flat = np.reshape(x, (num_samples, -1))
    # Adding the bias as a broadcast row gives the same result as appending a
    # column of ones to x and a bias row to w, without copying both matrices.
    return x_flat.dot(w) + np.reshape(b, (1, -1))
# Backward pass of a fully connected layer
def affine_backward(dout, cache):
    """
    Compute the backward pass of one affine (fully connected) layer.

    Inputs:
    dout - upstream gradient of shape (N, M)
    cache - tuple (x, w, b) holding the values used in the forward pass:
            x of shape (N, d1, ..., dk), w of shape (D, M), b with M entries
    Returns:
    dx - gradient with respect to x, same shape as x: (N, d1, ..., dk)
    dw - gradient with respect to w, shape (D, M)
    db - gradient with respect to b, shape (M,)
    """
    x, w, _ = cache
    # Remember the caller's shape before flattening so dx can be restored to it
    input_shape = x.shape
    x_flat = np.reshape(x, (input_shape[0], -1))
    # Gradient flowing back to the layer input
    dx = np.reshape(dout.dot(np.transpose(w)), input_shape)
    # Gradient of the weights
    dw = np.transpose(x_flat).dot(dout)
    # Gradient of the bias: sum over the samples
    db = np.sum(dout, axis=0)
    return dx, dw, db
# ReLU forward pass
def relu_forward(x):
    """
    Compute the forward pass of the ReLU activation.

    Inputs:
    x - input array of any shape
    Returns:
    out - array with the same shape as x holding max(x, 0) element-wise
    """
    # np.maximum works element-wise for any number of dimensions, whereas
    # stacking x with zeros along axis 2 is only correct for 2-D input.
    return np.maximum(x, 0.0)
# ReLU backward pass
def relu_backward(dout, x):
    """
    Compute the backward pass of the ReLU activation.

    Inputs:
    dout - upstream gradient
    x - array with the same shape as dout whose non-positive entries mark the
        units that block the gradient
    Returns:
    dx - copy of dout with the entries where x <= 0 set to zero
    """
    # Work on a copy so the caller's upstream gradient is left untouched
    dx = np.array(dout)
    dx[x <= 0] = 0
    return dx
# Softmax loss function
def softmax_loss(X, y):
    """
    Softmax cross-entropy loss and its gradient, without regularization.

    Inputs:
    X - class scores produced by the output layer, shape (N, C)
    y - true class index of every sample, N integer entries in [0, C)
    Returns:
    loss - mean cross-entropy over the N samples
    dx - gradient of the loss with respect to X, shape (N, C)
    """
    num_train = X.shape[0]
    sample_rows = np.arange(num_train)
    # Subtract the row maximum so the exponentials cannot overflow
    shifted = X - np.max(X, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    # Row-wise normalizer, shape (N, 1); always >= 1 because the largest
    # shifted score in every row is 0
    normalizer = np.sum(exp_scores, axis=1, keepdims=True)
    # Log-probabilities via log-sum-exp: taking log() of a probability that
    # underflowed to 0 would give -inf and turn the loss into NaN
    log_prob = shifted - np.log(normalizer)
    # Only the log-probability of the true class contributes to the loss
    loss = -np.sum(log_prob[sample_rows, y]) / num_train
    # Gradient: softmax probabilities minus the one-hot labels, averaged over N
    dx = exp_scores / normalizer
    dx[sample_rows, y] -= 1
    dx /= num_train
    return loss, dx
# Load the CIFAR-10 splits and print the shape of every array
data = get_CIFAR10_data()
X_train, y_train = data['X_train'], data['y_train']
X_val, y_val = data['X_val'], data['y_val']
X_test, y_test = data['X_test'], data['y_test']
for split_name in data:
    print(f"{split_name}:", data[split_name].shape)
# Fully connected network with several hidden layers ------------------------
# Architecture hyperparameters
hidden_layers_num = 5                    # number of hidden layers
weight_scale = 1e-1                      # standard deviation of the initial weights
input_dim = 32*32*3                      # number of input features per image
hidden_dim = [100, 100, 100, 100, 100]   # units per hidden layer, first to last
num_classes = 10                         # number of output units
params = {}                              # weight matrices and bias vectors by name
# The output layer is sized from hidden_dim[-1], so a length mismatch would
# only surface later as a shape error inside the forward pass.
if len(hidden_dim) != hidden_layers_num:
    raise ValueError("hidden_dim must list exactly hidden_layers_num layer sizes")
print("參數初始化...", end = '')
# Input layer -> first hidden layer
params['W_i_b_h'] = weight_scale * np.random.randn(input_dim, hidden_dim[0])
params['b_i_b_h'] = np.zeros(hidden_dim[0])
# Hidden layer k -> hidden layer k + 1 (layers numbered from 1); the range is
# empty when there is a single hidden layer
for layer in range(1, hidden_layers_num):
    fan_in, fan_out = hidden_dim[layer - 1], hidden_dim[layer]
    params[f'W_h{layer}_b_h{layer + 1}'] = weight_scale * np.random.randn(fan_in, fan_out)
    params[f'b_h{layer}_b_h{layer + 1}'] = np.zeros(fan_out)
# Last hidden layer -> output layer
params['W_h_b_o'] = weight_scale * np.random.randn(hidden_dim[-1], num_classes)
params['b_h_b_o'] = np.zeros(num_classes)
print('完成')
# Select the data used for training
print('輸入數據...', end = '')
X = X_train   # training images
y = y_train   # training labels
print('完成')
# Training hyperparameters
# NOTE(review): the write-up at the top of this file lists a batch size of 500,
# no regularization and no learning-rate decay; the values below differ -
# confirm which setting is intended.
num_iters = 500               # number of iterations
batch_size = 25               # samples drawn at random from the training set per iteration
learning_rate = 1e-3          # learning rate
verbose = True                # whether to print training messages
reg = 0.6                     # L2 regularization strength
learning_rate_decay = 0.95    # learning-rate decay factor
# Batches needed to go through the training set once. Kept as an integer >= 1:
# with a fractional quotient the per-epoch check `it % iterations_per_epoch == 0`
# would only fire at scattered iterations.
iterations_per_epoch = max(X_train.shape[0] // batch_size, 1)
# Per-iteration loss and per-epoch accuracies
loss_history = []    # loss of every iteration
train_history = []   # training accuracy at every check
val_history = []     # validation accuracy at every check
# Weight/bias gradients and layer activations of the current iteration
grad = {}
forward_out = {}
# Forward-only helper shared by the training and validation accuracy checks
def _predict_labels(inputs):
    """Return the predicted class index for every sample in `inputs`, using the current `params`."""
    hidden = relu_forward(affine_forward(inputs, params['W_i_b_h'], params['b_i_b_h']))
    for layer in range(1, hidden_layers_num):
        hidden = relu_forward(affine_forward(hidden,
                                             params[f'W_h{layer}_b_h{layer + 1}'],
                                             params[f'b_h{layer}_b_h{layer + 1}']))
    scores = affine_forward(hidden, params['W_h_b_o'], params['b_h_b_o'])
    return np.argmax(scores, axis=1)


for it in range(num_iters):
    num_train = X.shape[0]   # total number of training samples
    # Draw batch_size sample indices at random (with replacement); images and
    # labels are selected with the same indices so they stay aligned
    batch_idx = np.random.choice(num_train, batch_size, replace=True)
    X_batch = X[batch_idx]
    y_batch = y[batch_idx]

    # Forward pass ----------------------------------------------------------
    # forward_out['h{k}_2_h{k+1}'] holds the ReLU activation of hidden layer
    # k + 1; only hidden layers have an activation function.
    forward_out['i2h'] = relu_forward(
        affine_forward(X_batch, params['W_i_b_h'], params['b_i_b_h']))
    forward_out['h0_2_h1'] = forward_out['i2h']
    for layer in range(1, hidden_layers_num):
        pre_activation = affine_forward(forward_out[f'h{layer - 1}_2_h{layer}'],
                                        params[f'W_h{layer}_b_h{layer + 1}'],
                                        params[f'b_h{layer}_b_h{layer + 1}'])
        forward_out[f'h{layer}_2_h{layer + 1}'] = relu_forward(pre_activation)
    forward_out_hidden = forward_out[f'h{hidden_layers_num - 1}_2_h{hidden_layers_num}']
    # Last hidden layer -> output layer
    scores = affine_forward(forward_out_hidden, params['W_h_b_o'], params['b_h_b_o'])

    # Softmax loss on the output scores, plus the L2 penalty of every weight
    # matrix. The 0.5 * reg factor must scale all terms so that the penalty
    # matches the `reg * W` gradient terms added below.
    loss, grad_out = softmax_loss(scores, y_batch)
    loss += 0.5 * reg * (np.sum(params['W_i_b_h'] ** 2) + np.sum(params['W_h_b_o'] ** 2))
    for layer in range(1, hidden_layers_num):
        loss += 0.5 * reg * np.sum(params[f'W_h{layer}_b_h{layer + 1}'] ** 2)
    loss_history.append(loss)

    # Backward pass ---------------------------------------------------------
    # Output layer -> last hidden layer
    dx, grad['W_o_b_h'], grad['b_o_b_h'] = affine_backward(
        grad_out, (forward_out_hidden, params['W_h_b_o'], params['b_h_b_o']))
    grad['W_o_b_h'] += reg * params['W_h_b_o']
    # Hidden layer layer + 1 -> hidden layer layer, from the top down
    for layer in range(hidden_layers_num - 1, 0, -1):
        # Here dx is the gradient with respect to the activation of hidden
        # layer layer + 1, so the ReLU mask must come from that same layer.
        dx = relu_backward(dx, forward_out[f'h{layer}_2_h{layer + 1}'])
        dx, dw, db = affine_backward(dx, (forward_out[f'h{layer - 1}_2_h{layer}'],
                                          params[f'W_h{layer}_b_h{layer + 1}'],
                                          params[f'b_h{layer}_b_h{layer + 1}']))
        grad[f'W_h{layer + 1}_b_h{layer}'] = dw + reg * params[f'W_h{layer}_b_h{layer + 1}']
        grad[f'b_h{layer + 1}_b_h{layer}'] = db
    # First hidden layer -> input layer
    dx = relu_backward(dx, forward_out['h0_2_h1'])
    dx, grad['W_h_b_i'], grad['b_h_b_i'] = affine_backward(
        dx, (X_batch, params['W_i_b_h'], params['b_i_b_h']))
    grad['W_h_b_i'] += reg * params['W_i_b_h']

    # Gradient-descent update of every weight matrix and bias vector ---------
    params['W_i_b_h'] -= learning_rate * grad['W_h_b_i']
    params['W_h_b_o'] -= learning_rate * grad['W_o_b_h']
    params['b_i_b_h'] -= learning_rate * grad['b_h_b_i']
    params['b_h_b_o'] -= learning_rate * grad['b_o_b_h']
    for layer in range(1, hidden_layers_num):
        params[f'W_h{layer}_b_h{layer + 1}'] -= learning_rate * grad[f'W_h{layer + 1}_b_h{layer}']
        params[f'b_h{layer}_b_h{layer + 1}'] -= learning_rate * grad[f'b_h{layer + 1}_b_h{layer}']

    # Progress report -------------------------------------------------------
    # NOTE(review): the loss is reported every `batch_size` iterations, as in
    # the original code - confirm that this reporting interval is intended.
    loss_line_open = False
    if verbose and it % batch_size == 0:
        print(f"第{it}/{num_iters}次迭代,損失爲{loss},", end = '')
        loss_line_open = True
    if it % iterations_per_epoch == 0:
        # Once per epoch: accuracy on the current batch and on the validation set
        train_acc = np.mean(_predict_labels(X_batch) == y_batch)
        train_history.append(train_acc)
        val_acc = np.mean(_predict_labels(X_val) == y_val)
        val_history.append(val_acc)
        print(f'訓練準確度爲{train_acc}, 驗證準確度爲{val_acc}')
        loss_line_open = False
        # Learning-rate decay
        # NOTE(review): indentation was lost in the source; the decay is
        # assumed to be applied once per epoch - confirm.
        learning_rate *= learning_rate_decay
    if loss_line_open:
        # Terminate the loss line when no accuracy report followed it
        print()
# Plot the results: loss per iteration, accuracies per accuracy check
plt.figure(1)
plt.plot(loss_history)
plt.title('Training Loss')
plt.figure(2)
plt.plot(train_history)
plt.title('Training Accuracy')
plt.figure(3)
plt.plot(val_history)
plt.title('Validation Accuracy')
# Needed for the figures to appear when the file is run as a plain script
plt.show()