1. How to Implement the Addition and Multiplication Layers
Let's implement yesterday's apple-and-orange problem in code.
Here a "layer" should be understood as a node in the computational graph: MulLayer implements a multiplication node, and AddLayer implements an addition node.
We create one instance for each node:
mul_apple_layer = MulLayer()        # node 1, multiplication: apple price * apple count
mul_orange_layer = MulLayer()       # node 2, multiplication: orange price * orange count
add_apple_orange_layer = AddLayer() # node 3, addition: apple subtotal + orange subtotal
mul_tax_layer = MulLayer()          # node 4, multiplication: subtotal * tax rate
The forward pass computes the tax-included total; the backward pass computes the partial derivative of that total with respect to each input.
class MulLayer:
    def __init__(self):
        self.x = None
        self.y = None

    def forward(self, x, y):   # forward: cache the inputs x, y for use in backward
        self.x = x
        self.y = y
        out = x * y
        return out

    def backward(self, dout):  # d(xy)/dx = y, d(xy)/dy = x
        dx = dout * self.y
        dy = dout * self.x
        return dx, dy

class AddLayer:
    def __init__(self):
        pass

    def forward(self, x, y):
        out = x + y
        return out

    def backward(self, dout):  # addition passes the upstream gradient through unchanged
        dx = dout * 1
        dy = dout * 1
        return dx, dy
if __name__ == '__main__':
    apple = 100
    apple_num = 2
    orange = 150
    orange_num = 3
    tax = 1.1

    # treat each layer as a node: the graph has four nodes, so create four instances
    mul_apple_layer = MulLayer()
    mul_orange_layer = MulLayer()
    add_apple_orange_layer = AddLayer()
    mul_tax_layer = MulLayer()

    # forward
    apple_price = mul_apple_layer.forward(apple, apple_num)
    orange_price = mul_orange_layer.forward(orange, orange_num)
    all_price = add_apple_orange_layer.forward(apple_price, orange_price)
    price = mul_tax_layer.forward(all_price, tax)
    print(price)

    # backward (traverse the nodes in reverse order)
    dprice = 1
    dall_price, dtax = mul_tax_layer.backward(dprice)
    dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
    dapple, dapple_num = mul_apple_layer.backward(dapple_price)
    dorange, dorange_num = mul_orange_layer.backward(dorange_price)
    print(dapple, dapple_num, dorange, dorange_num, dtax)
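If the wiring is correct, this should print a tax-included total of about 715, and the gradients (dapple, dapple_num, dorange, dorange_num, dtax) should come out to roughly (2.2, 110, 3.3, 165, 650), up to floating-point noise.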
2. How to Implement the ReLU and Sigmoid Layers
Recall the ReLU function:
$$y = \begin{cases} x & (x > 0) \\ 0 & (x \le 0) \end{cases}$$
and its derivative:
$$\frac{\partial y}{\partial x} = \begin{cases} 1 & (x > 0) \\ 0 & (x \le 0) \end{cases}$$
import numpy as np  # both layers below assume NumPy-array inputs

class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)  # boolean mask: True wherever the input is <= 0
        out = x.copy()        # copy the input to the output
        out[self.mask] = 0    # zero out every element that was <= 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0   # block the gradient wherever the input was <= 0
        dx = dout
        return dx
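A quick sanity check of the mask behavior on a small array (the values are illustrative only):

import numpy as np

relu = Relu()
x = np.array([[1.0, -0.5], [-2.0, 3.0]])
print(relu.forward(x))                 # [[1. 0.] [0. 3.]] -- negatives zeroed
print(relu.backward(np.ones_like(x)))  # [[1. 0.] [0. 1.]] -- gradient blocked at masked positions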
Similarly for the sigmoid function $y = \frac{1}{1 + e^{-x}}$, whose derivative is $\frac{\partial y}{\partial x} = y(1 - y)$:
class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out  # cache the output: the derivative needs only y
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx
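As a sanity check, the analytic backward pass can be compared against a numerical derivative at an arbitrary point (x = 0.5 here is just an illustrative choice):

import numpy as np

sig = Sigmoid()
x = np.array([0.5])
sig.forward(x)
dx = sig.backward(np.array([1.0]))  # analytic: y * (1 - y)
eps = 1e-4
num = (1/(1+np.exp(-(x+eps))) - 1/(1+np.exp(-(x-eps)))) / (2*eps)
print(dx, num)                      # both should be about 0.2350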
3. How to Implement the Fully-Connected (Affine) and Softmax Layers
When every neuron in one layer is connected to every neuron in the adjacent layer, the connection is called fully-connected.
So the "fully-connected layer" in this chapter of the book is the Affine layer (an affine transformation: one linear transform [the weighting] plus one translation [the bias]).
The backward pass involves matrix differentiation, which I won't write out in detail here.
Regarding the bias: in one forward pass over a batch of N input vectors, the bias $b_i$ added to the i-th output is the same for all N samples, so in the backward pass the gradient for $b_i$ must sum the upstream derivatives over the batch: $\frac{\partial L}{\partial b_i} = \sum_{n=1}^{N} \frac{\partial L}{\partial y_{n,i}}$. A sketch of the Affine layer follows.
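For reference, here is a minimal 2-D sketch of the Affine layer, consistent with the version in the book's common.layers (the book's version additionally reshapes 4-D tensor inputs, which is omitted here):

import numpy as np

class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x                         # cache the input for backward
        return np.dot(x, self.W) + self.b  # b is broadcast across the N rows

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)        # gradient w.r.t. the input
        self.dW = np.dot(self.x.T, dout)   # gradient w.r.t. the weights
        self.db = np.sum(dout, axis=0)     # sum over the batch, as argued above
        return dx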
For the softmax implementation, note that during backpropagation, when a node's forward output branches to several downstream nodes, the gradients flowing back along those branches must be summed.
For example, at the division node "/" in the book's Softmax-with-Loss computational graph, the gradients arriving from the three branches sum to $-S(t_1 + t_2 + t_3)$; because t is a one-hot vector, exactly one of its elements is 1 and the rest are 0, so $t_1 + t_2 + t_3 = 1$ and the sum simplifies to $-S$.
Even without the computational-graph method, direct differentiation leads to the same conclusion.
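Either way, the end result is the famously clean gradient $(y - t)$. A minimal Softmax-with-Loss sketch assuming one-hot t, along the lines of the book's common.layers implementation:

import numpy as np

def softmax(x):
    x = x - np.max(x, axis=-1, keepdims=True)  # shift for numerical stability
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

def cross_entropy_error(y, t):
    # t is one-hot; the small constant avoids log(0)
    return -np.sum(t * np.log(y + 1e-7)) / y.shape[0]

class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None  # softmax output
        self.t = None  # one-hot labels

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        return dout * (self.y - self.t) / batch_size  # the clean (y - t) result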
4. How to Assemble These Layers into a Two-Layer Neural Network
An Affine layer plus a ReLU layer together form one layer of the network: the Affine layer applies the weights and bias to the input, and ReLU serves as the activation function.
The output layer consists of an Affine layer plus a Softmax layer; softmax acts as the classifier.
So a two-layer network needs only Affine1 + ReLU + Affine2 + Softmax, as sketched below.
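A minimal sketch of that stack, using the Affine, Relu, and SoftmaxWithLoss classes from above; the sizes (784 inputs, 50 hidden units, 10 classes) are hypothetical. An OrderedDict keeps the layers in insertion order, so predict is just a chain of forward calls:

import numpy as np
from collections import OrderedDict

W1, b1 = 0.01 * np.random.randn(784, 50), np.zeros(50)
W2, b2 = 0.01 * np.random.randn(50, 10), np.zeros(10)

layers = OrderedDict()
layers['Affine1'] = Affine(W1, b1)
layers['Relu1'] = Relu()
layers['Affine2'] = Affine(W2, b2)
last_layer = SoftmaxWithLoss()  # applied only when computing the loss

x = np.random.rand(3, 784)      # a dummy mini-batch
for layer in layers.values():
    x = layer.forward(x)        # predict: just chain the forward passes
print(x.shape)                  # (3, 10)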
5. How to Update the Network's Parameters with SGD
Because every layer has forward and backward methods, the predict step of SGD training can be carried out by the forward calls, and the gradient computation by the backward calls.
We only need to replace the predict and gradient functions from the earlier (numerical-gradient) code with the layers' forward and backward passes; a one-step sketch follows.
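A hedged sketch of a single SGD step (net, x_batch, t_batch, and learning_rate are assumed to already exist; the full training loop appears in the next section):

grads = net.gradient(x_batch, t_batch)              # backprop fills grads['W1'], ...
for key in ('W1', 'b1', 'W2', 'b2'):
    net.params[key] -= learning_rate * grads[key]   # vanilla SGD update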
6. A Python Implementation of a Three-Layer Neural Network
Below is the three-layer network I put together (structure: Affine1 + ReLU + Affine2 + ReLU + Affine3 + Softmax), modified from the book's two-layer network code (input: 60000x784; W1: 784x100, W2: 100x50, W3: 50x10):
import numpy as np
import matplotlib.pyplot as plt
import sys, os
sys.path.append(os.pardir)
from dataset.mnist import load_mnist
from three_layer_net import *
import datetime

starttime = datetime.datetime.now()

# x_train.shape = (60000, 784)
# t_train.shape = (60000, 10)
# x_test.shape = (10000, 784)
# t_test.shape = (10000, 10)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

train_loss_list = []

# hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
iter_per_epoch = 100  # = max(iters_num / batch_size, 1); accuracy is recorded 100 times

# initialize the net
threeLayerNet = ThreeLayerNet(x_train.shape[1], 100, 50, 10)
trainLoss = np.zeros(iters_num)
trainAccuracy = np.zeros(iter_per_epoch)
testAccuracy = np.zeros(iter_per_epoch)

# start training
k = 0
for i in range(0, iters_num, 1):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    grads = threeLayerNet.gradient(x_batch, t_batch)  # gradients via backpropagation
    for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
        threeLayerNet.params[key] -= learning_rate * grads[key]
    # trainLoss[i] = threeLayerNet.loss(x_batch, t_batch)
    if i % batch_size == 0:  # record accuracy every 100 iterations
        trainAccuracy[k] = threeLayerNet.accuracy(x_train, t_train)
        testAccuracy[k] = threeLayerNet.accuracy(x_test, t_test)
        k += 1

endtime = datetime.datetime.now()
print("runtime = ", (endtime - starttime))

trainStep = range(0, iter_per_epoch, 1)
# plt.plot(trainStep, trainLoss)
plt.plot(trainStep, trainAccuracy, 'r')
plt.plot(trainStep, testAccuracy, 'b')
plt.show()
three_layer_net.py
# coding: utf-8
import sys, os
sys.path.append(os.pardir)  # so that files in the parent directory can be imported
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

class ThreeLayerNet:
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_init_std=0.01):
        # initialize the weights
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size1)
        self.params['b1'] = np.zeros(hidden_size1)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size1, hidden_size2)
        self.params['b2'] = np.zeros(hidden_size2)
        self.params['W3'] = weight_init_std * np.random.randn(hidden_size2, output_size)
        self.params['b3'] = np.zeros(output_size)

        # build the layers
        self.layers = OrderedDict()  # keeps the layers in insertion order
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.layers['Relu2'] = Relu()
        self.layers['Affine3'] = Affine(self.params['W3'], self.params['b3'])
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    # x: input data, t: labels
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    # x: input data, t: labels
    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # collect the gradients
        grads = {}
        grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db
        grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db
        grads['W3'], grads['b3'] = self.layers['Affine3'].dW, self.layers['Affine3'].db
        return grads
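A quick smoke test of the class on random data shaped like MNIST batches (illustrative only, no training involved):

import numpy as np

net = ThreeLayerNet(784, 100, 50, 10)
x = np.random.rand(5, 784)                   # 5 fake "images"
t = np.eye(10)[np.random.choice(10, 5)]      # 5 random one-hot labels
print(net.loss(x, t))                        # untrained loss, roughly ln(10) ≈ 2.3
grads = net.gradient(x, t)
print(grads['W1'].shape, grads['b3'].shape)  # (784, 100) (10,)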
The run shows that the trained network's accuracy approaches 1, but it differs little from the two-layer network; there is no significant performance gain.
[Figure: training accuracy (red) and test accuracy (blue) over the 100 evaluation steps]
Three-layer network training time: 1 min 25 s; two-layer network training time: 57 s.