提高神經網絡的學習效率
並行計算
可以使用GPU進行並行計算,以此提高學習效率。
梯度消失問題
梯度消失問題的存在使得學習效率會變慢,出現梯度消失的原因如下:
其中,對sigmoid函數求導時,當輸入落在[-4, 4]的範圍之外,其導數會非常小、接近於0,進而在反向傳播中造成梯度消失。
改進的思路
歸一化
因爲數據度量的量綱可能不同,所以需要對數據進行歸一化處理。
歸一化的效果如圖:
參數初始化問題
上圖的參數初始化方法是業界比較認可的一種初始化方式。
參數初始化的代碼如下:
def default_weight_initializer(self):
    """Initialize biases with N(0, 1) and weights with N(0, 1)/sqrt(fan-in)."""
    # Biases: one (y, 1) column vector per non-input layer.
    self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
    # Weights: the 1/sqrt(x) scaling keeps z = w.a + b small so the
    # sigmoid does not start out saturated.
    self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in zip(self.sizes[:-1], self.sizes[1:])]
正則化
L1正則化:
L1正則化項的導數:
L2正則化:
L2正則化項的導數:
代碼需要修改如下:
# L2-regularized update: first shrink every weight by the decay factor
# (1 - eta*lmbda/n), then step along the mini-batch-averaged gradient.
self.weights = [(1-eta*(lmbda/n))*w - (eta / len(mini_batch)) * nw for w, nw in zip(self.weights, nabla_w)]
學習率
交叉熵
交叉熵定義
交叉熵求導
交叉熵代碼:
class CrossEntropyCost(object):
    """Cross-entropy cost for sigmoid outputs.

    ``np.nan_to_num`` guards against nan/inf produced by log(0), e.g.::

        a = np.array([[np.nan, np.inf],
                      [-np.nan, -np.inf]])
        np.nan_to_num(a)
        array([[0.00000000e+000, 1.79769313e+308],
               [0.00000000e+000, -1.79769313e+308]])
    """
    @staticmethod
    def fn(a, y):
        # Sum of -y*ln(a) - (1-y)*ln(1-a); nan_to_num maps the nan from
        # 0*log(0) terms to 0 so they drop out of the sum.
        return np.sum(np.nan_to_num(-y * np.log(a) - (1-y) * np.log(1-a)))
    @staticmethod
    def delta(z, a, y):
        # Output-layer error; the sigmoid' factor cancels for cross-entropy,
        # which is why ``z`` is unused here.
        return (a-y)
模型的保存與加載
模型保存與加載代碼如下:
# Save the model
def save(self, filename):
    """Serialize sizes, weights, biases and the cost-class name to JSON."""
    data = {"sizes": self.sizes,
            "weights": [w.tolist() for w in self.weights],
            "biases": [b.tolist() for b in self.biases],
            "cost": str(self.cost.__name__)
            }
    f = open(filename, "w")
    json.dump(data, f)  # json.dump serializes the dict as a JSON string
    f.close()
# Load the model
def load(filename):
    """Rebuild a Network from a JSON file written by ``save``.

    NOTE(review): defined without ``self``/@staticmethod — call it via the
    class or at module level, not on an instance.
    """
    f = open(filename, "r")
    data = json.load(f)
    f.close()
    # Resolve the cost class by its saved name in this module.
    cost = getattr(sys.modules[__name__], data["cost"])
    net = Network(data["sizes"], cost=cost)
    net.weights = [np.array(w) for w in data["weights"]]
    net.biases = [np.array(b) for b in data["biases"]]
    return net
最後在MNIST數據集上訓練改進的模型,同時加上準確率等度量方法。
完整代碼如下:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import random
import json
import sys
import numpy as np
# Define the neural network structures
class QuadraticCost(object):
    """Quadratic cost C = 0.5 * ||a - y||^2 (mean-squared-error style)."""

    @staticmethod
    def fn(a, y):
        """Return the quadratic cost for output ``a`` and target ``y``."""
        diff = a - y
        return 0.5 * np.linalg.norm(diff) ** 2

    @staticmethod
    def delta(z, a, y):
        """Output-layer error; carries the sigmoid' factor, so learning
        slows when the output neuron saturates."""
        return sigmoid_prime(z) * (a - y)
class CrossEntropyCost(object):
    """Cross-entropy cost for sigmoid outputs.

    ``np.nan_to_num`` guards against nan/inf produced by log(0)
    (the original comment misnamed it ``np.nan_to_sum``), e.g.::

        a = np.array([[np.nan, np.inf],
                      [-np.nan, -np.inf]])
        np.nan_to_num(a)
        array([[0.00000000e+000, 1.79769313e+308],
               [0.00000000e+000, -1.79769313e+308]])
    """
    @staticmethod
    def fn(a, y):
        # Sum of -y*ln(a) - (1-y)*ln(1-a); nan_to_num maps the nan from
        # 0*log(0) terms to 0 so they drop out of the sum.
        return np.sum(np.nan_to_num(-y * np.log(a) - (1-y) * np.log(1-a)))
    @staticmethod
    def delta(z, a, y):
        # Output-layer error; the sigmoid' factor cancels for cross-entropy,
        # which is why ``z`` is unused here.
        return (a-y)
class Network(object):
    """Feed-forward sigmoid network trained with mini-batch SGD.

    Improvements over a plain network: 1/sqrt(fan-in) weight
    initialization, a pluggable cost (cross-entropy by default),
    L2 regularization, and JSON save/load of learned parameters.
    """

    def __init__(self, sizes, cost=CrossEntropyCost):
        """``sizes`` lists the neuron count of every layer, input first.

        ``cost`` is a class exposing static ``fn(a, y)`` and
        ``delta(z, a, y)`` methods.
        """
        # Number of layers, including the input layer.
        self.num_layers = len(sizes)
        # Neuron count of every layer.
        self.sizes = sizes
        # Initialize per-layer biases and weights.
        self.default_weight_initializer()
        # Cost function class.
        self.cost = cost

    def default_weight_initializer(self):
        """Biases ~ N(0, 1); weights ~ N(0, 1)/sqrt(fan-in).

        The 1/sqrt(x) scaling keeps z = w.a + b small so the sigmoid
        does not start out saturated.
        """
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x) / np.sqrt(x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def large_weight_initializer(self):
        """Old scheme: plain N(0, 1) weights and biases (kept for comparison)."""
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def feedforward(self, a):
        """Return the network's output for input column vector ``a``."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, lmbda=0.0, test_data=None):
        """Train with mini-batch stochastic gradient descent.

        training_data   -- list of (x, y) pairs; must be a list because it
                           is shuffled in place every epoch
        epochs          -- number of passes over the training data
        mini_batch_size -- examples per gradient step
        eta             -- learning rate
        lmbda           -- L2 regularization strength
        test_data       -- optional evaluation set, reported every epoch
        """
        n = len(training_data)
        for j in range(epochs):
            # Shuffle so each epoch sees differently composed mini-batches.
            random.shuffle(training_data)
            mini_batches = [training_data[k:k + mini_batch_size]
                            for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta, lmbda, n)
            print("Epoch {0} complete".format(j))
            cost = self.total_cost(training_data, lmbda)
            print("Cost on training data: {}".format(cost))
            accuracy = self.accuracy(training_data, convert=True)
            print("Accuracy on training data: {} / {}".format(accuracy, n))
            if test_data:
                cost = self.total_cost(test_data, lmbda, convert=True)
                print("Cost on test data: {}".format(cost))
                accuracy = self.accuracy(test_data)
                print("Accuracy on test data: {} / {}".format(accuracy, len(test_data)))

    def update_mini_batch(self, mini_batch, eta, lmbda, n):
        """Apply one SGD step (with L2 weight decay) using one mini-batch."""
        # Accumulators for the per-layer gradients.
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # Sum the gradients contributed by every example in the batch.
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.update(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # L2-regularized update: shrink every weight by (1 - eta*lmbda/n),
        # then step along the mini-batch-averaged gradient.
        self.weights = [(1 - eta * (lmbda / n)) * w - (eta / len(mini_batch)) * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - (eta / len(mini_batch)) * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def update(self, x, y):
        """Backpropagation: return (nabla_b, nabla_w) for one example."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        activation = x      # current layer's activation (starts at the input)
        activations = [x]   # a = sigmoid(z) for every layer
        zs = []             # z = w.a + b for every layer
        # Forward pass: record every z and every activation.
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # Backward pass: output-layer error comes from the cost class.
        delta = (self.cost).delta(zs[-1], activations[-1], y)
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].T)
        # Propagate the error from the second-to-last layer backwards.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l + 1].T, delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l - 1].T)
        return (nabla_b, nabla_w)

    def accuracy(self, data, convert=False):
        """Count examples whose predicted class matches the label.

        convert=True for training-style data where y is a one-hot vector;
        False where y is already the digit label (test/validation data).
        """
        if convert:
            results = [(np.argmax(self.feedforward(x)), np.argmax(y))
                       for (x, y) in data]
        else:
            results = [(np.argmax(self.feedforward(x)), y)
                       for (x, y) in data]
        return sum(int(x == y) for (x, y) in results)

    def total_cost(self, data, lmbda, convert=False):
        """Average cost over ``data`` plus the L2 regularization term.

        convert=True for test-style data whose label y must first be
        turned into a one-hot vector.
        """
        cost = 0.0
        for x, y in data:
            a = self.feedforward(x)
            if convert:
                # NOTE(review): relies on mnist_loader being imported into
                # this module's globals by the __main__ path — confirm
                # before using this module as a library.
                y = mnist_loader.vectorized_result(y)
            cost += self.cost.fn(a, y) / len(data)
        cost += 0.5 * (lmbda / len(data)) * sum(np.linalg.norm(w) ** 2
                                                for w in self.weights)
        return cost

    def cost_derivative(self, output_activation, y):
        """Derivative of the quadratic cost w.r.t. the output activations."""
        return (output_activation - y)

    # Save the model
    def save(self, filename):
        """Write sizes, weights, biases and the cost-class name as JSON."""
        data = {"sizes": self.sizes,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases],
                "cost": str(self.cost.__name__)
                }
        # ``with`` guarantees the file is closed even if dumping fails.
        with open(filename, "w") as f:
            json.dump(data, f)

    # Load the model
    @staticmethod
    def load(filename):
        """Rebuild a Network from a JSON file produced by ``save``.

        Declared @staticmethod (it takes no self) so it works both as
        ``Network.load(f)`` and ``net.load(f)``; the original definition
        raised TypeError when called on an instance.
        """
        with open(filename, "r") as f:
            data = json.load(f)
        # Resolve the cost class by its saved name in this module.
        cost = getattr(sys.modules[__name__], data["cost"])
        net = Network(data["sizes"], cost=cost)
        net.weights = [np.array(w) for w in data["weights"]]
        net.biases = [np.array(b) for b in data["biases"]]
        return net
# Sigmoid activation function and its derivative.
def sigmoid(z):
    """Elementwise logistic function 1 / (1 + exp(-z))."""
    neg_exp = np.exp(-z)
    return 1.0 / (1.0 + neg_exp)


def sigmoid_prime(z):
    """Derivative of the sigmoid: s * (1 - s) with s = sigmoid(z)."""
    s = sigmoid(z)
    return s * (1 - s)
if __name__ == "__main__":
    import mnist_loader
    # Load the (training, validation, test) splits of MNIST.
    training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
    # 28*28 = 784 input pixels, 30 hidden neurons, 10 output classes.
    net = Network([784, 30, 10])
    # 30 epochs, mini-batch size 10, learning rate 0.5.
    net.SGD(training_data, 30, 10, 0.5, test_data=test_data)
運行結果如下圖所示:
可以看到,模型的效果顯然比不採取任何提升學習效率措施時好很多。