提高神经网络学习的学习效率
并行计算
可以使用GPU进行并行计算,以此提高学习效率。
梯度消失问题
梯度消失问题的存在使得学习效率会变慢,出现梯度消失的原因如下:
其中,对sigmoid函数求导的时候,在[-4, 4]的范围外会发生导数很小,接近于0的情况,进而导致学习的梯度消失。
改进的思路
归一化
因为数据度量的量纲可能不同,所以需要对数据进行归一化处理。
归一化的效果如图:
参数初始化问题
上图的参数初始化方法是业界比较认可的一种初始化方式。
参数初始化的代码如下:
def default_weight_initializer(self):
    """Initialize each layer's biases and weights (improved scheme)."""
    # Biases: one N(0, 1) column vector per non-input layer.
    self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
    # Weights scaled by 1/sqrt(fan_in) so the weighted input z stays small
    # and the sigmoid does not saturate early in training.
    self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in zip(self.sizes[:-1], self.sizes[1:])]
正则化
L1正则化:
L1正则化项的导数:
L2正则化:
L2正则化项的导数:
代码需要修改如下:
self.weights = [(1-eta*(lmbda/n))*w - (eta / len(mini_batch)) * nw for w, nw in zip(self.weights, nabla_w)]
学习率
交叉熵
交叉熵定义
交叉熵求导
交叉熵代码:
class CrossEntropyCost(object):
    '''
    Cross-entropy cost for sigmoid outputs. np.nan_to_num (the original
    text mistyped it as np.nan_to_sum) maps nan -> 0 and +/-inf to the
    largest finite floats, keeping the cost finite when a == 0 or a == 1:

    a = np.array([[np.nan, np.inf],
                  [-np.nan, -np.inf]])
    np.nan_to_num(a)
    array([[0.00000000e+000, 1.79769313e+308],
           [0.00000000e+000, -1.79769313e+308]])
    '''
    @staticmethod
    def fn(a, y):
        # Summed cost -y*ln(a) - (1-y)*ln(1-a) over the output neurons.
        return np.sum(np.nan_to_num(-y * np.log(a) - (1-y) * np.log(1-a)))
    @staticmethod
    def delta(z, a, y):
        # Output-layer error a - y; z is unused but kept for a uniform
        # cost-class interface.
        return (a-y)
模型的保存与加载
模型保存与加载代码如下:
# Save the model to a JSON file.
def save(self, filename):
    """Serialize sizes, weights, biases and the cost-class name as JSON."""
    data = {"sizes": self.sizes,
            "weights": [w.tolist() for w in self.weights],
            "biases": [b.tolist() for b in self.biases],
            "cost": str(self.cost.__name__)
            }
    f = open(filename, "w")
    json.dump(data, f)  # json serializes the dict to a string
    f.close()
# Load a model previously written by save().
def load(filename):
    """Rebuild a Network from ``filename``.

    NOTE(review): defined without ``self`` — as written it only works as a
    module-level function or via ``Network.load(...)``; confirm call sites.
    """
    f = open(filename, "r")
    data = json.load(f)
    f.close()
    # Resolve the saved cost-class name back to the class in this module.
    cost = getattr(sys.modules[__name__], data["cost"])
    net = Network(data["sizes"], cost=cost)
    net.weights = [np.array(w) for w in data["weights"]]
    net.biases = [np.array(b) for b in data["biases"]]
    return net
最后在MNIST数据集上训练改进的模型,同时加上准确率等度量方法。
完整代码如下:
#!/user/bin/env python3
# -*- coding: utf-8 -*-
import random
import json
import sys
import numpy as np
# 定义损失函数与神经网络结构
class QuadraticCost(object):
    """Quadratic (mean-squared-error) cost: C = 0.5 * ||a - y||^2."""

    @staticmethod
    def fn(a, y):
        """Cost for network output ``a`` against desired output ``y``."""
        diff = a - y
        return 0.5 * np.linalg.norm(diff) ** 2

    @staticmethod
    def delta(z, a, y):
        """Output-layer error; carries the sigmoid'(z) factor, so the
        gradient can vanish when the output neuron saturates."""
        return sigmoid_prime(z) * (a - y)
class CrossEntropyCost(object):
    '''
    Cross-entropy cost for sigmoid outputs.

    np.nan_to_num (the original docstring mistyped it as the nonexistent
    np.nan_to_sum) maps nan -> 0 and +/-inf to the largest finite floats,
    which keeps the cost finite when a == 0 or a == 1:

    a = np.array([[np.nan, np.inf],
                  [-np.nan, -np.inf]])
    np.nan_to_num(a)
    array([[0.00000000e+000, 1.79769313e+308],
           [0.00000000e+000, -1.79769313e+308]])
    '''
    @staticmethod
    def fn(a, y):
        # Summed cost -y*ln(a) - (1-y)*ln(1-a) over the output neurons.
        return np.sum(np.nan_to_num(-y * np.log(a) - (1-y) * np.log(1-a)))
    @staticmethod
    def delta(z, a, y):
        # Output-layer error a - y: no sigmoid'(z) factor, so the gradient
        # does not vanish on saturated outputs. ``z`` is unused but kept
        # for a uniform cost-class interface.
        return (a-y)
class Network(object):
    """Sigmoid feed-forward network trained by mini-batch SGD.

    Improvements over the basic network: 1/sqrt(fan_in) weight
    initialization, pluggable cost (cross-entropy by default),
    L2 regularization, and JSON save/load of a trained model.
    """

    def __init__(self, sizes, cost=CrossEntropyCost):
        """``sizes`` lists the neuron count per layer, e.g. [784, 30, 10].

        ``cost`` is a cost class exposing static ``fn(a, y)`` and
        ``delta(z, a, y)`` methods.
        """
        # Number of layers, input layer included.
        self.num_layers = len(sizes)
        # Neuron count of each layer.
        self.sizes = sizes
        # Scaled-Gaussian initialization of biases and weights.
        self.default_weight_initializer()
        # Cost function class.
        self.cost = cost

    def default_weight_initializer(self):
        """Biases ~ N(0, 1); weights ~ N(0, 1/fan_in).

        Dividing by sqrt(fan_in) keeps z = w.a + b small, so the sigmoid
        does not saturate and early learning is faster.
        """
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x) / np.sqrt(x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def large_weight_initializer(self):
        """Legacy scheme: biases and weights both ~ N(0, 1)."""
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def feedforward(self, a):
        """Return the network output for input activation column ``a``."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            lmbda=0.0, test_data=None):
        """Train with mini-batch stochastic gradient descent.

        training_data   -- list of (x, y) pairs; y is a one-hot column
        epochs          -- full passes over the training data
        mini_batch_size -- samples per gradient update
        eta             -- learning rate
        lmbda           -- L2 regularization strength (0.0 disables it)
        test_data       -- optional (x, y) pairs with integer labels; when
                           given, per-epoch cost/accuracy on it is printed
        """
        # NOTE(review): assumes training_data/test_data support len() and
        # in-place shuffle, i.e. are lists — materialize generators first.
        n = len(training_data)
        for epoch in range(epochs):
            # Shuffle so every epoch sees differently composed batches.
            random.shuffle(training_data)
            mini_batches = [training_data[k:k + mini_batch_size]
                            for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta, lmbda, n)
            print("Epoch {0} complete".format(epoch))
            cost = self.total_cost(training_data, lmbda)
            print("Cost on training data: {}".format(cost))
            accuracy = self.accuracy(training_data, convert=True)
            print("Accuracy on training data: {} / {}".format(accuracy, n))
            if test_data:
                cost = self.total_cost(test_data, lmbda, convert=True)
                print("Cost on test data: {}".format(cost))
                accuracy = self.accuracy(test_data)
                print("Accuracy on test data: {} / {}".format(
                    accuracy, len(test_data)))

    def update_mini_batch(self, mini_batch, eta, lmbda, n):
        """Apply one gradient-descent step for a single mini-batch.

        ``n`` is the total training-set size, needed for the L2 term.
        """
        # Gradient accumulators, one array per layer.
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.update(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # L2 weight decay: w <- (1 - eta*lmbda/n)*w - eta * grad_average.
        self.weights = [(1 - eta * (lmbda / n)) * w
                        - (eta / len(mini_batch)) * nw
                        for w, nw in zip(self.weights, nabla_w)]
        # Biases are not regularized.
        self.biases = [b - (eta / len(mini_batch)) * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def update(self, x, y):
        """Backpropagation: return (nabla_b, nabla_w) for one sample."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        activation = x
        activations = [x]   # a = sigmoid(z) per layer, input included
        zs = []             # z = w.a + b per layer
        # Forward pass, recording z and a of every layer.
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # Output-layer error via the cost's delta (for cross-entropy this
        # is a - y, avoiding the vanishing sigmoid'(z) factor).
        delta = (self.cost).delta(zs[-1], activations[-1], y)
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].T)
        # Propagate the error backwards through the hidden layers.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l + 1].T, delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l - 1].T)
        return (nabla_b, nabla_w)

    def accuracy(self, data, convert=False):
        """Count correctly classified samples in ``data``.

        convert=True  -- labels are one-hot vectors (training layout)
        convert=False -- labels are plain digits (test layout)
        """
        if convert:
            results = [(np.argmax(self.feedforward(x)), np.argmax(y))
                       for (x, y) in data]
        else:
            results = [(np.argmax(self.feedforward(x)), y)
                       for (x, y) in data]
        return sum(int(x == y) for (x, y) in results)

    def total_cost(self, data, lmbda, convert=False):
        """Average cost over ``data`` plus the L2 regularization term.

        convert=True -- labels are integers and must be one-hot encoded
        first (test/validation layout); requires mnist_loader in scope.
        """
        cost = 0.0
        for x, y in data:
            a = self.feedforward(x)
            if convert:
                y = mnist_loader.vectorized_result(y)
            cost += self.cost.fn(a, y) / len(data)
        cost += 0.5 * (lmbda / len(data)) * sum(
            np.linalg.norm(w) ** 2 for w in self.weights)
        return cost

    def cost_derivative(self, output_activation, y):
        """dC/da for the quadratic cost (kept for compatibility)."""
        return (output_activation - y)

    def save(self, filename):
        """Serialize sizes, weights, biases and cost name to JSON."""
        data = {"sizes": self.sizes,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases],
                "cost": str(self.cost.__name__)}
        # 'with' guarantees the file is closed even if dump() raises.
        with open(filename, "w") as f:
            json.dump(data, f)

    @staticmethod
    def load(filename):
        """Rebuild a Network from a file written by ``save``.

        Declared @staticmethod: the original lacked both ``self`` and the
        decorator, so calling it on an instance would have passed the
        instance as ``filename``. ``Network.load(path)`` keeps working.
        """
        with open(filename, "r") as f:
            data = json.load(f)
        # Resolve the saved cost-class name in this module.
        cost = getattr(sys.modules[__name__], data["cost"])
        net = Network(data["sizes"], cost=cost)
        net.weights = [np.array(w) for w in data["weights"]]
        net.biases = [np.array(b) for b in data["biases"]]
        return net
# Sigmoid activation and its derivative (elementwise on arrays).
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + e^{-z})."""
    return 1.0 / (1.0 + np.exp(-z))


def sigmoid_prime(z):
    """Derivative of the sigmoid: sigma(z) * (1 - sigma(z)).

    Near zero for |z| > ~4, which is the source of vanishing gradients.
    """
    s = sigmoid(z)
    return s * (1.0 - s)
if __name__ == "__main__":
    import mnist_loader
    # Per the accuracy()/total_cost() conventions above: training labels
    # are one-hot vectors, test labels are plain digit classes 0-9.
    training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
    # 28*28 = 784 input pixels, 30 hidden neurons, 10 output classes.
    net = Network([784, 30, 10])
    # 30 epochs, mini-batch size 10, learning rate 0.5.
    net.SGD(training_data, 30, 10, 0.5, test_data=test_data)
运行结果如下图所示:
可以看到,改进后模型的效果显然比不采取任何提升学习效率措施时要好很多。