git詳細代碼倉庫: https://github.com/justDoForever/deep_learning/digital_recognition_fc.py(python 2.7)
理論分析:https://www.zybuluo.com/hanbingtao/note/485480
往期回顧
在前面的文章中,我們介紹了全連接神經網絡,以及它的訓練和使用。我們用它來識別了手寫數字,然而,這種結構的網絡對於圖像識別任務來說並不是很合適。本文將要介紹一種更適合圖像、語音識別任務的神經網絡結構——卷積神經網絡(Convolutional Neural Network, CNN)。說卷積神經網絡是最重要的一種神經網絡也不爲過,它在最近幾年大放異彩,幾乎所有圖像、語音識別領域的重要突破都是卷積神經網絡取得的,比如谷歌的GoogLeNet、微軟的ResNet等,打敗李世石的AlphaGo也用到了這種網絡。本文將詳細介紹卷積神經網絡以及它的訓練算法,以及動手實現一個簡單的卷積神經網絡。
一個新的激活函數——Relu
最近幾年卷積神經網絡中,激活函數往往不選擇sigmoid或tanh函數,而是選擇relu函數。Relu函數的定義是: $f(x) = \max(0, x)$。
Relu函數圖像如下圖所示:
Relu函數作爲激活函數,有下面幾大優勢:
- 速度快 和sigmoid函數需要計算指數和倒數相比,relu函數其實就是一個max(0,x),計算代價小很多。
- 減輕梯度消失問題 回憶一下計算梯度的公式 $\nabla = \sigma'(x)\,\delta$。其中,$\sigma'(x)$ 是sigmoid函數的導數。在使用反向傳播算法進行梯度計算時,每經過一層sigmoid神經元,梯度就要乘上一個 $\sigma'(x)$。從下圖可以看出,$\sigma'(x)$ 的最大值是1/4。因此,每乘一個 $\sigma'(x)$ 會導致梯度越來越小,這對於深層網絡的訓練是個很大的問題。而relu函數的導數是1,不會導致梯度變小。當然,激活函數僅僅是導致梯度減小的一個因素,但無論如何在這方面relu的表現強於sigmoid。使用relu激活函數可以讓你訓練更深的網絡。
# encoding:utf-8
#手寫實現一個簡單的卷積神經網絡
#一些 工具函數
import numpy as np
from activator import ReluActivator, IdentityActivator
def padding(input_array, zero_padding):
    """Surround an array with `zero_padding` rings of zeros.

    Supports a 2-D array (height, width) and a 3-D array
    (depth, height, width); for 3-D input only the last two axes are
    padded.  The padded result is a fresh float array (np.zeros default
    dtype); with zero_padding == 0 the input is returned unchanged.

    :param input_array: 2-D or 3-D numpy array
    :param zero_padding: number of zero rings to add on each side
    :raises ValueError: if input_array is neither 2-D nor 3-D
    """
    if zero_padding == 0:
        return input_array
    if input_array.ndim == 2:
        input_height, input_width = input_array.shape
        padded_array = np.zeros((input_height + 2 * zero_padding,
                                 input_width + 2 * zero_padding))
        padded_array[zero_padding:zero_padding + input_height,
                     zero_padding:zero_padding + input_width] = input_array
        return padded_array
    if input_array.ndim == 3:
        depth, height, width = input_array.shape
        padded_array = np.zeros((depth,
                                 height + 2 * zero_padding,
                                 width + 2 * zero_padding))
        padded_array[:, zero_padding:zero_padding + height,
                     zero_padding:zero_padding + width] = input_array
        return padded_array
    # Previously this fell through and silently returned None.
    raise ValueError('padding() supports only 2-D or 3-D input, got ndim=%d'
                     % input_array.ndim)
def get_patch(input_array, i, j, filter_height, filter_width, stride):
    """Return the window of input_array that the filter covers at
    output position (i, j).

    The window's top-left corner is (i * stride, j * stride); for a
    3-D input the full depth is kept and only height/width are sliced.
    """
    row = i * stride
    col = j * stride
    if input_array.ndim == 2:
        return input_array[row:row + filter_height, col:col + filter_width]
    if input_array.ndim == 3:
        return input_array[:, row:row + filter_height, col:col + filter_width]
def conv(filter, input_array, bias, stride, output_array):
    """Cross-correlate `filter` over `input_array` and write the result
    (plus `bias`) into `output_array` in place.

    Iterates over output coordinates; at each position the matching
    input window is multiplied element-wise with the filter and summed
    (works for both 2-D and 3-D windows).
    """
    out_height, out_width = output_array.shape[0], output_array.shape[1]
    k_height, k_width = filter.shape[-2], filter.shape[-1]
    for row in range(out_height):
        for col in range(out_width):
            window = get_patch(input_array, row, col, k_height, k_width, stride)
            # element-wise product summed over every axis, then biased
            output_array[row][col] = (window * filter).sum() + bias
def element_wise_op(output_array, activator):
    """Apply `activator` to every element of `output_array`, in place."""
    for idx in np.ndindex(output_array.shape):
        output_array[idx] = activator(output_array[idx])
class Filter(object):
    """One convolution kernel: weights, bias, their gradients, and a
    plain gradient-descent update step."""

    def __init__(self, depth, height, width):
        # Standard initialization: small uniform random weights, zero bias.
        self.weights = np.random.uniform(-1e-4, 1e-4, (depth, height, width))
        self.bias = 0
        # One gradient slot per parameter, same shape as the weights.
        self.weights_grad = np.zeros(self.weights.shape)
        self.bias_grad = 0

    def get_weights(self):
        return self.weights

    def get_bias(self):
        return self.bias

    def update(self, learning_rate):
        # Vanilla SGD: step against the gradient.  The weights are
        # rebound to a new array (not mutated in place), matching the
        # original behavior.
        self.weights = self.weights - learning_rate * self.weights_grad
        self.bias -= learning_rate * self.bias_grad

    def __repr__(self):
        # Debug-friendly dump of the parameters.
        return 'filter weights:\n%s bias\n%s' % (repr(self.weights),
                                                 repr(self.bias))
class ConvLayer(object):
    """A convolutional layer: forward pass, backpropagation of the
    sensitivity (error) map, weight-gradient computation, and SGD update.

    Backward here only *computes* gradients; `update()` applies them.
    """

    def __init__(self, input_height, input_width, channel_number,
                 filter_height, filter_width, filter_number,
                 zero_padding, stride, activator, learning_rate):
        """
        :param input_height: height of the input feature map
        :param input_width: width of the input feature map
        :param channel_number: input depth; each filter has this depth
        :param filter_height: kernel height
        :param filter_width: kernel width
        :param filter_number: number of filters == output depth; each
            filter extracts one kind of feature
        :param zero_padding: rings of zeros added around the input
        :param stride: filter step
        :param activator: object exposing forward() and backward()
        :param learning_rate: SGD step size
        """
        self.input_height = input_height
        self.input_width = input_width
        self.channel_number = channel_number
        self.filter_height = filter_height
        self.filter_width = filter_width
        self.filter_number = filter_number
        self.zero_padding = zero_padding
        self.stride = stride
        self.activator = activator
        self.learning_rate = learning_rate
        # // keeps the sizes integral under both Python 2 and Python 3
        # (the original used py2-only integer `/`).
        self.output_array_height = (input_height - filter_height
                                    + 2 * zero_padding) // stride + 1
        self.output_array_width = (input_width - filter_width
                                   + 2 * zero_padding) // stride + 1
        self.output_array = np.zeros((filter_number,
                                      self.output_array_height,
                                      self.output_array_width))
        self.filters = [Filter(channel_number, filter_height, filter_width)
                        for _ in range(filter_number)]

    def forward(self, input_array):
        """Compute the layer output into self.output_array.

        :param input_array: input sample, shape (channels, H, W)
        """
        self.input_array = input_array
        self.padded_input_array = padding(input_array, self.zero_padding)
        # Eq. 1: a_ij = f(sum(w_mn * x_(i+m),(j+n)) + w_b)
        for f in range(self.filter_number):
            flt = self.filters[f]
            # convolve: weights, padded input, bias, stride, output slice
            conv(flt.get_weights(), self.padded_input_array,
                 flt.get_bias(), self.stride, self.output_array[f])
            # element-wise activation of this filter's output map
            element_wise_op(self.output_array[f], self.activator.forward)

    def backward(self, input_array, sensitivity_map, activator):
        """Compute the error term for the previous layer (into
        self.delta_array) and the weight gradients (into each
        Filter.weights_grad).  Unlike a full training step, this does
        NOT update the weights — call update() for that.
        """
        self.forward(input_array)
        self.bp_sensitivity_map(sensitivity_map, activator)
        self.bp_gradient(sensitivity_map)

    def update(self):
        """Apply one gradient-descent step to every filter."""
        for flt in self.filters:
            flt.update(self.learning_rate)

    def bp_gradient(self, sensitivity_map):
        """Compute each filter's weight/bias gradients from the layer's
        sensitivity map (one error map per filter)."""
        expanded_array = self.expand_sensitivity_map(sensitivity_map)
        for f in range(self.filter_number):
            flt = self.filters[f]
            # The gradient of each weight is the convolution of the
            # (padded) previous-layer input with this filter's error map,
            # computed per input channel.
            for d in range(flt.weights_grad.shape[0]):
                conv(expanded_array[f], self.padded_input_array[d],
                     0, 1, flt.weights_grad[d])
            # Bias gradient is the sum of the error map.
            flt.bias_grad = expanded_array[f].sum()

    def expand_sensitivity_map(self, sensitivity_map):
        """Expand a strided sensitivity map back to the stride-1 size by
        scattering each entry to (i*stride, j*stride); the gaps stay 0."""
        depth = sensitivity_map.shape[0]
        expand_height = (self.input_height - self.filter_height
                         + 2 * self.zero_padding) + 1
        expand_width = (self.input_width - self.filter_width
                        + 2 * self.zero_padding) + 1
        expand_array = np.zeros((depth, expand_height, expand_width))
        for i in range(sensitivity_map.shape[1]):
            for j in range(sensitivity_map.shape[2]):
                expand_array[:, i * self.stride, j * self.stride] = \
                    sensitivity_map[:, i, j]
        return expand_array

    def bp_sensitivity_map(self, sensitivity_map, activator):
        """Compute the previous layer's error term into self.delta_array."""
        # Restore the map to stride 1 first.
        expanded_array = self.expand_sensitivity_map(sensitivity_map)
        # Pad so that a "full" convolution reproduces the input size (Eq. 2).
        expanded_width = expanded_array.shape[2]
        zp = (self.input_width + self.filter_width - 1 - expanded_width) // 2
        padded_array = padding(expanded_array, zp)
        self.delta_array = self.create_delta_array()
        for f in range(self.filter_number):
            flt = self.filters[f]
            # Rotate every kernel slice by 180 degrees.  A list
            # comprehension replaces the py2-only np.array(map(...)).
            flipped = np.array([np.rot90(w, 2) for w in flt.weights])
            delta_array = self.create_delta_array()
            # Convolve the padded error map of THIS filter with each
            # flipped kernel channel.
            for d in range(self.channel_number):
                conv(flipped[d], padded_array[f], 0, 1, delta_array[d])
            # With multiple filters the sensitivity maps are summed.
            self.delta_array += delta_array
        # Multiply element-wise by the previous layer's activation
        # derivative.
        derivative_array = np.array(self.input_array)
        element_wise_op(derivative_array, activator.backward)
        self.delta_array *= derivative_array

    def create_delta_array(self):
        """Zero array shaped like the previous layer's output."""
        return np.zeros((self.channel_number,
                         self.input_height, self.input_width))
def init_test():
    """Build the conv-layer test fixture.

    Returns (a, b, c):
      a -- 3x5x5 input volume,
      b -- 2x3x3 array (used as a sensitivity map by test_bp),
      c -- a ConvLayer(5,5, depth 3, two 3x3 filters, padding 1,
           stride 2, identity activation) with hand-set weights/biases.
    """
    a = np.array([
        [[0, 1, 1, 0, 2],
         [2, 2, 2, 2, 1],
         [1, 0, 0, 2, 0],
         [0, 1, 1, 0, 0],
         [1, 2, 0, 0, 2]],
        [[1, 0, 2, 2, 0],
         [0, 0, 0, 2, 0],
         [1, 2, 1, 2, 1],
         [1, 0, 0, 0, 0],
         [1, 2, 1, 1, 1]],
        [[2, 1, 2, 0, 0],
         [1, 0, 0, 1, 0],
         [0, 2, 1, 0, 1],
         [0, 1, 2, 2, 2],
         [2, 1, 0, 0, 1]],
    ])
    b = np.array([
        [[0, 1, 1],
         [2, 2, 2],
         [1, 0, 0]],
        [[1, 0, 2],
         [0, 0, 0],
         [1, 2, 1]],
    ])
    c = ConvLayer(5, 5, 3, 3, 3, 2, 1, 2, IdentityActivator(), 0.001)
    c.filters[0].weights = np.array(
        [[[-1, 1, 0],
          [0, 1, 0],
          [0, 1, 1]],
         [[-1, -1, 0],
          [0, 0, 0],
          [0, -1, 0]],
         [[0, 0, -1],
          [0, 1, 0],
          [1, -1, -1]]], dtype=np.float64)
    c.filters[0].bias = 1
    c.filters[1].weights = np.array(
        [[[1, 1, -1],
          [-1, -1, 1],
          [0, -1, 1]],
         [[0, 1, 0],
          [-1, 0, -1],
          [-1, 1, 0]],
         [[-1, 0, 0],
          [-1, 0, 1],
          [-1, 0, 0]]], dtype=np.float64)
    c.filters[1].bias = 0
    return a, b, c
def gradient_check():
    """Numeric gradient check for a one-conv-layer network.

    Uses E = sum(output) as the error function, so dE/d(output) is all
    ones; for each weight it compares the backprop gradient against the
    central difference (E(w+eps) - E(w-eps)) / (2*eps).
    """
    # Error function: plain sum, so the sensitivity map is all ones.
    error_function = lambda o: o.sum()
    # Fixture: input, (unused) map, and a configured layer.
    a, b, c1 = init_test()
    # Forward pass.
    c1.forward(a)
    # Identity activation + sum error => sensitivity map of ones.
    sensitivity_map = np.ones(c1.output_array.shape, dtype=np.float64)
    # Backprop only the gradients (weights are not updated here).
    c1.bp_gradient(sensitivity_map)
    # NOTE: the original code used `epision = 10e-4` (i.e. 1e-3) while
    # its comments said 10^-4; 1e-4 matches the stated intent.
    epsilon = 1e-4
    # Perturb every weight and compare analytic vs numeric gradient.
    for flt in c1.filters:
        for d in range(flt.weights_grad.shape[0]):
            for i in range(flt.weights_grad.shape[1]):
                for j in range(flt.weights_grad.shape[2]):
                    weight = flt.weights[d, i, j]
                    # E(w + eps)
                    flt.weights[d, i, j] = weight + epsilon
                    c1.forward(a)
                    err1 = error_function(c1.output_array)
                    # E(w - eps)
                    flt.weights[d, i, j] = weight - epsilon
                    c1.forward(a)
                    err2 = error_function(c1.output_array)
                    # Central-difference approximation.
                    calc_gradient = (err1 - err2) / (2 * epsilon)
                    print("weights[%d,%d,%d]:actual_gradient-calc_gradient: %f-%f"
                          % (d, i, j, flt.weights_grad[d, i, j], calc_gradient))
                    # Restore the weight before moving on.
                    flt.weights[d, i, j] = weight
def test():
    """Smoke test: run the conv layer forward and print its output.

    print() with a single argument is valid in both Python 2 and 3
    (the original used the py2-only print statement).
    """
    a, b, c1 = init_test()
    c1.forward(a)
    print(c1.output_array)
def test_bp():
    """Smoke test: backward pass + one update step, then print both
    filters (py2/py3-compatible print form)."""
    a, b, c1 = init_test()
    # b doubles as the sensitivity map here (shape matches the output).
    c1.backward(a, b, IdentityActivator())
    c1.update()
    print(c1.filters[0])
    print(c1.filters[1])
def get_max_index(patch):
    """Return (row, col) of the largest element of a 2-D patch.

    np.argmax returns the FIRST occurrence on ties, matching the
    original strict `>` comparison loop.  Also avoids the original's
    shadowing of the builtin `max`.
    """
    max_i, max_j = np.unravel_index(np.argmax(patch), patch.shape)
    return max_i, max_j
class Maxpool(object):
    """Max-pooling layer: downsamples each channel independently.

    The pooling window is 2-D (depth is preserved); backward routes
    each output delta to the input position that held the window max.
    """

    def __init__(self, input_height, input_width, channel_number,
                 filter_height, filter_width, stride):
        """
        :param input_height: input feature-map height
        :param input_width: input feature-map width
        :param channel_number: number of channels (unchanged by pooling)
        :param filter_height: pooling window height
        :param filter_width: pooling window width
        :param stride: window step
        """
        self.input_height = input_height
        self.input_width = input_width
        self.channel_number = channel_number
        self.filter_height = filter_height
        self.filter_width = filter_width
        self.stride = stride
        # // keeps the sizes integral under both Python 2 and Python 3
        # (the original used py2-only integer `/`).
        self.output_height = (input_height - filter_height) // stride + 1
        self.output_width = (input_width - filter_width) // stride + 1
        self.output_array = np.zeros((self.channel_number,
                                      self.output_height,
                                      self.output_width), dtype=np.float64)

    def forward(self, input_array):
        """Fill self.output_array with the max of each window, per channel."""
        for d in range(self.channel_number):
            for i in range(self.output_height):
                for j in range(self.output_width):
                    self.output_array[d][i][j] = get_patch(
                        input_array[d], i, j,
                        self.filter_height, self.filter_width,
                        self.stride).max()

    def backward(self, input_array, sensitivity_map):
        """Compute self.delta_array: each output delta is scattered back
        to the argmax position of its pooling window; no weights, no
        activation — errors are only routed, not transformed."""
        self.delta_array = np.zeros(input_array.shape)
        for d in range(self.channel_number):
            for i in range(self.output_height):
                for j in range(self.output_width):
                    patch = get_patch(input_array[d], i, j,
                                      self.filter_height,
                                      self.filter_width, self.stride)
                    i_max, j_max = get_max_index(patch)
                    self.delta_array[d,
                                     i * self.stride + i_max,
                                     j * self.stride + j_max] = \
                        sensitivity_map[d, i, j]
def init_test_mpl():
    """Build the max-pool test fixture.

    Returns (a, b, mpl):
      a -- 2x4x4 input volume,
      b -- 2x2x2 sensitivity map,
      mpl -- a Maxpool(4, 4, 2 channels, 2x2 window, stride 2).
    """
    a = np.array(
        [[[1, 1, 2, 4],
          [5, 6, 7, 8],
          [3, 2, 1, 0],
          [1, 2, 3, 4]],
         [[0, 1, 2, 3],
          [4, 5, 6, 7],
          [8, 9, 0, 1],
          [3, 4, 5, 6]]], dtype=np.float64)
    b = np.array(
        [[[1, 2],
          [2, 4]],
         [[3, 5],
          [8, 2]]], dtype=np.float64)
    mpl = Maxpool(4, 4, 2, 2, 2, 2)
    return a, b, mpl
def test_maxpool():
    """Smoke test: max-pool forward pass, then print the output
    (py2/py3-compatible print form)."""
    a, b, mpl = init_test_mpl()
    mpl.forward(a)
    print(mpl.output_array)
def test_bp_maxpool():
    """Smoke test: max-pool backward pass (no gradients, no activation —
    it only routes the error terms back), then print everything.

    Fixes the original output typo "sensitity_map" and uses the
    py2/py3-compatible print form.
    """
    a, b, mpl = init_test_mpl()
    mpl.backward(a, b)
    print("input: \n%s\n sensitivity_map:\n %s\n delta_array:\n %s\n"
          % (a, b, mpl.delta_array))
if __name__ == '__main__':
    # Quick sanity print of a random 2x2 integer array
    # (py2/py3-compatible print form; dead exploration code removed).
    a = np.random.randint(2, 9, (2, 2))
    print(a)
    # Optional: numeric gradient check of the conv layer.
    # gradient_check()
    # Conv layer: forward then backward smoke tests.
    test()
    test_bp()
    # Max-pooling layer: forward then backward smoke tests.
    test_maxpool()
    test_bp_maxpool()