莫煩tensorflow Batch Normalization總結與tf.identity()/EMA功能

代碼來源於莫煩tensorflow Batch Normalization一節。我添加了一些註釋（自己的理解），添加了一些函數的功能介紹。

# 對輸入的分散的數據要統一數據規格，這樣神經網絡可以更好的學習數據的規律，因爲數據分佈對神經網絡是有很大影響的，比如：
# w = 0.1, x1 = 1, x2 = 20 --> w*x1 = 0.1, w*x2 = 2 --> tanh activation function --> tanh(w*x1)接近0，tanh(w*x2)接近1，
#也就是說隨着x增大，tanh(w*x)會接近1或者-1（進入飽和區），激活函數對較大的x的變化不再敏感.這時候可以對輸入數據進行Normalization.

#但是這種情況不僅發生在輸入層，所以引入Batch Normalization,在每個batch前向傳播時，都要在全連接層與激活函數之間進行Normalization,保持數據分佈集中在敏感區域.SGD
# Batch Normalization 包括標準化工序(x' = (x-u)/sqrt(s+epsilon)),s是方差,u是均值,epsilon是防止除以零的小常數.
# 反標準化工序 y = r * x' + b = BN(x')(參數r, b) 自動學習，用於與標準化工序共同調節數據分佈.r:擴展參數,b:平移參數

#=================tf.identity()函數作用與control_dependencies===================
# 第一種：
# x_plus_1 = tf.assign_add(x, 1)
#
# #control_dependencies的意義是，在執行with包含的內容（在這裏就是 y = x）前，
# #先執行control_dependencies參數中的內容（在這裏就是 x_plus_1），這裏的解釋不準確，先接着看。。。
# with tf.control_dependencies([x_plus_1]):
#     y = x
# 第二種：
# x_plus_1 = tf.assign_add(x, 1)
# with tf.control_dependencies([x_plus_1]):
#     y = tf.identity(x)  #修改部分
#
# 結論：
# 對於control_dependencies這個管理器，只有當裏面的操作是一個op時，纔會生效，也就是先執行傳入的參數op，再執行裏面的op。
# 而y=x僅僅是tensor的一個簡單賦值，不是定義的op，所以在圖中不會形成一個節點，這樣該管理器就失效了。
# tf.identity是返回一個一模一樣新的tensor的op，這會增加一個新節點到graph中，這時control_dependencies就會生效，所以第二種情況的輸出符合預期。
#
#=========================================================================

#=============================EMA=========================================
# 指數滑動平均(ExponentialMovingAverage)EMA被廣泛的應用在深度學習的BN層中，RMSprop，adadelta，adam等梯度下降方法。
# 1. tensorflow中提供了tf.train.ExponentialMovingAverage來實現滑動平均模型，他使用指數衰減來計算變量的移動平均值。
# tf.train.ExponentialMovingAverage.__init__(self, decay, num_updates=None, zero_debias=False, name="ExponentialMovingAverage"):
# decay是衰減率在創建ExponentialMovingAverage對象時，需指定衰減率(decay)，用於控制模型的更新速度。decay設置爲接近1的值比較合理，通常爲：0.999,0.9999。
#
# 2. 影子變量( shadow variable)的初始值與訓練變量(variable)的初始值相同。當運行變量更新時，每個影子變量都會更新爲：
# shadow variable = decay * shadow variable + (1-decay) * variable
# num_updates是ExponentialMovingAverage提供用來動態設置decay的參數，當初始化時提供了參數，即不爲none時，每次的衰減率是：
# decay = min{decay, (1+num_updates)/(10+num_updates)}
# 3. apply()方法添加了訓練變量的影子副本，並保持了其影子副本中訓練變量的移動平均值操作。在每次訓練之後調用此操作，更新移動平均值。
# 4. average()和average_name()方法可以獲取影子變量及其名稱。
# 5. https://blog.csdn.net/qq_14845119/article/details/78767544 以上信息來源於這個網址。
#==========================================================================

#代碼可運行，如下。

# -*- coding: utf-8 -*-

"""

Know more, visit my Python tutorial page: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou

Dependencies:
tensorflow: 1.1.0
matplotlib
numpy
"""
#import packages

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# define hyperparameter
ACTIVATION = tf.nn.tanh # author's note: with relu the loss curve eventually disappears because the data distribution becomes extremely uneven (possibly only a single data point's loss remains); tanh works well
N_LAYERS = 7 # number of hidden layers
N_HIDDEN_UNITS = 30 # number of units in each hidden layer

def built_net(xs, ys, norm):
    """Build a fully connected regression network, optionally batch-normalized.

    The network has N_LAYERS hidden layers of N_HIDDEN_UNITS units each, using
    ACTIVATION, followed by a linear single-unit output layer.

    Args:
        xs: 2-D input tensor whose second (feature) dimension is statically
            known; it is read through ``get_shape()[1].value``.
        ys: target tensor compared element-wise against the prediction.
        norm: when truthy, batch-normalize the network input and the
            pre-activation of every hidden layer. The output layer is never
            normalized.

    Returns:
        ``[train_op, cost, layers_inputs]`` where ``layers_inputs`` holds the
        (possibly normalized) network input followed by the output of every
        hidden layer.
    """
    def batch_norm(values, size):
        """Normalize `values` with its own batch statistics, then scale and shift."""
        # Mean and variance of every column, taken over the batch axis.
        fc_mean, fc_var = tf.nn.moments(values, axes=[0])
        scale = tf.Variable(tf.ones([size]))   # learnable scale parameter (r)
        shift = tf.Variable(tf.zeros([size]))  # learnable shift parameter (b)
        epsilon = 0.001

        # Register moving averages of the batch statistics. The control
        # dependency forces the averages to be updated whenever the returned
        # mean/var are evaluated; tf.identity is required because a plain
        # Python assignment would not add a graph node to attach the
        # dependency to. The normalization below still uses the current batch
        # statistics; the averaged values are not read in this function.
        ema = tf.train.ExponentialMovingAverage(decay=0.5)
        ema_apply_op = ema.apply([fc_mean, fc_var])
        with tf.control_dependencies([ema_apply_op]):
            mean, var = tf.identity(fc_mean), tf.identity(fc_var)

        # Equivalent to:
        #   values = (values - mean) / tf.sqrt(var + epsilon)
        #   values = values * scale + shift
        return tf.nn.batch_normalization(values, mean, var, shift, scale, epsilon)

    def add_layer(inputs, in_size, out_size, activation_function=None, norm=False):
        """Add one dense layer; normalize before the activation when `norm` is set."""
        Weights = tf.Variable(tf.random_normal([in_size, out_size], mean=0, stddev=1.))
        biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
        Wx_plus_b = tf.matmul(inputs, Weights) + biases

        # Normalization sits between the dense output and the activation input.
        if norm:
            Wx_plus_b = batch_norm(Wx_plus_b, out_size)

        if activation_function is None:
            return Wx_plus_b
        return activation_function(Wx_plus_b)

    tf.set_random_seed(1)
    np.random.seed(1)

    if norm:
        # Normalize the raw input too; the parameter size follows the input's
        # feature count instead of being fixed to 1.
        xs = batch_norm(xs, xs.get_shape()[1].value)

    # layers_inputs[k] is the input fed to hidden layer k.
    layers_inputs = [xs]
    for _ in range(N_LAYERS):
        layer_input = layers_inputs[-1]
        in_size = layer_input.get_shape()[1].value
        layers_inputs.append(
            add_layer(layer_input, in_size, N_HIDDEN_UNITS, ACTIVATION, norm))

    # One output value per sample. The input width is taken from the last
    # layer rather than hard-coded, so it stays correct if the hyperparameters
    # change.
    last_size = layers_inputs[-1].get_shape()[1].value
    prediction = add_layer(layers_inputs[-1], last_size, 1, activation_function=None)

    cost = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
    train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
    return [train_op, cost, layers_inputs]

#畫hist，即各層輸入數據分佈圖
def plot_his(inputs, inputs_norm):
    """Draw histograms of every layer's input as a two-row grid.

    The top row shows `inputs` (network without normalization), the bottom row
    shows `inputs_norm` (network with normalization). Each element must
    provide ``ravel()``.
    """
    # NOTE(review): SOURCE lost its indentation; the title is assumed to be set
    # once per row and draw/pause once per call -- confirm against upstream.
    for row_idx, row in enumerate([inputs, inputs_norm]):
        n_cols = len(row)
        for col_idx, values in enumerate(row):
            plt.subplot(2, n_cols, row_idx * n_cols + (col_idx + 1))
            plt.cla()
            # The first column gets a wider range than the remaining columns.
            the_range = (-7, 10) if col_idx == 0 else (-1, 1)
            plt.hist(values.ravel(), bins=15, range=the_range, color='#FF5733')
            plt.yticks(())
            # Only the bottom row shows x tick labels.
            plt.xticks(the_range if row_idx == 1 else ())
            axes = plt.gca()
            for side in ('right', 'top'):
                axes.spines[side].set_color('none')
        plt.title("%s normalizing" % ("Without" if row_idx == 0 else "With"))
    plt.draw()
    plt.pause(0.01)

if __name__ == '__main__':
    # Fake data: a noisy parabola y = x^2 - 5 sampled at 500 points in [-7, 10].
    tf.set_random_seed(1)
    np.random.seed(1)
    x_data = np.linspace(-7, 10, 500)[:, np.newaxis]
    # np.random.normal(loc, scale, size): scale=8 is the standard deviation.
    noise = np.random.normal(0, 8, x_data.shape)
    y_data = np.square(x_data) - 5 + noise

    # Placeholders with shape (batch_size, n_features=1).
    xs = tf.placeholder(tf.float32, [None, 1])
    ys = tf.placeholder(tf.float32, [None, 1])

    # Two networks fed with the same data: one plain, one batch-normalized.
    train_op, cost, layers_inputs = built_net(xs, ys, norm=False)
    train_op_norm, cost_norm, layers_inputs_norm = built_net(xs, ys, norm=True)

    cost_hist = []
    cost_hist_norm = []
    record_step = 5
    feed = {xs: x_data, ys: y_data}

    # The context manager closes the session even if training raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        plt.ion()
        plt.figure(figsize=(7, 3))
        for i in range(251):
            # One full-batch training step for each network.
            sess.run(train_op, feed_dict=feed)
            sess.run(train_op_norm, feed_dict=feed)

            if i % 50 == 0:
                # Plot the distribution of every layer's input.
                all_inputs, all_inputs_norm = sess.run(
                    [layers_inputs, layers_inputs_norm], feed_dict=feed)
                plot_his(all_inputs, all_inputs_norm)

            if i % record_step == 0:
                # Record the cost of both networks.
                cost_hist.append(sess.run(cost, feed_dict=feed))
                cost_hist_norm.append(sess.run(cost_norm, feed_dict=feed))

    plt.ioff()
    plt.figure()
    # Loss curves with and without batch normalization; each x-axis is derived
    # from the length of the series it belongs to.
    plt.plot(np.arange(len(cost_hist)) * record_step, np.array(cost_hist), label='no BN')
    plt.plot(np.arange(len(cost_hist_norm)) * record_step, np.array(cost_hist_norm), label='BN')
    plt.legend()
    # The figure is written to disk; the plt.show() call is disabled in SOURCE.
    plt.savefig('cost-tanh.png')

莫煩tensorflow Batch Normalization總結與tf.identity()/EMA功能

tensorflow - mnist入門實例

Regression,model select,gradient descent，overfitting,regularization學習入門

python 垃圾回收機制

python 深拷貝與淺拷貝理解

機器學習,模型誤差分析,error,bias,variance

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結