# 注意:sigmoid 函數與 tanh 函數的值域不同:
# sigmoid 將實數輸入映射到 (0, 1) 範圍內,tanh 則將實數輸入映射到 (-1, 1) 範圍內。
# 兩者各有優劣;相較之下,ReLU 激活函數可以大幅加快收斂速度,相比 tanh 函數,收斂速度可以加快約 6 倍。
# 從零實現 LSTM,其實與 RNN、GRU 的主要區別在於有門的設計
# 確定哪些是需要初始化模型參數:
# 核心公式就是以下六個
# I_t = σ(X_t·W_xi + H_{t-1}·W_hi + b_i),  F_t = σ(X_t·W_xf + H_{t-1}·W_hf + b_f),  O_t = σ(X_t·W_xo + H_{t-1}·W_ho + b_o),  C~_t = tanh(X_t·W_xc + H_{t-1}·W_hc + b_c),
# C_t = F_t ⊙ C_{t-1} + I_t ⊙ C~_t,  H_t = O_t ⊙ tanh(C_t)
# 初始化參數有:Ht-1 的維度
# Model dimensions. The input and output widths both equal the vocabulary
# size; the hidden state is 256 units wide.
vocab_size = 1027
num_inputs = vocab_size
num_hiddens = 256
num_outputs = vocab_size
class LSTM(object):
    """From-scratch LSTM cell followed by a dense output layer.

    For every time step the forward pass evaluates::

        I_t  = sigmoid(X_t @ Wxi + H_{t-1} @ Whi + bi)   # input gate
        F_t  = sigmoid(X_t @ Wxf + H_{t-1} @ Whf + bf)   # forget gate
        O_t  = sigmoid(X_t @ Wxo + H_{t-1} @ Who + bo)   # output gate
        C~_t = tanh(X_t @ Wxc + H_{t-1} @ Whc + bc)      # candidate cell
        C_t  = F_t * C_{t-1} + I_t * C~_t
        H_t  = O_t * tanh(C_t)
        Y_t  = H_t @ Whq + bq

    All weight shapes are taken from the module-level ``vocab_size`` and
    ``num_hiddens`` constants.
    """

    def __init__(self, hiden_dim, params=None):
        """Create the trainable variables, or adopt pretrained ones.

        Args:
            hiden_dim: Accepted for interface compatibility but not read;
                the hidden width comes from the module-level ``num_hiddens``.
            params: Optional pretrained recurrent parameters, in the order
                ``(Wxi, Whi, bi, Wxf, Whf, bf, Wxo, Who, bo, Wxc, Whc, bc)``.
                Any falsy value (``None``, empty sequence) selects random
                initialization instead.

        The output layer (``Whq``, ``bq``) is not part of ``params`` and is
        therefore always freshly initialized.
        """
        if params:
            # Attribute names must match the ones read in ``net``; the cell
            # weights were previously stored as ``wxc``/``whc`` and were
            # therefore invisible to the forward pass.
            (self.Wxi, self.Whi, self.bi,
             self.Wxf, self.Whf, self.bf,
             self.Wxo, self.Who, self.bo,
             self.Wxc, self.Whc, self.bc) = params
        else:
            # Creation order is kept stable (all weights, then all biases)
            # so a seeded run draws the same random values as before.
            self.Wxi = self._ones(shape=(vocab_size, num_hiddens))
            self.Whi = self._ones(shape=(num_hiddens, num_hiddens))
            self.Wxf = self._ones(shape=(vocab_size, num_hiddens))
            self.Whf = self._ones(shape=(num_hiddens, num_hiddens))
            self.Wxo = self._ones(shape=(vocab_size, num_hiddens))
            self.Who = self._ones(shape=(num_hiddens, num_hiddens))
            self.Wxc = self._ones(shape=(vocab_size, num_hiddens))
            self.Whc = self._ones(shape=(num_hiddens, num_hiddens))
            # Bias terms, one row vector per gate / candidate.
            self.bi = tf.Variable(tf.zeros([1, num_hiddens]), dtype=tf.float32)
            self.bf = tf.Variable(tf.zeros([1, num_hiddens]), dtype=tf.float32)
            self.bo = tf.Variable(tf.zeros([1, num_hiddens]), dtype=tf.float32)
            self.bc = tf.Variable(tf.zeros([1, num_hiddens]), dtype=tf.float32)

        # Output-layer parameters: needed by ``net`` on both branches above.
        self.Whq = self._ones(shape=(num_hiddens, vocab_size))
        self.bq = tf.Variable(tf.zeros([1, vocab_size]), dtype=tf.float32)

    def _ones(self, shape):
        """Return a float32 variable drawn from N(0, 0.01**2) with ``shape``.

        Despite the name, the values are small random normals, not ones.
        """
        return tf.Variable(
            tf.random.normal(shape=shape, stddev=0.01, mean=0, dtype=tf.float32)
        )

    def net(self, inputs, Ht, Ct):
        """Unroll the LSTM over ``inputs`` and return per-step outputs.

        Args:
            inputs: Iterable of per-time-step tensors; each one is reshaped
                to ``(-1, vocab_size)`` before use.
            Ht: Initial hidden state; must be matmul-compatible with the
                ``(num_hiddens, num_hiddens)`` recurrent weights.
            Ct: Initial cell state, combined element-wise with the gates.

        Returns:
            Tuple ``(outputs, Ht, Ct)``: the list of per-step output-layer
            results, plus the hidden and cell states after the last step.
        """
        outputs = []
        for X in inputs:
            x = tf.reshape(X, (-1, vocab_size))
            # Gates: I_t / F_t / O_t = sigmoid(X_t W_x* + H_{t-1} W_h* + b_*)
            It = tf.sigmoid(x @ self.Wxi + Ht @ self.Whi + self.bi)
            Ft = tf.sigmoid(x @ self.Wxf + Ht @ self.Whf + self.bf)
            Ot = tf.sigmoid(x @ self.Wxo + Ht @ self.Who + self.bo)
            # Candidate cell state.
            C_hat = tf.tanh(x @ self.Wxc + Ht @ self.Whc + self.bc)
            # Rebind Ct itself so the updated cell state feeds both H_t and
            # the next time step (it was previously written to an unused
            # local, leaving the cell state frozen at its initial value).
            Ct = Ft * Ct + It * C_hat
            Ht = Ot * tf.tanh(Ct)
            Y = tf.matmul(Ht, self.Whq) + self.bq
            outputs.append(Y)
        return outputs, Ht, Ct