Step 1: Load the training data and the English/German vocabularies
with self.graph.as_default():
    if is_training:
        self.x, self.y, self.num_batch = get_batch_data()
        self.y = tf.expand_dims(self.y, 0)
        self.x = tf.expand_dims(self.x, 0)
    else:
        self.x = tf.compat.v1.placeholder(tf.int32, shape=(None, hp.maxlen))  # maxlen: maximum number of words in a sentence
        self.y = tf.compat.v1.placeholder(tf.int32, shape=(None, hp.maxlen))

    # Define decoder inputs: concatenate along the last dimension.
    # 2 stands for <S>, the initial input of the decoder.
    self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
- tqdm: a fast, extensible Python progress bar. It adds a progress indicator to long-running loops; you only need to wrap any iterator as tqdm(iterator).
- tf.ones_like(tensor, dtype): creates a tensor with all elements set to 1. Given a tensor, this op returns a tensor of the same type and shape with every element set to 1; optionally, a new dtype can be specified for the returned tensor.
- tf.placeholder: a placeholder in the model while the graph is being built. No input data is passed in at this point; only the necessary memory is allocated. Once a session is created and the model is run, data is fed to the placeholder through the feed_dict argument.
- tf.expand_dims: without it, slicing self.y raises "ValueError: Index out of range using input dim 1; input has only 1 dims for '{{node strided_slice}}'". The data coming straight from the batch is one-dimensional: [ 129 1622 6 358 7 6349 3 0 0 0], while the embedding expects two-dimensional input, so

self.y = tf.expand_dims(self.y, 0)
self.x = tf.expand_dims(self.x, 0)

adds a dimension at axis 0:

x: [[ 129 1622 6 358 7 6349 3 0 0 0]]
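To see what the decoder-input line in Step 1 does on concrete data, here is a minimal runnable sketch (the token ids are invented for illustration; 2 = <S> and 0 = <PAD> as in the code above):

import tensorflow as tf

# Toy target batch; ids invented for illustration, 0 is <PAD>.
y = tf.constant([[4, 5, 6, 3, 0]])
# Shift right: drop the last token and prepend <S> (id 2).
decoder_inputs = tf.concat((tf.ones_like(y[:, :1]) * 2, y[:, :-1]), -1)
print(decoder_inputs)  # [[2 4 5 6 3]]

This is the usual teacher-forcing shift: at step t the decoder is fed the gold token from step t-1.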
Step 2: Embed x
with tf.compat.v1.variable_scope("encoder"):
    # Embedding
    self.enc = embedding(self.x,
                         vocab_size=len(de2idx),
                         num_units=hp.hidden_units,
                         zero_pad=True,  # keep the padding embedding fixed at zero
                         scale=True,
                         scope="enc_embed")

    ## Positional Encoding
    if hp.sinusoid:  # inject positional information
        self.enc += positional_encoding(self.x,
                                        num_units=hp.hidden_units,
                                        zero_pad=False,
                                        scale=False,
                                        scope="enc_pe")
    else:
        self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                              vocab_size=hp.maxlen,
                              num_units=hp.hidden_units,
                              zero_pad=False,
                              scale=False,
                              scope="enc_pe")

    ## Dropout
    self.enc = tf.compat.v1.layers.dropout(self.enc, rate=hp.dropout_rate,
                                           training=tf.convert_to_tensor(is_training))  # convert the Python bool to a tensor
- tf.convert_to_tensor: converts the given value to a tensor.
- Build a lookup_table of size (vocabulary size × hidden_units, a hyperparameter set to 512), and map each word in the sentence to a row of lookup_table.
enc:
[[[-0.30356652 -0.37562662 -0.2533778 ... -0.51391613 0.01039215
-0.2459533 ]
[-0.2541417 -0.39680204 0.10571449 ... 0.37125608 0.2542436
0.27990717]
[ 0.42342553 -0.5167016 0.13769649 ... 0.26156923 0.09989393
0.5327784 ]
...
[ 0. 0. 0. ... 0. 0.
0. ]
[ 0. 0. 0. ... 0. 0.
0. ]
[ 0. 0. 0. ... 0. 0.
0. ]]]
shape: Tensor("encoder/Shape:0", shape=(3,), dtype=int32)
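The embedding call above builds exactly this kind of lookup table. As a rough, self-contained sketch of what such an embedding function does (the zero_pad handling is an assumption modeled on common implementations, not this project's exact code):

import tensorflow as tf

def embedding_sketch(inputs, vocab_size, num_units, zero_pad=True):
    # lookup_table: one num_units-dimensional row per vocabulary id.
    lookup_table = tf.Variable(tf.random.normal((vocab_size, num_units)))
    if zero_pad:
        # Keep row 0 (<PAD>) all-zero; this is why the padded positions
        # in the printed enc above are exact zeros.
        lookup_table = tf.concat((tf.zeros((1, num_units)),
                                  lookup_table[1:, :]), axis=0)
    return tf.nn.embedding_lookup(lookup_table, inputs)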
- Adding positional information:
enc with position encoding:
[[[ 0.10200937 -0.38363728 0.24297762 ... 0.29200065 -0.566911
-0.0279227 ]
[-0.19171125 0.2297355 -0.39826143 ... 0.06599768 0.33916172
-0.08940344]
[-0.04868246 0.02391308 -0.2647874 ... -0.51808816 -0.04866951
-0.2427479 ]
...
[ 0.08749113 0.08159737 -0.0503442 ... -0.07358517 -0.07496481
0.06467593]
[ 0.08940188 0.02686641 -0.01230696 ... -0.01856501 -0.05984045
-0.08068989]
[-0.06354884 0.03901545 0.0311534 ... -0.03089441 -0.04416377
-0.08697766]]]
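When hp.sinusoid is set, positional_encoding presumably computes the fixed sinusoid table from "Attention Is All You Need": PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)). A NumPy sketch of that formula:

import numpy as np

def sinusoid_table(max_len, num_units):
    # angle(pos, i) = pos / 10000^(2*(i//2) / num_units)
    table = np.array([[pos / np.power(10000, 2.0 * (i // 2) / num_units)
                       for i in range(num_units)]
                      for pos in range(max_len)])
    table[:, 0::2] = np.sin(table[:, 0::2])  # even dimensions: sine
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd dimensions: cosine
    return table  # (max_len, num_units), added onto the word embeddings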
Step 3: Encoder multi-head self-attention, 8 heads, 6 blocks
Encoder input: Q = K = enc
for i in range(hp.num_blocks):
    with tf.compat.v1.variable_scope("num_blocks_{}".format(i)):
        ### Multihead Attention
        self.enc = multihead_attention(queries=self.enc,
                                       keys=self.enc,
                                       num_units=hp.hidden_units,
                                       num_heads=hp.num_heads,
                                       dropout_rate=hp.dropout_rate,
                                       is_training=is_training,
                                       causality=False)
        ### Feed Forward
        self.enc = feedforward(self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units])
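With num_heads = 8 and hidden_units = 512, each head works in a 512 / 8 = 64-dimensional subspace. A shapes-only sketch of the common split-and-fold trick inside multihead_attention (an assumption about its internals, not code from this file):

import tensorflow as tf

batch, T, num_units, num_heads = 2, 10, 512, 8
Q = tf.random.normal((batch, T, num_units))
# (batch, T, num_units) -> (batch*num_heads, T, num_units/num_heads)
Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)
print(Q_.shape)  # (16, 10, 64): heads folded into the batch dimension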
Step 4: Embed y (same as for x)
Step 5: Decoder multi-head attention
First the decoder self-attends with Q = K = dec (masked, so the decoder cannot see information beyond the current word); the resulting dec is then used as Q, with K = enc, in a second attention over the encoder output. A sketch of the mask follows the block below.
for i in range(hp.num_blocks):
    with tf.compat.v1.variable_scope("num_blocks_{}".format(i)):
        ## Multihead Attention (self-attention): self-attention over the target, Q = K
        self.dec = multihead_attention(queries=self.dec,
                                       keys=self.dec,
                                       num_units=hp.hidden_units,
                                       num_heads=hp.num_heads,
                                       dropout_rate=hp.dropout_rate,
                                       is_training=is_training,
                                       causality=True,
                                       scope="self_attention")

        ## Multihead Attention (vanilla attention): Q is the target, K is the encoder output
        self.dec = multihead_attention(queries=self.dec,
                                       keys=self.enc,
                                       num_units=hp.hidden_units,
                                       num_heads=hp.num_heads,
                                       dropout_rate=hp.dropout_rate,
                                       is_training=is_training,
                                       causality=False,
                                       scope="vanilla_attention")

        ## Feed Forward
        self.dec = feedforward(self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units])
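causality=True in the self-attention above is, presumably, the usual lower-triangular mask applied to the attention scores before the softmax, so position t can only attend to positions <= t. A minimal sketch:

import numpy as np

T = 4
scores = np.random.randn(T, T)   # raw attention scores for one head
tril = np.tril(np.ones((T, T)))  # 1 where attending is allowed
masked = np.where(tril == 0, -2.0 ** 32 + 1, scores)  # ~ -inf, softmax weight ~ 0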
Step 6: Training
self.logits = tf.compat.v1.layers.dense(self.dec, len(en2idx))
self.preds = tf.compat.v1.to_int32(tf.compat.v1.argmax(self.logits, axis=-1))  # predicted token ids
self.istarget = tf.compat.v1.to_float(tf.not_equal(self.y, 0))  # 1.0 for real tokens, 0.0 for <PAD>
self.acc = tf.reduce_sum(tf.compat.v1.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget)

if is_training:
    # Loss
    # Label smoothing: the 0s of the one-hot targets become a very small number,
    # the 1s a number slightly below 1.
    self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
    self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
    self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / tf.reduce_sum(self.istarget)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
    self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

    tf.compat.v1.summary.scalar('mean_loss', self.mean_loss)
    self.merged = tf.compat.v1.summary.merge_all()
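label_smoothing above is commonly a one-liner; a sketch, assuming the usual form with epsilon = 0.1 as the default:

import tensorflow as tf

def label_smoothing_sketch(inputs, epsilon=0.1):
    # inputs: one-hot targets of shape (..., vocab_size).
    K = inputs.get_shape().as_list()[-1]  # number of classes
    return (1 - epsilon) * inputs + epsilon / K  # 1 -> ~0.9, 0 -> epsilon/K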
- tf.not_equal(x, y, name=None): returns the truth value of x != y element-wise.
- tf.reduce_sum(input_tensor, axis=None, keepdims=None): computes the sum of the elements of a tensor along a given dimension, reducing that dimension after summing. axis: the dimension to reduce; if not specified, the sum over all elements is computed.
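These two ops are what make acc and mean_loss ignore padding. A toy check (ids invented for illustration):

import tensorflow as tf

y = tf.constant([[4, 5, 3, 0, 0]])      # 0 = <PAD>
preds = tf.constant([[4, 9, 3, 0, 0]])
istarget = tf.cast(tf.not_equal(y, 0), tf.float32)  # [[1. 1. 1. 0. 0.]]
acc = (tf.reduce_sum(tf.cast(tf.equal(preds, y), tf.float32) * istarget)
       / tf.reduce_sum(istarget))
print(acc)  # 0.6667: the two matching padding positions are not counted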