Complete Transformer Implementation in PyTorch (with Comments)

#!/usr/bin/python3.9
# -*- coding: utf-8 -*-
# @Time    : 2021/10/29 10:48
# @Author  : nickchen121
# @Email   : [email protected]
# Cnblogs  : https://www.cnblogs.com/nickchen121
# @File    : abd_transformer_cyd.py
# @Software: PyCharm

import math
import copy
import torch
import collections
import numpy as np
import torch.nn as nn
from copy import deepcopy
import torch.nn.functional as F
from torch.autograd import Variable

# Give each hypothesis named, accessible fields, i.e. Hypothesis.value and Hypothesis.score
Hypothesis = collections.namedtuple('Hypothesis', ['value', 'score'])


def clone_module_to_modulelist(module, module_num):
    """
    Clone a module module_num times and return the clones in an nn.ModuleList.
    nn.ModuleList is a container that stores sub-modules and automatically registers each
    module's parameters with the enclosing network: any nn.Module subclass (nn.Conv2d,
    nn.Linear, ...) appended to it becomes part of the whole model, and its parameters
    are added to the model's parameters.
    :param module: the module to clone
    :param module_num: the number of clones
    :return: a ModuleList holding module_num identical modules
    """
    return nn.ModuleList([deepcopy(module) for _ in range(module_num)])


class LayerNorm(nn.Module):
    """
    LayerNorm module.
    LayerNorm normalizes x to zero mean and unit variance:
    (x - mean(x)) / (std(x) + eps), then scales and shifts with learnable parameters.
    """

    def __init__(self, x_size, eps=1e-6):
        """
        :param x_size: feature dimension
        :param eps: a smoothing term (typically 1e-4 ~ 1e-8) that prevents division by zero.
        nn.Parameter turns a non-trainable Tensor into a trainable parameter and binds it
        to this module, so that it is updated towards the optimum during training.
        """
        super(LayerNorm, self).__init__()
        self.ones_tensor = nn.Parameter(torch.ones(x_size))    # learnable scale, initialized to all ones
        self.zeros_tensor = nn.Parameter(torch.zeros(x_size))  # learnable shift, initialized to all zeros
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)  # standard deviation
        return self.ones_tensor * (x - mean) / (std + self.eps) + self.zeros_tensor  # the LayerNorm formula


class FeatEmbedding(nn.Module):
    """
    Video feature embedding generator.
    """

    def __init__(self, d_feat, d_model, dropout):
        """
        :param d_feat: per-frame feature dimension, the input size of the Linear layer
        :param d_model: the output size of the Linear layer
        :param dropout: dropout rate
        nn.Sequential is an ordered container: the given modules are added to the
        computation graph in the order they are passed to the constructor.
        The pipeline built here is: LayerNorm --> Dropout --> Linear.
        """
        super(FeatEmbedding, self).__init__()
        self.video_embeddings = nn.Sequential(
            # TODO: why not use BatchNorm here, since the video frames are images?
            # nn.BatchNorm2d(d_feat)
            # nn.LayerNorm(d_feat),
            LayerNorm(d_feat),
            nn.Dropout(p=dropout),
            nn.Linear(d_feat, d_model)
        )

    def forward(self, x):
        return self.video_embeddings(x)  # return the embedded video features


class WordEmbedding(nn.Module):
    """
    Map token ids to d_model-dimensional word vectors before they are fed to the encoder.
    """

    def __init__(self, vocab_size, d_model):
        """
        :param vocab_size: vocabulary size
        :param d_model: word-vector dimension
        """
        super(WordEmbedding, self).__init__()
        self.d_model = d_model
        # the vocabulary holds vocab_size words; each word is mapped to a d_model-dimensional vector
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.embed = self.embedding

    def forward(self, x):
        # TODO: why multiply by a sqrt, as in the Transformer paper?
        return self.embed(x) * math.sqrt(self.d_model)


class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding, i.e. positions encoded with trigonometric functions.
    Implementation based on "Attention Is All You Need"
    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`
    """

    def __init__(self, dim: int, dropout: float, max_len=5000):
        """
        :param dim: dimension of the position vectors, usually the word-vector dimension d_model
        :param dropout: dropout rate
        :param max_len: maximum sentence length
        """
        super(PositionalEncoding, self).__init__()
        # check that the position vectors can be built: sin/cos encoding needs an even dimension
        if dim % 2 != 0:
            raise ValueError(f"Cannot use sin/cos positional encoding with an odd dimension (got dim={dim:d})")
        """
        Build the positional encoding pe:
        PE(pos, 2i)   = sin(pos / 10000^{2i/d_model})
        PE(pos, 2i+1) = cos(pos / 10000^{2i/d_model})
        """
        pe = torch.zeros(max_len, dim)  # initialize pe
        position = torch.arange(0, max_len).unsqueeze(1)  # the positions 0..max_len-1, i.e. pos
        div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) *
                              torch.tensor(-(math.log(10000.0) / dim))))  # the 1/10000^{2i/d_model} factor
        pe[:, 0::2] = torch.sin(position.float() * div_term)  # even indices use sin
        pe[:, 1::2] = torch.cos(position.float() * div_term)  # odd indices use cos
        pe = pe.unsqueeze(1)  # add a middle dimension: (max_len, 1, dim)
        # pe is not a model parameter; register_buffer stores it as a constant inside the module
        self.register_buffer('pe', pe)
        self.drop_out = nn.Dropout(p=dropout)
        self.dim = dim

    def forward(self, emb, step=None):
        """
        Combine the word embeddings with the positional encoding.
        :param emb: word-embedding sequence (FloatTensor), ``(seq_len, batch_size, self.dim)``
        :param step: if decoding stepwise ("seq_len = 1"), use the encoding at this position
        :return: word embeddings plus positional encoding
        """
        emb = emb * math.sqrt(self.dim)
        if step is None:
            emb = emb + self.pe[:emb.size(0)]  # add the positional encoding to the word embeddings
        else:
            emb = emb + self.pe[step]
        emb = self.drop_out(emb)
        return emb


def self_attention(query, key, value, dropout=None, mask=None):
    """
    Scaled dot-product self-attention.
    :param query: Q
    :param key: K
    :param value: V
    :param dropout: dropout rate
    :param mask: whether to mask
    :return: the attention output and the attention distribution
    """
    d_k = query.size(-1)  # d_k scales the scores so the softmax gradients do not vanish
    # Q-K similarity: \frac{QK^T}{\sqrt{d_k}}
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    # masking happens after QK^T and before the softmax
    if mask is not None:
        """
        scores.masked_fill replaces, at every index where the given mask condition holds,
        the corresponding entry of scores with the given value, here -1e9, i.e. -(10^9).
        """
        # mask.cuda()
        # since the condition is mask == 0, the positions where the mask is 0 are the ones filled
        scores = scores.masked_fill(mask == 0, -1e9)
    self_attn_softmax = F.softmax(scores, dim=-1)  # apply softmax
    # optionally apply dropout to the attention distribution
    if dropout is not None:
        self_attn_softmax = dropout(self_attn_softmax)
    # return the attention output together with the softmaxed similarities (the attention distribution)
    return torch.matmul(self_attn_softmax, value), self_attn_softmax


class MultiHeadAttention(nn.Module):
    """
    Multi-head attention.
    """

    def __init__(self, head, d_model, dropout=0.1):
        """
        :param head: number of heads
        :param d_model: word-vector dimension, must be a multiple of head
        :param dropout: dropout rate
        """
        super(MultiHeadAttention, self).__init__()
        assert (d_model % head == 0)  # make sure d_model is an integer multiple of the number of heads
        self.d_k = d_model // head  # per-head dimension after the split into multiple heads
        self.head = head
        self.d_model = d_model
        """
        Multi-head attention operates on several groups of Q, K, V, hence the four linear layers
        below: they provide the parameters applied to every incoming Q, K and V, while
        linear_out provides the parameters used when the heads are merged back together.
        """
        self.linear_query = nn.Linear(d_model, d_model)  # an ordinary fully connected layer that keeps the dimension
        self.linear_key = nn.Linear(d_model, d_model)
        self.linear_value = nn.Linear(d_model, d_model)
        self.linear_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(p=dropout)
        self.attn_softmax = None  # attn_softmax holds the attention scores softmax(QK^T), i.e. how related one word is to every word

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            """
            The linear layers in multi-head attention work with 4-D tensors: query [batch, frame_num, d_model]
            is reshaped to [batch, -1, head, d_k] and dims 1 and 2 are swapped to give [batch, head, -1, d_k],
            so the mask needs an extra dimension at dim 1 (the head dim) to match the shapes used later in
            self_attention. Concretely: the mask is what masked_fill uses inside self_attention to decide which
            positions to hide; of Q/K/V's dimensions only the head dim is created by view, so to keep the mask
            consistent with the shape of Q, K and V after the view, the mask has to gain an extra dimension
            at the head dim, which makes it mask exactly the right positions.
            """
            mask = mask.unsqueeze(1)
        n_batch = query.size(0)  # batch size; e.g. if query has shape [10, 32, 512], then 10 is the batch size
        """
        The next three lines all do the same kind of processing on Q, K and V.
        view reshapes the Linear layer's output (-1 means that size is inferred automatically); the reshape shows
        that although the number of heads increases, the total dimensionality does not change - multi-head
        attention is an internal split of the data.
        transpose(1, 2) swaps dims 1 and 2 (0-indexed), e.g. (2, 3, 4, 5) becomes (2, 4, 3, 5), so after the
        transpose the second dimension becomes the head dimension.
        Assume the Linear layer's output has shape [10, 32, 512], where 10 is the batch size.
        Note: this also explains why d_model // head == d_k must hold; otherwise view raises an error here.
        """
        query = self.linear_query(query).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)  # [b, 8, 32, 64], head=8
        key = self.linear_key(key).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)  # [b, 8, 28, 64]
        value = self.linear_value(value).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)  # [b, 8, 28, 64]

        # x is the output of self-attention, self.attn_softmax is the attention distribution
        x, self.attn_softmax = self_attention(query, key, value, dropout=self.dropout, mask=mask)

        """
        The code below merges the information of all heads into a new x.
        From self.head * self.d_k one can see that x is rebuilt by concatenating the heads into one large
        matrix, which is then passed through the linear_out layer.
        contiguous() copies x into a fresh chunk of memory; only then can .view be used, otherwise it errors.
        """
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.head * self.d_k)
        return self.linear_out(x)


class FeedForward(nn.Module):
    """
    Two-layer position-wise feed-forward network (FFN).
    """

    def __init__(self, d_model: int, d_ff: int, dropout=0.1):
        """
        :param d_model: input dimension of the first FFN layer
        :param d_ff: hidden dimension of the second FFN layer
        :param dropout: dropout rate
        """
        super(FeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout_1 = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x):
        """
        :param x: input of shape (batch_size, input_len, model_dim)
        :return: output (FloatTensor) of shape (batch_size, input_len, model_dim)
        """
        inter = self.dropout_1(self.relu(self.w_1(self.layer_norm(x))))
        output = self.dropout_2(self.w_2(inter))
        # returning output + x would make this a residual connection
        return output  # + x


class SublayerConnection(nn.Module):
    """
    Sublayer connection: layer_norm(x + sublayer(x)),
    i.e. a residual connection followed by a LayerNorm.
    """

    def __init__(self, size, dropout=0.1):
        """
        :param size: d_model
        :param dropout: dropout rate
        """
        super(SublayerConnection, self).__init__()
        self.layer_norm = LayerNorm(size)
        # TODO: could LayerNorm be replaced with nn.BatchNorm2d in SublayerConnection?
        # self.layer_norm = nn.BatchNorm2d()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        return self.dropout(self.layer_norm(x + sublayer(x)))


class EncoderLayer(nn.Module):
    """
    A single Encoder layer:
    MultiHeadAttention -> Add & Norm -> Feed Forward -> Add & Norm
    """

    def __init__(self, size, attn, feed_forward, dropout=0.1):
        """
        :param size: d_model
        :param attn: an already-initialized Multi-Head Attention layer
        :param feed_forward: an already-initialized Feed Forward layer
        :param dropout: dropout rate
        """
        super(EncoderLayer, self).__init__()
        self.attn = attn
        self.feed_forward = feed_forward
        """
        One Encoder layer contains two residual sub-blocks, so a ModuleList with two
        SublayerConnection instances is built here for the later residual processing.
        """
        self.sublayer_connection_list = clone_module_to_modulelist(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        """
        :param x: input of the Encoder layer
        :param mask: mask flag
        :return: the output after one Encoder layer
        """
        """
        First sublayer of the encoder layer: self.attn is an already-initialized Multi-Head Attention layer.
        The encoder input x and the attention output are fed into the first residual block, giving first_x.
        """
        first_x = self.sublayer_connection_list[0](x, lambda x_attn: self.attn(x, x, x, mask))
        """
        Second sublayer of the encoder layer: first_x and the feed-forward network are fed into the
        second residual block, giving the output of this Encoder layer.
        """
        return self.sublayer_connection_list[1](first_x, self.feed_forward)


class DecoderLayer(nn.Module):
    """
    A single Decoder layer:
    Masked MultiHeadAttention -> Add & Norm -> Multi-Head Attention -> Add & Norm -> Feed Forward -> Add & Norm
    """

    def __init__(self, d_model, attn, feed_forward, sublayer_num, dropout=0.1):
        """
        :param d_model: d_model
        :param attn: an already-initialized Multi-Head Attention layer
        :param feed_forward: an already-initialized Feed Forward layer
        :param sublayer_num: number of sublayers inside the decoder; 4 if r2l_memory is passed in later, otherwise the usual 3
        :param dropout: dropout rate
        """
        super(DecoderLayer, self).__init__()
        self.attn = attn
        self.feed_forward = feed_forward
        self.sublayer_connection_list = clone_module_to_modulelist(SublayerConnection(d_model, dropout), sublayer_num)

    def forward(self, x, l2r_memory, src_mask, trg_mask, r2l_memory=None, r2l_trg_mask=None):
        """
        :param x: decoder input (the caption)
        :param l2r_memory: encoder output, used as K and V of the cross attention (the left-to-right encoder output)
        :param src_mask: padding mask of the encoder input
        :param trg_mask: padding mask plus sequence mask of the decoder input (hides the following words)
        :param r2l_memory: output of the right-to-left decoder
        :param r2l_trg_mask: padding mask plus sequence mask for the right-to-left decoder output
        :return: the output of this Decoder layer
        """
        """
        First decoder sublayer: the decoder input x and the output of a masked Multi-Head Attention
        are fed into the first residual block, giving first_x.
        """
        first_x = self.sublayer_connection_list[0](x, lambda first_x_attn: self.attn(x, x, x, trg_mask))
        """
        Second decoder sublayer: first_x and the output of a Multi-Head Attention over first_x and the
        encoder output are fed into the second residual block, giving second_x.
        """
        second_x = self.sublayer_connection_list[1](first_x,
                                                    lambda second_x_attn: self.attn(first_x, l2r_memory,
                                                                                    l2r_memory, src_mask))
        """
        Third decoder sublayer: second_x and the feed-forward network are fed into the last residual block,
        giving the Decoder layer's output.
        If r2l_memory is provided, one more multi-head attention step is inserted, i.e. there are four
        residual blocks; r2l_memory gives the decoder an extra right-to-left branch of the bidirectional
        encoding, while a decoder with only three residual blocks uses only the left-to-right encoding.
        """
        if r2l_memory is not None:
            # attend over the right-to-left encoding to add semantic information
            third_x = self.sublayer_connection_list[-2](second_x,
                                                        lambda third_x_attn: self.attn(second_x, r2l_memory,
                                                                                       r2l_memory, r2l_trg_mask))
            return self.sublayer_connection_list[-1](third_x, self.feed_forward)
        else:
            return self.sublayer_connection_list[-1](second_x, self.feed_forward)


class Encoder(nn.Module):
    """
    Stack of n encoder layers.
    """

    def __init__(self, n, encoder_layer):
        """
        :param n: number of Encoder layers
        :param encoder_layer: an initialized Encoder layer
        """
        super(Encoder, self).__init__()
        self.encoder_layer_list = clone_module_to_modulelist(encoder_layer, n)

    def forward(self, x, src_mask):
        """
        :param x: input data
        :param src_mask: mask flag
        :return: the data after n Encoder layers
        """
        for encoder_layer in self.encoder_layer_list:
            x = encoder_layer(x, src_mask)
        return x


class R2LDecoder(nn.Module):
    """
    n decoder layers with R2L self-attention; each layer has only 3 residual blocks.
    """

    def __init__(self, n_layers, decoder_layer):
        """
        :param n_layers: number of Decoder layers
        :param decoder_layer: an initialized Decoder layer
        """
        super(R2LDecoder, self).__init__()
        self.decoder_layer_list = clone_module_to_modulelist(decoder_layer, n_layers)

    def forward(self, x, memory, src_mask, trg_mask):
        for decoder_layer in self.decoder_layer_list:
            # r2l_memory and r2l_trg_mask are not passed, so they stay None and each Decoder layer has 3 residual blocks
            x = decoder_layer(x, memory, src_mask, trg_mask)
        return x


class L2RDecoder(nn.Module):
    """
    n decoder layers with L2R self-attention; each layer has 4 residual blocks.
    """

    def __init__(self, n_layers, decoder_layer):
        """
        :param n_layers: number of Decoder layers
        :param decoder_layer: an initialized Decoder layer
        """
        super(L2RDecoder, self).__init__()
        self.decoder_layer_list = clone_module_to_modulelist(decoder_layer, n_layers)

    def forward(self, x, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask):
        for decoder_layer in self.decoder_layer_list:
            # r2l_memory and r2l_trg_mask override the defaults, so each Decoder layer has 4 residual blocks
            x = decoder_layer(x, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask)
        return x


def sequence_mask(size):
    """
    Sequence mask that hides the positions of later words in the decoder input.
    :param size: number of words to generate
    :return: a bool matrix that is False above the main diagonal and True on and below it
    """
    attn_shape = (1, size, size)
    """
    np.triu returns the upper triangle of a matrix; k=1 shifts the kept diagonal up by one,
    i.e. it keeps the entries above the main diagonal and sets the rest to 0 (so a_11 = 0).
    """
    mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return (torch.from_numpy(mask) == 0).cuda()  # == 0 turns it into a bool matrix


def src_trg_mask(src, r2l_trg, trg, pad_idx):
    """
    :param src: encoder input
    :param r2l_trg: input of the r2l decoder
    :param trg: input of the l2r decoder
    :param pad_idx: index of the pad token
    :return: if trg is None, only the mask of the encoder input; if trg exists, the masks of both encoder and decoder inputs
    """
    # TODO: enc_src_mask is a tuple; could it be a list so this redundant code can be simplified?
    # the length of src, i.e. the feature-extraction mode, tells how many kinds of feature vectors need a mask
    if isinstance(src, tuple) and len(src) == 4:
        # masks for the different kinds of video features
        src_image_mask = (src[0][:, :, 0] != pad_idx).unsqueeze(1)   # 2-D features
        src_motion_mask = (src[1][:, :, 0] != pad_idx).unsqueeze(1)  # 3-D features
        src_object_mask = (src[2][:, :, 0] != pad_idx).unsqueeze(1)  # object-detection features
        src_rel_mask = (src[3][:, :, 0] != pad_idx).unsqueeze(1)     # object-relation features
        # combine the masks of all video features
        enc_src_mask = (src_image_mask, src_motion_mask, src_object_mask, src_rel_mask)
        dec_src_mask = src_image_mask & src_motion_mask  # combined mask of the 2-D and 3-D features
        src_mask = (enc_src_mask, dec_src_mask)  # the final video mask
    elif isinstance(src, tuple) and len(src) == 3:
        src_image_mask = (src[0][:, :, 0] != pad_idx).unsqueeze(1)
        src_motion_mask = (src[1][:, :, 0] != pad_idx).unsqueeze(1)
        src_object_mask = (src[2][:, :, 0] != pad_idx).unsqueeze(1)
        enc_src_mask = (src_image_mask, src_motion_mask, src_object_mask)
        dec_src_mask = src_image_mask & src_motion_mask
        src_mask = (enc_src_mask, dec_src_mask)
    elif isinstance(src, tuple) and len(src) == 2:
        src_image_mask = (src[0][:, :, 0] != pad_idx).unsqueeze(1)
        src_motion_mask = (src[1][:, :, 0] != pad_idx).unsqueeze(1)
        enc_src_mask = (src_image_mask, src_motion_mask)
        dec_src_mask = src_image_mask & src_motion_mask
        src_mask = (enc_src_mask, dec_src_mask)
    else:
        # only src_image_mask, i.e. the mask of the 2-D features
        src_mask = src_image_mask = (src[:, :, 0] != pad_idx).unsqueeze(1)

    # decide whether trg, i.e. the decoder input, also needs a mask
    if trg is not None and r2l_trg is not None:
        """
        trg_mask combines the padding mask (before &) with the sequence mask returned by sequence_mask (after &);
        type_as makes the sequence mask and the padding mask have the same dtype.
        """
        trg_mask = (trg != pad_idx).unsqueeze(1) & sequence_mask(trg.size(1)).type_as(src_image_mask.data)
        # padding mask of r2l_trg
        r2l_pad_mask = (r2l_trg != pad_idx).unsqueeze(1).type_as(src_image_mask.data)
        # padding mask plus sequence mask of r2l_trg
        r2l_trg_mask = r2l_pad_mask & sequence_mask(r2l_trg.size(1)).type_as(src_image_mask.data)
        # src_mask[batch, 1, lens]  trg_mask[batch, 1, lens]
        return src_mask, r2l_pad_mask, r2l_trg_mask, trg_mask
    else:
        return src_mask


class WordProbGenerator(nn.Module):
    """
    Text generator: turns the decoder output into word probabilities via a final softmax layer.
    """

    def __init__(self, d_model, vocab_size):
        """
        :param d_model: word-vector dimension
        :param vocab_size: vocabulary size
        """
        super(WordProbGenerator, self).__init__()
        # a linear layer maps to vocabulary-sized logits
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # estimate the word probabilities with a (log-)softmax
        return F.log_softmax(self.linear(x), dim=-1)


class ABDTransformer(nn.Module):
    """
    Assemble the full Transformer.
    """

    def __init__(self, vocab, d_feat, d_model, d_ff, n_heads, n_layers, dropout, feature_mode, device='cuda'):
        """
        :param vocab: vocabulary object
        :param d_feat: per-frame feature dimension
        :param d_model: word-vector dimension
        :param d_ff: hidden dimension of the FeedForward network
        :param n_heads: number of attention heads
        :param n_layers: number of encoder and decoder layers
        :param dropout: dropout rate
        :param feature_mode: video feature-extraction mode
        :param device: whether to use the gpu
        """
        super(ABDTransformer, self).__init__()
        self.vocab = vocab
        self.device = device
        self.feature_mode = feature_mode
        attn = MultiHeadAttention(n_heads, d_model, dropout)  # multi-head attention
        feed_forward = FeedForward(d_model, d_ff)  # feed-forward network
        """
        Video feature embeddings.
        feature_mode determines how many kinds of features d_feat describes.
        There are four kinds of feature vectors, in order:
        image: 2-D features
        motion: 3-D features
        object: object detection, in two parts - the detection-box coordinates and the features of the detected objects
        rel: relation features between the objects
        """
        if feature_mode == 'one':
            # named unknown_src_embed because with a single feature vector its kind is not fixed
            self.unknown_src_embed = FeatEmbedding(d_feat, d_model, dropout)
        elif feature_mode == 'two':
            self.image_src_embed = FeatEmbedding(d_feat[0], d_model, dropout)
            self.motion_src_embed = FeatEmbedding(d_feat[1], d_model, dropout)
        elif feature_mode == 'three':
            self.image_src_embed = FeatEmbedding(d_feat[0], d_model, dropout)
            self.motion_src_embed = FeatEmbedding(d_feat[1], d_model, dropout)
            self.object_src_embed = FeatEmbedding(d_feat[2], d_model, dropout)
        elif feature_mode == 'four':
            self.image_src_embed = FeatEmbedding(d_feat[0], d_model, dropout)
            self.motion_src_embed = FeatEmbedding(d_feat[1], d_model, dropout)
            self.object_src_embed = FeatEmbedding(d_feat[2], d_model, dropout)
            self.rel_src_embed = FeatEmbedding(d_feat[3], d_model, dropout)
        else:
            raise ValueError("feature_mode does not exist; the only modes are ['one', 'two', 'three', 'four']")

        # turn token ids into d_model-dimensional word vectors
        self.trg_embed = WordEmbedding(vocab.n_vocabs, d_model)
        # positional encoding
        self.pos_embed = PositionalEncoding(d_model, dropout)
        # encoder
        self.encoder = Encoder(n_layers, EncoderLayer(d_model, deepcopy(attn), deepcopy(feed_forward), dropout))
        """
        Unidirectional decoder.
        deepcopy is used because every layer has its own parameters: copying into fresh memory avoids sharing them.
        """
        self.r2l_decoder = R2LDecoder(n_layers, DecoderLayer(d_model, deepcopy(attn), deepcopy(feed_forward),
                                                             sublayer_num=3, dropout=dropout))
        # bidirectional decoder
        self.l2r_decoder = L2RDecoder(n_layers, DecoderLayer(d_model, deepcopy(attn), deepcopy(feed_forward),
                                                             sublayer_num=4, dropout=dropout))
        # word probability generator
        self.word_prob_generator = WordProbGenerator(d_model, vocab.n_vocabs)

    def _encoder_feature_concat(self, src, feature_type, src_mask):
        """
        Prepares the input for encode(): embeds one kind of video feature vector.
        :param src: feature vectors
        :param feature_type: which embedding to use, one of ['image', 'motion', 'object', 'rel']
        :param src_mask: feature-vector mask flag
        :return: the encoded video features
        """
        if feature_type == 'rel':
            # the relation features do not get a positional encoding
            x = self.rel_src_embed(src)  # embed the object-relation features
            return self.encoder(x, src_mask)  # feed them into the encoder
        # e.g. for 'image' the call below is self.image_src_embed(src)
        x = self.__getattribute__(f'{feature_type}_src_embed')(src)
        x = self.pos_embed(x)  # add the positional encoding
        return self.encoder(x, src_mask)  # feed into the encoder

    def encode(self, src, src_mask):
        """
        Encode the data; the main purpose is to encode the different kinds of video feature vectors.
        :param src: video feature vectors
        :param src_mask: video feature-vector mask flags
        :return: the video features encoded by the encoder
        """
        x_list = []  # stores the encoded vectors of the different feature types
        feature_type_list = ['image', 'motion', 'object', 'rel']  # the video feature types
        feature_mode_dict = {'two': 2, 'three': 3, 'four': 4}  # number of feature kinds per mode
        if self.feature_mode == 'one':
            return self._encoder_feature_concat(src, 'unknown', src_mask)
        for i, feature_type in enumerate(feature_type_list):
            # different feature_modes use a different number of encoders
            if i == feature_mode_dict[self.feature_mode]:
                break
            x_list.append(self._encoder_feature_concat(src[i], feature_type, src_mask[i]))
        # TODO (idea): a linear layer here could learn which feature modes matter most, adding some interpretability
        return sum(x_list)  # sum the vectors extracted for the different feature types

    def r2l_decode(self, trg, memory, src_mask, trg_mask):
        """
        For the unidirectional decoder: turn the tokens into word vectors and add the positional encoding.
        :param trg: decoder input
        :param memory: encoder output, i.e. the K and V passed to the decoder
        :param src_mask: mask flag of the encoder output
        :param trg_mask: decoder padding mask plus word sequence mask (the following words are hidden)
        :return:
        """
        x = self.trg_embed(trg)  # word embeddings for the unidirectional decoding
        x = self.pos_embed(x)
        return self.r2l_decoder(x, memory, src_mask, trg_mask)

    def l2r_decode(self, trg, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask):
        """
        For the bidirectional decoder: turn the tokens into word vectors and add the positional encoding.
        :param trg: decoder input
        :param memory: encoder output, i.e. the K and V passed to the decoder
        :param src_mask: mask flag of the encoder output
        :param trg_mask: decoder padding mask plus word sequence mask (the following words are hidden)
        :param r2l_memory: output of the right-to-left decoder
        :param r2l_trg_mask: padding mask plus sequence mask for the right-to-left decoder output
        :return:
        """
        x = self.trg_embed(trg)  # word embeddings for the bidirectional decoding
        x = self.pos_embed(x)
        return self.l2r_decoder(x, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask)

    def forward(self, src, r2l_trg, trg, mask):
        """
        :param src: encoder input
        :param r2l_trg: input of the right-to-left decoder
        :param trg: input of the left-to-right decoder
        :param mask: mask flags
        :return: the output word probability distributions of the right-to-left and left-to-right decoders
        """
        # mask should be a tuple, where src_mask is itself a tuple of masks for the different feature modes
        if len(mask) == 4:
            src_mask, r2l_pad_mask, r2l_trg_mask, trg_mask = mask
        else:
            raise ValueError("mask is a tuple without the decoder-input masks; check the arguments of src_trg_mask()")

        if self.feature_mode == 'one':
            # encoder output for the video
            encoding_output = self.encode(src, src_mask)
            # output of the 3-residual-block decoder over the unidirectional encoding
            r2l_output = self.r2l_decode(r2l_trg, encoding_output, src_mask, r2l_trg_mask)
            # output of the 4-residual-block decoder over the bidirectional encoding
            l2r_output = self.l2r_decode(trg, encoding_output, src_mask, trg_mask, r2l_output, r2l_pad_mask)
        elif self.feature_mode in ('two', 'three', 'four'):
            # enc_src_mask holds the masks of all feature types; dec_src_mask is the mask of the 2-D and 3-D features
            enc_src_mask, dec_src_mask = src_mask
            # different feature modes use different masks
            encoding_output = self.encode(src, enc_src_mask)
            r2l_output = self.r2l_decode(r2l_trg, encoding_output, dec_src_mask, r2l_trg_mask)
            l2r_output = self.l2r_decode(trg, encoding_output, dec_src_mask, trg_mask, r2l_output, r2l_pad_mask)
        else:
            raise ValueError("no such feature_mode; the only modes are ['one', 'two', 'three', 'four']")

        # predicted word probability distributions
        r2l_pred = self.word_prob_generator(r2l_output)
        l2r_pred = self.word_prob_generator(l2r_output)
        return r2l_pred, l2r_pred

    def greedy_decode(self, batch_size, src_mask, memory, max_len):
        """
        Greedy word generation for the r2l decoder: at each step the highest-probability word is chosen.
        :param batch_size: number of examples per batch
        :param src_mask: mask flags of the encoder input
        :param memory: encoder output
        :param max_len: maximum number of iterations, i.e. of generated words
        :return: r2l_hidden, which later becomes the r2l_memory fed to the l2r decoder, and output, the r2l predictions
        """
        eos_idx = self.vocab.word2idx['<S>']  # the <S> symbol marks the end of the output
        with torch.no_grad():
            # initialize output as a batch_size-long vector filled with the eos index
            output = torch.ones(batch_size, 1).fill_(eos_idx).long().cuda()
            # iteratively generate the final output
            for i in range(max_len + 2 - 1):
                # sequence mask for the decoder input that hides the following words
                trg_mask = sequence_mask(output.size(1))
                # decode the current output together with the encoder output
                dec_out = self.r2l_decode(output, memory, src_mask, trg_mask)  # batch, len, d_model
                r2l_hidden = dec_out
                # take the highest-probability word as the candidate
                pred = self.word_prob_generator(dec_out)  # batch, len, n_vocabs
                next_word = pred[:, -1].max(dim=-1)[1].unsqueeze(1)  # pred[:, -1] is (batch, n_vocabs)
                output = torch.cat([output, next_word], dim=-1)  # append the predicted word and decode again
        # r2l_hidden later becomes the r2l_memory fed into the l2r decoder; output is the r2l prediction
        return r2l_hidden, output

    def r2l_beam_search_decode(self, batch_size, src, src_mask, model_encodings, beam_size, max_len):
        """
        For an introduction to beam search see: https://www.cnblogs.com/nickchen121/p/15499576.html
        At every word-generation time step, instead of keeping only the single highest-scoring output,
        num_beams outputs are kept. With num_beams = 1, beam search degenerates into greedy search,
        i.e. the greedy_decode above.
        :param batch_size: number of examples per batch
        :param src: encoder input
        :param src_mask: mask of the encoder input
        :param model_encodings: encoder output
        :param beam_size: beam width
        :param max_len: maximum number of iterations, i.e. of generated words
        :return:
        """
        # batch_size = src.shape[0]
        end_symbol = self.vocab.word2idx['<S>']    # end symbol
        start_symbol = self.vocab.word2idx['<S>']  # start symbol

        r2l_output = None  # output of the r2l decoder
        r2l_outputs = None

        # 1.1 Setup Src
        # src has shape (batch_size, sent_len)
        # src_mask has shape (batch_size, 1, sent_len)
        # src_mask = (src[:, :, 0] != self.vocab.word2idx['<PAD>']).unsqueeze(-2)  # TODO Untested
        # model_encodings has shape (batch_size, sentence_len, d_model)
        # model_encodings = self.encode(src, src_mask)

        # 1.2 Setup Tgt Hypothesis Tracking
        # hypothesis is List(4 bt)[(cur beam_sz, dec_sent_len)], init: List(4 bt)[(1 init_beam_sz, dec_sent_len)]
        # hypotheses[i] is shape (cur beam_sz, dec_sent_len)
        hypotheses = [copy.deepcopy(torch.full((1, 1), start_symbol, dtype=torch.long, device=self.device))
                      for _ in range(batch_size)]
        # List after init: List 4 bt of List of len max_len_completed, init: List of len 4 bt of []
        completed_hypotheses = [copy.deepcopy([]) for _ in range(batch_size)]
        # List len batch_sz of shape (cur beam_sz), init: List(4 bt)[(1 init_beam_sz)]
        # hyp_scores[i] is shape (cur beam_sz)
        hyp_scores = [copy.deepcopy(torch.full((1,), 0, dtype=torch.float, device=self.device))
                      for _ in range(batch_size)]  # probs are log_probs, so they must be initialized at 0.

        # 2. Iterate: Generate one char at a time until maxlen
        for _ in range(max_len + 1):
            if all([len(completed_hypotheses[i]) == beam_size for i in range(batch_size)]):
                break

            """
            2.1 Setup the batch. Since we use beam search, each example has a variable number
            (called cur_beam_size, between 0 and beam_size) of hypotheses live at any moment. We decode
            all hypotheses for all examples at the same time, so we must copy the src encodings, src_mask,
            etc. the appropriate number of times for the number of hypotheses of each example. We keep
            track of the number of live hypotheses for each example. We run all hypotheses for all
            examples together through the decoder and log-softmax, and then use `torch.split` to get
            the appropriate number of hypotheses for each example in the end.
            """
            cur_beam_sizes, last_tokens, model_encodings_l, src_mask_l = [], [], [], []
            for i in range(batch_size):
                if hypotheses[i] is None:
                    cur_beam_sizes += [0]
                    continue
                cur_beam_size, decoded_len = hypotheses[i].shape
                cur_beam_sizes += [cur_beam_size]
                last_tokens += [hypotheses[i]]
                model_encodings_l += [model_encodings[i:i + 1]] * cur_beam_size
                src_mask_l += [src_mask[i:i + 1]] * cur_beam_size
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 128 d_model)
            model_encodings_cur = torch.cat(model_encodings_l, dim=0)
            src_mask_cur = torch.cat(src_mask_l, dim=0)
            y_tm1 = torch.cat(last_tokens, dim=0)
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 128 d_model)
            if self.feature_mode == 'one':
                out = self.r2l_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
                                      Variable(sequence_mask(y_tm1.size(-1)).type_as(src.data)).to(self.device))
            elif self.feature_mode in ('two', 'three', 'four'):
                out = self.r2l_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
                                      Variable(sequence_mask(y_tm1.size(-1)).type_as(src[0].data)).to(self.device))
            else:
                raise ValueError("out would be None: unknown feature_mode")
            r2l_output = out

            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 50002 vocab_sz)
            log_prob = self.word_prob_generator(out[:, -1, :]).unsqueeze(1)
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 50002 vocab_sz)
            _, decoded_len, vocab_sz = log_prob.shape
            # log_prob = log_prob.reshape(batch_size, cur_beam_size, decoded_len, vocab_sz)
            # shape List(4 bt)[(cur_beam_sz_i, dec_sent_len, 50002 vocab_sz)]
            # log_prob[i] is (cur_beam_sz_i, dec_sent_len, 50002 vocab_sz)
            log_prob = torch.split(log_prob, cur_beam_sizes, dim=0)

            """
            2.2 Now we process each example in the batch.
            Note that the example may have already finished processing before other examples
            (no more hypotheses to try), in which case we continue.
            """
            new_hypotheses, new_hyp_scores = [], []
            for i in range(batch_size):
                if hypotheses[i] is None or len(completed_hypotheses[i]) >= beam_size:
                    new_hypotheses += [None]
                    new_hyp_scores += [None]
                    continue

                """
                2.2.1 We compute the cumulative scores for each live hypothesis for the example.
                hyp_scores holds the old scores from the previous stage, and `log_prob` the new probs for
                this stage. Since they are log probs, we sum them instead of multiplying them.
                The .view(-1) forces all the hypotheses into one dimension. The shape of this dimension is
                cur_beam_sz * vocab_sz (ex: 5 * 50002). So after getting the topk from it, we can recover
                the generating sentence and the next word using: ix // vocab_sz, ix % vocab_sz.
                """
                cur_beam_sz_i, dec_sent_len, vocab_sz = log_prob[i].shape
                # shape (vocab_sz,)
                cumulative_hyp_scores_i = (hyp_scores[i].unsqueeze(-1).unsqueeze(-1)
                                           .expand((cur_beam_sz_i, 1, vocab_sz)) + log_prob[i]).view(-1)

                """
                2.2.2 We get the topk values in cumulative_hyp_scores_i and compute the current (generating)
                sentence and the next word using: ix // vocab_sz, ix % vocab_sz.
                """
                # shape (cur_beam_sz,)
                live_hyp_num_i = beam_size - len(completed_hypotheses[i])
                # shape (cur_beam_sz,). Vals are between 0 and 50002 vocab_sz
                top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(cumulative_hyp_scores_i, k=live_hyp_num_i)
                """
                shape (cur_beam_sz,). prev_hyp_ids vals are 0 <= val < cur_beam_sz.
                hyp_word_ids vals are 0 <= val < vocab_len
                """
                prev_hyp_ids = top_cand_hyp_pos // self.vocab.n_vocabs
                hyp_word_ids = top_cand_hyp_pos % self.vocab.n_vocabs

                """
                2.2.3 For each of the topk words, we append the new word to the current (generating) sentence.
                We add this to new_hypotheses_i and add its corresponding total score to new_hyp_scores_i.
                """
                # Removed live_hyp_ids_i, which is used in the LSTM decoder to track live hypothesis ids
                new_hypotheses_i, new_hyp_scores_i = [], []
                for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids,
                                                                        top_cand_hyp_scores):
                    prev_hyp_id, hyp_word_id, cand_new_hyp_score = \
                        prev_hyp_id.item(), hyp_word_id.item(), cand_new_hyp_score.item()
                    new_hyp_sent = torch.cat(
                        (hypotheses[i][prev_hyp_id], torch.tensor([hyp_word_id], device=self.device)))
                    if hyp_word_id == end_symbol:
                        completed_hypotheses[i].append(Hypothesis(
                            value=[self.vocab.idx2word[a.item()] for a in new_hyp_sent[1:-1]],
                            score=cand_new_hyp_score))
                    else:
                        new_hypotheses_i.append(new_hyp_sent.unsqueeze(-1))
                        new_hyp_scores_i.append(cand_new_hyp_score)

                """
                2.2.4 We may find that the hypotheses_i for some example in the batch is empty - we have fully
                processed that example. We use None as a sentinel in this case.
                Above, the loops gracefully handle None examples.
                """
                if len(new_hypotheses_i) > 0:
                    hypotheses_i = torch.cat(new_hypotheses_i, dim=-1).transpose(0, -1).to(self.device)
                    hyp_scores_i = torch.tensor(new_hyp_scores_i, dtype=torch.float, device=self.device)
                else:
                    hypotheses_i, hyp_scores_i = None, None
                new_hypotheses += [hypotheses_i]
                new_hyp_scores += [hyp_scores_i]
            # print(new_hypotheses, new_hyp_scores)
            hypotheses, hyp_scores = new_hypotheses, new_hyp_scores

        """
        2.3 Finally, we do some postprocessing to get our final generated candidate sentences.
        Sometimes, we may get to max_len of a sentence and still not generate the </s> end token.
        In this case, the partial sentence we have generated will not be added to completed_hypotheses
        automatically, and we have to manually add it in. We add in as many as necessary so that there are
        `beam_size` completed hypotheses for each example. Finally, we sort each completed hypothesis by score.
        """
        for i in range(batch_size):
            hyps_to_add = beam_size - len(completed_hypotheses[i])
            if hyps_to_add > 0:
                scores, ix = torch.topk(hyp_scores[i], k=hyps_to_add)
                for score, id_ in zip(scores, ix):
                    completed_hypotheses[i].append(Hypothesis(
                        value=[self.vocab.idx2word[a.item()] for a in hypotheses[i][id_][1:]],
                        score=score))
            completed_hypotheses[i].sort(key=lambda hyp: hyp.score, reverse=True)
        return r2l_output, completed_hypotheses

    def beam_search_decode(self, src, beam_size, max_len):
        """
        An implementation of beam search for the Transformer model. Beam search is performed in a batched
        manner: each example in a batch generates `beam_size` hypotheses. We return a list (len: batch_size)
        of lists (len: beam_size) of Hypothesis, which contain our output decoded sentences and their scores.
        :param src: shape (sent_len, batch_size). Each val is 0 < val < len(vocab_dec). The input tokens to the decoder.
        :param max_len: the maximum length to decode
        :param beam_size: the beam size to use
        :return completed_hypotheses: a List of length batch_size, each containing a List of beam_size
            Hypothesis objects. Hypothesis is a named tuple; its first entry is "value", a List of strings,
            one string per translated word token. The second entry is "score", the log-prob score of the sentence.
        Note: Below I note "4 bt" and "5 beam_size" as the shapes of objects. 4 and 5 are default values;
        actual values may differ.
        """
        # 1. Setup
        start_symbol = self.vocab.word2idx['<S>']
        end_symbol = self.vocab.word2idx['<S>']

        # 1.1 Setup Src
        # src has shape (batch_size, sent_len)
        # src_mask has shape (batch_size, 1, sent_len)
        # src_mask = (src[:, :, 0] != self.vocab.word2idx['<PAD>']).unsqueeze(-2)  # TODO Untested
        src_mask = src_trg_mask(src, r2l_trg=None, trg=None, pad_idx=self.vocab.word2idx['<PAD>'])
        # model_encodings has shape (batch_size, sentence_len, d_model)
        if self.feature_mode == 'one':
            batch_size = src.shape[0]
            dec_src_mask = None
            model_encodings = self.encode(src, src_mask)
            r2l_memory, r2l_completed_hypotheses = self.r2l_beam_search_decode(batch_size, src, src_mask,
                                                                               model_encodings=model_encodings,
                                                                               beam_size=1, max_len=max_len)
        elif self.feature_mode in ('two', 'three', 'four'):
            batch_size = src[0].shape[0]
            enc_src_mask = src_mask[0]
            dec_src_mask = src_mask[1]
            model_encodings = self.encode(src, enc_src_mask)
            r2l_memory, r2l_completed_hypotheses = self.r2l_beam_search_decode(batch_size, src, dec_src_mask,
                                                                               model_encodings=model_encodings,
                                                                               beam_size=1, max_len=max_len)
        else:
            raise ValueError("batch_size would be None: unknown feature_mode")

        """
        1.2 Setup r2l target output (alternatives):
        r2l_memory, r2l_completed_hypotheses = self.r2l_beam_search_decode(batch_size, src, src_mask,
            model_encodings=model_encodings, beam_size=1, max_len=max_len)
        r2l_memory, r2l_completed_hypotheses = self.greedy_decode(batch_size, src_mask, model_encodings, max_len)
        beam_r2l_memory = [copy.deepcopy(r2l_memory) for _ in range(beam_size)]
        1.3 Setup Tgt Hypothesis Tracking
        """
        # hypothesis is List(4 bt)[(cur beam_sz, dec_sent_len)], init: List(4 bt)[(1 init_beam_sz, dec_sent_len)]
        # hypotheses[i] is shape (cur beam_sz, dec_sent_len)
        hypotheses = [copy.deepcopy(torch.full((1, 1), start_symbol, dtype=torch.long, device=self.device))
                      for _ in range(batch_size)]
        # List after init: List 4 bt of List of len max_len_completed, init: List of len 4 bt of []
        completed_hypotheses = [copy.deepcopy([]) for _ in range(batch_size)]
        # List len batch_sz of shape (cur beam_sz), init: List(4 bt)[(1 init_beam_sz)]
        # hyp_scores[i] is shape (cur beam_sz)
        hyp_scores = [copy.deepcopy(torch.full((1,), 0, dtype=torch.float, device=self.device))
                      for _ in range(batch_size)]  # probs are log_probs, so they must be initialized at 0.

        # 2. Iterate: Generate one char at a time until maxlen
        for _ in range(max_len + 1):
            if all([len(completed_hypotheses[i]) == beam_size for i in range(batch_size)]):
                break

            """
            2.1 Setup the batch. Since we use beam search, each example has a variable number
            (called cur_beam_size, between 0 and beam_size) of hypotheses live at any moment. We decode
            all hypotheses for all examples at the same time, so we must copy the src encodings, src_mask,
            etc. the appropriate number of times for the number of hypotheses of each example. We keep
            track of the number of live hypotheses for each example. We run all hypotheses for all
            examples together through the decoder and log-softmax, and then use `torch.split` to get
            the appropriate number of hypotheses for each example in the end.
            """
            cur_beam_sizes, last_tokens, model_encodings_l, src_mask_l, r2l_memory_l = [], [], [], [], []
            for i in range(batch_size):
                if hypotheses[i] is None:
                    cur_beam_sizes += [0]
                    continue
                cur_beam_size, decoded_len = hypotheses[i].shape
                cur_beam_sizes += [cur_beam_size]
                last_tokens += [hypotheses[i]]
                model_encodings_l += [model_encodings[i:i + 1]] * cur_beam_size
                if self.feature_mode == 'one':
                    src_mask_l += [src_mask[i:i + 1]] * cur_beam_size
                elif dec_src_mask is not None and self.feature_mode in ('two', 'three', 'four'):
                    src_mask_l += [dec_src_mask[i:i + 1]] * cur_beam_size
                r2l_memory_l += [r2l_memory[i: i + 1]] * cur_beam_size
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 128 d_model)
            model_encodings_cur = torch.cat(model_encodings_l, dim=0)
            src_mask_cur = torch.cat(src_mask_l, dim=0)
            y_tm1 = torch.cat(last_tokens, dim=0)
            r2l_memory_cur = torch.cat(r2l_memory_l, dim=0)
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 128 d_model)
            if self.feature_mode == 'one':
                out = self.l2r_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
                                      Variable(sequence_mask(y_tm1.size(-1)).type_as(src.data)).to(self.device),
                                      r2l_memory_cur, r2l_trg_mask=None)
            elif self.feature_mode in ('two', 'three', 'four'):
                out = self.l2r_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
                                      Variable(sequence_mask(y_tm1.size(-1)).type_as(src[0].data)).to(self.device),
                                      r2l_memory_cur, r2l_trg_mask=None)
            else:
                raise ValueError("out would be None: unknown feature_mode")

            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 50002 vocab_sz)
            log_prob = self.word_prob_generator(out[:, -1, :]).unsqueeze(1)
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 50002 vocab_sz)
            _, decoded_len, vocab_sz = log_prob.shape
            # log_prob = log_prob.reshape(batch_size, cur_beam_size, decoded_len, vocab_sz)
            # shape List(4 bt)[(cur_beam_sz_i, dec_sent_len, 50002 vocab_sz)]
            # log_prob[i] is (cur_beam_sz_i, dec_sent_len, 50002 vocab_sz)
            log_prob = torch.split(log_prob, cur_beam_sizes, dim=0)

            """
            2.2 Now we process each example in the batch. Note that the example may have already finished processing before
            other examples (no more hypotheses to try), in which case we continue.
            """
            new_hypotheses, new_hyp_scores = [], []
            for i in range(batch_size):
                if hypotheses[i] is None or len(completed_hypotheses[i]) >= beam_size:
                    new_hypotheses += [None]
                    new_hyp_scores += [None]
                    continue

                # 2.2.1 We compute the cumulative scores for each live hypothesis for the example.
                # hyp_scores holds the old scores from the previous stage, and `log_prob` the new probs for
                # this stage. Since they are log probs, we sum them instead of multiplying them.
                # The .view(-1) forces all the hypotheses into one dimension. The shape of this dimension is
                # cur_beam_sz * vocab_sz (ex: 5 * 50002). So after getting the topk from it, we can recover the
                # generating sentence and the next word using: ix // vocab_sz, ix % vocab_sz.
                cur_beam_sz_i, dec_sent_len, vocab_sz = log_prob[i].shape
                # shape (vocab_sz,)
                cumulative_hyp_scores_i = (hyp_scores[i].unsqueeze(-1).unsqueeze(-1)
                                           .expand((cur_beam_sz_i, 1, vocab_sz)) + log_prob[i]).view(-1)

                """
                2.2.2 We get the topk values in cumulative_hyp_scores_i and compute the current (generating)
                sentence and the next word using: ix // vocab_sz, ix % vocab_sz.
                """
                # shape (cur_beam_sz,)
                live_hyp_num_i = beam_size - len(completed_hypotheses[i])
                # shape (cur_beam_sz,). Vals are between 0 and 50002 vocab_sz
                top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(cumulative_hyp_scores_i, k=live_hyp_num_i)
                """
                shape (cur_beam_sz,). prev_hyp_ids vals are 0 <= val < cur_beam_sz.
                hyp_word_ids vals are 0 <= val < vocab_len
                """
                prev_hyp_ids = top_cand_hyp_pos // self.vocab.n_vocabs
                hyp_word_ids = top_cand_hyp_pos % self.vocab.n_vocabs

                """
                2.2.3 For each of the topk words, we append the new word to the current (generating) sentence.
                We add this to new_hypotheses_i and add its corresponding total score to new_hyp_scores_i.
                """
                # Removed live_hyp_ids_i, which is used in the LSTM decoder to track live hypothesis ids
                new_hypotheses_i, new_hyp_scores_i = [], []
                for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids,
                                                                        top_cand_hyp_scores):
                    prev_hyp_id, hyp_word_id, cand_new_hyp_score = \
                        prev_hyp_id.item(), hyp_word_id.item(), cand_new_hyp_score.item()
                    new_hyp_sent = torch.cat(
                        (hypotheses[i][prev_hyp_id], torch.tensor([hyp_word_id], device=self.device)))
                    if hyp_word_id == end_symbol:
                        completed_hypotheses[i].append(Hypothesis(
                            value=[self.vocab.idx2word[a.item()] for a in new_hyp_sent[1:-1]],
                            score=cand_new_hyp_score))
                    else:
                        new_hypotheses_i.append(new_hyp_sent.unsqueeze(-1))
                        new_hyp_scores_i.append(cand_new_hyp_score)

                # 2.2.4 We may find that the hypotheses_i for some example in the batch is empty - we have
                # fully processed that example. We use None as a sentinel in this case.
                # Above, the loops gracefully handle None examples.
                if len(new_hypotheses_i) > 0:
                    hypotheses_i = torch.cat(new_hypotheses_i, dim=-1).transpose(0, -1).to(self.device)
                    hyp_scores_i = torch.tensor(new_hyp_scores_i, dtype=torch.float, device=self.device)
                else:
                    hypotheses_i, hyp_scores_i = None, None
                new_hypotheses += [hypotheses_i]
                new_hyp_scores += [hyp_scores_i]
            # print(new_hypotheses, new_hyp_scores)
            hypotheses, hyp_scores = new_hypotheses, new_hyp_scores

        """
        2.3 Finally, we do some postprocessing to get our final generated candidate sentences.
        Sometimes, we may get to max_len of a sentence and still not generate the </s> end token.
        In this case, the partial sentence we have generated will not be added to completed_hypotheses
        automatically, and we have to manually add it in.
        We add in as many as necessary so that there are `beam_size` completed hypotheses for each example.
        Finally, we sort each completed hypothesis by score.
        """
        for i in range(batch_size):
            hyps_to_add = beam_size - len(completed_hypotheses[i])
            if hyps_to_add > 0:
                scores, ix = torch.topk(hyp_scores[i], k=hyps_to_add)
                for score, id_ in zip(scores, ix):
                    completed_hypotheses[i].append(Hypothesis(
                        value=[self.vocab.idx2word[a.item()] for a in hypotheses[i][id_][1:]],
                        score=score))
            completed_hypotheses[i].sort(key=lambda hyp: hyp.score, reverse=True)
        # print('completed_hypotheses', completed_hypotheses)
        return r2l_completed_hypotheses, completed_hypotheses
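
The snippet below is a minimal usage sketch added for illustration; it is not part of the original script. It assumes a hypothetical toy vocabulary object exposing `n_vocabs`, `word2idx` and `idx2word`, random video features in the 'one' feature mode, a CUDA device (since sequence_mask calls `.cuda()`), and arbitrary hyperparameters.

if __name__ == '__main__':
    # Hypothetical toy vocabulary used only for this demo
    class ToyVocab:
        def __init__(self, n_vocabs=100):
            self.n_vocabs = n_vocabs
            self.word2idx = {'<PAD>': 0, '<S>': 1}
            self.idx2word = {0: '<PAD>', 1: '<S>'}

    vocab = ToyVocab()
    model = ABDTransformer(vocab, d_feat=1024, d_model=128, d_ff=512, n_heads=8,
                           n_layers=2, dropout=0.1, feature_mode='one').cuda()

    src = torch.randn(4, 20, 1024).cuda()            # video features: (batch, frame_num, d_feat)
    trg = torch.randint(2, 100, (4, 12)).cuda()      # l2r decoder input: (batch, seq_len)
    r2l_trg = torch.randint(2, 100, (4, 12)).cuda()  # r2l decoder input: (batch, seq_len)

    # build the padding/sequence masks and run one forward pass
    mask = src_trg_mask(src, r2l_trg, trg, pad_idx=vocab.word2idx['<PAD>'])
    r2l_pred, l2r_pred = model(src, r2l_trg, trg, mask)
    print(r2l_pred.shape, l2r_pred.shape)  # both (4, 12, vocab.n_vocabs) log-probabilities

With feature_mode set to 'two', 'three' or 'four', src would instead be a tuple of feature tensors, and the mask returned by src_trg_mask would itself carry the (enc_src_mask, dec_src_mask) pair, exactly as handled in ABDTransformer.forward.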