ICLR 2017: BiDAF, a QA model with a bi-directional attention mechanism
Code
- The paper's contribution: modeling the interaction between the context and the query. BiDAF is a multi-stage hierarchical model that represents the context at different levels of granularity and uses a bi-directional attention flow mechanism to obtain a query-aware context representation.
- Input: a (passage, question) pair; output: the predicted start and end positions of the answer span.
- The model architecture is built from the six layers below.
1. Character Embedding Layer maps each word to a vector space using character-level CNNs; the CNN output is max-pooled over the entire width to obtain a fixed-size vector for each word.
# Defined in __init__: 1. Character Embedding Layer
self.char_emb = nn.Embedding(args.char_vocab_size, args.char_dim, padding_idx=1)
nn.init.uniform_(self.char_emb.weight, -0.001, 0.001)
self.char_conv = nn.Sequential(
    nn.Conv2d(1, args.char_channel_size, (args.char_dim, args.char_channel_width)),
    nn.ReLU()
)
def char_emb_layer(self, x):
    """
    :param x: (batch, seq_len, word_len)
    :return: (batch, seq_len, char_channel_size)
    """
    batch_size = x.size(0)
    # (batch, seq_len, word_len, char_dim)
    x = self.dropout(self.char_emb(x))
    # (batch, seq_len, char_dim, word_len)
    x = x.transpose(2, 3)
    # (batch * seq_len, 1, char_dim, word_len)
    x = x.view(-1, self.args.char_dim, x.size(3)).unsqueeze(1)
    # (batch * seq_len, char_channel_size, 1, conv_len) -> (batch * seq_len, char_channel_size, conv_len)
    x = self.char_conv(x).squeeze(2)
    # (batch * seq_len, char_channel_size, 1) -> (batch * seq_len, char_channel_size)
    x = F.max_pool1d(x, x.size(2)).squeeze(2)
    # (batch, seq_len, char_channel_size)
    x = x.view(batch_size, -1, self.args.char_channel_size)
    return x
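A quick, self-contained shape check of the same pipeline; the hyperparameters (char vocab 96, char_dim=8, char_channel_size=100, char_channel_width=5) are illustrative only, not the paper's settings:

import torch
import torch.nn as nn
import torch.nn.functional as F

char_dim, char_channel_size, char_channel_width = 8, 100, 5
char_emb = nn.Embedding(96, char_dim, padding_idx=1)
char_conv = nn.Sequential(
    nn.Conv2d(1, char_channel_size, (char_dim, char_channel_width)),
    nn.ReLU())

x = torch.randint(0, 96, (2, 7, 12))               # (batch=2, seq_len=7, word_len=12)
e = char_emb(x).transpose(2, 3)                    # (2, 7, char_dim, word_len)
e = e.view(-1, char_dim, e.size(3)).unsqueeze(1)   # (14, 1, char_dim, word_len)
e = char_conv(e).squeeze(2)                        # (14, char_channel_size, conv_len)
e = F.max_pool1d(e, e.size(2)).squeeze(2)          # (14, char_channel_size)
print(e.view(2, 7, char_channel_size).shape)       # torch.Size([2, 7, 100])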
2. Word Embedding Layer maps each word to a vector space using a pre-trained word embedding model.
# 2. Word Embedding Layer
# initialize word embedding with GloVe
self.word_emb = nn.Embedding.from_pretrained(pretrained, freeze=True)
# highway network
assert self.args.hidden_size * 2 == (self.args.char_channel_size + self.args.word_dim)
for i in range(2):
    setattr(self, 'highway_linear{}'.format(i),
            nn.Sequential(Linear(args.hidden_size * 2, args.hidden_size * 2),
                          nn.ReLU()))
    setattr(self, 'highway_gate{}'.format(i),
            nn.Sequential(Linear(args.hidden_size * 2, args.hidden_size * 2),
                          nn.Sigmoid()))
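Note that Linear above is the repository's own module, not nn.Linear. A plausible minimal stand-in (assumed; the actual implementation may differ, e.g. by adding weight initialization):

class Linear(nn.Module):
    # nn.Linear with optional input dropout; an assumed stand-in for the custom module.
    def __init__(self, in_features, out_features, dropout=0.0):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.linear(self.dropout(x))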
def highway_network(self, x1, x2):
    """
    :param x1: (batch, seq_len, char_channel_size)
    :param x2: (batch, seq_len, word_dim)
    :return: (batch, seq_len, hidden_size * 2)
    """
    # (batch, seq_len, char_channel_size + word_dim)
    x = torch.cat([x1, x2], dim=-1)
    for i in range(2):
        h = getattr(self, 'highway_linear{}'.format(i))(x)
        g = getattr(self, 'highway_gate{}'.format(i))(x)
        x = g * h + (1 - g) * x
    # (batch, seq_len, hidden_size * 2)
    return x
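Each highway step thus computes the standard highway-network update, with $g$ acting as the transform gate:

$$ x \leftarrow g \odot \mathrm{ReLU}(W_h x + b_h) + (1 - g) \odot x, \qquad g = \sigma(W_g x + b_g). $$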
3. Contextual Embedding Layer utilizes contextual cues from surrounding words to refine the embedding of the words. These first three layers are applied to both the query and context.
The LSTMs are bidirectional, so their output dimension is 2d (i.e. hidden_size * 2).
# 3. Contextual Embedding Layer
self.context_LSTM = LSTM(input_size=args.hidden_size * 2,
                         hidden_size=args.hidden_size,
                         bidirectional=True,
                         batch_first=True,
                         dropout=args.dropout)
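LSTM here is likewise a custom wrapper: it is called as LSTM((x, lengths)) and returns (output, (h, c)). A minimal sketch assuming it packs padded sequences by length (the repository's version may differ):

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTM(nn.Module):
    # nn.LSTM over (x, lengths) with padding handled; an assumed stand-in.
    def __init__(self, input_size, hidden_size, bidirectional, batch_first, dropout):
        super().__init__()
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                           bidirectional=bidirectional, batch_first=batch_first)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        x, lengths = inputs                  # x: (batch, seq_len, input_size)
        packed = pack_padded_sequence(self.dropout(x), lengths.cpu(),
                                      batch_first=True, enforce_sorted=False)
        out, (h, c) = self.rnn(packed)
        out, _ = pad_packed_sequence(out, batch_first=True)
        return out, (h, c)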
4. Attention Flow Layer couples the query and context vectors and produces a set of query-aware feature vectors for each word in the context.
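In the paper, the similarity between context column $H_{:t}$ and query column $U_{:j}$ is the trilinear function

$$ S_{tj} = w_{(S)}^{\top}\,[H_{:t};\ U_{:j};\ H_{:t}\circ U_{:j}]. $$

Splitting the weight vector as $w_{(S)} = [w_c;\, w_q;\, w_{cq}]$ gives

$$ S_{tj} = w_c^{\top}H_{:t} + w_q^{\top}U_{:j} + w_{cq}^{\top}(H_{:t}\circ U_{:j}), $$

which is exactly the decomposition att_weight_c, att_weight_q, and att_weight_cq implement below, avoiding the full (batch, c_len, q_len, hidden_size * 2) tiled tensor.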
def att_flow_layer(self, c, q):
    """
    :param c: (batch, c_len, hidden_size * 2)
    :param q: (batch, q_len, hidden_size * 2)
    :return: (batch, c_len, hidden_size * 8)
    """
    c_len = c.size(1)
    q_len = q.size(1)
    # A naive implementation would tile c and q into (batch, c_len, q_len, hidden_size * 2)
    # tensors, e.g. c.unsqueeze(2).expand(-1, -1, q_len, -1) * q.unsqueeze(1).expand(-1, c_len, -1, -1),
    # but that is memory-hungry; the loop below avoids materializing it.
    cq = []
    for i in range(q_len):
        # (batch, 1, hidden_size * 2)
        qi = q.select(1, i).unsqueeze(1)
        # (batch, c_len, 1) -> (batch, c_len)
        ci = self.att_weight_cq(c * qi).squeeze(-1)
        cq.append(ci)
    # (batch, c_len, q_len)
    cq = torch.stack(cq, dim=-1)
    # similarity matrix: (batch, c_len, q_len)
    s = self.att_weight_c(c).expand(-1, -1, q_len) + \
        self.att_weight_q(q).permute(0, 2, 1).expand(-1, c_len, -1) + \
        cq
    # context-to-query attention: (batch, c_len, q_len)
    a = F.softmax(s, dim=2)
    # (batch, c_len, q_len) x (batch, q_len, hidden_size * 2) -> (batch, c_len, hidden_size * 2)
    c2q_att = torch.bmm(a, q)
    # query-to-context attention: (batch, 1, c_len)
    b = F.softmax(torch.max(s, dim=2)[0], dim=1).unsqueeze(1)
    # (batch, 1, c_len) x (batch, c_len, hidden_size * 2) -> (batch, hidden_size * 2)
    q2c_att = torch.bmm(b, c).squeeze(1)
    # (batch, c_len, hidden_size * 2) (tiled across context positions)
    q2c_att = q2c_att.unsqueeze(1).expand(-1, c_len, -1)
    # (batch, c_len, hidden_size * 8)
    x = torch.cat([c, c2q_att, c * c2q_att, c * q2c_att], dim=-1)
    return x
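In the paper's notation, the code above computes context-to-query attention $\tilde U_{:t} = \sum_j a_{tj} U_{:j}$ with $a_t = \mathrm{softmax}(S_{t:})$, and query-to-context attention $\tilde h = \sum_t b_t H_{:t}$ with $b = \mathrm{softmax}(\max_{j} S_{tj})$, tiled across all context positions. The returned query-aware representation is

$$ G_{:t} = [\,H_{:t};\ \tilde U_{:t};\ H_{:t}\circ\tilde U_{:t};\ H_{:t}\circ\tilde h\,] \in \mathbb{R}^{8d}. $$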
5. Modeling Layer employs a Recurrent Neural Network to scan the context.
# 5. Modeling Layer
self.modeling_LSTM1 = LSTM(input_size=args.hidden_size * 8,
                           hidden_size=args.hidden_size,
                           bidirectional=True,
                           batch_first=True,
                           dropout=args.dropout)
self.modeling_LSTM2 = LSTM(input_size=args.hidden_size * 2,
                           hidden_size=args.hidden_size,
                           bidirectional=True,
                           batch_first=True,
                           dropout=args.dropout)
6. Output Layer provides an answer to the query.
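From the paper, the start and end distributions over context positions are

$$ p^{1} = \mathrm{softmax}\big(w_{(p^{1})}^{\top}[G; M]\big), \qquad p^{2} = \mathrm{softmax}\big(w_{(p^{2})}^{\top}[G; M^{2}]\big), $$

where $M^{2}$ is $M$ passed through one more bidirectional LSTM. The code below computes the logits (the two halves of each weight vector appear as p1_weight_g/p1_weight_m and p2_weight_g/p2_weight_m) and leaves the softmax to the loss function.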
def output_layer(self, g, m, l):
    """
    :param g: (batch, c_len, hidden_size * 8)
    :param m: (batch, c_len, hidden_size * 2)
    :param l: (batch,) context lengths
    :return: p1: (batch, c_len), p2: (batch, c_len)
    """
    # start-position logits: (batch, c_len)
    p1 = (self.p1_weight_g(g) + self.p1_weight_m(m)).squeeze(-1)
    # (batch, c_len, hidden_size * 2)
    m2 = self.output_LSTM((m, l))[0]
    # end-position logits: (batch, c_len)
    p2 = (self.p2_weight_g(g) + self.p2_weight_m(m2)).squeeze(-1)
    return p1, p2
The overall structure, tied together in forward:
def forward(self, batch):
    # 1. Character Embedding Layer
    c_char = self.char_emb_layer(batch.c_char)
    q_char = self.char_emb_layer(batch.q_char)
    # 2. Word Embedding Layer
    c_word = self.word_emb(batch.c_word[0])
    q_word = self.word_emb(batch.q_word[0])
    c_lens = batch.c_word[1]
    q_lens = batch.q_word[1]
    # Highway network
    c = self.highway_network(c_char, c_word)
    q = self.highway_network(q_char, q_word)
    # 3. Contextual Embedding Layer
    c = self.context_LSTM((c, c_lens))[0]
    q = self.context_LSTM((q, q_lens))[0]
    # 4. Attention Flow Layer
    g = self.att_flow_layer(c, q)
    # 5. Modeling Layer
    m = self.modeling_LSTM2((self.modeling_LSTM1((g, c_lens))[0], c_lens))[0]
    # 6. Output Layer
    p1, p2 = self.output_layer(g, m, c_lens)
    # (batch, c_len), (batch, c_len)
    return p1, p2
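A minimal usage sketch for training: p1 and p2 are logits over context positions, so the loss is typically the sum of cross-entropies of the gold start and end indices. batch.s_idx and batch.e_idx are assumed field names here (torchtext-style), not guaranteed by the source.

criterion = nn.CrossEntropyLoss()
p1, p2 = model(batch)                                  # (batch, c_len) logits each
# batch.s_idx / batch.e_idx: assumed gold start/end position fields
loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
loss.backward()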
Model implementation:
- Experimental results.