The Keras website has a number of sequence-generation model examples: https://keras.io/
This post looks at Facebook's bAbI project.
URL: https://keras.io/zh/examples/babi_rnn/
Below, the official babi_rnn.py example code is annotated in more detail.
Keywords: multi-input / multi-output Model; the Keras Embedding layer; LSTM units; layers.concatenate (joining the outputs of several layers).
from __future__ import print_function
from functools import reduce
import re
import tarfile
import numpy as np
from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
def tokenize(sent):
    '''Return the tokens of a sentence, including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    # r'(\W+)' instead of the original r'(\W+)?': a pattern that can match the
    # empty string changes re.split behaviour in Python 3.7+ and would split
    # words into single characters
    return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbI task format.
    If only_supporting is true,
    only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:  # process line by line
        line = line.decode('utf-8').strip()  # strip() removes leading/trailing characters (whitespace/newlines by default)
        nid, line = line.split(' ', 1)  # nid is the line number within the story
        nid = int(nid)  # convert to int
        if nid == 1:
            story = []  # line number 1 marks the start of a new story
        if '\t' in line:  # a tab means this line is a question/answer pair
            q, a, supporting = line.split('\t')  # '\t' is the horizontal tab character
            q = tokenize(q)  # split the question into a list of tokens, punctuation included
            if only_supporting:
                # keep only the relevant substories (a substory is a single sentence)
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]  # i is a 1-based index yielded by the map
            else:
                # provide all the substories
                substory = [x for x in story if x]  # 'if x' filters out the '' placeholders appended for question lines
            data.append((substory, q, a))  # a triple (facts, question, answer) makes up one QA
            story.append('')  # '' marks this position as a question line
        else:
            sent = tokenize(line)  # split the fact sentence into a list of tokens
            story.append(sent)  # append the token list to the story
    return data
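# An illustrative QA1-style snippet and the triple parse_stories would produce for it
# (example lines showing the file format only, not read from disk here):
#   1 Mary moved to the bathroom.
#   2 John went to the hallway.
#   3 Where is Mary? <tab> bathroom <tab> 1
# -> ([['Mary', 'moved', 'to', 'the', 'bathroom', '.'],
#      ['John', 'went', 'to', 'the', 'hallway', '.']],
#     ['Where', 'is', 'Mary', '?'],
#     'bathroom')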
def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories,
    and then convert the sentences into a single story.
    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    # reduce() folds the sequence with the given two-argument function:
    # it is first applied to elements 1 and 2, then to that result and
    # element 3, and so on, until a single value remains
    data = [(flatten(story), q, answer) for story, q, answer in data  # flatten joins a story's fact sentences into one token list
            if not max_length or len(flatten(story)) < max_length]  # keep only stories shorter than max_length
    return data
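# A quick check of flatten (reduce with list concatenation), using toy token lists:
#   flatten([['Mary', 'moved', '.'], ['John', 'went', '.']])
#   -> ['Mary', 'moved', '.', 'John', 'went', '.']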
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqs = []
    ys = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]  # word_idx index of every word in the story
        xq = [word_idx[w] for w in query]  # word_idx index of every word in the question
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx) + 1)
        y[word_idx[answer]] = 1  # y is the one-hot encoding of the answer
        xs.append(x)  # xs collects the story vectors
        xqs.append(xq)  # xqs collects the question vectors
        ys.append(y)  # ys collects the answer vectors
    return (pad_sequences(xs, maxlen=story_maxlen),
            pad_sequences(xqs, maxlen=query_maxlen), np.array(ys))
# pad_sequences takes a list of iterables as its first argument and truncates or pads each one to maxlen
# a = [[1, 2, 3, 4, 5, 6, 7]]
# b = pad_sequences(a, maxlen=4)  # b becomes [[4, 5, 6, 7]]
# a = [[5, 6, 7]]
# b = pad_sequences(a, maxlen=4)  # b becomes [[0, 5, 6, 7]]
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 20
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
EMBED_HIDDEN_SIZE,
SENT_HIDDEN_SIZE,
QUERY_HIDDEN_SIZE))
try:
path = get_file('tasks_1-20_v1-2.tar.gz',
origin='https://s3.amazonaws.com/text-datasets/'
'babi_tasks_1-20_v1-2.tar.gz')
except:
print('Error downloading dataset, please download it manually:\n'
'$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2'
'.tar.gz\n'
'$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
raise
# Default: QA1 task, 1000 samples
challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt'
# QA1 task, 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt'
# QA2 task, 1000 samples
# challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt'
# QA2 task, 10,000 samples
# challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'
with tarfile.open(path) as tar:
    train = get_stories(tar.extractfile(challenge.format('train')))  # a list of (story, question, answer) triples
    test = get_stories(tar.extractfile(challenge.format('test')))
vocab = set()  # vocab collects every word that appears in train and test
for story, q, answer in train + test:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)  # sorted() returns a list in lexicographic order, which makes the word -> index mapping below deterministic
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
# word -> index, a dict of the form {'.': 1, '?': 2, 'Daniel': 3, ...}
# note that Python's map() is not a dictionary; it applies a function over an iterable
story_maxlen = max(map(len, (x for x, _, _ in train + test)))  # maximum number of words in a story
query_maxlen = max(map(len, (x for _, x, _ in train + test)))  # maximum number of words in a question
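# With a toy vocabulary (made-up example, the real vocab comes from the bAbI files):
#   vocab = ['.', '?', 'Mary', 'bathroom']
#   word_idx == {'.': 1, '?': 2, 'Mary': 3, 'bathroom': 4} and vocab_size == 5
#   (index 0 stays unused so pad_sequences can pad with zeros)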
x, xq, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)
print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))
print('Build model...')
sentence = layers.Input(shape=(story_maxlen,), dtype='int32')
# On the meaning of the Embedding layer, see:
#   reference 1: https://blog.csdn.net/u013249853/article/details/89194787?depth_1.utm_source=distribute.pc_relevant.none-task&utm_source=distribute.pc_relevant.none-task
#   reference 2: http://frankchen.xyz/2017/12/18/How-to-Use-Word-Embedding-Layers-for-Deep-Learning-with-Keras/
# In short, it turns positive integers (indices) into dense vectors of a fixed size, e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
# My own reading: a local (one-hot) representation -> a distributed representation
encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(sentence)  # EMBED_HIDDEN_SIZE = 50
encoded_sentence = RNN(SENT_HIDDEN_SIZE)(encoded_sentence)
# The first argument, units, is a positive integer: the dimensionality of the output space.
# Put differently, it is the number of hidden units in each LSTM cell, a free hyperparameter (here 100);
# the resulting hidden state is what later feeds the softmax layer
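# Shape check (assuming the LSTM default return_sequences=False):
#   sentence:         (batch, story_maxlen)                     integer word indices
#   after Embedding:  (batch, story_maxlen, EMBED_HIDDEN_SIZE)  one 50-d vector per word
#   after the LSTM:   (batch, SENT_HIDDEN_SIZE)                 a single 100-d summary of the story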
question = layers.Input(shape=(query_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
encoded_question = RNN(QUERY_HIDDEN_SIZE)(encoded_question)  # same construction as for the sentence above
# layers.concatenate
# joins a list of input tensors:
# it takes a list of tensors, all with the same shape except along the concatenation axis,
# and returns a single tensor that is the concatenation of all the inputs
merged = layers.concatenate([encoded_sentence, encoded_question])
preds = layers.Dense(vocab_size, activation='softmax')(merged)  # add a softmax layer over the vocabulary (see the Keras docs on Dense)
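# Continuing the shape check:
#   encoded_question: (batch, QUERY_HIDDEN_SIZE) = (batch, 100)
#   merged:           (batch, 200)        the two summaries joined along the last axis
#   preds:            (batch, vocab_size) a probability distribution over the vocabulary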
# Model's first argument is inputs, the second is outputs
# multi-input / multi-output model examples: https://keras.io/zh/getting-started/functional-api-guide/
model = Model([sentence, question], preds)  # [sentence, question] are the two inputs, preds is the output
model.compile(optimizer='adam',  # compile() must be told which loss function to use
              loss='categorical_crossentropy',
              metrics=['accuracy'])
print('Training')
model.fit([x, xq], y,
batch_size=BATCH_SIZE,
epochs=EPOCHS,
validation_split=0.05)
print('Evaluation')
loss, acc = model.evaluate([tx, txq], ty,
batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))
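As a quick sanity check after training, one possible way to look at a single prediction is sketched below (idx_word is a hypothetical helper built by inverting word_idx; it is not part of the original example):

idx_word = dict((i, w) for w, i in word_idx.items())  # index -> word, the inverse of word_idx
probs = model.predict([tx[:1], txq[:1]])              # shape (1, vocab_size)
predicted = idx_word[int(np.argmax(probs[0]))]        # most probable word
expected = idx_word[int(np.argmax(ty[0]))]            # ground-truth answer
print('predicted: {} / expected: {}'.format(predicted, expected))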
Summary:
- Python's map() performs a mapping over an iterable; it is not a dictionary
- the Keras Embedding layer gives features a distributed representation, usually a low-dimensional dense vector; I have not looked into how it is implemented internally (a small standalone sketch follows after this list)
- pad_sequences makes all input sequences the same length
- Model's first argument is inputs, the second is outputs
- the code spends a lot of effort on preprocessing the input text; my guess is that preprocessing takes a similarly large share of the time in real engineering problems
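To make the Embedding point concrete, here is a minimal standalone sketch (toy sizes, independent of the bAbI script and of how Keras implements the layer internally):

import numpy as np
from keras.layers import Embedding, Input
from keras.models import Model

inp = Input(shape=(4,), dtype='int32')             # a "sentence" of 4 word indices
emb = Embedding(input_dim=10, output_dim=3)(inp)   # vocabulary of 10 words -> 3-d dense vectors
m = Model(inp, emb)
vectors = m.predict(np.array([[1, 2, 5, 0]]))      # shape (1, 4, 3): one learned vector per index
print(vectors.shape)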