教程:https://www.bilibili.com/video/BV1vz4y1R7Mm?p=3
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random
# Choose the compute device once; tensors and models below are moved onto it.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")

# Seed every random number generator this script relies on (PyTorch, NumPy,
# Python's `random`, and CUDA when it is available) so runs are repeatable.
torch.manual_seed(1)
np.random.seed(1)
random.seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)
# Hyper-parameters: batch width, embedding width and vocabulary cap.
BATCH_SIZE = 32
EMBEDDING_SIZE = 650
MAX_VOCAB_SIZE = 5000

# One lower-casing text field is shared by the three dataset splits.
text = torchtext.data.Field(lower=True)
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path="text8",
    train="text8.train.txt",
    validation="text8.dev.txt",
    test="text8.test.txt",
    text_field=text,
)

# The vocabulary is built from the training split only and capped at
# MAX_VOCAB_SIZE words; the special tokens come on top of that cap.
text.build_vocab(train, max_size=MAX_VOCAB_SIZE)
# Notebook-style bare expression: shows the vocabulary size when run in a cell.
len(text.vocab)
5002
# `itos` is the index -> token list and `stoi` the token -> index mapping;
# show the first few entries of each.
print(text.vocab.itos[0:10])
print(list(text.vocab.stoi.items())[0:5])
['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']
[('<unk>', 0), ('<pad>', 1), ('the', 2), ('of', 3), ('and', 4)]
# `bptt_len` is the number of time steps per batch, i.e. the window used for
# (truncated) backpropagation through time.
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    device=device,
    bptt_len=50,
    repeat=False,
    shuffle=True,
)

# Peek at the first training batch by decoding column 0 of `text` and of
# `target`; the target is the same word stream shifted one position ahead.
it = iter(train_iter)
batch = next(it)
print(" ".join(text.vocab.itos[i] for i in batch.text[:, 0].data))
print(" ".join(text.vocab.itos[i] for i in batch.target[:, 0].data))
anarchism originated as a term of abuse first used against early working class <unk> including the <unk> of the english revolution and the <unk> <unk> of the french revolution whilst the term is still used in a <unk> way to describe any act that used violent means to destroy the
originated as a term of abuse first used against early working class <unk> including the <unk> of the english revolution and the <unk> <unk> of the french revolution whilst the term is still used in a <unk> way to describe any act that used violent means to destroy the organization
定義模型
import torch.nn as nn
class RNNModel(nn.Module):
    """Word-level language model: embedding -> single-layer LSTM -> linear decoder."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        """Create the three layers.

        Args:
            vocab_size: number of embedding rows and number of scores the
                decoder emits per position.
            embed_size: width of each embedding vector (the LSTM input size).
            hidden_size: width of the LSTM hidden state.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # batch_first is left at its default (False): inputs are time-major.
        self.lstm = nn.LSTM(embed_size, hidden_size)
        # Project every hidden vector to one score per vocabulary word so the
        # loss can compare the scores against the target word index.
        self.decoder = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size

    def forward(self, input, hidden):
        """Score the next word at every position of `input`.

        Args:
            input: integer word indices laid out as (seq_len, batch_size).
            hidden: the LSTM state passed straight to `self.lstm`
                (an (h, c) tuple as produced by `init_hidden`).

        Returns:
            A pair `(scores, hidden)`: `scores` has shape
            (seq_len, batch_size, vocab_size) and `hidden` is the state
            returned by the LSTM after the last time step.
        """
        embedded = self.embed(input)  # (seq_len, batch_size, embed_size)
        lstm_out, hidden = self.lstm(embedded, hidden)  # (seq_len, batch_size, hidden_size)
        seq_len, batch_size, hidden_dim = lstm_out.shape
        # Collapse time and batch so the decoder sees a plain 2-D matrix.
        flat = lstm_out.view(seq_len * batch_size, hidden_dim)
        scores = self.decoder(flat)  # (seq_len * batch_size, vocab_size)
        return scores.view(seq_len, batch_size, -1), hidden

    def init_hidden(self, batch_size, requires_grad=True):
        """Return zero-filled (h_0, c_0) tensors of shape (1, batch_size, hidden_size)."""
        # Any parameter will do: it is only used so the new tensors are
        # created with the model's dtype and on the model's device.
        reference = next(self.parameters())
        state_shape = (1, batch_size, self.hidden_size)
        h_0 = reference.new_zeros(state_shape, requires_grad=requires_grad)
        c_0 = reference.new_zeros(state_shape, requires_grad=requires_grad)
        return h_0, c_0
# Build the language model and, when CUDA is available, move it to the GPU.
model = RNNModel(
    vocab_size=len(text.vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=100,
)
model = model.to(device) if USE_CUDA else model
def repackage_hidden(h):
    """Detach hidden state(s) from the autograd graph that produced them.

    The values are kept, but gradients no longer flow back into earlier
    batches, which keeps backpropagation confined to the current batch.

    Args:
        h: a tensor, a (possibly nested) iterable of tensors such as the
            LSTM's (h, c) tuple, or None.

    Returns:
        The detached tensor, a tuple mirroring the structure of `h` with
        every tensor detached, or None when `h` is None.
    """
    if h is None:
        # No state yet; nothing to detach.
        return None
    if isinstance(h, torch.Tensor):
        return h.detach()
    # The recursive result must be returned: without `return` the caller got
    # None and the LSTM silently restarted from a zero state on every batch.
    return tuple(repackage_hidden(v) for v in h)
# Training objective and optimisation settings.
GRAD_CLIP = 5.0
learning_rate = 1e-3
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Every scheduler.step() multiplies the learning rate by 0.5 (halves it).
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)
def evaluate(model, val_iter, max_batches=7):
    """Return the average per-token loss of `model` over `val_iter`.

    Args:
        model: module called as `model(data, hidden) -> (output, hidden)`
            that also provides `init_hidden(batch_size, requires_grad=...)`.
        val_iter: iterable of batches exposing `.text` and `.target`,
            laid out as (seq_len, batch_size).
        max_batches: evaluate at most this many batches. The default of 7
            matches the debugging cap that used to be hard-coded here
            (TODO: switch callers to None for a full validation pass).
            None means "use every batch".

    Returns:
        The token-weighted mean of the per-batch losses, as a float.

    Raises:
        ValueError: if no tokens were evaluated (empty iterator or
            `max_batches` of 0).
    """
    from itertools import islice

    was_training = model.training
    model.eval()
    total_loss = 0.0
    total_count = 0
    hidden = None
    try:
        with torch.no_grad():
            for batch in islice(val_iter, max_batches):
                data, target = batch.text, batch.target
                if hidden is None:
                    # Size the state from the batch itself (axis 1 is the
                    # batch axis) instead of relying on a global constant.
                    hidden = model.init_hidden(data.size(1), requires_grad=False)
                hidden = repackage_hidden(hidden)
                output, hidden = model(data, hidden)
                # Flatten time and batch: (N, vocab) scores vs (N,) targets.
                loss = loss_fn(output.view(-1, output.size(-1)), target.view(-1))
                # loss_fn averages over the batch, so weight it by the number
                # of tokens to get a correct mean across unequal batches.
                n_tokens = data.numel()
                total_loss += loss.item() * n_tokens
                total_count += n_tokens
    finally:
        # Put the model back in whatever mode the caller had it in.
        model.train(was_training)
    if total_count == 0:
        raise ValueError("evaluate() received no batches to evaluate")
    return total_loss / total_count
# Train for two epochs; keep the checkpoint with the lowest validation loss
# and decay the learning rate whenever validation does not improve.
min_val_loss = None
for epoch in range(2):
    model.train()
    it = iter(train_iter)
    # Fresh LSTM state at the start of every epoch.
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(it):
        data, target = batch.text, batch.target
        # Hand the carried-over state to repackage_hidden so it is detached
        # from the previous batch's graph before being reused.
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)

        # The loss expects (N, vocab_size) scores and (N,) target indices,
        # so both tensors are flattened over the time and batch axes.
        loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))

        optimizer.zero_grad()
        loss.backward()
        # Clip the gradient norm to GRAD_CLIP before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        # if i % 100 == 0:  # TODO: UNCOMMENT THIS
        print("loss", loss.item())

        if i % 2 != 0:
            continue
        val_loss = evaluate(model, val_iter)
        print("val loss: ", val_loss)
        if min_val_loss is not None and not val_loss < min_val_loss:
            # learning rate decay
            scheduler.step()
            continue
        min_val_loss = val_loss
        torch.save(model.state_dict(), "lm.pth")
        print("model saved to lm.pth")
loss 5.831831455230713
val loss: 6.243127346038818
model saved to lm.pth
loss 5.8846540451049805
loss 5.957371234893799
val loss: 6.169812338692801
model saved to lm.pth
loss 5.931859016418457
loss 5.970404148101807
val loss: 6.123684201921735
model saved to lm.pth
loss 5.694666862487793
loss 5.788720607757568
val loss: 6.091047423226493
model saved to lm.pth
loss 5.983757495880127
--------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-62-4392e163776c> in <module>
12 loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))
13 optimizer.zero_grad()
---> 14 loss.backward()
15 torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
16 optimizer.step()
D:\App\Anaconda3\envs\py36torch\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph)
116 products. Defaults to ``False``.
117 """
--> 118 torch.autograd.backward(self, gradient, retain_graph, create_graph)
119
120 def register_hook(self, hook):
D:\App\Anaconda3\envs\py36torch\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
91 Variable._execution_engine.run_backward(
92 tensors, grad_tensors, retain_graph, create_graph,
---> 93 allow_unreachable=True) # allow_unreachable flag
94
95
KeyboardInterrupt:
# Rebuild the same architecture, then restore the best checkpoint's weights.
best_model = RNNModel(
    vocab_size=len(text.vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=100,
)
if USE_CUDA:
    best_model = best_model.to(device)
# torch.load reads the saved state dict into memory; map_location places the
# stored tensors on `device`, so a checkpoint written on a GPU machine can
# still be loaded on a CPU-only one.
# load_state_dict then copies those tensors into the model's parameters.
best_model.load_state_dict(torch.load("lm.pth", map_location=device))
<All keys matched successfully>
用模型生成句子
# Sample 100 words from the restored model, feeding each sampled word back
# in as the next input.
best_model.eval()
# Inference only: without no_grad (and with a requires_grad state) autograd
# would keep a graph spanning all 100 steps alive.
with torch.no_grad():
    hidden = best_model.init_hidden(1, requires_grad=False)
    # Random first word index, shaped (seq_len=1, batch_size=1) as the model expects.
    # NOTE(review): the name `input` shadows the builtin; kept because other
    # cells may refer to it.
    input = torch.randint(len(text.vocab), (1, 1), dtype=torch.long).to(device)
    print(input)
    words = []
    for _ in range(100):
        # output: (seq_len=1, batch_size=1, vocab_size)
        output, hidden = best_model(input, hidden)
        # softmax is the normalised exp() of the scores, computed without the
        # overflow risk of calling exp() on raw logits.
        word_weights = torch.softmax(output.squeeze(), dim=-1).cpu()
        # Draw the next word at random according to those weights
        # (argmax would give greedy decoding instead).
        word_idx = torch.multinomial(word_weights, 1).item()
        # Overwrite the input tensor in place for the next iteration.
        input.fill_(word_idx)
        words.append(text.vocab.itos[word_idx])
print(" ".join(words))
tensor([[612]])
interest integral read come metric moscow churches claim sharp resistance adults mormon demands explicitly universe time gregorian czech following baptist seven by vertical intervention running freedom drive <unk> of <unk> divided of major <unk> elements <unk> in other from united five empire un maritime of a to malaysia philosophical is <unk> if real for according five two three liquid <unk> to <unk> <unk> by <unk> elsewhere sequence <unk> <unk> the <unk> culture life animals that <unk> <unk> <unk> ion are one taken <unk> <unk> <unk> <unk> <unk> <unk> <unk> could used apple <unk> <unk> <unk> scene <unk> temperature kennedy educational