PyTorch Study Notes (3): Language Model

Tutorial: https://www.bilibili.com/video/BV1vz4y1R7Mm?p=3

import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)
    
BATCH_SIZE = 32
EMBEDDING_SIZE = 650
MAX_VOCAB_SIZE = 5000
text = torchtext.data.Field(lower=True)
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
        path='text8', train='text8.train.txt', validation='text8.dev.txt', 
        test='text8.test.txt', text_field=text)
text.build_vocab(train, max_size=MAX_VOCAB_SIZE)
len(text.vocab)
5002
print(text.vocab.itos[:10])  # itos: list mapping index -> token
print(list(text.vocab.stoi.items())[:5])  # stoi: dict mapping token -> index
['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']
[('<unk>', 0), ('<pad>', 1), ('the', 2), ('of', 3), ('and', 4)]
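As a quick check (not in the original notebook), stoi and itos are inverses: a sentence can be round-tripped through the vocabulary, and any out-of-vocabulary word falls back to index 0, i.e. <unk>:

# hypothetical round-trip check: tokens -> indices -> tokens
tokens = "the french revolution".split()
indices = [text.vocab.stoi[w] for w in tokens]  # stoi is a defaultdict; OOV words map to 0 (<unk>)
recovered = [text.vocab.itos[i] for i in indices]
print(indices, recovered)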
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
        (train, val, test), batch_size=BATCH_SIZE, device=device, 
        bptt_len=50, repeat=False, shuffle=True)  # bptt: backpropagation through time; each batch holds bptt_len consecutive tokens
it = iter(train_iter)
batch = next(it)
print(" ".join([text.vocab.itos[i] for i in batch.text[:,0].data]))
print(" ".join([text.vocab.itos[i] for i in batch.target[:,0].data]))
anarchism originated as a term of abuse first used against early working class <unk> including the <unk> of the english revolution and the <unk> <unk> of the french revolution whilst the term is still used in a <unk> way to describe any act that used violent means to destroy the
originated as a term of abuse first used against early working class <unk> including the <unk> of the english revolution and the <unk> <unk> of the french revolution whilst the term is still used in a <unk> way to describe any act that used violent means to destroy the organization
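The printed pair shows that batch.target is simply batch.text shifted left by one token, which is exactly the language-modelling objective. A minimal check (my addition, using the batch from above):

# text and target both have shape (bptt_len, batch_size);
# within one stream, target[t] should equal text[t+1]
assert torch.equal(batch.text[1:, 0], batch.target[:-1, 0])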

Define the model

import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size)  # batch_first=False
        self.decoder = nn.Linear(hidden_size, vocab_size)  # project to vocab_size so the output can be compared against the target word indices (one-hot form)
        self.hidden_size = hidden_size
        
    def forward(self, input, hidden):
        emb = self.embed(input)  # seq_len * batch_size * embed_size
        output, hidden = self.lstm(emb, hidden)  # seq_len * batch_size * hidden_size;
        # 2nd output of lstm: 
        # h_n of shape (num_layers * num_directions, batch, hidden_size): 
        # tensor containing the hidden state for t = seq_len.
        # Like output, the layers can be separated using 
        # h_n.view(num_layers, num_directions, batch, hidden_size) and similarly for c_n.

        output1 = output.view(-1, output.shape[2])  # (seq_len*batch_size) * hidden_size
        decoded = self.decoder(output1)  # (seq_len*batch_size) * vocab_size
        return decoded.view(output.shape[0], output.shape[1], -1), hidden
    
    def init_hidden(self, batch_size, requires_grad=True):
        weight = next(self.parameters())  # grab any parameter, just to create new tensors with the same dtype and device
        return (weight.new_zeros((1, batch_size, self.hidden_size), requires_grad=requires_grad),
                weight.new_zeros((1, batch_size, self.hidden_size), requires_grad=requires_grad))
model = RNNModel(vocab_size=len(text.vocab),
                embed_size=EMBEDDING_SIZE,
                hidden_size=100)
if USE_CUDA:
    model = model.to(device)
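Before training, a small shape sanity check (my addition, assuming the model and vocabulary defined above): push a batch of random indices through the untrained model and confirm the output has one score per vocabulary word at every position.

# dummy input of shape (seq_len=50, batch_size=32), same layout as the BPTT batches
dummy = torch.randint(len(text.vocab), (50, BATCH_SIZE), dtype=torch.long, device=device)
h0 = model.init_hidden(BATCH_SIZE)
out, h1 = model(dummy, h0)
print(out.shape)  # expected: torch.Size([50, 32, 5002])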
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()  # cut the link to the previous graph: keep the values, but stop gradients from flowing further back (truncated BPTT)
    else:
        return tuple(repackage_hidden(v) for v in h)  # LSTM hidden state is an (h, c) tuple
loss_fn = nn.CrossEntropyLoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)  # halve the learning rate on each scheduler.step()
GRAD_CLIP = 5.0
def evaluate(model, val_iter):
    model.eval()
    total_loss = 0.
    total_count = 0.
    it = iter(val_iter)
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for i, batch in enumerate(it):
            data, target = batch.text, batch.target
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)

            loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))
            total_loss += loss.item() * np.multiply(*data.size())
            total_count += np.multiply(*data.size())
            
            if i > 5: break  # TODO: REMOVE THIS
    
    model.train()
    return total_loss / total_count
min_val_loss = None
for epoch in range(2):
    model.train()
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)  # initialize the LSTM hidden state (h, c) to zeros
    for i, batch in enumerate(it):
        data, target = batch.text, batch.target
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        
        # after .view, output is (seq_len*batch_size) x vocab_size and target is (seq_len*batch_size), which is what CrossEntropyLoss expects
        loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))  
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        
        # if i % 100 == 0:  # TODO: UNCOMMENT THIS
        print("loss", loss.item())
            
        if i % 2 == 0:
            val_loss = evaluate(model, val_iter)
            print("val loss: ", val_loss)
            if min_val_loss is None or val_loss < min_val_loss:
                min_val_loss = val_loss
                torch.save(model.state_dict(), "lm.pth")
                print("model saved to lm.pth")
            else:
                # learning rate decay
                scheduler.step()
loss 5.831831455230713
val loss:  6.243127346038818
model saved to lm.pth
loss 5.8846540451049805
loss 5.957371234893799
val loss:  6.169812338692801
model saved to lm.pth
loss 5.931859016418457
loss 5.970404148101807
val loss:  6.123684201921735
model saved to lm.pth
loss 5.694666862487793
loss 5.788720607757568
val loss:  6.091047423226493
model saved to lm.pth
loss 5.983757495880127



--------------------------------------------------------------------------

KeyboardInterrupt                        Traceback (most recent call last)

<ipython-input-62-4392e163776c> in <module>
     12         loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))
     13         optimizer.zero_grad()
---> 14         loss.backward()
     15         torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
     16         optimizer.step()


D:\App\Anaconda3\envs\py36torch\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph)
    116                 products. Defaults to ``False``.
    117         """
--> 118         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    119 
    120     def register_hook(self, hook):


D:\App\Anaconda3\envs\py36torch\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     91     Variable._execution_engine.run_backward(
     92         tensors, grad_tensors, retain_graph, create_graph,
---> 93         allow_unreachable=True)  # allow_unreachable flag
     94 
     95 


KeyboardInterrupt: 
best_model = RNNModel(vocab_size=len(text.vocab), 
                     embed_size=EMBEDDING_SIZE,
                     hidden_size=100)
if USE_CUDA:
    best_model = best_model.to(device)
best_model.load_state_dict(torch.load("lm.pth"))
# torch.load: reads the saved state dict from disk into memory
# load_state_dict: copies that data into the model's parameters
<All keys matched successfully>
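Not in the original notebook, but a natural next step: the cross-entropy returned by evaluate is the average negative log-likelihood per token (in nats), so exponentiating it gives perplexity. A sketch using the functions defined above (note that evaluate still contains the `if i > 5: break` debug shortcut, so this is only an estimate over a few batches):

import math

test_loss = evaluate(best_model, test_iter)
print("test loss:", test_loss, "perplexity:", math.exp(test_loss))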

Generating sentences with the model

hidden = best_model.init_hidden(1)
input = torch.randint(len(text.vocab), (1, 1), dtype=torch.long).to(device)  # randomly pick one word index as the seed input
print(input)
words = []
for i in range(100):
    output, hidden = best_model(input, hidden)  # input has shape (1, 1): seq_len=1, batch_size=1, so the same forward pass works for generation
    # output: seq_len(1) * batch_size(1) * vocab_size(5002)
    word_weights = output.squeeze().exp().cpu()
    # multinomial: sample one index at random, weighted by the (exponentiated) scores
    word_idx = torch.multinomial(word_weights, 1)[0]  # or use argmax
    input.fill_(word_idx)  # fill_ writes the sampled index back into input for the next step
    word = text.vocab.itos[word_idx]
    words.append(word)

print(" ".join(words))
tensor([[612]])
interest integral read come metric moscow churches claim sharp resistance adults mormon demands explicitly universe time gregorian czech following baptist seven by vertical intervention running freedom drive <unk> of <unk> divided of major <unk> elements <unk> in other from united five empire un maritime of a to malaysia philosophical is <unk> if real for according five two three liquid <unk> to <unk> <unk> by <unk> elsewhere sequence <unk> <unk> the <unk> culture life animals that <unk> <unk> <unk> ion are one taken <unk> <unk> <unk> <unk> <unk> <unk> <unk> could used apple <unk> <unk> <unk> scene <unk> temperature kennedy educational
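A common variant (my addition, not shown in the tutorial) is temperature sampling: dividing the scores by a temperature below 1.0 sharpens the distribution and makes the generated text more conservative, while a temperature above 1.0 makes it more random.

temperature = 0.8  # hypothetical value; lower = more conservative sampling
hidden = best_model.init_hidden(1)
input = torch.randint(len(text.vocab), (1, 1), dtype=torch.long).to(device)
words = []
for i in range(100):
    output, hidden = best_model(input, hidden)
    word_weights = (output.squeeze() / temperature).exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[0]
    input.fill_(word_idx)
    words.append(text.vocab.itos[word_idx])
print(" ".join(words))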