PyTorch Study Notes (3): Language Model

Tutorial: https://www.bilibili.com/video/BV1vz4y1R7Mm?p=3

import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)
    
BATCH_SIZE = 32
EMBEDDING_SIZE = 650
MAX_VOCAB_SIZE = 5000
text = torchtext.data.Field(lower=True)
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
        path='text8', train='text8.train.txt', validation='text8.dev.txt', 
        test='text8.test.txt', text_field=text)
text.build_vocab(train, max_size=MAX_VOCAB_SIZE)
len(text.vocab)
5002
print(text.vocab.itos[:10])  # itos: list mapping index -> token
print(list(text.vocab.stoi.items())[:5])  # stoi: dict mapping token -> index
['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']
[('<unk>', 0), ('<pad>', 1), ('the', 2), ('of', 3), ('and', 4)]
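As a quick check (not in the original notebook), stoi and itos are inverses: a sentence can be round-tripped through the vocabulary, and any out-of-vocabulary word falls back to index 0, i.e. <unk>:

# hypothetical round-trip check: tokens -> indices -> tokens
tokens = "the french revolution".split()
indices = [text.vocab.stoi[w] for w in tokens]  # stoi is a defaultdict; OOV words map to 0 (<unk>)
recovered = [text.vocab.itos[i] for i in indices]
print(indices, recovered)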
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
        (train, val, test), batch_size=BATCH_SIZE, device=device, 
        bptt_len=50, repeat=False, shuffle=True)  # bptt: backpropagation through time; each batch holds bptt_len consecutive tokens
it = iter(train_iter)
batch = next(it)
print(" ".join([text.vocab.itos[i] for i in batch.text[:,0].data]))
print(" ".join([text.vocab.itos[i] for i in batch.target[:,0].data]))
anarchism originated as a term of abuse first used against early working class <unk> including the <unk> of the english revolution and the <unk> <unk> of the french revolution whilst the term is still used in a <unk> way to describe any act that used violent means to destroy the
originated as a term of abuse first used against early working class <unk> including the <unk> of the english revolution and the <unk> <unk> of the french revolution whilst the term is still used in a <unk> way to describe any act that used violent means to destroy the organization
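The printed pair shows that batch.target is simply batch.text shifted left by one token, which is exactly the language-modelling objective. A minimal check (my addition, using the batch from above):

# text and target both have shape (bptt_len, batch_size);
# within one stream, target[t] should equal text[t+1]
assert torch.equal(batch.text[1:, 0], batch.target[:-1, 0])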

Define the model

import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size)  # batch_first=False
        self.decoder = nn.Linear(hidden_size, vocab_size)  # project to vocab_size so the output can be compared against the target word indices (one-hot form)
        self.hidden_size = hidden_size
        
    def forward(self, input, hidden):
        emb = self.embed(input)  # seq_len * batch_size * embed_size
        output, hidden = self.lstm(emb, hidden)  # seq_len * batch_size * hidden_size;
        # 2nd output of lstm: 
        # h_n of shape (num_layers * num_directions, batch, hidden_size): 
        # tensor containing the hidden state for t = seq_len.
        # Like output, the layers can be separated using 
        # h_n.view(num_layers, num_directions, batch, hidden_size) and similarly for c_n.

        output1 = output.view(-1, output.shape[2])  # (seq_len*batch_size) * hidden_size
        decoded = self.decoder(output1)  # (seq_len*batch_size) * vocab_size
        return decoded.view(output.shape[0], output.shape[1], -1), hidden
    
    def init_hidden(self, batch_size, requires_grad=True):
        weight = next(self.parameters())  # grab any parameter, just to create new tensors with the same dtype and device
        return (weight.new_zeros((1, batch_size, self.hidden_size), requires_grad=requires_grad),
                weight.new_zeros((1, batch_size, self.hidden_size), requires_grad=requires_grad))
model = RNNModel(vocab_size=len(text.vocab),
                embed_size=EMBEDDING_SIZE,
                hidden_size=100)
if USE_CUDA:
    model = model.to(device)
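Before training, a small shape sanity check (my addition, assuming the model and vocabulary defined above): push a batch of random indices through the untrained model and confirm the output has one score per vocabulary word at every position.

# dummy input of shape (seq_len=50, batch_size=32), same layout as the BPTT batches
dummy = torch.randint(len(text.vocab), (50, BATCH_SIZE), dtype=torch.long, device=device)
h0 = model.init_hidden(BATCH_SIZE)
out, h1 = model(dummy, h0)
print(out.shape)  # expected: torch.Size([50, 32, 5002])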
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()  # cut the link to the previous graph: keep the values, but stop gradients from flowing further back (truncated BPTT)
    else:
        return tuple(repackage_hidden(v) for v in h)  # LSTM hidden state is an (h, c) tuple
loss_fn = nn.CrossEntropyLoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)  # halve the learning rate on each scheduler.step()
GRAD_CLIP = 5.0
def evaluate(model, val_iter):
    model.eval()
    total_loss = 0.
    total_count = 0.
    it = iter(val_iter)
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for i, batch in enumerate(it):
            data, target = batch.text, batch.target
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)

            loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))
            total_loss += loss.item() * np.multiply(*data.size())
            total_count += np.multiply(*data.size())
            
            if i > 5: break  # TODO: REMOVE THIS
    
    model.train()
    return total_loss / total_count
min_val_loss = None
for epoch in range(2):
    model.train()
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)  # initialize the LSTM hidden state (h, c) to zeros
    for i, batch in enumerate(it):
        data, target = batch.text, batch.target
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        
        # after .view, output is (seq_len*batch_size) x vocab_size and target is (seq_len*batch_size), which is what CrossEntropyLoss expects
        loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))  
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        
        # if i % 100 == 0:  # TODO: UNCOMMENT THIS
        print("loss", loss.item())
            
        if i % 2 == 0:
            val_loss = evaluate(model, val_iter)
            print("val loss: ", val_loss)
            if min_val_loss is None or val_loss < min_val_loss:
                min_val_loss = val_loss
                torch.save(model.state_dict(), "lm.pth")
                print("model saved to lm.pth")
            else:
                # learning rate decay
                scheduler.step()
loss 5.831831455230713
val loss:  6.243127346038818
model saved to lm.pth
loss 5.8846540451049805
loss 5.957371234893799
val loss:  6.169812338692801
model saved to lm.pth
loss 5.931859016418457
loss 5.970404148101807
val loss:  6.123684201921735
model saved to lm.pth
loss 5.694666862487793
loss 5.788720607757568
val loss:  6.091047423226493
model saved to lm.pth
loss 5.983757495880127



--------------------------------------------------------------------------

KeyboardInterrupt                        Traceback (most recent call last)

<ipython-input-62-4392e163776c> in <module>
     12         loss = loss_fn(output.view(-1, len(text.vocab)), target.view(-1))
     13         optimizer.zero_grad()
---> 14         loss.backward()
     15         torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
     16         optimizer.step()


D:\App\Anaconda3\envs\py36torch\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph)
    116                 products. Defaults to ``False``.
    117         """
--> 118         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    119 
    120     def register_hook(self, hook):


D:\App\Anaconda3\envs\py36torch\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     91     Variable._execution_engine.run_backward(
     92         tensors, grad_tensors, retain_graph, create_graph,
---> 93         allow_unreachable=True)  # allow_unreachable flag
     94 
     95 


KeyboardInterrupt: 
best_model = RNNModel(vocab_size=len(text.vocab), 
                     embed_size=EMBEDDING_SIZE,
                     hidden_size=100)
if USE_CUDA:
    best_model = best_model.to(device)
best_model.load_state_dict(torch.load("lm.pth"))
# torch.load: reads the saved state dict from disk into memory
# load_state_dict: copies that data into the model's parameters
<All keys matched successfully>
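Not in the original notebook, but a natural next step: the cross-entropy returned by evaluate is the average negative log-likelihood per token (in nats), so exponentiating it gives perplexity. A sketch using the functions defined above (note that evaluate still contains the `if i > 5: break` debug shortcut, so this is only an estimate over a few batches):

import math

test_loss = evaluate(best_model, test_iter)
print("test loss:", test_loss, "perplexity:", math.exp(test_loss))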

Generating sentences with the model

hidden = best_model.init_hidden(1)
input = torch.randint(len(text.vocab), (1, 1), dtype=torch.long).to(device)  # randomly pick one word index as the seed input
print(input)
words = []
for i in range(100):
    output, hidden = best_model(input, hidden)  # input has shape (1, 1): seq_len=1, batch_size=1, so the same forward pass works for generation
    # output: seq_len(1) * batch_size(1) * vocab_size(5002)
    word_weights = output.squeeze().exp().cpu()
    # multinomial: sample one index at random, weighted by the (exponentiated) scores
    word_idx = torch.multinomial(word_weights, 1)[0]  # or use argmax
    input.fill_(word_idx)  # fill_ writes the sampled index back into input for the next step
    word = text.vocab.itos[word_idx]
    words.append(word)

print(" ".join(words))
tensor([[612]])
interest integral read come metric moscow churches claim sharp resistance adults mormon demands explicitly universe time gregorian czech following baptist seven by vertical intervention running freedom drive <unk> of <unk> divided of major <unk> elements <unk> in other from united five empire un maritime of a to malaysia philosophical is <unk> if real for according five two three liquid <unk> to <unk> <unk> by <unk> elsewhere sequence <unk> <unk> the <unk> culture life animals that <unk> <unk> <unk> ion are one taken <unk> <unk> <unk> <unk> <unk> <unk> <unk> could used apple <unk> <unk> <unk> scene <unk> temperature kennedy educational
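A common variant (my addition, not shown in the tutorial) is temperature sampling: dividing the scores by a temperature below 1.0 sharpens the distribution and makes the generated text more conservative, while a temperature above 1.0 makes it more random.

temperature = 0.8  # hypothetical value; lower = more conservative sampling
hidden = best_model.init_hidden(1)
input = torch.randint(len(text.vocab), (1, 1), dtype=torch.long).to(device)
words = []
for i in range(100):
    output, hidden = best_model(input, hidden)
    word_weights = (output.squeeze() / temperature).exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[0]
    input.fill_(word_idx)
    words.append(text.vocab.itos[word_idx])
print(" ".join(words))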