Every blog has a motto:
0. 前言
文本生成—1. 數據處理
1. 代碼部分
1. 導入模塊
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
os.environ['CUDA_VISIBLE_DEVICES'] = '/gpu:0'
print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
print(module.__name__,module.__version__)
2. 讀取文件與處理
2.1 讀取
input_filepath = './shakespear.txt'
# FIX: use a context manager so the file handle is always closed, and pin
# the encoding — the original relied on the platform default, which can
# mis-decode the corpus on non-UTF-8 locales (e.g. Windows cp1252).
with open(input_filepath, 'r', encoding='utf-8') as f:
    text = f.read()
print(len(text))
print(text[0:100])
2.2 生成詞表
- generate vocab
- build mapping char->id
- data -> id_data
- abcd -> bcd&lt;eos&gt;
# Data-prep pipeline overview:
#   1. generate the vocabulary
#   2. build the char -> id mapping
#   3. encode the corpus as ids
#   4. shift sequences to form targets (abcd -> bcd<eos>)
# The vocabulary is every distinct character in the corpus, in sorted order.
unique_chars = set(text)
vocab = sorted(unique_chars)
print(len(vocab))
print(vocab)
2.3 char -> id
# Forward mapping: each character -> its position in the sorted vocabulary.
char2idx = {ch: i for i, ch in enumerate(vocab)}
print(char2idx)
2.4 id -> char
# Reverse mapping: index into this array with an id to get the character back.
idx2char = np.array(vocab)
print(idx2char)
# Encode the whole corpus as an integer id sequence.
encoded = [char2idx[ch] for ch in text]
text_as_int = np.array(encoded)
# Sanity check: first 10 ids against the first 10 raw characters.
print(text_as_int[0:10])
print(text[0:10])
2.5 生成輸入和輸出
def split_input_target(id_text):
    """Split an encoded sequence into an (input, target) pair.

    The target is the input shifted left by one position, e.g.
    "abcde" -> ("abcd", "bcde"), so the model learns next-char prediction.
    """
    source = id_text[:-1]
    shifted = id_text[1:]
    return source, shifted
# Build the tf.data pipeline: per-character dataset -> fixed-length chunks
# -> (input, target) pairs -> shuffled training batches.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
# Chunks hold seq_length + 1 chars so input and target are each seq_length
# long after the one-position shift; drop_remainder keeps shapes uniform.
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])
for seq_id in seq_dataset.take(2):
    print(seq_id)
    # BUG FIX: the original had misplaced quotes —
    # repr('.join(idx2char[seq_id.numpy()])') printed that literal string
    # instead of decoding the chunk. Join the characters and show the text.
    print(repr(''.join(idx2char[seq_id.numpy()])))
# Turn each chunk into an (input, target) pair via the shift helper.
seq_dataset = seq_dataset.map(split_input_target)
for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())
batch_size = 64
buffer_size = 10000
# Shuffle within a 10k-element buffer, then batch; drop_remainder again
# keeps every batch exactly batch_size for static-shape training.
seq_dataset = seq_dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)