從零基礎入門Tensorflow2.0 ----七、35. 文本生成之---1.數據處理

every blog every motto:

0. 前言

文本生成—1. 數據處理

1. 代碼部分

1. 導入模塊

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

os.environ['CUDA_VISIBLE_DEVICES'] = '/gpu:0'
print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__,module.__version__)

2. 讀取文件與處理

2.1 讀取

# 2.1 Read the whole corpus into one string.
input_filepath = './shakespear.txt'
# Use a context manager so the handle is always closed, and pin the
# encoding explicitly instead of relying on the platform default.
with open(input_filepath, 'r', encoding='utf-8') as f:
    text = f.read()

print(len(text))       # total number of characters in the corpus
print(text[0:100])     # peek at the first 100 characters

2.2 生成詞表

  1. generate vocab
  2. build mapping char->id
  3. data -> id_data
  4. abcd -> bcd
# Pipeline overview:
#   1. generate the vocabulary
#   2. build the mapping char -> id
#   3. encode the data as id sequences
#   4. shift: abcd -> bcd<eos>

# 2.2 Vocabulary = every distinct character of the corpus, sorted.
unique_chars = set(text)
vocab = sorted(unique_chars)
print(len(vocab))
print(vocab)

2.3 char -> id

# 2.3 Forward mapping: character -> integer id (position in vocab).
char2idx = {}
for position, character in enumerate(vocab):
    char2idx[character] = position
print(char2idx)

2.4 id -> char

# 2.4 Inverse mapping: id -> character. A numpy array supports fancy
# indexing, so a whole id sequence decodes in one expression.
idx2char = np.array(vocab)
print(idx2char)
# Encode the entire corpus as a sequence of integer ids.
encoded = [char2idx[ch] for ch in text]
text_as_int = np.array(encoded)
print(text_as_int[0:10])
print(text[0:10])

2.5 生成輸入和輸出

def split_input_target(id_text):
    """Produce the (input, target) pair for next-character prediction.

    The target is the input shifted left by one: abcde -> (abcd, bcde).
    """
    model_input = id_text[0:-1]
    model_target = id_text[1:]
    return model_input, model_target

# 2.5 Build the (input, target) training dataset with tf.data.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
# Batch seq_length + 1 chars so that after the input/target shift each
# sample is exactly seq_length long; drop the ragged tail.
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)

for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

for seq_id in seq_dataset.take(2):
    print(seq_id)
    # BUG FIX: the join expression was accidentally wrapped in quotes,
    # so the original printed the expression's source text instead of
    # the decoded character sequence.
    print(repr(''.join(idx2char[seq_id.numpy()])))
# Turn each length-(seq_length+1) sequence into an (input, target) pair.
seq_dataset = seq_dataset.map(split_input_target)

for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())
batch_size = 64
buffer_size = 10000

# Shuffle, then group into training batches of shape (batch_size, seq_length).
seq_dataset = seq_dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章