TensorFlow 2.0 Data Preparation

Notes compiled from the course 《Google老師親授 TensorFlow2.0 入門到進階》.
Official docs: tf.data: Build TensorFlow input pipelines
TFRecord and tf.Example

1. Building a dataset directly from memory with tf.data.Dataset.from_tensor_slices()

tf.data.Dataset.from_tensor_slices() slices the input along its first dimension and turns each slice into a tensor. Depending on the version you may hit the error 'iter() is only supported inside of tf.function or when eager execution is enabled'. On TF 1.x, enabling eager execution at the start fixes it (eager execution is on by default in TF 2.x).

import numpy as np
import tensorflow as tf

# TF 1.x only; eager execution is already the default in TF 2.x
tf.enable_eager_execution()
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
print(dataset)
# <TensorSliceDataset shapes: (), types: tf.int64>
for item in dataset:
    print(item)
    
# tf.Tensor(0, shape=(), dtype=int64)
# tf.Tensor(1, shape=(), dtype=int64)
# tf.Tensor(2, shape=(), dtype=int64)
# ...
# tf.Tensor(9, shape=(), dtype=int64)

Passing a tuple of packed arrays builds (feature, label) pairs:

x = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array(['cat', 'dog', 'fox'])
dataset3 = tf.data.Dataset.from_tensor_slices((x, y))
for item_x, item_y in dataset3:
    print(item_x.numpy(), item_y.numpy())
    
# [1 2] b'cat'
# [3 4] b'dog'
# [5 6] b'fox'

Passing a dict builds the dataset with named features:

dataset4 = tf.data.Dataset.from_tensor_slices({"feature": x,
                                               "label": y})
for item in dataset4:
    print(item["feature"].numpy(), item["label"].numpy())

# [1 2] b'cat'
# [3 4] b'dog'
# [5 6] b'fox'

Batching

test_arr = np.zeros((1000, 10))
for i in range(1000):
    test_arr[i, :] = i * np.ones((1, 10))

label = test_arr[:, 9]
for i in range(10):
    print(test_arr[i,:])
    print(label[i])
# [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
# 0.0
# [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
# 1.0
# [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
# 2.0
# [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
# 3.0
# [4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
# 4.0

tfdata = tf.data.Dataset.from_tensor_slices((test_arr, label))
train_ds = tfdata.batch(3)
for item in train_ds:
    print(item)

(The original notes embedded two screenshots here: the printed batches with batch(3) and batch(5).)
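A minimal sketch of what those screenshots showed (the shapes are the point here):

# batch(3): each element is a (features, labels) pair with shapes (3, 10) and (3,)
for features, labels in tfdata.batch(3).take(2):
    print(features.shape, labels.numpy())

# batch(5): the shapes become (5, 10) and (5,)
for features, labels in tfdata.batch(5).take(2):
    print(features.shape, labels.numpy())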

Shuffling with shuffle()

Shuffling the dataset requires a buffer size, usually the size of the training set. I have 1000 samples here, so I set it to 1000.
(Screenshot in the original notes: output with shuffle(1000).)

I tried different shuffle buffer sizes:

shuffle(10)
(Screenshot: output with shuffle(10).)

shuffle(100)
(Screenshot: output with shuffle(100).)

shuffle(10000)
(Screenshot: output with shuffle(10000).)
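Since those screenshots don't survive, here is a small reconstruction of the comparison: a small buffer only shuffles locally, while a buffer at least as large as the dataset (1000 here) gives a full uniform shuffle.

for buffer_size in (10, 100, 1000, 10000):
    shuffled = tfdata.shuffle(buffer_size).batch(3)
    # With buffer_size=10 the first labels stay close to 0; with 1000
    # or 10000 they can come from anywhere in the 0..999 range.
    print(buffer_size, next(iter(shuffled))[1].numpy())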

repeat()

repeat(n) repeats the dataset n times; a bare repeat() means repeat indefinitely.
The example below repeats the data twice, which is equivalent to concatenating the full dataset with itself.
batch() then draws batches from this repeated dataset.
(Screenshot in the original notes: output with repeat(2).)
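A small reconstruction of that example (using a five-element dataset for readability, not the original figure). Note how batch(3) crosses the boundary between the two copies:

repeated = tf.data.Dataset.from_tensor_slices(np.arange(5)).repeat(2).batch(3)
for item in repeated:
    print(item.numpy())
# [0 1 2]
# [3 4 0]
# [1 2 3]
# [4]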

Template

def make_dataset(data, labels, epochs, batch_size, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size).prefetch(50)
    return dataset

steps_per_epoch

A dataset pipeline built this way feeds the model one batch at a time. For an infinitely repeating dataset, i.e. one built with a bare repeat() as above, you must specify how many steps make up an epoch during training, validation, and testing.

Specifying the repeat count

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
# (11610, 8) (11610,)
# (3870, 8) (3870,)
# (5160, 8) (5160,)

def make_dataset(data, labels, epochs, batch_size, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size).prefetch(50)
    return dataset
    
batch_size = 32
tfrecords_train_set = make_dataset(x_train, y_train, 1, batch_size)
tfrecords_valid_set = make_dataset(x_valid, y_valid, 1, batch_size)
tfrecords_test_set = make_dataset(x_test, y_test, 1, batch_size)
# ...
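# Note: Keras restarts a finite tf.data dataset at the beginning of each
# epoch, so these repeat(1) datasets still work with epochs = 100 below.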
history = model.fit(tfrecords_train_set,
                    validation_data = tfrecords_valid_set,
                    epochs = 100,
                    callbacks = callbacks)
model.evaluate(tfrecords_test_set)

Infinite repetition

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
# (11610, 8) (11610,)
# (3870, 8) (3870,)
# (5160, 8) (5160,)

# bare repeat(): repeat indefinitely
def make_dataset(data, labels, batch_size, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat().batch(batch_size).prefetch(50)
    return dataset
    
batch_size = 32
tfrecords_train_set = make_dataset(x_train, y_train, batch_size)
tfrecords_valid_set = make_dataset(x_valid, y_valid, batch_size)
tfrecords_test_set = make_dataset(x_test, y_test, batch_size)
# ...
history = model.fit(tfrecords_train_set,
                    validation_data = tfrecords_valid_set,
                    steps_per_epoch = 11610 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)
model.evaluate(tfrecords_test_set, steps = 5160 // batch_size)

The TFRecord file format

Saving to a TFRecord file

def serialize_example(x, y):
    """Convert one (x, y) pair to a tf.train.Example and serialize it."""
    input_features = tf.train.FloatList(value = x)
    label = tf.train.FloatList(value = y)
    features = tf.train.Features(
        feature = {
            "input_features": tf.train.Feature(
                float_list = input_features),
            "labels": tf.train.Feature(float_list = label)
        }
    )
    example = tf.train.Example(features = features)
    return example.SerializeToString()


def serialize_example_tfcords(filename_fullpath, dataset, label, compression_type = None):
    """Write (dataset, label) rows to a local TFRecord file."""
    options = tf.io.TFRecordOptions(
        compression_type = compression_type)

    with tf.io.TFRecordWriter(filename_fullpath, options) as writer:
        for x, y in zip(dataset, label):
            writer.write(serialize_example(x, y))
 
rand_feats = np.random.rand(10000,10)
print(rand_feats.shape)
rand_feats_labels = np.random.rand(10000,1)
print(rand_feats_labels.shape)

import os

output_dir = "generate_tfrecords"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
filename = 'train.tfrecord'
train_set_path = os.path.join(output_dir, filename)
serialize_example_tfcords(train_set_path, rand_feats, rand_feats_labels, compression_type=None)
# compress with GZIP
filename = 'train.tfrecord.gz'
train_compressed_set_path = os.path.join(output_dir, filename)
serialize_example_tfcords(train_compressed_set_path, rand_feats, rand_feats_labels, compression_type='GZIP')
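As a quick sanity check (my addition, not from the original notes), a single row round-trips in memory before we read the files back:

one_serialized = serialize_example(rand_feats[0], rand_feats_labels[0])
example = tf.train.Example()
example.ParseFromString(one_serialized)
# prints the one float stored under the "labels" key
print(example.features.feature["labels"].float_list.value)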

Reading TFRecord files

Reading an uncompressed TFRecord file

Read the files back from the path we just saved to with tf.data.TFRecordDataset().

# read tfrecord
dataset = tf.data.TFRecordDataset([train_set_path])
for serialized_example_tensor in dataset.take(3):
    print(serialized_example_tensor)
# tf.Tensor(b'\nT\n\x12\n\x06labels\x12\x08\x12\x06\n\x04\x9f\xe9\x99>\n>\n\x0einput_features\x12,\x12*\n(,m\xc0>\x17\xd4\xf5>f_S>9\x93\xc6>\x9f,5=\xce\x96W>\xb3T\x1c?\xd93<?\x89\x0bn?\x9d\xbf\x8f>', shape=(), dtype=string)
# tf.Tensor(b'\nT\n>\n\x0einput_features\x12,\x12*\n(\x94\xf6g?\x8a\x0c\xf6<\xbe%\x0b?\xa2\xcd\x94>\xfaF\x99=\x7f\xa4b?uPw?$@d?$m\xd0>\x8e\xb9_?\n\x12\n\x06labels\x12\x08\x12\x06\n\x04\xff\xb4\xae>', shape=(), dtype=string)
# tf.Tensor(b'\nT\n\x12\n\x06labels\x12\x08\x12\x06\n\x04c\x9fB?\n>\n\x0einput_features\x12,\x12*\n(3\x9em=7\x95\xe2>K\xb8\xf0=\x0f\x19k?\x0e\xb8\xfe>l\xa4=?\xa6\xa5^>X5\x97>:*\x8a>k\x04!?', shape=(), dtype=string)
# read the data back in
dataset = tf.data.TFRecordDataset([train_set_path])

# Every key in this dict must match a key written into the features above.
feature_description = {
    # The declared shape must match what was written: [] means a scalar,
    # so the 10-float feature needs [10] and the 1-float label needs [1].
    "input_features": tf.io.FixedLenFeature([10], dtype=tf.float32),
    "labels": tf.io.FixedLenFeature([1], dtype=tf.float32)
}

# parse a single record with tf.io.parse_single_example()
def parse_example(serialized_example):
    example = tf.io.parse_single_example(serialized_example, feature_description)
    return example
print(dataset)
# <TFRecordDatasetV2 shapes: (), types: tf.string>
# parse every record with map
dataset = dataset.map(parse_example, num_parallel_calls=5)
dataset
# <ParallelMapDataset shapes: {input_features: (10,), labels: (1,)}, types: {input_features: tf.float32, labels: tf.float32}>
for parsed_record in dataset.take(10):
    print(repr(parsed_record['input_features']))
# The original notes hit an unresolved error in this loop:
# InvalidArgumentError: Key: input_features.  Can't parse serialized Example
# The cause: FixedLenFeature([]) declares a scalar, but input_features was
# written as 10 floats. Declaring the shapes [10] and [1] as above fixes it.

Reading a GZIP-compressed TFRecord file

dataset = tf.data.TFRecordDataset([train_compressed_set_path], compression_type='GZIP')
print(dataset)

# Inspect one raw record as a tf.train.Example *before* mapping: after
# map() the elements are dicts of tensors, not serialized strings, which
# is why this loop kept failing in the original notes.
for raw_record in dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)

dataset = dataset.map(parse_example, num_parallel_calls=5)
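With the shapes declared correctly, the mapped dataset yields dict elements that can be inspected or fed to a model directly:

for parsed in dataset.take(2):
    print(parsed["input_features"].numpy(), parsed["labels"].numpy())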