These notes are compiled from the course 《Google老师亲授 TensorFlow2.0 入门到进阶》.
Official documentation: tf.data: Build TensorFlow input pipelines
TFRecord and tf.Example
1. Building a dataset directly from memory with tf.data.Dataset.from_tensor_slices()
tf.data.Dataset.from_tensor_slices() converts in-memory data into a dataset of tensors. Depending on your TensorFlow version you may hit the error 'iter() is only supported inside of tf.function or when eager execution is enabled'; on TF 1.x, calling tf.enable_eager_execution() at the top of the script fixes it (eager execution is on by default in TF 2.x).
import numpy as np
import tensorflow as tf

tf.enable_eager_execution()  # TF 1.x only; eager execution is the default in TF 2.x

dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
print(dataset)
# <TensorSliceDataset shapes: (), types: tf.int64>
for item in dataset:
    print(item)
# tf.Tensor(0, shape=(), dtype=int64)
# tf.Tensor(1, shape=(), dtype=int64)
# tf.Tensor(2, shape=(), dtype=int64)
# ...
# tf.Tensor(9, shape=(), dtype=int64)
Passing the data as a tuple builds (feature, label) pairs:
x = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array(['cat', 'dog', 'fox'])
dataset3 = tf.data.Dataset.from_tensor_slices((x, y))
for item_x, item_y in dataset3:
    print(item_x.numpy(), item_y.numpy())
# [1 2] b'cat'
# [3 4] b'dog'
# [5 6] b'fox'
Building the dataset by passing in a dict:
dataset4 = tf.data.Dataset.from_tensor_slices({"feature": x,
                                               "label": y})
for item in dataset4:
    print(item["feature"].numpy(), item["label"].numpy())
# [1 2] b'cat'
# [3 4] b'dog'
# [5 6] b'fox'
Batching with batch()
test_arr = np.zeros((1000, 10))
for i in range(1000):
    test_arr[i, :] = i * np.ones((1, 10))
label = test_arr[:, 9]
for i in range(10):
    print(test_arr[i, :])
    print(label[i])
# [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
# 0.0
# [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
# 1.0
# [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
# 2.0
# [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
# 3.0
# [4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
# 4.0
tfdata = tf.data.Dataset.from_tensor_slices((test_arr, label))
train_ds = tfdata.batch(3)
for item in train_ds:
    print(item)
Shuffling with shuffle()
To shuffle a dataset you must pass a buffer size; for a full shuffle it should generally be at least the size of the training set. With 1000 samples here, a buffer of 1000 works. I tried several buffer sizes: shuffle(10), shuffle(100), and shuffle(10000); a sketch comparing them follows.
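A minimal sketch (my own toy comparison, using the label array built above) of how the buffer size affects shuffle quality: with a small buffer, the first emitted elements can only come from the start of the stream, while a buffer at least as large as the dataset allows a full shuffle.
# Compare the first few shuffled elements under different buffer sizes.
ds = tf.data.Dataset.from_tensor_slices(label)
for buffer_size in (10, 100, 10000):
    shuffled = ds.shuffle(buffer_size, seed=42)
    print(buffer_size, [int(v.numpy()) for v in shuffled.take(5)])
# With buffer_size=10 the first elements are all drawn from the earliest
# samples; with buffer_size >= 1000 they can come from anywhere in the data.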
repeat()
repeat() specifies how many times to iterate over the data; called with no argument, it repeats indefinitely.
Repeating twice is equivalent to concatenating the full dataset with itself to form one dataset; batch() then draws batches from that repeated stream, as sketched below.
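A minimal sketch (my own toy example) of repeat(2) followed by batch():
# 5 elements repeated twice -> 10 elements -> 3 batches of up to 4.
ds = tf.data.Dataset.from_tensor_slices(np.arange(5))
ds = ds.repeat(2).batch(4)
for batch in ds:
    print(batch.numpy())
# [0 1 2 3]
# [4 0 1 2]
# [3 4]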
Template
def make_dataset(data, labels, epochs, batch_size, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size).prefetch(50)
    return dataset
steps_per_epoch
A tf.data pipeline feeds the model one batch at a time. With an infinitely repeating dataset, i.e. repeat() called without a count as above, training, validation, and testing each need to be told how many steps make up one epoch (steps_per_epoch, validation_steps, and steps).
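The step counts are just the sample counts divided by the batch size; a minimal sketch using the numbers from the example below:
# 11610 training samples at batch size 32 -> 362 full batches per epoch
num_samples = 11610
batch_size = 32
print(num_samples // batch_size)  # 362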
Specifying the repeat count
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()  # 20640 samples, 8 features
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
# (11610, 8) (11610,)
# (3870, 8) (3870,)
# (5160, 8) (5160,)
def make_dataset(data, labels, epochs, batch_size, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size).prefetch(50)
    return dataset
batch_size = 32
tfrecords_train_set = make_dataset(x_train, y_train, 1, batch_size)
tfrecords_valid_set = make_dataset(x_valid, y_valid, 1, batch_size)
tfrecords_test_set = make_dataset(x_test, y_test, 1, batch_size)
# ...
history = model.fit(tfrecords_train_set,
                    validation_data = tfrecords_valid_set,
                    epochs = 100,
                    callbacks = callbacks)
model.evaluate(tfrecords_test_set)
Infinite repeat
The data loading and train/validation/test split are identical to the previous example.
# call repeat() with no count: the dataset loops indefinitely
def make_dataset(data, labels, batch_size, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat().batch(batch_size).prefetch(50)
    return dataset
batch_size = 32
tfrecords_train_set = make_dataset(x_train, y_train, batch_size)
tfrecords_valid_set = make_dataset(x_valid, y_valid, batch_size)
tfrecords_test_set = make_dataset(x_test, y_test, batch_size)
# ...
history = model.fit(tfrecords_train_set,
                    validation_data = tfrecords_valid_set,
                    steps_per_epoch = 11610 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)
model.evaluate(tfrecords_test_set, steps = 5160 // batch_size)
The TFRecord file format
Saving to a TFRecord file
def serialize_example(x, y):
    """Convert x, y to a tf.train.Example and serialize it."""
    input_features = tf.train.FloatList(value = x)
    label = tf.train.FloatList(value = y)
    features = tf.train.Features(
        feature = {
            "input_features": tf.train.Feature(
                float_list = input_features),
            "labels": tf.train.Feature(float_list = label)
        }
    )
    example = tf.train.Example(features = features)
    return example.SerializeToString()
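As a sanity check (my own addition, not from the course), the serialized bytes can be parsed back into a tf.train.Example proto to inspect their structure:
# Round-trip one record through the proto.
serialized = serialize_example([1.0, 2.0, 3.0], [0.5])
example_proto = tf.train.Example.FromString(serialized)
print(example_proto)
# features { feature { key: "input_features" value { float_list { ... } } }
#            feature { key: "labels" value { float_list { ... } } } }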
def serialize_example_tfcords(filename_fullpath, dataset, label, compression_type = None):
    """Save the data to a local TFRecord file."""
    options = tf.io.TFRecordOptions(
        compression_type = compression_type)
    with tf.io.TFRecordWriter(filename_fullpath, options) as writer:
        for x, y in zip(dataset, label):
            writer.write(serialize_example(x, y))
import os

rand_feats = np.random.rand(10000, 10)
print(rand_feats.shape)
rand_feats_labels = np.random.rand(10000, 1)
print(rand_feats_labels.shape)

output_dir = "generate_tfrecords"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

filename = 'train.tfrecord'
train_set_path = os.path.join(output_dir, filename)
serialize_example_tfcords(train_set_path, rand_feats, rand_feats_labels, compression_type=None)

# write a GZIP-compressed copy
filename = 'train.tfrecord.zip'
train_compressed_set_path = os.path.join(output_dir, filename)
serialize_example_tfcords(train_compressed_set_path, rand_feats, rand_feats_labels, compression_type='GZIP')
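A quick way to see the effect of compression (my own addition) is to compare the files on disk; note that random floats like these compress poorly, so the saving is usually larger on real data:
# Compare on-disk sizes of the uncompressed and GZIP-compressed files.
print(os.path.getsize(train_set_path))
print(os.path.getsize(train_compressed_set_path))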
Reading TFRecord files
Reading an uncompressed TFRecord file
Use tf.data.TFRecordDataset() to read the file back from the path where the TFRecord data was just saved.
# read tfrecord
dataset = tf.data.TFRecordDataset([train_set_path])
for serialized_example_tensor in dataset.take(3):
    print(serialized_example_tensor)
# tf.Tensor(b'\nT\n\x12\n\x06labels\x12\x08\x12\x06\n\x04\x9f\xe9\x99>\n>\n\x0einput_features\x12,\x12*\n(,m\xc0>\x17\xd4\xf5>f_S>9\x93\xc6>\x9f,5=\xce\x96W>\xb3T\x1c?\xd93<?\x89\x0bn?\x9d\xbf\x8f>', shape=(), dtype=string)
# tf.Tensor(b'\nT\n>\n\x0einput_features\x12,\x12*\n(\x94\xf6g?\x8a\x0c\xf6<\xbe%\x0b?\xa2\xcd\x94>\xfaF\x99=\x7f\xa4b?uPw?$@d?$m\xd0>\x8e\xb9_?\n\x12\n\x06labels\x12\x08\x12\x06\n\x04\xff\xb4\xae>', shape=(), dtype=string)
# tf.Tensor(b'\nT\n\x12\n\x06labels\x12\x08\x12\x06\n\x04c\x9fB?\n>\n\x0einput_features\x12,\x12*\n(3\x9em=7\x95\xe2>K\xb8\xf0=\x0f\x19k?\x0e\xb8\xfe>l\xa4=?\xa6\xa5^>X5\x97>:*\x8a>k\x04!?', shape=(), dtype=string)
# parse the data back
dataset = tf.data.TFRecordDataset([train_set_path])
# Every key in this dict must match a key used when the features were written.
feature_description = {
    # The declared shape must match what was written: 10 floats per example here.
    "input_features": tf.io.FixedLenFeature([10], dtype=tf.float32),
    "labels": tf.io.FixedLenFeature([1], dtype=tf.float32)
}
# parse a single record with tf.io.parse_single_example()
def parse_example(serialized_example):
    example = tf.io.parse_single_example(serialized_example, feature_description)
    return example
print(dataset)
# <TFRecordDatasetV2 shapes: (), types: tf.string>
# use map to parse every record
dataset = dataset.map(parse_example, num_parallel_calls=5)
dataset
# <ParallelMapDataset shapes: {input_features: (10,), labels: (1,)}, types: {input_features: tf.float32, labels: tf.float32}>
for parsed_record in dataset.take(10):
    print(repr(parsed_record['input_features']))
# Declaring the features with shape [] instead raises
# InvalidArgumentError: Key: input_features. Can't parse serialized Example,
# because [] means "scalar" while ten floats were written per example.
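With parsing in place, the records can be mapped straight to (features, label) tuples and batched for Keras; a minimal sketch (parse_example_xy is a hypothetical helper of my own, not from the course):
# Map each record to an (x, y) tuple that model.fit() can consume directly.
def parse_example_xy(serialized_example):
    example = tf.io.parse_single_example(serialized_example, feature_description)
    return example["input_features"], example["labels"]

train_ds = (tf.data.TFRecordDataset([train_set_path])
            .map(parse_example_xy, num_parallel_calls=5)
            .shuffle(10000)
            .repeat()
            .batch(32)
            .prefetch(50))
# model.fit(train_ds, steps_per_epoch = 10000 // 32, epochs = ...)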
Reading a compressed TFRecord file
dataset = tf.data.TFRecordDataset([train_compressed_set_path], compression_type='GZIP')
print(dataset)
# parse with the same feature_description as the uncompressed file
parsed_dataset = dataset.map(parse_example, num_parallel_calls=5)
for parsed_record in parsed_dataset.take(1):
    print(parsed_record['input_features'].numpy())
# To inspect a raw record as a tf.train.Example, iterate the unmapped dataset:
# ParseFromString() needs the serialized bytes, and calling it on elements of the
# mapped dataset (which are dicts of tensors) is what caused the error here.
for raw_record in dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)