These notes are compiled from the course 《Google老师亲授 TensorFlow2.0 入门到进阶》.
Official documentation: tf.data: Build TensorFlow input pipelines
TFRecord and tf.Example
1. Building a dataset directly from memory with tf.data.Dataset.from_tensor_slices()
tf.data.Dataset.from_tensor_slices() converts in-memory data into a dataset of tensors. Depending on your TensorFlow version you may hit the error 'iter() is only supported inside of tf.function or when eager execution is enabled'; on TF 1.x, calling tf.enable_eager_execution() at the top of the script fixes it (eager execution is on by default in TF 2.x).
import numpy as np
import tensorflow as tf

tf.enable_eager_execution()  # TF 1.x only; eager execution is the default in TF 2.x

dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
print(dataset)
# <TensorSliceDataset shapes: (), types: tf.int64>
for item in dataset:
    print(item)
# tf.Tensor(0, shape=(), dtype=int64)
# tf.Tensor(1, shape=(), dtype=int64)
# tf.Tensor(2, shape=(), dtype=int64)
# ...
# tf.Tensor(9, shape=(), dtype=int64)
Passing the data as a tuple builds (feature, label) pairs:
x = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array(['cat', 'dog', 'fox'])
dataset3 = tf.data.Dataset.from_tensor_slices((x, y))
for item_x, item_y in dataset3:
    print(item_x.numpy(), item_y.numpy())
# [1 2] b'cat'
# [3 4] b'dog'
# [5 6] b'fox'
Building the dataset by passing in a dict:
dataset4 = tf.data.Dataset.from_tensor_slices({"feature": x,
                                               "label": y})
for item in dataset4:
    print(item["feature"].numpy(), item["label"].numpy())
# [1 2] b'cat'
# [3 4] b'dog'
# [5 6] b'fox'
Batching with batch()
test_arr = np.zeros((1000, 10))
for i in range(1000):
    test_arr[i, :] = i * np.ones((1, 10))
label = test_arr[:, 9]
for i in range(10):
    print(test_arr[i, :])
    print(label[i])
# [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
# 0.0
# [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
# 1.0
# [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
# 2.0
# [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
# 3.0
# [4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
# 4.0
tfdata = tf.data.Dataset.from_tensor_slices((test_arr, label))
train_ds = tfdata.batch(3)
for item in train_ds:
    print(item)
Shuffling with shuffle()
To shuffle a dataset you must pass a buffer size; for a full shuffle it should generally be at least the size of the training set. With 1000 samples here, a buffer of 1000 works. I tried several buffer sizes: shuffle(10), shuffle(100), and shuffle(10000); a sketch comparing them follows.
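A minimal sketch (my own toy comparison, using the label array built above) of how the buffer size affects shuffle quality: with a small buffer, the first emitted elements can only come from the start of the stream, while a buffer at least as large as the dataset allows a full shuffle.
# Compare the first few shuffled elements under different buffer sizes.
ds = tf.data.Dataset.from_tensor_slices(label)
for buffer_size in (10, 100, 10000):
    shuffled = ds.shuffle(buffer_size, seed=42)
    print(buffer_size, [int(v.numpy()) for v in shuffled.take(5)])
# With buffer_size=10 the first elements are all drawn from the earliest
# samples; with buffer_size >= 1000 they can come from anywhere in the data.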
repeat()
repeat() specifies how many times to iterate over the data; called with no argument, it repeats indefinitely.
Repeating twice is equivalent to concatenating the full dataset with itself to form one dataset; batch() then draws batches from that repeated stream, as sketched below.
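A minimal sketch (my own toy example) of repeat(2) followed by batch():
# 5 elements repeated twice -> 10 elements -> 3 batches of up to 4.
ds = tf.data.Dataset.from_tensor_slices(np.arange(5))
ds = ds.repeat(2).batch(4)
for batch in ds:
    print(batch.numpy())
# [0 1 2 3]
# [4 0 1 2]
# [3 4]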
Template
def make_dataset(data, labels, epochs, batch_size, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size).prefetch(50)
    return dataset
steps_per_epoch
A tf.data pipeline feeds the model one batch at a time. With an infinitely repeating dataset, i.e. repeat() called without a count as above, training, validation, and testing each need to be told how many steps make up one epoch (steps_per_epoch, validation_steps, and steps).
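The step counts are just the sample counts divided by the batch size; a minimal sketch using the numbers from the example below:
# 11610 training samples at batch size 32 -> 362 full batches per epoch
num_samples = 11610
batch_size = 32
print(num_samples // batch_size)  # 362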
Specifying the repeat count
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()  # 20640 samples, 8 features
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
# (11610, 8) (11610,)
# (3870, 8) (3870,)
# (5160, 8) (5160,)
def make_dataset(data, labels, epochs, batch_size, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size).prefetch(50)
    return dataset
batch_size = 32
tfrecords_train_set = make_dataset(x_train, y_train, 1, batch_size)
tfrecords_valid_set = make_dataset(x_valid, y_valid, 1, batch_size)
tfrecords_test_set = make_dataset(x_test, y_test, 1, batch_size)
# ...
history = model.fit(tfrecords_train_set,
                    validation_data = tfrecords_valid_set,
                    epochs = 100,
                    callbacks = callbacks)
model.evaluate(tfrecords_test_set)
Infinite repeat
The data loading and train/validation/test split are identical to the previous example.
# call repeat() with no count: the dataset loops indefinitely
def make_dataset(data, labels, batch_size, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat().batch(batch_size).prefetch(50)
    return dataset
batch_size = 32
tfrecords_train_set = make_dataset(x_train, y_train, batch_size)
tfrecords_valid_set = make_dataset(x_valid, y_valid, batch_size)
tfrecords_test_set = make_dataset(x_test, y_test, batch_size)
# ...
history = model.fit(tfrecords_train_set,
                    validation_data = tfrecords_valid_set,
                    steps_per_epoch = 11610 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)
model.evaluate(tfrecords_test_set, steps = 5160 // batch_size)
The TFRecord file format
Saving to a TFRecord file
def serialize_example(x, y):
    """Convert x, y to a tf.train.Example and serialize it."""
    input_features = tf.train.FloatList(value = x)
    label = tf.train.FloatList(value = y)
    features = tf.train.Features(
        feature = {
            "input_features": tf.train.Feature(
                float_list = input_features),
            "labels": tf.train.Feature(float_list = label)
        }
    )
    example = tf.train.Example(features = features)
    return example.SerializeToString()
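As a sanity check (my own addition, not from the course), the serialized bytes can be parsed back into a tf.train.Example proto to inspect their structure:
# Round-trip one record through the proto.
serialized = serialize_example([1.0, 2.0, 3.0], [0.5])
example_proto = tf.train.Example.FromString(serialized)
print(example_proto)
# features { feature { key: "input_features" value { float_list { ... } } }
#            feature { key: "labels" value { float_list { ... } } } }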
def serialize_example_tfcords(filename_fullpath, dataset, label, compression_type = None):
    """Save the data to a local TFRecord file."""
    options = tf.io.TFRecordOptions(
        compression_type = compression_type)
    with tf.io.TFRecordWriter(filename_fullpath, options) as writer:
        for x, y in zip(dataset, label):
            writer.write(serialize_example(x, y))
import os

rand_feats = np.random.rand(10000, 10)
print(rand_feats.shape)
rand_feats_labels = np.random.rand(10000, 1)
print(rand_feats_labels.shape)

output_dir = "generate_tfrecords"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

filename = 'train.tfrecord'
train_set_path = os.path.join(output_dir, filename)
serialize_example_tfcords(train_set_path, rand_feats, rand_feats_labels, compression_type=None)

# write a GZIP-compressed copy
filename = 'train.tfrecord.zip'
train_compressed_set_path = os.path.join(output_dir, filename)
serialize_example_tfcords(train_compressed_set_path, rand_feats, rand_feats_labels, compression_type='GZIP')
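A quick way to see the effect of compression (my own addition) is to compare the files on disk; note that random floats like these compress poorly, so the saving is usually larger on real data:
# Compare on-disk sizes of the uncompressed and GZIP-compressed files.
print(os.path.getsize(train_set_path))
print(os.path.getsize(train_compressed_set_path))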
Reading TFRecord files
Reading an uncompressed TFRecord file
Use tf.data.TFRecordDataset() to read the file back from the path where the TFRecord data was just saved.
# read tfrecord
dataset = tf.data.TFRecordDataset([train_set_path])
for serialized_example_tensor in dataset.take(3):
    print(serialized_example_tensor)
# tf.Tensor(b'\nT\n\x12\n\x06labels\x12\x08\x12\x06\n\x04\x9f\xe9\x99>\n>\n\x0einput_features\x12,\x12*\n(,m\xc0>\x17\xd4\xf5>f_S>9\x93\xc6>\x9f,5=\xce\x96W>\xb3T\x1c?\xd93<?\x89\x0bn?\x9d\xbf\x8f>', shape=(), dtype=string)
# tf.Tensor(b'\nT\n>\n\x0einput_features\x12,\x12*\n(\x94\xf6g?\x8a\x0c\xf6<\xbe%\x0b?\xa2\xcd\x94>\xfaF\x99=\x7f\xa4b?uPw?$@d?$m\xd0>\x8e\xb9_?\n\x12\n\x06labels\x12\x08\x12\x06\n\x04\xff\xb4\xae>', shape=(), dtype=string)
# tf.Tensor(b'\nT\n\x12\n\x06labels\x12\x08\x12\x06\n\x04c\x9fB?\n>\n\x0einput_features\x12,\x12*\n(3\x9em=7\x95\xe2>K\xb8\xf0=\x0f\x19k?\x0e\xb8\xfe>l\xa4=?\xa6\xa5^>X5\x97>:*\x8a>k\x04!?', shape=(), dtype=string)
# parse the data back
dataset = tf.data.TFRecordDataset([train_set_path])
# Every key in this dict must match a key used when the features were written.
feature_description = {
    # The declared shape must match what was written: 10 floats per example here.
    "input_features": tf.io.FixedLenFeature([10], dtype=tf.float32),
    "labels": tf.io.FixedLenFeature([1], dtype=tf.float32)
}
# parse a single record with tf.io.parse_single_example()
def parse_example(serialized_example):
    example = tf.io.parse_single_example(serialized_example, feature_description)
    return example
print(dataset)
# <TFRecordDatasetV2 shapes: (), types: tf.string>
# use map to parse every record
dataset = dataset.map(parse_example, num_parallel_calls=5)
dataset
# <ParallelMapDataset shapes: {input_features: (10,), labels: (1,)}, types: {input_features: tf.float32, labels: tf.float32}>
for parsed_record in dataset.take(10):
    print(repr(parsed_record['input_features']))
# Declaring the features with shape [] instead raises
# InvalidArgumentError: Key: input_features. Can't parse serialized Example,
# because [] means "scalar" while ten floats were written per example.
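With parsing in place, the records can be mapped straight to (features, label) tuples and batched for Keras; a minimal sketch (parse_example_xy is a hypothetical helper of my own, not from the course):
# Map each record to an (x, y) tuple that model.fit() can consume directly.
def parse_example_xy(serialized_example):
    example = tf.io.parse_single_example(serialized_example, feature_description)
    return example["input_features"], example["labels"]

train_ds = (tf.data.TFRecordDataset([train_set_path])
            .map(parse_example_xy, num_parallel_calls=5)
            .shuffle(10000)
            .repeat()
            .batch(32)
            .prefetch(50))
# model.fit(train_ds, steps_per_epoch = 10000 // 32, epochs = ...)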
Reading a compressed TFRecord file
dataset = tf.data.TFRecordDataset([train_compressed_set_path], compression_type='GZIP')
print(dataset)
# parse with the same feature_description as the uncompressed file
parsed_dataset = dataset.map(parse_example, num_parallel_calls=5)
for parsed_record in parsed_dataset.take(1):
    print(parsed_record['input_features'].numpy())
# To inspect a raw record as a tf.train.Example, iterate the unmapped dataset:
# ParseFromString() needs the serialized bytes, and calling it on elements of the
# mapped dataset (which are dicts of tensors) is what caused the error here.
for raw_record in dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)