A Walkthrough of Horovod's Example Source Code

By Serpah

01 | tensorflow_mnist.py

import os
import errno
import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np

from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
    # with an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
    # image width and height, final dimension being the number of color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # reshape tensor into a batch of vectors
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)

    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size],
            index += batch_size


def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.AdamOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})


if __name__ == "__main__":
    tf.app.run()

  1. layers = tf.layers
    tf.layers is a higher-level API that wraps common deep-learning building blocks, chiefly fully connected and convolutional layers (a short usage sketch follows the list).
    It provides the following methods:

Input(…): instantiates an input Tensor to serve as the network input
average_pooling1d(…): 1-D average pooling layer
average_pooling2d(…): 2-D average pooling layer
average_pooling3d(…): 3-D average pooling layer
batch_normalization(…): batch normalization layer
conv1d(…): 1-D convolution layer
conv2d(…): 2-D convolution layer
conv2d_transpose(…): 2-D transposed convolution layer
conv3d(…): 3-D convolution layer
conv3d_transpose(…): 3-D transposed convolution layer
dense(…): fully connected layer
dropout(…): dropout layer
flatten(…): flatten layer, i.e. flattens a Tensor into a batch of vectors
max_pooling1d(…): 1-D max pooling layer
max_pooling2d(…): 2-D max pooling layer
max_pooling3d(…): 3-D max pooling layer
separable_conv2d(…): 2-D depthwise separable convolution layer
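
To see how these layers compose, here is a minimal sketch in the spirit of conv_model above (assuming TensorFlow 1.x; x is a hypothetical input placeholder):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 28, 28, 1])             # hypothetical NHWC input batch
h = tf.layers.conv2d(x, 32, kernel_size=[5, 5],
                     activation=tf.nn.relu, padding='SAME')    # 5x5 conv, 32 feature maps
h = tf.layers.max_pooling2d(h, pool_size=2, strides=2)         # downsample 28x28 -> 14x14
h = tf.layers.flatten(h)                                       # flatten into a batch of vectors
logits = tf.layers.dense(h, 10)                                # one logit per class
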
  2. tf.logging.set_verbosity(tf.logging.INFO)
    Sets the log output level to INFO.
    TensorFlow uses five log-message levels. In ascending order of severity they are DEBUG, INFO, WARN, ERROR and FATAL.
    When logging is configured at one of these levels, TensorFlow emits all messages at that level and at every more severe level.
    By default TensorFlow logs at the WARN level, but when tracking model training it is worth lowering the level to INFO so you get extra feedback about the operations in progress.

  3. os.path and os.mkdir
    os.path.join: joins path components into one path.
    os.path.exists: checks whether a path exists.
    os.mkdir: creates a directory with the given numeric permission mode (default 0o777, octal).
    os.path.isdir: checks whether a path is a directory.
    (A Python 3 shortcut for the create-if-missing pattern is sketched below.)
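
On Python 3, the try/except pattern in main() can also be collapsed into a single call; a minimal sketch using the same cache_dir:

import os

cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
# exist_ok=True makes the call a no-op when the directory already exists,
# so workers sharing a filesystem do not race on its creation.
os.makedirs(cache_dir, exist_ok=True)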

  4. pass
    A statement placeholder; it performs no operation.

  5. raise
    When an error occurs, Python raises an exception automatically; the raise statement also lets you raise an exception yourself.

  6. keras.datasets.mnist.load_data
    Keras' built-in loader for the MNIST dataset.

  7. np.reshape
    Changes the shape of an array without changing its data. Note that the result can share memory with the original: if the underlying data is modified, the contents of the new array change as well.
    A -1 in the target shape means that dimension is inferred from the other dimensions (see the sketch below).
    x_train = np.reshape(x_train, (-1, 784)) / 255.0 turns x_train into an array with 784 elements per row and normalizes the values to the range 0 to 1.
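
A small illustration of the -1 convention and the shared-memory behaviour (values are illustrative only):

import numpy as np

a = np.arange(12)             # shape (12,)
b = np.reshape(a, (-1, 4))    # -1 is inferred as 3, so b has shape (3, 4)
print(b.shape)                # (3, 4)
b[0, 0] = 99                  # b is a view of a here: the change is visible in a too
print(a[0])                   # 99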

  8. tf.name_scope
    Every object and operation defined inside the scope gets the scope name prepended to its name attribute, which makes it easy to tell which region an object belongs to (see the sketch below).
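
A minimal sketch (TF 1.x) of the prefix that name_scope adds:

import tensorflow as tf

with tf.name_scope('input'):
    image = tf.placeholder(tf.float32, [None, 784], name='image')

print(image.name)   # "input/image:0" -- the scope name becomes a prefix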

  9. tf.estimator.ModeKeys
    Standard names for model modes.
    The following standard keys are defined:
    TRAIN: training mode.
    EVAL: evaluation mode.
    PREDICT: inference mode.

  10. tf.train.AdamOptimizer()
    The Adam optimization algorithm: an optimizer aimed at finding a good (ideally global) optimum, which adds a squared-gradient (second-moment) correction to the updates.

  11. tf.train.get_or_create_global_step()
    Returns the global-step tensor, creating it first if necessary.

  12. optimizer.minimize()
    minimize() performs two operations internally: (1) compute the gradients of the variables, and (2) apply those gradients to update the variables (spelled out in the sketch below).
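
Spelled out, the two steps are equivalent to opt.minimize(loss, global_step=global_step) in the script above; a minimal sketch:

grads_and_vars = opt.compute_gradients(loss)        # (1) gradient of the loss w.r.t. each variable
train_op = opt.apply_gradients(grads_and_vars,      # (2) apply the gradients to update the variables
                               global_step=global_step)

This split is also where hvd.DistributedOptimizer hooks in: it overrides compute_gradients so that the per-worker gradients are averaged with allreduce before they are applied.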

  13. MonitoredTrainingSession()
    A session that monitors training.
    The tf.train.StopAtStepHook hook defines the final training step, after which the parameter servers and worker servers are shut down.
    The tf.train.LoggingTensorHook hook prints the given tensors every N steps or every N seconds, logging them as INFO messages.

  14. config = tf.ConfigProto()
    Used when creating a session to configure its parameters.
    config.gpu_options.allow_growth  # True lets GPU memory be allocated on demand instead of all at once
    config.gpu_options.visible_device_list  # specifies the list of GPUs to use

  15. numpy.random.permutation
    Reshuffles an array, i.e. randomly permutes the order of its elements.
    The difference from shuffle: shuffle operates on the original array in place, changing its order and returning nothing, whereas permutation leaves the original array untouched and returns a new, shuffled array (see the sketch below).
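
A small sketch of the difference:

import numpy as np

a = np.arange(5)
b = np.random.permutation(a)   # returns a new shuffled array; a is unchanged
np.random.shuffle(a)           # shuffles a in place and returns None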

  16. next
    Returns the next item from an iterator (here, the next mini-batch from the generator).

  17. tf.cast
    tf.cast() converts a TensorFlow tensor from one data type to another (sketched below).
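
For instance, conv_model above casts the float label placeholder to int32 before one-hot encoding; a tiny sketch:

labels = tf.constant([3.0, 1.0, 7.0])      # float labels, as fed through the placeholder
labels_int = tf.cast(labels, tf.int32)     # -> [3, 1, 7], dtype int32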

  18. tf.one_hot
    One-hot encoding maps the values of a discrete feature onto points in Euclidean space: each value of the feature corresponds to one point.
    Encoding discrete features with one-hot makes distance computations between feature values more reasonable.
    For example:

tf.one_hot([0, 1, 2], 3, on_value=7, off_value=4)
# [[7 4 4]
#  [4 7 4]
#  [4 4 7]]
  19. model.evaluate
    Evaluates a trained model: takes data and labels as input, returns the loss and the accuracy.

  20. model.predict
    Produces predictions: takes test data as input, returns the predicted results.

02 | tensorflow_synthetic_benchmark.py

from __future__ import absolute_import, division, print_function

import argparse
import os
import numpy as np
import timeit

import tensorflow as tf
import horovod.tensorflow as hvd
from tensorflow.keras import applications

# Benchmark settings
parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                    help='use fp16 compression during allreduce')

parser.add_argument('--model', type=str, default='ResNet50',
                    help='model to benchmark')
parser.add_argument('--batch-size', type=int, default=32,
                    help='input batch size')

parser.add_argument('--num-warmup-batches', type=int, default=10,
                    help='number of warm-up batches that don\'t count towards benchmark')
parser.add_argument('--num-batches-per-iter', type=int, default=10,
                    help='number of batches per benchmark iteration')
parser.add_argument('--num-iters', type=int, default=10,
                    help='number of benchmark iterations')

parser.add_argument('--eager', action='store_true', default=False,
                    help='enables eager execution')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')

args = parser.parse_args()
args.cuda = not args.no_cuda

hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
if args.cuda:
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
model = getattr(applications, args.model)(weights=None)

opt = tf.train.GradientDescentOptimizer(0.01)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
opt = hvd.DistributedOptimizer(opt, compression=compression)

init = tf.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)

data = tf.random_uniform([args.batch_size, 224, 224, 3])
target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)


def loss_function():
    probs = model(data, training=True)
    return tf.losses.sparse_softmax_cross_entropy(target, probs)


def log(s, nl=True):
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, hvd.size()))


def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))


if tf.executing_eagerly():
    with tf.device(device):
        run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables))
else:
    with tf.Session(config=config) as session:
        init.run()
        bcast_op.run()

        loss = loss_function()
        train_opt = opt.minimize(loss)
        run(lambda: session.run(train_opt))

  1. from __future__ import
    Brings features of a newer Python version into the current version.
    absolute_import: absolute package imports
    division: true (floating-point) division
    print_function: the Python 3 print function

  2. import argparse
    The argparse module is Python's module for handling command-line arguments (see the sketch below).
    argparse.ArgumentParser creates a parser object (the argument parser).
    parser.add_argument adds an argument.
    parser.parse_args parses and returns the arguments.
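
A minimal, self-contained sketch of the same pattern (the flag names mirror the benchmark script; the explicit list passed to parse_args is only there so the example runs without a real command line):

import argparse

parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark')
parser.add_argument('--batch-size', type=int, default=32, help='input batch size')
parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')

args = parser.parse_args(['--batch-size', '64'])   # with no argument, parse_args() reads sys.argv
print(args.batch_size, args.no_cuda)               # 64 False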

  3. import timeit
    The timeit module is Python's timing tool.
    The first argument to timeit.timeit is the expression or callable to execute, and the number argument is how many times to execute it (see the sketch below).
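
A minimal sketch of how the benchmark drives it (benchmark_step here is just a stand-in callable, not the real training step):

import timeit

def benchmark_step():            # stand-in for one training step
    sum(range(10000))

elapsed = timeit.timeit(benchmark_step, number=10)   # total seconds for 10 executions
print('%.4f s for 10 steps' % elapsed)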

  4. from tensorflow.keras import applications
    The tensorflow.keras.applications module contains a number of pre-trained model classes.

  5. os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    Specifies which GPUs to use; indices start at 0 and multiple GPUs are separated by commas. "-1" means no GPU is used.

  6. TensorFlow eager execution
    The dynamic-graph feature: it lets TensorFlow execute operations immediately and return concrete values (see the sketch below).
    tf.enable_eager_execution enables eager execution.
    tf.executing_eagerly() reports whether eager execution is enabled.
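
A minimal sketch (TF 1.x API; eager execution must be enabled before any graph operations are created):

import tensorflow as tf

tf.enable_eager_execution()          # must be called at program startup
print(tf.executing_eagerly())        # True
x = tf.constant([1.0, 2.0]) * 2.0
print(x.numpy())                     # [2. 4.] -- the value is available immediately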

  7. getattr returns the named attribute of the given object or module.
    In the source, model = getattr(applications, args.model)(weights=None) looks up the attribute of the applications module named by args.model (ResNet50 by default) and then calls it with the given arguments.

  8. hvd.broadcast_global_variables(0)
    When MonitoredTrainingSession is not used, this function can broadcast the initial variable values from rank 0 after they have been initialized.

  9. tf.random_uniform
    Builds uniformly distributed random data.

  10. numpy.std()
    Computes the standard deviation (used for the confidence interval sketched below).
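
In run() above, img_sec_conf = 1.96 * np.std(img_secs) is the +- term reported in the log: under a normal assumption, +-1.96 standard deviations covers roughly 95% of the values. A small sketch with hypothetical throughput numbers:

import numpy as np

img_secs = [95.2, 98.1, 96.7, 97.3]        # hypothetical per-iteration throughputs
mean = np.mean(img_secs)
conf = 1.96 * np.std(img_secs)             # ~95% interval half-width under a normal assumption
print('%.1f +-%.1f img/sec' % (mean, conf))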

  11. --fp16-allreduce
    This flag converts the gradients to fp16 (two-byte floats) before the allreduce, reducing the amount of data transferred.
    In other words, it enables gradient compression.
