Implementing a Distributed Recommendation Algorithm with the TensorFlow Estimator API

Distributed training with the TensorFlow Estimator API is straightforward: shard the input data across workers, feed it to the model, and the Estimator framework handles the rest. The code below is a complete recommendation-algorithm template; adapt the data-reading and model-definition parts to your own needs. It targets tensorflow==1.13.1.
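
When run_on_cluster is enabled, tf.estimator.train_and_evaluate picks up the cluster layout from the TF_CONFIG environment variable (the TF_INDEX / TF_ROLE variables read in main.py below are specific to the author's submission platform). As a rough sketch, the TF_CONFIG seen by one worker might look like this; the host addresses are placeholders:

import json, os

# Hypothetical TF_CONFIG for worker 0; real clusters are set up by the submission platform.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "chief":  ["host0:2222"],
        "worker": ["host1:2222", "host2:2222"],
        "ps":     ["host3:2222"]
    },
    "task": {"type": "worker", "index": 0}
})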

1. Feature processing, feature_processing.py

#coding:utf-8
import tensorflow as tf
from tensorflow import feature_column as fc
import config

FLAGS = config.FLAGS

class FeatureConfig(object):
    def __init__(self):
        self.all_columns = dict()
        self.feature_spec = dict()

    def create_features_columns(self):
        label = fc.numeric_column("label", default_value=1, dtype=tf.int64)
        self.all_columns["label"] = label

        # Vector features
        user_vector = fc.numeric_column(key="user_vector", shape=(128,), default_value=[0.0] * 128, dtype=tf.float32)
        item_vector = fc.numeric_column(key="item_vector", shape=(128,), default_value=[0.0] * 128, dtype=tf.float32)
        
        # Bucketized feature
        age = fc.numeric_column(key="age", shape=(1,), default_value=[0], dtype=tf.int64)
        age = fc.bucketized_column(age, boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
        age = fc.embedding_column(age, dimension=32, combiner='mean')
        
        # Categorical feature
        city = fc.categorical_column_with_identity(key="city", num_buckets=1000, default_value=0)
        city = fc.embedding_column(city, dimension=32, combiner='mean')
        
        # Hashed feature
        device_id = fc.categorical_column_with_hash_bucket(key="device_id", hash_bucket_size=1000000, dtype=tf.int64)
        device_id = fc.embedding_column(device_id, dimension=32, combiner='mean')
        
        self.all_columns["user_vector"] = user_vector
        self.all_columns["item_vector"] = item_vector
        self.all_columns["age"] = age
        self.all_columns["city"] = city
        self.all_columns["device_id"] = device_id

        self.feature_spec = tf.feature_column.make_parse_example_spec(self.all_columns.values())

        return self
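
For reference, here is a minimal sketch of how one training Example matching these columns could be serialized to TFRecord. All field values and the output path are hypothetical; the real schema depends on your data pipeline.

import tensorflow as tf

def _int64(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

def _float(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))

# One hypothetical training example matching the columns above.
example = tf.train.Example(features=tf.train.Features(feature={
    "label":       _int64([1]),
    "user_vector": _float([0.1] * 128),
    "item_vector": _float([0.2] * 128),
    "age":         _int64([25]),
    "city":        _int64([37]),
    "device_id":   _int64([123456789]),
}))

with tf.python_io.TFRecordWriter("sample.tfrecord") as writer:  # hypothetical output file
    writer.write(example.SerializeToString())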

2. Model definition, base_model.py

# coding:utf-8
import tensorflow as tf
from tensorflow import feature_column as fc
import config
import feature_processing as fe

FLAGS = config.FLAGS


def build_model(features, mode, params):
    # Feature inputs (skip the label column, which is not a model input)
    feature_inputs = []
    for key, value in params["feature_configs"].all_columns.items():
        if key == "label":
            continue
        feature_inputs.append(tf.feature_column.input_layer(features, value))
    # Concatenate all feature tensors
    net = tf.concat(feature_inputs, axis=1)
    # Fully connected layers
    for idx, units in enumerate(params["hidden_units"]):
        net = tf.layers.dense(net, units=units, activation=tf.nn.leaky_relu, name="fc_layer_%s" % idx)
        net = tf.layers.dropout(net, 0.4, training=(mode == tf.estimator.ModeKeys.TRAIN))
    # Output layer: a single logit
    net = tf.layers.dense(net, units=1, name="output_layer")
    return net

def model_fn(features, labels, mode, params):
    net = build_model(features, mode, params)
    pred = tf.sigmoid(net)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={"output": pred})
    if mode == tf.estimator.ModeKeys.EVAL:
        loss = tf.losses.log_loss(labels, pred)
        metrics = {"auc": tf.metrics.auc(labels=labels, predictions=pred)}
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
    if mode == tf.estimator.ModeKeys.TRAIN:
        loss = tf.losses.log_loss(labels, pred)
        global_step = tf.train.get_global_step()
        learning_rate = tf.train.exponential_decay(params["learning_rate"], global_step, 100000, 0.9, staircase=True)
        train_op = (tf.train.AdagradOptimizer(learning_rate).minimize(loss, global_step=global_step))
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

3. Data input, data_input.py

# coding:utf-8
import tensorflow as tf
import config

FLAGS = config.FLAGS

def parse_exp(example):
    features_def = {}
    features_def["label"] = tf.io.FixedLenFeature([1], tf.int64)
    """
    Data parsing logic goes here: define features_def according to your own schema.
    """
    features = tf.io.parse_single_example(example, features_def)
    label = features["label"]
    return features, label


def train_input_fn(filenames=None, batch_size=128):
    with tf.gfile.Open(filenames) as f:
        filenames = f.read().split()
    files_all = []
    for f in filenames:
        files_all += tf.gfile.Glob(f)
    train_worker_num = len(FLAGS.worker_hosts.split(","))
    hash_id = FLAGS.task_index if FLAGS.job_name == "worker" else train_worker_num - 1
    files_shard = [files for i, files in enumerate(files_all) if i % train_worker_num == hash_id]
    dataset = tf.data.TFRecordDataset(files_shard)
    dataset = dataset.shuffle(batch_size*100)
    dataset = dataset.map(parse_exp, num_parallel_calls=8)
    dataset = dataset.repeat().batch(batch_size).prefetch(1)
    return dataset

def eval_input_fn(filenames=None, batch_size=128):
    with tf.gfile.Open(filenames) as f:
        filenames = f.read().split()
    files = tf.data.Dataset.list_files(filenames)
    dataset = files.apply(tf.contrib.data.parallel_interleave(lambda filename: tf.data.TFRecordDataset(filename), buffer_output_elements=batch_size*20, cycle_length=10))
    dataset = dataset.map(parse_exp, num_parallel_calls=4)
    dataset = dataset.batch(batch_size)
    return dataset
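
The parsing logic in parse_exp is left as a placeholder above. One possible features_def matching the columns from section 1 is sketched below; this is an assumption, not the author's actual schema. Alternatively, the feature_spec produced by make_parse_example_spec in FeatureConfig can be reused directly.

# Assumed parsing schema for the columns defined in feature_processing.py; adjust to your data.
features_def = {
    "label":       tf.io.FixedLenFeature([1], tf.int64, default_value=[1]),
    "user_vector": tf.io.FixedLenFeature([128], tf.float32, default_value=[0.0] * 128),
    "item_vector": tf.io.FixedLenFeature([128], tf.float32, default_value=[0.0] * 128),
    "age":         tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
    "city":        tf.io.VarLenFeature(tf.int64),
    "device_id":   tf.io.VarLenFeature(tf.int64),
}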

4. Main entry point, main.py

# encoding:utf-8
import json
import os
import tensorflow as tf
from tensorflow import feature_column as fc
from feature_processing import FeatureConfig
import base_model
import data_input
import config

FLAGS = config.FLAGS

if FLAGS.run_on_cluster:
    cluster = json.loads(os.environ["TF_CONFIG"])
    task_index = int(os.environ["TF_INDEX"])
    task_type = os.environ["TF_ROLE"]


def main(unused_argv):
    feature_configs = FeatureConfig().create_features_columns()
    classifier = tf.estimator.Estimator(model_fn=base_model.model_fn,
                                        config=tf.estimator.RunConfig(model_dir=FLAGS.model_dir,
                                                                      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
                                                                      keep_checkpoint_max=3),
                                        params={"feature_configs": feature_configs,
                                                "hidden_units": list(map(int, FLAGS.hidden_units.split(","))),
                                                "learning_rate": FLAGS.learning_rate}
                                        )
    def train_eval_model():
        train_spec = tf.estimator.TrainSpec(input_fn=lambda: data_input.train_input_fn(FLAGS.train_data, FLAGS.batch_size),
                                            max_steps=FLAGS.train_steps)
        eval_spec = tf.estimator.EvalSpec(input_fn=lambda: data_input.eval_input_fn(FLAGS.eval_data, FLAGS.batch_size),
                                          start_delay_secs=60,
                                          throttle_secs = 30,
                                          steps=1000)
        tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)

    def export_model():
        feature_spec = feature_configs.feature_spec
        feature_map = {}
        for key, feature in feature_spec.items():
            if key == "label":  # the label is not a serving-time input
                continue
            if isinstance(feature, tf.io.VarLenFeature):  # variable-length feature: assume one id per example here
                feature_map[key] = tf.placeholder(dtype=feature.dtype, shape=[None, 1], name=key)
            elif isinstance(feature, tf.io.FixedLenFeature):  # fixed-length feature
                feature_map[key] = tf.placeholder(dtype=feature.dtype, shape=[None, feature.shape[0]], name=key)
        serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(feature_map)
        export_dir = classifier.export_saved_model(FLAGS.output_model, serving_input_receiver_fn)

    # Train the model
    train_eval_model()
    
    # Export the model; on a cluster, only the chief needs to export it once
    if FLAGS.run_on_cluster: 
        if task_type == "chief":
            export_model()
    else:
        export_model()


if __name__ == "__main__":
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run(main=main)
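
After export, the SavedModel can be sanity-checked locally with tf.contrib.predictor (available in TF 1.x). The version directory and feature values below are hypothetical; the input keys match the placeholders built in export_model, and the output key matches the predictions dict in model_fn.

import tensorflow as tf
from tensorflow.contrib import predictor

# Path to one exported version directory, i.e. <output_model>/<timestamp> (hypothetical).
predict_fn = predictor.from_saved_model("./saved_model/1577836800")
outputs = predict_fn({
    "user_vector": [[0.1] * 128],
    "item_vector": [[0.2] * 128],
    "age":         [[25]],
    "city":        [[37]],
    "device_id":   [[123456789]],
})
print(outputs["output"])  # predicted probability for the single example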

5. config.py

import json, os, re, codecs
import tensorflow as tf

flags = tf.app.flags

flags.DEFINE_boolean("run_on_cluster", True, "Whether the cluster info need to be passed in as input")

flags.DEFINE_string("train_dir", "", "")
flags.DEFINE_string("data_dir", "", "")
flags.DEFINE_string("log_dir", "", "")
flags.DEFINE_string("ps_hosts", "","")
flags.DEFINE_string("worker_hosts", "","")
flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'")
flags.DEFINE_integer("task_index", 0, "Index of task within the job")
flags.DEFINE_string("model_dir", "hdfs:/user/ranker/ckpt/", "Base directory for the model.")
flags.DEFINE_string("output_model", "hdfs:/user/ranker/model/", "Saved model.")
flags.DEFINE_string("train_data", "./train_data.txt", "Directory for storing mnist data")
flags.DEFINE_string("eval_data", "./eval_data.txt", "Path to the evaluation data.")
flags.DEFINE_string("hidden_units", "512,256,128", "hidden units.")
flags.DEFINE_integer("train_steps",1000000, "Number of (global) training steps to perform")
flags.DEFINE_integer("batch_size", 256, "Training batch size")
flags.DEFINE_float("learning_rate", 0.0001, "learning rate")
flags.DEFINE_integer("save_checkpoints_steps", 10000, "Save checkpoints every this many steps")

FLAGS = flags.FLAGS

6. Finally, the job submission command. Every company has its own submission platform, so it is not included here.

The code also runs on a single machine without modification; simply set run_on_cluster to False.
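
For example, a local run might be launched along these lines (the model_dir and output_model paths are illustrative overrides of the HDFS defaults in config.py):

python main.py --run_on_cluster=False --train_data=./train_data.txt --eval_data=./eval_data.txt --model_dir=./ckpt --output_model=./saved_model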
