原理:在集群上每个机器生成TFRecord文件然后推送到指定的HDFS位置,并删除源TFRecord(这步可以不用,会自动删除)。
1. map阶段
import sys
import random

sys.path.append("./")

# Prefix every input record with a random key so the MapReduce shuffle
# phase effectively randomizes the row order before TFRecord conversion.
# NOTE(review): for RCFile-format input the field at index 0 reportedly
# needs to be dropped first -- confirm against the upstream data format.
for raw_line in sys.stdin:
    fields = raw_line.strip().split("\t")
    shuffle_key = random.random()
    print("\t".join([str(shuffle_key)] + fields))
2. reduce阶段
import os
import socket
import sys
import time

import tensorflow as tf
def create_float_feature(values):
    """Wrap a list of floats in a tf.train.Feature holding a FloatList."""
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)
def create_int_feature(values):
    """Wrap a list of ints in a tf.train.Feature holding an Int64List."""
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)
def gen_filename():
    """Build a collision-resistant base name for the local TFRecord file.

    Returns "<ip>_<unix_time>_<pid>". The original "<ip>_<time>" scheme can
    collide when several reduce tasks of the same job land on one host, so
    the process id is appended to disambiguate them.
    """
    my_name = socket.getfqdn(socket.gethostname())
    try:
        my_addr = socket.gethostbyname(my_name)
    except socket.error:
        # Worker hosts may lack a resolvable FQDN; any stable fallback token
        # is fine because time + pid already make the name unique per process.
        my_addr = "unknown-host"
    return "{}_{}_{}".format(my_addr, time.time(), os.getpid())
def encode_line(line):
    """Convert one record into a TFRecord feature dict.

    Column 0 is the hashed (integer) device ID, column 1 is the label.
    Returns an empty dict for malformed records (missing columns or
    non-integer values) so the caller's `if not feed_dict: continue`
    guard can skip them instead of crashing the whole reduce task --
    previously a single bad line raised ValueError/IndexError and
    killed the reducer.
    """
    try:
        device_id = int(line[0])
        label = int(line[1])
    except (ValueError, IndexError):
        return {}
    return {
        "device_id": create_int_feature([device_id]),
        "label": create_int_feature([label]),
    }
def convert(tmp_save_path, hdfs_path):
    """Read shuffled records from stdin, write them to a local TFRecord
    file, then push it to HDFS.

    tmp_save_path: local path for the temporary TFRecord file.
    hdfs_path: final HDFS destination directory.
    """
    writer = tf.io.TFRecordWriter(tmp_save_path)
    try:
        for raw_line in sys.stdin:
            fields = raw_line.strip().split("\t")
            # fields[0] is the mapper's random shuffle key; drop it.
            feed_dict = encode_line(fields[1:])
            if not feed_dict:
                continue
            example = tf.train.Example(features=tf.train.Features(feature=feed_dict))
            writer.write(example.SerializeToString())
    finally:
        # Close even on error so the file is flushed and the handle released
        # (previously an exception mid-stream leaked the writer).
        writer.close()
    # Push the TFRecord to HDFS; only delete the local copy on success so a
    # failed upload does not silently lose the data.
    status = os.system("hadoop fs -put %s %s" % (tmp_save_path, hdfs_path))
    if status == 0:
        os.system("rm %s" % tmp_save_path)
    else:
        sys.stderr.write("hadoop fs -put failed for %s, keeping local file\n" % tmp_save_path)
if __name__ == "__main__":
    # Destination HDFS directory, passed by the streaming -reducer command.
    hdfs_path = sys.argv[1]
    # BUG FIX: was gen_name(), which is undefined -- the defined helper is
    # gen_filename(); the original raised NameError on every reduce task.
    file_name = gen_filename()
    tmp_save_path = file_name + ".tfrecord"
    convert(tmp_save_path, hdfs_path)
3. 任务提交命令
# Raw input on HDFS
INPUT_DIR=/user/rcmd/source/dt=20200507/*
# Streaming's own reducer output directory (throwaway)
OUTPUT_DIR=/user/rcmd/train/dt=20200507_tmp/
# Final TFRecord output directory
TFRECORD_DIR=/user/rcmd/train/dt=20200507/
# Archived python environment shipped to the workers
PY="/user/envs/Python.zip#Python"

# Recreate the TFRecord output directory.
hadoop fs -rmr ${TFRECORD_DIR}
# BUG FIX: was "{TFRECORD_DIR}" (missing $), which created a literal
# directory named "{TFRECORD_DIR}" instead of the real output path.
hadoop fs -mkdir -p ${TFRECORD_DIR}
hadoop fs -chmod -R 777 ${TFRECORD_DIR}

# Drop the streaming output dir only if a previous run left it behind
# (the original also ran an unconditional rmr first, which was redundant).
hadoop fs -test -e ${OUTPUT_DIR}
if [ $? -eq 0 ] ; then
    hadoop fs -rmr ${OUTPUT_DIR}
else
    echo "${OUTPUT_DIR} not found!"
fi

hadoop jar /usr/local/hadoop-2.7.3/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar \
    -archives ${PY} \
    -input ${INPUT_DIR} \
    -output ${OUTPUT_DIR} \
    -mapper "Python/bin/python3 mapper.py" \
    -reducer "Python/bin/python3 reducer.py ${TFRECORD_DIR}" \
    -jobconf mapred.map.tasks=1000 \
    -jobconf mapred.reduce.tasks=1000 \
    -jobconf mapred.job.name="convert__tfrecord" \
    -jobconf mapreduce.map.memory.mb=8000 \
    -jobconf mapreduce.reduce.memory.mb=8000 \
    -file mapper.py \
    -file reducer.py

# Remove the temporary streaming output; the TFRecords already live in
# ${TFRECORD_DIR}, pushed there by each reducer.
hadoop fs -rmr ${OUTPUT_DIR}