原理:在集羣上的每台機器上生成TFRecord文件,然後推送到指定的HDFS位置,並刪除本地的臨時TFRecord文件(這一步可以省略,臨時文件會被自動刪除)。
1. map階段
import sys, random

sys.path.append("./")

# Prefix every record with a random key so the streaming shuffle phase
# effectively randomizes record order before the reducers see them.
# NOTE(review): if the input is in rcfile format, drop the field at index 0.
for raw in sys.stdin:
    fields = raw.strip().split("\t")
    print("\t".join([str(random.random())] + fields))
2. reduce階段
import os
import socket
import sys
import time

import tensorflow as tf
def create_float_feature(values):
    """Wrap a sequence of floats into a tf.train.Feature (FloatList)."""
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)
def create_int_feature(values):
    """Wrap a sequence of ints into a tf.train.Feature (Int64List)."""
    int_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int_list)
def gen_filename():
    """Return a per-host unique file name, formatted as "<ip>_<timestamp>".

    Combining the host IP with the current time prevents name collisions
    when many reducers on different machines write files concurrently.
    """
    fqdn = socket.getfqdn(socket.gethostname())
    ip = socket.gethostbyname(fqdn)
    return "{}_{}".format(ip, time.time())
def encode_line(line):
    """Encode one record into a feature dict for tf.train.Example.

    line: record fields with the mapper's random shuffle key already
    stripped; column 0 is the hashed device id (int), column 1 is the
    label (int).

    Returns an empty dict for malformed records (too few columns or
    non-integer values) so the caller's `if not feed_dict:` guard can
    skip them instead of crashing the whole reducer on one dirty line.
    """
    if len(line) < 2:
        return {}
    try:
        device_id = int(line[0])
        label = int(line[1])
    except ValueError:
        # Dirty record — signal "skip me" to the caller.
        return {}
    feed_dict = {}
    feed_dict["device_id"] = create_int_feature([device_id])
    feed_dict["label"] = create_int_feature([label])
    return feed_dict
def convert(tmp_save_path, hdfs_path):
    """Write stdin records to a local TFRecord file, then push it to HDFS.

    tmp_save_path: local temporary path for the TFRecord file.
    hdfs_path: final HDFS directory where the file is uploaded.

    The writer is closed in a finally block so the file handle is not
    leaked if encoding raises, and the local temp file is only removed
    after a successful upload (the original deleted it unconditionally,
    losing the data when `hadoop fs -put` failed).
    """
    writer = tf.io.TFRecordWriter(tmp_save_path)
    try:
        for raw in sys.stdin:
            fields = raw.strip().split("\t")
            # fields[0] is the mapper's random shuffle key — drop it.
            feed_dict = encode_line(fields[1:])
            if not feed_dict:
                continue
            example = tf.train.Example(features=tf.train.Features(feature=feed_dict))
            writer.write(example.SerializeToString())
    finally:
        writer.close()
    # NOTE(review): paths are interpolated into a shell command; they come
    # from argv / gen_filename(), so keep them free of shell metacharacters.
    status = os.system("hadoop fs -put %s %s" % (tmp_save_path, hdfs_path))
    if status == 0:
        # Upload succeeded: the local copy is no longer needed.
        os.remove(tmp_save_path)
if __name__ == "__main__":
    # argv[1]: destination HDFS directory for the generated TFRecord file.
    hdfs_path = sys.argv[1]
    # BUG FIX: the helper is named gen_filename(); the original called the
    # undefined gen_name(), which raised NameError at runtime.
    file_name = gen_filename()
    tmp_save_path = file_name + ".tfrecord"
    convert(tmp_save_path, hdfs_path)
3. 任務提交命令
# Input files
INPUT_DIR=/user/rcmd/source/dt=20200507/*
# Intermediate directory written by the reducers (hadoop-streaming output)
OUTPUT_DIR=/user/rcmd/train/dt=20200507_tmp/
# Final TFRecord output directory
TFRECORD_DIR=/user/rcmd/train/dt=20200507/
# Python environment archive shipped to the cluster
PY="/user/envs/Python.zip#Python"

hadoop fs -rmr ${OUTPUT_DIR}
hadoop fs -rmr ${TFRECORD_DIR}
# BUG FIX: the original ran `-mkdir -p {TFRECORD_DIR}` (missing `$`),
# which created a literal directory named "{TFRECORD_DIR}".
hadoop fs -mkdir -p ${TFRECORD_DIR}
hadoop fs -chmod -R 777 ${TFRECORD_DIR}

# Remove the intermediate directory if it still exists from a previous run.
hadoop fs -test -e ${OUTPUT_DIR}
if [ $? -eq 0 ] ; then
    hadoop fs -rmr ${OUTPUT_DIR}
else
    echo "${OUTPUT_DIR} not found!"
fi

hadoop jar /usr/local/hadoop-2.7.3/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar \
    -archives ${PY} \
    -input ${INPUT_DIR} \
    -output ${OUTPUT_DIR} \
    -mapper "Python/bin/python3 mapper.py" \
    -reducer "Python/bin/python3 reducer.py ${TFRECORD_DIR}" \
    -jobconf mapred.map.tasks=1000 \
    -jobconf mapred.reduce.tasks=1000 \
    -jobconf mapred.job.name="convert__tfrecord" \
    -jobconf mapreduce.map.memory.mb=8000 \
    -jobconf mapreduce.reduce.memory.mb=8000 \
    -file mapper.py \
    -file reducer.py

# Drop the intermediate directory
hadoop fs -rmr ${OUTPUT_DIR}