Spark writing to HDFS

Spark Streaming writing to HDFS

Scenario: data needs to be written to HDFS and packaged into .gz files, with the Spark job running once every 5 minutes.

The final result looks like this:
[screenshot: the .gz files produced under /pub_stat_migu/tmp/log/]
Because the Spark job runs every 5 minutes and writes to HDFS on each run, it produces a large number of small files.

Spark code (Scala):

val hadoopConf: Configuration = rdd.context.hadoopConfiguration
// Enable gzip compression on the Hadoop output
hadoopConf.set("mapreduce.output.fileoutputformat.compress", "true")
hadoopConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec")
hadoopConf.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK")
// Write the (Text, Text) RDD through the custom output format
dataToSaveHdfs.saveAsNewAPIHadoopFile(ConfigInfo.saveHdfsMarkDataPathConfig, classOf[Text], classOf[Text], classOf[StreamingDataGzipOutputFormat[Text, Text]])

Notes:
      dataToSaveHdfs is the RDD to be written.
      ConfigInfo.saveHdfsMarkDataPathConfig is the HDFS output path read from the configuration file, i.e. the /pub_stat_migu/tmp/log/ path shown in the screenshot above.
      StreamingDataGzipOutputFormat is the custom output format class shown below.
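
For completeness, here is a minimal sketch of how the keyed RDD could be prepared inside the streaming job before calling saveAsNewAPIHadoopFile. The record parsing (the field layout and the toKeyedPair helper) is hypothetical and not from the original code; the only real constraint, implied by generateFileNameForKeyValue below, is that the key follows the code_yyyyMMddHHmm pattern (e.g. 20001_202002132155). ConfigInfo and StreamingDataGzipOutputFormat are the classes from this article.

import java.text.SimpleDateFormat
import java.util.Date

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.Text
import org.apache.spark.rdd.RDD

// Hypothetical helper: build a (Text, Text) pair whose key matches the
// "code_yyyyMMddHHmm" pattern that StreamingDataGzipOutputFormat splits on "_".
def toKeyedPair(line: String): (Text, Text) = {
  val eventCode = line.split("\\|")(0)  // assumed field layout of the raw record
  val minute = new SimpleDateFormat("yyyyMMddHHmm").format(new Date())
  (new Text(s"${eventCode}_$minute"), new Text(line))
}

// Called from the streaming job, e.g. dstream.foreachRDD { rdd => saveBatch(rdd) }
def saveBatch(rdd: RDD[String]): Unit = {
  val dataToSaveHdfs: RDD[(Text, Text)] = rdd.map(toKeyedPair)
  val hadoopConf: Configuration = rdd.context.hadoopConfiguration
  hadoopConf.set("mapreduce.output.fileoutputformat.compress", "true")
  hadoopConf.set("mapreduce.output.fileoutputformat.compress.codec",
    "org.apache.hadoop.io.compress.GzipCodec")
  hadoopConf.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK")
  dataToSaveHdfs.saveAsNewAPIHadoopFile(
    ConfigInfo.saveHdfsMarkDataPathConfig,
    classOf[Text], classOf[Text],
    classOf[StreamingDataGzipOutputFormat[Text, Text]])
}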


StreamingDataGzipOutputFormat.java — the custom output format; it extends Hadoop MapReduce's FileOutputFormat.

import com.hadoop.compression.lzo.LzopCodec;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;


import java.io.DataOutputStream;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Iterator;



// Custom FileOutputFormat for the Hadoop MapReduce (new) API

public class StreamingDataGzipOutputFormat<K, V> extends FileOutputFormat<K, V> {


    private StreamingDataGzipOutputFormat<K, V>.MultiRecordWriter writer;
    private String jobId;
    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();


    public StreamingDataGzipOutputFormat() {
        this.jobId = null;
    }

    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        if (this.writer == null) {
            this.writer = new MultiRecordWriter(job, getTaskOutputPath(job));
        }
        if (this.jobId == null) {
            this.jobId = String.valueOf(job.getJobID().getId());
        }
        return this.writer;
    }


    private Path getTaskOutputPath(TaskAttemptContext job) throws IOException {
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(job);
        if ((committer instanceof FileOutputCommitter)) {
            workPath = ((FileOutputCommitter) committer).getWorkPath();
        } else {
            Path outputPath = FileOutputFormat.getOutputPath(job);
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }

    /**
     * Supports batched writes where records in one batch may target the same
     * path or different paths, keeping one RecordWriter per target file.
     */
    public class MultiRecordWriter extends RecordWriter<K, V> {
        private HashMap<String, RecordWriter<K, V>> recordWriters;
        private TaskAttemptContext job;
        private Path workPath;


        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {
            this.job = job;
            this.workPath = workPath;
            this.recordWriters = new HashMap<>();
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            Iterator values = this.recordWriters.values().iterator();
            while (values.hasNext()) {
                ((RecordWriter) values.next()).close(context);
            }
            this.recordWriters.clear();
        }


        /**
         * Derives the target directory, writer key and file name from the record key.
         * fileInfo[0] = output directory, fileInfo[1] = key of the recordWriters map,
         * fileInfo[2] = file name (writer key plus a timestamp).
         */
        protected String[] generateFileNameForKeyValue(K key, V value) {
            if ((key == null) || (value == null)) {
                return null;
            }
            String keyStr = key.toString();
            String[] fileInfo = new String[3];
            if (keyStr.startsWith("ERROR")) {
                String[] keyStrs = StringUtils.split(keyStr.substring("ERROR".length() + 1), "_");
                fileInfo[0] = (this.workPath.toString() + "/ERROR/" + StringUtils.substring(keyStrs[1], 0, 10) + "/" + keyStrs[1] + "/" + keyStrs[0]);
                fileInfo[1] = ("ERROR_" + keyStrs[0] + "_" + keyStrs[1]);
            } else {
                //20001_202002132155_1583302930630.gz
                String[] keyStrs = StringUtils.split(keyStr, "_");
                fileInfo[0] = (this.workPath.toString() + "/" +StringUtils.substring(keyStrs[1], 0, 10)  + "/" + keyStrs[1] + "/" + keyStrs[0]);
                fileInfo[1] = (keyStrs[0] + "_" + keyStrs[1]);
            }
            fileInfo[2] = (fileInfo[1] + "_" + System.currentTimeMillis());
            return fileInfo;
        }

        @Override
        public void write(K key, V value)
                throws IOException, InterruptedException {
            String[] fileInfo = generateFileNameForKeyValue(key, value);
            if (fileInfo != null) {
                RecordWriter rw = (RecordWriter) this.recordWriters.get(fileInfo[1]);
                if (rw == null) {
                    rw = getBaseRecordWriter(this.job, fileInfo);
                    this.recordWriters.put(fileInfo[1], rw);
                }
                rw.write(key, value);
            }
        }


        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String[] fileInfo) throws IOException, InterruptedException {
            Configuration conf = job.getConfiguration();
            String keyValueSeparator = "_";
            RecordWriter recordWriter = null;
            if (!FileOutputFormat.getCompressOutput(job)) {  // compression disabled: write plain text output
                Path file = new Path(new Path(fileInfo[0]), fileInfo[2]);
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false, 1048576, (short) 3, 10485760);
                recordWriter = new MessageRecordWriter(new DataOutputStream(fileOut), keyValueSeparator);
            } else {  // compression enabled: append the codec's default extension (e.g. ".gz")
                Class codecClass = FileOutputFormat.getOutputCompressorClass(job, LzopCodec.class);
                CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
                Path file = new Path(new Path(fileInfo[0]), fileInfo[2] + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false, 1048576, (short) 3, 10485760);
                recordWriter = new MessageRecordWriter(new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
            }
            return recordWriter;
        }
    }
}

Writes within a single batch may target the same path or different paths.

MultiRecordWriter — handles batched writes where records may go to the same or to different paths: it keeps one RecordWriter per target file in HashMap<String, RecordWriter<K, V>> recordWriters.

MessageRecordWriter — essentially a writer bound to a single HDFS file. Because every path seen so far is cached in the recordWriters map, incoming data is written straight to the already-open stream instead of opening a new HDFS file object for each record.
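
To make the directory layout concrete, here is a small worked example (plain Scala mirroring the logic of generateFileNameForKeyValue, not part of the original class) for the example key from the code comment above, assuming workPath is /pub_stat_migu/tmp/log:

val workPath = "/pub_stat_migu/tmp/log"
val Array(code, minute) = "20001_202002132155".split("_")
val dir       = s"$workPath/${minute.take(10)}/$minute/$code"   // /pub_stat_migu/tmp/log/2020021321/202002132155/20001
val writerKey = s"${code}_$minute"                              // 20001_202002132155 (key of the recordWriters HashMap)
val fileName  = s"${writerKey}_${System.currentTimeMillis()}"   // e.g. 20001_202002132155_1583302930630, plus ".gz" when compression is enabled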


MessageRecordWriter.java

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

public class MessageRecordWriter<K, V> extends RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";
    private static final byte[] newline;
    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public MessageRecordWriter(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        try {
            this.keyValueSeparator = keyValueSeparator.getBytes("UTF-8");
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find UTF-8 encoding");
        }
    }

    public MessageRecordWriter(DataOutputStream out) {
        this(out, "\t");
    }

    private void writeObject(Object o) throws IOException {
        if ((o instanceof Text)) {
            Text to = (Text) o;
            this.out.write(to.getBytes(), 0, to.getLength());
        } else {
            this.out.write(o.toString().getBytes("UTF-8"));
        }
    }
    
    @Override
    public synchronized void write(K key, V value) throws IOException {
        // Only the value is written, followed by a newline; the key is only used upstream
        // (in MultiRecordWriter) to select the target file and is not written to the output.
        boolean nullKey = (key == null) || ((key instanceof NullWritable));
        boolean nullValue = (value == null) || ((value instanceof NullWritable));
        if ((nullKey) && (nullValue)) {
            return;
        }
        if (!nullValue) {
            writeObject(value);
        }
        this.out.write(newline);
    }
    
    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException {
        this.out.flush();
        this.out.close();
    }

    static {
        try {
            newline = "\n".getBytes("UTF-8");
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find UTF-8 encoding");
        }
    }
}
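
A quick way to spot-check the result: Hadoop's text input format decompresses .gz files transparently, so the written output can be read straight back with Spark. The path below is the example directory from the worked example above, and sc is the job's SparkContext.

// Read a sample back from one of the generated directories and print a few lines.
val sample = sc.textFile("/pub_stat_migu/tmp/log/2020021321/202002132155/20001/*.gz")
sample.take(5).foreach(println)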