Customizing Output File Names in MapReduce

Preface:

By default, MapReduce produces one output file per reducer, named name-r-nnnnn, where name defaults to part and nnnnn starts at 00000 and increments, so no two reducers ever produce files with the same name.
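
If only the base name needs to change, there is also a configuration-only shortcut: in Hadoop 2.x, FileOutputFormat reads the mapreduce.output.basename property when composing the part-r-nnnnn name. A minimal sketch, assuming Hadoop 2.x semantics (verify the property against your version; the BaseNameDemo skeleton is just scaffolding):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class BaseNameDemo {
    public static void main(String[] args) throws Exception {
        // Sketch: swap the default "part" base name via configuration alone.
        Configuration conf = new Configuration();
        conf.set("mapreduce.output.basename", "score"); // files become score-r-00000, ...
        Job job = Job.getInstance(conf, "word count");
        // ... set mapper, reducer and input/output paths as in the samples below ...
    }
}

The two approaches below go further: the first replaces the base name through MultipleOutputs, and the second takes over file naming completely by overriding RecordWriter.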
 

I. Replacing only the base file name part, so the output becomes score-r-00000

1. Use the org.apache.hadoop.mapreduce.lib.output.MultipleOutputs class.
2. MultipleOutputs must be initialized in the reducer's setup() method, and is best closed in cleanup().
3. At this point the job still creates a part-r-00000 file, which turns out to be empty; add LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); to suppress it.

Sample code:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

/**
 * Created by HuiQ on 2019-10-16.
 */
public class WordCount {

    public static class WordCountMapper extends Mapper<Object,Text,Text,IntWritable>{
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        @Override
        public void map(Object key,Text value,Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String str: words){
                word.set(str);
                context.write(word,one);
            }
        }
    }

    public static class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {

        private MultipleOutputs<Text, IntWritable> multipleOutputs;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            multipleOutputs = new MultipleOutputs<Text, IntWritable>(context);
        }

        @Override
        public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable val : values){
                total += val.get(); // sum the counts for this word (robust even if a combiner runs)
            }
            // custom base file name: this record goes to score-r-00000 instead of part-r-00000
            multipleOutputs.write(key, new IntWritable(total), "score");
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            multipleOutputs.close();
        }
    }

    public static void main (String[] args) throws Exception{
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Without this line, the job also creates an empty part-r-00000 (or part-m-00000) file in the output directory
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); // Note: omit this line for the fully custom approach in II below, otherwise the final output is still part-r-00000
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path("/huiqiang/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
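
A useful variation on the same API: the third argument of multipleOutputs.write() is a base output path rather than just a file name, so it may contain "/" to route records into subdirectories under the job output directory. A hedged sketch of such a reduce(), a drop-in replacement for the one in the listing above (the first-letter directory scheme is invented for illustration and assumes non-empty keys):

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable val : values) {
                total += val.get();
            }
            // e.g. the count for "hadoop" lands in /huiqiang/output/h/score-r-00000
            String firstLetter = key.toString().substring(0, 1);
            multipleOutputs.write(key, new IntWritable(total), firstLetter + "/score");
        }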
II. For fully custom file names, override RecordWriter

Fully custom reducer output is implemented by subclassing FileOutputFormat and RecordWriter: override the write() method of RecordWriter, then have the FileOutputFormat subclass return an instance of that RecordWriter from getRecordWriter().

Sample code:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Created by HuiQ on 2019-10-16.
 */
public class WordCount {

    public static class WordCountMapper extends Mapper<Object,Text,Text,IntWritable>{
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        @Override
        public void map(Object key,Text value,Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String str: words){
                word.set(str);
                context.write(word,one);
            }
        }
    }

    public static class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
    
        @Override
        public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable val : values){
                total += val.get(); // sum the counts for this word (robust even if a combiner runs)
            }
            context.write(key, new IntWritable(total));
        }
    }

    // Note: 1. the static keyword is required here  2. the type parameters of FileOutputFormat<Text,IntWritable> must match the reducer's output types <Text,IntWritable>
    public static class MyFileOutputFormat extends FileOutputFormat<Text,IntWritable>{
        @Override
        public RecordWriter<Text, IntWritable> getRecordWriter(TaskAttemptContext job)throws IOException, InterruptedException {

            FileSystem fileSystem=FileSystem.newInstance(job.getConfiguration());
            // custom output path: every record is written to this single file
            final FSDataOutputStream title=fileSystem.create(new Path("/huiqiang/output/test.txt"));
            RecordWriter<Text,IntWritable> recordWriter=new RecordWriter<Text, IntWritable>() {

                @Override
                public void close(TaskAttemptContext arg0) throws IOException,
                        InterruptedException {
                    if(title!=null){
                        title.close();
                    }
                }

                @Override
                public void write(Text key, IntWritable value) throws IOException,
                        InterruptedException {
                    String fenGe = " ";    // "fenGe" = the separator between key and value
                    String charSet = "UTF-8";
                    System.out.println("key=" + key.toString());
                    // write the key
                    title.write(key.toString().getBytes(charSet),0,key.toString().getBytes(charSet).length);
                    // write the key/value separator
                    title.write(fenGe.getBytes(charSet),0,fenGe.getBytes(charSet).length);
                    // write the value, then a newline
                    title.write(value.toString().getBytes(charSet),0,value.toString().getBytes(charSet).length);
                    title.write("\n".getBytes(charSet),0,"\n".getBytes(charSet).length);
                    title.flush();
                }
            };
            return recordWriter;
        }
    }

    public static void main (String[] args) throws Exception{
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputFormatClass(MyFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        MyFileOutputFormat.setOutputPath(job, new Path("/huiqiang/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
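
One caveat with this listing: the output path /huiqiang/output/test.txt is hard-coded, so it only behaves correctly with a single reduce task; several reducers would all try to create the same file. A minimal sketch of a per-task variant (the test-%05d.txt naming scheme is my own, not from the original post), deriving the file name from the task ID; nest it inside WordCount as a static class, like MyFileOutputFormat above, if preferred:

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class PerTaskFileOutputFormat extends FileOutputFormat<Text, IntWritable> {
    @Override
    public RecordWriter<Text, IntWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        FileSystem fileSystem = FileSystem.newInstance(job.getConfiguration());
        // one file per reduce task: test-00000.txt, test-00001.txt, ...
        int taskId = job.getTaskAttemptID().getTaskID().getId();
        final FSDataOutputStream out = fileSystem.create(
                new Path(String.format("/huiqiang/output/test-%05d.txt", taskId)));
        return new RecordWriter<Text, IntWritable>() {
            @Override
            public void write(Text key, IntWritable value) throws IOException, InterruptedException {
                // same plain "key value\n" layout as the listing above
                out.write((key.toString() + " " + value.toString() + "\n").getBytes("UTF-8"));
            }
            @Override
            public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                out.close();
            }
        };
    }
}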

Reference: https://blog.csdn.net/smallpizza/article/details/78060638
