Hadoop的MapReduce中多文件輸出

My_LineRead.java

[java]view
plaincopy

public class My_LineRead<K, V> extends RecordWriter<K, V>{  

        private static final String utf8 = "UTF-8";  

        private static final  String colon = "----";  //劃分符號  

        private static final byte[] newline;  

        static {  

          try {  

            newline = "/n".getBytes(utf8);  

          } catch (UnsupportedEncodingException uee) {  

            throw new IllegalArgumentException("can't find " + utf8 + " encoding");  

          }  

        }  

        protected DataOutputStream out;  

        private final byte[] keyValueSeparator;  

        public My_LineRead(DataOutputStream out) {  

            this(out, colon); //調用下面的構造函數  

        }  

        public My_LineRead(DataOutputStream out, String keyValueSeparator) {  

            // TODO Auto-generated constructor stub  

            this.out = out;  

            try {  

                this.keyValueSeparator = keyValueSeparator.getBytes(utf8);  

            } catch (UnsupportedEncodingException e) {  

                // TODO Auto-generated catch block  

                throw new IllegalArgumentException("can't find " + utf8 + " encoding");  

            }  

        }  

        @Override  

        public void close(TaskAttemptContext arg0) throws IOException,  

                InterruptedException {  

            // TODO Auto-generated method stub  

            out.close();  

        }  

        @Override  

        public void write(K key, V value) throws IOException,  

                InterruptedException {  

            if (!(key == null && key instanceof NullWritable)){  

                //如果key不爲空者輸出key  

                if ((Object)key instanceof Text){  

                    Text to = (Text) key;  

                    out.write(to.getBytes(), 0, to.getLength());  

                }  

                else  

                {  

                    out.write(key.toString().getBytes(utf8));  

                }  

                out.write(keyValueSeparator);  

            }  

            if (!(value == null && value instanceof NullWritable)){  

                //如果value不爲空則輸出value  

                if ((Object)value instanceof Text){  

                    Text to = (Text) value;  

                    out.write(to.getBytes(), 0, to.getLength());  

                }  

                else  

                {  

                    out.write(value.toString().getBytes(utf8));  

                }  

                out.write(newline);  

            }  

        }  

    }

MyMultipleOutputFormat.java //這個類，我添加了些註釋便於理解

[java]view
plaincopy

public abstract class MyMultipleOutputFormat  <K extends WritableComparable<?>, V extends Writable>    

        extends FileOutputFormat<K, V> {  

    //接口類，需要在主程序中實現generateFileNameForKeyValue來獲取文件名  

    private MultiRecordWriter writer = null;    

    @Override  

    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)  

            throws IOException, InterruptedException {  

        // TODO Auto-generated method stub  

        //如果第一次調用那麼writer=null  

        if (writer == null) {    

            //getTaskOutputPath獲取output路徑  

            writer = new MultiRecordWriter(job, getTaskOutputPath(job));    

        }    

        return writer;  

    }  

    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {  

        Path workPath = null;  

        OutputCommitter committer = super.getOutputCommitter(conf);  

        if (committer instanceof FileOutputCommitter) {  

            workPath = ((FileOutputCommitter) committer).getWorkPath();  

        } else {  

            Path outputPath = super.getOutputPath(conf);  

            if (outputPath == null) {  

                throw new IOException("Undefined job output-path");  

            }  

            workPath = outputPath;  

        }  

        return workPath;  

    }  

    /**通過key, value, conf來確定輸出文件名（含擴展名）*/  

    //返回值就是文件名。可以根據key,value來判斷  

    protected abstract String generateFileNameForKeyValue(K key, V value, Configuration conf);  

    //MultiRecordWriter類  

    public class MultiRecordWriter extends RecordWriter<K, V> {  

        /**RecordWriter的緩存*/  

        private HashMap<String, RecordWriter<K, V>> recordWriters = null;  

        private TaskAttemptContext job = null;  

        /**輸出目錄*/  

        private Path workPath = null;  

        //構造函數  

        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {  

            super();  

            this.job = job;  

            this.workPath = workPath;  

            recordWriters = new HashMap<String, RecordWriter<K, V>>();  

        }  

        //關閉，應該可能是多個文件進行關閉，所有采用循環  

        //recordWriters.values() 就是指的getBaseRecordWriter返回的值。  

        @Override  

        public void close(TaskAttemptContext context) throws IOException, InterruptedException {  

            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();  

            while (values.hasNext()) {  

                values.next().close(context);  

            }  

            this.recordWriters.clear();  

        }  

        @Override  

        public void write(K key, V value) throws IOException, InterruptedException {  

            //得到輸出文件名  

            String baseName = generateFileNameForKeyValue(key, value, job.getConfiguration());  

            //如果recordWriters裏沒有文件名，那麼就建立。否則就直接寫值。  

            RecordWriter<K, V> rw = this.recordWriters.get(baseName);  

            if (rw == null) {  

                rw = getBaseRecordWriter(job, baseName);  

                //放入HashMap  

                this.recordWriters.put(baseName, rw);  

            }  

            rw.write(key, value);  

        }  

        // ${mapred.out.dir}/_temporary/_${taskid}/${nameWithExtension}  

        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)  

                throws IOException, InterruptedException {  

            //獲取配置文件  

            Configuration conf = job.getConfiguration();  

            //查看是否使用解碼器  

            boolean isCompressed = getCompressOutput(job);  

            String keyValueSeparator = ",";  

            RecordWriter<K, V> recordWriter = null;  

            if (isCompressed) {  

                Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job,  

                        GzipCodec.class);  

                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);  

                Path file = new Path(workPath, baseName + codec.getDefaultExtension());  

                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);  

                recordWriter = new My_LineRead<K, V>(new DataOutputStream(codec  

                        .createOutputStream(fileOut)), keyValueSeparator);  

            }  

            //如果不使用解碼器  

            else {  

                Path file = new Path(workPath, baseName);  

                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);  

                //recordWriter = new My_LineRead<K, V>(fileOut, keyValueSeparator);  

                //這裏我使用的我自己的OutputFormat  

                recordWriter = new My_LineRead<K, V>(fileOut);  

            }  

            return recordWriter;  

        }  

    }  

}

最後就是測試類，WordCount_MulFileOut.java

[java]view
plaincopy

public class WordCount_MulFileOut {  

    public static  class wordcountMapper extends  

        Mapper<LongWritable, Text, Text, IntWritable>{  

        private final static IntWritable one = new IntWritable(1);  

        private Text word = new Text();  

        public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException{  

            String line = value.toString();  

            StringTokenizer itr = new StringTokenizer(line);  

            while(itr.hasMoreElements()){  

                word.set(itr.nextToken());  

                context.write(word, one);  

            }  

        }  

    }  

    public static  class wordcountReduce extends  

        Reducer<Text, IntWritable, Text, IntWritable>{  

        public void reduce(Text key, Iterable<IntWritable>values, Context context)throws IOException, InterruptedException{  

            int sum = 0;  

            for (IntWritable str : values){  

                sum += str.get();  

            }  

            context.write(key, new IntWritable(sum));  

        }  

    }  

    public static class MyMultiple extends MyMultipleOutputFormat{  

        @Override  

        protected String generateFileNameForKeyValue(WritableComparable key,  

                Writable value, Configuration conf) {  

            // TODO Auto-generated method stub  

            return "other.txt";  

        }  

    }  

    public static  void main(String args[])throws Exception{  

        Configuration conf = new Configuration();  

        Job job = new Job(conf, "wordcount");  

        job.setJarByClass(WordCount_MulFileOut.class);  

        job.setInputFormatClass(TextInputFormat.class);  

        job.setOutputFormatClass(MyMultiple.class);  

        job.setOutputKeyClass(Text.class);  

        job.setOutputValueClass(IntWritable.class);  

        job.setMapperClass(wordcountMapper.class);  

        job.setReducerClass(wordcountReduce.class);  

        job.setCombinerClass(wordcountReduce.class);  

        FileInputFormat.setInputPaths(job, new Path(args[1]));  

        FileOutputFormat.setOutputPath(job, new Path(args[2]));  

        job.waitForCompletion(true);  

    }  

}