Hadoop multiple-file output

With the old API, producing multiple output files only requires subclassing MultipleTextOutputFormat and overriding its generateFileNameForKeyValue method. Straight to the example.

Input file contents:


The goal is to count the words and write them out to separate files according to their first letter. The output is:


The code:

package defined;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
 
/**
 * User: XD
 */
public class test {
	static final String INPUT_PATH = "hdfs://localhost:9000/input";
	static final Path OUTPUT_PATH = new Path("hdfs://localhost:9000/output");
	
    public static class MapClass extends MapReduceBase implements Mapper<LongWritable, Text, Text,  LongWritable> {
 
        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
            // emit (word, 1) for every space-separated token in the line
            final String[] splited = value.toString().split(" ");
            for (String val : splited) {
                output.collect(new Text(val), new LongWritable(1));
            }
        }
    }
    public static class ReduceClass extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        public void reduce(Text key, Iterator<LongWritable> values,
                OutputCollector<Text, LongWritable> collect, Reporter reporter)
                throws IOException {
            // sum the counts collected for this word
            long sum = 0L;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            collect.collect(key, new LongWritable(sum));
        }
    }

    public static class PartitionFormat extends MultipleTextOutputFormat<Text, LongWritable> {
        // route each word to a file named after its first letter (a.txt, b.txt, ...);
        // anything that does not start with a letter goes to other.txt
        @Override
        protected String generateFileNameForKeyValue(Text key, LongWritable value, String name) {
            char c = key.toString().toLowerCase().charAt(0);
            if (c >= 'a' && c <= 'z') {
                return c + ".txt";
            } else {
                return "other.txt";
            }
        }
    }
    
    public static void main(String[] args) throws IOException, URISyntaxException {
        Configuration conf = new Configuration();
        JobConf job = new JobConf(conf, test.class);
        // delete the output directory if it already exists
        final FileSystem filesystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = OUTPUT_PATH;
        if (filesystem.exists(outPath)) {
            filesystem.delete(outPath, true);
        }

        // 1.1 input path
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // output path
        FileOutputFormat.setOutputPath(job, OUTPUT_PATH);
    
        job.setJobName("Multipleoutput");
        
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);
 
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(PartitionFormat.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
 
        job.setNumReduceTasks(1);
        JobClient.runJob(job);
    }
}
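
As a side note, the string returned by generateFileNameForKeyValue is interpreted as a path relative to the job's output directory, so it does not have to be a flat file name. Below is a small variant sketch (DirPartitionFormat is just an illustrative name, not part of the example above) that puts each letter's output in its own subdirectory instead:

    // Variant sketch: write each letter into its own subdirectory of the output path,
    // e.g. output/a/part-00000, output/b/part-00000, output/other/part-00000.
    public static class DirPartitionFormat extends MultipleTextOutputFormat<Text, LongWritable> {
        @Override
        protected String generateFileNameForKeyValue(Text key, LongWritable value, String name) {
            char c = key.toString().toLowerCase().charAt(0);
            if (c >= 'a' && c <= 'z') {
                return c + "/" + name;
            }
            return "other/" + name;
        }
    }

Using it would only require changing job.setOutputFormat(PartitionFormat.class) to job.setOutputFormat(DirPartitionFormat.class) in the driver.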

With the new API, however, this approach no longer works: there is no MultipleTextOutputFormat to subclass, so you would have to implement an equivalent output format and its generateFileNameForKeyValue logic yourself, which is more work. A simpler option is to use org.apache.hadoop.mapreduce.lib.output.MultipleOutputs. Again, straight to the example:

Input:


The goal is the same: count the words and write the counts to different files.

Output:


The letter-keyed results end up in the dest-r-00000 file (non-letter keys go to other-r-00000, and the plain context.write output goes to the usual part-r-00000).

The code:

package wordcount;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;


public class wordcount {

	static final String INPUT_PATH = "hdfs://localhost:9000/input";
	static final String OUTPUT_PATH = "hdfs://localhost:9000/output";
	
	public static class Map extends Mapper<LongWritable, Text, Text, LongWritable> {
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// emit (word, 1) for every space-separated token in the line
			final String[] splited = value.toString().split(" ");
			for (String val : splited) {
				context.write(new Text(val), new LongWritable(1));
			}
		}
	}
	
	public static class Reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
		private MultipleOutputs<Text, LongWritable> mos;

		@Override
		protected void setup(Context context) {
			mos = new MultipleOutputs<Text, LongWritable>(context);
		}

		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
			// sum the counts for this word
			long sum = 0L;
			for (LongWritable val : values) {
				sum += val.get();
			}
			// words starting with a letter go to the "dest" named output, everything else to "other";
			// context.write additionally sends every record to the normal part-r-* output
			char c = key.toString().toLowerCase().charAt(0);
			if (c >= 'a' && c <= 'z') {
				mos.write("dest", key, new LongWritable(sum));
			} else {
				mos.write("other", key, new LongWritable(sum));
			}
			context.write(key, new LongWritable(sum));
		}

		@Override
		protected void cleanup(Context context) throws IOException, InterruptedException {
			mos.close();
		}
	}
	
	
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
		Configuration conf = new Configuration();
		
		final FileSystem filesystem = FileSystem.get(new URI(INPUT_PATH),conf);
		final Path outPath = new Path(OUTPUT_PATH);
		if(filesystem.exists(outPath)){
			filesystem.delete(outPath, true);
		}
		Job job = new Job(conf,wordcount.class.getSimpleName());
		
		// 1.1 input path
		FileInputFormat.setInputPaths(job, INPUT_PATH);
		
		// 1.2 the Mapper class and its map output key/value types (can be omitted when they match the final output types)
		job.setMapperClass(Map.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);
		job.setJarByClass(wordcount.class);
		
		// 1.3 partitioning
		job.setPartitionerClass(HashPartitioner.class);
		
		// 1.4 grouping
		
		// 1.5 combining
		
		// 2.1 copy phase (shuffle over the network)
		
		// 2.2 the custom Reducer class
		job.setReducerClass(Reduce.class);
		// the reduce output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);
		
		// 2.3 where the output is written
		FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
		
		MultipleOutputs.addNamedOutput(job, "dest", TextOutputFormat.class, Text.class, LongWritable.class);
		MultipleOutputs.addNamedOutput(job, "other", TextOutputFormat.class, Text.class, LongWritable.class);
		// submit the job to the JobTracker; this call also prints detailed progress information
		job.waitForCompletion(true);
		
	}

}
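
For completeness: the new-API MultipleOutputs can also come close to the old per-key file naming. Its write(key, value, baseOutputPath) overload takes an arbitrary base file name, and LazyOutputFormat avoids creating empty default part-r-* files. The following is only a rough sketch under those assumptions (LetterReduce is a made-up name, not code from this post):

	// Drop-in replacement for the Reduce class above.
	// Driver changes: remove the two addNamedOutput calls and add
	//     LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
	// (import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat).
	public static class LetterReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
		private MultipleOutputs<Text, LongWritable> mos;

		@Override
		protected void setup(Context context) {
			mos = new MultipleOutputs<Text, LongWritable>(context);
		}

		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			long sum = 0L;
			for (LongWritable val : values) {
				sum += val.get();
			}
			// The third argument is the base output path; Hadoop appends the partition
			// suffix, producing files such as a-r-00000, b-r-00000 and other-r-00000.
			char c = key.toString().toLowerCase().charAt(0);
			String base = (c >= 'a' && c <= 'z') ? String.valueOf(c) : "other";
			mos.write(key, new LongWritable(sum), base);
		}

		@Override
		protected void cleanup(Context context) throws IOException, InterruptedException {
			mos.close();
		}
	}

With this overload no addNamedOutput calls are needed; the job's configured output key/value classes and output format are used for every base path.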

Out of laziness I have not written a more polished example; this is only a quick introduction, and the paths are hard-coded rather than generic, so readers can adapt them themselves (see the sketch below). If anyone has a better solution, please do share it. Many thanks!
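
For the hard-coded paths, one common approach is to take them from the command line. A minimal sketch of such a driver (assuming two positional arguments; it would replace the main method of the second example and needs import org.apache.hadoop.util.GenericOptionsParser):

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// GenericOptionsParser also picks up standard -D and -fs options
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: wordcount <input path> <output path>");
			System.exit(2);
		}
		Path inputPath = new Path(otherArgs[0]);
		Path outputPath = new Path(otherArgs[1]);

		// delete the output directory if it already exists
		FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
		if (fs.exists(outputPath)) {
			fs.delete(outputPath, true);
		}

		Job job = new Job(conf, wordcount.class.getSimpleName());
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		// ... the rest of the job configuration is the same as in the example above ...
	}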