Map-side join in MapReduce

When a large table has to be joined with a small table, a map-side join is a good choice. It runs entirely in the map phase; no reduce phase is needed.

Applicable scenarios:
1. The small table is small enough to be held in memory without overflowing the JVM heap;
2. Inner joins, or left outer joins where the large dataset is on the left side.

How it works:
Create a HashMap in the mapper class, load the small table's file into the HashMap in setup(), and then join it against each value that map() receives (a record from the large data file). The overall idea is sketched below:
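Conceptually this is just an in-memory hash lookup. As a minimal stand-alone illustration (plain Java, using a couple of the sample records from the example further down; the class name HashJoinSketch is purely illustrative):

public class HashJoinSketch {
	public static void main(String[] args) {
		// small table: author id -> author name, held entirely in memory
		java.util.HashMap<String, String> author = new java.util.HashMap<String, String>();
		author.put("s201002017", "R. R. Thomys");
		author.put("s201002023", "Klaus R. Dittrich");

		// large table: one record per line, "<book title>:::authorId"
		String[] book = { "<linux study>:::s201002017", "<zookeeper>:::s201002030" };

		for (String line : book) {
			String[] tokens = line.split(":::");
			String name = author.get(tokens[1]);
			if (name != null) {		// inner join: drop records with no match
				System.out.println(tokens[0] + "\t" + name);
			}
		}
	}
}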



Example:

The two datasets are:
-------------------------------------
author:     mapping of author id to author name
-------------------------------------
s201002017:::R. R. Thomys
s201002023:::Klaus R. Dittrich
s201002024:::Wolfgang Gentzsch
s201002025:::Rainer König
s201002018:::Georg Walch
s201002019:::Hans J. Becker
s201002020:::Hagen Vogel
s201002011:::Jan-Peter Hazebrouck
s201002012:::Herbert Löthe
s201002015:::Matthias Rinschede
s201002016:::Heiner Fuhrmann
s201002021:::Norbert Braun
s201002022:::H. Henseler
s201002026:::Richard Vahrenkamp
s201002013:::Roman Winkler
s201002027:::Niels Grabe
s201002014:::Marianne Winslett

----------------------------------------
book:     book title and author id
----------------------------------------
<linux study>:::s201002017
<linux study>:::s201002023
<linux study>:::s201002024
<hadoop study>:::s201002024
<hadoop study>:::s201002023
<English second publish>:::s201002025
<data structure>:::s201002018
<hbase study>:::s201002019
<hbase study>:::s201002020
<hive study>:::s201002016
<zookeeper>:::s201002030
<zookeeper>:::s201002038
<java>:::s201002040
<factory>:::s201002041
<deep study in python>:::s201002020
<how to learn shell>:::s201002033
<J2EE learn>:::s201002030
<made in china>:::s201002039

The field delimiter is ":::". The task is to perform an inner join. The code is as follows:
package com.inspur.mapreduce.join;

/*************************************
 * @author:	caolch
 * @date:	2013-12-31
 * @note:	map-side table join written with a mapper; the small table is read into memory
 *************************************/

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MapJoin extends Configured implements Tool {

	public static class myMapper extends Mapper<Object, Text, Text, Text> {
		// in-memory copy of the small table: author id -> author name
		private HashMap<String,String> authorMap = new HashMap<String,String>();

		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// split the big-table record: tokens[0] = book title, tokens[1] = author id
			String []tokens = value.toString().split(":::");
			if (tokens.length < 2) {
				return;		// skip malformed lines
			}
			String joinData = authorMap.get(tokens[1]);

			// inner join: emit only when the author id exists in the small table
			if (joinData!=null) {
				context.write(new Text(tokens[0]),new Text(joinData));
			}
		}

		//setup() runs once per map task, before any map() call
		@Override
		public void setup(Context context) throws IOException,
				InterruptedException {
			//get the local paths of the files placed in the distributed cache
			Path []cacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());

			//load the cached author file into the in-memory HashMap
			if (cacheFiles!=null && cacheFiles.length > 0) {
				String line;
				String []tokens;
				for(Path path:cacheFiles)
				{
					if(path.toString().contains("author"))
					{
						BufferedReader br = new BufferedReader(new FileReader(path.toString()));
						try{
							while((line = br.readLine()) != null){
								tokens = line.split(":::", 2);
								if (tokens.length == 2) {
									authorMap.put(tokens[0], tokens[1]);
								}
							}
						}finally{
							br.close();
						}
					}
				}
			}
		}
		
	}
	
	@Override
	public int run(String[] args) throws Exception {

		Configuration conf = getConf();
		Job job = new Job(conf,"MapJoin");
		job.setJarByClass(MapJoin.class);
		job.setMapperClass(myMapper.class);
		job.setNumReduceTasks(0);		// map-only job: no reduce phase
		//the mapper emits Text keys and Text values
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		/* add the small-table file(s) to the distributed cache */
		Path cachefilePath = new Path(args[0]);
		FileSystem hdfs = FileSystem.get(conf);
		FileStatus fileStatus = hdfs.getFileStatus(cachefilePath);
		//check whether the given path is a file or a directory
		if (!fileStatus.isDir()) {
			//a single file: add it to the cache directly
			DistributedCache.addCacheFile(cachefilePath.toUri(), job.getConfiguration());
		} else {
			//a directory: add every file it contains to the cache
			for (FileStatus fs : hdfs.listStatus(cachefilePath)) {
				DistributedCache.addCacheFile(fs.getPath().toUri(), job.getConfiguration());
			}
		}

		Path in = new Path(args[1]);
		Path out = new Path(args[2]);
		//set input/output paths and formats
		FileInputFormat.setInputPaths(job, in);
		FileOutputFormat.setOutputPath(job, out);
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);

		//return the exit code to ToolRunner instead of calling System.exit() here
		return job.waitForCompletion(true) ? 0 : 1;
	}

	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new MapJoin(), args);
		System.exit(res);
	}

}
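To run the job, pass the path of the author data (the file or directory to be cached), the path of the book data, and an output directory. The jar name and paths below are only placeholders:

	hadoop jar mapjoin.jar com.inspur.mapreduce.join.MapJoin /input/author /input/book /output/joined

With the sample data above, the inner join should produce output along these lines (tab-separated; the order follows the book file, since the job is map-only):

	<linux study>	R. R. Thomys
	<linux study>	Klaus R. Dittrich
	<linux study>	Wolfgang Gentzsch
	<hadoop study>	Wolfgang Gentzsch
	<hadoop study>	Klaus R. Dittrich
	<English second publish>	Rainer König
	<data structure>	Georg Walch
	<hbase study>	Hans J. Becker
	<hbase study>	Hagen Vogel
	<hive study>	Heiner Fuhrmann
	<deep study in python>	Hagen Vogel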

