當有一個大表join小表的時候,可以選擇用Map side join。該方式只用到了map階段,不需要reduce。
適用場景:
1-小表很小,可以放在內存中,不會導致JVM的堆溢出;
2-內連接或者大數據在左邊的左外連接。
原理:
在mapper類中新建一個HashMap對象,在setup中加載小表的文件到HashMap中,然後與map輸入的value(大數據文件的值)做join操作。結構圖如下:
舉例:
兩組數據分別爲
-------------------------------------
author: 作者id和作者對應表
-------------------------------------
s201002017:::R. R. Thomys
s201002023:::Klaus R. Dittrich
s201002024:::Wolfgang Gentzsch
s201002025:::Rainer König
s201002018:::Georg Walch
s201002019:::Hans J. Becker
s201002020:::Hagen Vogel
s201002011:::Jan-Peter Hazebrouck
s201002012:::Herbert Löthe
s201002015:::Matthias Rinschede
s201002016:::Heiner Fuhrmann
s201002021:::Norbert Braun
s201002022:::H. Henseler
s201002026:::Richard Vahrenkamp
s201002013:::Roman Winkler
s201002027:::Niels Grabe
s201002014:::Marianne Winslett
s201002023:::Klaus R. Dittrich
s201002024:::Wolfgang Gentzsch
s201002025:::Rainer König
s201002018:::Georg Walch
s201002019:::Hans J. Becker
s201002020:::Hagen Vogel
s201002011:::Jan-Peter Hazebrouck
s201002012:::Herbert Löthe
s201002015:::Matthias Rinschede
s201002016:::Heiner Fuhrmann
s201002021:::Norbert Braun
s201002022:::H. Henseler
s201002026:::Richard Vahrenkamp
s201002013:::Roman Winkler
s201002027:::Niels Grabe
s201002014:::Marianne Winslett
----------------------------------------
book: 圖書名字和作者id
----------------------------------------
<linux study>:::s201002017
<linux study>:::s201002023
<linux study>:::s201002024
<hadoop study>:::s201002024
<hadoop study>:::s201002023
<English second publish>:::s201002025
<data structure>:::s201002018
<hbase study>:::s201002019
<hbase study>:::s201002020
<linux study>:::s201002023
<linux study>:::s201002024
<hadoop study>:::s201002024
<hadoop study>:::s201002023
<English second publish>:::s201002025
<data structure>:::s201002018
<hbase study>:::s201002019
<hbase study>:::s201002020
<hive study>:::s201002016
<zookeeper>:::s201002030
<zookeeper>:::s201002038
<java>:::s201002040
<factory>:::s201002041
<deep study in python>:::s201002020
<how to learn shell>:::s201002033
<J2EE learn>:::s201002030
<made in china>:::s201002039
<zookeeper>:::s201002030
<zookeeper>:::s201002038
<java>:::s201002040
<factory>:::s201002041
<deep study in python>:::s201002020
<how to learn shell>:::s201002033
<J2EE learn>:::s201002030
<made in china>:::s201002039
數據的分割符爲“:::”要求做內連接inner join,
代碼如下:
package com.inspur.mapreduce.join;
/*************************************
* @author: caolch
* @date: 2013-12-31
* @note: 利用mapper寫的表連接,小表讀到內存裏
*************************************/
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MapJoin extends Configured implements Tool {

    /**
     * Map-side (replicated) join mapper: the small "author" table is loaded
     * into an in-memory HashMap in setup(), and each "book" record arriving
     * through map() is joined against it. No reduce phase is involved.
     */
    public static class myMapper extends Mapper<Object, Text, Text, Text> {

        // author-id -> author-name, populated from the distributed cache.
        private final HashMap<String, String> authorMap = new HashMap<String, String>();

        // Reused output instances to avoid allocating two Text objects per record.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        /**
         * Joins one "book" line (bookName:::authorId) against the cached
         * author table. A pair (bookName, authorName) is emitted only when
         * the author id is present in the cache, i.e. an inner join.
         * Malformed lines (no ":::" separator) are skipped instead of
         * throwing ArrayIndexOutOfBoundsException.
         */
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Limit 2 matches setup(): only the first ":::" is a separator.
            String[] tokens = value.toString().split(":::", 2);
            if (tokens.length < 2) {
                return; // malformed record — skip
            }
            String joinData = authorMap.get(tokens[1]);
            if (joinData != null) {
                outKey.set(tokens[0]);
                outValue.set(joinData);
                context.write(outKey, outValue);
            }
        }

        /**
         * Runs once before any map() call. Reads every distributed-cache
         * file whose path contains "author" (format: authorId:::authorName
         * per line) into authorMap.
         */
        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            // Local paths of the files registered with the cache in run().
            Path[] cacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (cacheFiles == null || cacheFiles.length == 0) {
                return;
            }
            for (Path path : cacheFiles) {
                if (!path.toString().contains("author")) {
                    continue;
                }
                BufferedReader br = new BufferedReader(new FileReader(path.toString()));
                try {
                    String line;
                    while ((line = br.readLine()) != null) {
                        String[] tokens = line.split(":::", 2);
                        if (tokens.length == 2) { // guard against malformed lines
                            authorMap.put(tokens[0], tokens[1]);
                        }
                    }
                } finally {
                    br.close();
                }
            }
        }
    }

    /**
     * Configures and submits the map-only join job.
     *
     * args[0] — file OR directory holding the small "author" table
     *           (added to the distributed cache)
     * args[1] — input path of the large "book" table
     * args[2] — output path
     *
     * @return 0 if the job succeeded, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf, "MapJoin");
        job.setJarByClass(MapJoin.class);
        job.setMapperClass(myMapper.class);
        job.setNumReduceTasks(0); // map-side join: no reduce phase needed
        // The mapper emits Text/Text; declare it so defaults don't apply.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Register the small table with the distributed cache. If args[0]
        // is a directory, every file inside it is added; otherwise the
        // single file is added.
        Path cachefilePath = new Path(args[0]);
        FileSystem hdfs = FileSystem.get(conf);
        FileStatus fileStatus = hdfs.getFileStatus(cachefilePath);
        if (fileStatus.isDir()) {
            for (FileStatus fs : hdfs.listStatus(cachefilePath)) {
                DistributedCache.addCacheFile(fs.getPath().toUri(), job.getConfiguration());
            }
        } else {
            DistributedCache.addCacheFile(cachefilePath.toUri(), job.getConfiguration());
        }

        // Input/output paths and formats.
        FileInputFormat.setInputPaths(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Return the status instead of calling System.exit() here, so the
        // exit code flows back through ToolRunner to main() as intended.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new MapJoin(), args);
        System.exit(res);
    }
}