MapReduce之Join多表查詢實現
0.思路:
1、在map階段注意區分讀取進來的數據所屬哪張表,需做判斷進行區分
2、在reduce階段注意對相同key的value進行處理,分別取出哪些是部門表和員工表的信息
3、編寫Job類,設置mapper及輸入輸出
4、注意將emp.csv和dept.csv放在同一個目錄下
內容如下:
1.依賴
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.7.3</version>
</dependency>
2.添加log4j.properties文件在資源目錄下即resources,文件內容如下
log4j.rootLogger=FATAL, dest1
log4j.logger.dsaLogging=DEBUG, dsa
log4j.additivity.dsaLogging=false
log4j.appender.dest1=org.apache.log4j.ConsoleAppender
log4j.appender.dest1.layout=org.apache.log4j.PatternLayout
log4j.appender.dest1.layout.ConversionPattern=%-5p:%l: %m%n
log4j.appender.dest1.ImmediateFlush=true
log4j.appender.dsa=org.apache.log4j.RollingFileAppender
log4j.appender.dsa.File=./logs/dsa.log
log4j.appender.dsa.MaxFileSize=2000KB
# Previously MaxBackupIndex=2
log4j.appender.dsa.MaxBackupIndex=5
log4j.appender.dsa.layout=org.apache.log4j.PatternLayout
log4j.appender.dsa.layout.ConversionPattern=%l:%d: %m%n
3.編寫mapper類 EqualJoinMapper.java
package com.mr.jointable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class EqualJoinMapper extends Mapper<LongWritable, Text, IntWritable,Text> {
IntWritable key2 = new IntWritable();
Text value2 = new Text();
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
/**數據結構:
* emp:7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
* dept:20,RESEARCH,DALLAS
*/
System.out.println("偏移量:" + key + ",value : " + value.toString());
//1、分詞
String[] splits = value.toString().split(",");
//2、區別emp和dept
if (splits.length >= 8){ //讀取的是emp表數據
String empName = splits[1];
String empDept = splits[7];
key2.set(Integer.parseInt(empDept));
value2.set(empName);
}else{ //讀取是dept表的數據
String detpNo = splits[0];
String deptName = "*" + splits[1];//加*的目的是標識當前的數據是屬於部門表裏面的
key2.set(Integer.parseInt(detpNo));
value2.set(deptName);
}
//3、通過context寫出去
context.write(key2,value2);
}
}
4.編寫reducer類
package com.mr.jointable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class EqualJoinReducer extends Reducer<IntWritable,Text,Text,Text> {
Text key4 = new Text();
Text value4 = new Text();
protected void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
//從values中取出部門名稱和員工的名稱
String empNameList = "";
String deptName = "";
for (Text v: values) {
String data = v.toString();
int deptFlag = data.indexOf("*");
if (deptFlag != -1 ){//找到包含有*號的數據:部門名稱 如*RESEARCH
deptName = data.substring(1);
}else{
empNameList = data + ";" + empNameList;
}
}
key4.set(deptName);
value4.set(empNameList);
context.write(key4,value4);
}
}
5.編寫Job類
package com.mr.jointable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.util.Random;
public class EqualJoinJob {
    /**
     * Configures and runs the equal-join job: EqualJoinMapper tags records by
     * department number, EqualJoinReducer joins them into (deptName, empList).
     *
     * @param args optional: args[0] overrides the input directory
     *             (defaults to the local test path D:\equaljoin\)
     */
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(EqualJoinJob.class);
        // Mapper output types: (deptNo, taggedValue)
        job.setMapperClass(EqualJoinMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Reducer output types: (deptName, empNameList)
        job.setReducerClass(EqualJoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Local-file test paths; pass args[0] to point at another input dir.
        String inputDir = args.length > 0 ? args[0] : "D:\\equaljoin\\";
        FileInputFormat.setInputPaths(job, new Path(inputDir));
        FileOutputFormat.setOutputPath(job, new Path(getOutputDir()));
        boolean result = job.waitForCompletion(true);
        System.out.println("result:" + result);
    }

    /**
     * Builds a fresh output directory name so repeated runs never collide
     * (Hadoop fails if the output path already exists).
     *
     * @return a unique path under D:\output_equaljoin\
     */
    public static String getOutputDir() {
        String prefix = "D:\\output_equaljoin\\";
        long time = System.currentTimeMillis();
        // Bounded nextInt keeps the suffix non-negative — a bare nextInt() could
        // yield a negative number and produce names like "result_..._-123".
        int random = new Random().nextInt(Integer.MAX_VALUE);
        return prefix + "result_" + time + "_" + random;
    }
}
6.運行結果