1. Reduce Join
(1) How the join works
The Map side's main job is to tag the key/value pairs coming from different tables or files so that records from different sources can be told apart. The join field becomes the key, and the remaining fields plus the newly added source tag become the value, which is then emitted. The Reduce side's main job is simpler: the shuffle has already grouped the records by the join-field key, so within each group the reducer only needs to separate the records by their source tag (attached during the Map phase) and then merge them.
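To make this concrete, using the sample data from the case study below, here is what the map phase emits for join key `01` and what the reducer receives as one group:

```
// Map output for pid 01 (key, tagged value):
("01", {flag="order", id="1001", amount=1, pname=""})
("01", {flag="order", id="1004", amount=4, pname=""})
("01", {flag="pd",    id="",     amount=0, pname="小米"})

// The reducer sees all three records in a single group, copies
// pname="小米" onto each order record, and emits:
1001	1	小米
1004	4	小米
```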
(2) Reduce join case study
- Requirement: join the rows of pd.txt into order.txt on the product id (pid).

```
// order.txt
// id	pid	amount
1001	01	1
1002	02	2
1003	03	3
1004	01	4
1005	02	5
1006	03	6
```

```
// pd.txt
// pid	pname
01	小米
02	华为
03	格力
```
- Create the package: com.easysir.reducejoin
- Create the TableBean class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class TableBean implements Writable {

    private String id;     // order id
    private String pid;    // product id
    private int amount;    // quantity ordered
    private String pname;  // product name
    private String flag;   // tag marking which table the record came from

    public TableBean() {
        super();
    }

    public TableBean(String id, String pid, int amount, String pname, String flag) {
        super();
        this.id = id;
        this.pid = pid;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(id);
        out.writeUTF(pid);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        pid = in.readUTF();
        amount = in.readInt();
        pname = in.readUTF();
        flag = in.readUTF();
    }

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public String getPid() { return pid; }
    public void setPid(String pid) { this.pid = pid; }
    public int getAmount() { return amount; }
    public void setAmount(int amount) { this.amount = amount; }
    public String getPname() { return pname; }
    public void setPname(String pname) { this.pname = pname; }
    public String getFlag() { return flag; }
    public void setFlag(String flag) { this.flag = flag; }

    @Override
    public String toString() {
        return id + "\t" + amount + "\t" + pname;
    }
}
```
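Because write and readFields must serialize the fields in exactly the same order, a quick local round-trip check can save debugging time later. The sketch below is a hypothetical helper (not part of the original example, placed in the same package as TableBean) using Hadoop's DataOutputBuffer/DataInputBuffer utilities:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        TableBean in = new TableBean("1001", "01", 1, "", "order");

        // Serialize the bean the same way Hadoop does during the shuffle
        DataOutputBuffer out = new DataOutputBuffer();
        in.write(out);

        // Deserialize into a fresh instance
        DataInputBuffer buf = new DataInputBuffer();
        buf.reset(out.getData(), out.getLength());
        TableBean back = new TableBean();
        back.readFields(buf);

        // If write and readFields disagree on field order, this prints
        // garbage or throws; here it should print "1001 1"
        System.out.println(back.getId() + " " + back.getAmount());
    }
}
```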
- Create the TableMapper class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    String name;
    TableBean tableBean = new TableBean();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Determine which file this map task is reading
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        name = fileSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 Read one line
        String line = value.toString();
        String[] fields = line.split("\t");

        // 2 Decide which table the record belongs to
        if (name.startsWith("order")) {
            // order table
            tableBean.setId(fields[0]);
            tableBean.setPid(fields[1]);
            tableBean.setAmount(Integer.parseInt(fields[2]));
            tableBean.setPname("");
            tableBean.setFlag("order");
            k.set(fields[1]);
        } else {
            // product table
            tableBean.setId("");
            tableBean.setPid(fields[0]);
            tableBean.setAmount(0);
            tableBean.setPname(fields[1]);
            tableBean.setFlag("pd");
            k.set(fields[0]);
        }
        context.write(k, tableBean);
    }
}
```
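Note that a single tableBean and a single k instance are reused across map() calls. This is safe because context.write() serializes the record into the map output buffer immediately, and it avoids allocating two new objects per input line.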
- Create the TableReducer class:

```java
package com.easysir.reducejoin;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {

        // Holds every order record in this group
        ArrayList<TableBean> orderBeans = new ArrayList<>();
        // Holds the product record for this pid
        TableBean pdBean = new TableBean();

        for (TableBean tableBean : values) {
            if ("order".equals(tableBean.getFlag())) {
                // Copy into a fresh bean before storing it
                TableBean tmpBean = new TableBean();
                try {
                    BeanUtils.copyProperties(tmpBean, tableBean);
                    orderBeans.add(tmpBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else {
                try {
                    BeanUtils.copyProperties(pdBean, tableBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Fill in the product name and emit each joined order record
        for (TableBean tableBean : orderBeans) {
            tableBean.setPname(pdBean.getPname());
            context.write(tableBean, NullWritable.get());
        }
    }
}
```
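The copy via BeanUtils.copyProperties is not optional: Hadoop reuses one TableBean instance for every element of the values iterable, so adding tableBean to orderBeans directly would fill the list with references to the same (last-seen) record. If you would rather avoid the commons-beanutils dependency, a hand-written copy using TableBean's existing constructor does the same job (a sketch):

```java
// Equivalent manual copy, using TableBean's all-args constructor
TableBean tmpBean = new TableBean(
        tableBean.getId(),
        tableBean.getPid(),
        tableBean.getAmount(),
        tableBean.getPname(),
        tableBean.getFlag());
orderBeans.add(tmpBean);
```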
- Create the TableDriver class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TableDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Adjust the input/output paths to match your own machine
        args = new String[] { "E:\\idea-workspace\\mrWordCount\\input3", "E:\\idea-workspace\\mrWordCount\\output" };

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(TableDriver.class);
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
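One practical note: FileOutputFormat refuses to write into an existing directory, so the output path must be deleted before re-running the job. A small guard like the following (an optional addition, not in the original driver; it needs an extra import of org.apache.hadoop.fs.FileSystem) can be placed before job submission:

```java
// Optional: remove a stale output directory before submitting,
// since FileOutputFormat fails if the path already exists
FileSystem fs = FileSystem.get(conf);
Path outPath = new Path(args[1]);
if (fs.exists(outPath)) {
    fs.delete(outPath, true);
}
```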
(3) Summary
- Drawback: with this approach, the merging is done entirely in the Reduce phase. The Reduce side bears a heavy processing load while the Map nodes do very little work, so resource utilization is poor, and the Reduce phase is highly prone to data skew.
- Solution: merge the data on the Map side instead, which is the Map join covered next.
2. Map Join
(1) Map join in detail
- Use case: Map join suits scenarios where one table is very large and the other is very small.
- Advantage: caching the small table(s) on the Map side and handling the business logic there shifts work to the Map phase, reduces the pressure on the Reduce side, and minimizes data skew.
- Steps: (1) in the Mapper's setup phase, read the cached file into an in-memory collection; (2) in the driver, register the file with the distributed cache:

```java
// Cache an ordinary file onto the nodes that run the tasks
job.addCacheFile(new URI("path/to/file"));
```
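Files registered this way are copied to every node that runs a task for the job before the tasks start, so the mapper can open them with ordinary local-file I/O inside setup(), as the case study below does.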
(2) Map join case study
- Requirement: as before, join the rows of pd.txt into order.txt on the product id (pid).

```
// order.txt
// id	pid	amount
1001	01	1
1002	02	2
1003	03	3
1004	01	4
1005	02	5
1006	03	6
```

```
// pd.txt
// pid	pname
01	小米
02	华为
03	格力
```
- Create the package: com.easysir.mapjoin
- Create the DistributedCacheMapper class:

```java
package com.easysir.mapjoin;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    Map<String, String> pdMap = new HashMap<>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // 1 Locate the cached file
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();

        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));

        String line;
        // Stops at end of file (readLine() returns null) or at the first empty line
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // 2 Split the line
            String[] fields = line.split("\t");
            // 3 Cache pid -> pname
            pdMap.put(fields[0], fields[1]);
        }

        // 4 Close the stream
        reader.close();
    }

    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 Read one line
        String line = value.toString();

        // 2 Split it
        String[] fields = line.split("\t");

        // 3 Get the product id
        String pId = fields[1];

        // 4 Look up the product name
        String pdName = pdMap.get(pId);

        // 5 Append the name to the original line
        k.set(line + "\t" + pdName);

        // 6 Emit
        context.write(k, NullWritable.get());
    }
}
```
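One caveat in the lookup step: pdMap.get(pId) returns null for a pid that has no entry in pd.txt, so the literal string "null" would be appended to the output line. A defensive variant (an optional tweak, not in the original) substitutes a placeholder instead:

```java
// Fall back to a placeholder when the pid has no entry in the cache
String pdName = pdMap.getOrDefault(pId, "NA");
```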
- Create the DistributedCacheDriver class:

```java
package com.easysir.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

public class DistributedCacheDriver {

    public static void main(String[] args) throws Exception {

        // Adjust the input/output paths to match your own machine
        args = new String[] { "E:\\idea-workspace\\mrWordCount\\input3\\order.txt", "E:\\idea-workspace\\mrWordCount\\output" };

        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar path
        job.setJarByClass(DistributedCacheDriver.class);

        // 3 Wire up the mapper
        job.setMapperClass(DistributedCacheMapper.class);

        // 4 Set the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 Register the cached file
        job.addCacheFile(new URI("file:///E:/idea-workspace/mrWordCount/input3/pd.txt"));

        // 7 A map-side join needs no Reduce phase, so set the number of reduce tasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
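Setting the number of reduce tasks to 0 means there is no shuffle or sort at all: each map task writes its output directly to part-m-XXXXX files, which is exactly what a map-side join wants.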
- Run result:

```
1001	01	1	小米
1002	02	2	华为
1003	03	3	格力
1004	01	4	小米
1005	02	5	华为
1006	03	6	格力
```