1. Reduce Join
(1) How the join works
The Map side's main job is to tag the key/value pairs coming from different tables or files so that records from different sources can be told apart. The join field becomes the key, and the remaining fields plus the newly added source tag become the value, which is then emitted. The Reduce side's main job is simpler: the shuffle has already grouped the records by the join-field key, so within each group the reducer only needs to separate the records by their source tag (attached during the Map phase) and then merge them.
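To make this concrete, using the sample data from the case study below, here is what the map phase emits for join key `01` and what the reducer receives as one group:

```
// Map output for pid 01 (key, tagged value):
("01", {flag="order", id="1001", amount=1, pname=""})
("01", {flag="order", id="1004", amount=4, pname=""})
("01", {flag="pd",    id="",     amount=0, pname="小米"})

// The reducer sees all three records in a single group, copies
// pname="小米" onto each order record, and emits:
1001	1	小米
1004	4	小米
```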
(2) Reduce join case study
- Requirement: join the rows of pd.txt into order.txt on the product id (pid).

```
// order.txt
// id	pid	amount
1001	01	1
1002	02	2
1003	03	3
1004	01	4
1005	02	5
1006	03	6
```

```
// pd.txt
// pid	pname
01	小米
02	华为
03	格力
```
- Create the package: com.easysir.reducejoin
- Create the TableBean class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class TableBean implements Writable {

    private String id;     // order id
    private String pid;    // product id
    private int amount;    // quantity ordered
    private String pname;  // product name
    private String flag;   // tag marking which table the record came from

    public TableBean() {
        super();
    }

    public TableBean(String id, String pid, int amount, String pname, String flag) {
        super();
        this.id = id;
        this.pid = pid;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(id);
        out.writeUTF(pid);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        pid = in.readUTF();
        amount = in.readInt();
        pname = in.readUTF();
        flag = in.readUTF();
    }

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public String getPid() { return pid; }
    public void setPid(String pid) { this.pid = pid; }
    public int getAmount() { return amount; }
    public void setAmount(int amount) { this.amount = amount; }
    public String getPname() { return pname; }
    public void setPname(String pname) { this.pname = pname; }
    public String getFlag() { return flag; }
    public void setFlag(String flag) { this.flag = flag; }

    @Override
    public String toString() {
        return id + "\t" + amount + "\t" + pname;
    }
}
```
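Because write and readFields must serialize the fields in exactly the same order, a quick local round-trip check can save debugging time later. The sketch below is a hypothetical helper (not part of the original example, placed in the same package as TableBean) using Hadoop's DataOutputBuffer/DataInputBuffer utilities:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        TableBean in = new TableBean("1001", "01", 1, "", "order");

        // Serialize the bean the same way Hadoop does during the shuffle
        DataOutputBuffer out = new DataOutputBuffer();
        in.write(out);

        // Deserialize into a fresh instance
        DataInputBuffer buf = new DataInputBuffer();
        buf.reset(out.getData(), out.getLength());
        TableBean back = new TableBean();
        back.readFields(buf);

        // If write and readFields disagree on field order, this prints
        // garbage or throws; here it should print "1001 1"
        System.out.println(back.getId() + " " + back.getAmount());
    }
}
```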
- Create the TableMapper class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    String name;
    TableBean tableBean = new TableBean();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Determine which file this map task is reading
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        name = fileSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 Read one line
        String line = value.toString();
        String[] fields = line.split("\t");

        // 2 Decide which table the record belongs to
        if (name.startsWith("order")) {
            // order table
            tableBean.setId(fields[0]);
            tableBean.setPid(fields[1]);
            tableBean.setAmount(Integer.parseInt(fields[2]));
            tableBean.setPname("");
            tableBean.setFlag("order");
            k.set(fields[1]);
        } else {
            // product table
            tableBean.setId("");
            tableBean.setPid(fields[0]);
            tableBean.setAmount(0);
            tableBean.setPname(fields[1]);
            tableBean.setFlag("pd");
            k.set(fields[0]);
        }
        context.write(k, tableBean);
    }
}
```
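Note that a single tableBean and a single k instance are reused across map() calls. This is safe because context.write() serializes the record into the map output buffer immediately, and it avoids allocating two new objects per input line.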
- Create the TableReducer class:

```java
package com.easysir.reducejoin;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {

        // Holds every order record in this group
        ArrayList<TableBean> orderBeans = new ArrayList<>();
        // Holds the product record for this pid
        TableBean pdBean = new TableBean();

        for (TableBean tableBean : values) {
            if ("order".equals(tableBean.getFlag())) {
                // Copy into a fresh bean before storing it
                TableBean tmpBean = new TableBean();
                try {
                    BeanUtils.copyProperties(tmpBean, tableBean);
                    orderBeans.add(tmpBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else {
                try {
                    BeanUtils.copyProperties(pdBean, tableBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Fill in the product name and emit each joined order record
        for (TableBean tableBean : orderBeans) {
            tableBean.setPname(pdBean.getPname());
            context.write(tableBean, NullWritable.get());
        }
    }
}
```
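The copy via BeanUtils.copyProperties is not optional: Hadoop reuses one TableBean instance for every element of the values iterable, so adding tableBean to orderBeans directly would fill the list with references to the same (last-seen) record. If you would rather avoid the commons-beanutils dependency, a hand-written copy using TableBean's existing constructor does the same job (a sketch):

```java
// Equivalent manual copy, using TableBean's all-args constructor
TableBean tmpBean = new TableBean(
        tableBean.getId(),
        tableBean.getPid(),
        tableBean.getAmount(),
        tableBean.getPname(),
        tableBean.getFlag());
orderBeans.add(tmpBean);
```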
- Create the TableDriver class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TableDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Adjust the input/output paths to match your own machine
        args = new String[] { "E:\\idea-workspace\\mrWordCount\\input3", "E:\\idea-workspace\\mrWordCount\\output" };

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(TableDriver.class);
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
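One practical note: FileOutputFormat refuses to write into an existing directory, so the output path must be deleted before re-running the job. A small guard like the following (an optional addition, not in the original driver; it needs an extra import of org.apache.hadoop.fs.FileSystem) can be placed before job submission:

```java
// Optional: remove a stale output directory before submitting,
// since FileOutputFormat fails if the path already exists
FileSystem fs = FileSystem.get(conf);
Path outPath = new Path(args[1]);
if (fs.exists(outPath)) {
    fs.delete(outPath, true);
}
```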
(3) Summary
- Drawback: with this approach, the merging is done entirely in the Reduce phase. The Reduce side bears a heavy processing load while the Map nodes do very little work, so resource utilization is poor, and the Reduce phase is highly prone to data skew.
- Solution: merge the data on the Map side instead, which is the Map join covered next.
2. Map Join
(1) Map join in detail
- Use case: Map join suits scenarios where one table is very large and the other is very small.
- Advantage: caching the small table(s) on the Map side and handling the business logic there shifts work to the Map phase, reduces the pressure on the Reduce side, and minimizes data skew.
- Steps: (1) in the Mapper's setup phase, read the cached file into an in-memory collection; (2) in the driver, register the file with the distributed cache:

```java
// Cache an ordinary file onto the nodes that run the tasks
job.addCacheFile(new URI("path/to/file"));
```
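Files registered this way are copied to every node that runs a task for the job before the tasks start, so the mapper can open them with ordinary local-file I/O inside setup(), as the case study below does.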
(2) Map join case study
- Requirement: as before, join the rows of pd.txt into order.txt on the product id (pid).

```
// order.txt
// id	pid	amount
1001	01	1
1002	02	2
1003	03	3
1004	01	4
1005	02	5
1006	03	6
```

```
// pd.txt
// pid	pname
01	小米
02	华为
03	格力
```
- Create the package: com.easysir.mapjoin
- Create the DistributedCacheMapper class:

```java
package com.easysir.mapjoin;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    Map<String, String> pdMap = new HashMap<>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // 1 Locate the cached file
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();

        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));

        String line;
        // Stops at end of file (readLine() returns null) or at the first empty line
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // 2 Split the line
            String[] fields = line.split("\t");
            // 3 Cache pid -> pname
            pdMap.put(fields[0], fields[1]);
        }

        // 4 Close the stream
        reader.close();
    }

    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 Read one line
        String line = value.toString();

        // 2 Split it
        String[] fields = line.split("\t");

        // 3 Get the product id
        String pId = fields[1];

        // 4 Look up the product name
        String pdName = pdMap.get(pId);

        // 5 Append the name to the original line
        k.set(line + "\t" + pdName);

        // 6 Emit
        context.write(k, NullWritable.get());
    }
}
```
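One caveat in the lookup step: pdMap.get(pId) returns null for a pid that has no entry in pd.txt, so the literal string "null" would be appended to the output line. A defensive variant (an optional tweak, not in the original) substitutes a placeholder instead:

```java
// Fall back to a placeholder when the pid has no entry in the cache
String pdName = pdMap.getOrDefault(pId, "NA");
```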
- Create the DistributedCacheDriver class:

```java
package com.easysir.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

public class DistributedCacheDriver {

    public static void main(String[] args) throws Exception {

        // Adjust the input/output paths to match your own machine
        args = new String[] { "E:\\idea-workspace\\mrWordCount\\input3\\order.txt", "E:\\idea-workspace\\mrWordCount\\output" };

        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar path
        job.setJarByClass(DistributedCacheDriver.class);

        // 3 Wire up the mapper
        job.setMapperClass(DistributedCacheMapper.class);

        // 4 Set the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 Register the cached file
        job.addCacheFile(new URI("file:///E:/idea-workspace/mrWordCount/input3/pd.txt"));

        // 7 A map-side join needs no Reduce phase, so set the number of reduce tasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
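Setting the number of reduce tasks to 0 means there is no shuffle or sort at all: each map task writes its output directly to part-m-XXXXX files, which is exactly what a map-side join wants.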
- Run result:

```
1001	01	1	小米
1002	02	2	华为
1003	03	3	格力
1004	01	4	小米
1005	02	5	华为
1006	03	6	格力
```