1. Applications of Reduce Join

(1) What the join involves

Work on the Map side: tag each key/value pair with a marker identifying which table or file the record came from. Then emit the join field as the key, and the remaining fields plus the newly added tag as the value.

Work on the Reduce side: by the time reduce() is called, grouping by the join field (the key) has already been done. Within each group we only need to separate the records that came from different source files (using the tags added in the Map phase) and then merge them.
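To make the two phases concrete, here is roughly what the intermediate data looks like for the sample files used in the example below (order.txt and pd.txt): the Map side emits the join field pid as the key and the tagged record as the value, and each reduce() call then sees one pid group containing both kinds of records.

```text
// Map output: key = pid, value = tagged record
01 -> (order, id=1001, amount=1)      01 -> (pd, pname=小米)
01 -> (order, id=1004, amount=4)      02 -> (pd, pname=華爲)
...

// Reduce group for key = 01
(order, 1001, 1), (order, 1004, 4), (pd, 小米)  ->  joined: 1001 1 小米 and 1004 4 小米
```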
(2) Reduce Join: a worked example

- Requirement: merge the data in pd.txt into order.txt on the product id pid. The two input files look like this:

```text
// order.txt  (id  pid  amount)
1001	01	1
1002	02	2
1003	03	3
1004	01	4
1005	02	5
1006	03	6

// pd.txt  (pid  pname)
01	小米
02	華爲
03	格力
```
- Create the package com.easysir.reducejoin.

- Create the TableBean class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class TableBean implements Writable {

    private String id;     // order id
    private String pid;    // product id
    private int amount;    // quantity ordered
    private String pname;  // product name
    private String flag;   // tag marking which table the record comes from

    public TableBean() {
        super();
    }

    public TableBean(String id, String pid, int amount, String pname, String flag) {
        super();
        this.id = id;
        this.pid = pid;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(id);
        out.writeUTF(pid);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        pid = in.readUTF();
        amount = in.readInt();
        pname = in.readUTF();
        flag = in.readUTF();
    }

    public String getId() { return id; }

    public void setId(String id) { this.id = id; }

    public String getPid() { return pid; }

    public void setPid(String pid) { this.pid = pid; }

    public int getAmount() { return amount; }

    public void setAmount(int amount) { this.amount = amount; }

    public String getPname() { return pname; }

    public void setPname(String pname) { this.pname = pname; }

    public String getFlag() { return flag; }

    public void setFlag(String flag) { this.flag = flag; }

    @Override
    public String toString() {
        return id + "\t" + amount + "\t" + pname;
    }
}
```
- Create the TableMapper class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    String name;
    TableBean tableBean = new TableBean();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the name of the file this split comes from
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        name = fileSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1 Read one line
        String line = value.toString();
        String[] fields = line.split("\t");

        // 2 Decide which table the record belongs to
        if (name.startsWith("order")) {
            // order table
            tableBean.setId(fields[0]);
            tableBean.setPid(fields[1]);
            tableBean.setAmount(Integer.parseInt(fields[2]));
            tableBean.setPname("");
            tableBean.setFlag("order");
            k.set(fields[1]);
        } else {
            // product table
            tableBean.setId("");
            tableBean.setPid(fields[0]);
            tableBean.setAmount(0);
            tableBean.setPname(fields[1]);
            tableBean.setFlag("pd");
            k.set(fields[0]);
        }

        context.write(k, tableBean);
    }
}
```
- Create the TableReducer class:

```java
package com.easysir.reducejoin;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {

        // Holds every order record in this group
        ArrayList<TableBean> orderBeans = new ArrayList<>();
        // Holds the product record of this group
        TableBean pdBean = new TableBean();

        for (TableBean tableBean : values) {
            if ("order".equals(tableBean.getFlag())) {
                // The framework reuses the value object, so copy it before storing
                TableBean tmpBean = new TableBean();
                try {
                    BeanUtils.copyProperties(tmpBean, tableBean);
                    orderBeans.add(tmpBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else {
                try {
                    BeanUtils.copyProperties(pdBean, tableBean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Fill in the product name for every order record and emit it
        for (TableBean tableBean : orderBeans) {
            tableBean.setPname(pdBean.getPname());
            context.write(tableBean, NullWritable.get());
        }
    }
}
```
- Create the TableDriver class:

```java
package com.easysir.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TableDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Set the input and output paths to match your own machine
        args = new String[]{"E:\\idea-workspace\\mrWordCount\\input3", "E:\\idea-workspace\\mrWordCount\\output"};

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(TableDriver.class);
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
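With the sample files above, the job should emit one joined line per order record, with the product name filled in from pd.txt; the field layout follows TableBean.toString() (id, amount, pname). The exact row order within each pid group may vary, but the output looks roughly like:

```text
1001	1	小米
1004	4	小米
1002	2	華爲
1005	5	華爲
1003	3	格力
1006	6	格力
```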
(3) Summary

- Drawback: with this approach the actual merging is done in the Reduce phase. The Reduce side carries almost all of the processing load while the Map nodes do very little computation, so resources are used unevenly, and the Reduce phase is very prone to data skew.
- Solution: merge the data on the Map side instead (Map join).
2. Applications of Map Join

(1) Map Join in detail

- Use case: Map join suits the scenario where one table is very large and the other is very small.
- Advantage: the small table is cached on the Map side and the join logic is applied there. Moving this work to the Map side relieves the pressure on the Reduce side and reduces data skew as much as possible.
- Steps: (1) in the Mapper's setup phase, read the cached file into an in-memory collection; (2) in the driver, register the file to be cached:

```java
// Cache an ordinary file on the node where the task runs
job.addCacheFile(new URI("file path"));
```
(2) Map Join: a worked example

- Requirement: as before, merge the data in pd.txt into order.txt on the product id pid:

```text
// order.txt  (id  pid  amount)
1001	01	1
1002	02	2
1003	03	3
1004	01	4
1005	02	5
1006	03	6

// pd.txt  (pid  pname)
01	小米
02	華爲
03	格力
```
- Create the package com.easysir.mapjoin.

- Create the DistributedCacheMapper class:

```java
package com.easysir.mapjoin;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    Map<String, String> pdMap = new HashMap<>();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {

        // 1 Get the cached file
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();

        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));

        String line;
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // 2 Split the line
            String[] fields = line.split("\t");
            // 3 Cache pid -> pname in the map
            pdMap.put(fields[0], fields[1]);
        }

        // 4 Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1 Read one line
        String line = value.toString();

        // 2 Split it
        String[] fields = line.split("\t");

        // 3 Get the product id
        String pId = fields[1];

        // 4 Look up the product name
        String pdName = pdMap.get(pId);

        // 5 Append the product name to the line
        k.set(line + "\t" + pdName);

        // 6 Emit
        context.write(k, NullWritable.get());
    }
}
```
- Create the DistributedCacheDriver class:

```java
package com.easysir.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

public class DistributedCacheDriver {

    public static void main(String[] args) throws Exception {

        // Set the input and output paths to match your own machine
        args = new String[]{"E:\\idea-workspace\\mrWordCount\\input3\\order.txt", "E:\\idea-workspace\\mrWordCount\\output"};

        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar path
        job.setJarByClass(DistributedCacheDriver.class);

        // 3 Set the mapper
        job.setMapperClass(DistributedCacheMapper.class);

        // 4 Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 Load the cached file
        job.addCacheFile(new URI("file:///E:/idea-workspace/mrWordCount/input3/pd.txt"));

        // 7 A Map-side join needs no Reduce phase, so set the number of reduce tasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
- Result:

```text
1001	01	1	小米
1002	02	2	華爲
1003	03	3	格力
1004	01	4	小米
1005	02	5	華爲
1006	03	6	格力
```