【Hadoop Learning: MapReduce】_23 Applications of join in MR

I. Applications of Reduce join

(1) The main work of a join

  1. Map side: tag the key/value pairs coming from different tables or files so that records from different sources can be told apart; use the join field as the key and the remaining fields plus the new tag as the value, then emit the pair (see the illustration after this list).
  2. Reduce side: by the time reduce is called, records sharing the same join field have already been grouped together; within each group, separate the records that came from different files (using the tag added in the Map phase) and then merge them.
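
    For example, an order record whose join field (pid) is 01 is emitted as (01, record tagged "order"), and the product record with the same pid as (01, record tagged "pd"); the shuffle then delivers both to the same reduce call, where the product name can be copied onto every order record in the group.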

(2) Reduce join hands-on example

  1. Requirement

    Merge the data in pd.txt into order.txt based on the product pid.

    // order.txt
    //	id		pid amount
        1001	01	1
        1002	02	2
        1003	03	3
        1004	01	4
        1005	02	5
        1006	03	6
    
    // pd.txt
    // 	pid pname
        01	小米
        02	華爲
        03	格力
    
  2. Create the package: com.easysir.reducejoin

  3. Create the TableBean class:

    package com.easysir.reducejoin;
    
    import org.apache.hadoop.io.Writable;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    public class TableBean implements Writable {
    
        private String id;      // order id
        private String pid;     // product id
        private int amount;     // product quantity
        private String pname;   // product name
        private String flag;    // tag marking which table the record came from
    
        public TableBean() {
            super();
        }
    
        public TableBean(String id, String pid, int amount, String pname, String flag) {
            super();
            this.id = id;
            this.pid = pid;
            this.amount = amount;
            this.pname = pname;
            this.flag = flag;
        }
    
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(id);
            out.writeUTF(pid);
            out.writeInt(amount);
            out.writeUTF(pname);
            out.writeUTF(flag);
        }
    
        @Override
        public void readFields(DataInput in) throws IOException {
            id = in.readUTF();
            pid = in.readUTF();
            amount = in.readInt();
            pname = in.readUTF();
            flag = in.readUTF();
        }
    
        public String getId() {
            return id;
        }
    
        public void setId(String id) {
            this.id = id;
        }
    
        public String getPid() {
            return pid;
        }
    
        public void setPid(String pid) {
            this.pid = pid;
        }
    
        public int getAmount() {
            return amount;
        }
    
        public void setAmount(int amount) {
            this.amount = amount;
        }
    
        public String getPname() {
            return pname;
        }
    
        public void setPname(String pname) {
            this.pname = pname;
        }
    
        public String getFlag() {
            return flag;
        }
    
        public void setFlag(String flag) {
            this.flag = flag;
        }
    
        @Override
        public String toString() {
            return id +
                    "\t" + amount +
                    "\t" + pname ;
        }
    }
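
    A minimal, standalone sketch (not part of the original tutorial; the class name TableBeanRoundTrip is only illustrative and it is assumed to sit in the same com.easysir.reducejoin package) that round-trips a TableBean through its Writable methods, showing why the field order in readFields must mirror write:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    
    public class TableBeanRoundTrip {
    
        public static void main(String[] args) throws IOException {
            TableBean in = new TableBean("1001", "01", 1, "", "order");
    
            // Serialize with write(); the bytes are laid out in the order the fields are written
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            in.write(new DataOutputStream(buffer));
    
            // Deserialize with readFields(); it must read the fields back in the same order
            TableBean out = new TableBean();
            out.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
    
            System.out.println(out);    // prints 1001, 1 and an empty pname, per toString()
        }
    }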
    
  4. Create the TableMapper class:

    package com.easysir.reducejoin;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    import java.io.IOException;
    
    public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {
    
        String name;
        TableBean tableBean = new TableBean();
        Text k = new Text();
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            name = fileSplit.getPath().getName();
        }
    
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
    
            // 1 Get one line
            String line = value.toString();
    
            String[] fields = line.split("\t");
    
            // 2 Determine which table the record comes from
            if (name.startsWith("order")){  // order table
                tableBean.setId(fields[0]);
                tableBean.setPid(fields[1]);
                tableBean.setAmount(Integer.parseInt(fields[2]));
                tableBean.setPname("");
                tableBean.setFlag("order");
    
                k.set(fields[1]);
            }else { // product table
                tableBean.setId("");
                tableBean.setPid(fields[0]);
                tableBean.setAmount(0);
                tableBean.setPname(fields[1]);
                tableBean.setFlag("pd");
    
                k.set(fields[0]);
            }
    
            context.write(k, tableBean);
        }
    }
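
    Note that the mapper reuses a single TableBean and a single Text instance across map() calls; this is safe because context.write serializes the key and value immediately, so the same objects can be refilled for the next record.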
    
  5. Create the TableReducer class:

    package com.easysir.reducejoin;
    
    import org.apache.commons.beanutils.BeanUtils;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    import java.lang.reflect.InvocationTargetException;
    import java.util.ArrayList;
    
    public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {
    
        @Override
        protected void reduce(Text key, Iterable<TableBean> values, Context context)
                throws IOException, InterruptedException {
    
            // Collection holding all order records
            ArrayList<TableBean> orderBeans = new ArrayList<>();
            // Holds the product record
            TableBean pdbean = new TableBean();
    
            for (TableBean tableBean : values) {
    
                if ("order".equals(tableBean.getFlag())){
                    TableBean tmpBean = new TableBean();
                    try {
                        BeanUtils.copyProperties(tmpBean, tableBean);
                        orderBeans.add(tmpBean);
                    } catch (IllegalAccessException | InvocationTargetException e) {
                        e.printStackTrace();
                    }
                }else {
                    try {
                        BeanUtils.copyProperties(pdbean, tableBean);
                    } catch (IllegalAccessException | InvocationTargetException e) {
                        e.printStackTrace();
                    }
                }
            }
    
            for (TableBean tableBean : orderBeans){
                tableBean.setPname(pdbean.getPname());
                context.write(tableBean, NullWritable.get());
            }
        }
    }
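
    BeanUtils.copyProperties copies the fields via reflection on every record and requires the commons-beanutils dependency. A hedged alternative sketch, using only the constructor and getters defined in TableBean above, would replace the tmpBean creation and the first try/catch with the following (the pdbean copy can be replaced the same way):

    // Manual field copy: same behaviour, no reflection and no extra dependency
    TableBean tmpBean = new TableBean(
            tableBean.getId(), tableBean.getPid(), tableBean.getAmount(),
            tableBean.getPname(), tableBean.getFlag());
    orderBeans.add(tmpBean);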
    
  6. Create the TableDriver class:

    package com.easysir.reducejoin;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    
    public class TableDriver {
    
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            // Set the input and output paths according to the actual paths on your machine
            args = new String[] { "E:\\idea-workspace\\mrWordCount\\input3", "E:\\idea-workspace\\mrWordCount\\output" };
    
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
    
            job.setJarByClass(TableDriver.class);
    
            job.setMapperClass(TableMapper.class);
            job.setReducerClass(TableReducer.class);
    
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(TableBean.class);
    
            job.setOutputKeyClass(TableBean.class);
            job.setOutputValueClass(NullWritable.class);
    
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
            boolean result = job.waitForCompletion(true);
            System.exit(result ? 0 : 1);
        }
    }
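
    With the sample input above, the reduce join output (each line formatted by TableBean.toString() as id, amount, pname) should look roughly like the following; the order of rows within one pid group is not guaranteed:

    1001	1	小米
    1004	4	小米
    1002	2	華爲
    1005	5	華爲
    1003	3	格力
    1006	6	格力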
    

(3) Summary

  1. Drawback: with this approach the join is completed in the Reduce phase, so the processing pressure on the Reduce side is very high while the computational load on the Map nodes is low; resource utilization is poor, and the Reduce phase is highly prone to data skew.
  2. Solution: merge the data on the Map side instead.

II. Applications of Map join

(1) Map join in detail

  1. When to use it

    Map join is suited to scenarios where one table is very large and the other is very small.

  2. Advantages

    Cache the small table(s) on the Map side and handle the join logic there ahead of time. This adds work to the Map phase, but it relieves the data pressure on the Reduce side and reduces data skew as much as possible.

  3. Concrete steps:

    (1) In the Mapper's setup method, read the cached file into an in-memory collection;

    (2) In the driver, register the file with the distributed cache:

    // Distribute an ordinary file to the nodes where the tasks run
    job.addCacheFile(new URI("file path"));
    

(2) Map join hands-on example

  1. Requirement

    Merge the data in pd.txt into order.txt based on the product pid.

    // order.txt
    //	id		pid amount
        1001	01	1
        1002	02	2
        1003	03	3
        1004	01	4
        1005	02	5
        1006	03	6
    
    // pd.txt
    // 	pid pname
        01	小米
        02	華爲
        03	格力
    
  2. Create the package: com.easysir.mapjoin

  3. Create the DistributedCacheMapper class:

    package com.easysir.mapjoin;
    
    import org.apache.commons.lang.StringUtils;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URI;
    import java.util.HashMap;
    import java.util.Map;
    
    public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
    
        Map<String, String> pdMap = new HashMap<>();
    
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
    
            // 1 Get the cached file
            URI[] cacheFiles = context.getCacheFiles();
            String path = cacheFiles[0].getPath();
    
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
    
            String line;
            while(StringUtils.isNotEmpty(line = reader.readLine())){
    
                // 2 Split the line
                String[] fields = line.split("\t");
    
                // 3 Cache the data into the map
                pdMap.put(fields[0], fields[1]);
            }
    
            // 4 Close the stream
            reader.close();
        }
    
        Text k = new Text();
    
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    
            // 1 Get one line
            String line = value.toString();
    
            // 2 Split the line
            String[] fields = line.split("\t");
    
            // 3 Get the product id
            String pId = fields[1];
    
            // 4 Look up the product name
            String pdName = pdMap.get(pId);
    
            // 5 Concatenate the product name onto the line
            k.set(line + "\t"+ pdName);
    
            // 6 Write out
            context.write(k, NullWritable.get());
        }
    }
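
    One caveat in setup: StringUtils.isNotEmpty(line = reader.readLine()) stops the loop not only at end of file (null) but also at the first blank line. A hedged alternative sketch for that loop, stopping only at end of file:

    String line;
    while ((line = reader.readLine()) != null) {
        // Skip blank lines instead of terminating on the first one
        if (line.trim().isEmpty()) {
            continue;
        }
        String[] fields = line.split("\t");
        pdMap.put(fields[0], fields[1]);
    }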
    
  4. Create the DistributedCacheDriver class:

    package com.easysir.mapjoin;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.net.URI;
    
    public class DistributedCacheDriver {
    
        public static void main(String[] args) throws Exception {
    
            // Set the input and output paths according to the actual paths on your machine
            args = new String[] { "E:\\idea-workspace\\mrWordCount\\input3\\order.txt", "E:\\idea-workspace\\mrWordCount\\output" };
    
            // 1 Get the job instance
            Configuration configuration = new Configuration();
            Job job = Job.getInstance(configuration);
    
            // 2 Set the jar to load by class
            job.setJarByClass(DistributedCacheDriver.class);
    
            // 3 Associate the mapper
            job.setMapperClass(DistributedCacheMapper.class);
    
            // 4 Set the final output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
    
            // 5 Set the input and output paths
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
            // 6 Register the cached file
            job.addCacheFile(new URI("file:///E:/idea-workspace/mrWordCount/input3/pd.txt"));
    
            // 7 A Map-side join needs no Reduce phase, so set the number of reduce tasks to 0
            job.setNumReduceTasks(0);
    
            // 8 Submit the job
            boolean result = job.waitForCompletion(true);
            System.exit(result ? 0 : 1);
        }
    }
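
    Note that the job's input path points only at order.txt; pd.txt is never read as split input, it reaches every map task through the distributed cache file registered in step 6.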
    
  5. Run result:

    1001	01	1	小米
    1002	02	2	華爲
    1003	03	3	格力
    1004	01	4	小米
    1005	02	5	華爲
    1006	03	6	格力
    