Quick Start with MapReduce ④: Implementing the Reduce-Side Join and the Map-Side Join

Contents

1. Requirements

2. Create the join bean

3. Implement the map join

    3.1 Create the mapper

    3.2 Implement the driver

4. Implement the reduce join

    4.1 Create the mapper

    4.2 Create the reducer

    4.3 Driver for the reduce join

5. Required resources and execution results


1. Requirements

Join the orders table with the products table on the product id.

Orders table (sample data in orders.txt, section 5): order id, order date, product id, quantity

Products table (sample data in product.txt, section 5): product id, product name, category id, price

Implementation mechanism:

The join condition (the product id) is used as the map output key, and each record that satisfies the join condition carries a tag identifying the file it came from. Rows from both tables therefore reach the same reduce task, where the actual joining is performed.
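To make the mechanism concrete, this is roughly how the sample data from section 5 is grouped after the shuffle when the product id is used as the map output key (which fields are populated tells the reducer where each value came from):

key P0001 -> { order 1001,20150710,P0001,2 ; order 1002,20150710,P0001,3 ; product P0001,小米5,1000,2000 }
key P0002 -> { order 1002,20150710,P0002,3 ; product P0002,錘子T1,1000,3000 }

The reduce call for each key then combines the order records with the single product record.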

2. Create the join bean

package com.czxy.order;

import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

@Data
@NoArgsConstructor
public class JoinBean implements Writable {
    private String id; // order id
    private String date; // order date
    private String pid; // product id
    private String amount; // quantity ordered
    private String name; // product name
    private String categoryId; // category id
    private String price; // price

    @Override
    public void write(DataOutput out) throws IOException {
        // appending "" turns null fields into the literal string "null",
        // which the reduce-side join below relies on to tell order records from product records
        out.writeUTF(id + "");
        out.writeUTF(date + "");
        out.writeUTF(pid + "");
        out.writeUTF(amount + "");
        out.writeUTF(name + "");
        out.writeUTF(categoryId + "");
        out.writeUTF(price + "");
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.id = in.readUTF();
        this.date = in.readUTF();
        this.pid = in.readUTF();
        this.amount = in.readUTF();
        this.name = in.readUTF();
        this.categoryId = in.readUTF();
        this.price = in.readUTF();
    }


}
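The map-side join mapper below writes joinBean.toString() as its output value. With Lombok's @Data the generated toString produces the "JoinBean(id=..., date=..., ...)" form; if a plain comma-separated line is preferred instead, a toString override along these lines could be added to the bean (a minimal sketch, not part of the original code):

    @Override
    public String toString() {
        // emit the fields in file order, separated by commas
        return id + "," + date + "," + pid + "," + amount + "," + name + "," + categoryId + "," + price;
    }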

3. Implement the map join

Note: the product table data must be uploaded to HDFS first (the mapper loads it through the distributed cache).

    3.1 Create the mapper

package com.czxy.order;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class MapJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    // product records keyed by product id, loaded once from the distributed cache in setup()
    private Map<String, JoinBean> joinMap = new HashMap<>();


    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // get the URIs of all cached files
        URI[] urls = DistributedCache.getCacheFiles(context.getConfiguration());
        // get a file system handle for the cached file
        FileSystem fs = FileSystem.get(urls[0], context.getConfiguration());
        // open the product file
        FSDataInputStream open = fs.open(new Path(urls[0]));
        // wrap it in a buffered reader
        BufferedReader bf = new BufferedReader(new InputStreamReader(open));
        // read line by line
        String line = "";
        while ((line = bf.readLine()) != null) {
            // create a bean for this product
            JoinBean joinBean = new JoinBean();
            // split the line into fields
            String[] split = line.split(",");
            // populate the bean
            joinBean.setPid(split[0]);
            joinBean.setName(split[1]);
            joinBean.setCategoryId(split[2]);
            joinBean.setPrice(split[3]);
            // cache the bean by product id
            joinMap.put(joinBean.getPid(), joinBean);
        }
        // release resources
        bf.close();
        open.close();
        fs.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // convert the line to a string
        String s = value.toString();
        // split the order line into fields
        String[] split = s.split(",");
        // look up the cached product record by product id
        JoinBean joinBean = joinMap.get(split[2]);
        if (joinBean == null) {
            // if the product file starts with a UTF-8 BOM, the first product was cached
            // under a BOM-prefixed key, so retry the lookup with that prefix
            joinBean = joinMap.get("\uFEFF" + split[2]);
        }
        // add the order fields to the product record
        joinBean.setId(split[0]);
        joinBean.setDate(split[1]);
        joinBean.setPid(split[2]);
        joinBean.setAmount(split[3]);

        // emit the joined record keyed by product id
        context.write(new Text(split[2]), new Text(joinBean.toString()));
    }
}
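DistributedCache is deprecated in Hadoop 2.x and later. The same setup() logic can be expressed with the newer cache API; a minimal sketch, assuming the driver registers the file with job.addCacheFile(...) instead of DistributedCache.addCacheFile(...):

    // inside setup(Context context)
    URI[] cacheFiles = context.getCacheFiles(); // replaces DistributedCache.getCacheFiles(conf)
    FileSystem fs = FileSystem.get(cacheFiles[0], context.getConfiguration());
    try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(cacheFiles[0]))))) {
        String line;
        while ((line = br.readLine()) != null) {
            // parse and cache each product line exactly as above
        }
    }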

    3.2 Implement the driver

package com.czxy.order;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class MapJoinDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // register the product file on HDFS as a distributed cache file
        DistributedCache.addCacheFile(new URI("hdfs://192.168.100.201:8020/input/product.txt"), configuration);
        // get the job
        Job job = Job.getInstance(configuration);
        // allow the job to be run from a jar
        job.setJarByClass(MapJoinDriver.class);
        // set the mapper to run
        job.setMapperClass(MapJoinMapper.class);
        // set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // set the map output value type
        job.setMapOutputValueClass(Text.class);

        // set the input format and path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("./data/join/orders.txt"));
        // set the output format and path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("./outPut/join/map"));
        // submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new MapJoinDriver(), args);
    }
}
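Two small, optional adjustments to this driver (a sketch, not the original code): a pure map-side join needs no reduce phase, so it can be switched off, and the status returned by ToolRunner.run can be propagated as the process exit code.

    // in run(), before waitForCompletion: skip the reduce phase entirely
    job.setNumReduceTasks(0);

    // in main(), pass the job status back to the caller
    public static void main(String[] args) throws Exception {
        int status = ToolRunner.run(new MapJoinDriver(), args);
        System.exit(status);
    }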

4. Implement the reduce join

    4.1 Create the mapper

package com.czxy.order;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ReduceJoinMap extends Mapper<LongWritable, Text, Text, JoinBean> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        JoinBean joinBean = new JoinBean();
        // convert the line to a string
        String s = value.toString();
        // split the line into fields
        String[] split = s.split(",");
        // find out which file this record comes from
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String fileName = fileSplit.getPath().getName();
        System.out.println(fileName);
        // records from the orders file carry order fields, all others carry product fields
        if (fileName.contains("orders")) {
            joinBean.setId(split[0]);
            joinBean.setDate(split[1]);
            joinBean.setPid(split[2]);
            joinBean.setAmount(split[3]);
        } else {
            joinBean.setPid(split[0]);
            joinBean.setName(split[1]);
            joinBean.setCategoryId(split[2]);
            joinBean.setPrice(split[3]);
        }

        // emit the record keyed by product id
        context.write(new Text(joinBean.getPid()), joinBean);
    }
}
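An alternative to checking the file name inside a single mapper is to bind each input file to its own mapper class with MultipleInputs. A minimal sketch of the driver-side wiring (OrderMapper and ProductMapper are hypothetical classes, both emitting <Text pid, JoinBean>; they are not part of this article's code):

    import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;

    // each input path gets a dedicated mapper, so no file-name check is needed
    MultipleInputs.addInputPath(job, new Path("./data/join/orders.txt"),
            TextInputFormat.class, OrderMapper.class);
    MultipleInputs.addInputPath(job, new Path("./data/join/product.txt"),
            TextInputFormat.class, ProductMapper.class);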

    4.2 Create the reducer

package com.czxy.order;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class ReduceJoinReduce extends Reducer<Text, JoinBean, Text, JoinBean> {
    @Override
    protected void reduce(Text key, Iterable<JoinBean> values, Context context) throws IOException, InterruptedException {
        // bean that accumulates the merged order and product fields
        JoinBean joinBean = new JoinBean();
        // flags marking whether the order side and the product side have been seen
        boolean tab1 = false;
        boolean tab2 = false;
        for (JoinBean value : values) {
            // order records have no product name (a serialized null arrives as the string "null")
            if (value.getName() == null || value.getName().equals("null")) {
                joinBean.setId(value.getId());
                joinBean.setDate(value.getDate());
                joinBean.setPid(value.getPid());
                joinBean.setAmount(value.getAmount());
                // mark the order side as filled
                tab1 = true;
            } else {
                joinBean.setPid(value.getPid());
                joinBean.setName(value.getName());
                joinBean.setCategoryId(value.getCategoryId());
                joinBean.setPrice(value.getPrice());
                // mark the product side as filled
                tab2 = true;
            }
            if (tab1 && tab2) {
                // both sides are present, emit the joined record
                context.write(key, joinBean);
            }
        }
    }
}
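Note that this reducer reuses a single JoinBean and writes it as soon as both flags are set, so when several orders reference the same product (as P0001 does in the sample data) the output depends on the order in which values arrive and some orders can be lost or overwritten. If every order should produce one joined row, the order-side records can be buffered and combined with the product record after the loop; a minimal sketch of that variant (assumes java.util.List and java.util.ArrayList are imported; the copying is needed because Hadoop reuses the value object between iterations):

    // collect all order records and the single product record, then emit one row per order
    List<JoinBean> orders = new ArrayList<>();
    JoinBean product = null;
    for (JoinBean value : values) {
        if (value.getName() == null || value.getName().equals("null")) {
            JoinBean order = new JoinBean(); // copy, because the framework reuses 'value'
            order.setId(value.getId());
            order.setDate(value.getDate());
            order.setPid(value.getPid());
            order.setAmount(value.getAmount());
            orders.add(order);
        } else {
            product = new JoinBean();
            product.setPid(value.getPid());
            product.setName(value.getName());
            product.setCategoryId(value.getCategoryId());
            product.setPrice(value.getPrice());
        }
    }
    if (product != null) {
        for (JoinBean order : orders) {
            order.setName(product.getName());
            order.setCategoryId(product.getCategoryId());
            order.setPrice(product.getPrice());
            context.write(key, order);
        }
    }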

    4.3 Driver for the reduce join

package com.czxy.order;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class ReduceJoinDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // get the job
        Job job = Job.getInstance(new Configuration());
        // allow the job to be run from a jar
        job.setJarByClass(ReduceJoinDriver.class);
        // set the mapper to run
        job.setMapperClass(ReduceJoinMap.class);
        // set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // set the map output value type
        job.setMapOutputValueClass(JoinBean.class);
        // set the reducer to run
        job.setReducerClass(ReduceJoinReduce.class);
        // set the reduce output key type
        job.setOutputKeyClass(Text.class);
        // set the reduce output value type
        job.setOutputValueClass(JoinBean.class);
        // set the input format and path (both orders.txt and product.txt live in this directory)
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("./data/join/"));
        // set the output format and path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("./outPut/join/reduce"));
        // submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new ReduceJoinDriver(), args);
    }
}

5. Required resources and execution results

    orders.txt

1001,20150710,P0001,2
1002,20150710,P0001,3
1002,20150710,P0002,3

    product.txt

P0001,小米5,1000,2000
P0002,錘子T1,1000,3000

