Hadoop_day06: Implementing Joins in MapReduce on the Reduce Side and the Map Side

I. Reduce-Side Join

1. Requirements

Product table

id pname category_id price
P0001 小米5 1000 2000
P0002 锤子T1 1000 3000

Order table

id date pid amount
1001 20150710 P0001 2
1002 20150710 P0002 3
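
For reference, joining the two tables on product id (pid = id) should yield one record per order, along the lines of:

pid pname category_id price order_id date amount
P0001 小米5 1000 2000 1001 20150710 2
P0002 锤子T1 1000 3000 1002 20150710 3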

2. Implementation Steps

Use the join key (the product id) as the map output key, so that the records from both tables that satisfy the join condition, each tagged with the name of the file it came from, are sent to the same reduce task; the reducer then stitches the matching records together.

2.1 Define the Mapper

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class ReduceJoinMapper extends Mapper<LongWritable,Text,Text,Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Determine which file this record came from
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String fileName = fileSplit.getPath().getName();
        String[] split = value.toString().split(",");
        if (fileName.equals("product.txt")){
            // Record comes from the product table: key on the product id (column 0)
            context.write(new Text(split[0]), value);
        } else {
            // Record comes from the order table: key on the pid (column 2)
            context.write(new Text(split[2]), value);
        }
    }
}
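
With the sample data above, each record is emitted under its join key, so the shuffle delivers matching product and order lines to the same reduce call (the input files are assumed to be comma-separated, matching the split(",") above):

P0001 -> P0001,小米5,1000,2000    (from product.txt)
P0001 -> 1001,20150710,P0001,2    (from the order file)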

2.2 Define the Reducer

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ReduceJoinReducer extends Reducer<Text,Text,Text,Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        String first = "";
        String second = "";
        ArrayList<String> list = new ArrayList<String>();
        for (Text value : values) {
            // Product lines start with the product id prefix "P" (e.g. P0001);
            // order lines start with a numeric order id
            if (value.toString().startsWith("P")){
                first = value.toString();
            } else {
                list.add(value.toString());
            }
        }
        // Concatenate all order lines that matched this product id
        for (String s : list) {
            second = second + s + "\t";
        }
        context.write(key, new Text(first + "\t" + second));
    }
}
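
For key P0001, this reducer writes the product line first and then every matching order line, tab-separated (note the trailing tab left by the concatenation loop):

P0001	P0001,小米5,1000,2000	1001,20150710,P0001,2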

2.3 Define the Main Class

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JobMain extends Configured implements Tool {

    @Override
    public int run(String[] strings) throws Exception {

        Job job = Job.getInstance(super.getConf(), "reduce_join");

        // Input: directory containing both product.txt and the order file
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("d:\\mapreduce\\reduce_join_in"));

        job.setMapperClass(ReduceJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceJoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("d:\\mapreduce\\reduce_join_out"));

        boolean bl = job.waitForCompletion(true);
        return bl ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
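
Note that the input directory d:\mapreduce\reduce_join_in is expected to contain both product.txt and the order file, since the mapper distinguishes the two tables by file name.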

II. Map-Side Join

1. Overview

This approach suits joins where one of the tables is small.

With the distributed cache, the small table can be shipped to every map node. Each map task then joins the big-table records it reads against the small table locally and emits the final result directly, which greatly increases the parallelism of the join and speeds up processing.
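
The distributed-cache API has two halves, both shown in full in the code below: the driver registers the small file with the job, and each map task reads it back in setup(). A minimal sketch:

// Driver side: ship the small table to every map task
job.addCacheFile(new URI("hdfs://node01:8020/cache_file/product.txt"));

// Mapper side, inside setup(): locate the cached file
URI[] cacheFiles = context.getCacheFiles();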

2. Implementation Steps

Load the small table inside the mapper class ahead of time, then perform the join there.

This mirrors the usual real-world solution: load the small table once up front, either from a database or via the distributed cache.

2.1 Define the Mapper

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MapJoinMapper extends Mapper<LongWritable,Text,Text,Text> {

    private HashMap<String,String> map = new HashMap<String, String>();

    // Step 1: read the small table from the distributed cache into an
    // in-memory map (done once per map task)
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the list of distributed cache files
        URI[] cacheFiles = context.getCacheFiles();

        // Get the file system that holds the cached file, using the job configuration
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());

        // Open an input stream on the file
        FSDataInputStream inputStream = fileSystem.open(new Path(cacheFiles[0]));

        // Wrap the byte stream in a buffered character reader
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));

        // Read the small table line by line, keyed by product id
        String line = null;
        while((line = bufferedReader.readLine()) != null){
            map.put(line.split(",")[0], line);
        }

        // Close the reader; the FileSystem instance is cached and shared by
        // Hadoop, so it is not closed here
        bufferedReader.close();
    }

    // Step 2: process the big table and join each record against the small table
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String productId = value.toString().split(",")[2]; // K2

        String productLine = map.get(productId);
        if (productLine == null) {
            // No matching product in the small table: skip this order record
            return;
        }
        String valueLine = productLine + "\t" + value.toString(); // V2

        context.write(new Text(productId), new Text(valueLine));
    }
}
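
For an order line 1001,20150710,P0001,2, the lookup finds the cached product line and the mapper emits the joined record directly:

P0001 -> P0001,小米5,1000,2000	1001,20150710,P0001,2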

2.2 Define the Main Class

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JobMain extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {

        Job job = Job.getInstance(super.getConf(), "map_join");

        // Register the small table in the distributed cache
        // (the older DistributedCache.addCacheFile API is deprecated)
        job.addCacheFile(new URI("hdfs://node01:8020/cache_file/product.txt"));

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("d:\\mapreduce\\map_join_in"));

        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // The join is finished on the map side, so no reduce phase is needed
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("d:\\mapreduce\\map_join_out"));

        boolean bl = job.waitForCompletion(true);
        return bl ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
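
Because the join is completed entirely in the mappers, the job runs with zero reduce tasks and the map output is written straight to the output directory, skipping the shuffle and reduce phase altogether.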
