MapReduce程序之實現單表關聯

原創

田小雨

2020-06-16 02:54

設計思路

分析這個實例，顯然需要進行單表連接，連接的是左表的parent列和右表的child列，且左表和右表是同一個表。

　　連接結果中除去連接的兩列就是所需要的結果——"grandchild--grandparent"表。要用MapReduce解決這個實例，首先應該考慮如何實現表的自連接；其次就是連接列的設置；最後是結果的整理。

考慮到MapReduce的shuffle過程會將相同的key連接在一起，所以可以將map結果的key設置成待連接的列，然後列中相同的值就自然會連接在一起了。再與最開始的分析聯繫起來：

　　要連接的是左表的parent列和右表的child列，且左表和右表是同一個表，所以在map階段將讀入數據分割成child和parent之後，會將parent設置成key，child設置成value進行輸出，並作爲左表；再將同一對child和parent中的child設置成key，parent設置成value進行輸出，作爲右表。爲了區分輸出中的左右表，需要在輸出的value中再加上左右表的信息，比如在value的String最開始處加上字符1表示左表，加上字符2表示右表。這樣在map的結果中就形成了左表和右表，然後在shuffle過程中完成連接。reduce接收到連接的結果，其中每個key的value-list就包含了"grandchild--grandparent"關係。取出每個key的value-list進行解析，將左表中的child放入一個數組，右表中的parent放入一個數組，然後對兩個數組求笛卡爾積就是最後的結果了。

程序代碼

import java.io.IOException;

import java.util.*;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

public class STjoin {

public static int time = 0;

* map將輸出分割child和parent，然後正序輸出一次作爲右表，

* 反序輸出一次作爲左表，需要注意的是在輸出的value中必須

* 加上左右表的區別標識。

public static class Map extends Mapper<Object, Text, Text, Text> {

// 實現map函數

public void map(Object key, Text value, Context context)

throws IOException, InterruptedException {

String line=value.toString();

String[] strs= line.split("\t");

context.write(new Text(strs[1]),new Text("1+"+strs[0]));//輸出左表

context.write(new Text(strs[0]),new Text("2+"+strs[1]));//輸出右表

}

public static class Reduce extends Reducer<Text, Text, Text, Text> {

// 實現reduce函數

public void reduce(Text key, Iterable<Text> values, Context context)

throws IOException, InterruptedException {

// 輸出表頭

if (0 == time) {

context.write(new Text("grandchild"), new Text("grandparent"));

time++;

}

String[] grandchild = null;

int grandchildnum = 0;

String[] grandparent = null;

int grandparentnum = 0;

Iterator iter = values.iterator();

while (iter.hasNext()) {

String record = ite.next().toString();

String[] st=record.split("+");

if(st[0]==1){

grandchild[grandchildnum ] =st[1];

grandchildnum ++;

} else if(st[0]==2){

grandparent [grandparentnum ]=st[1];

grandparentnum ++;

}

// grandchild和grandparent數組求笛卡爾兒積

if (0 != grandchildnum && 0 != grandparentnum) {

for (int m = 0; m < grandchildnum; m++) {

for (int n = 0; n < grandparentnum; n++) {

// 輸出結果

context.write(new Text(grandchild[m]), new Text(grandparent[n]));

}

public static void main(String[] args) throws Exception {

Configuration conf = new Configuration();

// 這句話很關鍵

conf.set("mapred.job.tracker", "192.168.1.2:9001");

String[] ioArgs = new String[] { "STjoin_in", "STjoin_out" };

String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();

if (otherArgs.length != 2) {

System.err.println("Usage: Single Table Join <in> <out>");

System.exit(2);

}

Job job = new Job(conf, "Single Table Join");

job.setJarByClass(STjoin.class);

// 設置Map和Reduce處理類

job.setMapperClass(Map.class);

job.setReducerClass(Reduce.class);

// 設置輸出類型

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(Text.class);

// 設置輸入和輸出目錄

FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

System.exit(job.waitForCompletion(true) ? 0 : 1);

}

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

MapReduce程序之實現單表關聯

設計思路

程序代碼

京東面試：如何進行JVM調優？

美團一面：項目中有 10000 個 if else 如何優化？想了半天，被問懵了！

Python 將PowerPoint (PPT/PPTX) 轉爲HTML

SQL優化-20231016

Spark學習筆記之淺釋

Hive控制Reduce個數

MapReduce編程之數據去重

MapReduce編程之倒排索引

Linux、hive、sqoop常用腳本

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結