MapReduce單表關聯

由於書中給出的代碼可讀性比較差,所以自己稍微修改了一下。

此處是child-parent表,要求輸出grandchild-grandparent表

input file:

child parent
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma

package me.river.study.hadoop.mr;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Single-table self-join: reads a child-parent table and writes the derived
 * grandchild-grandparent table.
 *
 * <p>The mapper emits every input relation twice, once keyed by the parent and
 * once keyed by the child, so that for each person the reducer receives both
 * that person's children and that person's parents. The reducer then outputs
 * the cross product of those two groups.
 */
public class STjoin {

	/**
	 * Emits each (child, parent) record under both names, tagged with the side
	 * of the join it belongs to.
	 */
	public static class STMapper extends Mapper<LongWritable, Text, Text, Text> {
		private final Text outKey = new Text();
		private final Text outValue = new Text();

		/**
		 * Parses one input line of the form {@code child parent} and writes two
		 * tagged records. The header line (first token {@code child}) and lines
		 * with fewer than two tokens are skipped.
		 *
		 * @param key     input key supplied by the framework (unused)
		 * @param value   one line of the child-parent table
		 * @param context sink for the tagged records
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// trim + "\\s+" tolerates leading/trailing whitespace and runs of
			// separators, which would otherwise produce empty tokens.
			String[] line = value.toString().trim().split("\\s+");
			// Blank or single-token lines have no line[1]; skip them instead of
			// failing with ArrayIndexOutOfBoundsException.
			if (line.length < 2 || "child".equals(line[0])) {
				return;
			}
			String childname = line[0];
			String parentname = line[1];

			// Left table: keyed by the parent, the value carries one of the key's children.
			outKey.set(parentname);
			outValue.set("1" + "\t" + childname);
			context.write(outKey, outValue);

			// Right table: keyed by the child, the value carries one of the key's parents.
			outKey.set(childname);
			outValue.set("2" + "\t" + parentname);
			context.write(outKey, outValue);
		}
	}

	/**
	 * Joins, for each person, the children (tag {@code 1}) with the parents
	 * (any other tag) and writes every grandchild-grandparent pair.
	 */
	public static class STReducer extends Reducer<Text, Text, Text, Text> {
		private final Text outKey = new Text();
		private final Text outValue = new Text();
		// Instance state rather than a static on the outer class, so the header
		// does not depend on what another reducer object in the same JVM did.
		private boolean headerWritten = false;

		/**
		 * Writes the header row on the first call, then the cross product of the
		 * key's children and the key's parents.
		 *
		 * @param key     the person acting as the middle generation
		 * @param values  tagged names in the form {@code tag<TAB>name}
		 * @param context sink for the grandchild-grandparent pairs
		 */
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			// Table header.
			if (!headerWritten) {
				outKey.set("grandchild");
				outValue.set("grandparent");
				context.write(outKey, outValue);
				headerWritten = true;
			}
			// Collect the key's children (the grandchildren) and the key's parents (the grandparents).
			List<String> grandchildren = new ArrayList<String>();
			List<String> grandparents = new ArrayList<String>();
			for (Text value : values) {
				// Limit 2 keeps the name slot present even if it were empty.
				String[] line = value.toString().split("\t", 2);
				if ("1".equals(line[0])) {
					grandchildren.add(line[1]);
				} else {
					grandparents.add(line[1]);
				}
			}
			// Cross product of grandchildren and grandparents; the loops emit
			// nothing when either side is empty.
			for (String grandchild : grandchildren) {
				for (String grandparent : grandparents) {
					outKey.set(grandchild);
					outValue.set(grandparent);
					context.write(outKey, outValue);
				}
			}
		}
	}

	/**
	 * Configures and runs the job, exiting with 0 on success and 1 on failure.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if the job cannot be set up or run
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.err.println("Usage: STjoin <input path> <output path>");
			System.exit(2);
		}
		Job job = Job.getInstance(new Configuration(), "single table join");
		job.setJarByClass(STjoin.class);
		job.setMapperClass(STMapper.class);
		job.setReducerClass(STReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

輸出結果:

grandchild      grandparent
Tom     Alice
Tom     Jesse
Jone    Alice
Jone    Jesse
Tom     Ben
Tom     Mary
Jone    Ben
Jone    Mary
Philip  Alice
Philip  Jesse
Mark    Alice
Mark    Jesse

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章