hadoop09 -- Map-side and reduce-side joins, data compression

Map-side join implementation

How it works

Suitable when one of the tables in the join is small:
the small table can be distributed to every map node, so each map task can join the large-table records it reads against the small table locally
and emit the final result. This greatly increases the parallelism of the join and speeds up processing.

Implementation notes

-- Pre-load the small table inside the mapper class and perform the join there
-- In a real scenario the small table would be loaded once from a database, or shipped to the tasks with the distributed cache

Summary

Applicable scenario

Joining a large table with a small table

Implementation steps (a minimal driver sketch follows below):

a. Stage the small table in an HDFS directory beforehand
b. In the driver's main method, call job.addCacheFile() to distribute it to each map task's working directory; also set the number of reduce tasks to 0
c. In the mapper's setup method, read the small-table file into memory with the local file API
d. In the map method, look up each input record in the in-memory table and emit the joined record
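
A minimal sketch of the driver side described in steps a-d, using the CacheMap mapper shown below; the class name CacheJoinDriver and the use of args[2] for the small-table URI are assumptions for illustration:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CacheJoinDriver {
	public static void main(String[] args) throws Exception {
		Job job = Job.getInstance(new Configuration());

		job.setJarByClass(CacheJoinDriver.class);
		job.setMapperClass(CacheMap.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);

		// map-only join: no reduce phase is needed
		job.setNumReduceTasks(0);

		// ship the small table; at run time it appears in each map task's working
		// directory under its plain file name, so setup() can open it with the local file API
		job.addCacheFile(new URI(args[2]));	// e.g. an hdfs:// URI pointing at product.txt

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}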

Code

Caching the small table (map-side join mapper)

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CacheMap extends Mapper<LongWritable, Text, Text, NullWritable> {

	// HashMap that caches the small table: product id -> product name
	Map<String, String> pMap = new HashMap<>();
	Text k = new Text();

	@Override
	protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context)
			throws IOException, InterruptedException {
		// 1. Open the cached small-table file.
		// A local path is used here for testing; when the file is distributed with
		// job.addCacheFile(), it can be opened under its plain file name ("product.txt")
		// in the task's working directory.
		BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream("C:\\Users\\55454_000\\Desktop\\product.txt"), "UTF-8"));

		String line = null;

		// read until end of file or a blank line (isNotEmpty is false for both null and "")
		while (StringUtils.isNotEmpty(line = reader.readLine())) {
			// split the line
			String[] fields = line.split(",");

			// cache product id -> product name
			pMap.put(fields[0], fields[1]);

		}

		// close the reader
		reader.close();
	}

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// get one line of the order (large) table
		String line = value.toString();

		// split into fields
		String[] fields = line.split(",");

		// order id
		String id = fields[0];

		// product id (the join key)
		String pid = fields[2];

		// look up the product name in the cached small table
		String pName = pMap.get(pid);

		// join: append the product name to the original line
		k.set(line + "\t" + pName);

		// emit the joined record
		context.write(k, NullWritable.get());
	}

}

Driver (main class)
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
	public static void main(String[] args)
			throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(Driver.class);
		job.setMapperClass(CacheMap.class);
		// job.setReducerClass(InputFormatReduce.class);

		// job.setInputFormatClass(MyFileInputFormat.class);
		// ship the small table to each map task's working directory (enable when running on a cluster)
		//job.addCacheFile(new URI("file:///C:/Users/55454_000/Desktop/product.txt"));
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);

		// map-only join: no reduce phase
		job.setNumReduceTasks(0);

		// compress the intermediate map output; set this on the job's own configuration
		// (or on conf before Job.getInstance) so that it actually takes effect
		job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
		job.getConfiguration().setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
		
		FileOutputFormat.setCompressOutput(job, true);
		FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
		// job.setOutputKeyClass(Text.class);
		// job.setOutputValueClass(BytesWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);

	}
}

Reduce-side join implementation

The mapper tags every record with the table it came from and emits the product id as the key; for each product id the reducer then receives one product record plus all of its order records and joins them.

Implementation

Custom data type (the join bean)
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class InfobeanWritable implements Writable {

	private int order_id; // order id
	private String date; // order date
	private String pid; // product id (join key)
	private int amount; // order quantity
	private String name; // product name
	private String category_id; // product category
	private double price; // product price
	private String flag; // source marker: "0" = order record, "1" = product record

	// no-arg constructor (required by Hadoop for deserialization)
	public InfobeanWritable() {

	}

	// all-args constructor
	public InfobeanWritable(int order_id, String date, String pid, int amount, String name, String category_id,
			double price, String flag) {
		this.setInfobeanWritable(order_id, date, pid, amount, name, category_id, price, flag);
	}

	@Override
	public String toString() {
		return "InfobeanWritable [order_id=" + order_id + ", date=" + date + ", pid=" + pid + ", amount=" + amount
				+ ", name=" + name + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag + "]";
	}

	public void setInfobeanWritable(int order_id, String date, String pid, int amount, String name, String category_id,
			double price, String flag) {
		this.order_id = order_id;
		this.date = date;
		this.pid = pid;
		this.amount = amount;
		this.name = name;
		this.category_id = category_id;
		this.price = price;
		this.flag = flag;
	}

	public int getOrder_id() {
		return order_id;
	}

	public void setOrder_id(int order_id) {
		this.order_id = order_id;
	}

	public String getDate() {
		return date;
	}

	public void setDate(String date) {
		this.date = date;
	}

	public String getPid() {
		return pid;
	}

	public void setPid(String pid) {
		this.pid = pid;
	}

	public int getAmount() {
		return amount;
	}

	public void setAmount(int amount) {
		this.amount = amount;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public String getCategory_id() {
		return category_id;
	}

	public void setCategory_id(String category_id) {
		this.category_id = category_id;
	}

	public double getPrice() {
		return price;
	}

	public void setPrice(double price) {
		this.price = price;
	}

	public String getFlag() {
		return flag;
	}

	public void setFlag(String flag) {
		this.flag = flag;
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		// deserialize the fields in exactly the order write() serializes them

		this.order_id = in.readInt();
		this.date = in.readUTF();
		this.pid = in.readUTF();
		this.amount = in.readInt();
		this.name = in.readUTF();
		this.category_id = in.readUTF();
		this.price = in.readDouble();
		this.flag = in.readUTF();

	}

	@Override
	public void write(DataOutput out) throws IOException {
		// serialize the fields; the order must match readFields()

		out.writeInt(this.order_id);
		out.writeUTF(this.date);
		out.writeUTF(this.pid);
		out.writeInt(this.amount);
		out.writeUTF(this.name);
		out.writeUTF(this.category_id);
		out.writeDouble(this.price);
		out.writeUTF(this.flag);
	}

}
Mapper (tags each record with its source table)
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class MapJoin extends Mapper<LongWritable, Text, Text, InfobeanWritable> {

	Text outputkey = new Text();
	InfobeanWritable infobean = new InfobeanWritable();

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// 1. get one line of input
		String line = value.toString();

		// 2. get the name of the file this split comes from
		FileSplit fileSplit = (FileSplit) context.getInputSplit();

		String fileName = fileSplit.getPath().getName();

		// split into fields
		String[] fields = line.split(",");

		String pid;

		// decide which table the record belongs to from the file name: names starting with "order" are order records
		if (fileName.startsWith("order")) {
			int order_id = Integer.valueOf(fields[0]);
			String date = fields[1];
			pid = fields[2];
			int amount = Integer.valueOf(fields[3]);
			// flag "0" marks an order record; the product fields are left empty
			infobean.setInfobeanWritable(order_id, date, pid, amount, "", "", 0, "0");

		} else {
			pid = fields[0];
			String name = fields[1];
			String category_id = fields[2];
			double price = Double.valueOf(fields[3]);
			// flag "1" marks a product record; the order fields are left empty
			infobean.setInfobeanWritable(0, "", pid, 0, name, category_id, price, "1");
		}

		outputkey.set(pid);
		context.write(outputkey, infobean);

	}
}
Reducer (performs the actual join)
import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class JoinReduce extends Reducer<Text, InfobeanWritable, InfobeanWritable, NullWritable> {

	@Override
	protected void reduce(Text key, Iterable<InfobeanWritable> values, Context context)
			throws IOException, InterruptedException {

		// all order records that share this product id; there is exactly one matching product record
		ArrayList<InfobeanWritable> orderlist = new ArrayList<>();
		InfobeanWritable pdBean = new InfobeanWritable();

		for (InfobeanWritable value : values) {
			// flag "1" = product record: copy it (the framework reuses the value object across iterations)
			if ("1".equals(value.getFlag())) {
				try {
					BeanUtils.copyProperties(pdBean, value);
				} catch (Exception e) {
					e.printStackTrace();
				}
			} else {
				// flag "0" = order record: copy it out of the reused value object and collect it
				InfobeanWritable odBean = new InfobeanWritable();
				try {
					BeanUtils.copyProperties(odBean, value);
				} catch (Exception e) {
					e.printStackTrace();
				}

				orderlist.add(odBean);
			}

		}

		// complete each order record with the product fields and emit the joined result
		for (InfobeanWritable bean : orderlist) {
			bean.setName(pdBean.getName());
			bean.setCategory_id(pdBean.getCategory_id());
			bean.setPrice(pdBean.getPrice());

			context.write(bean, NullWritable.get());
		}
	}
}
Driver (main class)
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(Driver.class);
		job.setMapperClass(MapJoin.class);
		job.setReducerClass(JoinReduce.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(InfobeanWritable.class);

		job.setOutputKeyClass(InfobeanWritable.class);
		job.setOutputValueClass(NullWritable.class);
		// job.setOutputFormatClass(FilteroutputFormat.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		boolean res = job.waitForCompletion(true);

		System.exit(res ? 0 : 1);
	}
}

Data compression

Purpose

Effectively reduces disk space usage and I/O bandwidth

Commonly used compression formats

Format   Splittable   Decompression
gzip     no           no extra handling needed
Bzip2    yes          no extra handling needed
Snappy   no           no extra handling needed
Snappy characteristics

Snappy has to be installed separately (e.g. when used with Hive)
Snappy is the fastest of the three
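
Because it is the fastest, Snappy is a common choice for compressing intermediate map output. A minimal sketch, assuming the Snappy native library is available on the cluster:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;

public class SnappyMapOutputExample {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		// compress the intermediate map output with Snappy
		conf.setBoolean("mapreduce.map.output.compress", true);
		conf.setClass("mapreduce.map.output.compress.codec", SnappyCodec.class, CompressionCodec.class);

		// the settings must be in place before the Job copies the configuration
		Job job = Job.getInstance(conf);
		// ... set mapper/reducer classes and input/output paths as in the drivers above
	}
}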

When to use compression

Compression pays off when jobs are not computation-heavy but large volumes of data have to be stored or transferred.

Where compression can be applied

1. Input stage: nothing extra needs to be configured; the input format recognizes compressed files (.gz, .bz2, ...) by their extension and decompresses them automatically (a sketch for reading such a file by hand follows)
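
A minimal sketch of reading a compressed file outside of a MapReduce job, using Hadoop's codec factory; the argument path is an assumption for illustration:

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class ReadCompressedFile {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Path path = new Path(args[0]);	// e.g. /input/orders.txt.bz2

		// resolve the codec from the file extension (.gz, .bz2, ...); null means the file is uncompressed
		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
		CompressionCodec codec = factory.getCodec(path);

		FileSystem fs = path.getFileSystem(conf);
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(
				codec == null ? fs.open(path) : codec.createInputStream(fs.open(path))))) {
			String line;
			while ((line = reader.readLine()) != null) {
				System.out.println(line);
			}
		}
	}
}
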
2. Map output stage
// enable map output compression in the driver class (set before Job.getInstance so the job picks it up)
config.setBoolean("mapreduce.map.output.compress", true);

// choose the compression codec for the map output
config.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
3. Reduce output stage
// enable compression of the final (reduce) output
FileOutputFormat.setCompressOutput(job, true);

// choose the output compression codec
FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);