MapReduce(分佈式計算模型)---序列化和分區
一、序列化
- 在MapReduce中,要求數據能夠被序列化
- MapReduce的序列化機制默認採用的是AVRO
- MapReduce對AVRO的序列化機制進行了封裝,提供了更簡便的序列化形式 - 實現接口Writable
案例一、創建一個Flow類並爲其實現序列化
package cn.zyj.flow;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class Flow implements Writable{
private String phone;
private String name;
private String addr;
private int flow;
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getAddr() {
return addr;
}
public void setAddr(String addr) {
this.addr = addr;
}
public int getFlow() {
return flow;
}
public void setFlow(int flow) {
this.flow = flow;
}
//反序列化
public void readFields(DataInput in) throws IOException {
// TODO Auto-generated method stub
//按照什麼順序寫的就按照什麼順序讀
this.phone = in.readUTF();
this.addr = in.readUTF();
this.name = in.readUTF();
this.flow = in.readInt();
}
//序列化
//只需要將有必要的屬性來依次寫出即可序列化
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
out.writeUTF(phone);
out.writeUTF(addr);
out.writeUTF(name);
out.writeInt(flow);
}
}
二、分區
- 分區操作是shuffle操作中的一個重要過程,作用就是將map的結果按照規則分發到不同reduce中進行處理,從而按照分區得到多個輸出結果。
- Partitioner是分區的基類,如果需要定製partitioner也需要繼承該類
- HashPartitioner是MapReduce的默認partitioner。計算方法是:which reducer=(key.hashCode() & Integer.MAX_VALUE) % numReduceTasks
- 默認情況下,reduceTask數量爲1
- 很多時候MapReduce自帶的分區規則並不能滿足業務需求,爲了實現特定的效果,需要自己來定義分區規則
- 如果定義了幾個分區,則需要定義對應數量的ReduceTask
案例一、求每個城市中每個人使用的流量
Mapper:
package cn.tedu.partflow;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class PartFlowMapper extends Mapper<LongWritable, Text, Text, Flow> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] arr = value.toString().split(" ");
Flow f = new Flow();
f.setPhone(arr[0]);
f.setAddr(arr[1]);
f.setName(arr[2]);
f.setFlow(Integer.parseInt(arr[3]));
context.write(new Text(f.getName()), f);
}
}
Partitioner:
package cn.tedu.partflow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Routes each record to a reduce task according to the address stored in
 * the {@code Flow} value: {@code "bj"} goes to bucket 0, {@code "sh"} to
 * bucket 1, and everything else (including a missing address) to bucket 2.
 */
public class AddrPartitioner extends Partitioner<Text, Flow> {

    /**
     * Chooses the partition for one map output record.
     *
     * @param key            the map output key (not used for routing)
     * @param value          the record whose address decides the partition
     * @param numReduceTasks number of partitions configured for the job
     * @return a partition index in {@code [0, numReduceTasks)}
     */
    @Override
    public int getPartition(Text key, Flow value, int numReduceTasks) {
        // With zero or one partition there is only one legal answer.
        if (numReduceTasks <= 1) {
            return 0;
        }

        // Route by address. Constant-first equals tolerates a null address.
        String addr = value.getAddr();
        int bucket;
        if ("bj".equals(addr)) {
            bucket = 0;
        } else if ("sh".equals(addr)) {
            bucket = 1;
        } else {
            bucket = 2;
        }

        // Fold into range so a job configured with fewer than three reduce
        // tasks never receives an out-of-range partition index.
        return bucket % numReduceTasks;
    }
}
Reducer:
package cn.tedu.partflow;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class PartFlowReducer extends Reducer<Text, Flow, Text, IntWritable> {
public void reduce(Text key, Iterable<Flow> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (Flow val : values) {
sum += val.getFlow();
}
context.write(key, new IntWritable(sum));
}
}
Driver:
package cn.tedu.partflow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Configures and submits the partitioned-flow job: {@code PartFlowMapper}
 * feeds {@code PartFlowReducer}, with {@code AddrPartitioner} spreading the
 * map output over three reduce tasks.
 */
public class PartFlowDriver {

    /** Input path used when none is given on the command line. */
    private static final String DEFAULT_INPUT = "hdfs://10.42.3.8:9000/txt/flow.txt";

    /** Output path used when none is given on the command line. */
    private static final String DEFAULT_OUTPUT = "hdfs://10.42.3.8:9000/result/partflow";

    /**
     * Builds the job, runs it and waits for it to finish. The process exits
     * with status 1 when the job reports failure.
     *
     * @param args optional: {@code args[0]} overrides the input path and
     *             {@code args[1]} overrides the output path; the defaults
     *             above are used for whichever is absent
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "JobName");

        // Locate the job jar through a class that belongs to this job.
        job.setJarByClass(PartFlowDriver.class);
        job.setMapperClass(PartFlowMapper.class);
        job.setReducerClass(PartFlowReducer.class);

        // Set the partitioner class.
        job.setPartitionerClass(AddrPartitioner.class);
        // Set the number of reduce tasks: one per bucket AddrPartitioner uses.
        job.setNumReduceTasks(3);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Surface a failed job to the caller through the exit status.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}