Custom data types (serialization)
Custom data types
Java type | Hadoop type |
---|---|
boolean | BooleanWritable |
byte | ByteWritable |
int | IntWritable |
float | FloatWritable |
long | LongWritable |
double | DoubleWritable |
String | Text |
map | MapWritable |
array | ArrayWritable |
Hadoop ships implementations for all of the types above, which satisfy basic development needs, but some requirements (such as carrying several fields in a single value) are not covered.
Rules for custom data types
- Must implement the **Writable** interface
- Must provide a no-argument constructor, because deserialization creates instances via reflection, which invokes the no-argument constructor by default (see the sketch below)
- A type can serve as a key or as a **value**; if the custom type is used as a key, it must implement the **WritableComparable** interface
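The second rule is easiest to see from what the framework does during deserialization: it creates an empty instance reflectively and then calls readFields() on it. A rough, simplified sketch of that step (the class ReflectionDemo is illustrative and uses the FlowWritable defined below; this is not the framework's exact code path):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class ReflectionDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // ReflectionUtils.newInstance invokes the no-argument constructor;
        // a Writable without one fails here at runtime.
        Writable w = ReflectionUtils.newInstance(FlowWritable.class, conf);
        System.out.println(w.getClass().getName());
    }
}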
Example 1
A data file has records in the following format:
13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27
and the desired output format is:
phone number	upstream flow	downstream flow	total flow
13726230503 2481 24681 53456
Produce output in the format above using Hadoop's data-type (serialization) mechanism.
- Analysis: define a custom data type that holds both the upstream and the downstream traffic.
The custom data type FlowWritable
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class FlowWritable implements Writable {
    private long upFlow;   // upstream traffic
    private long downFlow; // downstream traffic

    public long getUpFlow() {
        return upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    // Convenience setter used by the mapper and reducer below
    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
    }

    /**
     * Serialization: write the fields out.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
    }

    /**
     * Deserialization: read the fields back in the same order they were written.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
    }

    // Determines how the value appears in the job's text output
    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + (upFlow + downFlow);
    }
}
Implement the map method
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowMap extends Mapper<LongWritable, Text, Text, FlowWritable> {
    private Text outputKey = new Text();
    private FlowWritable outputValue = new FlowWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Input line, e.g.:
        // 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
        // 1. Get the raw line
        String line = value.toString();
        // 2. Split it on tabs
        String[] fields = line.split("\t");
        // 3. The phone number is the first field
        String phoneNumber = fields[0];
        // 4. Upstream and downstream traffic, counted from the end of the line
        long upFlowValue = Long.valueOf(fields[fields.length - 3]);
        long downFlowValue = Long.valueOf(fields[fields.length - 2]);
        // 5. Pack phone number, upstream and downstream traffic into the output types
        outputKey.set(phoneNumber);
        outputValue.set(upFlowValue, downFlowValue);
        context.write(outputKey, outputValue);
    }
}
Implement the reduce method
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowReduce extends Reducer<Text, FlowWritable, Text, FlowWritable> {
    private FlowWritable outputValue = new FlowWritable();

    @Override
    protected void reduce(Text key, Iterable<FlowWritable> values,
            Context context) throws IOException, InterruptedException {
        long totalUpFlow = 0;
        long totalDownFlow = 0;
        // Accumulate the upstream and downstream traffic for the same phone number
        for (FlowWritable value : values) {
            totalUpFlow += value.getUpFlow();
            totalDownFlow += value.getDownFlow();
        }
        outputValue.set(totalUpFlow, totalDownFlow);
        context.write(key, outputValue);
    }
}
The main class Driver
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Load the configuration
        Configuration config = new Configuration();
        // 2. Create the Job and hand it the configuration
        Job job = Job.getInstance(config);
        // 3. Locate the jar via the driver class
        job.setJarByClass(Driver.class);
        // 4. Set the map and reduce classes
        job.setMapperClass(FlowMap.class);
        job.setReducerClass(FlowReduce.class);
        // 5. Set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowWritable.class);
        // 6. Set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowWritable.class);
        // 7. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 8. Submit and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
- Note: fields must be deserialized in exactly the same order in which they were serialized, as the round-trip sketch below shows.
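The order requirement is easy to verify off-cluster with a round-trip through a byte buffer. A minimal sketch (the class name SerializationOrderDemo is illustrative; it assumes the FlowWritable above is on the classpath): write() emits upFlow first and then downFlow, so readFields() must consume the two longs in that same order; swapping the two reads would silently swap the fields.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class SerializationOrderDemo {
    public static void main(String[] args) throws Exception {
        FlowWritable original = new FlowWritable();
        original.set(2481, 24681);

        // Serialize into an in-memory buffer
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Deserialize: the two longs come back in write order
        FlowWritable copy = new FlowWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        System.out.println(copy); // 2481	24681	27162
    }
}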
Input splits for Map
- The number of map tasks is determined by the number of input splits
- Each split is processed by one MapTask
- By default the split size equals the HDFS block size (blocksize); it can be changed, as the sketch below shows
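A hedged sketch of overriding the defaults in a driver (the helper class and the 32/64 MB figures are illustrative values, not recommendations; the two FileInputFormat setters are part of the new mapreduce API):

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeDemo {
    // Call on the Job in the driver before submitting it
    static void tuneSplits(Job job) {
        // Never create a split larger than 64 MB, even if the block is bigger
        FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);
        // Never create a split smaller than 32 MB
        FileInputFormat.setMinInputSplitSize(job, 32L * 1024 * 1024);
    }
}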
Custom partitioning
Example 2
- Requirement: partition this file with a custom partitioner, using the province ID as the partitioning key.
Source of the default partitioner
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,
int numReduceTasks) {
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
With the default rule, key.hashCode() % numReduceTasks selects the partition, e.g. 0 or 1 when there are two reduce tasks.
Integer.MAX_VALUE = 01111111 11111111 11111111 11111111
Suppose key.hashCode() = 10101011 10101010 11011011 11101000 (the sign bit is set, so this int is negative)
Performing the & operation:
  01111111 11111111 11111111 11111111
& 10101011 10101010 11011011 11101000
= 00101011 10101010 11011011 11101000
Note:
- The & operation guards against a negative key.hashCode(): in Java the remainder keeps the sign of the dividend, so a negative hash would yield a negative, invalid partition number (see the demo below)
- The default partitioning rule is therefore: key.hashCode() % (number of reduce tasks)
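A short, self-contained demonstration of the first note (plain Java, no Hadoop required; the numbers are arbitrary):

public class PartitionSignDemo {
    public static void main(String[] args) {
        int numReduceTasks = 4;
        int hash = -7; // stands in for a negative key.hashCode()

        // Without the mask: Java's % keeps the dividend's sign
        System.out.println(hash % numReduceTasks); // prints -3, an invalid partition

        // With the mask: the sign bit is cleared first, so the result is 0..3
        System.out.println((hash & Integer.MAX_VALUE) % numReduceTasks); // prints 1
    }
}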
One level up in the source
@InterfaceAudience.Public
@InterfaceStability.Stable
public class HashPartitioner<K, V> extends Partitioner<K, V> {
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,
int numReduceTasks) {
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
}
As the source above shows, a class only needs to extend Partitioner to define its own partitioning rule.
Process analysis
- Get the phone number
- Take its first three digits
- A lookup table stores the province ID for each prefix, so the province can be resolved from the phone number
- Query that table to obtain the province ID
Extend the Partitioner class
import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ProvincePartitioner extends Partitioner<Text, FlowWritable> {
    // Maps a phone-number prefix to a province ID (partition number)
    static HashMap<String, Integer> provinceID = new HashMap<>();
    static {
        provinceID.put("136", 0);
        provinceID.put("137", 1);
        provinceID.put("138", 2);
        provinceID.put("139", 3);
    }

    @Override
    public int getPartition(Text key, FlowWritable value, int numPartitions) {
        // 1. Get the phone number as a string
        String phoneNumber = key.toString();
        // 2. Take its first three digits
        String prefix = phoneNumber.substring(0, 3);
        Integer provinceId = provinceID.get(prefix);
        // Unknown prefixes all go to partition 4
        return provinceId == null ? 4 : provinceId;
    }
}
Modify the main class Driver
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Load the configuration
        Configuration config = new Configuration();
        // 2. Create the Job and hand it the configuration
        Job job = Job.getInstance(config);
        // 3. Locate the jar via the driver class
        job.setJarByClass(Driver.class);
        // 4. Set the map and reduce classes (the flow job from Example 1)
        job.setMapperClass(FlowMap.class);
        job.setReducerClass(FlowReduce.class);
        // 5. Set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowWritable.class);
        // 6. Five reduce tasks, one per partition ID (0-4) the partitioner can return
        job.setNumReduceTasks(5);
        // 7. Plug in the custom partitioner
        job.setPartitionerClass(ProvincePartitioner.class);
        // 8. Set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowWritable.class);
        // 9. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 10. Submit and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
Sorting
Example 3
Sort the output of Example 1 in descending order of total traffic.
Analysis
Modify FlowWritable to implement the WritableComparable interface
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class FlowWritable implements WritableComparable<FlowWritable> {
    private long upFlow;    // upstream traffic
    private long downFlow;  // downstream traffic
    private long totalFlow; // total traffic

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.totalFlow = this.upFlow + this.downFlow;
    }

    /**
     * Serialization: write the fields out.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
    }

    /**
     * Deserialization: read the fields back in the same order they were written.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        // totalFlow is not serialized, so recompute it here; otherwise it
        // would be 0 after deserialization and compareTo would break.
        this.totalFlow = this.upFlow + this.downFlow;
    }

    @Override
    public String toString() {
        return this.upFlow + "\t" + this.downFlow + "\t" + this.totalFlow;
    }

    // Descending order by total traffic
    @Override
    public int compareTo(FlowWritable o) {
        return Long.compare(o.totalFlow, this.totalFlow);
    }
}
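Since WritableComparable extends java.lang.Comparable, the comparator can be sanity-checked locally before running the job. A small sketch (the class name CompareToDemo is illustrative); in the real job it is the shuffle phase that applies this comparison to the map output keys:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class CompareToDemo {
    public static void main(String[] args) {
        FlowWritable a = new FlowWritable();
        a.set(100, 200);  // totalFlow = 300
        FlowWritable b = new FlowWritable();
        b.set(500, 1000); // totalFlow = 1500

        List<FlowWritable> list = new ArrayList<>();
        list.add(a);
        list.add(b);
        Collections.sort(list); // uses compareTo: descending by totalFlow

        System.out.println(list.get(0)); // 500	1000	1500 (the larger total comes first)
    }
}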
Implement the map function
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Input line (output of Example 1): 13480253104	180	180	360
// Emits the flow object as the key so the framework sorts on it,
// and the phone number as the value.
public class CountSortMap extends Mapper<LongWritable, Text, FlowWritable, Text> {
    FlowWritable outputKey = new FlowWritable();
    Text outputValue = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. Get the raw line
        String line = value.toString();
        // 2. Split it on tabs
        String[] fields = line.split("\t");
        // 3. Phone number
        String phoneNumber = fields[0];
        // 4. Upstream traffic
        long upFlow = Long.valueOf(fields[1]);
        // 5. Downstream traffic
        long downFlow = Long.valueOf(fields[2]);
        outputKey.set(upFlow, downFlow);
        outputValue.set(phoneNumber);
        // 6. Emit to the reducer
        context.write(outputKey, outputValue);
    }
}
Implement the reduce function
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CountSortReduce extends Reducer<FlowWritable, Text, Text, FlowWritable> {
    @Override
    protected void reduce(FlowWritable key, Iterable<Text> values,
            Context context) throws IOException, InterruptedException {
        // Several phone numbers may compare as equal (same totals), so emit
        // every value, swapping key and value back for the final output
        for (Text text : values) {
            context.write(text, key);
        }
    }
}
Implement the main class Driver
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Load the configuration
        Configuration config = new Configuration();
        // 2. Create the Job and hand it the configuration
        Job job = Job.getInstance(config);
        // 3. Locate the jar via the driver class
        job.setJarByClass(Driver.class);
        // 4. Set the map and reduce classes
        job.setMapperClass(CountSortMap.class);
        job.setReducerClass(CountSortReduce.class);
        // 5. Set the map output types: key and value are swapped here so the
        //    framework sorts on FlowWritable
        job.setMapOutputKeyClass(FlowWritable.class);
        job.setMapOutputValueClass(Text.class);
        // 6. Set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowWritable.class);
        // 7. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 8. Submit and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
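To run the sorting job, package the classes into a jar and pass the output directory of Example 1 as the input path (the jar name and paths below are illustrative; Driver is assumed to be in the default package, as written):

hadoop jar flowsort.jar Driver /flow/output /flow/sorted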