MapReduce Programming Mini-Case 11: Data Skew Scenario
Sample data:
a a a a a a b b b a a a a a a a c c b c a a a c a b b c a a d d e e f f f g a a a b a b h h g j
Requirement:
Run a wordcount over this data.
But there is a problem:
the word a occurs far more often than anything else,
so the reduce worker that handles the word a gets overloaded (the load is unbalanced and too heavy).
Question: how should this be handled so that the data skew in the processing is alleviated?
The part-1 idea is map-side local aggregation: give the job a Combiner so that each map task pre-sums its own (word, 1) pairs before the shuffle; the hot word a then reaches its reducer as one partial count per map task instead of one record per occurrence.
Data skew scenario, part 1: the solution code
WordcountCombiner class implementation
package cn.edu360.mr.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountCombiner extends Reducer<Text, IntWritable, Text, IntWritable>{

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
            Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {

        // Runs on the map side: sum this map task's (word, 1) pairs locally,
        // so each word leaves the map task as a single partial count
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
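A note on the combiner contract: its input kv types must match the mapper's output types, and its output types must match the reducer's input types; also, the framework may invoke it zero, one, or several times per map task, so the logic must tolerate repeated application (summing does). Since the summing logic here is identical to WordcountReducer below, the job could just as well register the reducer itself as the combiner:

job.setCombinerClass(WordcountReducer.class);

Keeping a dedicated WordcountCombiner class simply makes the map-side role explicit.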
WordcountMapper class implementation
package cn.edu360.mr.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/*
 * KEYIN: the type of the keys the map task reads; here the starting byte offset of a line, a Long
 * VALUEIN: the type of the values the map task reads; here the content of one line, a String
 *
 * KEYOUT: the key type of the kv results returned by the user-defined map method; in the wordcount logic we return the word, a String
 * VALUEOUT: the value type of the kv results returned by the user-defined map method; in the wordcount logic we return a count, an Integer
 *
 * However: in MapReduce the data produced by map has to be shipped to reduce, which requires serialization and
 * deserialization, and the JDK's native serialization mechanism produces rather bloated byte streams, which would
 * make the data transfer inside a MapReduce job inefficient.
 * Hadoop therefore designed its own serialization mechanism, and every data type transferred in MapReduce must
 * implement Hadoop's serialization interface.
 *
 * For the common JDK types Long, String, Integer, Float, etc., Hadoop provides wrapper types that implement its
 * serialization interface: LongWritable, Text, IntWritable, FloatWritable.
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {

        // Split the line into words and emit (word, 1) for each occurrence
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
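To make the serialization comment above concrete, here is a minimal sketch of a custom type implementing Hadoop's Writable interface; the WordStat class and its fields are illustrative assumptions, not part of this case:

package cn.edu360.mr.wc;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Hypothetical value type: a compact, Hadoop-serializable record
public class WordStat implements Writable {

    private long count;
    private float ratio;

    // Hadoop calls write() to serialize the fields into a compact binary form
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(count);
        out.writeFloat(ratio);
    }

    // readFields() must read the fields back in exactly the same order
    @Override
    public void readFields(DataInput in) throws IOException {
        count = in.readLong();
        ratio = in.readFloat();
    }
}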
WordcountReducer class implementation
package cn.edu360.mr.wc;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        // Sum the (possibly combiner-pre-aggregated) partial counts for this word
        int count = 0;
        Iterator<IntWritable> iterator = values.iterator();
        while (iterator.hasNext()) {
            IntWritable value = iterator.next();
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
JobSubmitterWindowsLocal class implementation
package cn.edu360.mr.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitterWindowsLocal {

    public static void main(String[] args) throws Exception{

        Configuration conf = new Configuration();
        // Explicitly pick the local filesystem and the local job runner;
        // if not set, they would come from the config files on the classpath
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");

        Job job = Job.getInstance(conf);

        job.setJarByClass(JobSubmitterWindowsLocal.class);

        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Register the map-side local aggregation (combiner) class
        job.setCombinerClass(WordcountCombiner.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path("f:/mrdata/wordcount/input"));
        FileOutputFormat.setOutputPath(job, new Path("f:/mrdata/wordcount/output2"));

        job.setNumReduceTasks(3);

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
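To verify that the combiner actually blunted the skew, the job counters can be read after waitForCompletion(true) and before System.exit. A minimal sketch of what could be appended at the end of main above; TaskCounter is the standard org.apache.hadoop.mapreduce.TaskCounter enum:

// after job.waitForCompletion(true):
org.apache.hadoop.mapreduce.Counters counters = job.getCounters();
long mapOut = counters.findCounter(org.apache.hadoop.mapreduce.TaskCounter.MAP_OUTPUT_RECORDS).getValue();
long reduceIn = counters.findCounter(org.apache.hadoop.mapreduce.TaskCounter.REDUCE_INPUT_RECORDS).getValue();
// With the combiner active, reduceIn should be far smaller than mapOut:
// the reducer that owns the hot word "a" now receives at most one partial
// count per map task instead of one record per occurrence of "a"
System.out.println("map output records = " + mapOut + ", reduce input records = " + reduceIn);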