要求
沒什麼太多要說的,以前的MapR in Python用熟練了,Java要不是作業感覺也不會太常用它了…… 就貼一下以備忘(防止以後突然要用java寫的時候可以來參考一下)
- 輸入文件:文本文件
- 每行格式
- source _ destination _ time
- 3個部分由空格隔開
- 其中source和destination爲兩個字符串,內部沒有空格
- time爲一個浮點數,代表時間(秒爲單位)
- 涵義:可以表示一次電話通話,或表示一次網站訪問等
- 輸入可能有噪音
- 如果一行不符合上述格式,應該被丟棄,程序需要正確執行
- 每行格式
- MapReduce計算:統計每對source-destination的信息
- 輸出
- source _ destination _ count _ average time
- 每一個source-destination組合輸出一行(注意:source與destination順序相反的視爲不同的組合,分別統計)
- 每行輸出通話次數和通話平均時間(保留3位小數,例如2.300)
Source Code
/* 3, 201618013229031, ChenDian */
// Based on WordCount.java:
// Modified by Shimin Chen to demonstrate functionality for Homework 2
// April-May 2015
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Hw2Part1 {
// This is the Mapper class
// reference: http://hadoop.apache.org/docs/r2.6.0/api/org/apache/hadoop/mapreduce/Mapper.html
//
public static class CDMapper
extends Mapper<Object, Text, Text, DoubleWritable>{
private Text pair = new Text();
private DoubleWritable time = new DoubleWritable();
// (src, dest)=key, (time)=value
private String keyStr = new String();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String delimiters = "\\s+";
String[] tokens = value.toString().split(delimiters);
keyStr = tokens[0] + " " + tokens[1];
pair.set(keyStr); // Key
time.set(Double.parseDouble(tokens[2])); // Value
context.write(pair, time);
}
}
// This is the Reducer class
// reference http://hadoop.apache.org/docs/r2.6.0/api/org/apache/hadoop/mapreduce/Reducer.html
//
public static class CDReducer
extends Reducer<Text, DoubleWritable, Text, Text> {
private Text retKey= new Text();
private Text retValue= new Text();
public void reduce(Text key, Iterable<DoubleWritable> values, Context context
) throws IOException, InterruptedException {
int cnt = 0; double sum = 0.0;
for (DoubleWritable val : values) {
sum += val.get();
cnt += 1;
}
// calculate for return value
String vStr = new String();
double ans = sum / (double)cnt;
vStr = Integer.toString(cnt) + " " + String.format("%.3f", ans);
// generate result key
retKey.set(key);
// generate result value
retValue.set(vStr);
// write answer
context.write(retKey, retValue);
}
}
public static void deletePath(String path) throws IOException{
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path f = new Path(path);
if(fs.exists(f)) fs.delete(f, true);
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: Hw2Part1 <in> <out>");
System.exit(2);
}
// deletePath(otherArgs[otherArgs.length - 1])
// Perhaps... Teacher's script has done this...
Job job = Job.getInstance(conf, "Mapr_Average");
job.setJarByClass(Hw2Part1.class);
job.setMapperClass(CDMapper.class);
job.setReducerClass(CDReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job,
new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}