將引用的包從 mapred 改爲 mapreduce 後,以一個 NutchJob 爲例,說明相關代碼需要做的修改。
1. Job設置和運行
舊API
// mapred 包中有 JobConf、JobClient,在 mapreduce 包中都取消了
/**
 * Old-API example: configures the job on a {@code JobConf} and runs it via
 * {@code JobClient.runJob}.
 *
 * @throws Exception if running the job throws, or if the returned job reports
 *         that it was not successful
 */
public void myTask() throws Exception {
  JobConf job = new NutchJob(getConf());
  job.setJobName("MyTool");

  // Input and output formats.
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  // Processing classes.
  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);

  // Types of the emitted records.
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Custom settings are written directly onto the JobConf.
  job.set(key, value);

  // Anything thrown by runJob propagates to the caller unchanged.
  RunningJob finished = JobClient.runJob(job);
  if (finished.isSuccessful()) {
    return;
  }
  throw new Exception("@@JOB FAILED");
}
新API
// mapreduce 包中配置使用 Configuration 類,運行通過 Job 類
/**
 * New-API example: settings live in a {@code Configuration} obtained from the
 * {@code Job}, and the job is run through the {@code Job} object itself.
 *
 * @throws Exception if running the job throws, or if
 *         {@code waitForCompletion} returns {@code false}
 */
public void myTask() throws Exception {
  Job job = NutchJob.getInstance(getConf());
  job.setJobName("MyTool");
  // The job jar is located through the class passed in here.
  job.setJarByClass(MyTool.class);

  // Input and output formats.
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  // Processing classes.
  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);

  // Types of the emitted records.
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Custom settings are written to the Configuration owned by the job.
  job.getConfiguration().set(key, value);

  // Anything thrown by waitForCompletion propagates to the caller unchanged.
  if (job.waitForCompletion(true)) {
    return;
  }
  throw new Exception("@@JOB FAILED");
}
2. Mapper
// 舊 API 中 Mapper 和 Reducer 是接口
/**
 * Old-API mapper skeleton: receives a {@code Text} key and a {@code CrawlDatum}
 * value and emits one {@code Text}/{@code Text} pair through the
 * {@code OutputCollector}.
 *
 * <p>The old {@code Mapper} interface also declares {@code configure(JobConf)}
 * and {@code close()}, so a concrete implementation has to provide them; both
 * are no-ops here.
 */
public static class MyMapper implements Mapper<Text, CrawlDatum, Text, Text> {

  /** This mapper reads no job settings. */
  @Override
  public void configure(JobConf job) {
    // nothing to configure
  }

  /**
   * Emits one output pair for the given input record.
   *
   * @param key input key
   * @param value input value
   * @param output collector that receives the emitted pair
   * @param reporter not used by this skeleton
   * @throws IOException if collecting the output fails
   */
  @Override
  public void map(Text key, CrawlDatum value, OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    // Placeholders: the elided operations must assign both before they are used.
    String mapOutputKey;
    String mapOutputValue;
    // some operations
    output.collect(new Text(mapOutputKey), new Text(mapOutputValue));
  }

  /** This mapper holds no resources. */
  @Override
  public void close() throws IOException {
    // nothing to release
  }
}
// 新 API 中 Mapper 和 Reducer 是類(自帶默認實現),使用時繼承並覆寫所需的方法
// 新 API 廣泛使用 Context ,允許用戶代碼與 MapReduce 系統進行通信。
/**
 * New-API mapper skeleton: receives a {@code Text} key and a {@code CrawlDatum}
 * value and writes one {@code Text}/{@code Text} pair to the {@code Context}.
 */
public static class MyMapper extends Mapper<Text, CrawlDatum, Text, Text> {

  /**
   * Emits one output pair for the given input record.
   *
   * @param key input key
   * @param value input value
   * @param context receives the emitted pair
   */
  @Override
  public void map(Text key, CrawlDatum value, Context context)
      throws IOException, InterruptedException {
    String outKey;
    String outValue;
    // some operations
    Text emittedKey = new Text(outKey);
    Text emittedValue = new Text(outValue);
    context.write(emittedKey, emittedValue);
  }
}
3. Reducer
// 舊 API 中 Reducer 接口繼承了 JobConfigurable,配置通過其 configure(JobConf) 方法讀取
/**
 * Old-API reducer skeleton: walks all values of a key and emits one
 * {@code Text}/{@code Text} pair through the {@code OutputCollector}.
 *
 * <p>The old {@code Reducer} interface also declares {@code configure(JobConf)}
 * and {@code close()}, so a concrete implementation has to provide both.
 */
public static class MyReducer implements Reducer<Text, Text, Text, Text> {

  /** Value of {@code SOME_KEY}, read in {@link #configure(JobConf)}. */
  private String someValue;

  /**
   * Reads this reducer's settings from the job configuration.
   *
   * @param job configuration to read {@code SOME_KEY} from
   */
  @Override
  public void configure(JobConf job) {
    // Kept in a field so that reduce() can use it.
    someValue = job.get(SOME_KEY);
  }

  /**
   * Processes every value of {@code key} and emits one output pair.
   *
   * @param key input key
   * @param values values grouped under {@code key}
   * @param output collector that receives the emitted pair
   * @param reporter not used by this skeleton
   * @throws IOException if collecting the output fails
   */
  @Override
  public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    // Placeholders: the elided operations must assign both before they are used.
    String reduceOutputKey;
    String reduceOutputValue;
    while (values.hasNext()) {
      // Advance the iterator; without next() this loop would never terminate.
      Text value = values.next();
      // some operations
    }
    output.collect(new Text(reduceOutputKey), new Text(reduceOutputValue));
  }

  /** This reducer holds no resources. */
  @Override
  public void close() throws IOException {
    // nothing to release
  }
}
// 新 API 的配置函數 setup 定義在 Mapper 和 Reducer 中,通過 context.getConfiguration() 取得配置
/**
 * New-API reducer skeleton: walks all values of a key and writes one
 * {@code Text}/{@code Text} pair to the {@code Context}.
 */
public static class MyReducer extends Reducer<Text, Text, Text, Text> {

  /** Value of {@code SOME_KEY}, read in {@link #setup(Context)}. */
  private String someValue;

  /**
   * Reads this reducer's settings from the configuration exposed by the context.
   *
   * @param context gives access to the {@code Configuration}
   */
  @Override
  public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    // Kept in a field so that reduce() can use it.
    someValue = conf.get(SOME_KEY);
  }

  /**
   * Processes every value of {@code key} and emits one output pair.
   *
   * @param key input key
   * @param values values grouped under {@code key}
   * @param context receives the emitted pair
   */
  @Override
  public void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    // Placeholders: the elided operations must assign both before they are used.
    String reduceOutputKey;
    String reduceOutputValue;
    for (Text value : values) {
      // some operations
    }
    context.write(new Text(reduceOutputKey), new Text(reduceOutputValue));
  }
}