Just as the first program in most languages is Hello World, the first MapReduce program is usually word count. The main code is as follows:
package Temperature;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {
    /**
     * When a job is submitted to MapReduce, the input file is first divided into
     * splits; since this is only a small test, there is a single split. MapReduce
     * then breaks the split into lines, producing <key, value> pairs, which are
     * conceptually similar to entries in a Python dictionary.
     */
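    /*
     * Example (hypothetical input): for a file containing the two lines
     * "Hello World" and "Bye World", TextInputFormat produces
     * <byte offset, line> pairs:
     *   <0, "Hello World">
     *   <12, "Bye World">
     */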
    /**
     * The <key, value> pairs cut out above are handed to the map function we
     * define below, which emits new <key, value> pairs. The framework split the
     * file by line; here we split each line into words on whitespace.
     */
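    /*
     * Example: given the pair <0, "Hello World">, map emits
     * <"Hello", 1> and <"World", 1>.
     */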
    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        /**
         * Hadoop type     === Java type
         * BooleanWritable === boolean
         * ByteWritable    === byte
         * ShortWritable   === short
         * LongWritable    === long
         * Text            === String
         * IntWritable     === int
         * FloatWritable   === float
         * DoubleWritable  === double
         * ArrayWritable   === array
         * MapWritable     === Map
         */
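        // Writables are mutable, serializable wrappers around Java types,
        // e.g. new IntWritable(42).get() returns 42 and
        // new Text("hadoop").toString() returns "hadoop".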
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            // Split the line on whitespace and emit <word, 1> for every token.
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }
    /**
     * After the map phase, the framework sorts the emitted pairs by key. If a
     * Combine function is defined, pairs with the same key are merged map-side
     * first (Combine is covered later, so it is not explained here). The sorted
     * map output is then shuffled to the Reducers. The reduce side merges the
     * incoming data, again sorted by key, into <key, [values]> groups and hands
     * each group to the reduce function below, which finally writes <key, value>
     * pairs to HDFS.
     */
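    /*
     * Example: if the map phase emitted <"World", 1> twice, reduce receives
     * <"World", [1, 1]> and outputs <"World", 2>.
     */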
    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            // Sum all the counts collected for this word.
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");
        // Configure the output key and value types.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        // Configure the Mapper and Reducer classes.
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        // Configure the input and output formats.
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        // Set the input and output paths.
        FileInputFormat.setInputPaths(conf, new Path("hdfs://192.168.1.51:9000/input/qixiang_data"));
        FileOutputFormat.setOutputPath(conf, new Path("hdfs://192.168.1.51:9000/output/lzh/3"));
        // Submit the job.
        JobClient.runJob(conf);
    }
}
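The comments above mention the Combine function without using it. As a minimal sketch (not part of the original listing), the old mapred API can reuse the Reduce class as a combiner, which is safe here because summing counts is associative and commutative; the line below would go in main() before JobClient.runJob(conf):

        conf.setCombinerClass(Reduce.class); // pre-aggregate <word, 1> pairs on the map side

With TextOutputFormat, each result is written to the output directory as the word, a tab character, and its count. Note that the job fails if the output path (here /output/lzh/3) already exists, so delete it between runs.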