需求: 在一堆給定的文本文件中統計輸出每一個單詞出現的總次數
Step 1. 數據格式準備
1.創建一個新的文件
cd /export/servers
vim wordcount.txt
2. 向其中放入以下內容並保存
hello,world,hadoop
hive,sqoop,flume,hello
kitty,tom,jerry,world
hadoop
3. 上傳到 HDFS
hdfs dfs -mkdir /wordcount/
hdfs dfs -put wordcount.txt /wordcount/
Step 2. Mapper
/**
 * Mapper stage of the word-count job.
 *
 * <p>Input: (byte offset, line of text). Each line is split on commas and
 * every token is emitted as (word, 1). The downstream reducer sums the 1s.
 */
public class WordCountMapper extends
        Mapper<LongWritable, Text, Text, LongWritable> {

    // Reuse output Writables across calls instead of allocating new
    // Text/LongWritable objects per token — map() runs once per input
    // record, so per-call allocation causes needless GC pressure.
    private final Text outWord = new Text();
    private static final LongWritable ONE = new LongWritable(1);

    /**
     * Emits (word, 1) for every comma-separated token in the input line.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   one line of the input file
     * @param context Hadoop context used to emit the (word, 1) pairs
     */
    @Override
    public void map(LongWritable key, Text value, Context context) throws
            IOException, InterruptedException {
        String line = value.toString();
        for (String word : line.split(",")) {
            outWord.set(word);
            context.write(outWord, ONE);
        }
    }
}
Step 3. Reducer
/**
 * Reducer stage of the word-count job.
 *
 * <p>For each key (a single word) it receives every count emitted by the
 * mappers and writes out (word, total occurrences).
 */
public class WordCountReducer extends
        Reducer<Text, LongWritable, Text, LongWritable> {

    /**
     * Sums all partial counts for one word and emits the total.
     *
     * @param key     the word being aggregated
     * @param values  the per-occurrence counts (each typically 1)
     * @param context Hadoop context used to emit (word, total)
     * @throws IOException          on I/O failure while writing output
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values,
            Context context) throws IOException, InterruptedException {
        long total = 0L;
        // Accumulate every partial count delivered for this word.
        for (LongWritable partial : values) {
            total += partial.get();
        }
        context.write(key, new LongWritable(total));
    }
}
Step 4. 定義主類, 描述 Job 並提交 Job
/**
 * Driver class: configures, describes and submits the word-count job.
 *
 * <p>Usage: {@code hadoop jar wordcount.jar JobMain [inputPath] [outputPath]}.
 * When paths are omitted, the original hard-coded HDFS locations are used,
 * so existing invocations keep working unchanged.
 */
public class JobMain extends Configured implements Tool {

    // Backward-compatible defaults — the paths the job originally hard-coded.
    private static final String DEFAULT_INPUT =
            "hdfs://192.168.52.250:8020/wordcount";
    private static final String DEFAULT_OUTPUT =
            "hdfs://192.168.52.250:8020/wordcount_out";

    /**
     * Builds and runs the MapReduce job.
     *
     * @param args optional: args[0] = input path, args[1] = output path
     * @return 0 on success, 1 on failure (standard Tool convention)
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Honor command-line paths when provided; fall back to the defaults.
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Job job = Job.getInstance(super.getConf(),
                JobMain.class.getSimpleName());
        // Required when running from a packaged jar on the cluster so Hadoop
        // can locate the classes containing the job's main logic.
        job.setJarByClass(JobMain.class);

        // Step 1: read the input files and parse them into key/value pairs.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(inputPath));

        // Step 2: set the mapper class and its output key/value types.
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Steps 3-6 (partition/sort/group/combine) use Hadoop defaults, except:
        // summing is associative and commutative, so the reducer doubles as a
        // combiner — this shrinks shuffle traffic without changing the output.
        job.setCombinerClass(WordCountReducer.class);

        // Step 7: set the reducer class and the final output key/value types.
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Step 8: set the output format and destination path.
        // NOTE: the job fails if the output directory already exists — delete
        // it (hdfs dfs -rm -r <path>) or pass a fresh path before re-running.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(outputPath));

        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Program entry point: delegates to {@link ToolRunner}, which parses
     * generic Hadoop options (-D, -conf, ...) before invoking {@link #run}.
     *
     * @param args command-line arguments forwarded to {@link #run}
     * @throws Exception if the job cannot be launched
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Tool tool = new JobMain();
        int exitCode = ToolRunner.run(configuration, tool, args);
        System.exit(exitCode);
    }
}