cd /home/hadoop/ && hadoop jar ./test/wordcount/wordcount.jar org.codetree.hadoop.v1.WordCount /test/chqz/input /test/chqz/output
那麼這個命令內部到底做了些什麼呢?
1、首先,在 ${HADOOP_HOME}/bin/hadoop 腳本中我們可以看到有如下代碼:
由於這裏 $starting_secure_dn 的值爲 false(這裏可以參見hadoop腳本),所以最終會執行下面這行代碼:
從上面shell腳本中,我們可以明確看出當執行hadoop jar命令時,實際上執行了org.apache.hadoop.util.RunJar類。
下面#run it這一行代碼實質上是爲執行這個類的main方法設置所需的類路徑classpath。
2、繼續研究org.apache.hadoop.util.RunJar類內發生的事情:
因此,命令hadoop jar ./test/wordcount/wordcount.jar org.codetree.hadoop.v1.WordCount /test/chqz/input /test/chqz/output的各段的含義:
(1) hadoop:${HADOOP_HOME}/bin下的shell腳本名。
(2) jar:hadoop腳本需要的command參數。
(3) ./test/wordcount/wordcount.jar:要執行的jar包在本地文件系統中的完整路徑,傳遞給RunJar類。
(4) org.codetree.hadoop.v1.WordCount:main方法所在的類,傳遞給RunJar類。
(5) /test/chqz/input:傳遞給WordCount類,作爲DFS文件系統的路徑,指示輸入數據來源。
(6) /test/chqz/output:傳遞給WordCount類,作爲DFS文件系統的路徑,指示輸出數據路徑。
3、org.codetree.hadoop.v1.WordCount類代碼:
Source Code
WordCount.java:
1. | package org.myorg; |
2. | |
3. | import java.io.IOException; |
4. | import java.util.*; |
5. | |
6. | import org.apache.hadoop.fs.Path; |
7. | import org.apache.hadoop.conf.*; |
8. | import org.apache.hadoop.io.*; |
9. | import org.apache.hadoop.mapred.*; |
10. | import org.apache.hadoop.util.*; |
11. | |
12. | public class WordCount { |
13. | |
14. | public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { |
15. | private final static IntWritable one = new IntWritable(1); |
16. | private Text word = new Text(); |
17. | |
18. | public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { |
19. | String line = value.toString(); |
20. | StringTokenizer tokenizer = new StringTokenizer(line); |
21. | while (tokenizer.hasMoreTokens()) { |
22. | word.set(tokenizer.nextToken()); |
23. | output.collect(word, one); |
24. | } |
25. | } |
26. | } |
27. | |
28. | public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { |
29. | public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { |
30. | int sum = 0; |
31. | while (values.hasNext()) { |
32. | sum += values.next().get(); |
33. | } |
34. | output.collect(key, new IntWritable(sum)); |
35. | } |
36. | } |
37. | |
38. | public static void main(String[] args) throws Exception { |
39. | JobConf conf = new JobConf(WordCount.class); |
40. | conf.setJobName("wordcount"); |
41. | |
42. | conf.setOutputKeyClass(Text.class); |
43. | conf.setOutputValueClass(IntWritable.class); |
44. | |
45. | conf.setMapperClass(Map.class); |
46. | conf.setCombinerClass(Reduce.class); |
47. | conf.setReducerClass(Reduce.class); |
48. | |
49. | conf.setInputFormat(TextInputFormat.class); |
50. | conf.setOutputFormat(TextOutputFormat.class); |
51. | |
52. | FileInputFormat.setInputPaths(conf, new Path(args[0])); |
53. | FileOutputFormat.setOutputPath(conf, new Path(args[1])); |
54. | |
55. | JobClient.runJob(conf); |
57. | } |
58. | } |
59. |