wordCount
/**
* The following sample is adapted from the original WordCount sample at
* http://wiki.apache.org/hadoop/WordCount.
*/
package chapter1;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
* <p>The word count sample counts the number of word occurrences within a set of input documents
* using MapReduce. The code has three parts: mapper, reducer, and the main program.</p>
* @author Srinath Perera (srinath@wso2.com)
*/
public class WordCount {
/**
* <p>
* The mapper extends from the org.apache.hadoop.mapreduce.Mapper interface. When Hadoop runs,
* it receives each new line in the input files as an input to the mapper. The �map� function
* tokenize the line, and for each token (word) emits (word,1) as the output. </p>
*/
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
//new一個IntWritable存儲數值
private final static IntWritable one = new IntWritable(1);
// new一個Text 存儲單詞作爲key
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
/**
* <p>Reduce function receives all the values that has the same key as the input, and it output the key
* and the number of occurrences of the key as the output.</p>
*/
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
/**
* <p> As input this program takes any text file. Create a folder called input in HDFS (or in local directory if you are running this locally)
* <ol>
* <li>Option1: You can compile the sample by ant from sample directory. To do this, you need to have Apache Ant installed in your system.
* Otherwise, you can use the complied jar included with the source code. hange directory to HADOOP_HOME, and copy the hadoop-cookbook.jar to the HADOOP_HOME.
* Then run the command > bin/hadoop jar hadoop-cookbook.jar chapter1.WordCount input output.</li>
* <li>As an optional step, copy the �input� directory to the top level of the IDE based project (eclipse project) that you created for samples. Now you can run
* the WordCount class directly from your IDE passing �input output� as arguments. This will run the sample same as before. Running MapReduce Jobs from IDE in this manner is very useful
* for debugging your MapReduce Jobs. </li>
* </ol>
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
//Uncomment this to
//job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
build後運行
hadoop jar hadoop-cookbook-chapter1.jar chapter1.WordCount hdfs://localhost:9000/test/word.txt hdfs://localhost:9000/test/output2
結果
[clz@localhost lib]$ hadoop fs -cat /test/output2/part*
"But 1
"I 1
"It 1
"My 2
"On 1
"There 1
"Today, 1
"We 1
"When 1
"a 1
"in 1
"one 1
'this 1
- 1
110 2
130 1
131 4
工作原理
1. Hadoop讀取輸入,以新行(\n)作爲分隔符,將line number以及每行數據作爲輸入
2. map函數對每行進行分詞,並生成鍵值對(word,1)
3. hadoop收集所有鍵值對,按照key即(word)排序,將所有values即(1)收集到一起:
word<1,1,...> 然後將(key,values)傳遞給reduce進行操作
4. reduce對values加和輸出word<sum(values)>
5. 寫入輸出路徑
add a combiner step
在map階段每個map輸出的key-value會有許多key是相同的,如果直接進入shuffle階段會造成不必要的資源浪費,在將map結果持久化到磁盤進行shuffle過程前可以加入一個combiner 在map階段先將每個map中相同key的value執行操作,一般combiner與reduce
的操作相同,在這裏的wordcount是sum操作。
attention
將reduce函數作爲combiner時,只有在reduce函數的輸入和輸出是一樣的情況下纔可以使用。當然,也可以寫一個專用的reducer作爲combiner,此時combiner的輸入和輸出key-value必須與mapper的輸出key-value相同。在分佈式環境下combiner能夠大大提升效率。
HDFS
hdfs是塊結構的分佈式系統,它支持在多節點存儲大容量的數據和高通量的訪問。高容錯。hdfs有namenode和datanode,namenode存儲的是元數據信息,datanode存儲的是實際數據。hdfs的塊數據是粗粒度的並在大規模流數據讀取表現更好。
#啓動
$ $HADOOP_HOME/sbin/start-dfs.sh
#信息
$ $HADOOP_HOME/bin/hadoop dfsadmin -report
#stop
$ $HADOOP_HOME/sbin/stop-dfs.sh
Hadoop v2 YARN
YARN包含:
ResourceManager: masternode 控制所有集羣的資源
NodeManager:slavenodes 控制單個節點的資源
MR app 可以在YARN上運行, 通過 ApplicationMaster將每個作業與資源容器resource containers協調來運行map與reduce任務。
some command line
$ hdfs dfs -ls
$ hdfs dfs -mkdir test
$ hdfs dfs -copyFromLocal README.txt test
$ hdfs dfs -copyToLocal \
test/README.txt README-NEW.txt
# help
hdfs dfs -help
hdfs dfs -help du
當輸入command line後HDFS客戶端會從NAMENODE得到configurations信息即
HADOOP_HOME/etc/hadoop/conf 。當然也可以強制指定一個NAMENODE的位置
例如,
hdfs://bar.foo.com:9000/data
這樣會指定使用bar.foo.com的namenode
How it works
當提交一個作業, YARN會安排ApplicationMaster協調並執行計算。AppMaster從ResourceManager獲取必要的資源, 並使用從resource請求到的containers進行MR計算