hadoop 單機下運行wordcount

關於單機版的配置,eclipse環境搭建,以後再補充吧

首先是程序

project: wordcount

  1. import java.io.IOException; 
  2. import java.util.StringTokenizer; 
  3.  
  4. import org.apache.hadoop.io.IntWritable; 
  5. import org.apache.hadoop.io.Text; 
  6. import org.apache.hadoop.mapreduce.Mapper; 
  7.  
  8. public class TokenizerMapper extends 
  9.         Mapper<Object, Text, Text, IntWritable> { 
  10.  
  11.     private final static IntWritable one = new IntWritable(1); 
  12.     private Text word = new Text(); 
  13.  
  14.     public void map(Object key, Text value, Context context) 
  15.             throws IOException, InterruptedException { 
  16.         StringTokenizer itr = new StringTokenizer(value.toString()); 
  17.         while (itr.hasMoreTokens()) { 
  18.             word.set(itr.nextToken()); 
  19.             context.write(word, one); 
  20.         } 
  21.     } 
  22.  
  23.  
  24. import java.io.IOException; 
  25.  
  26. import org.apache.hadoop.io.IntWritable; 
  27. import org.apache.hadoop.io.Text; 
  28. import org.apache.hadoop.mapreduce.Reducer; 
  29.  
  30. public class IntSumReducer extends 
  31.         Reducer<Text, IntWritable, Text, IntWritable> { 
  32.     private IntWritable result = new IntWritable(); 
  33.  
  34.     public void reduce(Text key, Iterable<IntWritable> values, Context context) 
  35.             throws IOException, InterruptedException { 
  36.         int sum = 0
  37.         for (IntWritable val : values) { 
  38.             sum += val.get(); 
  39.         } 
  40.         result.set(sum); 
  41.         context.write(key, result); 
  42.     } 
  43.  
  44.  
  45. import org.apache.hadoop.conf.Configuration; 
  46. import org.apache.hadoop.fs.Path; 
  47. import org.apache.hadoop.io.IntWritable; 
  48. import org.apache.hadoop.io.Text; 
  49. import org.apache.hadoop.mapreduce.Job; 
  50. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 
  51. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 
  52. import org.apache.hadoop.util.GenericOptionsParser; 
  53.  
  54. public class WordCount { 
  55.  
  56.       public static void main(String[] args) throws Exception { 
  57.             Configuration conf = new Configuration(); 
  58. //          String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 
  59. //          if (otherArgs.length != 2) { 
  60. //            System.err.println("Usage: wordcount <in> <out>"); 
  61. //            System.exit(2); 
  62. //          } 
  63.             Job job = new Job(conf, "word count"); 
  64.             job.setJarByClass(WordCount.class); 
  65.             job.setMapperClass(TokenizerMapper.class); 
  66.             job.setCombinerClass(IntSumReducer.class); 
  67.             job.setReducerClass(IntSumReducer.class); 
  68.             job.setOutputKeyClass(Text.class); 
  69.             job.setOutputValueClass(IntWritable.class); 
  70.             FileInputFormat.addInputPath(job, new Path("/tmp/input")); 
  71.             FileOutputFormat.setOutputPath(job, new Path("/tmp/output")); 
  72.             System.exit(job.waitForCompletion(true) ? 0 : 1); 
  73.           } 
  74.  

將項目export成爲jar包,注意選擇運行類爲WordCount

在hadoop機器上:

[admin@host WordCount]$ vim input1.txt
Hello, i love china
are you ok
?
[admin@host WordCount]$ vim input2.txt
hello, i love word
You are ok

  在hadoop上新建目錄,和put程序運行所需要的輸入文件:

hadoop fs -mkdir /tmp/input
hadoop fs -put input1.txt /tmp/input/
hadoop fs -put input2.txt /tmp/input/

注意:不要預先建立 /tmp/output 目錄 — MapReduce 的 FileOutputFormat 要求輸出目錄在作業執行前不存在,否則作業會直接失敗。
 
執行:
hadoop jar wordcount.jar WordCount
 
查看效果:
hadoop fs -ls /tmp/output/
hadoop fs -cat /tmp/output/part-r-00000

OK!

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章