Hadoop官方案例WordCount簡單實現

Hadoop官方案例WordCount簡單實現

前提準備

創建Maven工程並導入以下依賴。注意:請將Hadoop相關依賴的版本修改為與集羣上的Hadoop版本一致。

<!-- Log4j 2 core logging library. -->
<!-- NOTE(review): log4j-core 2.9.1 appears to fall in the version range reported as
     affected by CVE-2021-44228 (Log4Shell); confirm and consider a patched release. -->
<dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.9.1</version>
    </dependency>
    <!-- Hadoop artifacts: keep all three on the same version, and make that version
         match the Hadoop version running on the target cluster. -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.2</version>
    </dependency>

自定義Mapper類——MyMapper

package mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper for the word-count job.
 *
 * <p>Treats each input value as one line of text, splits it into words and
 * emits a {@code (word, 1)} pair for every word found. The input key is not used.
 */
public class MyMapper extends Mapper<LongWritable,Text,Text,LongWritable> {
    /** Constant count of one, emitted for every word occurrence. */
    private final static LongWritable one = new LongWritable(1);
    /** Output key instance reused across calls instead of allocating one per word. */
    private final Text word = new Text();

    /**
     * Splits one line on runs of whitespace and writes {@code (token, 1)} for each
     * non-empty token.
     *
     * @param key     input key supplied by the framework; ignored
     * @param value   the line of text to tokenize
     * @param context sink that receives the emitted {@code (word, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split on any run of whitespace (spaces, tabs, ...) rather than on a single
        // space, so consecutive separators do not produce empty "words".
        String[] tokens = value.toString().split("\\s+");
        for (String token : tokens) {
            // A blank line or leading whitespace still yields one empty token; it is
            // not a word and must not be counted.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, one);
        }
    }
}

自定義Reduce類——MyReduce

package mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer for the word-count job.
 *
 * <p>Adds up every count received for a key and emits the key together with
 * the resulting total.
 */
public class MyReduce extends Reducer<Text,LongWritable,Text,LongWritable>{
    /** Output value instance reused across calls; holds the total for the current key. */
    private LongWritable total = new LongWritable();

    /**
     * Sums the values grouped under {@code key} and writes {@code (key, sum)}.
     *
     * @param key     the word whose counts are being combined
     * @param values  the individual counts received for {@code key}
     * @param context sink that receives the {@code (key, sum)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long accumulated = 0L;
        for (LongWritable count : values) {
            accumulated = accumulated + count.get();
        }

        total.set(accumulated);
        context.write(key, total);
    }
}

自定義Runner類——MyRunner

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * 運行主類
 */
public class MyRunner implements Tool{
    private Configuration conf = null;
    public int run(String[] args) throws Exception {
        //設置配置類和任務名稱
        Job job = Job.getInstance(conf,"myJob");

        //設置運行主類
        job.setJarByClass(MyRunner.class);

        //設置Mapper類
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        //設置Reducer類
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        //設置數據的輸入和輸出地址
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        //表示任務運行狀態
        return job.waitForCompletion(true)?0:1;
    }

    public void setConf(Configuration conf) {
        this.conf=conf;
    }

    public Configuration getConf() {
        return this.conf;
    }

    public static void main(String[] args) throws Exception {
        int state = ToolRunner.run(new MyRunner(), args);
        System.exit(state);
    }
}

運行準備——打jar包

mvn clean package

運行

#在Hadoop的安裝根目錄下運行,並且先把打好的jar包也放到該根目錄下。
$ bin/yarn jar hadoop-hdfs-1.0-SNAPSHOT.jar mapreduce.MyRunner /input /output
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章