單詞統計代碼——用Java結合Hadoop實現,附源碼註釋

單詞統計(WordCount):統計文本中每個單詞出現的次數(詞頻),藉此更好地理解Hadoop MapReduce框架的思想。

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Word-frequency count implemented as a Hadoop MapReduce job.
 *
 * <p>The mapper emits a {@code (word, 1)} pair for every whitespace-separated token of
 * each input line; the reducer adds up the counts for each distinct word.
 *
 * <p>Usage: {@code WordCountApp <input path> <output path>}
 */
public class WordCountApp {

    /**
     * Mapper: receives the input one line at a time and splits each line into words.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        /** Constant count emitted with every word occurrence. */
        private final LongWritable one = new LongWritable(1);

        /**
         * Output key reused across calls to avoid allocating a new object per token
         * (the same reuse pattern as the WordCount example in the Hadoop tutorial).
         */
        private final Text word = new Text();

        /**
         * Emits {@code (token, 1)} for every token in the given line.
         *
         * @param key     input key supplied by the framework; not used here
         * @param value   the text of one input line
         * @param context sink for the emitted key/value pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Tokenize on any run of whitespace (space, tab, newline, CR, form feed).
            // Unlike split(" "), this never yields empty tokens for blank lines or
            // for leading/repeated spaces, so "" is never counted as a word.
            StringTokenizer tokens = new StringTokenizer(value.toString());

            while (tokens.hasMoreTokens()) {
                word.set(tokens.nextToken());
                context.write(word, one);
            }
        }
    }

    /**
     * Reducer: sums the counts emitted for each word. Because it only adds values and
     * its input and output types are identical, it is also used as the job's combiner.
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        /** Output value reused across calls to avoid one allocation per key. */
        private final LongWritable result = new LongWritable();

        /**
         * Writes {@code (key, sum of values)}.
         *
         * @param key     the word
         * @param values  the counts collected for this word
         * @param context sink for the emitted key/value pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                sum += value.get();
            }

            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Configures the word-count job, runs it and exits with status 0 on success or
     * 1 on failure. Exits with status 2 if the required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output directory
     * @throws Exception if the job cannot be configured or run
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WordCountApp <input path> <output path>");
            System.exit(2);
        }

        Configuration configuration = new Configuration();

        Job job = Job.getInstance(configuration, "wordcount");
        // Lets Hadoop locate the jar that contains this class.
        job.setJarByClass(WordCountApp.class);

        // Input location.
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Mapper and its output types.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Pre-aggregate counts before the shuffle; the final result is unchanged
        // because summation is associative and commutative.
        job.setCombinerClass(MyReducer.class);

        // Reducer and the job's final output types.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Output directory for the results.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job, wait for it to finish, and map success/failure to the exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章