MapReduce統計詞頻demo

目錄結構

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Maven build descriptor for the WordCount demo: declares the project coordinates,
         compiler settings, the Hadoop dependencies and one extra repository. -->
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.zdy</groupId>
    <artifactId>WordCount</artifactId>
    <version>1.0-SNAPSHOT</version>


    <properties>
        <hadoop.version>2.8.5</hadoop.version>
        <hadoop-core.version>1.2.1</hadoop-core.version>
        <!-- Compile for JDK 1.8 and read source files as UTF-8 -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>
    <!-- Hadoop dependencies -->
    <!-- NOTE(review): hadoop-core is a 1.x artifact while hadoop-common is 2.x; mixing the two
         lines on one classpath looks unintended. Confirm whether the 2.x
         hadoop-mapreduce-client-* artifacts should replace hadoop-core. -->
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>${hadoop-core.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>
    <!-- Additional Apache repository -->
    <!-- NOTE(review): this URL is plain HTTP and appears to be the Maven project website rather
         than an artifact repository; verify it is actually needed. -->
    <repositories>
        <repository>
            <id>apache</id>
            <url>http://maven.apache.org</url>
        </repository>
    </repositories>

</project>

WordCount.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

public class WordCount {

    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        String regEx="[\n`~!@#$%^&*()+=|{}':;'\",\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?]";

        public void map(Object key, Text value, Context context
        ) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                String temp = word.toString().replaceAll(regEx,"");//去除標點符號
                Text new_word= new Text(temp.toLowerCase());//忽略大小寫
                context.write(new_word, one);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context
        ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }


    public static void getWordCount(String index) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("./input/file"+index+".txt"));
        FileOutputFormat.setOutputPath(job, new Path("./output/file"+index+"/"));
        //System.exit(job.waitForCompletion(true) ? 0 : 1);
        job.waitForCompletion(true);
    }
}

bow.java

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Builds a bag-of-words vector per input file: runs the word-count job for files 01..10, looks
 * up the count of each of 100 fixed words in every job output and writes one line per file to
 * {@code ./output/bow_result.txt}.
 */
public class bow {

    private final static String[] top100Word = { "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
            "it", "for", "not", "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from", "they",
            "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would", "there", "their", "what", "so",
            "up", "out", "if", "about", "who", "get", "which", "go", "me", "when", "make", "can", "like", "time", "no",
            "just", "him", "know", "take", "people", "into", "year", "your", "good", "some", "could", "them", "see",
            "other", "than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back", "after",
            "use", "two", "how", "our", "work", "first", "well", "way", "even", "new", "want", "because", "any",
            "these", "give", "day", "most", "us" };

    /**
     * Entry point. Exits immediately when the result file already exists; otherwise runs the
     * ten word-count jobs, builds the ten vectors and writes them out.
     *
     * @param args unused
     * @throws Exception if a job fails or a file cannot be read or written
     */
    public static void main(String[] args) throws Exception {
        File resultFile = new File("./output/bow_result.txt");
        if (resultFile.exists()) {
            System.out.println("bow_result already exists, see output");
            System.exit(0);
        }

        // Count word frequencies with MapReduce, one job per input file.
        for (int i = 1; i <= 10; i++) {
            WordCount.getWordCount(String.format("%02d", i));
        }

        // Pick the counts of the fixed word list out of every job output.
        List<StringBuffer> vectors = new ArrayList<>();
        for (int i = 1; i <= 10; i++) {
            vectors.add(generateVector(String.format("%02d", i)));
        }

        // try-with-resources closes (and flushes) the writer even if a write fails.
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(resultFile), StandardCharsets.UTF_8))) {
            for (StringBuffer vector : vectors) {
                bw.write(vector.toString());
                bw.newLine();
            }
        }
    }

    /**
     * Reads {@code ./output/file<index>/part-r-00000} and returns the line
     * {@code file<index>.txt<TAB>c1,c2,...,c100}, where {@code cN} is the count of the N-th
     * word of the fixed list, or 0 when the word does not occur. The line is also printed to
     * standard output.
     *
     * @param index the file number as it appears in the file name (e.g. {@code "01"})
     * @return the formatted vector line
     * @throws IOException if the job output cannot be read
     */
    public static StringBuffer generateVector(String index) throws IOException {
        HashMap<String, Integer> counts = new HashMap<>();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File("./output/file" + index + "/part-r-00000")),
                StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Each line of the job output is "word<TAB>count".
                String[] fields = line.split("\t");
                counts.put(fields[0], Integer.parseInt(fields[1]));
            }
        }

        StringBuffer sb = new StringBuffer("file" + index + ".txt\t");
        for (int i = 0; i < top100Word.length; i++) {
            if (i > 0) {
                sb.append(',');
            }
            sb.append(counts.getOrDefault(top100Word[i], 0));
        }
        System.out.println(sb);
        return sb;
    }
}

 

file01.txt	525,62,269,276,288,253,173,127,79,298,163,57,69,31,84,123,74,151,29,67,36,61,122,38,51,16,37,7,48,71,23,38,38,98,30,33,22,35,9,34,29,33,29,27,11,30,4,62,4,55,35,8,10,6,10,17,15,27,22,7,4,33,2,50,10,17,27,10,17,14,13,37,19,9,9,13,7,8,11,5,9,7,5,11,23,13,5,3,12,6,7,4,2,5,6,4,4,5,15,8
file02.txt	543,64,284,250,340,275,180,184,89,298,168,104,69,51,88,187,102,150,33,74,44,73,111,24,43,34,76,12,14,14,27,39,24,71,31,47,51,51,10,50,49,32,32,24,25,24,6,61,11,71,38,11,17,12,19,32,22,48,20,8,0,20,3,49,15,23,32,22,27,20,23,41,15,6,18,23,16,24,9,4,10,19,6,20,11,26,8,18,30,12,5,6,5,2,19,5,6,13,13,30
file03.txt	609,57,275,290,320,268,148,212,94,265,196,72,90,41,86,177,101,149,36,86,48,69,158,34,54,24,43,12,54,41,25,30,24,58,38,32,38,45,6,42,49,22,32,35,21,25,8,78,14,50,34,1,10,8,16,46,8,65,21,6,4,20,2,35,5,36,33,15,27,26,14,36,19,6,23,15,2,18,20,8,17,7,3,17,15,12,7,7,13,14,8,2,1,2,10,9,4,4,6,19
file04.txt	658,64,267,325,344,245,216,168,102,292,169,62,59,37,84,177,95,127,34,75,52,60,153,39,73,22,44,5,20,20,29,33,31,112,39,43,31,58,19,39,42,32,31,22,16,27,3,100,8,61,32,6,22,12,20,44,5,51,13,12,1,14,5,25,4,38,27,18,23,8,18,31,26,2,10,32,17,19,16,4,13,8,4,15,14,18,3,13,23,14,9,4,4,2,9,11,10,14,6,25
file05.txt	600,59,250,298,283,306,186,170,103,245,161,67,54,43,88,180,92,132,19,63,52,69,136,43,52,11,41,6,33,30,11,35,18,101,41,32,26,58,10,34,37,32,38,27,15,28,10,85,7,43,29,6,29,5,10,57,9,57,19,7,0,29,0,24,10,29,25,7,26,15,18,35,27,7,13,14,14,13,13,10,17,7,3,15,11,8,2,5,22,13,7,1,3,1,13,8,4,17,7,14
file06.txt	581,51,260,288,298,264,182,157,87,276,155,87,46,33,85,114,68,177,21,81,63,68,103,44,46,12,65,9,55,47,13,26,31,123,43,34,34,47,10,40,41,27,35,23,21,34,11,92,11,69,21,4,19,10,18,38,18,36,23,11,5,44,3,43,12,22,18,14,28,14,14,42,38,5,16,10,20,21,18,4,16,5,2,14,12,33,3,6,28,12,5,3,4,2,12,9,8,6,5,19
file07.txt	635,56,267,286,328,308,168,186,74,352,201,65,49,41,76,120,78,128,22,96,47,60,88,31,64,17,95,9,36,30,15,35,19,130,44,41,35,57,6,33,49,38,37,23,14,19,6,101,14,70,35,7,15,13,23,33,11,21,9,9,2,33,0,44,15,27,36,8,26,13,14,38,20,7,17,21,18,18,17,6,15,9,5,21,16,30,7,5,10,11,11,5,4,1,17,11,4,4,6,26
file08.txt	531,73,282,242,290,268,183,185,99,288,154,74,54,40,66,107,81,135,19,66,59,58,92,36,44,25,50,15,73,64,13,39,31,87,39,48,22,53,7,35,50,28,27,19,16,23,5,53,6,66,36,12,14,11,19,46,19,37,12,8,2,26,4,30,14,26,16,19,17,13,13,44,20,6,18,11,4,25,26,3,16,17,1,6,14,25,4,12,16,13,4,3,2,4,11,9,7,9,8,14
file09.txt	530,77,317,221,274,252,181,199,132,404,197,86,85,33,103,197,78,198,42,83,47,78,142,30,33,16,24,16,46,62,20,32,29,122,31,42,35,44,5,48,58,30,33,33,15,39,7,81,12,86,42,11,17,8,8,35,7,64,13,7,2,36,1,59,10,14,42,16,22,17,22,35,30,10,21,8,6,18,28,5,12,9,1,11,14,29,3,8,12,10,12,1,1,3,18,7,7,2,10,10
file10.txt	575,82,283,289,301,239,196,161,68,276,143,75,80,41,112,82,91,195,44,88,81,56,54,46,50,25,56,3,50,47,97,30,24,104,34,51,33,54,15,32,43,30,25,46,20,25,2,61,11,63,21,9,25,9,7,48,11,27,18,4,7,20,5,45,11,13,30,17,18,21,10,26,17,15,17,15,18,13,16,4,13,11,13,13,10,15,45,9,19,11,4,2,2,2,49,10,9,11,15,20

 dist.java

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * Reports the distribution of words starting with "ex": one line per word-count output
 * (files 01..10) plus one line for the total over all files, written to
 * {@code ./output/dist_result.txt}.
 */
public class dist {

    /** Number of word-count outputs ({@code file01} .. {@code file10}) that are processed. */
    private static final int FILE_COUNT = 10;

    /**
     * Entry point. Exits immediately when the result file already exists; otherwise builds the
     * per-file lines and the total line and writes them out.
     *
     * @param args unused
     * @throws IOException if a job output cannot be read or the result cannot be written
     */
    public static void main(String[] args) throws IOException {
        File resultFile = new File("./output/dist_result.txt");
        if (resultFile.exists()) {
            System.out.println("dist_result already exists, see output");
            System.exit(0);
        }

        List<StringBuffer> lines = new ArrayList<>();
        for (int i = 1; i <= FILE_COUNT; i++) {
            lines.add(generateVector(String.format("%02d", i)));
        }
        lines.add(generateTotalVector());

        // try-with-resources closes (and flushes) the writer even if a write fails.
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(resultFile), StandardCharsets.UTF_8))) {
            for (StringBuffer line : lines) {
                bw.write(line.toString());
                bw.newLine();
            }
        }
    }

    /**
     * Reads {@code ./output/file<index>/part-r-00000} and returns the line
     * {@code file<index>.txt<TAB>word,count,word,count,...} covering every word that starts
     * with "ex", ordered by descending count and then alphabetically. When no such word exists
     * the line is just the label followed by the TAB. The line is also printed to standard
     * output.
     *
     * @param index the file number as it appears in the file name (e.g. {@code "01"})
     * @return the formatted line
     * @throws IOException if the job output cannot be read
     */
    public static StringBuffer generateVector(String index) throws IOException {
        Map<String, Integer> counts = new HashMap<>();
        addCounts(index, counts);
        return formatLine("file" + index + ".txt", counts);
    }

    /**
     * Sums the counts of all words starting with "ex" over the outputs of files 01..10 and
     * returns the line {@code total<TAB>word,count,...} in the same order and format as
     * {@link #generateVector(String)}. The line is also printed to standard output.
     *
     * @return the formatted total line
     * @throws IOException if any job output cannot be read
     */
    public static StringBuffer generateTotalVector() throws IOException {
        Map<String, Integer> counts = new HashMap<>();
        for (int i = 1; i <= FILE_COUNT; i++) {
            addCounts(String.format("%02d", i), counts);
        }
        return formatLine("total", counts);
    }

    /** Adds the counts of all "ex" words found in one job output to {@code counts}. */
    private static void addCounts(String index, Map<String, Integer> counts) throws IOException {
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File("./output/file" + index + "/part-r-00000")),
                StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Each line of the job output is "word<TAB>count".
                String[] fields = line.split("\t");
                if (fields[0].startsWith("ex")) {
                    counts.merge(fields[0], Integer.parseInt(fields[1]), Integer::sum);
                }
            }
        }
    }

    /** Sorts the counts (descending count, then ascending word), formats and prints the line. */
    private static StringBuffer formatLine(String label, Map<String, Integer> counts) {
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(counts.entrySet());
        entries.sort((a, b) -> {
            int byCount = b.getValue().compareTo(a.getValue());
            return byCount != 0 ? byCount : a.getKey().compareTo(b.getKey());
        });

        // Joining instead of "append then delete the last char" keeps the TAB after the label
        // intact when there are no entries.
        StringJoiner joined = new StringJoiner(",");
        for (Map.Entry<String, Integer> entry : entries) {
            joined.add(entry.getKey()).add(String.valueOf(entry.getValue()));
        }

        StringBuffer sb = new StringBuffer(label).append('\t').append(joined.toString());
        System.out.println(sb);
        return sb;
    }
}
file01.txt	extraordinary,3,examined,2,example,2,excellent,2,excuse,2,expected,2,expression,2,extreme,2,exactly,1,exaggerated,1,exalted,1,except,1,exchange,1,excitedly,1,excuses,1,expenses,1,experiences,1,explain,1,explained,1,explaining,1,expostulating,1,extended,1,extending,1
file02.txt	experience,4,excuse,3,expected,3,extraordinary,3,extreme,3,example,2,exceedingly,2,existence,2,expect,2,expedition,2,exact,1,exactness,1,examine,1,except,1,excitement,1,exciting,1,exclaimed,1,expectancy,1,expenditure,1,expense,1,expensive,1,experienced,1,explain,1,explained,1,explanation,1,explore,1,expression,1,extra,1,extremely,1
file03.txt	extraordinary,3,example,2,except,2,excited,2,explain,2,ex-australian,1,exact,1,exacted,1,exactly,1,examination,1,examined,1,examining,1,exceedingly,1,excitable,1,excitement,1,expected,1,expense,1,experience,1,expired,1,expiring,1,explanation,1,expressed,1,extent,1,extremely,1
file04.txt	examination,2,exclaimed,2,excuse,2,expected,2,extreme,2,ex-confederate,1,examined,1,excellent,1,except,1,exception,1,exceptional,1,exchange,1,exercise,1,exhibited,1,expect,1,expedition,1,experience,1,explanations,1,exposed,1,expound,1,expression,1,extending,1,extremely,1,extremity,1
file05.txt	examination,4,exceedingly,4,excellent,3,experience,3,examined,2,exceptional,2,explain,2,exposure,2,extremely,2,exactly,1,example,1,exception,1,excited,1,exclamation,1,execution,1,existence,1,exit,1,expected,1,expecting,1,expensive,1,explained,1,exposed,1
file06.txt	excellent,4,examined,2,excitement,2,excuse,2,extended,2,exact,1,examine,1,exchanging,1,excursion,1,existence,1,expect,1,expected,1,expenses,1,explain,1,explained,1,exposure,1,expression,1,expressly,1,extent,1
file07.txt	examine,4,examined,4,examining,4,experience,4,example,3,excuse,3,explain,3,exactly,2,examination,2,exchanged,2,expect,2,explained,2,exact,1,exacted,1,excavating,1,exceeding,1,exceedingly,1,excellent,1,exceptionally,1,exclamation,1,exercising,1,expected,1,experiences,1,explaining,1,explanation,1,expression,1,extinguished,1,extraordinary,1,extreme,1
file08.txt	explain,4,example,2,exceedingly,2,excitement,2,explained,2,explanation,2,exactly,1,exalted,1,examination,1,excellent,1,except,1,excited,1,exclude,1,exclusion,1,excuse,1,existence,1,expect,1,expectancies,1,expected,1,expensive,1,experience,1,explains,1,expression,1,expressions,1,expressive,1,exquisite,1,extend,1,extended,1,extra,1,extracts,1,extraordinary,1,extremely,1
file09.txt	extraordinary,4,examined,3,excellent,3,experience,3,excuse,2,expedition,2,expense,2,explain,2,explanation,2,exactly,1,exalted,1,examination,1,exceeded,1,exceedingly,1,exceptionally,1,excessive,1,excluded,1,exercise,1,expensive,1,explained,1,exposed,1,exposure,1,expressive,1,extinguishes,1,extreme,1
file10.txt	explanation,4,extraordinary,4,excellent,3,except,3,expenses,3,experience,3,exempt,2,expense,2,exacting,1,exactly,1,exaggerated,1,examined,1,examining,1,example,1,exceptional,1,exclusion,1,executive,1,exert,1,exhilarating,1,exists,1,expect,1,expected,1,expend,1,explanations,1,exporting,1,express,1,expressed,1,expression,1,extent,1,extremely,1
total	experience,20,extraordinary,19,excellent,18,examined,16,explain,16,excuse,15,example,13,expected,13,examination,11,exceedingly,11,explanation,11,except,9,explained,9,extreme,9,exactly,8,expect,8,expression,8,extremely,7,examine,6,examining,6,excitement,6,expense,6,existence,5,expedition,5,expenses,5,exact,4,exceptional,4,excited,4,expensive,4,exposure,4,extended,4,exalted,3,exclaimed,3,exposed,3,extent,3,exacted,2,exaggerated,2,exception,2,exceptionally,2,exchange,2,exchanged,2,exclamation,2,exclusion,2,exempt,2,exercise,2,experiences,2,explaining,2,explanations,2,expressed,2,expressive,2,extending,2,extra,2,ex-australian,1,ex-confederate,1,exacting,1,exactness,1,excavating,1,exceeded,1,exceeding,1,excessive,1,exchanging,1,excitable,1,excitedly,1,exciting,1,exclude,1,excluded,1,excursion,1,excuses,1,execution,1,executive,1,exercising,1,exert,1,exhibited,1,exhilarating,1,exists,1,exit,1,expectancies,1,expectancy,1,expecting,1,expend,1,expenditure,1,experienced,1,expired,1,expiring,1,explains,1,explore,1,exporting,1,expostulating,1,expound,1,express,1,expressions,1,expressly,1,exquisite,1,extend,1,extinguished,1,extinguishes,1,extracts,1,extremity,1

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章