數據算法-hadoop3 TopN

TopN 的做法到處都一樣:先在每個 map 端各自取出 TopN,再在 reduce 端把各個 map 的 TopN 合併,取出最終的 TopN。

map 時如果把傳入的 value 直接存進 top10cast,之後裡面所有的 value 都會變成最新的一條記錄。原因是 Hadoop 會重複使用同一個 Text 物件(引用問題),之前從來沒有注意到。所以存入之前必須先 toString(),再用 new Text(...) 複製一份。
reduce 時也一樣,從 values 取出的 Text 同樣要先複製再保存。

    /**
     * Configures and launches the "TopN" MapReduce job, then logs whether it
     * completed successfully.
     *
     * <p>The job reads a fixed local input file, keeps the 10 highest-weight
     * records (the {@code "N"} setting read by the mapper and reducer), and
     * funnels all mapper output through a single reducer so that the final
     * result is a global top-N.
     *
     * @param args command-line arguments; not used
     * @throws Exception if the job cannot be configured or executed
     */
    public static void main(String[] args) throws Exception {
        Configuration jobConf = new Configuration();
        System.setProperty("hadoop.home.dir", "D:\\hadoop-2.5.2");

        // Job-wide settings: how many records to keep, and relaxed DFS permissions.
        jobConf.setInt("N", 10);
        jobConf.setBoolean("dfs.permissions", false);

        Job topNJob = Job.getInstance(jobConf, "TopN");
        topNJob.setMapperClass(TopNMapper.class);
        topNJob.setReducerClass(TopNReducer.class);
        topNJob.setOutputKeyClass(NullWritable.class);
        topNJob.setOutputValueClass(Text.class);

        // A single reducer is required so every mapper's candidates are merged together.
        topNJob.setNumReduceTasks(1);

        Path inputFile = new Path("C:\\demo\\03\\03.txt");
        Path outputDir = new Path("C:\\demo\\03\\out");
        FileInputFormat.setInputPaths(topNJob, inputFile);
        FileOutputFormat.setOutputPath(topNJob, outputDir);

        boolean succeeded = topNJob.waitForCompletion(true);
        if (!succeeded) {
            log.error("MR run failed");
            return;
        }
        log.info("MR run successfully");
    }
/**
 * Map side of the Top-N job.
 *
 * <p>Each input line is expected to look like {@code weight,payload}, where
 * {@code weight} parses as a double. The mapper keeps only the N
 * highest-weight lines it has seen and emits them from {@link #cleanup},
 * all under a {@link NullWritable} key.
 *
 * <p>Records are grouped by weight, so lines that share the same weight are
 * all retained instead of overwriting one another.
 */
public class TopNMapper extends Mapper<Object, Text, NullWritable, Text> {
    /** Retained records grouped by weight, ascending; tied weights share one bucket. */
    private final SortedMap<Double, java.util.List<Text>> top10cast =
            new TreeMap<Double, java.util.List<Text>>();
    /** Total number of records currently held across all buckets. */
    private int retained = 0;
    /** How many records to keep; overridden by the "N" job setting in {@link #setup}. */
    private int N = 10;

    /**
     * Reads the number of records to keep from the job configuration.
     *
     * @param context task context providing the job configuration
     */
    @Override
    protected void setup(
            Mapper<Object, Text, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        this.N = context.getConfiguration().getInt("N", 10);
    }

    /**
     * Adds one input line to the running top-N, evicting a lowest-weight
     * record once more than N are held. Blank lines are skipped.
     *
     * @param key     input key; not used
     * @param value   the input line, {@code weight,payload}
     * @param context task context; nothing is written here
     * @throws NumberFormatException if the first field is not a valid double
     */
    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        if (line.trim().isEmpty()) {
            // A blank line carries no record; skip it rather than fail the task.
            return;
        }
        // Only the first field is needed, so stop splitting after the first comma.
        double weight = Double.parseDouble(line.split(",", 2)[0]);

        java.util.List<Text> bucket = top10cast.get(weight);
        if (bucket == null) {
            bucket = new java.util.ArrayList<Text>();
            top10cast.put(weight, bucket);
        }
        // Store a copy: the framework-supplied Text must not be retained as-is.
        bucket.add(new Text(line));
        retained++;

        if (retained > N) {
            evictSmallest();
        }
    }

    /** Drops one record from the lowest-weight bucket (the most recently added among ties). */
    private void evictSmallest() {
        Double lowest = top10cast.firstKey();
        java.util.List<Text> lowestBucket = top10cast.get(lowest);
        lowestBucket.remove(lowestBucket.size() - 1);
        if (lowestBucket.isEmpty()) {
            top10cast.remove(lowest);
        }
        retained--;
    }

    /**
     * Emits the retained records in ascending weight order.
     *
     * @param context task context the records are written to
     */
    @Override
    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        for (java.util.List<Text> bucket : top10cast.values()) {
            for (Text catAttributes : bucket) {
                context.write(NullWritable.get(), catAttributes);
            }
        }
    }
}
/**
 * Reduce side of the Top-N job.
 *
 * <p>Receives the candidate records emitted by the mappers (all under the
 * same {@link NullWritable} key), keeps the N with the highest weight, and
 * writes them out in ascending weight order. Each record is expected to look
 * like {@code weight,payload}.
 *
 * <p>Records are grouped by weight, so candidates that share the same weight
 * are all considered instead of overwriting one another.
 */
public class TopNReducer extends Reducer<NullWritable, Text, NullWritable, Text> {

    /** How many records to keep; overridden by the "N" job setting in {@link #setup}. */
    private int N = 10;

    /**
     * Reads the number of records to keep from the job configuration.
     *
     * @param context task context providing the job configuration
     */
    @Override
    protected void setup(
            Reducer<NullWritable, Text, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        this.N = context.getConfiguration().getInt("N", 10);
    }

    /**
     * Selects the N highest-weight records from {@code values} and writes
     * them in ascending weight order.
     *
     * @param key     the shared null key; not used
     * @param values  candidate records, each {@code weight,payload}
     * @param context task context the final records are written to
     * @throws NumberFormatException if a record's first field is not a valid double
     */
    @Override
    protected void reduce(NullWritable key, Iterable<Text> values,
            Context context) throws IOException, InterruptedException {
        // Candidates grouped by weight, ascending; tied weights share one bucket.
        SortedMap<Double, java.util.List<Text>> finaltop10cast =
                new TreeMap<Double, java.util.List<Text>>();
        int retained = 0;

        for (Text catRecord : values) {
            String line = catRecord.toString();
            // Only the first field is needed, so stop splitting after the first comma.
            double weight = Double.parseDouble(line.split(",", 2)[0]);

            java.util.List<Text> bucket = finaltop10cast.get(weight);
            if (bucket == null) {
                bucket = new java.util.ArrayList<Text>();
                finaltop10cast.put(weight, bucket);
            }
            // Store a copy: the Text handed out by the values iterator must not be retained as-is.
            bucket.add(new Text(line));
            retained++;

            if (retained > N) {
                // Over capacity: drop one record from the lowest-weight bucket.
                Double lowest = finaltop10cast.firstKey();
                java.util.List<Text> lowestBucket = finaltop10cast.get(lowest);
                lowestBucket.remove(lowestBucket.size() - 1);
                if (lowestBucket.isEmpty()) {
                    finaltop10cast.remove(lowest);
                }
                retained--;
            }
        }

        for (java.util.List<Text> bucket : finaltop10cast.values()) {
            for (Text text : bucket) {
                context.write(NullWritable.get(), text);
            }
        }
    }
}

輸入

12,cat1
13,cat2
14,cat3
15,cat4
10,cat5
100,cat100
200,cat200
300,cat300
1,cat001
67,cat67
22,cat22
23,cat23
1000,cat1000
2000,cat2000

輸出

14,cat3
15,cat4
22,cat22
23,cat23
67,cat67
100,cat100
200,cat200
300,cat300
1000,cat1000
2000,cat2000
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章