spark對分組後value值進行排序(JAVA)

Maven 依賴:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>1.6.0</version>
</dependency>
groupsort.txt:

spark 100
storm 90
kafka 75
hadoop 60
zookeeper 100
impala 80
hbase 65
hive 90
flume 95
elasticsearch 100
spark 80
storm 70
kafka 80
hadoop 75
zookeeper 90
impala 100
hbase 30
hive 70
flume 80
elasticsearch 90
spark 56
storm 88
kafka 44
hadoop 33
zookeeper 99
impala 88
hbase 63
hive 45
flume 89
elasticsearch 79

public class GroupSort {
    public static void main(String[] args) {
        /**
         * Create the SparkConf object holding runtime configuration.
         * setMaster("local") runs Spark locally inside this JVM;
         * in a cluster deployment this would be the master URL.
         */
        SparkConf conf = new SparkConf().setAppName("My first spark").setMaster("local");
        /**
         * JavaSparkContext is the single entry point for all Spark
         * functionality: it initializes the core runtime components and
         * registers this application with the master.
         */
        JavaSparkContext sc = new JavaSparkContext(conf);
        //sc.setLogLevel("OFF");
        /**
         * Build the source RDD from the text file; each element is one line
         * of the form "<name> <score>".
         */
        JavaRDD<String> lines = sc.textFile("E:/groupsort.txt");

        // Parse each line into a (name, score) pair.
        JavaPairRDD<String, Integer> pairs = lines.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String line) throws Exception {
                String[] split = line.split(" ");
                return new Tuple2<String, Integer>(split[0], Integer.parseInt(split[1]));
            }
        });
        /**
         * Group all scores by name: (name, [score, score, ...]).
         */
        JavaPairRDD<String, Iterable<Integer>> groups = pairs.groupByKey();
        /**
         * Sort each group's scores in descending order. The values are copied
         * into a List first because the grouped Iterable is not sortable
         * in place.
         */
        JavaPairRDD<String, Iterable<Integer>> groupsSort = groups.mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
            public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> groupData) throws Exception {
                List<Integer> integers = new ArrayList<Integer>();
                String name = groupData._1;
                Iterator<Integer> it = groupData._2.iterator();
                while (it.hasNext()) {
                    integers.add(it.next());
                }
                integers.sort(new Comparator<Integer>() {
                    public int compare(Integer o1, Integer o2) {
                        // Use Integer.compare instead of "o2 - o1": the
                        // subtraction overflows for large-magnitude values
                        // and would produce an inconsistent ordering.
                        return Integer.compare(o2, o1);
                    }
                });
                return new Tuple2<String, Iterable<Integer>>(name, integers);
            }
        });
        /**
         * Print each (name, sorted scores) pair.
         */
        groupsSort.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            public void call(Tuple2<String, Iterable<Integer>> data) throws Exception {
                System.out.println(data._1+"  "+data._2);
            }
        });
        /**
         * Release all Spark resources.
         */
        sc.stop();
    }
}
運行結果:

spark  [100, 80, 56]
hive  [90, 70, 45]
hadoop  [75, 60, 33]
flume  [95, 89, 80]
zookeeper  [100, 99, 90]
impala  [100, 88, 80]
storm  [90, 88, 70]
elasticsearch  [100, 90, 79]
kafka  [80, 75, 44]
hbase  [65, 63, 30]

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章