spark: rdd的應用(java api)

  1. wordcount: 統計詞頻, 排序 (jdk7, jdk8-lambda表達式)
  2. 歷年溫度最值: max, min, avg
<!-- Maven dependency: org.apache.spark:spark-core_2.11, version 2.1.0 -->
	<dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>

jdk8 : lambda表達式

在 IDEA 中為 Maven 添加 JDK 8 編譯配置 (maven-compiler-plugin)

     <!-- Build configuration: set maven-compiler-plugin source and target levels to Java 8
          (this article uses JDK 8 lambda expressions). -->
     <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
  /**
   * Word count over a local text file, written with Java 8 lambda expressions.
   *
   * <p>Reads the input file, splits each line on single spaces, drops blank tokens,
   * counts the occurrences of every word, and collects the result twice: once ordered
   * by word (ascending) and once ordered by frequency (descending).
   *
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args){
        // conf: run locally under the application name "wc"
        SparkConf conf = new SparkConf();
        conf.setMaster("local");
        conf.setAppName("wc");

        // context: try-with-resources makes sure the context is closed even on failure
        try (JavaSparkContext context = new JavaSparkContext(conf)) {
            JavaRDD<String> rdd1 = context.textFile("/home/wang/txt/word.txt");

            // line -> words; blank tokens (e.g. from consecutive spaces or empty lines)
            // are removed so they are not counted as a word
            JavaRDD<String> rdd2 = rdd1
                    .flatMap(s -> Arrays.asList(s.split(" ")).iterator())
                    .filter(s -> !s.trim().isEmpty());
            // word -> (word, 1)
            JavaPairRDD<String, Integer> rdd3 = rdd2.mapToPair(s -> new Tuple2<>(s, 1));
            // (word, 1)... -> (word, count)
            JavaPairRDD<String, Integer> rdd4 = rdd3.reduceByKey((x, y) -> x + y);

            // sorted by word, ascending
            List<Tuple2<String, Integer>> list1 = rdd4.sortByKey(true).collect();

            // sorted by frequency, descending: swap to (count, word), sort on the count
            // with ascending=false, then swap back to (word, count)
            JavaPairRDD<String, Integer> rdd5 = rdd4
                    .mapToPair(x -> new Tuple2<Integer, String>(x._2, x._1))
                    .sortByKey(false)
                    .mapToPair(x -> new Tuple2<String, Integer>(x._2, x._1));
            List<Tuple2<String, Integer>> list2 = rdd5.collect();
        }
    }

jdk7 : spark api

/**
 * Word count written against the Spark Java API with JDK 7 style anonymous classes.
 *
 * <p>Each transformation is declared as a named function object first and then applied,
 * so the pipeline at the bottom of each step reads top to bottom.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args){
        // conf, context ... same as in the lambda version above; rdd1 holds the input lines

        // 1. "a b c" ==> split(" ") ==> a, b, c
        FlatMapFunction<String, String> splitOnSpace = new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        };
        JavaRDD<String> words = rdd1.flatMap(splitOnSpace);

        // 2. a, b, c ==> (a,1), (b,1), (c,1); pairs whose word is blank are dropped
        PairFunction<String, String, Integer> toOne = new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        };
        Function<Tuple2<String, Integer>, Boolean> hasText = new Function<Tuple2<String, Integer>, Boolean>() {
            @Override
            public Boolean call(Tuple2<String, Integer> pair) throws Exception {
                return !pair._1.trim().isEmpty();
            }
        };
        JavaPairRDD<String, Integer> ones = words.mapToPair(toOne).filter(hasText);

        // 3. (a,1), (b,1), ... ==> reduceByKey ==> (a,4), (b,3)
        Function2<Integer, Integer, Integer> add = new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer left, Integer right) throws Exception {
                return left + right;
            }
        };
        JavaPairRDD<String, Integer> counts = ones.reduceByKey(add);

        // 4.1 sort by word, ascending
        List<Tuple2<String, Integer>> list1 = counts.sortByKey().collect();

        // 4.2 sort by frequency, descending: swap to (count, word), sort, swap back
        PairFunction<Tuple2<String, Integer>, Integer, String> countFirst =
                new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> pair) throws Exception {
                return new Tuple2<Integer, String>(pair._2, pair._1);
            }
        };
        PairFunction<Tuple2<Integer, String>, String, Integer> wordFirst =
                new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> pair) throws Exception {
                return new Tuple2<String, Integer>(pair._2, pair._1);
            }
        };
        JavaPairRDD<Integer, String> byCount = counts.mapToPair(countFirst).sortByKey(false);
        List<Tuple2<String, Integer>> list2 = byCount.mapToPair(wordFirst).collect();
    }
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章