Spark

RDD編程模型
在這裏插入圖片描述

在這裏插入圖片描述

在這裏插入圖片描述

RDD運行規劃圖
RDD運行規劃

注意:當以Cluster模式(YARN)啓動Spark程序時,Driver運行在集羣上,但本地的spark-submit客戶端進程默認會一直存活並等待應用結束;需要把這個本地Client進程殺掉,否則會持續佔用本地內存,可能導致本地宕機
(可在spark-submit中加上 --conf spark.yarn.submit.waitAppCompletion=false,使客戶端在提交後立即退出)

operator

在這裏插入圖片描述

aggregateByKey(代碼如下:key只作爲分組標識;累加器用Tuple2提供兩個變量(例如sum和count),供自行設計聚合算法)

public class OptTest {
    public static void main(String[] args) {
        SparkConf sc = new SparkConf().setAppName("").setMaster("local[2]");
        JavaSparkContext jsc = new JavaSparkContext(sc);
        jsc.setLogLevel("WARN");

        JavaRDD<Tuple2<Integer,Integer>> tupleRdd = jsc.parallelize(Arrays.asList(
                /**
                 * 1;9;3
                 * 2;3;1
                 * 3;14;2
                 */
                new Tuple2<Integer,Integer>(1,3),
                new Tuple2<Integer,Integer>(1,2),
                new Tuple2<Integer,Integer>(1,4),
                new Tuple2<Integer,Integer>(2,3),
                new Tuple2<Integer,Integer>(3,6),
                new Tuple2<Integer,Integer>(3,8)
        ));

        JavaPairRDD<Integer,Integer> pairRdd = tupleRdd.mapToPair(tuple->tuple);

        JavaPairRDD<Integer,Tuple2<Integer,Integer>> result =
        pairRdd.aggregateByKey(new Tuple2<Integer, Integer>(0, 0),
                new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> tuple, Integer val2) throws Exception {
                        return new Tuple2<>(tuple._1() + val2, tuple._2() + 1);
                    }
                }, new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> tuple1, Tuple2<Integer, Integer> tuple2) throws Exception {
                        return new Tuple2<>(tuple1._1()+tuple2._1(),tuple1._2()+tuple2._2());
                    }
                }
        );

        result.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Integer, Integer>>>() {
            @Override
            public void call(Tuple2<Integer, Tuple2<Integer, Integer>> res) throws Exception {
                System.out.println("res1:"+res._1+";res21:"+res._2._1+";res22:"+res._2._2);
            }
        });

        //--------------------
        JavaRDD<Tuple2<String,Integer>> msgRDD = jsc.parallelize(Arrays.asList(

                new Tuple2<String,Integer>("msg1",3),
                new Tuple2<String,Integer>("msg1",2),
                new Tuple2<String,Integer>("msg1",4),
                new Tuple2<String,Integer>("msg2",3),
                new Tuple2<String,Integer>("msg3",6),
                new Tuple2<String,Integer>("msg3",9),
                new Tuple2<String,Integer>("msg3",8)
                ...
        ));

        JavaPairRDD<String,Integer> msgPair = msgRDD.mapToPair(tuple->tuple);

        /**msgPair.aggregateByKey()打印結果
         *
         * return new Tuple2<>(tuple1._1+tuple2._1,tuple1._2+tuple2._2)的結果
         * msg1;9;3
         * msg2;3;1
         * msg3;168;24
         *
         * 分兩個分區後 分別return tuple1 和 tuple2 的結果
         * msg1;9;3
         * msg2;3;1
         * msg3;98;14
         *
         * msg1;9;3
         * msg2;3;1
         * msg3;70;10
         *
         * 由結果可知,第一個Function2爲分區內計算  第二個爲分區間的計算(第一個Function2的結果作爲第二個Function2的參數宏觀調用)
         */
        //sum and count
        JavaPairRDD<String,Tuple2<Integer,Integer>>  msgRes =
        msgPair.aggregateByKey(new Tuple2<Integer, Integer>(0, 0),new Integer(4),
                new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> val1, Integer val2) throws Exception {
                        //分區內 param1 sum(val) param2 設計count遞增
                        return new Tuple2<>(val1._1 + val2, val1._2+1);//val1._1 + val2爲sum, val2, val1._2+1 每條記錄出現+1
                    }
                },
                new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> tuple1, Tuple2<Integer, Integer> tuple2) throws Exception {
                        //return new Tuple2<>(tuple1._1+tuple2._1,tuple1._2+tuple2._2);
                        return tuple2;//
                    }
                }
        );
        msgRes.foreach(new VoidFunction<Tuple2<String, Tuple2<Integer, Integer>>>() {
            @Override
            public void call(Tuple2<String, Tuple2<Integer, Integer>> msgTuple2) throws Exception {
                System.out.println(msgTuple2._1+";"+msgTuple2._2._1+";"+msgTuple2._2._2);
            }
        });

        //sum and max
        JavaPairRDD<String,Tuple2<Integer,Integer>>  msgMaxRes =
        msgPair.aggregateByKey(new Tuple2<Integer, Integer>(0, 0),
                new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> val1, Integer val2) throws Exception {
                        return new Tuple2<Integer, Integer>(val1._1>val2?val1._1:val2,val1._2+1);
                    }
                },
                new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> tuple1, Tuple2<Integer, Integer> tuple2) throws Exception {
                        return new Tuple2<Integer, Integer>(tuple1._1>tuple2._1?tuple1._1:tuple2._1,tuple1._2+tuple2._2);
                    }
                }
        );

        msgMaxRes.foreach(new VoidFunction<Tuple2<String, Tuple2<Integer, Integer>>>() {
            @Override
            public void call(Tuple2<String, Tuple2<Integer, Integer>> msgTuple2) throws Exception {
                System.out.println(msgTuple2._1+";"+msgTuple2._2._1+";"+msgTuple2._2._2);
            }
        });
    }
}

// Lambda version of the per-key sum-and-count aggregation.
// It has to be aggregateByKey: the first lambda adds an Integer value (y) to the
// (sum, count) accumulator (x), and the zero value must hold two ints.
msgPair.aggregateByKey(new Tuple2<Integer, Integer>(0, 0),
        (x, y) -> new Tuple2<Integer, Integer>(x._1 + y, x._2 + 1),
        (x, y) -> new Tuple2<Integer, Integer>(x._1 + y._1, x._2 + y._2));

combineByKey(aggregateByKey是combineByKey的簡化版:combineByKey用createCombiner函數代替固定的初始值)

/*********      combineByKey  start     **************/
        // Per-key sum and count via combineByKey; the combined value is (sum, count).
        // createCombiner: turns a single value into the pair (value, 1).
        Function<Integer, Tuple2<Integer, Integer>> createCombiner =
                first -> new Tuple2<>(first, 1);
        // mergeValue: adds one more value to an existing (sum, count) pair.
        Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>> mergeValue =
                (acc, next) -> new Tuple2<>(acc._1 + next, acc._2 + 1);
        // mergeCombiners: adds two (sum, count) pairs element-wise.
        Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> mergeCombiners =
                (left, right) -> new Tuple2<>(left._1 + right._1, left._2 + right._2);

        JavaPairRDD<String, Tuple2<Integer, Integer>> pairRDD =
                msgPair.combineByKey(createCombiner, mergeValue, mergeCombiners);

        // Print one "combineByKey===key;sum;count" line per key.
        pairRDD.foreach(entry -> System.out.println(
                "combineByKey===" + entry._1 + ";" + entry._2._1 + ";" + entry._2._2));

foldByKey (上面兩種的精簡版)

// Per-key sum: folds the values of each key with zero value 0 and integer addition.
// The literal 0 is autoboxed, avoiding the deprecated new Integer(int) constructor.
JavaPairRDD<String, Integer> foldRDD = msgPair.foldByKey(0, Integer::sum);
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章