spark記錄(20)自定義累加器Accumulator

自定義累加器

/**
 * 自定義累加器需要繼承AccumulatorV2<IN,OUT>類
 * 並且要指定要累加的類型
 */
/**
 * Custom Spark accumulator that totals a person count and an age sum.
 *
 * <p>A custom accumulator extends {@code AccumulatorV2<IN, OUT>}; here both the
 * input and output types are {@link MyKey}. The state is initialized on the
 * driver, updated on the executors via per-partition copies, and the merged
 * final value is read back on the driver.
 */
public class MyAccumulator extends AccumulatorV2<MyKey,MyKey> implements Serializable {

    /**
     * Running totals (personNum, personAgeSum).
     * Initialized on the driver; the final merged value lives on the driver.
     */
    private MyKey info = new MyKey(0, 0);

    public MyKey getInfo() {
        return info;
    }

    public void setInfo(MyKey info) {
        this.info = info;
    }

    /**
     * Reports whether this accumulator is still in its zero (initial) state,
     * by comparing both counters against the initial values.
     *
     * @return true when no records have been accumulated yet
     */
    @Override
    public boolean isZero() {
        return info.getPersonAgeSum()==0 && info.getPersonNum()==0;
    }

    /**
     * Creates an independent copy of this accumulator for a partition.
     *
     * <p>Bug fix: the original assigned {@code this.info} directly, so the copy
     * and the source shared one mutable {@code MyKey}. Any {@code add()} or
     * {@code reset()} on one would then silently corrupt the other (e.g. via
     * the default {@code copyAndReset()}). A deep copy of the state keeps the
     * two accumulators independent, as the AccumulatorV2 contract requires.
     *
     * @return a new accumulator holding a snapshot of the current totals
     */
    @Override
    public AccumulatorV2<MyKey, MyKey> copy() {
        MyAccumulator myAccumulator = new MyAccumulator();
        myAccumulator.info = new MyKey(this.info.getPersonNum(), this.info.getPersonAgeSum());
        return myAccumulator;
    }

    /**
     * Resets the per-partition accumulator state back to zero.
     */
    @Override
    public void reset() {
        info = new MyKey(0, 0);
    }

    /**
     * Accumulation rule: add the incoming record's counters to the totals.
     *
     * @param v one incoming record (count and age contribution)
     */
    @Override
    public void add(MyKey v) {
        info.setPersonNum(info.getPersonNum() + v.getPersonNum());
        info.setPersonAgeSum(info.getPersonAgeSum() + v.getPersonAgeSum());
    }

    /**
     * Merges the state accumulated by another partition's accumulator
     * into this one.
     *
     * @param other a per-partition accumulator to fold in
     */
    @Override
    public void merge(AccumulatorV2<MyKey, MyKey> other) {
        MyKey value = other.value();
        info.setPersonNum(info.getPersonNum()+value.getPersonNum());
        info.setPersonAgeSum(info.getPersonAgeSum()+value.getPersonAgeSum());
    }

    /**
     * Returns the accumulated totals.
     *
     * @return the current (or final, on the driver) accumulated state
     */
    @Override
    public MyKey value() {
        return info;
    }
}

自定義key

/**
 * Value object carried through the custom accumulator: a person count and
 * the corresponding sum of ages. Serializable so Spark can ship it between
 * the driver and the executors.
 */
public class MyKey implements Serializable {
    private Integer personNum;
    private Integer personAgeSum;

    /** No-arg constructor (required for serialization frameworks). */
    public MyKey() {
    }

    /**
     * @param personNum    number of persons represented by this record
     * @param personAgeSum total age of those persons
     */
    public MyKey(Integer personNum, Integer personAgeSum) {
        this.personNum = personNum;
        this.personAgeSum = personAgeSum;
    }

    public Integer getPersonNum() {
        return personNum;
    }

    public Integer getPersonAgeSum() {
        return personAgeSum;
    }

    public void setPersonNum(Integer personNum) {
        this.personNum = personNum;
    }

    public void setPersonAgeSum(Integer personAgeSum) {
        this.personAgeSum = personAgeSum;
    }

    /**
     * Renders the state as {@code MyKey{personNum=..., personAgeSum=...}}.
     * String concatenation semantics are preserved: a null field prints
     * as the literal text "null".
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("MyKey{");
        sb.append("personNum=").append(personNum);
        sb.append(", personAgeSum=").append(personAgeSum);
        sb.append('}');
        return sb.toString();
    }
}

運行:

/**
 * Demo driver for the custom accumulator: registers it with the
 * SparkContext, feeds every record of a small local RDD into it from
 * inside a map transformation, and prints the merged result.
 */
public class MyRun {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("testAccumulator");
        conf.setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);
        MyAccumulator acc = new MyAccumulator();

        // An accumulator must be registered before it is used in a job.
        sc.sc().register(acc,"PersonInfoAccumulator");

        JavaRDD<String> lines = sc.parallelize(Arrays.asList(
                "zhangsan 1", "lisi 2", "wangwu 3", "zhaoliu 4", "tianqi 5", "zhengba 6"
        ));

        // Each record "name age" contributes (1, age) to the accumulator as a
        // side effect of the map; collect() triggers the job so the updates run.
        lines.map((Function<String, String>) record -> {
            String[] fields = record.split(" ");
            acc.add(new MyKey(1, Integer.parseInt(fields[1])));
            return record;
        }).collect();

        // Read the merged value back on the driver.
        System.out.println("value = "+acc.value());

    }
}

結果:

value = MyKey{personNum=6, personAgeSum=21}

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章