Spark自定義AccumulatorV2
1.概述###
- AccumulatorV2的簡單使用與注意事項見上一篇博客https://blog.csdn.net/wtzhm/article/details/86481846
- 在實際開發中很多時候需要用到自定義Accumulator,Accumulator太多了,不便於維護,例如項目如果要出現一些邏輯上的變更,比如說,session數量的計算邏輯,要改變,就得更改所有Accumulator對應的代碼;或者說,又要增加幾個範圍,那麼又要增加多個Accumulator,並且修改對應的累加代碼;維護成本,相當之高。
2. 實例
-
定義一個類繼承AccumulatorV2抽象類,實現6個抽象方法
package com.spark.zhmcode.session import com.spark.zhmcode.utils.MyStringUtils import org.apache.spark.util.AccumulatorV2 /** * 第一個爲輸入類型,第二個爲輸出類型 */ class MyCustomerAcculatorv2 extends AccumulatorV2[String, String] { var result = "session1=0|session2=0|session3=0|session4=0" /** * isZero: 當AccumulatorV2中存在類似數據不存在這種問題時,是否結束程序 */ override def isZero: Boolean = { result == "session1=0|session2=0|session3=0|session4=0" } /** * 拷貝一個新的AccumulatorV2 */ override def copy(): AccumulatorV2[String, String] = { val accumulator = new MyCustomerAcculatorv2() accumulator.result = this.result accumulator } /** * 重置AccumulatorV2中的數據 */ override def reset(): Unit = { result = "session1=0|session2=0|session3=0|session4=0" } /** * 操作數據累加方法實現 */ override def add(v: String): Unit = { val v1 = result val v2 = v if (MyStringUtils.isNotEmpty(v1) && MyStringUtils.isNotEmpty(v2)) { var newResult="" val oldValue = MyStringUtils.getFieldFromConcatString(v1,"\\|",v2) if(oldValue!=null){ val newValue = oldValue.toInt + 1 newResult = MyStringUtils.setFieldFromConcatString(v1,"\\|",v2,newValue) } result = newResult } } /** * 合併數據 */ override def merge(other: AccumulatorV2[String, String]): Unit = other match { case map: MyCustomerAcculatorv2 => result = other.value case _ => throw new UnsupportedOperationException( s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") } /** * AccumulatorV2對外訪問的數據結果 */ override def value: String = { result } }
-
MyStringUtils工具類
package com.spark.zhmcode.utils object MyStringUtils { /** * 從拼接的字符串中提取字段的值 * * @param str "session1=0|session2=0|session3=0|session4=0" * @param delimiter 分隔符 * @param field 字段 * @return 字段值 */ def getFieldFromConcatString(str: String, delimiter: String, field: String): String = { val fileds = str.split(delimiter) var result = "0" for (concatField <- fileds) { var fieldAndValue = concatField.split("=") if (fieldAndValue.length == 2) { val key = fieldAndValue(0) val value = fieldAndValue(1) if (field.equals(key)) { result = value } } } result } /** * 從拼接的字符串中設置字段的值 * * @param str "session1=0|session2=0|session3=0|session4=0" * @param delimiter 分隔符 * @param field 字段 * @param newValue 字段對應新的value * @return 字段值 */ def setFieldFromConcatString(str: String, delimiter: String, field: String,newValue:Integer): String = { var result = str val buffer = new StringBuffer("") val fileds = str.split(delimiter) var isExist = false for (concatField <- fileds) { var fieldAndValue = concatField.split("=") if (fieldAndValue.length == 2) { val key = fieldAndValue(0) val value = fieldAndValue(1) if (field.equals(key)) { buffer.append(key).append("=").append(newValue).append("|") isExist = true }else{ buffer.append(key).append("=").append(value).append("|") } } } if(isExist){ var newResult = buffer.toString newResult.substring(0,newResult.length-1) }else{ result } } /** * 判斷字符串是否不爲空 * * @param str 字符串 * @return 是否不爲空 */ def isNotEmpty(str: String): Boolean = str != null && !("" == str) }
-
創建自定義Accumulator,記住一定要註冊,不然會拋出異常
package com.spark.zhmcode.session import org.apache.spark.{SparkConf, SparkContext} object MyAccumulator { def main(args: Array[String]): Unit = { val conf = new SparkConf().setMaster("local[2]").setAppName("MyAccumulator") val sc = new SparkContext(conf) val data = List("session1","session2","session3","session4","session1","session3","session3","session3","a","b","c","d") val rdd1 = sc.parallelize(data,1) val accumulator = new MyCustomerAcculatorv2() sc.register(accumulator,"countSession") val resultRdd = rdd1.foreach(x=>{ accumulator.add(x) }) println("自定義accumulator統計結果:" + accumulator.value) sc.stop() } }