Spark SQL DataFrame: adding, removing, and modifying columns

A recent project involved integrating Spark with Elasticsearch (ES), which required a series of operations on DataFrame columns; most of the time went into adding, removing, and modifying columns.

The job uses a Spark + ES + Redis architecture: ES stores the base data, Redis stores the filter conditions, and records matching the Redis conditions are selected from ES.
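The conditions themselves come from Redis. A minimal sketch of loading the topic-to-condition map with Jedis (the hash key "wj:topic_conditions" and the Redis host are assumptions; the complete example at the end simply hard-codes the map instead):

import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

val jedis = new Jedis("172.10.4.7", 6379)                // hypothetical Redis node
val topicMap: Map[String, String] =
  jedis.hgetAll("wj:topic_conditions").asScala.toMap     // e.g. "topic123456789" -> "1==1&&2==2"
jedis.close()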

val sqlContex = SparkSession.builder().master("local[2]")
  .appName("spark2ES")
  // .config("hive.metastore.uris", "thrift://172.1.1.199:9083") //集成hive
  .config("spark.testing.memory", "2147480000")
  .config("es.index.auto.create", "true")
  .config("es.nodes", "xxxx")
  // .enableHiveSupport() // enable Hive support
  .getOrCreate()

// Read data from ES

val optionMap = Map("path" -> "ods_wj_apk_index/docs", "pushdown" -> "true")
val esDF = sqlContex.read.format("org.elasticsearch.spark.sql").options(optionMap).load()

// Adding a column

// Method 1: register a SQL UDF and add the column in a SQL query
val topic = "topic123"
tempDataFrame.createOrReplaceTempView("temp")
sqlContex.sqlContext.udf.register("replaceCol", (str: String) => topic)
val addDf = sqlContex.sqlContext.sql(s"select *, replaceCol(content) as topicName from temp")
addDf.show()

// Method 2: DataFrame withColumn with a UDF
// (requires: import org.apache.spark.sql.functions.udf)
val topic = "topic123"
val replace = (x: String) => topic
val replaceCol = udf(replace)
val data = tempDataFrame.withColumn("topicName", replaceCol(tempDataFrame("content")))
data.show()
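When the new column is just a constant like topic, Spark's built-in lit function avoids the UDF altogether; a minimal equivalent sketch (dataWithLit is a hypothetical name):

import org.apache.spark.sql.functions.lit

val dataWithLit = tempDataFrame.withColumn("topicName", lit(topic))
dataWithLit.show()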
// Modifying column values: rebuild the DataFrame by first selecting the columns that stay
// unchanged, then constructing the new columns with a UDF and adding them to the new DataFrame

esDF.createOrReplaceTempView("temp")

// register the UDF
sqlContex.udf.register("Time2Time", Time2Time _)

// build the list of column names that stay unchanged
val linkedColNames = getLinkedColNames(esDF.schema.fieldNames)

val addDf = sqlContex.sqlContext.sql(s"select $linkedColNames, Time2Time(update_time) AS update_time, Time2Time(create_time) AS create_time from temp where $conditon")
// addDf.saveToEs("ods_wj_scenes_detail/docs")
addDf.select("update_time").show(50)
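An alternative that avoids rebuilding the select list: withColumn with an existing column name replaces that column in place. A sketch under the assumption that time2TimeUdf (a name introduced here) wraps the same Time2Time helper shown in the full code below:

import org.apache.spark.sql.functions.{col, udf}

val time2TimeUdf = udf(Time2Time _)   // wraps the Time2Time helper defined in the full code
val updatedDf = esDF
  .withColumn("update_time", time2TimeUdf(col("update_time")))
  .withColumn("create_time", time2TimeUdf(col("create_time")))
updatedDf.select("update_time").show(50)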
// Dropping a column (drop returns a new DataFrame; the original DataFrame is unchanged)
val droppedDf = tempDataFrame.drop("topicName")

The complete code is below:

import java.text.SimpleDateFormat

import org.apache.spark.sql.SparkSession
// import org.elasticsearch.spark.sql._  // needed if addDf.saveToEs(...) is enabled

object EsFilterforNetworkMonitoring {
  def main(args: Array[String]): Unit = {
    val sqlContex = SparkSession.builder().master("local[2]")
      .appName("spark2ES")
      // .config("hive.metastore.uris", "thrift://172.1.1.199:9083")
      .config("spark.testing.memory", "2147480000")
      .config("es.index.auto.create", "true")
      .config("es.nodes", "172.10.4.4:9200,172.10.4.5:9200,172.10.4.6:9200")
      // .enableHiveSupport()
      .getOrCreate()

      
    // section 1: read data from ES
    val optionMap = Map("path" -> "ods_wj_apk_index/docs", "pushdown" -> "true")
    val esDF = sqlContex.read.format("org.elasticsearch.spark.sql").options(optionMap).load()
    val topicMap = Map("topic123456789" -> "1==1&&2==2")

    // filter out the records that match each condition
    for ((k, v) <- topicMap) {
      val topic = k
      val conditon = v.replaceAll("==", "=").replaceAll("&&", " and ").replace("||", " or ")
      println("============================= " + conditon)
      esDF.createOrReplaceTempView("temp")

      // register the UDFs
      sqlContex.udf.register("Time2Time", Time2Time _)
      sqlContex.sqlContext.udf.register("replaceCol", (str: String) => topic)

      // build the list of column names that stay unchanged
      val linkedColNames = getLinkedColNames(esDF.schema.fieldNames)
      println("linkedNames : " + linkedColNames)

      val addDf = sqlContex.sqlContext.sql(s"select $linkedColNames,Time2Time(update_time) AS update_time,Time2Time(create_time) AS create_time,replaceCol(apk_name) as topicName from temp where $conditon")
//     addDf.saveToEs("ods_wj_scenes_detail/docs")
      addDf.select("update_time").show(50)

    }
  }

  def Time2Time(tp: java.sql.Timestamp): String = {
    if (tp == null ) {
      null
    } else {
      try {
        val sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
        val datestr: String = sdf.format(tp.getTime)
        datestr
      } catch {
        case e: Exception =>
          e.printStackTrace()
          null
      }
    }
  }

  // join the column names into a comma-separated string, skipping the two time columns
  def getLinkedColNames(arr: Array[String]): String = {
    var linkedColName = ""
    for (ele <- arr if !("update_time".equals(ele) || "create_time".equals(ele))) {
      if (linkedColName.length == 0) {
        linkedColName = ele
      } else {
        linkedColName = linkedColName + "," + ele
      }
    }
    linkedColName
  }
  }
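For reference, the string concatenation in getLinkedColNames is just a filter-and-join; a behaviour-equivalent sketch using the standard collection API:

def getLinkedColNames(arr: Array[String]): String =
  arr.filterNot(c => c == "update_time" || c == "create_time").mkString(",")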





