Spark | Reading Hive Table Data and Writing It to MySQL

 

 

import java.sql.Connection

import scala.collection.mutable.ArrayBuffer

object JdbcTemplateUtil extends Serializable {
    /**
      * Execute a single SQL statement with optional positional parameters.
      * @param conn   JDBC connection
      * @param sql    parameterized SQL to execute
      * @param params positional parameters (may be null)
      */
    def executeSql(conn: Connection, sql: String, params: Array[String]): Unit = {
        try {
            val ps = conn.prepareStatement(sql)
            if (params != null) {
                for (i <- params.indices)
                    ps.setString(i + 1, params(i))
            }
            ps.executeUpdate()
            ps.close()
        } catch {
            case e: Exception => println(">>>Execute Sql Exception..." + e)
        }
    }
    /**
      * Execute a batch of statements inside a single transaction.
      * @param conn      JDBC connection (closed by this method after commit)
      * @param sql       parameterized SQL to execute
      * @param paramList one Array of positional parameters per row
      */
    def executeBatchSql(conn: Connection, sql: String, paramList: ArrayBuffer[Array[String]]): Unit = {
        try {
            val ps = conn.prepareStatement(sql)
            conn.setAutoCommit(false)
            for (params: Array[String] <- paramList) {
                if (params != null) {
                    for (i <- params.indices) ps.setString(i + 1, params(i))
                    ps.addBatch()
                }
            }
            ps.executeBatch()
            conn.commit()
            ps.close()
            conn.close()
        } catch {
            case e: Exception =>  println(">>>Execute Batch Sql Exception..." + e)
        }
    }
}
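
For reference, a minimal usage sketch of JdbcTemplateUtil; the connection URL, credentials, table and columns are hypothetical placeholders, not values from this post:

import java.sql.DriverManager
import scala.collection.mutable.ArrayBuffer

object JdbcTemplateUtilDemo {
    def main(args: Array[String]): Unit = {
        // Placeholder connection details for illustration only
        val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "user", "password")
        // Single statement with positional parameters
        JdbcTemplateUtil.executeSql(conn, "DELETE FROM demo_table WHERE STAT_DATE = ?", Array("20190101"))
        // Batch insert; note that executeBatchSql commits and closes the connection itself
        val rows = ArrayBuffer(Array("1", "20190101"), Array("2", "20190101"))
        JdbcTemplateUtil.executeBatchSql(conn, "INSERT INTO demo_table (ID, STAT_DATE) VALUES (?, ?)", rows)
    }
}
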
import java.text.SimpleDateFormat
import java.util.Calendar

object DateTimeUtil extends Serializable {
    /**
      * Get the local calendar date offset from today by delta days.
      * @param delta     day offset from today (e.g. -1 for yesterday)
      * @param separator separator placed between year, month and day (may be empty)
      * @return the formatted date string
      */
    def dateDelta(delta: Int, separator: String): String = {
        val sdf = new SimpleDateFormat("yyyy" + separator + "MM" + separator + "dd")
        val cal = Calendar.getInstance()
        cal.add(Calendar.DATE, delta)
        val date = sdf.format(cal.getTime)
        date
    }

}
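
A quick usage sketch of dateDelta (the concrete output depends on the current date; the values below assume today is 2019-01-02):

// Yesterday's date in two formats
val dashed  = DateTimeUtil.dateDelta(-1, "-")  // "2019-01-01"
val compact = DateTimeUtil.dateDelta(-1, "")   // "20190101", the inc_day format used below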

 

When writing data to a database via JDBC, Spark provides the following save modes (a usage sketch follows the list). Note:


  SaveMode.ErrorIfExists (the default): if the table already exists in the database, an exception is thrown and no data is written;
  SaveMode.Append: if the table already exists, the data is appended to it; if it does not exist, the table is created first and the data is then inserted;
  SaveMode.Overwrite: the existing table and all of its data are dropped, the table is recreated, and the new data is inserted;
  SaveMode.Ignore: if the table does not exist, it is created and the data is written; if the table does exist, the write is silently skipped and no error is raised.
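
A minimal sketch of picking a save mode on a JDBC write; the DataFrame, URL, credentials and table name here are placeholders:

import org.apache.spark.sql.{DataFrame, SaveMode}

def writeToMysql(df: DataFrame): Unit = {
    df.write
      .format("jdbc")
      .mode(SaveMode.Append)  // or ErrorIfExists / Overwrite / Ignore, as described above
      .option("url", "jdbc:mysql://localhost:3306/test")
      .option("user", "user")
      .option("password", "password")
      .option("dbtable", "demo_table")
      .save()
}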

When SparkSQL reads Hive table data and then writes it to MySQL, how do we avoid primary key conflicts?

1. Delete the current date's rows from the MySQL table first, then write. Note that the primary keys within the batch being written must themselves be unique.

2. Resolve primary key conflicts with MySQL's ON DUPLICATE KEY UPDATE clause, performing the JDBC write yourself with PreparedStatement (see the sketch after this list).

      This, however, means giving up Spark's built-in "JDBC To Other Databases" data source.

      See the official documentation: http://spark.apache.org/docs/latest/sql-data-sources-jdbc.html
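
A minimal sketch of option 2, assuming a hypothetical table demo_table(ID, STAT_DATE, SN) with ID as the primary key; it reuses JdbcTemplateUtil.executeBatchSql from above:

import java.sql.DriverManager
import scala.collection.mutable.ArrayBuffer

// Upsert rows so that re-running the job does not fail on duplicate keys
val upsertSql =
    """INSERT INTO demo_table (ID, STAT_DATE, SN)
      |VALUES (?, ?, ?)
      |ON DUPLICATE KEY UPDATE STAT_DATE = VALUES(STAT_DATE), SN = VALUES(SN)""".stripMargin

val rows = ArrayBuffer(
    Array("id-1", "20190101", "sn-1"),
    Array("id-2", "20190101", "sn-2")
)

// Placeholder connection details
val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "user", "password")
JdbcTemplateUtil.executeBatchSql(conn, upsertSql, rows)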

 

The code below uses SaveMode.Append, and first deletes the current date's data with a DELETE so that the current batch can then be written to MySQL without duplicate-key conflicts.

 

import java.sql.{Connection, DriverManager}
import com.berg.commons.enums.PropertyEnum
import com.berg.commons.utils._
import org.apache.log4j.Logger
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.commons.codec.digest.DigestUtils

case class GisOmsAddressFilterDO(ID: String, STAT_DATE: String, SN: String, ADDRESS: String , AK: String, RESULT: Int, CITY_CODE: Int, REQUEST_DATE_TIME: String)
object GisOmsAddressFilterStorage extends Serializable{
    val appName: String = this.getClass.getSimpleName.replace("$", "")
    val logger: Logger = Logger.getLogger(appName)
    private[GisOmsAddressFilterStorage] val dbName: String = "xxx"
    private[GisOmsAddressFilterStorage] val dbTableName: String = "xxx"
    private[GisOmsAddressFilterStorage] val sourceTableName = "xxx"
    private[GisOmsAddressFilterStorage] val url =  PropertyUtil.getPropertyValue(PropertyEnum.DATABASE_DEV.getValue(),"berg.mysql.driverURL")
    private[GisOmsAddressFilterStorage] val user =  PropertyUtil.getPropertyValue(PropertyEnum.DATABASE_DEV.getValue(),"berg.mysql.user")
    private[GisOmsAddressFilterStorage] val password =  PropertyUtil.getPropertyValue(PropertyEnum.DATABASE_DEV.getValue(),"berg.mysql.password")

    val dlr = "$"
    /**
      * Read one day's partition from the Hive source table and save it to MySQL.
      * @param spark  active SparkSession with Hive support enabled
      * @param incDay partition date in yyyyMMdd format
      * */
    def saveGisOmsAddressData(spark: SparkSession, incDay: String):Unit={
        val sparkSql = s"""
                            | select
                            | *
                            | from $sourceTableName
                            | where inc_day = '$incDay'
                            """.stripMargin

        logger.error(">>>>>>Execute Hive Sql: " + sparkSql)
        val df = spark.sql(sparkSql).toDF("sn", "address", "ak", "result", "cityCode", "dateTime", "statDate")
        import spark.implicits._
        val jdbcDF = df.rdd.map(row=>{
                val sn = row.getAs[String]("sn")
                val address = row.getAs[String]("address")
                val ak = row.getAs[String]("ak")
                val result = row.getAs[String]("result").toInt
                val cityCode = row.getAs[String]("cityCode").toInt
                val dateTime = row.getAs[String]("dateTime")
                val statDate = row.getAs[String]("statDate")
                val id = DigestUtils.md5Hex(statDate.concat(sn))
                GisOmsAddressFilterDO(id, statDate, sn, address, ak, result, cityCode, dateTime)
            }).toDF ()

        try {
            val delSql = s"DELETE FROM $dbTableName WHERE STAT_DATE = '$incDay'"
            logger.error(">>>>>>Execute Del Sql: " + delSql)
            val conn: Connection = DriverManager.getConnection(url, user, password)
            JdbcTemplateUtil.executeSql(conn, delSql, null)
            // Close the connection used for the DELETE; the Spark JDBC writer manages its own connections
            conn.close()
            jdbcDF.write.format("jdbc").mode(SaveMode.Append)
                    .option("url", url)
                    .option("user", user)
                    .option("password", password)
                    .option("dbtable", dbTableName)
                    .save()
            logger.error(">>>>>>Save OK!!!")
        } catch {
            case e: Exception => logger.error(">>>OP DataBase Exception: "+e)
        }
        spark.stop()
    }

    def start(incDay: String): Unit = {
        val spark = SparkSession.builder().config(ConfigUtil.getSparkConf(appName)).enableHiveSupport().getOrCreate()
        spark.sparkContext.setLogLevel("ERROR")
        saveGisOmsAddressData(spark, incDay)
    }

    def main(args: Array[String]): Unit = {
        val incDay: String = DateTimeUtil.dateDelta(-1, "")
        logger.info(incDay)
        start(incDay)
    }

}
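
One environment assumption worth making explicit: the MySQL connector jar must be available on the Spark classpath, and the JDBC driver class can also be named explicitly through the standard "driver" option, for example:

// Hypothetical variation of the write above; "driver" is a standard Spark JDBC option
jdbcDF.write.format("jdbc").mode(SaveMode.Append)
        .option("driver", "com.mysql.jdbc.Driver")  // com.mysql.cj.jdbc.Driver for Connector/J 8.x
        .option("url", url)
        .option("user", user)
        .option("password", password)
        .option("dbtable", dbTableName)
        .save()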

Further reading: https://blog.csdn.net/u011622631/article/details/84572022
