import java.sql.Connection

import scala.collection.mutable.ArrayBuffer

object JdbcTemplateUtil extends Serializable {

  /**
   * Execute a single statement.
   * @param sql    SQL with `?` placeholders
   * @param params positional parameters; may be null when the SQL has none
   */
  def executeSql(conn: Connection, sql: String, params: Array[String]): Unit = {
    try {
      val ps = conn.prepareStatement(sql)
      if (params != null) {
        for (i <- params.indices)
          ps.setString(i + 1, params(i))
      }
      ps.executeUpdate()
      ps.close()
    } catch {
      case e: Exception => println(">>>Execute Sql Exception..." + e)
    }
  }
  /**
   * Execute a batch of statements in a single transaction.
   * @param sql       SQL with `?` placeholders
   * @param paramList one parameter array per row
   */
  def executeBatchSql(conn: Connection, sql: String, paramList: ArrayBuffer[Array[String]]): Unit = {
    try {
      val ps = conn.prepareStatement(sql)
      // Disable auto-commit so the whole batch commits atomically.
      conn.setAutoCommit(false)
      for (params: Array[String] <- paramList) {
        if (params != null) {
          for (i <- params.indices) ps.setString(i + 1, params(i))
          ps.addBatch()
        }
      }
      ps.executeBatch()
      conn.commit()
      ps.close()
      conn.close()
    } catch {
      case e: Exception => println(">>>Execute Batch Sql Exception..." + e)
    }
  }
}
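For reference, a minimal usage sketch of the two helpers above; the connection URL, credentials, and the t_user table are hypothetical:

import java.sql.DriverManager

import scala.collection.mutable.ArrayBuffer

object JdbcTemplateUtilExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical connection settings -- replace with your own.
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "secret")
    // Single parameterised statement.
    JdbcTemplateUtil.executeSql(conn, "INSERT INTO t_user (id, name) VALUES (?, ?)", Array("1", "alice"))
    // Batched inserts committed in one transaction; note that
    // executeBatchSql also closes the connection when it finishes.
    val rows = ArrayBuffer(Array("2", "bob"), Array("3", "carol"))
    JdbcTemplateUtil.executeBatchSql(conn, "INSERT INTO t_user (id, name) VALUES (?, ?)", rows)
  }
}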
import java.text.SimpleDateFormat
import java.util.Calendar

object DateTimeUtil extends Serializable {

  /**
   * Get a date relative to today on the local calendar.
   * @param delta     day offset, e.g. -1 for yesterday
   * @param separator string placed between year, month and day
   * @return the formatted date, e.g. "2022-01-01" for separator "-"
   */
  def dateDelta(delta: Int, separator: String): String = {
    val sdf = new SimpleDateFormat("yyyy" + separator + "MM" + separator + "dd")
    val cal = Calendar.getInstance()
    cal.add(Calendar.DATE, delta)
    sdf.format(cal.getTime)
  }
}
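For example, assuming the job runs on 2022-01-02, the helper behaves like this (values in the comments are illustrative):

DateTimeUtil.dateDelta(-1, "-") // "2022-01-01" -- yesterday, dash-separated
DateTimeUtil.dateDelta(0, "")   // "20220102"   -- today, compact yyyyMMdd form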
When writing data out via JDBC, Spark provides the following save modes. Note:
SaveMode.ErrorIfExists (the default): if the table already exists in the database, an exception is thrown and nothing is written;
SaveMode.Append: if the table already exists, rows are appended to it; if it does not, the table is created first and the data is then inserted;
SaveMode.Overwrite: the existing table and all of its data are dropped, the table is recreated, and the new data is inserted;
SaveMode.Ignore: if the table does not exist, it is created and the data is written; if it does exist, the write is silently skipped and no error is raised.
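For illustration, a minimal sketch of how the mode is selected on the DataFrameWriter; the URL, credentials, and t_target table are placeholders:

import org.apache.spark.sql.{DataFrame, SaveMode}

object SaveModeExample {
  // Write a DataFrame to MySQL; the chosen mode decides what happens
  // when the target table already exists.
  def writeToMysql(df: DataFrame, mode: SaveMode): Unit = {
    df.write.format("jdbc")
      .mode(mode) // ErrorIfExists / Append / Overwrite / Ignore
      .option("url", "jdbc:mysql://localhost:3306/test")
      .option("dbtable", "t_target")
      .option("user", "root")
      .option("password", "secret")
      .save()
  }
}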
How do we avoid primary key conflicts when SparkSQL reads Hive table data and writes it into MySQL?
1. Delete the current date's rows from the MySQL table first, then write. Note that the primary keys within the batch being written must themselves be unique.
2. Resolve the conflict with MySQL's ON DUPLICATE KEY UPDATE clause, performing the JDBC write through a PreparedStatement (see the sketch after this list).
This, however, means giving up Spark's built-in "JDBC To Other Databases" data source.
For details see the official docs: http://spark.apache.org/docs/latest/sql-data-sources-jdbc.html
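A minimal sketch of option 2, reusing the JdbcTemplateUtil helper defined above; the connection settings, the t_target table, and its columns are all hypothetical:

import java.sql.DriverManager

import scala.collection.mutable.ArrayBuffer

object UpsertExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical connection settings.
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "secret")
    // On a duplicate primary key, update the existing row instead of failing.
    val upsertSql =
      """INSERT INTO t_target (ID, STAT_DATE, ADDRESS) VALUES (?, ?, ?)
        |ON DUPLICATE KEY UPDATE STAT_DATE = VALUES(STAT_DATE), ADDRESS = VALUES(ADDRESS)
        |""".stripMargin
    val rows = ArrayBuffer(
      Array("id-1", "20220101", "address one"),
      Array("id-2", "20220101", "address two"))
    // executeBatchSql commits the batch and closes the connection.
    JdbcTemplateUtil.executeBatchSql(conn, upsertSql, rows)
  }
}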
The code below uses SaveMode.Append, making sure to DELETE the current date's rows first and only then writing the current batch to MySQL.
import java.sql.{Connection, DriverManager}

import com.berg.commons.enums.PropertyEnum
import com.berg.commons.utils._
import org.apache.commons.codec.digest.DigestUtils
import org.apache.log4j.Logger
import org.apache.spark.sql.{SaveMode, SparkSession}

case class GisOmsAddressFilterDO(ID: String, STAT_DATE: String, SN: String, ADDRESS: String, AK: String, RESULT: Int, CITY_CODE: Int, REQUEST_DATE_TIME: String)

object GisOmsAddressFilterStorage extends Serializable {
  val appName: String = this.getClass.getSimpleName.replace("$", "")
  val logger: Logger = Logger.getLogger(appName)

  private[GisOmsAddressFilterStorage] val dbName: String = "xxx"
  private[GisOmsAddressFilterStorage] val dbTableName: String = "xxx"
  private[GisOmsAddressFilterStorage] val sourceTableName = "xxx"
  private[GisOmsAddressFilterStorage] val url = PropertyUtil.getPropertyValue(PropertyEnum.DATABASE_DEV.getValue(), "berg.mysql.driverURL")
  private[GisOmsAddressFilterStorage] val user = PropertyUtil.getPropertyValue(PropertyEnum.DATABASE_DEV.getValue(), "berg.mysql.user")
  private[GisOmsAddressFilterStorage] val password = PropertyUtil.getPropertyValue(PropertyEnum.DATABASE_DEV.getValue(), "berg.mysql.password")
  val dlr = "$"
  /**
   * Read one day of Hive data and write it to MySQL.
   * @param spark  active SparkSession
   * @param incDay partition date to load, formatted yyyyMMdd
   */
  def saveGisOmsAddressData(spark: SparkSession, incDay: String): Unit = {
    val sparkSql =
      s"""
         | select
         |   *
         | from $sourceTableName
         | where inc_day = '$incDay'
      """.stripMargin
    // Logged at ERROR so the message survives the ERROR log level set in start().
    logger.error(">>>>>>Execute Hive Sql: " + sparkSql)
    val df = spark.sql(sparkSql).toDF("sn", "address", "ak", "result", "cityCode", "dateTime", "statDate")
    import spark.implicits._
    val jdbcDF = df.rdd.map(row => {
      val sn = row.getAs[String]("sn")
      val address = row.getAs[String]("address")
      val ak = row.getAs[String]("ak")
      val result = row.getAs[String]("result").toInt
      val cityCode = row.getAs[String]("cityCode").toInt
      val dateTime = row.getAs[String]("dateTime")
      val statDate = row.getAs[String]("statDate")
      // Primary key: MD5 of the date plus the serial number, unique within a day.
      val id = DigestUtils.md5Hex(statDate.concat(sn))
      GisOmsAddressFilterDO(id, statDate, sn, address, ak, result, cityCode, dateTime)
    }).toDF()
    try {
      // Delete today's rows first so re-runs do not hit duplicate primary keys.
      val delSql = s"DELETE FROM $dbTableName WHERE STAT_DATE = '$incDay'"
      logger.error(">>>>>>Execute Del Sql: " + delSql)
      val conn: Connection = DriverManager.getConnection(url, user, password)
      JdbcTemplateUtil.executeSql(conn, delSql, null)
      conn.close()
      jdbcDF.write.format("jdbc").mode(SaveMode.Append)
        .option("url", url)
        .option("user", user)
        .option("password", password)
        .option("dbtable", dbTableName)
        .save()
      logger.error(">>>>>>Save OK!!!")
    } catch {
      case e: Exception => logger.error(">>>OP DataBase Exception: " + e)
    }
    spark.stop()
  }
  def start(incDay: String): Unit = {
    val spark = SparkSession.builder().config(ConfigUtil.getSparkConf(appName)).enableHiveSupport().getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    saveGisOmsAddressData(spark, incDay)
  }

  def main(args: Array[String]): Unit = {
    // Default to yesterday's partition, formatted yyyyMMdd (empty separator).
    val incDay: String = DateTimeUtil.dateDelta(-1, "")
    logger.info(incDay)
    start(incDay)
  }
}
Further reading: https://blog.csdn.net/u011622631/article/details/84572022