Spark:將DataFrame 寫入mysql

DataFrame 寫入mysql

import java.io.FileInputStream
import java.sql.{Connection, DriverManager}
import java.util.Properties

import org.apache.spark.sql.{DataFrame, SaveMode}

/**
  * @author 利伊奧克兒-lillcol
  *         2018/10/12-14:44
  *
  */
object MyTestDemo {
  /**
    * Saves a DataFrame into a MySQL table.
    *
    * For `SaveMode.Overwrite` the existing table is truncated and the rows are then
    * appended, instead of letting Spark drop and re-create the table. This keeps the
    * original column definitions (Spark's re-created table would map every string
    * column to TEXT). Every other mode is passed through to Spark unchanged.
    *
    * The table name is upper-cased before use.
    *
    * @param dataFrame the DataFrame to save
    * @param tableName name of the target MySQL table
    * @param saveMode  save mode: Append, Overwrite, ErrorIfExists or Ignore
    * @param proPath   path of the properties file holding the `mysql.*` settings
    * @throws IllegalArgumentException if one of the required `mysql.*` keys is missing
    */
  def saveASMysqlTable(dataFrame: DataFrame, tableName: String, saveMode: SaveMode, proPath: String): Unit = {
    // Locale.ROOT keeps the result stable regardless of the JVM default locale
    // (e.g. a Turkish locale would turn "i" into a dotted capital I).
    val table = tableName.toUpperCase(java.util.Locale.ROOT)
    val properties: Properties = getProPerties(proPath)
    // The keys in the config file differ from the keys Spark's JDBC writer expects,
    // so build a second Properties object that uses Spark's key names.
    val prop = new Properties
    prop.setProperty("user", requiredProperty(properties, "mysql.username", proPath))
    prop.setProperty("password", requiredProperty(properties, "mysql.password", proPath))
    prop.setProperty("driver", requiredProperty(properties, "mysql.driver", proPath))
    prop.setProperty("url", requiredProperty(properties, "mysql.url", proPath))

    // Overwrite is emulated as "truncate, then append"; all other modes keep their
    // own semantics instead of being silently turned into Append.
    val writeMode =
      if (saveMode == SaveMode.Overwrite) {
        truncateTable(prop, table)
        SaveMode.Append
      } else {
        saveMode
      }
    dataFrame.write.mode(writeMode).jdbc(prop.getProperty("url"), table, prop)
  }

  /**
    * Loads a properties file.
    *
    * @param proPath path of the properties file
    * @return the loaded properties
    */
  def getProPerties(proPath: String): Properties = {
    val properties: Properties = new Properties()
    val in = new FileInputStream(proPath)
    // Always release the file handle, even when loading fails.
    try properties.load(in)
    finally in.close()
    properties
  }

  /** Returns the value of `key`, failing with a message that names the missing key. */
  private def requiredProperty(source: Properties, key: String, proPath: String): String =
    Option(source.getProperty(key)).getOrElse(
      throw new IllegalArgumentException(s"Missing required property '$key' in $proPath"))

  /**
    * Best-effort `truncate table`: a failure is printed and swallowed, so the
    * caller's subsequent write still runs.
    * NOTE(review): if the truncate fails on an existing table, the following append
    * adds to the old rows — confirm this best-effort behaviour is intended.
    */
  private def truncateTable(prop: Properties, table: String): Unit = {
    try {
      val conn: Connection = DriverManager.getConnection(
        prop.getProperty("url"),
        prop.getProperty("user"),
        prop.getProperty("password")
      )
      try {
        val stmt = conn.createStatement
        // Truncate instead of dropping so the existing column types are preserved.
        try stmt.execute(s"truncate table $table")
        finally stmt.close()
      } finally {
        conn.close()
      }
    } catch {
      case e: Exception =>
        println("MySQL Error:")
        e.printStackTrace()
    }
  }
}

效率問題

  /**
    * Prepares the target JDBC table according to `mode` and then hands the rows to
    * `JdbcUtils.saveTable`.
    *
    * Behaviour per mode, as implemented below:
    *  - Ignore and the table exists: returns without writing anything.
    *  - ErrorIfExists and the table exists: fails via `sys.error`.
    *  - Overwrite and the table exists: the table is dropped and re-created from the
    *    schema string produced by `JdbcUtils.schemaString`.
    *  - Otherwise the table is created only when it does not exist yet.
    *
    * `extraOptions`, `mode` and `df` are members defined outside this excerpt.
    *
    * @param url                  JDBC URL of the database
    * @param table                name of the target table
    * @param connectionProperties JDBC connection properties; they take precedence over
    *                             entries with the same key in `extraOptions`
    */
  def jdbc(url: String, table: String, connectionProperties: Properties): Unit = {
    val props = new Properties()
    // Start from the writer's extra options ...
    extraOptions.foreach { case (key, value) =>
      props.put(key, value)
    }
    // connectionProperties should override settings in extraOptions
    props.putAll(connectionProperties)
    // This connection is used only for the table checks / DDL below.
    val conn = JdbcUtils.createConnectionFactory(url, props)()

    try {
      var tableExists = JdbcUtils.tableExists(conn, url, table)

      // Early exit: the finally block below still closes the connection.
      if (mode == SaveMode.Ignore && tableExists) {
        return
      }

      if (mode == SaveMode.ErrorIfExists && tableExists) {
        sys.error(s"Table $table already exists.")
      }

      // Overwrite drops the table, so the next step re-creates it from the DataFrame schema.
      if (mode == SaveMode.Overwrite && tableExists) {
        JdbcUtils.dropTable(conn, table)
        tableExists = false
      }

      // Create the table if the table didn't exist.
      if (!tableExists) {
        val schema = JdbcUtils.schemaString(df, url)
        val sql = s"CREATE TABLE $table ($schema)"
        val statement = conn.createStatement
        try {
          statement.executeUpdate(sql)
        } finally {
          statement.close()
        }
      }
    } finally {
      conn.close()
    }

    // The setup connection is already closed here; saveTable receives url and props, not conn.
    JdbcUtils.saveTable(df, url, table, props)
  }
--------------------------------------------------------------
/**
   * Writes the rows of `df` into the JDBC table `table`, issuing one
   * `savePartition` call per partition of the DataFrame.
   *
   * The batch size is taken from the `batchsize` entry of `properties`
   * and defaults to 1000 when that entry is absent.
   *
   * NOTE(review): the previous comment described this as saving "in a single
   * transaction"; the transactional scope is decided inside `savePartition`,
   * which is not visible here -- confirm.
   */
  def saveTable(
      df: DataFrame,
      url: String,
      table: String,
      properties: Properties) {
    val jdbcDialect = JdbcDialects.get(url)
    val schema = df.schema
    // One JDBC null-type code per column, in schema order.
    val nullTypeCodes: Array[Int] =
      schema.fields.map(column => getJdbcType(column.dataType, jdbcDialect).jdbcNullType)

    val connectionFactory: () => Connection = createConnectionFactory(url, properties)
    val rowsPerBatch = properties.getProperty("batchsize", "1000").toInt
    df.foreachPartition { rows =>
      savePartition(
        connectionFactory, table, rows, schema, nullTypeCodes, rowsPerBatch, jdbcDialect)
    }
  }
配置文件部分內容
#mysql數據庫配置
mysql.driver=com.mysql.jdbc.Driver
mysql.url=jdbc:mysql://0.0.0.0:3306/iptv?useSSL=false&autoReconnect=true&failOverReadOnly=false&rewriteBatchedStatements=true
mysql.username=lillclol
mysql.password=123456

#hive
hive.root_path=hdfs://ns1/user/hive/warehouse/

上面的 jdbc 與 saveTable 兩段代碼,是 Spark 中 DataFrame 寫入 mysql 的關鍵源代碼。

一開始我覺得DataFrame寫入mysql的效率太低,想了各種手段去優化。最快的做法是把文件拿下來再load進mysql,但是這個步驟太繁瑣了。後來看了一下源代碼,發現數據寫入mysql時是按照分區來寫的,也就是說每個分區都會創建一個mysql連接。於是我在寫入mysql之前先對DataFrame重新分區(例如 dataFrame.repartition(n)),並根據mysql可用的連接數設定合理的分區數。這樣每分鐘可以寫入約100W條記錄,基本達到了較高的效率。

此爲本人日常工作中的總結

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章