Because we were on Spark 1.5.1, we ran into quite a few unexpected bugs. I'm writing them down here for reference.
First, the requirement: write Hive tables back into Oracle, and it has to be done through Spark SQL, so Sqoop is not an option (the cluster's big-data platform has no Sqoop component). The output must follow the original format exactly: whatever type a column had when the data was pulled from Oracle, it must go back to Oracle as that same type, with the same precision.
The difficulty is that on the big-data platform Hive stores DATE columns as string, and Hive strings carry no declared length.
1. First approach:
Since we are not allowed to access the Hive metastore, the plan was: read the target table with sqlContext.sql to get its schema, convert it to an RDD, read Oracle's system tables to get the final data types and lengths, rebuild the schema, and recombine it with the RDD into a DataFrame.
Then write the DataFrame out with the write.jdbc method, adding
option("createTableColumnTypes", "name varchar(200)")
so that the table is created with the right column types. In testing, however, this option cannot be used on Spark 1.5.1; it is only available from Spark 2.2.0.
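For reference, on Spark 2.2.0 or later the option goes on the DataFrameWriter, roughly like this (a minimal sketch; url, table and connectionProperties are placeholders, and the values must be valid Spark SQL types such as VARCHAR rather than Oracle's VARCHAR2):
// Spark 2.2.0+ only; on 1.5.1 this option is not recognised
df.write
  .option("createTableColumnTypes", "NAME VARCHAR(200), COMMENTS VARCHAR(1024)")
  .jdbc(url, table, connectionProperties)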
The test code for the 1.5.1 attempt follows:
package test1
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.sql._
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SaveMode
import oracle.jdbc.driver.OracleDriver
import org.apache.spark.sql.types.StringType
import java.util.ArrayList
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.DataTypes
import scala.collection.mutable.ArrayBuffer
import java.util.Properties
import org.apache.spark.sql.jdbc._
import java.sql.Types
object ojdbcTest {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("firstTry").setMaster("local");
val sc = new SparkContext(conf);
val sqlContext = new HiveContext(sc);
//control schema optimization
var df = sqlContext.sql("select * from ****.BL_E01_REJECTACCOUNT")
val df1 = df.schema.toArray
val theJdbcDF = sqlContext.load("jdbc", Map(
"url" -> "jdbc:oracle:thin:***/*****@//*****/*****",
"dbtable" -> "( select column_name ,data_type,data_length,data_precision,data_scale from user_tab_cols where table_name ='BL_E01_REJECTACCOUNT' order by COLUMN_ID ) a ",
"driver" -> "oracle.jdbc.driver.OracleDriver",
"numPartitions" -> "5",
"lowerBound" -> "0",
"upperBound" -> "80000000"))
val str = theJdbcDF.collect().toArray
var dateArray = new ArrayBuffer[String]
var stringArray = new ArrayBuffer[(String, Int)]
var list = new ArrayList[org.apache.spark.sql.types.StructField]();
var string = new ArrayList[String]
for (j <- 0 until str.length) {
var st = str(j)
var column_name = st.get(0)
var data_type = st.get(1)
var data_length = st.get(2)
var data_precision = st.get(3)
var data_scale = st.get(4)
println(column_name + ":" + data_type + ":" + data_length + ":" + data_precision + ":" + data_scale)
if (data_type.equals("DATE")) {
dateArray += (column_name.toString())
string.add(column_name.toString() + " " + data_type.toString())
}
if (data_type.equals("NUMBER")) {
if (data_precision != null) {
string.add(column_name.toString() + " " + data_type.toString() + s"(${data_precision.toString().toDouble.intValue()},${data_scale.toString().toDouble.intValue()})")
} else {
string.add(column_name.toString() + " " + data_type.toString())
}
}
if (data_type.equals("VARCHAR2")) {
stringArray += ((column_name.toString(), data_length.toString().toDouble.intValue()))
string.add(column_name.toString() + " " + data_type.toString() + s"(${data_length.toString().toDouble.intValue()})")
}
}
for (i <- 0 until df1.length) {
var b = df1(i)
var dataName = b.name
var dataType = b.dataType
// println("字段名"+dataName+"字段類型"+dataType)
if (dateArray.exists(p => p.equalsIgnoreCase(s"${dataName}"))) {
dataType = DateType
}
var structType = DataTypes.createStructField(dataName, dataType, true)
list.add(structType)
}
val schema = DataTypes.createStructType(list)
if (dateArray.length > 0) {
for (m <- 0 until dateArray.length) {
var mm = dateArray(m).toString()
println("mm:" + mm)
var df5 = df.withColumn(s"$mm", df(s"$mm").cast(DateType))
df = df5
}
}
val rdd = df.toJavaRDD
val df2 = sqlContext.createDataFrame(rdd, schema);
df2.printSchema()
val url = "jdbc:oracle:thin:@//*******/***"
val table = "test2"
val user = "***"
val password = "***"
val url1="jdbc:oracle:thin:***/***@//***/***"
val connectionProperties = new Properties()
connectionProperties.put("user", user)
connectionProperties.put("password", password)
connectionProperties.put("driver", "oracle.jdbc.driver.OracleDriver")
val a = string.toString()
val option = a.substring(1, a.length() - 1)
println(option)
df2.write.option("createTableColumnTypes", s"${option}").jdbc(url, table, connectionProperties)
sc.stop()
}
}
The code is written fairly casually; it's just a test class.
2. Second approach:
Given the situation above, the first method does not work on 1.5.1, so we tried a new approach:
override the methods of JdbcDialect, the class Spark SQL uses to map JDBC database types. Overriding them is enough to get simple type conversions on read and write.
package test1
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.sql._
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SaveMode
import oracle.jdbc.driver.OracleDriver
import org.apache.spark.sql.types.StringType
import java.util.ArrayList
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.DataTypes
import scala.collection.mutable.ArrayBuffer
import java.util.Properties
import org.apache.spark.sql.jdbc._
import java.sql.Types
object ojdbcTest {
def oracleInit(){
val dialect:JdbcDialect= new JdbcDialect() {
override def canHandle(url:String)={
url.startsWith("jdbc:oracle");
}
//type conversion when reading from Oracle; returning None falls back to Spark's default mapping
override def getCatalystType(sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = None
//type conversion when writing to Oracle
override def getJDBCType(dt:DataType):Option[org.apache.spark.sql.jdbc.JdbcType]=
dt match{
case BooleanType => Some(JdbcType("NUMBER(1)", java.sql.Types.BOOLEAN))
case IntegerType => Some(JdbcType("NUMBER(10)", java.sql.Types.INTEGER))
case LongType => Some(JdbcType("NUMBER(19)", java.sql.Types.BIGINT))
case FloatType => Some(JdbcType("NUMBER(19, 4)", java.sql.Types.FLOAT))
case DoubleType => Some(JdbcType("NUMBER(19, 4)", java.sql.Types.DOUBLE))
case ByteType => Some(JdbcType("NUMBER(3)", java.sql.Types.SMALLINT))
case ShortType => Some(JdbcType("NUMBER(5)", java.sql.Types.SMALLINT))
case StringType => Some(JdbcType("VARCHAR2(250)", java.sql.Types.VARCHAR))
case DateType => Some(JdbcType("DATE", java.sql.Types.DATE))
case DecimalType.Unlimited => Some(JdbcType("NUMBER",java.sql.Types.NUMERIC))
case _ => None
}
}
JdbcDialects.registerDialect(dialect);
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("firstTry").setMaster("local");
val sc = new SparkContext(conf);
val sqlContext = new HiveContext(sc);
//control schema optimization
var df = sqlContext.sql("select * from ****.BL_E01_REJECTACCOUNT")
val df1 = df.schema.toArray
val theJdbcDF = sqlContext.load("jdbc", Map(
"url" -> "jdbc:oracle:thin:****/****@//********/claimamdb",
"dbtable" -> "( select column_name ,data_type,data_length,data_precision,data_scale from user_tab_cols where table_name ='BL_E01_REJECTACCOUNT' order by COLUMN_ID ) a ",
"driver" -> "oracle.jdbc.driver.OracleDriver",
"numPartitions" -> "5",
"lowerBound" -> "0",
"upperBound" -> "80000000"))
val str = theJdbcDF.collect().toArray
var dateArray = new ArrayBuffer[String]
var stringArray = new ArrayBuffer[(String, Int)]
var list = new ArrayList[org.apache.spark.sql.types.StructField]();
for (j <- 0 until str.length) {
var st = str(j)
var column_name = st.get(0)
var data_type = st.get(1)
var data_length = st.get(2)
var data_precision = st.get(3)
var data_scale = st.get(4)
println(column_name + ":" + data_type + ":" + data_length + ":" + data_precision + ":" + data_scale)
if (data_type.equals("DATE")) {
dateArray += (column_name.toString())
}
if (data_type.equals("VARCHAR2")) {
stringArray += ((column_name.toString(), data_length.toString().toDouble.intValue()))
}
}
for (i <- 0 until df1.length) {
var b = df1(i)
var dataName = b.name
var dataType = b.dataType
// println("字段名"+dataName+"字段類型"+dataType)
if (dateArray.exists(p => p.equalsIgnoreCase(s"${dataName}"))) {
dataType = DateType
}
var structType = DataTypes.createStructField(dataName, dataType, true)
list.add(structType)
}
val schema = DataTypes.createStructType(list)
if (dateArray.length > 0) {
for (m <- 0 until dateArray.length) {
var mm = dateArray(m).toString()
println("mm:" + mm)
var df5 = df.withColumn(s"$mm", df(s"$mm").cast(DateType))
df = df5
}
}
val rdd = df.toJavaRDD
val df2 = sqlContext.createDataFrame(rdd, schema);
df2.printSchema()
val url = "jdbc:oracle:thin:@//********/claimamdb"
val table = "test2"
val user = "****"
val password = "****"
val url1="jdbc:oracle:thin:****/****@//********/claimamdb"
val connectionProperties = new Properties()
connectionProperties.put("user", user)
connectionProperties.put("password", password)
connectionProperties.put("driver", "oracle.jdbc.driver.OracleDriver")
oracleInit()
df2.write.jdbc(url, table, connectionProperties)
sc.stop()
}
}
This approach only handles simple type conversions. It cannot solve my real problem, converting Hive strings that were originally Oracle DATEs back into Oracle DATE, because even with the override there is no way to pass extra information in, so the dialect cannot tell which string column is actually a date. You could extend the Logging trait and rewrite JdbcUtils yourself, but that means digging through the source code and is fairly involved.
3. Third approach:
The code is the same as in the first approach.
Because there was no way to create the table with exact column types (a string written to Oracle without a length defaults to 255), I switched to createJDBCTable plus insertIntoJDBC(url1, table, true). It turned out that insertIntoJDBC is buggy in this version. The official documentation says:
Save this DataFrame to a JDBC database at url under the table name table. Assumes the table already exists and has a compatible schema. If you pass true for overwrite, it will TRUNCATE the table before performing the INSERTs.
The table must already exist on the database. It must have a schema that is compatible with the schema of this RDD; inserting the rows of the RDD in order via the simple statement INSERT INTO table VALUES (?, ?, ..., ?) should not fail.
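The calls we attempted looked roughly like this (a sketch using the deprecated 1.x DataFrame helpers, with df2, url1 and table as defined in the first code listing):
// per the documentation, the second call should TRUNCATE the existing table and then INSERT the rows
df2.createJDBCTable(url1, table, false) // create the target table once
df2.insertIntoJDBC(url1, table, true)   // overwrite = true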
Yet it still failed with an error that the table already exists. Searching sites abroad showed that this is a known bug in that version.
After all of that, none of the methods above will do, so how do we get our data in with exact types?
4. Fourth approach:
Oracle's maximum VARCHAR2 length is 4000, so my thinking was: override the dialect's getJDBCType so that every string maps to VARCHAR2(4000), guaranteeing nothing gets truncated; use plain Oracle JDBC to run the CREATE TABLE statement assembled from the system-table metadata for the target table; write the DataFrame into a temporary Oracle table (where every string column is VARCHAR2(4000)); and finally run an INSERT ... SELECT from the temporary table into the target table.
For DATE columns, once the system tables tell me which columns are dates, I cast them to timestamp, and the overridden getJDBCType maps TimestampType down to Oracle's DATE, so the dates are not truncated either.
The code is as follows:
package test1
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.sql._
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SaveMode
import oracle.jdbc.driver.OracleDriver
import org.apache.spark.sql.types.StringType
import java.util.ArrayList
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.DataTypes
import scala.collection.mutable.ArrayBuffer
import java.util.Properties
import org.apache.spark.sql.jdbc._
import java.sql.Types
import java.sql.Connection
import java.sql.DriverManager
object ojdbcTest {
def oracleInit(){
val dialect:JdbcDialect= new JdbcDialect() {
override def canHandle(url:String)={
url.startsWith("jdbc:oracle");
}
// override def getCatalystType(sqlType, typeName, size, md):Option[DataType]={
//
//
// }
override def getJDBCType(dt:DataType):Option[org.apache.spark.sql.jdbc.JdbcType]=
dt match{
case BooleanType => Some(JdbcType("NUMBER(1)", java.sql.Types.BOOLEAN))
case IntegerType => Some(JdbcType("NUMBER(10)", java.sql.Types.INTEGER))
case LongType => Some(JdbcType("NUMBER(19)", java.sql.Types.BIGINT))
case FloatType => Some(JdbcType("NUMBER(19, 4)", java.sql.Types.FLOAT))
case DoubleType => Some(JdbcType("NUMBER(19, 4)", java.sql.Types.DOUBLE))
case ByteType => Some(JdbcType("NUMBER(3)", java.sql.Types.SMALLINT))
case ShortType => Some(JdbcType("NUMBER(5)", java.sql.Types.SMALLINT))
case StringType => Some(JdbcType("VARCHAR2(4000)", java.sql.Types.VARCHAR))
case DateType => Some(JdbcType("DATE", java.sql.Types.DATE))
case DecimalType.Unlimited => Some(JdbcType("NUMBER",java.sql.Types.NUMERIC))
case TimestampType=> Some(JdbcType("DATE",java.sql.Types.DATE))
case _ => None
}
}
JdbcDialects.registerDialect(dialect);
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("firstTry").setMaster("local");
val sc = new SparkContext(conf);
val sqlContext = new HiveContext(sc);
//control schema optimization
var df = sqlContext.sql("select * from ******.BL_E01_REJECTACCOUNT")
val df1 = df.schema.toArray
//val customSchema = sparkTargetDF.dtypes.map(x => x._1+" "+x._2).mkString(",").toUpperCase()
val theJdbcDF = sqlContext.load("jdbc", Map(
"url" -> "jdbc:oracle:thin:********/********//********/********",
"dbtable" -> "( select column_name ,data_type,data_length,data_precision,data_scale from user_tab_cols where table_name ='BL_E01_REJECTACCOUNT' order by COLUMN_ID ) a ",
"driver" -> "oracle.jdbc.driver.OracleDriver",
"numPartitions" -> "5",
"lowerBound" -> "0",
"upperBound" -> "80000000"))
val str = theJdbcDF.collect().toArray
var dateArray = new ArrayBuffer[String]
var stringArray = new ArrayBuffer[(String, Int)]
var list = new ArrayList[org.apache.spark.sql.types.StructField]();
var string = new ArrayList[String]
for (j <- 0 until str.length) {
var st = str(j)
var column_name = st.get(0)
var data_type = st.get(1)
var data_length = st.get(2)
var data_precision = st.get(3)
var data_scale = st.get(4)
println(column_name + ":" + data_type + ":" + data_length + ":" + data_precision + ":" + data_scale)
if (data_type.equals("DATE")) {
dateArray += (column_name.toString())
string.add(column_name.toString() + " " + data_type.toString())
}
if (data_type.equals("NUMBER")) {
if (data_precision != null) {
string.add(column_name.toString() + " " + data_type.toString() + s"(${data_precision.toString().toDouble.intValue()},${data_scale.toString().toDouble.intValue()})")
} else {
string.add(column_name.toString() + " " + data_type.toString())
}
}
if (data_type.equals("VARCHAR2")) {
stringArray += ((column_name.toString(), data_length.toString().toDouble.intValue()))
string.add(column_name.toString() + " " + data_type.toString() + s"(${data_length.toString().toDouble.intValue()})")
}
}
for (i <- 0 until df1.length) {
var b = df1(i)
var dataName = b.name
var dataType = b.dataType
// println("字段名"+dataName+"字段類型"+dataType)
if (dateArray.exists(p => p.equalsIgnoreCase(s"${dataName}"))) {
dataType = TimestampType
}
var structType = DataTypes.createStructField(dataName, dataType, true)
list.add(structType)
}
val schema = DataTypes.createStructType(list)
if (dateArray.length > 0) {
for (m <- 0 until dateArray.length) {
var mm = dateArray(m).toString()
println("mm:" + mm)
var df5 = df.withColumn(s"$mm", df(s"$mm").cast(TimestampType))
df = df5
}
}
val rdd = df.toJavaRDD
val df2 = sqlContext.createDataFrame(rdd, schema);
df2.printSchema()
val url = "jdbc:oracle:thin:@//********/********"
val table = "test2"
val table1="test3"
val user = "********"
val password = "#EDC5tgb"
val url1 = "jdbc:oracle:thin:********/********//********/********"
val connectionProperties = new Properties()
connectionProperties.put("user", user)
connectionProperties.put("password", password)
connectionProperties.put("driver", "oracle.jdbc.driver.OracleDriver")
val a = string.toString()
val option = a.substring(1, a.length() - 1)
println(option)
oracleInit()
createJdbcTable(option,table)
println("create table is finish!")
df2.write.jdbc(url, table1, connectionProperties)
insertTable(table,table1)
println("已導入目標表!")
sc.stop()
//option("createTableColumnTypes", "CLAIMNO VARCHAR2(300), comments VARCHAR(1024)")
//df2.select(df2("POLICYNO")).write.option("createTableColumnTypes", "CLAIMNO VARCHAR2(200)")
//.jdbc(url, table, connectionProperties)
}
def createJdbcTable(option:String,table:String) = {
val url = "jdbc:oracle:thin:@//********/********"
//driver class name
val driver = "oracle.jdbc.driver.OracleDriver"
//username
val username = "********"
//password
val password = "#EDC5tgb"
//initialize the connection
var connection: Connection = null
try {
//register the Driver
Class.forName(driver)
//get the connection
connection = DriverManager.getConnection(url, username, password)
val statement = connection.createStatement
//build and execute the CREATE TABLE statement
val sql =s"""
create table ${table}
(
${option}
)
"""
statement.executeUpdate(sql)
} catch { case e: Exception => e.printStackTrace }
finally {
//close the connection and release resources
if (connection != null) connection.close()
}
}
def insertTable(table:String,table1:String){
val url = "jdbc:oracle:thin:@//********/********"
//driver class name
val driver = "oracle.jdbc.driver.OracleDriver"
//username
val username = "********"
//password
val password = "*********"
//initialize the connection
var connection: Connection = null
try {
//register the Driver
Class.forName(driver)
//get the connection
connection = DriverManager.getConnection(url, username, password)
val statement = connection.createStatement
//build and execute the INSERT ... SELECT statement
val sql =s"""
insert into ${table} select * from ${table1}
"""
statement.executeUpdate(sql)
} catch { case e: Exception => e.printStackTrace }
finally {
//close the connection and release resources
if (connection != null) connection.close()
}
}
}
This version has plenty of other pitfalls. For example, with
write.mode().jdbc()
whatever argument you pass to mode(), be it append or ignore, the table still gets overwritten. Looking at the source, the SaveMode is effectively hard-coded to overwrite.
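For example, a call like the following (a sketch reusing url, table and connectionProperties from the code above) still ends up replacing the target table on this version, even though Append was requested:
import org.apache.spark.sql.SaveMode
// on 1.5.1 the requested mode is not honoured by the JDBC path; the write behaves like Overwrite
df2.write.mode(SaveMode.Append).jdbc(url, table, connectionProperties)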
This issue is discussed in detail at:
https://www.2cto.com/net/201609/551130.html
I hope this saves you some detours!
Addendum: since the production environment does not allow creating tables, we switched to
val connectionProperties = new Properties()
connectionProperties.put("user", user)
connectionProperties.put("password", password)
connectionProperties.put("driver", "oracle.jdbc.driver.OracleDriver")
JdbcUtils.saveTable(df, url, table, connectionProperties)
to insert the data; tested in practice, it works.
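A slightly fuller sketch of that addendum (assuming the Spark 1.5.x location of JdbcUtils under org.apache.spark.sql.execution.datasources.jdbc; df, url, table and connectionProperties as above, with the custom dialect from oracleInit() registered first):
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
oracleInit() // register the custom Oracle dialect so the write-side type mapping applies
// saveTable only issues INSERTs; it never creates the table, so the target table must already exist
JdbcUtils.saveTable(df, url, table, connectionProperties)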