一、組件版本
組件 | 版本 |
---|---|
Hadoop | 3.0.0+cdh6.1.1 |
Hive | 2.1.1+cdh6.1.1 |
spark | 2.4.0+cdh6.1.1 |
二、問題描述
在 Spark 向 Hive分區表 寫入數據時,拋出異常如下:
org.apache.spark.SparkException: Requested partitioning does not match the test_table_name table:
Requested partitions:
Table partitions: city, year, month, day
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.processInsert(InsertIntoHiveTable.scala:141)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.run(InsertIntoHiveTable.scala:99)
at org.apache.spark.sql.hive.execution.CreateHiveTableAsSelectCommand.run(CreateHiveTableAsSelectCommand.scala:66)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter.createTable(DataFrameWriter.scala:458)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:437)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:393)
出錯的代碼如下:
// ETL job that failed: parse JSON event records from HDFS into s_vai_primary rows,
// then append them to a Hive table partitioned by (city, year, month, day).
// NOTE(review): the line below looks like a method *signature* pasted as a call
// (typed parameters in the argument list) — the real code presumably passes values.
val rdd = hdfs2RDD(dir: String, spark: SparkSession, task: EtlTask)
import spark.implicits._
val res = spark.createDataset(rdd).as[String]
.map(x => {
val obj = JSON.parseObject(x)
val event_refid = obj.getString("event_refid") // globally unique event id
val event_cate = obj.getString("event_cate") // event major category
val event_type = obj.getString("event_type") // event minor category
val server_fid = obj.getString("server_id") // id of the analysis host
val event_date = obj.getString("event_date") // event upload time
val camera_fid = obj.getInteger("camera_id") // associated camera id
val pole_fid = "-99" // pole ID; obtained by joining s_common_base_camera via camera_fid
val vai_source = obj.getInteger("source_id") // video-algorithm vendor code
val version = obj.getString("version") // protocol version code
val city = 440300 // obtained by joining s_common_base_camera via camera_fid
val tmpTime = obj.getString("event_date").trim // event timestamp string
// Parse "yyyy-MM-dd HH:mm:ss" and derive the year/month/day partition values.
val dataTime = DateTime.parse(tmpTime, DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"))
val tmpTime2 = dataTime.toString("yyyyMMddHH")
val year = tmpTime2.substring(0, 4)
val month = tmpTime2.substring(4, 6)
val day = tmpTime2.substring(6, 8)
s_vai_primary(UUID.randomUUID().toString, event_refid, event_cate, event_type, server_fid, event_date,
camera_fid, pole_fid, vai_source, version, city, year, month, day)
})
res.show(10, false)
// Writes res to Hive with partition columns (city, year, month, day); this is the
// call that ends in saveAsTable and triggers the SparkException described above.
TaskOutput.hive[s_vai_primary](spark, task, Array("city", "year", "month", "day"), res, isPersist = false, dupliCols = Seq(""), isCopy2Tag = false)
三、問題分析
從上面的報錯信息,可以看到是分區不匹配造成的
org.apache.spark.SparkException: Requested partitioning does not match the test_table_name table:
Requested partitions:
Table partitions: city,year,month,day
Spark在向Hive分區表寫入數據時候,找不到這個數據按照哪個分區寫入
Requested partitions後面的值爲空,但是去hive元數據裏面查發現要插入的表是按照city,year,month,day字段分區的,Table partitions: city,year,month,day。這樣造成了不一致匹配問題,從而拋出異常。
問題代碼如下,我們走的是if分支,因爲表存在,我們也沒有刪除操作,那麼創建InsertIntoHiveTable對象時把分區字段置成了一個空的map
CreateHiveTableAsSelectCommand類 的 run方法
// CreateHiveTableAsSelectCommand.run (quoted from Spark 2.4.0): decides how to
// insert the CTAS query result depending on whether the target table exists.
override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
val catalog = sparkSession.sessionState.catalog
// Check whether the table exists: if it does, take the if branch, otherwise the else branch.
if (catalog.tableExists(tableIdentifier)) {
// In Overwrite mode the table is expected to have been dropped before reaching here.
assert(mode != SaveMode.Overwrite,
s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")
if (mode == SaveMode.ErrorIfExists) {
throw new AnalysisException(s"$tableIdentifier already exists.")
}
if (mode == SaveMode.Ignore) {
// Since the table already exists and the save mode is Ignore, we will just return.
return Seq.empty
}
// We write to Hive in append mode and the table is already created, so this
// InsertIntoHiveTable is built and run directly.
InsertIntoHiveTable(
tableDesc,
Map.empty, // BUG discussed in this post: partition spec is an empty map, ignoring the table's partition columns
query,
overwrite = false, // append mode, not overwrite
ifPartitionNotExists = false,
outputColumns = outputColumns).run(sparkSession, child)
} else {
// Table does not exist yet.
// For CTAS the declared schema must be empty, otherwise this assertion throws.
assert(tableDesc.schema.isEmpty)
// Create the table using the query's schema.
catalog.createTable(tableDesc.copy(schema = query.schema), ignoreIfExists = false)
try {
// Read back the metadata of the table which was created just now.
val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
// For CTAS, there is no static partition values to insert.
// Partition map is built from the partitionBy columns (all dynamic: value None).
val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
InsertIntoHiveTable(
createdTableMeta,
partition,
query,
overwrite = true, // freshly created table is written in overwrite mode
ifPartitionNotExists = false,
outputColumns = outputColumns).run(sparkSession, child)
} catch {
case NonFatal(e) =>
// drop the created table.
catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
throw e
}
}
Seq.empty[Row]
}
我們看看InsertIntoHiveTable.run方法如何使用這個空map的
// InsertIntoHiveTable.run (quoted from Spark 2.4.0): builds the Hive TableDesc,
// writes the query output to a staging directory, then loads it into the table.
// The `partition` member received from the caller is what processInsert validates.
override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
val externalCatalog = sparkSession.sharedState.externalCatalog
val hadoopConf = sparkSession.sessionState.newHadoopConf()
val hiveQlTable = HiveClientImpl.toHiveTable(table)
// Have to pass the TableDesc object to RDD.mapPartitions and then instantiate new serializer
// instances within the closure, since Serializer is not serializable while TableDesc is.
val tableDesc = new TableDesc(
hiveQlTable.getInputFormatClass,
// The class of table should be org.apache.hadoop.hive.ql.metadata.Table because
// getOutputFormatClass will use HiveFileFormatUtils.getOutputFormatSubstitute to
// substitute some output formats, e.g. substituting SequenceFileOutputFormat to
// HiveSequenceFileOutputFormat.
hiveQlTable.getOutputFormatClass,
hiveQlTable.getMetadata
)
val tableLocation = hiveQlTable.getDataLocation
// Staging path under which the query result is materialized before the load.
val tmpLocation = getExternalTmpPath(sparkSession, hadoopConf, tableLocation)
try {
processInsert(sparkSession, externalCatalog, hadoopConf, tableDesc, tmpLocation, child)
} finally {
// Attempt to delete the staging directory and the inclusive files. If failed, the files are
// expected to be dropped at the normal termination of VM since deleteOnExit is used.
deleteExternalTmpPath(hadoopConf)
}
// un-cache this table.
sparkSession.catalog.uncacheTable(table.identifier.quotedString)
sparkSession.sessionState.catalog.refreshTable(table.identifier)
CommandUtils.updateTableStats(sparkSession, table)
// It would be nice to just return the childRdd unchanged so insert operations could be chained,
// however for now we return an empty list to simplify compatibility checks with hive, which
// does not return anything for insert operations.
// TODO: implement hive compatibility as rules.
Seq.empty[Row]
}
// InsertIntoHiveTable.processInsert (quoted from Spark 2.4.0): validates the
// partition spec against the table's partition columns, writes the data, and
// loads it into the table/partitions via the external catalog.
// NOTE(review): this is a blog paste — original indentation was stripped and the
// brace nesting near the end may not reproduce the real source exactly.
private def processInsert(
sparkSession: SparkSession,
externalCatalog: ExternalCatalog,
hadoopConf: Configuration,
tableDesc: TableDesc,
tmpLocation: Path,
child: SparkPlan): Unit = {
val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false)
// A partition entry with value None is dynamic; Some(value) is static.
val numDynamicPartitions = partition.values.count(_.isEmpty)
val numStaticPartitions = partition.values.count(_.nonEmpty)
val partitionSpec = partition.map {
case (key, Some(value)) => key -> value
case (key, None) => key -> ""
}
// All partition column names in the format of "<column name 1>/<column name 2>/..."
val partitionColumns = fileSinkConf.getTableInfo.getProperties.getProperty("partition_columns")
val partitionColumnNames = Option(partitionColumns).map(_.split("/")).getOrElse(Array.empty)
// By this time, the partition map must match the table's partition columns.
// This is exactly where the exception from the stack trace is thrown: with the
// empty map passed in, partition.keySet is empty while the table has 4 columns.
if (partitionColumnNames.toSet != partition.keySet) {
throw new SparkException(
s"""Requested partitioning does not match the ${table.identifier.table} table:
|Requested partitions: ${partition.keys.mkString(",")}
|Table partitions: ${table.partitionColumnNames.mkString(",")}""".stripMargin)
}
// Validate partition spec if there exist any dynamic partitions
if (numDynamicPartitions > 0) {
// Report error if dynamic partitioning is not enabled
if (!hadoopConf.get("hive.exec.dynamic.partition", "true").toBoolean) {
throw new SparkException(ErrorMsg.DYNAMIC_PARTITION_DISABLED.getMsg)
}
// Report error if dynamic partition strict mode is on but no static partition is found
if (numStaticPartitions == 0 &&
hadoopConf.get("hive.exec.dynamic.partition.mode", "strict").equalsIgnoreCase("strict")) {
throw new SparkException(ErrorMsg.DYNAMIC_PARTITION_STRICT_MODE.getMsg)
}
// Report error if any static partition appears after a dynamic partition
val isDynamic = partitionColumnNames.map(partitionSpec(_).isEmpty)
if (isDynamic.init.zip(isDynamic.tail).contains((true, false))) {
throw new AnalysisException(ErrorMsg.PARTITION_DYN_STA_ORDER.getMsg)
}
}
table.bucketSpec match {
case Some(bucketSpec) =>
// Writes to bucketed hive tables are allowed only if user does not care about maintaining
// table's bucketing ie. both "hive.enforce.bucketing" and "hive.enforce.sorting" are
// set to false
val enforceBucketingConfig = "hive.enforce.bucketing"
val enforceSortingConfig = "hive.enforce.sorting"
val message = s"Output Hive table ${table.identifier} is bucketed but Spark " +
"currently does NOT populate bucketed output which is compatible with Hive."
if (hadoopConf.get(enforceBucketingConfig, "true").toBoolean ||
hadoopConf.get(enforceSortingConfig, "true").toBoolean) {
throw new AnalysisException(message)
} else {
logWarning(message + s" Inserting data anyways since both $enforceBucketingConfig and " +
s"$enforceSortingConfig are set to false.")
}
case _ => // do nothing since table has no bucketing
}
// Resolve the query attributes backing the dynamic partition columns
// (dynamic columns always come last in the partition column list).
val partitionAttributes = partitionColumnNames.takeRight(numDynamicPartitions).map { name =>
query.resolve(name :: Nil, sparkSession.sessionState.analyzer.resolver).getOrElse {
throw new AnalysisException(
s"Unable to resolve $name given [${query.output.map(_.name).mkString(", ")}]")
}.asInstanceOf[Attribute]
}
// Materialize the query result into the staging directory.
saveAsHiveFile(
sparkSession = sparkSession,
plan = child,
hadoopConf = hadoopConf,
fileSinkConf = fileSinkConf,
outputLocation = tmpLocation.toString,
partitionAttributes = partitionAttributes)
if (partition.nonEmpty) {
if (numDynamicPartitions > 0) {
externalCatalog.loadDynamicPartitions(
db = table.database,
table = table.identifier.table,
tmpLocation.toString,
partitionSpec,
overwrite,
numDynamicPartitions)
} else {
// scalastyle:off
// ifNotExists is only valid with static partition, refer to
// https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML#LanguageManualDML-InsertingdataintoHiveTablesfromqueries
// scalastyle:on
val oldPart =
externalCatalog.getPartitionOption(
table.database,
table.identifier.table,
partitionSpec)
var doHiveOverwrite = overwrite
if (oldPart.isEmpty || !ifPartitionNotExists) {
// SPARK-18107: Insert overwrite runs much slower than hive-client.
// Newer Hive largely improves insert overwrite performance. As Spark uses older Hive
// version and we may not want to catch up new Hive version every time. We delete the
// Hive partition first and then load data file into the Hive partition.
if (oldPart.nonEmpty && overwrite) {
oldPart.get.storage.locationUri.foreach { uri =>
val partitionPath = new Path(uri)
val fs = partitionPath.getFileSystem(hadoopConf)
if (fs.exists(partitionPath)) {
if (!fs.delete(partitionPath, true)) {
throw new RuntimeException(
"Cannot remove partition directory '" + partitionPath.toString)
}
// Don't let Hive do overwrite operation since it is slower.
doHiveOverwrite = false
}
}
}
// inheritTableSpecs is set to true. It should be set to false for an IMPORT query
// which is currently considered as a Hive native command.
val inheritTableSpecs = true
externalCatalog.loadPartition(
table.database,
table.identifier.table,
tmpLocation.toString,
partitionSpec,
isOverwrite = doHiveOverwrite,
inheritTableSpecs = inheritTableSpecs,
isSrcLocal = false)
}
}
} else {
// Non-partitioned table: load the whole staging directory into the table.
externalCatalog.loadTable(
table.database,
table.identifier.table,
tmpLocation.toString, // TODO: URI
overwrite,
isSrcLocal = false)
}
}
四、解決辦法
從github下載spark源碼,找到 CreateHiveTableAsSelectCommand
類 的 run
方法
org\apache\spark\sql\hive\execution\CreateHiveTableAsSelectCommand.scala
修改如下代碼:
將
// Original (buggy) call in CreateHiveTableAsSelectCommand: empty partition map.
// NOTE(review): this fragment uses `outputColumnNames` while the quoted source
// above uses `outputColumns` — the parameter was renamed between Spark minor
// versions; make sure the patch matches the exact Spark source you rebuild.
InsertIntoHiveTable(
tableDesc,
Map.empty,
query,
overwrite = false,
ifPartitionNotExists = false,
outputColumnNames = outputColumnNames).run(sparkSession, child)
修改爲
// Fixed call: build the dynamic-partition map from the existing table's
// partition column names (every column -> None, i.e. fully dynamic), mirroring
// what the CTAS else-branch already does for newly created tables.
val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap
InsertIntoHiveTable(
tableDesc,
partition,
query,
overwrite = false,
ifPartitionNotExists = false,
outputColumnNames = outputColumnNames).run(sparkSession, child)
編譯代碼,替換CDH中 spark-hive jar包的CreateHiveTableAsSelectCommand
class文件
具體相關操作我已經改好了,只需下載替換即可,下載鏈接
鏈接:https://pan.baidu.com/s/10B-1sdIN3NGJtNrcxqxFHg
提取碼:lic7
複製這段內容後打開百度網盤手機App,操作更方便哦
替換完記得重啓