先上結論:
sparksql中不支持create external table外部表的創建,只能是非external表。使用write.option("path", "/some/path").saveAsTable建出來的則是external表。
使用外部表,可以直接把數據加載成Dataset,再通過createOrReplaceTempView註冊成臨時視圖完成查詢。
如果註冊的表是createGlobalTempView,那麼訪問表需要加上數據庫名,即global_temp.tableName;否則默認在default庫中查找,會導致報錯:Table or view 'tableName' not found in database 'default';
執行如下sql:
// spark sql native syntax "USING HIVE"
sql("create external table if not exists src(key INT ,value STRING ) row format delimited fields terminated by '\t'")
報錯信息:
Exception in thread "main" org.apache.spark.sql.catalyst.parser.ParseException:
Operation not allowed: CREATE EXTERNAL TABLE must be accompanied by LOCATION(line 1, pos 0)
== SQL ==
create external table if not exists src(key INT ,value STRING ) row format delimited fields terminated by ' '
^^^
at org.apache.spark.sql.catalyst.parser.ParserUtils$.operationNotAllowed(ParserUtils.scala:41)
at org.apache.spark.sql.execution.SparkSqlAstBuilder$$anonfun$visitCreateHiveTable$1.apply(SparkSqlParser.scala:1153)
at org.apache.spark.sql.execution.SparkSqlAstBuilder$$anonfun$visitCreateHiveTable$1.apply(SparkSqlParser.scala:1113)
at org.apache.spark.sql.catalyst.parser.ParserUtils$.withOrigin(ParserUtils.scala:108)
at org.apache.spark.sql.execution.SparkSqlAstBuilder.visitCreateHiveTable(SparkSqlParser.scala:1113)
at org.apache.spark.sql.execution.SparkSqlAstBuilder.visitCreateHiveTable(SparkSqlParser.scala:55)
at org.apache.spark.sql.catalyst.parser.SqlBaseParser$CreateHiveTableContext.accept(SqlBaseParser.java:1206)
at org.antlr.v4.runtime.tree.AbstractParseTreeVisitor.visit(AbstractParseTreeVisitor.java:18)
at org.apache.spark.sql.catalyst.parser.AstBuilder$$anonfun$visitSingleStatement$1.apply(AstBuilder.scala:72)
at org.apache.spark.sql.catalyst.parser.AstBuilder$$anonfun$visitSingleStatement$1.apply(AstBuilder.scala:72)
at org.apache.spark.sql.catalyst.parser.ParserUtils$.withOrigin(ParserUtils.scala:108)
at org.apache.spark.sql.catalyst.parser.AstBuilder.visitSingleStatement(AstBuilder.scala:71)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser$$anonfun$parsePlan$1.apply(ParseDriver.scala:70)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser$$anonfun$parsePlan$1.apply(ParseDriver.scala:69)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:98)
at org.apache.spark.sql.execution.SparkSqlParser.parse(SparkSqlParser.scala:48)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parsePlan(ParseDriver.scala:69)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:642)
at com.example.sql.hive.SparkHiveExample$.main(SparkHiveExample.scala:36)
at com.example.sql.hive.SparkHiveExample.main(SparkHiveExample.scala)
查看源碼:
SparkSession.scala
def sql(sqlText: String): DataFrame = {
Dataset.ofRows(self, sessionState.sqlParser.parsePlan(sqlText))
}
解析sqlText成邏輯計劃:sqlText ->LogicalPlan
SparkSqlAstBuilder將ANTLR 解析sql得到的ParseTree 轉換成:LogicalPlan/Expression/TableIdentifier.
其中CreateTableHeaderContext用於解析CREATE [TEMPORARY] [EXTERNAL] TABLE ... [IF NOT EXISTS]等構成TableHeader的信息。
public static class CreateTableHeaderContext extends ParserRuleContext {
public TerminalNode CREATE() { return getToken(SqlBaseParser.CREATE, 0); }
public TerminalNode TABLE() { return getToken(SqlBaseParser.TABLE, 0); }
public TableIdentifierContext tableIdentifier() {
return getRuleContext(TableIdentifierContext.class,0);
}
public TerminalNode TEMPORARY() { return getToken(SqlBaseParser.TEMPORARY, 0); }
public TerminalNode EXTERNAL() { return getToken(SqlBaseParser.EXTERNAL, 0); }
public TerminalNode IF() { return getToken(SqlBaseParser.IF, 0); }
public TerminalNode NOT() { return getToken(SqlBaseParser.NOT, 0); }
public TerminalNode EXISTS() { return getToken(SqlBaseParser.EXISTS, 0); }
public CreateTableHeaderContext(ParserRuleContext parent, int invokingState) {
super(parent, invokingState);
}
其中SparkSqlAstBuilder中有如下代碼,當出現external時直接報錯。(注意:上面堆棧中的報錯其實來自visitCreateHiveTable,檢查的是LOCATION子句;下面的visitCreateTable是USING數據源語法路徑,對EXTERNAL做了類似的攔截。)
// SparkSqlAstBuilder
override def visitCreateTable(ctx: CreateTableContext): LogicalPlan = withOrigin(ctx) {
val (table, temp, ifNotExists, external) = visitCreateTableHeader(ctx.createTableHeader)
if (external) {
operationNotAllowed("CREATE EXTERNAL TABLE ... USING", ctx)
}
checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx)
checkDuplicateClauses(ctx.OPTIONS, "OPTIONS", ctx)
checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx)
checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx)
checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx)
checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx)
...
之前已經有人發起過這個問題:SPARK-2825:Allow creating external tables in metastore
spark中createTempView註冊的只是臨時視圖,與hive中external表類似的一點是:drop時都不會刪除底層數據;但臨時視圖僅在當前session內有效,並不會寫入Metastore。
最後補充:spark中createOrReplaceTempView只是一個臨時視圖,程序結束,則失效,使用saveAsTable將表持久化到Hive Metastore中。
如何持久化到hive中形成external表,以免drop表時數據被刪除?
df.write.option("path", "/some/path").saveAsTable("t") 帶有path,表刪除時,數據不會被刪除,是external表。
df.write.saveAsTable("t") 這種不帶path,刪除表時,數據也會被刪除。