測試環境:Ubuntu 16.04
Hudi版本:0.5.2
Spark版本:2.4.0
下載編譯測試
- 下載
git clone https://github.com/apache/incubator-hudi.git && cd incubator-hudi
- 編譯
mvn clean package -Dmaven.test.skip=true
- Hudi CLI測試
寫入查詢測試
-
Spark shell啓動
./spark-shell --packages org.apache.hudi:hudi-spark-bundle_2.11:0.5.1-incubating,org.apache.spark:spark-avro_2.11:2.4.4 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
-
插入數據測試
// Imports for the Hudi quickstart: data generator utilities, Spark SQL save
// modes, and the Hudi read/write option-key constants.
import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._

// Target copy-on-write table and its location on the local filesystem.
val tableName = "hudi_trips_cow"
val basePath = "file:///tmp/hudi_trips_cow"

// Generate 10 sample trip records and load them into a DataFrame.
val dataGen = new DataGenerator
val inserts = convertToStringList(dataGen.generateInserts(10))
val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))

// Write the DataFrame as a new Hudi table; Overwrite replaces any existing data.
df.write.format("hudi").
  options(getQuickstartWriteConfigs).
  option(PRECOMBINE_FIELD_OPT_KEY, "ts").               // pre-combine ordering field
  option(RECORDKEY_FIELD_OPT_KEY, "uuid").              // record key field
  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). // partition path field
  option(TABLE_NAME, tableName).
  mode(Overwrite).
  save(basePath)
-
讀取數據測試
// Snapshot query: load the whole table via a glob over the partition-path
// directories, register it as a temp view, and query it with Spark SQL.
val tripsSnapshotDF = spark.
  read.
  format("hudi").
  load(basePath + "/*/*/*/*")
tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot")

spark.sql("select fare, begin_lon, begin_lat, ts from hudi_trips_snapshot where fare > 20.0").show()
spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_trips_snapshot").show()
-
更新測試
// Update test: generate updates for previously inserted keys and write them
// with Append mode (the default upsert operation rewrites matching records).
val updates = convertToStringList(dataGen.generateUpdates(10))
val df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
df.write.format("hudi").
  options(getQuickstartWriteConfigs).
  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
  option(TABLE_NAME, tableName).
  mode(Append).
  save(basePath)
更新前數據:
更新後數據:
除了_hoodie_record_key ,其它字段都發生了更新 -
增量查詢測試
// Incremental query: list the distinct commit times in the table, pick the
// second-to-last one as the starting instant, then read only the records
// written after that instant.
spark.
  read.
  format("hudi").
  load(basePath + "/*/*/*/*").
  createOrReplaceTempView("hudi_trips_snapshot")

val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_trips_snapshot order by commitTime").map(k => k.getString(0)).take(10)
val beginTime = commits(commits.length - 2) // commit time we are interested in

val tripsIncrementalDF = spark.read.format("hudi").
  option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL).
  option(BEGIN_INSTANTTIME_OPT_KEY, beginTime).
  load(basePath)
tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental")
先新插入1批數據
增量查詢
查詢全量數據
這裏增量表只能查到最新插入的數據 -
查詢指定時間段的數據
// Point-in-time query: bound the incremental read on both ends with
// BEGIN_INSTANTTIME and END_INSTANTTIME.
val beginTime = "000" // Represents all commits > this time.
val endTime = commits(commits.length - 2) // commit time we are interested in

// incrementally query data
val tripsPointInTimeDF = spark.read.format("hudi").
  option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL).
  option(BEGIN_INSTANTTIME_OPT_KEY, beginTime).
  option(END_INSTANTTIME_OPT_KEY, endTime).
  load(basePath)
tripsPointInTimeDF.createOrReplaceTempView("hudi_trips_point_in_time")
spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_point_in_time where fare > 20.0").show()
-
刪除數據
// Delete test: pick two existing records, turn them into Hudi delete payloads,
// write them back with the "delete" operation, then re-query to confirm the
// row count dropped by two.

// fetch total records count
spark.sql("select uuid, partitionPath from hudi_trips_snapshot").count()

// fetch two records to be deleted
val ds = spark.sql("select uuid, partitionPath from hudi_trips_snapshot").limit(2)

// issue deletes
val deletes = dataGen.generateDeletes(ds.collectAsList())
val df = spark.read.json(spark.sparkContext.parallelize(deletes, 2))
df.write.format("hudi").
  options(getQuickstartWriteConfigs).
  option(OPERATION_OPT_KEY, "delete"). // delete instead of the default upsert
  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
  option(TABLE_NAME, tableName).
  mode(Append).
  save(basePath)

// run the same read query as above.
val roAfterDeleteViewDF = spark.
  read.
  format("hudi").
  load(basePath + "/*/*/*/*")
// createOrReplaceTempView replaces registerTempTable, which has been
// deprecated since Spark 2.0, and matches the other snippets in this doc.
roAfterDeleteViewDF.createOrReplaceTempView("hudi_trips_snapshot")

// fetch should return (total - 2) records
spark.sql("select uuid, partitionPath from hudi_trips_snapshot").count()
引用問題
Could not initialize class org.apache.parquet.hadoop.metadata.CompressionCodecName
Caused by: org.apache.hudi.exception.HoodieException: org.apache.hudi.exception.HoodieException: java.util.concurrent.ExecutionException: java.lang.NoClassDefFoundError: Could not initialize class org.apache.parquet.hadoop.metadata.CompressionCodecName
at org.apache.hudi.execution.CopyOnWriteLazyInsertIterable.computeNext(CopyOnWriteLazyInsertIterable.ja6)
at org.apache.hudi.execution.CopyOnWriteLazyInsertIterable.computeNext(CopyOnWriteLazyInsertIterable.ja)
at org.apache.hudi.client.utils.LazyIterableIterator.next(LazyIterableIterator.java:119)
... 23 more
Caused by: org.apache.hudi.exception.HoodieException: java.util.concurrent.ExecutionException: java.lang.NoClassDefFoundError: Could not initialize class org.apache.parquet.hadoop.metadata.CompressionCodecName
at org.apache.hudi.common.util.queue.BoundedInMemoryExecutor.execute(BoundedInMemoryExecutor.java:143)
at org.apache.hudi.execution.CopyOnWriteLazyInsertIterable.computeNext(CopyOnWriteLazyInsertIterable.ja2)
... 25 more
Caused by: java.util.concurrent.ExecutionException: java.lang.NoClassDefFoundError: Could not initialize class org.apache.parquet.hadoop.metadata.CompressionCodecName
at java.util.concurrent.FutureTask.report(FutureTask.java:122)
at java.util.concurrent.FutureTask.get(FutureTask.java:192)
at org.apache.hudi.common.util.queue.BoundedInMemoryExecutor.execute(BoundedInMemoryExecutor.java:141)
... 26 more
Caused by: java.lang.NoClassDefFoundError: Could not initialize class org.apache.parquet.hadoop.metadata.CompressionCodecName
at org.apache.hudi.config.HoodieWriteConfig.getParquetCompressionCodec(HoodieWriteConfig.java:466)
at org.apache.hudi.io.storage.HoodieStorageWriterFactory.newParquetStorageWriter(HoodieStorageWriterFactory.java:62)
at org.apache.hudi.io.storage.HoodieStorageWriterFactory.getStorageWriter(HoodieStorageWriterFactory.ja)
at org.apache.hudi.io.HoodieCreateHandle.<init>(HoodieCreateHandle.java:72)
at org.apache.hudi.execution.CopyOnWriteLazyInsertIterable$CopyOnWriteInsertHandler.consumeOneRecord(CopyOnWriteLazyInsertIterable.java:140)
at org.apache.hudi.execution.CopyOnWriteLazyInsertIterable$CopyOnWriteInsertHandler.consumeOneRecord(CopyOnWriteLazyInsertIterable.java:128)
at org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer.consume(BoundedInMemoryQueueConsumer.37)
at org.apache.hudi.common.util.queue.BoundedInMemoryExecutor.lambda$null$2(BoundedInMemoryExecutor.java
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
... 3 more
20/04/27 10:14:55 WARN TaskSetManager: Lost task 1.0 in stage 21.0 (TID 26, localhost, executor driver): java.lang.RuntimeException: org.apache.hudi.exception.HoodieException: org.apache.hudi.exception.HoodieException: java.util.concurrent.ExecutionException: java.lang.NoSuchFieldError: BROTLI
需引用parquet-format-structures
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-format-structures</artifactId>
<version>1.11.0</version>
</dependency>
https://hudi.apache.org/docs/quick-start-guide.html