Basic Hudi operations

1. First, copy the hudi-spark3.1-bundle_2.12-0.12.2.jar bundle into Spark's jars directory (spark/jars).
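For example, assuming the jar sits in the current directory and SPARK_HOME points at the Spark installation (adjust paths to your environment):

cp hudi-spark3.1-bundle_2.12-0.12.2.jar $SPARK_HOME/jars/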

2. Start spark-sql with the following command, then run the SQL statements below inside the session.

spark-sql \
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
--conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'

create table hudi_mor_pt_tbl (
  id bigint,
  name string,
  ts bigint,
  dt string,
  hh string
) using hudi
tblproperties (
  type = 'mor',
  primaryKey = 'id',
  preCombineField = 'ts'
 )
partitioned by (dt, hh)
location '/apps/hudi/warehouse/app/hudi_mor_pt_tbl';

insert into hudi_mor_pt_tbl select 1, 'a1_1', 20, '1674958478800','10';
insert into hudi_mor_pt_tbl select 2, 'a1_2', 20, '1674958478800','10';
insert into hudi_mor_pt_tbl select 3, 'a1_3', 20, '1674958478800','10';
insert into hudi_mor_pt_tbl select 4, 'a1_4', 20, '1674954878000','9';
insert into hudi_mor_pt_tbl select 5, 'a1_5', 20, '1674954878000','9';
insert into hudi_mor_pt_tbl select 6, 'a1_6', 20, '1674954878000','9';

update hudi_mor_pt_tbl set name = 'a1_22222', ts = 1111 where id = 1;
update hudi_mor_pt_tbl set name = 'a1_44444', ts = 4444 where id = 4;

delete from hudi_mor_pt_tbl where id=3;
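To confirm the upserts and the delete took effect, a snapshot query in the same spark-sql session should return ids 1-6 except 3, with the updated name/ts values for ids 1 and 4 (a quick sanity check, not required for the rest of the steps):

select id, name, ts, dt, hh from hudi_mor_pt_tbl order by id;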

3. Start spark-shell with the following command; the data is constructed programmatically with the Hudi data generator.

spark-shell \
  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
  --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'

import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
import org.apache.hudi.common.model.HoodieRecord

val tableName = "hudi_trips_cow"
val basePath = "/apps/hudi/warehouse/app/hudi_trips_cow"
val dataGen = new DataGenerator

val inserts = convertToStringList(dataGen.generateInserts(10))
val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
df.write.format("hudi").
  options(getQuickstartWriteConfigs).
  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
  option(TABLE_NAME, tableName).
  mode(Overwrite).
  save(basePath)

val tripsSnapshotDF = spark.
  read.
  format("hudi").
  load(basePath)
tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot")

spark.sql("select fare, begin_lon, begin_lat, ts from  hudi_trips_snapshot where fare > 20.0").show()
spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from  hudi_trips_snapshot").show()

val updates = convertToStringList(dataGen.generateUpdates(10))
val df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
df.write.format("hudi").
  options(getQuickstartWriteConfigs).
  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
  option(TABLE_NAME, tableName).
  mode(Append).
  save(basePath)

spark.
  read.
  format("hudi").
  load(basePath).
  createOrReplaceTempView("hudi_trips_snapshot")

val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from  hudi_trips_snapshot order by commitTime").map(k => k.getString(0)).take(50)
val beginTime = commits(commits.length - 2) // commit time we are interested in

// incrementally query data
val tripsIncrementalDF = spark.read.format("hudi").
  option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL).
  option(BEGIN_INSTANTTIME_OPT_KEY, beginTime).
  load(basePath)
tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental")

spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from  hudi_trips_incremental where fare > 20.0").show()

4. In spark-shell (started with the same command as above), construct the data manually and change the schema between the two writes.

import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

val tableName = "hudi_trips_cow"
val basePath = "/apps/hudi/warehouse/app/hudi_trips_cow"
val schema = StructType( Array(
    StructField("rowId", StringType,true),
    StructField("partitionId", StringType,true),
    StructField("preComb", LongType,true),
    StructField("name", StringType,true),
    StructField("versionId", StringType,true),
    StructField("intToLong", IntegerType,true)
    ))    
val data1 = Seq(Row("row_1", "part_0", 0L, "bob", "v_0", 0),
    Row("row_2", "part_0", 0L, "john", "v_0", 0),
    Row("row_3", "part_0", 0L, "tom", "v_0", 0))
var dfFromData1 = spark.createDataFrame(data1, schema)
// first batch: write with the initial schema
dfFromData1.write.format("hudi").
    options(getQuickstartWriteConfigs).
    option(PRECOMBINE_FIELD_OPT_KEY, "preComb").
    option(RECORDKEY_FIELD_OPT_KEY, "rowId").
    option(PARTITIONPATH_FIELD_OPT_KEY, "partitionId").
    option("hoodie.index.type","SIMPLE").
    option(TABLE_NAME, tableName).
    mode(Overwrite).
    save(basePath)

var tripsSnapshotDF1 = spark.read.format("hudi").load(basePath + "/*/*")
tripsSnapshotDF1.createOrReplaceTempView("hudi_trips_snapshot")
spark.sql("desc hudi_trips_snapshot").show()
spark.sql("select rowId, partitionId, preComb, name, versionId, intToLong from hudi_trips_snapshot").show()

val newSchema = StructType( Array(
    StructField("rowId", StringType,true),
    StructField("partitionId", StringType,true),
    StructField("preComb", LongType,true),
    StructField("name", StringType,true),
    StructField("versionId", StringType,true),
    StructField("intToLong", LongType,true),
    StructField("newField", StringType,true)
    ))
val data2 = Seq(Row("row_2", "part_0", 5L, "john", "v_3", 3L, "newField_1"),
    Row("row_5", "part_0", 5L, "maroon", "v_2", 2L, "newField_1"),
    Row("row_9", "part_0", 5L, "michael", "v_2", 2L, "newField_1"))
var dfFromData2 = spark.createDataFrame(data2, newSchema)
// second batch: write with the evolved schema (intToLong widened to long, newField added)
dfFromData2.write.format("hudi").
    options(getQuickstartWriteConfigs).
    option(PRECOMBINE_FIELD_OPT_KEY, "preComb").
    option(RECORDKEY_FIELD_OPT_KEY, "rowId").
    option(PARTITIONPATH_FIELD_OPT_KEY, "partitionId").
    option("hoodie.index.type","SIMPLE").
    option(TABLE_NAME, tableName).
    mode(Append).
    save(basePath)

var tripsSnapshotDF2 = spark.read.format("hudi").load(basePath + "/*/*")
tripsSnapshotDF2.createOrReplaceTempView("hudi_trips_snapshot")
spark.sql("desc hudi_trips_snapshot").show()
spark.sql("select rowId, partitionId, preComb, name, versionId, intToLong, newField from hudi_trips_snapshot").show()

5. hudi-cli

// connect to the table
connect --path /apps/hudi/warehouse/app/hudi_mor_pt_tbl
// show basic table information
desc
// view the table schema (metadata)
fetch table schema
// list the table's compactions
compactions show all
// view the records in a table log file
show logfile records --logFilePathPattern /apps/hudi/warehouse/app/hudi_mor_pt_tbl/dt=1674954878000/hh=9/.e1b43d80-8e03-4225-b404-a47ac70ab141-0_20230129103114699.log.1_0-354-328
// view the metadata/schema of a table log file
show logfile metadata --logFilePathPattern /apps/hudi/warehouse/app/hudi_mor_pt_tbl/dt=1674954878000/hh=9/.e1b43d80-8e03-4225-b404-a47ac70ab141-0_20230129103114699.log.1_0-354-328
// compaction, step 1: schedule it
compaction schedule --hoodieConfigs 'hoodie.compaction.strategy=org.apache.hudi.table.action.compact.strategy.BoundedIOCompactionStrategy,hoodie.compaction.target.io=1,hoodie.compact.inline.max.delta.commits=1'
// compaction, step 2: run it (schema.avsc is the schema returned by fetch table schema, uploaded to HDFS)
compaction run --sparkMaster yarn  --parallelism 100 --sparkMemory 1g --retry 1 --compactionInstant 20230111155816324 --hoodieConfigs 'hoodie.compaction.strategy=org.apache.hudi.table.action.compact.strategy.BoundedIOCompactionStrategy,hoodie.compaction.target.io=1,hoodie.compact.inline.max.delta.commits=1' --propsFilePath /apps/hudi/warehouse/app/hudi_mor_pt_tbl/.hoodie/hoodie.properties --schemaFilePath /tmp/schema.avsc
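The file passed to --schemaFilePath must already exist on HDFS; assuming the schema printed by fetch table schema was saved locally as /tmp/schema.avsc, it can be uploaded like this:

// upload the locally saved avro schema so --schemaFilePath can reference it
hadoop fs -put /tmp/schema.avsc /tmp/schema.avsc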

6. parquet-tools

// define an alias for parquet-tools
alias parquetview='hadoop jar /data/parquet-tools-1.6.0rc3-SNAPSHOT.jar'
// parquet-tools: view the contents of a parquet file
parquetview cat /apps/hudi/warehouse/app/app_eap_event_buy_de_153291/1/2023-01-05/2f4fea7f-1803-4619-beaf-6181cc7ef2ad-0_0-33323-403302_20230109151113302.parquet
// parquet-tools: view the metadata and schema of a parquet file
parquetview meta /apps/hudi/warehouse/app/app_eap_event_buy_de_153291/1/2023-01-05/2f4fea7f-1803-4619-beaf-6181cc7ef2ad-0_0-33323-403302_20230109151113302.parquet  
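If only the column definitions are needed, the schema subcommand prints just the parquet schema, without the row-group details that meta includes (same alias as above):

// parquet-tools: view only the parquet schema
parquetview schema /apps/hudi/warehouse/app/app_eap_event_buy_de_153291/1/2023-01-05/2f4fea7f-1803-4619-beaf-6181cc7ef2ad-0_0-33323-403302_20230109151113302.parquet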

7. Copying a Hudi table

// copy the Hudi table files from HDFS to the local filesystem
hadoop fs -copyToLocal /apps/hudi/warehouse/app/app_eap_event_receive_grow_de_153291/.hoodie /data/dup/app_eap_event_receive_grow_de_153291
// compress the files
tar -czvf xxx.tar.gz  source_file
// extract the files
tar -xzvf xxx.tar.gz
// copy the Hudi table files from the local filesystem back to HDFS
hadoop fs -copyFromLocal /home/hive/app_eap_event_receive_grow_de_153291 /apps/hudi/warehouse/app/
// run the Hive CREATE TABLE statement for the copied table
// add the partition in Hive
alter table app_eap_event_receive_grow_de_153291 add partition (eap_tenant_channel='1', dt='2023-01-12') location '/apps/hudi/warehouse/app/app_eap_event_receive_grow_de_153291/1/2023-01-12';
// query the Hive table
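A minimal query to verify the copied table is readable from Hive (shape is illustrative; adjust columns to the table's actual schema):

select * from app_eap_event_receive_grow_de_153291 where eap_tenant_channel = '1' and dt = '2023-01-12' limit 10;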

 
