處理維度數據合併的策略
維度數據和狀態數據非常像,但也有不同之處:
共同點:
- 長期保存維護
- 可修改
- 使用k-v方式查詢
不同點:
- 數據變更的時機不同
狀態數據往往因爲事實數據的新增變化而變更
維度數據只會受到業務數據庫中的變化而變更
根據共同點,維度數據也是非常適合使用hbase存儲的,稍有不同的是維度數據必須啓動單獨的實時計算來監控維度表變化來更新實時數據。
實時處理流程
HBase建表
create table gmall_province_info (id varchar primary key, info.name varchar, info.region_id varchar, info.area_code varchar) SALT_BUCKETS = 3
增加ProvinceInfo
在scala\com\atguigu\gmall\realtime\bean\ProvinceInfo.scala
/** Immutable dimension bean holding one row of the base_province table. */
case class ProvinceInfo(
  id: String,
  name: String,
  region_id: String,
  area_code: String
)
增加ProvinceInfoApp
import com.alibaba.fastjson.JSON
import com.atguigu.gmall.realtime.bean.ProvinceInfo
import com.atguigu.gmall.realtime.utils.{MyKafkaUtil, OffsetManagerUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.phoenix.spark._
/**
 * Streaming job that keeps the province dimension table in HBase (via Phoenix)
 * in sync with change events published to Kafka by Maxwell.
 *
 * Flow: read ODS_T_BASE_PROVINCE -> parse JSON into ProvinceInfo ->
 * saveToPhoenix -> commit Kafka offsets.
 */
object ProvinceInfoApp {
  def main(args: Array[String]): Unit = {
    // Local streaming context with a 5-second micro-batch interval.
    val sparkConf: SparkConf = new SparkConf().setAppName("province_info_app").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val groupId = "gmall_province_group"
    val topic = "ODS_T_BASE_PROVINCE"

    // Resume from externally saved offsets when available; otherwise fall back
    // to the consumer-group default position. Bound as a val instead of a
    // null-initialized var that gets reassigned.
    val offsets: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId, topic)
    val inputDstream: InputDStream[ConsumerRecord[String, String]] =
      if (offsets != null && offsets.nonEmpty) {
        MyKafkaUtil.getKafkaStream(topic, ssc, offsets, groupId)
      } else {
        MyKafkaUtil.getKafkaStream(topic, ssc, groupId)
      }

    // Capture each batch's offset ranges on the driver so they can be
    // committed only after the batch has been written out successfully.
    var offsetRanges: Array[OffsetRange] = Array.empty[OffsetRange]
    val inputGetOffsetDstream: DStream[ConsumerRecord[String, String]] = inputDstream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }

    // Deserialize each Kafka record's JSON value into a ProvinceInfo bean.
    val provinceInfoDstream: DStream[ProvinceInfo] = inputGetOffsetDstream.map { record =>
      JSON.parseObject(record.value(), classOf[ProvinceInfo])
    }

    // The stream is consumed twice (print + save), so cache it once.
    provinceInfoDstream.cache()
    provinceInfoDstream.print(1000)

    provinceInfoDstream.foreachRDD { rdd =>
      // Write the dimension rows into the Phoenix-managed HBase table; the
      // offsets are saved only after the write, so a failed batch is replayed.
      rdd.saveToPhoenix("gmall_province_info", Seq("ID", "NAME", "REGION_ID", "AREA_CODE"),
        new Configuration, Some("hadoop102,hadoop103,hadoop104:2181"))
      OffsetManagerUtil.saveOffset(groupId, topic, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
修改BaseDBMaxwellApp
加入判斷是否爲空
// Only forward records that actually carry row data: Maxwell bootstrap emits
// bootstrap-start / bootstrap-complete marker events whose "data" field is
// null or empty, and those must not be written to Kafka.
if(dataObj != null && !dataObj.isEmpty){
MyKafkaSinkUtil.send(topic, id, dataObj.toJSONString)
}
利用maxwell-bootstrap 初始化數據
其中client_id 是指另一個已啓動的maxwell監控進程的client_id
bin/maxwell-bootstrap --user maxwell --password 123123 --host hadoop102 --database spark_gmall --table base_province --client_id maxwell_1
用kafka監控ODS_DB_GMALL_M: 能夠將base_province這張維表全量導入