使用 Spark 從 HBase 中讀取數據

原創

wuwang1988

2020-07-05 23:22

使用 Spark 從 HBase 中讀取數據
// Spark configuration for this job. String literals must use plain ASCII double quotes;
// typographic quotes are not valid Scala string delimiters.
val sparkConf = new SparkConf().setAppName("xxxx").setMaster("local")
// Look up the task definition through the DAO; the task id is read from the Spark configuration
val sparkTask = SparkTaskDao.findTaskById(sparkConf.get(GlobalConstants.RUN_TASK_ID).toLong)

// Fail fast when no task matches the configured id
if (sparkTask == null) {
  throw new SparkException("與 taskID 對應的 任務不存在")
}

// The task exists: parse its task_param JSON string into a JSON object
// (per the original author's note, JSON here is Alibaba's JSON library, which converts
// objects to JSON strings and JSON strings back to objects)
val taskParam = JSON.parseObject(sparkTask.task_param)

// Build the HBase configuration: each ZooKeeper setting is copied from the project's
// ConfigurationManager under the same key (quorum first, then the second property)
val hbaseConf = HBaseConfiguration.create()
Seq(GlobalConstants.HBASE_ZOOKEEPER_QUORUM, GlobalConstants.HBASE_ZOOKEEPER_PROPERTY).foreach { key =>
  hbaseConf.set(key, ConfigurationManager.getProperty(key))
}
// Table to read from
hbaseConf.set(TableInputFormat.INPUT_TABLE, EventLogConstants.HBASE_EVENT_LOG_TABLE)

// Build the scanner from the task parameters; without a scan there is nothing to read
val scan = initScan(taskParam)
if (scan == null) {
  return
}

// Serialize the scan to protobuf bytes, Base64-encode them, and store the resulting
// string under TableInputFormat.SCAN
hbaseConf.set(TableInputFormat.SCAN, Base64.encodeBytes(ProtobufUtil.toScan(scan).toByteArray))

// Initialize the SparkContext
val sc = new SparkContext(sparkConf)

// Load the event-log rows from HBase (configuration, input format, key class, value class).
// Each element is a (row key, Result) pair; only the Result value is needed below.
val eventLogRDD = sc.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

// Column-family bytes are the same for every cell, so encode them once up front.
// Bytes.toBytes always encodes as UTF-8, unlike String.getBytes() which depends on the
// platform default charset.
val familyBytes = Bytes.toBytes(EventLogConstants.HBASE_EVENT_LOG_TABLE_FAMILY)

// Parse every row into a tuple keyed by session id
val sessionidActionRDD = eventLogRDD.map { case (_, result) =>
  // Reads one column of the event-log family as a string (null when the cell is absent)
  def column(qualifier: String): String =
    Bytes.toString(result.getValue(familyBytes, Bytes.toBytes(qualifier)))

  val uuid = column(EventLogConstants.LOG_COLMUN_NAME_UUID)
  // session id
  val sid = column(EventLogConstants.LOG_COLMUN_NAME_SID)
  // server time
  val serverTime = column(EventLogConstants.LOG_COLUMN_NAME_SERVER_TIME)
  // event name
  val eventName = column(EventLogConstants.LOG_COLMUN_NAME_EVENT_NAME)
  val province = column(EventLogConstants.LOG_COLUMN_NAME_PROVINCE)
  val city = column(EventLogConstants.LOG_COLUMN_NAME_CITY)
  // id of the clicked goods
  val goodsId = column(EventLogConstants.LOG_COLUMN_NAME_GOODS)

  // The country and URL columns were read by the original code but never used, so they
  // are no longer fetched.
  (sid, (uuid, sid, eventName, serverTime, province, city, goodsId))
}

// Run the actions inside try/finally so the SparkContext is stopped even when the
// aggregation or the print action throws.
try {
  /**
    * sessionidActionRDD is referenced more than once, which could mean reading the data
    * from HBase repeatedly. That read is expensive, so persist the parsed rows so that
    * later uses can be served from the cache instead.
    */
  sessionidActionRDD.cache()

  // Key step: aggregate the actions of each session. Per the original author's note the
  // result looks like (sessionId, "sid=...|uuid=...|...") -- confirm against aggregateBySessionId.
  val sessionFullAggrInfoRDD = aggregateBySessionId(sessionidActionRDD)
  // println runs wherever the tasks execute; with a local master that is this JVM's stdout
  sessionFullAggrInfoRDD.foreach(println(_))
} finally {
  sc.stop()
}

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

使用spark將從hbase中讀取數據

win11關閉自動檢測病毒刪文件

千兆寬帶實際網速能到達多少？

hive 中複雜 sql 的使用

使用spark將從hbase中讀取數據

spark中的序列化器

mr的shuffle和spark的shuffle之間的區別

spark優化

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結