Using Spark Streaming to monitor an HDFS directory and write to HBase in real time

For the deployment environment, see https://blog.csdn.net/luoye4321/article/details/99745877
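One precondition: TableOutputFormat does not create the target table, so tb:table3 with column family static must exist before the job runs. In the HBase shell that is create_namespace 'tb' followed by create 'tb:table3', 'static'. Below is a minimal sketch of the same step with the HBase 2.x Admin API (the client version and object name are assumptions; adjust for your cluster):

import org.apache.hadoop.hbase.{HBaseConfiguration, NamespaceDescriptor, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

object CreateHbaseTable {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val admin = conn.getAdmin
    try {
      // Create the "tb" namespace if it does not exist yet.
      if (!admin.listNamespaceDescriptors().exists(_.getName == "tb"))
        admin.createNamespace(NamespaceDescriptor.create("tb").build())
      // Create tb:table3 with the single column family "static" that the job writes to.
      val table = TableName.valueOf("tb:table3")
      if (!admin.tableExists(table))
        admin.createTable(
          TableDescriptorBuilder.newBuilder(table)
            .setColumnFamily(ColumnFamilyDescriptorBuilder.of("static"))
            .build())
    } finally {
      admin.close()
      conn.close()
    }
  }
}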

The code is as follows. Note that textFileStream only picks up files that appear in the monitored directory after the stream starts, so new data should be moved (atomically renamed) into the directory rather than written there in place:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object HbaseOpe {
  def writeStreamToHbase(txtPath: String): Unit = {

    val sparkConf = new SparkConf().setAppName("streamToHbase")
    val ssc = new StreamingContext(sparkConf, Seconds(10)) // 10-second micro-batches

    // Watch the HDFS directory; each file newly added to it becomes part of the next batch.
    val lines = ssc.textFileStream(txtPath)

    lines.foreachRDD { txtRdd =>
      // HBaseConfiguration and JobConf are not serializable, so build them
      // on the driver for each batch rather than capturing them in a closure.
      val tablename = "tb:table3"
      val hbaseConf = HBaseConfiguration.create()
      hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tablename)
      val jobConf = new JobConf(hbaseConf)
      jobConf.setOutputFormat(classOf[TableOutputFormat])

      txtRdd.map(_.split(","))
        .filter(_.length >= 8) // skip malformed lines instead of failing the whole batch
        .map { arr =>
          // Field 0 is the row key; fields 1-7 go into the "static" column family.
          val put = new Put(Bytes.toBytes(arr(0)))
          put.addColumn(Bytes.toBytes("static"), Bytes.toBytes("col1"), Bytes.toBytes(arr(1)))
          put.addColumn(Bytes.toBytes("static"), Bytes.toBytes("col2"), Bytes.toBytes(arr(2)))
          put.addColumn(Bytes.toBytes("static"), Bytes.toBytes("col3"), Bytes.toBytes(arr(3)))
          put.addColumn(Bytes.toBytes("static"), Bytes.toBytes("col4"), Bytes.toBytes(arr(4)))
          put.addColumn(Bytes.toBytes("static"), Bytes.toBytes("col5"), Bytes.toBytes(arr(5)))
          put.addColumn(Bytes.toBytes("static"), Bytes.toBytes("col6"), Bytes.toBytes(arr(6)))
          put.addColumn(Bytes.toBytes("static"), Bytes.toBytes("col7"), Bytes.toBytes(arr(7)))
          (new ImmutableBytesWritable, put)
        }
        .saveAsHadoopDataset(jobConf)
    }

    ssc.start()
    ssc.awaitTermination()

  }
}
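A minimal sketch of an entry point to launch the job (the object name and HDFS path are placeholders, not part of the original):

object StreamToHbaseApp {
  def main(args: Array[String]): Unit = {
    // Placeholder path: the HDFS directory to monitor for newly arriving files.
    HbaseOpe.writeStreamToHbase("hdfs:///data/incoming")
  }
}

Package it with sbt or Maven and launch it through spark-submit, making sure the HBase client jars are on the classpath (for example via --jars).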

 
