Kafka直連存儲HBase

在之前介紹了Kafka與SparkStreaming交互的兩種方式，我提到了公司採用的是Direct方式，這次我向大家分享一下將偏移量存儲在HBase中。
代碼如下：

package kafka1
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.ZkUtils
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaHbaseManager {
  // 保存offset到hbase
  def saveOffsets(TOPIC_NAME: String, GROUP_ID: String, offsetRanges: Array[OffsetRange],
                  hbaseTableName: String, batchTime: org.apache.spark.streaming.Time) = {
    val hbaseConf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(hbaseTableName))
    val rowKey = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(batchTime.milliseconds)
    val put = new Put(rowKey.getBytes())
    for (offset <- offsetRanges) {
      put.addColumn(Bytes.toBytes("offsets"), Bytes.toBytes(offset.partition.toString),
        Bytes.toBytes(offset.untilOffset.toString))
    }
    table.put(put)
    conn.close()
  }

  // 從zookeeper中獲取topic的分區數
  def getNumberOfPartitionsForTopicFromZK(TOPIC_NAME: String, GROUP_ID: String,
                                          zkQuorum: String, zkRootDir: String, sessTimeout: Int, connTimeOut: Int): Int = {
    val zkUrl = zkQuorum + "/" + zkRootDir
    val zkClientAndConn = ZkUtils.createZkClientAndConnection(zkUrl, sessTimeout, connTimeOut)
    val zkUtils = new ZkUtils(zkClientAndConn._1, zkClientAndConn._2, false)
    // 獲取分區數量
    val zkPartitions = zkUtils.getPartitionsForTopics(Seq(TOPIC_NAME)).get(TOPIC_NAME).toList.head.size
    println(zkPartitions)
    zkClientAndConn._1.close()
    zkClientAndConn._2.close()
    zkPartitions
  }

  // 獲取hbase的offset
  def getLastestOffsets(TOPIC_NAME: String, GROUP_ID: String, hTableName: String,
                        zkQuorum: String, zkRootDir: String, sessTimeout: Int, connTimeOut: Int): Map[TopicAndPartition, Long] = {

    // 連接zk獲取topic的partition數量
    val zKNumberOfPartitions = getNumberOfPartitionsForTopicFromZK(TOPIC_NAME, GROUP_ID, zkQuorum, zkRootDir, sessTimeout, connTimeOut)

    val hbaseConf = HBaseConfiguration.create()

    // 獲取hbase中最後提交的offset
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(hTableName))
    val startRow = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(System.currentTimeMillis())
    val stopRow = TOPIC_NAME + ":" + GROUP_ID + ":" + 0
    val scan = new Scan()
    val scanner = table.getScanner(scan.setStartRow(startRow.getBytes).setStopRow(stopRow.getBytes).setReversed(true))
    val result = scanner.next()

    var hbaseNumberOfPartitions = 0 // 在hbase中獲取的分區數量
    if (result != null) {
      // 將分區數量設置爲hbase表的列數量
      hbaseNumberOfPartitions = result.listCells().size()
    }

    val fromOffsets = collection.mutable.Map[TopicAndPartition, Long]()
    if (hbaseNumberOfPartitions == 0) { // 如果沒有保存過offset
      // 初始化kafka爲開始
      for (partition <- 0 until zKNumberOfPartitions) {
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), 0))
      }

    } else if (zKNumberOfPartitions > hbaseNumberOfPartitions) { // 如果zk的partition數量大於hbase的partition數量，說明topic增加了分區，就需要對分區做單獨處理
      // 處理新增加的分區添加到kafka的topic
      for (partition <- 0 until zKNumberOfPartitions) {
        val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),
          Bytes.toBytes(partition.toString)))
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), fromOffset.toLong))
      }
      // 對新增加的分區將它的offset值設爲0
      for (partition <- hbaseNumberOfPartitions until zKNumberOfPartitions) {
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), 0))
      }
    } else { // 如果既沒有新增加的分區，也不是第一次運行
      // 獲取上次運行的offset
      for (partition <- 0 until hbaseNumberOfPartitions) {
        val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),
          Bytes.toBytes(partition.toString)))
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), fromOffset.toLong))
      }
    }

    scanner.close()
    conn.close()
    fromOffsets.toMap
  }

  def main(args: Array[String]): Unit = {
    val processingInterval = 2
    val brokers = "192.168.85.200:9092,192.168.85.201:9092,192.168.85.202:9092"
    val topics = "test01"
    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("kafkahbase").setMaster("local[2]")
    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers,
      "auto.offset.reset" -> "smallest")

    val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))
    val groupId = "gp01"
    val hbaseTableName = "spark_kafka_offsets"

    // 獲取kafkaStream
    //val kafkaStream = createMyDirectKafkaStream(ssc, kafkaParams, zkClient, topicsSet, "testp")
    val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
    // 獲取offset
    val fromOffsets = getLastestOffsets("test01", groupId, hbaseTableName, "192.168.85.200:2181,192.168.85.201:2181,192.168.85.202:2181", "kafka0.9", 30000, 30000)

    var kafkaStream: InputDStream[(String, String)] = null
    kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)

    kafkaStream.foreachRDD((rdd, btime) => {
      if (!rdd.isEmpty()) {
        println("==========================:" + rdd.count())
        println("==========================btime:" + btime)
        saveOffsets(topics, groupId, rdd.asInstanceOf[HasOffsetRanges].offsetRanges, hbaseTableName, btime)
      }

    })

    ssc.start()
    ssc.awaitTermination()
  }
}

存放在HBase中代碼有點麻煩，接下來我的博客中會像大家介紹兩種比較簡單的。

									 summed up by JiaMingcan
									 轉載請署名：JiaMingcan

Kafka直連存儲HBase

Wireshark 安裝+使用（一）

Kafka直連方式存儲MySQL

Kafka直連存儲HBase

Kafka直連存儲ZK

Linux安裝CentOS6.5mini版全攻略

leetcode反轉鏈表

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結