在之前的文章中,我介紹了Kafka與SparkStreaming交互的兩種方式,並提到公司採用的是Direct方式。這次我向大家分享一下如何將偏移量存儲在HBase中。
代碼如下:
package kafka1
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.ZkUtils
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Demo of a Spark Streaming job that consumes Kafka through the direct API and
 * keeps its consumed offsets in an HBase table.
 *
 * Row layout used by this object: the row key is
 * `<topic>:<group>:<batch time in ms>`, and column family `offsets` holds one
 * column per partition (qualifier = partition id as a string, value =
 * `untilOffset` as a string).
 */
object KafkaHbaseManager {

  /** Column family under which the per-partition offsets are stored. */
  private val OffsetsFamily: Array[Byte] = Bytes.toBytes("offsets")

  /** Common row-key prefix for all offset rows of one topic/group pair. */
  private def offsetRowKeyPrefix(topic: String, group: String): String =
    topic + ":" + group + ":"

  /**
   * Opens an HBase connection and the given table, runs `body`, and closes
   * both the table and the connection even when `body` throws.
   */
  private def withHBaseTable[T](tableName: String)(
      body: org.apache.hadoop.hbase.client.Table => T): T = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    try {
      val table = conn.getTable(TableName.valueOf(tableName))
      try body(table) finally table.close()
    } finally {
      conn.close()
    }
  }

  /**
   * Saves the end offsets of one batch to HBase.
   *
   * @param TOPIC_NAME     topic name, used as the first row-key component
   * @param GROUP_ID       consumer group id, used as the second row-key component
   * @param offsetRanges   offset ranges of the batch; `untilOffset` of each range is stored
   * @param hbaseTableName HBase table that holds the offsets
   * @param batchTime      batch time; its millisecond value is the last row-key component
   */
  def saveOffsets(TOPIC_NAME: String, GROUP_ID: String, offsetRanges: Array[OffsetRange],
      hbaseTableName: String, batchTime: org.apache.spark.streaming.Time): Unit = {
    // A Put without any column cannot be written, so skip the round trip entirely.
    if (offsetRanges.nonEmpty) {
      withHBaseTable(hbaseTableName) { table =>
        val rowKey = offsetRowKeyPrefix(TOPIC_NAME, GROUP_ID) + String.valueOf(batchTime.milliseconds)
        // Bytes.toBytes always encodes as UTF-8, matching how the columns are encoded.
        val put = new Put(Bytes.toBytes(rowKey))
        offsetRanges.foreach { range =>
          put.addColumn(OffsetsFamily, Bytes.toBytes(range.partition.toString),
            Bytes.toBytes(range.untilOffset.toString))
        }
        table.put(put)
      }
    }
  }

  /**
   * Reads the number of partitions of a topic from ZooKeeper.
   *
   * @param TOPIC_NAME  topic to look up
   * @param GROUP_ID    not used by this method; kept for interface compatibility
   * @param zkQuorum    ZooKeeper connect string (host:port list)
   * @param zkRootDir   chroot path of the Kafka cluster inside ZooKeeper, without leading slash
   * @param sessTimeout ZooKeeper session timeout passed to the client
   * @param connTimeOut ZooKeeper connection timeout passed to the client
   * @return the partition count, or 0 when ZooKeeper reports no entry for the topic
   */
  def getNumberOfPartitionsForTopicFromZK(TOPIC_NAME: String, GROUP_ID: String,
      zkQuorum: String, zkRootDir: String, sessTimeout: Int, connTimeOut: Int): Int = {
    val zkUrl = zkQuorum + "/" + zkRootDir
    val (zkClient, zkConnection) = ZkUtils.createZkClientAndConnection(zkUrl, sessTimeout, connTimeOut)
    try {
      // Third argument is the "secure" flag; the demo talks to an unsecured cluster.
      val zkUtils = new ZkUtils(zkClient, zkConnection, false)
      val partitionCount = zkUtils.getPartitionsForTopics(Seq(TOPIC_NAME))
        .get(TOPIC_NAME)
        .map(_.size)
        .getOrElse(0)
      println(partitionCount)
      partitionCount
    } finally {
      // Release the ZooKeeper resources even if the lookup failed.
      zkClient.close()
      zkConnection.close()
    }
  }

  /**
   * Builds the starting offsets for the direct stream from the most recent
   * offset row stored in HBase.
   *
   *  - No row stored yet: every partition known to ZooKeeper starts at 0.
   *  - ZooKeeper reports more partitions than the stored row has columns
   *    (partitions were added): stored partitions resume from their saved
   *    offset, the new ones start at 0.
   *  - Otherwise: every stored partition resumes from its saved offset.
   *
   * @param TOPIC_NAME  topic name
   * @param GROUP_ID    consumer group id
   * @param hTableName  HBase table that holds the offsets
   * @param zkQuorum    ZooKeeper connect string
   * @param zkRootDir   chroot path of the Kafka cluster inside ZooKeeper
   * @param sessTimeout ZooKeeper session timeout
   * @param connTimeOut ZooKeeper connection timeout
   * @return starting offset per topic partition
   */
  def getLastestOffsets(TOPIC_NAME: String, GROUP_ID: String, hTableName: String,
      zkQuorum: String, zkRootDir: String, sessTimeout: Int, connTimeOut: Int): Map[TopicAndPartition, Long] = {
    // Ask ZooKeeper how many partitions the topic currently has.
    val zkPartitionCount =
      getNumberOfPartitionsForTopicFromZK(TOPIC_NAME, GROUP_ID, zkQuorum, zkRootDir, sessTimeout, connTimeOut)

    withHBaseTable(hTableName) { table =>
      // Reverse scan from "now" down to timestamp 0: the first row returned is
      // the most recently committed batch for this topic/group.
      val prefix = offsetRowKeyPrefix(TOPIC_NAME, GROUP_ID)
      val scan = new Scan()
        .setStartRow(Bytes.toBytes(prefix + String.valueOf(System.currentTimeMillis())))
        .setStopRow(Bytes.toBytes(prefix + 0))
        .setReversed(true)
      val scanner = table.getScanner(scan)
      try {
        val lastRow = Option(scanner.next())
        // One column per partition, so the cell count is the stored partition count.
        val hbasePartitionCount =
          lastRow.flatMap(row => Option(row.listCells())).map(_.size()).getOrElse(0)

        // Saved offset of a partition, or None when the row has no column for it.
        def savedOffset(partition: Int): Option[Long] =
          lastRow
            .flatMap(row => Option(row.getValue(OffsetsFamily, Bytes.toBytes(partition.toString))))
            .map(bytes => Bytes.toString(bytes).toLong)

        // Partitions without a stored offset (first run, or newly added
        // partitions) start from 0 instead of being parsed from a missing cell.
        val partitionCount = math.max(zkPartitionCount, hbasePartitionCount)
        (0 until partitionCount).map { partition =>
          TopicAndPartition(TOPIC_NAME, partition) -> savedOffset(partition).getOrElse(0L)
        }.toMap
      } finally {
        scanner.close()
      }
    }
  }

  /**
   * Entry point: starts a local streaming job that resumes from the offsets
   * stored in HBase and writes the new offsets back after each non-empty batch.
   */
  def main(args: Array[String]): Unit = {
    val processingInterval = 2
    val brokers = "192.168.85.200:9092,192.168.85.201:9092,192.168.85.202:9092"
    val topics = "test01"
    val groupId = "gp01"
    val hbaseTableName = "spark_kafka_offsets"
    val zkQuorum = "192.168.85.200:2181,192.168.85.201:2181,192.168.85.202:2181"
    val zkRootDir = "kafka0.9"

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("kafkahbase").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))

    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers,
      "auto.offset.reset" -> "smallest")
    val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())

    // Resume from the offsets stored for the same topic that saveOffsets writes.
    val fromOffsets = getLastestOffsets(topics, groupId, hbaseTableName, zkQuorum, zkRootDir, 30000, 30000)

    // Create the direct Kafka stream starting at the restored offsets.
    val kafkaStream: InputDStream[(String, String)] =
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
        ssc, kafkaParams, fromOffsets, messageHandler)

    kafkaStream.foreachRDD((rdd, btime) => {
      if (!rdd.isEmpty()) {
        println("==========================:" + rdd.count())
        println("==========================btime:" + btime)
        // The offset ranges are only available on the RDD produced directly by the stream.
        saveOffsets(topics, groupId, rdd.asInstanceOf[HasOffsetRanges].offsetRanges, hbaseTableName, btime)
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
將偏移量存放在HBase中的代碼有點麻煩,接下來我會在博客中向大家介紹兩種比較簡單的方式。
summed up by JiaMingcan
轉載請署名:JiaMingcan