最近使用spark streaming處理kafka的數據,業務數據量比較大,就使用了KafkaUtils的createDirectStream()方式。此方法直接從kafka broker的分區中讀取數據,跳過了zookeeper,並且沒有receiver,是spark的task直接對接kafka topic partition,能保證消息恰好一次語義。但是此種方式因爲沒有經過zk,topic的offset也就沒有保存,當job重啓後只能從最新的offset開始消費消息,造成重啓過程中的消息丟失。
解決方案
一般,有兩種方式可以讓spark streaming保存offset:spark checkpoint機制,和在程序中自己實現保存offset的邏輯,下面分別介紹。
checkpoint機制
spark streaming job 可以通過checkpoint的方式保存job執行斷點,斷點中有spark streaming context中的全部信息(包括kafka每個topic partition的offset)。checkpoint有兩種方式,一種是同時checkpoint數據和metadata,另一種只checkpoint metadata。一般情況只保存metadata即可,因此這裏只介紹checkpoint metadata。
流程圖
代碼實現
package com.nsfocus.bsa.example;
import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.api.java.JavaStreamingContextFactory;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
/**
* Checkpoint example
*
* @author Shuai YUAN
* @date 2015/10/27
*/public class CheckpointTest {
private static String CHECKPOINT_DIR = "/checkpoint";
public static void main(String[] args) {
// get javaStreamingContext from checkpoint dir or create from sparkconf
JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(CHECKPOINT_DIR, new JavaStreamingContextFactory() {
public JavaStreamingContext create() {
return createContext();
}
});
jssc.start();
jssc.awaitTermination();
}
public static JavaStreamingContext createContext() {
SparkConf sparkConf = new SparkConf().setAppName("tachyon-test-consumer");
Set<String> topicSet = new HashSet<String>();
topicSet.add("test_topic");
HashMap<String, String> kafkaParam = new HashMap<String, String>();
kafkaParam.put("metadata.broker.list", "test1:9092,test2:9092");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));
// do checkpoint metadata to hdfs
jssc.checkpoint(CHECKPOINT_DIR);
JavaPairInputDStream<String, String> message =
KafkaUtils.createDirectStream(
jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParam,
topicSet
);
JavaDStream<String> valueDStream = message.map(new Function<Tuple2<String, String>, String>() {
public String call(Tuple2<String, String> v1) throws Exception {
return v1._2();
}
});
valueDStream.count().print();
return jssc;
}
開發者可以自己開發保存offset到zk的實現邏輯。spark streaming 的rdd可以被轉換爲HasOffsetRanges類型,進而得到所有partition的offset。
實現流程
源碼實現
scala的實現網上很容易搜到,這裏貼個java實現的代碼。
package com.xueba207.test;
import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import scala.Predef;
import scala.Tuple2;
import scala.collection.JavaConversions;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
/**
* KafkaOffsetExample
*
* @author Shuai YUAN
* @date 2015/10/28
*/public class KafkaOffsetExample {
private static KafkaCluster kafkaCluster = null;
private static HashMap<String, String> kafkaParam = new HashMap<String, String>();
private static Broadcast<HashMap<String, String>> kafkaParamBroadcast = null;
private static scala.collection.immutable.Set<String> immutableTopics = null;
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setAppName("tachyon-test-consumer");
Set<String> topicSet = new HashSet<String>();
topicSet.add("test_topic");
kafkaParam.put("metadata.broker.list", "test:9092");
kafkaParam.put("group.id", "com.xueba207.test");
// transform java Map to scala immutable.map
scala.collection.mutable.Map<String, String> testMap = JavaConversions.mapAsScalaMap(kafkaParam);
scala.collection.immutable.Map<String, String> scalaKafkaParam =
testMap.toMap(new Predef.$less$colon$less<Tuple2<String, String>, Tuple2<String, String>>() {
public Tuple2<String, String> apply(Tuple2<String, String> v1) {
return v1;
}
});
// init KafkaCluster
kafkaCluster = new KafkaCluster(scalaKafkaParam);
scala.collection.mutable.Set<String> mutableTopics = JavaConversions.asScalaSet(topicSet);
immutableTopics = mutableTopics.toSet();
scala.collection.immutable.Set<TopicAndPartition> topicAndPartitionSet2 = kafkaCluster.getPartitions(immutableTopics).right().get();
// kafka direct stream 初始化時使用的offset數據
Map<TopicAndPartition, Long> consumerOffsetsLong = new HashMap<TopicAndPartition, Long>();
// 沒有保存offset時(該group首次消費時), 各個partition offset 默認爲0
if (kafkaCluster.getConsumerOffsets(kafkaParam.get("group.id"), topicAndPartitionSet2).isLeft()) {
System.out.println(kafkaCluster.getConsumerOffsets(kafkaParam.get("group.id"), topicAndPartitionSet2).left().get());
Set<TopicAndPartition> topicAndPartitionSet1 = JavaConversions.setAsJavaSet(topicAndPartitionSet2);
for (TopicAndPartition topicAndPartition : topicAndPartitionSet1) {
consumerOffsetsLong.put(topicAndPartition, 0L);
}
}
// offset已存在, 使用保存的offset
else {
scala.collection.immutable.Map<TopicAndPartition, Object> consumerOffsetsTemp = kafkaCluster.getConsumerOffsets("com.nsfocus.bsa.ys.test", topicAndPartitionSet2).right().get();
Map<TopicAndPartition, Object> consumerOffsets = JavaConversions.mapAsJavaMap(consumerOffsetsTemp);
Set<TopicAndPartition> topicAndPartitionSet1 = JavaConversions.setAsJavaSet(topicAndPartitionSet2);
for (TopicAndPartition topicAndPartition : topicAndPartitionSet1) {
Long offset = (Long)consumerOffsets.get(topicAndPartition);
consumerOffsetsLong.put(topicAndPartition, offset);
}
}
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(5000));
kafkaParamBroadcast = jssc.sparkContext().broadcast(kafkaParam);
// create direct stream
JavaInputDStream<String> message = KafkaUtils.createDirectStream(
jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
String.class,
kafkaParam,
consumerOffsetsLong,
new Function<MessageAndMetadata<String, String>, String>() {
public String call(MessageAndMetadata<String, String> v1) throws Exception {
return v1.message();
}
}
);
// 得到rdd各個分區對應的offset, 並保存在offsetRanges中
final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<OffsetRange[]>();
JavaDStream<String> javaDStream = message.transform(new Function<JavaRDD<String>, JavaRDD<String>>() {
public JavaRDD<String> call(JavaRDD<String> rdd) throws Exception {
OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
offsetRanges.set(offsets);
return rdd;
}
});
// output
javaDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {
public Void call(JavaRDD<String> v1) throws Exception {
if (v1.isEmpty()) return null;
//處理rdd數據,這裏保存數據爲hdfs的parquet文件
HiveContext hiveContext = SQLContextSingleton.getHiveContextInstance(v1.context());
DataFrame df = hiveContext.jsonRDD(v1);
df.save("/offset/test", "parquet", SaveMode.Append);
for (OffsetRange o : offsetRanges.get()) {
// 封裝topic.partition 與 offset對應關係 java Map
TopicAndPartition topicAndPartition = new TopicAndPartition(o.topic(), o.partition());
Map<TopicAndPartition, Object> topicAndPartitionObjectMap = new HashMap<TopicAndPartition, Object>();
topicAndPartitionObjectMap.put(topicAndPartition, o.untilOffset());
// 轉換java map to scala immutable.map
scala.collection.mutable.Map<TopicAndPartition, Object> testMap =
JavaConversions.mapAsScalaMap(topicAndPartitionObjectMap);
scala.collection.immutable.Map<TopicAndPartition, Object> scalatopicAndPartitionObjectMap =
testMap.toMap(new Predef.$less$colon$less<Tuple2<TopicAndPartition, Object>, Tuple2<TopicAndPartition, Object>>() {
public Tuple2<TopicAndPartition, Object> apply(Tuple2<TopicAndPartition, Object> v1) {
return v1;
}
});
// 更新offset到kafkaCluster
kafkaCluster.setConsumerOffsets(kafkaParamBroadcast.getValue().get("group.id"), scalatopicAndPartitionObjectMap);
// System.out.println(// o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset()//
);
}
return null;
}
});
jssc.start();
jssc.awaitTermination();
}
}