Spark 流式計算的檢查點恢復

pom依賴

<!-- Spark core. By Spark's artifact naming convention the "_2.11" suffix is the Scala binary version. -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>1.6.1</version>
</dependency>
<!-- Spark Streaming module; uses the same Scala suffix and version as spark-core above. -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>1.6.1</version>
</dependency>

 

代碼

 

import com.google.common.base.Optional;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function0;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

/**
 * Stateful word count over a socket text stream, restartable from a Spark Streaming checkpoint.
 *
 * <p>{@link #main(String[])} asks {@code JavaStreamingContext.getOrCreate} for a context bound to
 * the checkpoint directory and passes {@link #createContext(String, String)} as the factory that
 * builds the pipeline. Per the Spark documentation, the factory is only invoked when no usable
 * checkpoint data exists; otherwise the context is rebuilt from the checkpoint, so changes to the
 * settings below do not take effect until the checkpoint directory is cleared.
 */
public class MyStreamIng{
    /** Application name passed to {@code SparkConf}. */
    private static final String APP_NAME = "streaming";
    /** Spark master URL passed to {@code SparkConf}. */
    private static final String MASTER = "local[2]";
    /** Host of the socket text source. */
    private static final String HOST = "mini1";
    /** Port of the socket text source. */
    private static final int PORT = 8888;
    /** Checkpoint directory used when none is given on the command line. */
    private static final String DEFAULT_CHECKPOINT_DIR = "C:\\workspace\\sparkTest\\src\\test\\java\\data\\";

    /**
     * Starts the streaming job and blocks until it terminates.
     *
     * @param args optional; when present, {@code args[0]} overrides the default checkpoint
     *             directory. With no arguments (or {@code null}) the default path is used.
     */
    public static void main(String[] args) {
        String checkpointDir = (args != null && args.length > 0) ? args[0] : DEFAULT_CHECKPOINT_DIR;
        JavaStreamingContext ssc =
                JavaStreamingContext.getOrCreate(checkpointDir, createContext(APP_NAME, checkpointDir));
        // Start the job and wait for it to finish.
        ssc.start();
        ssc.awaitTermination();
    }

    /**
     * Builds the factory that constructs a fresh streaming context and its word-count pipeline.
     *
     * @param appName       application name to set on the {@code SparkConf}
     * @param checkpointDir directory the new context checkpoints into
     * @return a factory whose {@code call()} creates the context, wires the pipeline and returns
     *         the (not yet started) context
     */
    public static Function0<JavaStreamingContext> createContext(final String appName, final String checkpointDir) {
        return new Function0<JavaStreamingContext>() {
            @Override
            public JavaStreamingContext call() throws Exception {
                SparkConf sparkConf = new SparkConf().setMaster(MASTER).setAppName(appName);
                // 5-second batch interval.
                JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5));
                ssc.checkpoint(checkpointDir);

                // Read text lines from the socket source.
                JavaReceiverInputDStream<String> lines = ssc.socketTextStream(HOST, PORT);

                // Split each line into words on single spaces. Note that String.split(" ") keeps
                // interior empty tokens, so consecutive spaces produce "" entries that are
                // counted like any other word.
                JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public Iterable<String> call(String s) throws Exception {
                        return Arrays.asList(s.split(" "));
                    }
                });

                // Map every word to a (word, 1) pair.
                JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) throws Exception {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                }).cache();

                // Keep a running total per word across batches; the second argument (10) is the
                // number of partitions passed to updateStateByKey.
                JavaPairDStream<String, Integer> dStream = pairs.updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
                    /**
                     * @param integers values seen for the key in the current batch
                     * @param optional previously stored total for the key, absent for a new key
                     * @return the new total: previous total (or 0) plus the current batch values
                     */
                    @Override
                    public Optional<Integer> call(List<Integer> integers, Optional<Integer> optional) throws Exception {
                        // Primitive accumulator avoids re-boxing on every addition.
                        int total = optional.isPresent() ? optional.get() : 0;
                        for (Integer value : integers) {
                            total += value;
                        }
                        return Optional.of(total);
                    }
                }, 10);

                // Print the running totals for each batch.
                dStream.print();
                return ssc;
            }
        };
    }
}

 

使用有狀態的計算(例如用 updateStateByKey 計算累計量)時,Spark 會將狀態數據定期保存到 checkpoint 中,以截斷不斷增長的 RDD 依賴鏈;因此進行累計計算時必須先設置檢查點目錄。

 

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章