利用 Spark Streaming 實時分析數據時報了一些錯誤,打印的日誌如下:
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:403)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:393)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:926)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:925)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.foreach(RDD.scala:925)
at org.apache.spark.api.java.JavaRDDLike$class.foreach(JavaRDDLike.scala:351)
at org.apache.spark.api.java.AbstractJavaRDDLike.foreach(JavaRDDLike.scala:45)
at com.myspark.sparkanalysis.web.WebSocketServer.lambda$1(WebSocketServer.java:54)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
導致以上報錯的原因是:JavaSparkContext、JavaStreamingContext 不能被序列化。我的關鍵 Spark Streaming 類代碼如下:
package com.myspark.sparkanalysis.service;
import java.io.Serializable;
import java.util.List;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Service;
/**
 * Spring component that starts and stops a Spark Streaming job which watches a
 * directory for new text files and forwards one field of every line to a
 * {@code StreamingConsumer}.
 *
 * <p>The class is {@link Serializable}, while the Spark context objects it holds
 * are not. Both context fields are therefore declared {@code transient} so they
 * are skipped if an instance is ever serialized.
 */
@Component
public class StreamingConfig implements Serializable, Runnable {

    private static final long serialVersionUID = 1L;

    // Key point: must be transient so that it is excluded from serialization.
    private transient JavaSparkContext javaSparkContext;

    // Key point: must be transient so that it is excluded from serialization.
    // volatile because startStreamTask() blocks its calling thread, so
    // destroyStreamTask() is necessarily invoked from a different thread.
    private transient volatile JavaStreamingContext streamingContext;

    /**
     * Creates the component around an existing Spark context.
     *
     * @param javaSparkContext the Spark context used to build the streaming context
     */
    public StreamingConfig(@Autowired JavaSparkContext javaSparkContext) {
        this.javaSparkContext = javaSparkContext;
    }

    /**
     * Starts the streaming task and blocks until it terminates.
     *
     * <p>A new streaming context with a 20-second batch interval is created. For
     * every batch, the third comma-separated field of each line is collected and
     * passed to {@code server.sendMessageToCient}. The streaming context is always
     * closed before this method returns, including when the waiting thread is
     * interrupted.
     *
     * @param server            consumer that receives each extracted value
     * @param listenerDirectory directory to monitor for new text files
     */
    public void startStreamTask(StreamingConsumer server, String listenerDirectory) {
        JavaStreamingContext context =
                new JavaStreamingContext(javaSparkContext, Durations.seconds(20));
        streamingContext = context;

        JavaDStream<String> lines = context.textFileStream(listenerDirectory);
        // NOTE(review): a line with fewer than three comma-separated fields makes
        // split(",")[2] throw ArrayIndexOutOfBoundsException — confirm the input
        // format guarantees at least three fields.
        lines.map(line -> line.split(",")[2])
                .foreachRDD(rdd -> {
                    // Collect the batch first and send from the collected list, so
                    // `server` is only used inside this foreachRDD function and is
                    // not captured by a per-record RDD closure.
                    List<String> collect = rdd.collect();
                    for (String d : collect) {
                        server.sendMessageToCient(d);
                    }
                });

        context.start();
        try {
            context.awaitTermination();
        } catch (InterruptedException e) {
            // Preserve the interrupt so callers further up the stack can react to it.
            Thread.currentThread().interrupt();
        } finally {
            // Release the streaming context on every exit path, not only on success.
            context.close();
        }
    }

    /**
     * Manually stops the streaming task, if one has been started.
     */
    public void destroyStreamTask() {
        // Read the volatile field once so the null check and the call see the same object.
        JavaStreamingContext context = streamingContext;
        if (context != null) {
            // NOTE(review): the no-arg stop() may also stop the underlying
            // SparkContext depending on Spark configuration — confirm this is
            // intended for the injected context.
            context.stop();
        }
    }

    /**
     * Currently a no-op.
     *
     * <p>NOTE(review): presumably meant to call
     * {@link #startStreamTask(StreamingConsumer, String)}, but the required
     * arguments are not available here — confirm the intended wiring.
     */
    @Override
    public void run() {
        // Intentionally empty.
    }
}
最後,按上述方式為這兩個字段加上 transient 關鍵字後,項目即可正確運行。