/**
 * Spark Streaming word count over a sliding window.
 *
 * Reads lines of text from a TCP socket, splits each line on single spaces and,
 * once per slide interval, prints how many times each word occurred during the
 * most recent window.
 */
object WordCountWindows {

  /**
   * Builds the streaming job, starts it and blocks until the streaming context terminates.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Machine-specific Hadoop installation directory.
    // NOTE(review): presumably required so the Hadoop client libraries work on this
    // Windows development machine - adjust or remove on other hosts.
    System.setProperty("hadoop.home.dir", "E:\\software\\bigdate\\hadoop-2.6.0-cdh5.15.0\\hadoop-2.6.0-cdh5.15.0")

    // Spark configuration.
    val conf = new SparkConf()
    // Run mode: "local" runs locally, "local[2]" starts 2 threads -
    // one to read the data and one to process it.
    conf.setMaster("local[2]")
    // Application name (the name shown when monitoring, e.g. in the YARN resource manager).
    conf.setAppName("WordCountStreaming")
    val sc = new SparkContext(conf)

    // StreamingContext is the driver entry point of Spark Streaming.
    val batch = 5 // batch interval, in seconds
    val streamingContext = new StreamingContext(sc, Seconds(batch))

    // Input source: listen on a network port.
    val sourceDS = streamingContext.socketTextStream("wangfutai", 7777)

    // Alternative checkpoint location on HDFS:
    // streamingContext.checkpoint("hdfs://wangfutai:9000//home//wangfutai//a//sparkcheck")
    streamingContext.checkpoint("E:\\sparkdata")

    // Window size: 12 batches = 1 minute; data older than one minute falls out of the result.
    val windowDuration = Seconds(12 * batch)
    // Slide interval: 4 batches = 20 seconds; the window is recomputed every 20 seconds.
    val slideDuration = Seconds(4 * batch)

    // Word count.
    // The source DStream supports the vast majority of the usual operators.
    // Note: within a batch, only the RDD of the current batch time is processed.
    sourceDS
      // Splitting on " " yields empty strings for blank lines and for leading or repeated
      // spaces; drop them so that they are not counted as a word with the key "".
      .flatMap(line => line.split(" ").filter(_.nonEmpty))
      .map(word => (word, 1))
      // reduceByKeyAndWindow arguments:
      //   reduceFunc: (V, V) => V  - function that merges two counts
      //   windowDuration: Duration - window size
      //   slideDuration: Duration  - slide interval
      .reduceByKeyAndWindow(
        (left: Int, right: Int) => left + right,
        windowDuration,
        slideDuration
      )
      .print()

    /*
    Test data: one line is entered every 20 seconds. Once more than one minute has
    passed, i.e. after the fourth input, the result no longer contains the data of
    the first input.
      aaa aa ddd rrr
      aaa eee
      aaa uuu
      rrr
    Output captured before empty tokens were filtered out. The first entry of the
    last batch (",1)") is malformed in the capture - possibly the empty-string key.
    -------------------------------------------
    Time: 1551680930000 ms
    -------------------------------------------
    (aa,1)
    (ddd,1)
    (rrr,1)
    (aaa,1)
    WARN - Block input-0-1551680941000 replicated to only 0 peer(s) instead of 1 peers
    -------------------------------------------
    Time: 1551680950000 ms
    -------------------------------------------
    (aa,1)
    (ddd,1)
    (rrr,1)
    (eee,1)
    (aaa,2)
    WARN - Block input-0-1551680957200 replicated to only 0 peer(s) instead of 1 peers
    -------------------------------------------
    Time: 1551680970000 ms
    -------------------------------------------
    (aa,1)
    (ddd,1)
    (rrr,1)
    (eee,1)
    (aaa,3)
    (uuu,1)
    WARN - Block input-0-1551680986000 replicated to only 0 peer(s) instead of 1 peers
    -------------------------------------------
    Time: 1551680990000 ms
    -------------------------------------------
    ,1)
    (rrr,1)
    (eee,1)
    (aaa,2)
    (uuu,1)
    */

    // Start the Spark Streaming job.
    streamingContext.start()
    // Wait for the job: after this call the whole job keeps running in the
    // background until it is stopped.
    streamingContext.awaitTermination()
  }
}