object WordCountWindows {

  /**
   * Spark Streaming windowed word count.
   *
   * Listens on a network socket, splits each line into words, and prints
   * word counts over a sliding window: window size 60s (12 * batch),
   * slide interval 20s (4 * batch). Blocks forever once started.
   */
  def main(args: Array[String]): Unit = {
    // Point Hadoop at the local installation (needed for winutils on Windows).
    System.setProperty("hadoop.home.dir", "E:\\software\\bigdate\\hadoop-2.6.0-cdh5.15.0\\hadoop-2.6.0-cdh5.15.0")

    // local[2]: one thread receives data, the other processes it.
    // The app name is what shows up in the resource manager / Spark UI.
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("WordCountStreaming")
    val sc = new SparkContext(conf)

    // StreamingContext drives Spark Streaming; each micro-batch is 5 seconds.
    val batch = 5
    val ssc = new StreamingContext(sc, Seconds(batch))

    // Input source: plain text lines from a socket. Each batch only sees
    // the RDD for its own interval.
    val lines = ssc.socketTextStream("wangfutai", 7777)

    // Windowed state requires a checkpoint directory.
    // HDFS alternative: "hdfs://wangfutai:9000//home//wangfutai//a//sparkcheck"
    ssc.checkpoint("E:\\sparkdata")

    // Tokenize, pair each word with 1, then reduce over the sliding window.
    // reduceByKeyAndWindow(reduceFunc, windowDuration, slideDuration):
    //   - window = 12 * batch = 60s: data older than one minute falls out
    //   - slide  =  4 * batch = 20s: results are recomputed every 20s
    lines
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKeyAndWindow(
        (v1: Int, v2: Int) => v1 + v2,
        Seconds(12 * batch),
        Seconds(4 * batch)
      )
      .print()

    // Sample session (one input line every 20 seconds):
    //   aaa aa ddd rrr
    //   aaa eee
    //   aaa uuu
    //   rrr
    // After the 4th input (once a minute has elapsed) the first line's words
    // drop out of the window, so e.g. (aaa,3) falls back to (aaa,2) and
    // (aa,1)/(ddd,1) disappear from the printed counts.

    // Launch the streaming job; awaitTermination blocks the driver thread
    // while the job keeps running in the background.
    ssc.start()
    ssc.awaitTermination()
  }
}