package spark.SparkStreaming.file
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Description: counts word occurrences, in real time, in files arriving under a
 * given HDFS directory (note: only NEW files are processed, not pre-existing ones).
 */
object test {
  // Explicit main instead of `extends App`: App's delayedInit defers field
  // initialization, which is a known source of null/serialization bugs when
  // Spark ships closures referencing those fields to executors.
  def main(args: Array[String]): Unit = {
    // Local SparkSession; `local[*]` uses all available cores on this machine.
    val spark: SparkSession = SparkSession.builder()
      .appName(test.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    // Micro-batch interval of 2 seconds.
    val ssc: StreamingContext = new StreamingContext(sc, Seconds(2))

    // Per-batch word count over files newly created in the monitored HDFS dir:
    // split lines on whitespace, drop empty tokens, pair each word with 1,
    // then sum the counts per word within the batch.
    ssc.textFileStream("hdfs://mini1:9000/spark-streaming/wc") // HDFS directory watched for new files
      .flatMap(_.split("\\s+"))
      .filter(_.nonEmpty)
      .map((_, 1))
      .reduceByKey(_ + _) // FIX: without this aggregation only raw (word, 1) pairs were printed, not counts
      .print(100)

    // Start the streaming computation...
    ssc.start()
    // ...and block until it is stopped or fails (required, or the app exits immediately).
    ssc.awaitTermination()
  }
}