import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* Spark Streaming 實時監控一個HDFS的文件夾,當新的文件進來(名字不能重複),將對新文件進行處理。
* Created by csw on 2017/7/4.
*/
object HDFSDemo {
  /**
   * Monitors a directory with Spark Streaming; files newly created in the
   * directory after the stream starts (file names must not repeat) are
   * word-counted once per 10-second batch and the counts are printed.
   *
   * @param args optional — args(0) overrides the directory to monitor;
   *             defaults to the original hard-coded HDFS path.
   */
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging so the batch output is readable.
    Logger.getLogger("org").setLevel(Level.WARN)

    // Allow the monitored directory to be supplied on the command line,
    // falling back to the original hard-coded HDFS location.
    val inputDir =
      if (args.nonEmpty) args(0) else "hdfs://master:9000/csw/tmp2/test/"

    val config = new SparkConf().setAppName("Spark shell")
    // 10-second micro-batch interval.
    val ssc = new StreamingContext(config, Seconds(10))

    // textFileStream only picks up files created in the directory after the
    // stream starts; pre-existing files are ignored.
    val lines = ssc.textFileStream(inputDir)
    val words: DStream[String] = lines.flatMap(_.split(" "))
    val wordCounts: DStream[(String, Int)] = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination() // block until the streaming job is stopped
  }
}
// Below is how to read files from the local Linux filesystem instead of HDFS.
// NOTE: this must stay commented out — a bare `val` is not legal at the top
// level of a Scala source file, and `ssc` is only in scope inside main().
// val lines = ssc.textFileStream("file:///csw/tmp/test2")