Maven dependencies
<dependencies>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.2.2</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.2.2</version>
  </dependency>
</dependencies>
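If you build with sbt instead of Maven, the equivalent dependencies would look roughly like this (assuming scalaVersion is set to 2.11.x, so that %% resolves to the _2.11 artifacts):

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.2.2",
  "org.apache.spark" %% "spark-streaming" % "2.2.2"
)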
First, test whether the environment can connect to Spark Streaming.
WordCount.scala
package blog

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * @Author Daniel
  * @Description Test the Spark Streaming connection
  **/
object WordCount {
  def main(args: Array[String]): Unit = {
    if (args == null || args.length < 2) {
      println(
        """
          |Usage: <host> <port>
        """.stripMargin)
      System.exit(-1)
    }
    val Array(host, port) = args
    //local[2]: the socket receiver occupies one thread, so at least two are needed to also process the data
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("WordCount")
    //The batch interval: how often a new batch is produced, i.e. how often a Spark Streaming job is submitted
    val batchInterval = Seconds(2)
    //Programming entry point
    val ssc = new StreamingContext(conf, batchInterval)
    //Business logic
    //Streaming data cannot be recovered once lost, so persist it (serialized, replicated, memory and disk) for fault tolerance
    val input: ReceiverInputDStream[String] = ssc.socketTextStream(host, port.toInt, StorageLevel.MEMORY_AND_DISK_SER_2)
    //WordCount
    val retDStream: DStream[(String, Int)] = input.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _)
    //Print the results
    retDStream.print()
    //Start the computation
    ssc.start()
    //Keep the streaming job running until it is terminated
    ssc.awaitTermination()
  }
}
Set the program arguments to hadoop01 9999
Test it with nc
First, install it
sudo yum -y install nc
Listen on port 9999
nc -lk hadoop01 9999
Start the program and type some text into the nc session
The results are printed to the console
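For example, typing hello spark hello into the nc session should make the next batch print something like this (the placeholder stands for the actual batch time DStream.print() shows):

-------------------------------------------
Time: <batch time> ms
-------------------------------------------
(hello,2)
(spark,1)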
Once the environment checks out, move on to the HDFS integration.
StreamingHDFS.scala
package blog

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * @Author Daniel
  * @Description Spark Streaming integration with HDFS
  **/
//Integrate Spark Streaming with HDFS: read files newly added to HDFS
object StreamingHDFS {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("StreamingHDFS")
      .setMaster("local")
    val batchInterval = Seconds(2)
    val ssc = new StreamingContext(conf, batchInterval)
    //    val input: DStream[String] = ssc.textFileStream("file:///F:/data/") //read a local directory instead
    //Read from HDFS: monitor the directory for files added after the job starts
    val input: DStream[String] = ssc.textFileStream("hdfs://bde/data/words")
    val ret = input.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _)
    //Process each RDD in the stream
    ret.foreachRDD((rdd, time) => {
      //Only print when the RDD is not empty
      if (!rdd.isEmpty()) {
        println(s"Time: $time")
        rdd.foreach(println)
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
Copy hdfs-site.xml and core-site.xml into the current project directory (on the classpath)!!
Prepare a data file and upload it to HDFS
1.txt
hello
word
hello
ww
lily
hadoop
hadoop
spark
hive
spark
hive
hadoop
hello
word
lily
hadoop
hadoop
spark
hive
spark
hive
hadoop
Start the program, then upload the file to HDFS
hdfs dfs -put 1.txt /data/words/
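If the job is already running when 1.txt lands in /data/words, the batch that picks the file up should print counts roughly like this (output order may differ):

Time: <batch time> ms
(hello,3)
(word,2)
(ww,1)
(lily,2)
(hadoop,6)
(spark,4)
(hive,4)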
Streaming picks up any new file that appears in the monitored directory, so we can also write our own small program that writes a file to HDFS.
WriteFile.scala
package blog

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

/**
  * @Author Daniel
  * @Description Write a file to HDFS programmatically
  **/
object WriteFile {
  def main(args: Array[String]): Unit = {
    //Set the user name to avoid permission errors
    System.setProperty("HADOOP_USER_NAME", "hadoop")
    val uri = new URI("hdfs://bde/")
    val fs = FileSystem.newInstance(uri, new Configuration())
    val fos = fs.create(new Path("/data/words/write.txt"))
    fos.write("hello spark\nhello streaming\nhello successfully".getBytes())
    //Close the output stream before the file system so the data is flushed
    fos.close()
    fs.close()
  }
}
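Note that Spark Streaming's file source expects new files to appear in the monitored directory atomically (for example via a rename); a file still being written in place may be missed or read only partially. A minimal sketch of that pattern, with hypothetical staging and target paths, could look like this:

package blog

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

//Sketch only: write to a staging path first, then rename into the monitored directory
object WriteFileAtomic {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "hadoop")
    val fs = FileSystem.newInstance(new URI("hdfs://bde/"), new Configuration())
    val staging = new Path("/data/tmp/atomic.txt") //hypothetical staging path
    val target = new Path("/data/words/atomic.txt") //the directory textFileStream monitors
    val out = fs.create(staging)
    out.write("hello spark\nhello streaming".getBytes())
    out.close()
    //HDFS rename is atomic, so the stream only sees the file once it is complete
    fs.rename(staging, target)
    fs.close()
  }
}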