相當於在舊的 DataFrame 上並上(追加)新的數據。
微批處理:先寫入日誌,再進行處理。
持續處理:異步處理。
操作步驟:
# Structured Streaming word count: read lines from a local socket,
# split them into words, and continuously print running counts.

# 1. Import pyspark modules
# (fixed: `From` must be lowercase `from` — capitalized it is a SyntaxError)
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.sql.functions import explode

# 2. Create the SparkSession object
if __name__ == '__main__':
    spark = SparkSession \
        .builder \
        .appName("StructuredNetworkWordCount") \
        .getOrCreate()
    # Reduce log noise so the console sink output is readable
    spark.sparkContext.setLogLevel('WARN')

    # 3. Create the input source: one DataFrame row per line of text
    # received on localhost:9999 (start a sender first, e.g. `nc -lk 9999`)
    lines = spark \
        .readStream \
        .format('socket') \
        .option('host', 'localhost') \
        .option('port', '9999') \
        .load()

    # 4. Define the streaming computation:
    # split each line on spaces (fixed: was '' which splits into single
    # characters) and explode into one word per row.
    # (fixed: variable was named `word` but referenced as `words` below)
    words = lines.select(
        explode(
            split(lines.value, ' ')
        ).alias('word')
    )
    wordCounts = words.groupBy('word').count()

    # 5. Start the stream: dump the complete updated counts to the console
    # on an 8-second micro-batch trigger, then block until terminated.
    query = wordCounts \
        .writeStream \
        .outputMode('complete') \
        .format('console') \
        .trigger(processingTime='8 seconds') \
        .start()
    query.awaitTermination()
啓動:運行上述程序即啓動流計算。