from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import os
# Pin the Python interpreter used by PySpark worker processes so driver and
# executors run the same version. NOTE(review): hard-coded path — confirm
# /usr/local/bin/python3.7 exists on every node this runs on.
os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python3.7'
def updateFunction(newValues, runningCount):
    """Stateful reducer for ``updateStateByKey``.

    Folds the current batch's counts for a key into that key's running
    total. Spark passes ``runningCount=None`` the first time a key is
    seen, which is treated as a total of zero.
    """
    previous = 0 if runningCount is None else runningCount
    return previous + sum(newValues)
def start():
    """Run a word-count over a Kafka topic with Spark Streaming.

    Builds a local SparkSession, opens a direct Kafka stream on topic
    'test', maintains a running per-word count via updateStateByKey,
    prints the counts each batch, and blocks until termination.
    """
    # Enable the Python profiler and snappy shuffle compression.
    conf = SparkConf().set(
        "spark.python.profile",
        "true").set(
        "spark.io.compression.codec",
        "snappy")
    conf.setAppName('spark-streaming-test').setMaster("local[*]")
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    sc = spark.sparkContext
    # 1-second micro-batches; checkpointing is required by updateStateByKey.
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("checkpoint")
    brokers = "localhost:9092"
    topic = 'test'
    # Receiver-less direct stream; each RDD element is a (key, message) pair.
    # NOTE(review): KafkaUtils lives in pyspark.streaming.kafka, which was
    # removed in Spark 3.x — this demo assumes Spark 2.x. Confirm the runtime.
    kafkaStreams = KafkaUtils.createDirectStream(
        ssc, [topic], kafkaParams={
            "metadata.broker.list": brokers})
    # Keep only the message payload, split into words, count per word.
    msg = kafkaStreams.map(lambda x: x[1])
    words = msg.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    running_counts = pairs.updateStateByKey(updateFunction)
    running_counts.pprint()
    # Capture Kafka offset ranges on the untransformed direct stream (offsets
    # are only available on the original KafkaRDDs) and print them per batch.
    kafkaStreams.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)
    # Outputs must be registered before start(); awaitTermination blocks.
    ssc.start()
    ssc.awaitTermination()
# Driver-side stash of the most recent batch's Kafka offset ranges,
# written by storeOffsetRanges and read by printOffsetRanges.
offsetRanges = []


def storeOffsetRanges(rdd):
    """Record the Kafka offset ranges of this batch's RDD.

    Used inside ``transform`` so it runs on the driver while the RDD is
    still a KafkaRDD (offset metadata is lost after further transforms).
    Returns the RDD unchanged so the pipeline continues.
    """
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd
def printOffsetRanges(rdd):
    """Print topic, partition, from/until offsets and the consumed count
    for each offset range captured by ``storeOffsetRanges``.

    The ``rdd`` argument is required by ``foreachRDD`` but unused here;
    the data comes from the module-level ``offsetRanges`` stash.
    """
    for o in offsetRanges:
        consumed = o.untilOffset - o.fromOffset
        print("%s %s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset, consumed))
# Script entry point: run the streaming job until externally terminated.
if __name__ == '__main__':
    start()
# pyspark streaming + Kafka demo
# (Trailing blog comment-widget text from the page this was scraped from has
# been removed — it was not part of the program.)