PySpark Streaming + Kafka demo
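This demo wires a direct Kafka DStream into a stateful word count and prints the offset ranges consumed in each batch. Note that it targets Spark 2.x: pyspark.streaming.kafka (KafkaUtils) belongs to the spark-streaming-kafka-0-8 integration, which was removed in Spark 3.0, so run it on a 2.x build with that package on the classpath.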

import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils  # Spark 2.x only; removed in Spark 3.0

# Point the workers at a specific interpreter; adjust this path for your setup.
os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python3.7'


def updateFunction(newValues, runningCount):
    """Fold this batch's counts for a key into its running total."""
    if runningCount is None:
        runningCount = 0  # first batch in which this key appears
    return sum(newValues, runningCount)
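# For example, if "spark" occurs twice in a batch and its running count is 3,
# updateStateByKey calls updateFunction([1, 1], 3) and stores 5 as the new state.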


def start():
    conf = (
        SparkConf()
        .setAppName('spark-streaming-test')
        .setMaster("local[*]")
        .set("spark.python.profile", "true")
        .set("spark.io.compression.codec", "snappy")
    )
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    sc = spark.sparkContext

    # 1-second batches; updateStateByKey requires a checkpoint directory.
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("checkpoint")

    brokers = "localhost:9092"
    topic = 'test'
    # Direct (receiverless) stream; Spark tracks the Kafka offsets itself.
    kafkaStreams = KafkaUtils.createDirectStream(
        ssc, [topic], kafkaParams={"metadata.broker.list": brokers})

    # Records arrive as (key, value) tuples; keep only the message value.
    msg = kafkaStreams.map(lambda x: x[1])
    words = msg.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    running_counts = pairs.updateStateByKey(updateFunction)
    running_counts.pprint()

    # Capture each batch's Kafka offset ranges, then print them.
    kafkaStreams.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)
    ssc.start()
    ssc.awaitTermination()


# Shared between the transform and foreachRDD stages of the same batch.
offsetRanges = []


def storeOffsetRanges(rdd):
    """Stash the batch's offset ranges on the driver for later printing."""
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd


def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s %s" % (
            o.topic, o.partition, o.fromOffset, o.untilOffset,
            o.untilOffset - o.fromOffset))


if __name__ == '__main__':
    start()
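
To feed the `test` topic while the job runs, publish a few lines to it. A minimal producer sketch, assuming the third-party kafka-python package is installed (it is not part of PySpark):

from kafka import KafkaProducer  # pip install kafka-python (assumed available)

producer = KafkaProducer(bootstrap_servers='localhost:9092')
for line in ("hello spark", "hello kafka"):
    producer.send('test', line.encode('utf-8'))
producer.flush()

Submit the streaming script itself with the matching Kafka integration artifact on the classpath, e.g. spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.0 demo.py (the version should match your Spark 2.x build; demo.py is a placeholder file name).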
