Flink: consuming Kafka data and writing it directly to HDFS

The example below reads messages from a Kafka topic with FlinkKafkaConsumer010 and writes them straight to HDFS with a BucketingSink.

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.fs.SequenceFileWriter;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.util.Collector;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.util.Properties;

public class ReadingToKafka {

    public static void main(String[] args) throws Exception {
        //String outPath = "/user/storm/test";
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //env.getConfig().enableSysoutLogging();
        // checkpoint every second; BucketingSink moves pending part files to their final
        // state when a checkpoint completes
        env.enableCheckpointing(1000);

        // event time is declared but not actually used below: no timestamps or watermarks
        // are assigned and no event-time operators follow
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "192.168.10.22:9092,192.168.10.23:9092,192.168.10.24:9092");
        properties.setProperty("zookeeper.connect", "192.168.10.11:9501,192.168.10.12:9501,192.168.10.13:9501");
        properties.setProperty("group.id", "test");
        //properties.setProperty("fs.default-scheme", "hdfs://hostname:8020");
        // Kafka 0.10 consumer reading the "test" topic as plain strings
        FlinkKafkaConsumer010<String> myConsumer = new FlinkKafkaConsumer010<String>("test", new SimpleStringSchema(),
                properties);
        // start from the latest offsets only, ignoring older records
        myConsumer.setStartFromLatest();
        // add the Kafka source to the job
        DataStreamSource<String> stream = env.addSource(myConsumer);
        // sink that buckets records into directories under the given HDFS base path
        BucketingSink<String> hdfs_sink = new BucketingSink<String>(
                "hdfs:///user/storm/data/");
        // roll to a new part file once the current one reaches 400 MB
        hdfs_sink.setBatchSize(1024 * 1024 * 400);
        // one bucket (subdirectory) per day
        hdfs_sink.setBucketer(new DateTimeBucketer<String>("yyyy-MM-dd"));

        // to write SequenceFiles instead of plain text, a Writer can be set explicitly
        // (see the sketch after the class for a fuller example):
        //hdfs_sink.setWriter(new SequenceFileWriter<IntWritable, Text>());

        // roll the current part file after at most one hour, regardless of its size
        // (this is the rollover interval, not the inactive-bucket threshold)
        hdfs_sink.setBatchRolloverInterval(3600000);
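        // Illustrative addition (not part of the original example): BucketingSink closes
        // buckets that stop receiving data via a separate pair of settings, for example:
        //hdfs_sink.setInactiveBucketCheckInterval(60 * 1000);
        //hdfs_sink.setInactiveBucketThreshold(60 * 1000);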
        // write the stream to HDFS
        stream.addSink(hdfs_sink);
        env.execute("flink to hdfs");
       // streaming word count, kept commented out for reference
      /* DataStream<Tuple2<String, Integer>> counts = stream.flatMap(new LineSplitter())
                .keyBy(0).sum(1);*/
        //counts.writeAsCsv(outPath).setParallelism(1);
        //counts.print();
        //env.execute("WordCount from Kafka data");
    }

  /* public static final class LineSplitter implements FlatMapFunction<String, Tuple2<String, Integer>> {
        private static final long serialVersionUID = 1L;
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            String[] tokens = value.toLowerCase().split("\\W+");
            for (String token : tokens) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<String, Integer>(token, 1));
                }
            }
        }
    }*/
}
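
The setWriter line commented out above hints at a second variant: writing Hadoop SequenceFiles instead of plain text. Below is a minimal sketch of that variant, assuming the stream has already been mapped to Tuple2<IntWritable, Text> (that mapping is not shown) and using a made-up class name and output path; it only illustrates the BucketingSink + SequenceFileWriter combination and is not part of the original job.

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.connectors.fs.SequenceFileWriter;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class SequenceFileSinkSketch {

    // Builds a BucketingSink that writes key/value pairs as Hadoop SequenceFiles.
    // Attach it with pairs.addSink(buildSink()), where pairs is a
    // DataStream<Tuple2<IntWritable, Text>>.
    public static BucketingSink<Tuple2<IntWritable, Text>> buildSink() {
        BucketingSink<Tuple2<IntWritable, Text>> sink =
                new BucketingSink<Tuple2<IntWritable, Text>>("hdfs:///user/storm/seq/"); // hypothetical path
        // one bucket (subdirectory) per day, same pattern as the text sink above
        sink.setBucketer(new DateTimeBucketer<Tuple2<IntWritable, Text>>("yyyy-MM-dd"));
        // serialize each Tuple2 as a SequenceFile record instead of a plain string
        sink.setWriter(new SequenceFileWriter<IntWritable, Text>());
        // roll part files at 400 MB, as in the original example
        sink.setBatchSize(1024 * 1024 * 400);
        return sink;
    }
}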