Flink consuming Kafka data directly into HDFS

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.fs.SequenceFileWriter;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.util.Collector;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.util.Properties;

public class ReadingToKafka {

    public static void main(String[] args) throws Exception {
        //String outPath = "/user/storm/test";
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //env.getConfig().enableSysoutLogging();
        env.enableCheckpointing(1000);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "192.168.10.22:9092,192.168.10.23:9092,192.168.10.24:9092");
        properties.setProperty("zookeeper.connect", "192.168.10.11:9501,192.168.10.12:9501,192.168.10.13:9501");
        properties.setProperty("group.id", "test");
        //properties.setProperty("fs.default-scheme", "hdfs://hostname:8020");

        // Kafka 0.10 consumer reading string records from the "test" topic
        FlinkKafkaConsumer010<String> myConsumer =
                new FlinkKafkaConsumer010<String>("test", new SimpleStringSchema(), properties);
        // Only read the latest records from the topic
        myConsumer.setStartFromLatest();

        // Add the Kafka source
        DataStreamSource<String> stream = env.addSource(myConsumer);

        BucketingSink<String> hdfs_sink = new BucketingSink<String>("hdfs:///user/storm/data/");
        hdfs_sink.setBatchSize(1024 * 1024 * 400);
        hdfs_sink.setBucketer(new DateTimeBucketer<String>("yyyy-MM-dd"));
        //hdfs_sink.setWriter(new SequenceFileWriter<IntWritable, Text>());
        // Roll the current part file after this interval (ms), even if the batch size has not been reached
        hdfs_sink.setBatchRolloverInterval(3600000);

        // Write to HDFS
        stream.addSink(hdfs_sink);

        env.execute("flink to hdfs");

        // Streaming word count
        /*
        DataStream<Tuple2<String, Integer>> counts = stream.flatMap(new LineSplitter())
                .keyBy(0).sum(1);
        */
        //counts.writeAsCsv(outPath).setParallelism(1);
        //counts.print();
        //env.execute("WordCount from Kafka data");
    }

    /*
    public static final class LineSplitter implements FlatMapFunction<String, Tuple2<String, Integer>> {
        private static final long serialVersionUID = 1L;

        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            String[] tokens = value.toLowerCase().split("\\W+");
            for (String token : tokens) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<String, Integer>(token, 1));
                }
            }
        }
    }
    */
}
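The commented-out setWriter line hints at writing Hadoop SequenceFiles instead of plain text. Below is a minimal sketch (not from the original post) of how that could be wired up with the same BucketingSink connector, assuming the stream is first mapped to Tuple2<IntWritable, Text> as SequenceFileWriter requires; the output path "hdfs:///user/storm/data_seq/" and the length-as-key mapping are illustrative choices only. It would slot into main() after the Kafka source is created and additionally needs org.apache.flink.api.common.functions.MapFunction on the import list.

        // Map each line to an (IntWritable, Text) pair; SequenceFileWriter only accepts
        // Tuple2<Writable, Writable> elements. The length-as-key choice is arbitrary.
        DataStream<Tuple2<IntWritable, Text>> seqStream = stream.map(
                new MapFunction<String, Tuple2<IntWritable, Text>>() {
                    @Override
                    public Tuple2<IntWritable, Text> map(String value) {
                        return new Tuple2<>(new IntWritable(value.length()), new Text(value));
                    }
                });

        // Separate BucketingSink that writes SequenceFiles instead of plain text
        // (the target path is a hypothetical example, not from the original post)
        BucketingSink<Tuple2<IntWritable, Text>> seqSink =
                new BucketingSink<Tuple2<IntWritable, Text>>("hdfs:///user/storm/data_seq/");
        seqSink.setBucketer(new DateTimeBucketer<Tuple2<IntWritable, Text>>("yyyy-MM-dd"));
        seqSink.setWriter(new SequenceFileWriter<IntWritable, Text>());
        seqStream.addSink(seqSink);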