Flink 計算 PV UV

前言

使用 flink 很長一段時間了,突然發現竟然沒有計算過 pv uv,這可是 flink 常見的計算場景,也是面試時的常見問題之一,所以自己想了一個場景來計算一下。
基於 Flink 1.12

場景

外賣員聽單的信息會發到單獨的一個 topic 中,計算每天有多少個外賣員聽單,以及總共的聽單次數。

kafka 中消息類型

{"locTime":"2020-12-28 12:32:23","courierId":12,"other":"aaa"}

locTime:事件發生的時間;courierId:外賣員 id

計算一天有多少個外賣員聽單( UV ),總共聽單多少次( PV )

// Kafka source that reads every message of the given topics as a raw JSON string.
FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<String>(topics, new SimpleStringSchema(), properties);
        // NOTE(review): presumably configures the start offset from the job parameters - confirm in FlinkHelp.
        FlinkHelp.setOffset(parameter, consumer);
        // Event time is taken from the "locTime" field of each message. The strategy assumes
        // ascending timestamps and marks the source idle after one second without records.
        consumer.assignTimestampsAndWatermarks(
                WatermarkStrategy.<String>forMonotonousTimestamps()
                        .withTimestampAssigner(new SerializableTimestampAssigner<String>() {
                            // DateTimeFormatter is not Serializable, so it is kept transient and
                            // created lazily instead of being rebuilt for every record.
                            private transient DateTimeFormatter locTimeFormatter;

                            /**
                             * Extracts the event timestamp (epoch millis) from the "locTime" field.
                             *
                             * @param element         raw JSON message
                             * @param recordTimestamp timestamp attached to the record by the source (unused)
                             * @return "locTime" interpreted in the JVM default time zone, as epoch millis
                             * @throws IllegalStateException if the message is not valid JSON or has no "locTime"
                             */
                            @Override
                            public long extractTimestamp(String element, long recordTimestamp) {
                                Object locTime;
                                try {
                                    Map<String, Object> map = Json2Others.json2map(element);
                                    locTime = map == null ? null : map.get("locTime");
                                } catch (IOException e) {
                                    // Fail with the real cause instead of swallowing it and tripping
                                    // over an empty string in the date parser below.
                                    throw new IllegalStateException("Cannot parse message as JSON: " + element, e);
                                }
                                if (locTime == null) {
                                    throw new IllegalStateException("Message has no locTime field: " + element);
                                }
                                if (locTimeFormatter == null) {
                                    locTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
                                }
                                LocalDateTime startDateTime = LocalDateTime.parse(locTime.toString(), locTimeFormatter);
                                // Resolve the offset that applied at the event time rather than the offset
                                // in effect right now, so DST transitions do not shift timestamps.
                                return startDateTime.atZone(java.time.ZoneId.systemDefault()).toInstant().toEpochMilli();
                            }
                        }).withIdleness(Duration.ofSeconds(1)));

        env.addSource(consumer).filter(new FilterFunction<String>() {
            @Override
            public boolean filter(String value) throws Exception {
                return true;
            }
        }).windowAll(TumblingEventTimeWindows.of(Time.days(1), Time.hours(-8)))
                .allowedLateness(Time.minutes(1))
//              .trigger(CountTrigger.of(5))// 其實多個 trigger 就是下一個 trigger 覆蓋上一個 trigger
                //用 event time 可能會導致 window 延遲觸發,最好的解決辦法是在 processingTime 的基礎上添加對窗口的判斷
                // watermark 不會回退,所以如果消息早到的話( 亂序了,該相對來說晚到的消息早到了),可能會導致窗口延遲觸發
                // 誇張一點的話,窗口不觸發了,直到有大於等於 watermark + triggerTime 的消息到達
                // ContinuousProcessingTimeTrigger 一樣
                .trigger(ContinuousEventTimeTrigger.of(Time.seconds(30)))
                //追歷史數據的時候會有問題,可能歷史數據不足 10s 就全部消費完畢,導致窗口不會被觸發而被跳過,消費同理
//              .trigger(ContinuousProcessingTimeTrigger.of(Time.seconds(10)))
                //處理完畢後將 window state 中的數據清除掉
                // 其實完全可以通過自定義 trigger 來達到 clear windowState 的目的 (Purge)
                .evictor(TimeEvictor.of(Time.seconds(0), true))
                /*
                 * Maintains per-day PV (total message count) and UV (distinct courier count) in
                 * TTL-protected map state and publishes the counters to a Redis hash on every firing.
                 * Nothing is emitted through the collector.
                 */
                .process(new ProcessAllWindowFunction<String, String, TimeWindow>() {
                    // Redis hash receiving the counters, and its expiry in seconds (3 days).
                    private static final String REDIS_KEY = "test_courier_puv:";
                    private static final int REDIS_EXPIRE_SECONDS = 3 * 24 * 60 * 60;

                    // Runtime handles created in open(); transient so they are never serialized with the function.
                    private transient JedisCluster jedisCluster;
                    // "<day>-<courierId>" -> "" : marks couriers already counted for a day.
                    private transient MapState<String, String> courierInfoMapState;
                    // day (yyyyMMdd) -> number of distinct couriers seen that day.
                    private transient MapState<String, Long> courierInfoUVMapState;
                    // day (yyyyMMdd) -> number of messages seen that day.
                    private transient MapState<String, Long> courierInfoPVMapState;
                    // Day published by the previous firing / day of the last record seen so far.
                    private String beforeDay = "";
                    private String currentDay = "";

                    /** Registers the three map states with a 25 hour TTL and opens the Redis connection. */
                    @Override
                    public void open(Configuration parameters) throws Exception {
                        StateTtlConfig ttlConfig = StateTtlConfig
                                .newBuilder(org.apache.flink.api.common.time.Time.hours(25))
                                // default; author's note: event time is not supported as of 1.12.0
                                .setTtlTimeCharacteristic(StateTtlConfig.TtlTimeCharacteristic.ProcessingTime)
                                .cleanupInRocksdbCompactFilter(1000)
                                .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)//default
                                .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
                                .build();

                        MapStateDescriptor<String, String> mapStateDescriptor =
                                new MapStateDescriptor<String, String>("courierInfos", TypeInformation.of(String.class), TypeInformation.of(String.class));
                        mapStateDescriptor.enableTimeToLive(ttlConfig);
                        courierInfoMapState = getRuntimeContext().getMapState(mapStateDescriptor);

                        MapStateDescriptor<String, Long> mapStateUVDescriptor =
                                new MapStateDescriptor<String, Long>("courierUVStateDesc", TypeInformation.of(String.class), TypeInformation.of(Long.class));
                        mapStateUVDescriptor.enableTimeToLive(ttlConfig);
                        courierInfoUVMapState = getRuntimeContext().getMapState(mapStateUVDescriptor);

                        MapStateDescriptor<String, Long> mapStatePVDescriptor =
                                new MapStateDescriptor<String, Long>("courierPVStateDesc", TypeInformation.of(String.class), TypeInformation.of(Long.class));
                        mapStatePVDescriptor.enableTimeToLive(ttlConfig);
                        courierInfoPVMapState = getRuntimeContext().getMapState(mapStatePVDescriptor);

                        jedisCluster = RedisUtil.getJedisCluster(redisHp);
                    }

                    /** Releases the Redis connection. */
                    @Override
                    public void close() throws Exception {
                        RedisUtil.closeConn(jedisCluster);
                    }

                    /**
                     * Folds the records of the current firing into the PV/UV state and writes the
                     * counters of the current day (and of the previous day on a day change) to Redis.
                     *
                     * @param context  window context, used only to print the window bounds
                     * @param elements raw JSON messages currently held by the window
                     * @param out      unused; this function emits nothing downstream
                     * @throws Exception if a message cannot be parsed or state/Redis access fails
                     */
                    @Override
                    public void process(Context context, Iterable<String> elements, Collector<String> out) throws Exception {
                        TimeWindow window = context.window();
                        System.out.println(" window = "
                                + DateUtils.millisecondsToDateStr(window.getStart(), "yyyy-MM-dd HH:mm:ss")
                                + "-" + DateUtils.millisecondsToDateStr(window.getEnd(), "yyyy-MM-dd HH:mm:ss"));

                        for (String element : elements) {
                            Map<String, Object> record = Json2Others.json2map(element);
                            String courierId = record.get("courierId").toString();
                            // "2020-12-28 12:32:23" -> "20201228"
                            String day = record.get("locTime").toString().split(" ")[0].replace("-", "");

                            increment(courierInfoPVMapState, day);

                            // Count a courier towards UV only the first time it is seen on that day.
                            String seenKey = day + "-" + courierId;
                            if (!courierInfoMapState.contains(seenKey)) {
                                increment(courierInfoUVMapState, day);
                                courierInfoMapState.put(seenKey, "");
                            }
                            currentDay = day;
                        }

                        Map<String, String> counters = new HashMap<>();
                        putCounters(counters, currentDay);
                        // On a day change also republish the previous day. Once the 25 hour TTL has
                        // expired its counters are gone and are skipped instead of failing the job.
                        if (!beforeDay.isEmpty() && !currentDay.equals(beforeDay)) {
                            putCounters(counters, beforeDay);
                        }
                        counters.forEach((k, v) -> System.out.println(k + ":" + v));

                        // Skip the Redis write when there is nothing to publish.
                        if (!counters.isEmpty()) {
                            jedisCluster.hmset(REDIS_KEY, counters);
                            jedisCluster.expire(REDIS_KEY, REDIS_EXPIRE_SECONDS);
                        }

                        beforeDay = currentDay;
                    }

                    /** Adds one to the counter stored under {@code day}, starting at 1 when absent or expired. */
                    private void increment(MapState<String, Long> counters, String day) throws Exception {
                        // Single read: an entry may expire between a contains() and a get() call.
                        Long current = counters.get(day);
                        counters.put(day, current == null ? 1L : current + 1);
                    }

                    /** Copies the PV/UV counters of {@code day} into {@code target}, skipping missing values. */
                    private void putCounters(Map<String, String> target, String day) throws Exception {
                        Long pv = courierInfoPVMapState.get(day);
                        if (pv != null) {
                            target.put(day + "-pv", pv.toString());
                        }
                        Long uv = courierInfoUVMapState.get(day);
                        if (uv != null) {
                            target.put(day + "-uv", uv.toString());
                        }
                    }
                });

結果樣例

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章