接着 https://blog.51cto.com/mapengfei/2581240 這裏用Flink來實現對APP在每個渠道的推廣情況包括下載、查看、卸載等等行爲的分析
因爲之前的文章都是用scala寫的,這篇用純java來實現一波,
分別演示下用aggregate 聚合方式和process 方式的實現和效果
整體思路
1、準備好數據源: 這裏用SimulatedSource 來自己隨機造一批數據
2、準備數據輸入樣例 `MarketUserBehavior` 和輸出樣例`MarketViewCountResult`
3、準備環境並設置watermark時間,和指定事件時間字段爲timestamp
4、進行過濾:uninstall 的行爲過濾掉(根據實際情況來改)
5、根據行爲和渠道進行KeyBy統計
6、設置滑動窗口1小時,每10s輸出一次
7、進行聚合輸出
/**
* @author mafei
* @date 2021/1/9
*/
package com.mafei.market;
import cn.hutool.core.util.RandomUtil;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import static java.lang.Thread.sleep;
/**
* APP市場推廣分析
*/
/**
* 定義一個輸入數據的樣例類
*/
class MarketUserBehavior {
String userId;
String behavior;
String channel;
Long timestamp;
public MarketUserBehavior(String userId, String behavior, String channel, Long timestamp) {
this.userId = userId;
this.behavior = behavior;
this.channel = channel;
this.timestamp = timestamp;
}
}
/**
* 定義一個輸出數據的類
*/
class MarketViewCountResult {
Long windowStart;
Long windowEnd;
String channel;
String behavior;
Long count;
public MarketViewCountResult(Long windowStart, Long windowEnd, String channel, String behavior, Long count) {
this.windowStart = windowStart;
this.windowEnd = windowEnd;
this.channel = channel;
this.behavior = behavior;
this.count = count;
getOutput();
}
public void getOutput() {
/**
* 爲了驗證效果加的
*/
StringBuffer stringBuffer = new StringBuffer();
stringBuffer.append("windowsStart: " + windowStart);
stringBuffer.append(" windowEnd: " + windowEnd);
stringBuffer.append(" channel: " + channel);
stringBuffer.append(" behavior: " + behavior);
stringBuffer.append(" count: " + count);
//爲了驗證效果,追加打印的
System.out.println(stringBuffer.toString());
}
}
/**
* 定義一個產生隨機數據源的類
*/
class SimulatedSource extends RichSourceFunction<MarketUserBehavior> {
/**
* 是否運行的標誌位,主要在cancel 方法中調用
*/
Boolean running = true;
/**
* 定義用戶行爲和渠道的集合
*/
String[] userBeahviors = {"view", "download", "install", "uninstall"};
String[] channels = {"dingding", "wexin", "appstore"};
Long maxRunning = 64 * 10000L;
Long currentRunningCount = 0L;
@Override
public void run(SourceContext<MarketUserBehavior> sourceContext) throws Exception {
while (running && currentRunningCount < maxRunning) {
String channel = RandomUtil.randomEle(channels);
String beahvior = RandomUtil.randomEle(userBeahviors);
Long timestamp = System.currentTimeMillis() * 1000;
String userId = RandomUtil.randomString(20);
sourceContext.collect(new MarketUserBehavior(userId, beahvior, channel, timestamp));
currentRunningCount += 1;
sleep(100L);
}
}
@Override
public void cancel() {
running = false;
}
}
public class MarketChannelAnalysis {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
environment.setParallelism(1);
environment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
SingleOutputStreamOperator<MarketUserBehavior> dataStream = environment.addSource(new SimulatedSource())
//設置watermark時間爲5秒,並且指定事件時間字段爲timestamp
.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<MarketUserBehavior>(Time.seconds(5)) {
@Override
public long extractTimestamp(MarketUserBehavior marketUserBehavior) {
return marketUserBehavior.timestamp;
}
});
DataStreamSink<MarketViewCountResult> result = dataStream
.filter(new FilterFunction<MarketUserBehavior>() {
@Override
public boolean filter(MarketUserBehavior marketUserBehavior) throws Exception {
return !marketUserBehavior.behavior.equals("uninstall");
}
})
// .keyBy("channel", "behavior") // scala的實現方式
.keyBy(new KeySelector<MarketUserBehavior, Tuple2<String, String>>() {
@Override
public Tuple2<String, String> getKey(MarketUserBehavior marketUserBehavior) throws Exception {
// return new String[]{marketUserBehavior.behavior, marketUserBehavior.channel};
return Tuple2.of(marketUserBehavior.behavior, marketUserBehavior.channel);
}
})
.timeWindow(Time.hours(1), Time.seconds(10)) //窗口大小是1小時,每10秒輸出一次
.aggregate(new MyMarketChannelAnalysis(), new MyMarketChannelResult())
// .process(new MarkCountByChannel()) //用process方法也可以實現
.print();
environment.execute();
}
}
/**
* 2種實現思路,用process的時候可以用這個方法
* process不用每來一條數據都定義怎麼做,而是把對應的數據會放到內存裏面,當窗口結束後進行統一處理,比較耗內存,看實際使用場景
*/
class MarkCountByChannel extends ProcessWindowFunction<MarketUserBehavior, MarketViewCountResult, Tuple2<String, String>, TimeWindow> {
@Override
public void process(Tuple2<String, String> key, Context context, Iterable<MarketUserBehavior> iterable, Collector<MarketViewCountResult> collector) throws Exception {
Long startTime = context.window().getStart();
Long endTime = context.window().getEnd();
String channel = key.f1;
String behavior = key.f0;
Long count = iterable.spliterator().estimateSize();
collector.collect(new MarketViewCountResult(startTime, endTime, channel, behavior, count));
}
}
/**
* 定義聚合函數的具體操作,AggregateFunction 的3個參數:
* IN,輸入的數據類型: 輸入已經在源頭定義爲 MarketUserBehavior
* ACC,中間狀態的數據類型:因爲每次要算count數,所以是Long類型
* OUT,輸出的數據類型:輸出的是統計的次數,所以也是Long類型
*/
class MyMarketChannelAnalysis implements AggregateFunction<MarketUserBehavior, Long, Long> {
@Override
public Long createAccumulator() {
/**
* 初始化的操作,定義次數爲0
*/
return 0L;
}
@Override
public Long add(MarketUserBehavior marketUserBehavior, Long aLong) {
/**
* 每來一條數據做的操作,這裏直接加1就行了
*/
return aLong + 1;
}
@Override
public Long getResult(Long aLong) {
/**
* 最終輸出時調用的方法
*/
return aLong;
}
@Override
public Long merge(Long aLong, Long acc1) {
/**
* 這裏是多個的時候用到,主要是session window時會使用
*/
return aLong + acc1;
}
}
/**
* 定義輸出的WindowFunction,要的參數可以點進去看
* IN:這裏輸入是上一步的輸出窗口內add的數量,所以是Long類型
* OUT:自定義的輸出結構,這裏定義的是一個類,可以直接改
* KEY:分組的Key,就是keyBy 裏頭定義的Tuple2.of(marketUserBehavior.behavior, marketUserBehavior.channel);
* W extends Window:TimeWindow
*
*/
class MyMarketChannelResult implements WindowFunction<Long, MarketViewCountResult, Tuple2<String, String>, TimeWindow> {
@Override
public void apply(Tuple2<String, String> stringStringTuple2, TimeWindow window, Iterable<Long> input, Collector<MarketViewCountResult> out) {
out.collect(new MarketViewCountResult(window.getStart(), window.getEnd(), stringStringTuple2.f1, stringStringTuple2.f0, input.iterator().next()));
}
}
代碼結構及運行的效果,如果要輸出es、mysql、kafka之類的直接把print換成addSink就可以了