基於Flink1.9,統計某個日誌中每小時的數據量,並輸出到MySQL。
主函數
// Entry point: reads JSON log messages from Kafka, filters out records with
// malformed or future timestamps, counts records per event-time hour, and
// writes the hourly counts to MySQL.
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); // windows use event time
    env.enableCheckpointing(500); // checkpoint every 500 ms (aggressive; tune for production)

    Properties properties = Property.getKafkaProperties(Constants.GET_USER_AUTH_NAME);
    FlinkKafkaConsumer<String> consumer =
            new FlinkKafkaConsumer<String>(Constants.TOPIC_USER_AUTH, new SimpleStringSchema(), properties);
    consumer.setStartFromGroupOffsets();

    DataStream<String> edits = env.addSource(consumer).name("user_auth_edit");
    DataStream<UserAuth> calStream = edits
            // Drop records whose "ts" is malformed or too far in the future,
            // so abnormal future data cannot stall the event-time window.
            .filter(new FilterFunction<String>() {
                // Compiled once per task instance instead of once per record
                // (java.util.regex.Pattern is Serializable, so it survives
                // Flink's closure serialization).
                private final Pattern tsPattern = Pattern.compile("\\d{10}");

                @Override
                public boolean filter(String s) throws Exception {
                    try {
                        JSONObject jsonObject = JSONObject.parseObject(s);
                        String ts = jsonObject.getString("ts");
                        // "ts" must be a 10-digit epoch-seconds string.
                        if (!tsPattern.matcher(ts).matches()) {
                            return false;
                        }
                        // Reject timestamps more than 30 minutes in the future,
                        // tolerating moderate server clock skew. (The original
                        // comment claimed one hour; the code allows 30 minutes.)
                        int now = (int) (System.currentTimeMillis() / 1000) + 30 * 60;
                        int time = Integer.parseInt(ts);
                        return now >= time;
                    } catch (Exception e) {
                        System.out.println("Filter failed");
                        System.err.println(e + " " + e.getMessage());
                        return false;
                    }
                }
            })
            .assignTimestampsAndWatermarks(new TaskTimestamp(Time.hours(0)))
            .windowAll(TumblingEventTimeWindows.of(Time.hours(1)))
            .allowedLateness(Time.hours(2))
            .aggregate(new UserAuthCount());
    calStream.addSink(new SinkToMySQL());

    env.execute("executed user auth");
    // Note: execute() blocks until the streaming job terminates, so this
    // line is only reached after the job ends.
    System.out.println("executed");
}
UserAuthCount.java
/**
 * Aggregates one tumbling window's worth of log messages into a single
 * UserAuth carrying the window's hour label ("yyyyMMddHH") and record count.
 * IN = String (raw JSON log line), ACC = OUT = UserAuth.
 */
public class UserAuthCount implements AggregateFunction<String, UserAuth, UserAuth> {

    @Override
    public UserAuth createAccumulator() {
        return new UserAuth(0L);
    }

    @Override
    public UserAuth add(String s, UserAuth userAuth) {
        if (userAuth.getNumbers() == 0L) {
            // First element of the window: derive the hour label from its
            // "ts" (epoch seconds). SimpleDateFormat stays method-local on
            // purpose — it is not thread-safe and must not be shared.
            JSONObject jsonObject = JSONObject.parseObject(s);
            SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHH");
            String time_hour = sdf.format(new Date(Long.valueOf(jsonObject.getString("ts")) * 1000L));
            userAuth.setTime_hour(time_hour);
            userAuth.setNumbers(1L);
        } else {
            userAuth.setNumbers(userAuth.getNumbers() + 1L);
        }
        return userAuth;
    }

    @Override
    public UserAuth getResult(UserAuth userAuth) {
        System.out.println("result is " + userAuth);
        return userAuth;
    }

    @Override
    public UserAuth merge(UserAuth userAuth, UserAuth acc1) {
        // Bug fix: the original returned null, which would NPE whenever Flink
        // merges accumulators (e.g. with merging windows). Sum the counts and
        // keep whichever hour label is already set.
        if (userAuth.getTime_hour() == null && acc1.getTime_hour() != null) {
            userAuth.setTime_hour(acc1.getTime_hour());
        }
        userAuth.setNumbers(userAuth.getNumbers() + acc1.getNumbers());
        return userAuth;
    }
}
解讀:Window窗口AggregateFunction 實現消息事件的次數和累計值
@param <IN> The type of the values that are aggregated (input values) 可以理解爲輸入流數據類型,本例中爲String(Kafka中的JSON日誌)
@param <ACC> The type of the accumulator (intermediate aggregate state). accumulator累加器的類別,本例中爲UserAuth,包含time_hour(小時標籤)和numbers(該小時的記錄數)
@param <OUT> The type of the aggregated result 聚合結果類別
public interface AggregateFunction<IN, ACC, OUT> extends Function, Serializable
SinkToMySQL.java
// Sink that upserts hourly counts into the MySQL table auth_time.
// Note: "implements SinkFunction<UserAuth>" is redundant — RichSinkFunction
// already implements SinkFunction.
public class SinkToMySQL extends RichSinkFunction<UserAuth> implements SinkFunction<UserAuth> {
// NOTE(review): these fields are static, so every sink subtask in the same
// JVM shares one connection and one set of prepared statements — with
// parallelism > 1 they could close or overwrite each other's state.
// Presumably tolerated here because the job uses a non-parallel windowAll;
// consider making them instance fields (getConnection(), being static,
// would also need to change).
private static PreparedStatement ps;
private static PreparedStatement selectps;
private static PreparedStatement updateps;
private static Connection connection = null;
/**
 * Opens the MySQL connection and prepares the select/insert/update
 * statements once, so invoke() does not have to create and release them
 * for every record.
 *
 * @param parameters Flink configuration (unused here)
 * @throws Exception if the connection or statement preparation fails;
 *                   rethrown so Flink fails the task immediately instead
 *                   of letting invoke() hit null statements later
 */
@Override
public void open(Configuration parameters) throws Exception {
    super.open(parameters);
    try {
        connection = getConnection();
        String selectSql = "select id,time_hour, numbers from auth_time where time_hour = ?;";
        selectps = connection.prepareStatement(selectSql);
        String sql = "insert into auth_time(time_hour, numbers,create_time,update_time) values(?, ?,?,?);";
        ps = connection.prepareStatement(sql);
        String updateSql = "update auth_time set numbers = ?,update_time = ? where time_hour = ?;";
        updateps = connection.prepareStatement(updateSql);
    } catch (Exception e) {
        System.out.println("sinkToMysql open is error " + e.getMessage());
        // Bug fix: the original swallowed this exception, leaving the
        // statements null and guaranteeing NPEs on the first invoke().
        throw e;
    }
}
/**
 * Loads the JDBC driver and opens a connection using the configured URL.
 *
 * @return the opened connection (also cached in the static field)
 * @throws RuntimeException if the driver cannot be loaded or the connection
 *         cannot be established; the original swallowed the error and
 *         returned null, deferring the failure to the first statement use
 */
private static Connection getConnection() {
    try {
        // Load the JDBC driver class (needed for pre-JDBC4 drivers).
        Class.forName(Property.getStrValue("mysql.classname"));
        String url = Property.getMysqlUrl();
        connection = DriverManager.getConnection(url);
        System.out.println("數據庫連接建立成功");
    } catch (Exception e) {
        System.out.println("-----------mysql get connection has exception , msg = " + e.getMessage());
        // Preserve the cause instead of returning null.
        throw new RuntimeException("mysql get connection failed", e);
    }
    return connection;
}
/** Releases the prepared statements, then the JDBC connection. */
@Override
public void close() throws Exception {
    super.close();
    try {
        // Close each statement that was actually created, in the same order
        // as before (insert, select, update), announcing each closure.
        PreparedStatement[] statements = {ps, selectps, updateps};
        String[] labels = {"關閉插入ps", "關閉查詢ps", "關閉更新ps"};
        for (int i = 0; i < statements.length; i++) {
            if (statements[i] != null) {
                System.out.println(labels[i]);
                statements[i].close();
            }
        }
        // Finally release the connection itself if it is still open.
        if (connection != null && !connection.isClosed()) {
            System.out.println("關閉數據庫連接");
            connection.close();
        }
    } catch (Exception e) {
        System.out.println("close mysql is error " + e.getMessage());
    }
}
/**
 * Called once per aggregated record. Upserts one hourly count: inserts a
 * new auth_time row for the hour, or, if a row already exists, raises its
 * count when the incoming count is larger (late-arriving window refires
 * can only grow the count, never shrink it).
 *
 * @param userAuth the hourly aggregate to persist
 * @param context  Flink sink context (unused)
 * @throws SQLException declared for JDBC resource handling
 */
@Override
public void invoke(UserAuth userAuth, Context context) throws SQLException {
    try {
        // Bug fix: the original contained a "reconnect" block that caught a
        // MySQL driver exception and did nothing but print — it never
        // reconnected and duplicated the setString call. Removed.
        selectps.setString(1, userAuth.getTime_hour());
        // try-with-resources guarantees the ResultSet is released even if
        // the insert/update below throws.
        try (ResultSet resultSet = selectps.executeQuery()) {
            if (!resultSet.next()) {
                // No row for this hour yet: insert a fresh one.
                int nowSeconds = (int) (System.currentTimeMillis() / 1000);
                ps.setString(1, userAuth.getTime_hour());
                ps.setLong(2, userAuth.getNumbers());
                ps.setInt(3, nowSeconds);
                ps.setInt(4, nowSeconds);
                System.out.println(ps);
                ps.execute();
            } else {
                System.out.println("已有該時段數據");
                long resultAuthNum = resultSet.getLong("numbers");
                // Only update when the incoming count is larger, so a stale
                // or duplicate firing cannot overwrite a bigger stored value.
                if (resultAuthNum < userAuth.getNumbers()) {
                    System.out.println("Add number");
                    updateps.setLong(1, userAuth.getNumbers());
                    updateps.setInt(2, (int) (System.currentTimeMillis() / 1000));
                    updateps.setString(3, userAuth.getTime_hour());
                    System.out.println(updateps);
                    updateps.execute();
                }
            }
        }
    } catch (Exception e) {
        // NOTE(review): errors are only logged, so a failed write is dropped
        // silently; acceptable for a demo, but consider rethrowing so Flink
        // retries via checkpoint recovery.
        System.err.println("invoke is error " + e.getMessage());
    }
}
解讀:此處爲自定義SinkFunction,繼承RichSinkFunction(其父類AbstractRichFunction是一個抽象類,實現了RichFunction接口)。
1、open方法,進行初始化;2、invoke方法,進行record輸出
測試:
模擬向kafka中生產13:00到21:00的數據,數據結構如下:
{
"subject_id": "test",
"subject_name": "test",
"resource_id": "test",
"client_ip": "234.215.14.137",
"timestamp": "2020-03-17T16:02:32+0800",
"ts": "1584432152"
}