Flume Custom Source: A MySQL Source

Scenario: one module of a company project migrates data between data sources. One of the scenarios is reading data from MySQL and writing it to other data sources. The architecture is Flume + Kafka, with Kafka acting as the Channel; custom Sources and Sinks move the data between the different data sources. Throughput is fairly good and no data is lost.

While organizing the project's knowledge points I decided to write it up as a blog post, drawing on some training-course material to sketch a simple custom MysqlSource.

The main points to consider:

1. Monitor data changes in a MySQL table in near real time;

2. Query data based on the offset and the maximum number of rows per query;

3. Wrap the rows into Events and write them to the Channel;

4. After a successful write, update the offset; on failure, roll back (rollback is not discussed here; there is not much to say about it on the Source side, it is mainly handled in the Sink).

Maven dependencies:

<dependency>
    <groupId>org.apache.flume</groupId>
    <artifactId>flume-ng-core</artifactId>
    <version>1.9.0</version>
</dependency>

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.27</version>
</dependency>

Add the assembly plugin in Maven for packaging:

<build>
    <plugins>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <archive>
                    <manifest>
                        <mainClass></mainClass>
                    </manifest>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

To keep the Source itself simple, the JDBC work is pulled into a MySQLSQLSourceHandler:

import org.apache.flume.Context;
import org.apache.flume.conf.ConfigurationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.sql.*;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

public class MySQLSQLSourceHandler {
    private static final Logger LOG = LoggerFactory.getLogger(MySQLSQLSourceHandler.class);
    private Integer runQueryDelay,   //interval between two consecutive queries
                    startFrom,       //id to start from
                    currentIndex,    //current id (used as the offset)
                    recordSize,      //number of rows returned by each query
                    maxRow;          //maximum number of rows per query

    private String table,                   //table to read from
                   columnsToSelect,         //columns the user wants to select
                   customQuery,             //custom query supplied by the user
                   query,                   //query that is actually built and executed
                   defaultCharsetResultSet; //result-set charset

    //context, used to read the flume job configuration
    private Context context;

    //default values for the parameters above; they can be overridden in the flume job configuration
    private static final Integer DEFAULT_QUERY_DELAY = 10000;
    private static final Integer DEFAULT_START_VALUE = 0;
    private static final Integer DEFAULT_MAX_ROWS = 5000;
    private static final String DEFAULT_COLUMNS_SELECT = "*";
    private static final String DEFAULT_CHARSET_RESULTSET = "UTF-8";

    private static Connection conn = null;
    private static PreparedStatement ps = null;
    private static String connectionURL,connectionUserName,connectionPassword;

    //load the JDBC settings once, when the class is loaded
    static {
        try {
            Properties prop = new Properties();
            prop.load(MySQLSQLSourceHandler
                    .class.getClassLoader()
                    .getResourceAsStream("jdbc.properties"));
            connectionURL = prop.getProperty("dbUrl");
            connectionUserName = prop.getProperty("dbUser");
            connectionPassword = prop.getProperty("dbPassword");
            Class.forName(prop.getProperty("dbDriver"));
        } catch (IOException | ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    //obtain a JDBC connection
    private static Connection InitConnection(String url,String user,String password){

        try {
            Connection conn = DriverManager.getConnection(url,user,password);
            if (conn == null)
                throw new SQLException();
            return conn;
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return null;
    }

    //constructor
    MySQLSQLSourceHandler(Context context) throws ParseException {
        //initialize the context
        this.context = context;

        //parameters with defaults: read from the flume job configuration, falling back to the defaults above
        this.columnsToSelect = context.getString("columns.to.select", DEFAULT_COLUMNS_SELECT);
        this.runQueryDelay = context.getInteger("run.query.delay", DEFAULT_QUERY_DELAY);
        this.startFrom = context.getInteger("start.from", DEFAULT_START_VALUE);
        this.maxRow = context.getInteger("max.row", DEFAULT_MAX_ROWS);
        this.defaultCharsetResultSet = context.getString("default.charset.resultset", DEFAULT_CHARSET_RESULTSET);

        //parameters without defaults: read from the flume job configuration
        //(these override the values loaded from jdbc.properties)
        this.table = context.getString("table");
        this.customQuery = context.getString("custom.query");
        connectionURL = context.getString("connection.url");
        connectionUserName = context.getString("connection.user");
        connectionPassword = context.getString("connection.password");
        conn = InitConnection(connectionURL, connectionUserName, connectionPassword);

        //validate the configuration; a missing parameter without a default throws a ConfigurationException
        checkMandatoryProperties();
        //fetch the current id (offset)
        currentIndex = getStatusDBIndex(startFrom);
        //build the query
        query = buildQuery();
    }

    //validate the configuration (table, query and connection parameters)
    private void checkMandatoryProperties() {
        if (table == null) {
            throw new ConfigurationException("property table not set");
        }
        if (connectionURL == null) {
            throw new ConfigurationException("connection.url property not set");
        }
        if (connectionUserName == null) {
            throw new ConfigurationException("connection.user property not set");
        }
        if (connectionPassword == null) {
            throw new ConfigurationException("connection.password property not set");
        }
    }

    //build the SQL statement
    private String buildQuery() {
        String sql;
        //fetch the current id (offset)
        currentIndex = getStatusDBIndex(startFrom);
        LOG.info(currentIndex + "");
        if (customQuery == null) {
            sql = "SELECT " + columnsToSelect + " FROM " + table;
        } else {
            sql = customQuery;
        }

        StringBuilder execSql = new StringBuilder(sql);
        //use the id column as the offset
        //(lower-case the SQL before checking, so an upper-case WHERE is detected too)
        if (!sql.toLowerCase().contains("where")) {
            execSql.append(" where ");
            execSql.append("id").append(">").append(currentIndex);
            execSql.append(" and id").append("<=").append(currentIndex + maxRow);
            LOG.info("execSql:" + execSql.toString());
            return execSql.toString();
        } else {
            //strip the existing where clause and replace it with the offset-based one
            String oldSql = execSql.toString();
            int num = KMPFunction.evaluate(oldSql, "where");
            String noWhere = oldSql.substring(0, num);
            execSql = new StringBuilder(noWhere);
            execSql.append(" where ");
            execSql.append("id").append(">").append(currentIndex);
            execSql.append(" and id").append("<=").append(currentIndex + maxRow);
            LOG.info("execSql:" + execSql.toString());
            return execSql.toString();
        }
    }

    //execute the query
    List<List<Object>> executeQuery() {
        try {
            //rebuild the SQL on every poll, because the offset (id) changes
            customQuery = buildQuery();
            //collection holding the result rows
            List<List<Object>> results = new ArrayList<>();
            //re-prepare the statement every time: the SQL text changes between polls,
            //and calling PreparedStatement.executeQuery(String) would throw a SQLException
            ps = conn.prepareStatement(customQuery);
            ResultSet result = ps.executeQuery();
            while (result.next()) {
                //one row of data (multiple columns)
                List<Object> row = new ArrayList<>();
                //copy the returned columns into the row
                for (int i = 1; i <= result.getMetaData().getColumnCount(); i++) {
                    row.add(result.getObject(i));
                }
                results.add(row);
            }
            LOG.info("execSql:" + customQuery + "\nresultSize:" + results.size());
            return results;
        } catch (SQLException e) {
            LOG.error(e.toString());
            //reconnect
            conn = InitConnection(connectionURL, connectionUserName, connectionPassword);
        }
        return null;
    }

    //convert the result set to strings: each row is a list, and each list is joined into one comma-separated string
    List<String> getAllRows(List<List<Object>> queryResult) {
        List<String> allRows = new ArrayList<>();
        if (queryResult == null || queryResult.isEmpty())
            return allRows;
        StringBuilder row = new StringBuilder();
        for (List<Object> rawRow : queryResult) {
            for (Object value : rawRow) {
                if (value == null) {
                    row.append(",");
                } else {
                    row.append(value.toString()).append(",");
                }
            }
            allRows.add(row.toString());
            row = new StringBuilder();
        }
        return allRows;
    }

    //update the offset metadata after each batch is delivered. The offset of every query must be
    //persisted so the job can resume where it left off after an interruption; id serves as the offset
    void updateOffset2DB(int size) {
        //source_tab is the key: insert if absent, update otherwise (one record per source table)
        String sql = "insert into flume_meta(source_tab,currentIndex) VALUES('"
                + this.table
                + "','" + (currentIndex += size)
                + "') on DUPLICATE key update source_tab=values(source_tab),currentIndex=values(currentIndex)";
        LOG.info("updateStatus Sql:" + sql);

        execSql(sql);
    }

    //execute a SQL statement
    private void execSql(String sql) {
        try {
            ps = conn.prepareStatement(sql);
            LOG.info("exec:" + sql);
            ps.execute();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    //fetch the current id offset
    private Integer getStatusDBIndex(int startFrom) {
        //read the current id for this table from flume_meta
        String dbIndex = queryOne("select currentIndex from flume_meta where source_tab='" + table + "'");
        if (dbIndex != null) {
            return Integer.parseInt(dbIndex);
        }
        //no record yet: either the first run or the meta table is still empty, so return the configured start value
        return startFrom;
    }

    //run a query that returns a single value (the current id)
    private String queryOne(String sql) {
        ResultSet result = null;
        try {
            ps = conn.prepareStatement(sql);
            result = ps.executeQuery();
            while (result.next()) {
                return result.getString(1);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return null;
    }

    //close the JDBC resources
    void close() {
        try {
            ps.close();
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    int getCurrentIndex() {
        return currentIndex;
    }

    void setCurrentIndex(int newValue) {
        currentIndex = newValue;
    }

    int getRunQueryDelay() {
        return runQueryDelay;
    }

    String getQuery() {
        return query;
    }

    String getConnectionURL() {
        return connectionURL;
    }

    private boolean isCustomQuerySet() {
        return (customQuery != null);
    }

    Context getContext() {
        return context;
    }

    public String getConnectionUserName() {
        return connectionUserName;
    }

    public String getConnectionPassword() {
        return connectionPassword;
    }

    String getDefaultCharsetResultSet() {
        return defaultCharsetResultSet;
    }


}
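
The static block loads jdbc.properties from the classpath (typically src/main/resources). A minimal example with the keys the handler actually reads; the values below are placeholders matching the test environment used later:

dbDriver=com.mysql.jdbc.Driver
dbUrl=jdbc:mysql://192.168.31.10:3306/mysqlsource
dbUser=root
dbPassword=root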

A small KMP helper is used to locate the where clause in the SQL:

import org.junit.Test;

public class KMPFunction{
    @Test
    public void test(){
        System.out.println(evaluate("abdsdsedabdwedsdweabe","sed"));
    }

    //returns the index of the first occurrence of subStr in fullStr (case-insensitive),
    //or 0 if fullStr is null or subStr is not found
    public static int evaluate(String fullStr,String subStr) {
        if(fullStr == null){
            return 0;
        }
        fullStr = fullStr.toLowerCase();
        return kmp(fullStr, subStr, kmpNext(subStr));
    }

    private static int kmp(String source, String dest, int[] next) {
        int num = 0;
        for (int i = 0, j = 0; i < source.length(); i++) {
            while (j > 0 && source.charAt(i) != dest.charAt(j)) {
                j = next[j - 1];
            }
            if (source.charAt(i) == dest.charAt(j)) {
                j++;
            }
            if (j == dest.length()) {
                num = i - j + 1;
                break;
            }
        }
        return num;
    }

    /**
     * Partial match (failure) table of the pattern, i.e. the pattern run against itself with KMP
     * @param dest the pattern
     * @return the next table
     */
    private static int[] kmpNext(String dest) {
        int[] next = new int[dest.length()];
        next[0] = 0;
        for (int i = 1, j = 0; i < dest.length(); i++) {
            while (j > 0 && dest.charAt(i) != dest.charAt(j)) {
                j = next[j - 1];
            }
            if (dest.charAt(i) == dest.charAt(j)) {
                j++;
            }
            next[i] = j;
        }
        return next;
    }
    
}
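
As a quick sanity check of what buildQuery relies on: evaluate lower-cases the full string internally and returns the start index of the first match. For example:

// returns 22, the index where "where" starts in the lower-cased SQL
int pos = KMPFunction.evaluate("SELECT * FROM student WHERE id > 0", "where");
// buildQuery keeps everything before that index and appends its own offset clause
String base = "SELECT * FROM student WHERE id > 0".substring(0, pos); // "SELECT * FROM student "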

Next comes the custom MySQLSource itself:

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.PollableSource;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.SimpleEvent;
import org.apache.flume.source.AbstractSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

public class MySQLSource extends AbstractSource implements Configurable, PollableSource {
    //logger
    private static final Logger LOG = LoggerFactory.getLogger(MySQLSource.class);
    //the JDBC handler defined above
    private MySQLSQLSourceHandler sqlSourceHandler;

    @Override
    public void configure(Context context) {
        try {
            //initialize the handler
            sqlSourceHandler = new MySQLSQLSourceHandler(context);
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }


    @Override
    public Status process() throws EventDeliveryException {
        try {
            //query the source table
            List<List<Object>> result = sqlSourceHandler.executeQuery();
            //events to deliver
            List<Event> events = new ArrayList<>();
            //shared (empty) event header map
            HashMap<String, String> header = new HashMap<>();
            //if rows came back, wrap them in events
            //(executeQuery returns null when the query failed, so guard against that as well)
            if (result != null && !result.isEmpty()) {
                List<String> allRows = sqlSourceHandler.getAllRows(result);
                Event event;
                for (String row : allRows) {
                    event = new SimpleEvent();
                    event.setBody(row.getBytes());
                    event.setHeaders(header);
                    events.add(event);
                }
                //write the events to the Channel
                this.getChannelProcessor().processEventBatch(events);
                //advance the offset in the meta table
                LOG.info("offset:" + result.size());
                sqlSourceHandler.updateOffset2DB(result.size());
            }
            //wait before the next poll
            Thread.sleep(sqlSourceHandler.getRunQueryDelay());
            return Status.READY;
        } catch (InterruptedException e) {
            LOG.error("Error processing row", e);
            return Status.BACKOFF;
        }
    }

    @Override
    public synchronized void stop(){
        LOG.info("Stopping sql source {} ...",getName());
        try{
            //close the handler's JDBC resources
            sqlSourceHandler.close();
        } finally {
          super.stop();
        }
    }


    //backoff timing used by the PollableSourceRunner when process() returns BACKOFF
    @Override
    public long getBackOffSleepIncrement() {
        return 0;
    }

    @Override
    public long getMaxBackOffSleepInterval() {
        return 0;
    }


}

Package it with mvn package and upload the jar to $FLUME_HOME/lib.
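
For example (the exact jar name depends on the artifactId and version in your pom.xml):

mvn clean package
# the assembly plugin produces target/<artifactId>-<version>-jar-with-dependencies.jar
cp target/*-jar-with-dependencies.jar /home/hadoop/apps/flume-1.9.0/lib/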

Write MysqlSource.conf (the connection.* values here override whatever the static block loaded from jdbc.properties):

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = com.zyz.flume.code.MySQLSource
a1.sources.r1.connection.url = jdbc:mysql://192.168.31.10:3306/mysqlsource
a1.sources.r1.connection.user = root
a1.sources.r1.connection.password = root
a1.sources.r1.max.row = 2000
a1.sources.r1.table = student
a1.sources.r1.columns.to.select = *
#a1.sources.r1.incremental.column.name = id
#a1.sources.r1.incremental.value = 0
a1.sources.r1.run.query.delay=5000

# Describe the sink
a1.sinks.k1.type = logger

# Describe the channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
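
For the production setup described at the beginning, the memory channel above would be swapped for a Kafka channel. A minimal sketch, assuming a broker at 192.168.31.10:9092 and a topic named flume-channel (both placeholders):

a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = 192.168.31.10:9092
a1.channels.c1.kafka.topic = flume-channel
a1.channels.c1.kafka.consumer.group.id = flume-mysql-source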

MySQL DDL and seed data:

CREATE DATABASE mysqlsource;

CREATE TABLE `student` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(255) NOT NULL,
PRIMARY KEY (`id`)
);

CREATE TABLE `flume_meta` (
`source_tab` varchar(255) NOT NULL,
`currentIndex` varchar(255) NOT NULL,
PRIMARY KEY (`source_tab`)
);

insert into `student` (`id`, `name`) values('1','zhangsan');
insert into `student` (`id`, `name`) values('2','lisi');
insert into `student` (`id`, `name`) values('3','wangwu');
insert into `student` (`id`, `name`) values('4','zhaoliu');
insert into `student` (`id`, `name`) values('5','xiaoming');
insert into `student` (`id`, `name`) values('6','xiaoliang');
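
After the source reads these six rows in its first batch, updateOffset2DB issues an upsert along these lines against flume_meta (currentIndex advances from 0 to 6):

INSERT INTO flume_meta (source_tab, currentIndex) VALUES ('student', '6')
ON DUPLICATE KEY UPDATE source_tab = VALUES(source_tab), currentIndex = VALUES(currentIndex);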

A small script to make testing easier (it uses Ganglia; if Ganglia is not configured, delete the last two lines):


#!/bin/bash

read -p "enter agent name: " AGENT_NAME
if [ -z "$AGENT_NAME" ]; then
  echo "Error"
  exit 1
fi

read -p "enter job config file name: " FILE_NAME
if [ -z "$FILE_NAME" ]; then
  echo "Error"
  exit 1
fi

/home/hadoop/apps/flume-1.9.0/bin/flume-ng agent \
-c /home/hadoop/apps/flume-1.9.0/conf/ \
-n "$AGENT_NAME" \
-f "/home/hadoop/apps/flume-1.9.0/job/$FILE_NAME" \
-Dflume.root.logger=INFO,console \
-Dflume.monitoring.type=ganglia \
-Dflume.monitoring.hosts=192.168.31.10:8649

Run the script and check the results.
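
Assuming the script is saved as start-flume.sh (the name is arbitrary), a test run looks like this; the logger sink then prints each row as an Event on the console:

$ ./start-flume.sh
enter agent name: a1
enter job config file name: MysqlSource.conf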
