[canal源碼解析]之esAdapter etl功能

在上一篇使用canal client-adapter完成mysql到es數據同步教程(包括全量和增量)編輯的時候看到了esAdapter中對於etl功能的代碼,由於之前自己也寫過類似的功能點,爲此這裏我打算再看下阿里的大佬是如何寫全量同步代碼的,作爲學習與借鑑

CommonRest

etl類的入口controller類爲:com.alibaba.otter.canal.adapter.launcher.rest.CommonRest

package com.alibaba.otter.canal.adapter.launcher.rest;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.annotation.PostConstruct;
import javax.annotation.Resource;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.PutMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import com.alibaba.otter.canal.adapter.launcher.common.EtlLock;
import com.alibaba.otter.canal.adapter.launcher.common.SyncSwitch;
import com.alibaba.otter.canal.adapter.launcher.config.AdapterCanalConfig;
import com.alibaba.otter.canal.client.adapter.OuterAdapter;
import com.alibaba.otter.canal.client.adapter.support.EtlResult;
import com.alibaba.otter.canal.client.adapter.support.ExtensionLoader;
import com.alibaba.otter.canal.client.adapter.support.Result;

/**
 * REST endpoints for operating the adapters: triggering a full import (ETL), counting
 * rows, listing destinations and toggling the per-destination sync switch.
 *
 * @author rewerma @ 2018-10-20
 * @version 1.0.0
 */
@RestController
public class CommonRest {

    private static final Logger           logger           = LoggerFactory.getLogger(CommonRest.class);

    private static final String           ETL_LOCK_ZK_NODE = "/sync-etl/";

    private ExtensionLoader<OuterAdapter> loader;

    @Resource
    private SyncSwitch                    syncSwitch;
    @Resource
    private EtlLock                       etlLock;

    @Resource
    private AdapterCanalConfig            adapterCanalConfig;

    /**
     * Obtains the extension loader used to look up {@link OuterAdapter} instances.
     */
    @PostConstruct
    public void init() {
        loader = ExtensionLoader.getExtensionLoader(OuterAdapter.class);
    }

    /**
     * Runs a full import for one task.
     * <p>
     * Example: curl http://127.0.0.1:8081/etl/rdb/oracle1/mytest_user.yml -X POST
     * <p>
     * The import runs under an ETL lock keyed by adapter type plus destination (or the task
     * name when the task has no destination). While it runs, the sync switch for that key
     * is turned off; it is turned back on afterwards only if it was on before.
     *
     * @param type adapter type, e.g. hbase, es
     * @param key adapter key
     * @param task task name, i.e. the mapping file name such as mytest_user.yml
     * @param params values for the ETL where-condition, separated by ';'; when absent
     *            everything is imported
     * @return the adapter's import result, or a failed result when the lock is already held
     */
    @PostMapping("/etl/{type}/{key}/{task}")
    public EtlResult etl(@PathVariable String type, @PathVariable String key, @PathVariable String task,
                         @RequestParam(name = "params", required = false) String params) {
        OuterAdapter adapter = loader.getExtension(type, key);
        String destination = adapter.getDestination(task);
        // The task name stands in for the destination when the adapter reports none
        String targetKey = destination == null ? task : destination;
        String lockPath = ETL_LOCK_ZK_NODE + type + "-" + targetKey;

        if (!etlLock.tryLock(lockPath)) {
            EtlResult result = new EtlResult();
            result.setSucceeded(false);
            result.setErrorMessage(task + " 有其他進程正在導入中, 請稍後再試");
            return result;
        }
        try {
            // Remember the switch state so that a switch that was already off stays off
            boolean oriSwitchStatus = syncSwitch.status(targetKey);
            if (oriSwitchStatus) {
                syncSwitch.off(targetKey);
            }
            try {
                List<String> paramArray = params == null ? null : Arrays.asList(params.trim().split(";"));
                return adapter.etl(task, paramArray);
            } finally {
                if (oriSwitchStatus) {
                    syncSwitch.on(targetKey);
                }
            }
        } finally {
            etlLock.unlock(lockPath);
        }
    }

    /**
     * Runs a full import for one task of an adapter that has no key.
     * <p>
     * Example: curl http://127.0.0.1:8081/etl/hbase/mytest_person2.yml -X POST
     *
     * @param type adapter type, e.g. hbase, es
     * @param task task name, i.e. the mapping file name such as mytest_person2.yml
     * @param params values for the ETL where-condition, separated by ';'; when absent
     *            everything is imported
     * @return see {@link #etl(String, String, String, String)}
     */
    @PostMapping("/etl/{type}/{task}")
    public EtlResult etl(@PathVariable String type, @PathVariable String task,
                         @RequestParam(name = "params", required = false) String params) {
        return etl(type, null, task, params);
    }

    /**
     * Returns the count information the adapter reports for a task.
     * <p>
     * Example: curl http://127.0.0.1:8081/count/rdb/oracle1/mytest_user.yml
     *
     * @param type adapter type, e.g. hbase, es
     * @param key adapter key
     * @param task task name, i.e. the mapping file name such as mytest_person2.yml
     * @return whatever {@code OuterAdapter.count(task)} returns
     */
    @GetMapping("/count/{type}/{key}/{task}")
    public Map<String, Object> count(@PathVariable String type, @PathVariable String key, @PathVariable String task) {
        OuterAdapter adapter = loader.getExtension(type, key);
        return adapter.count(task);
    }

    /**
     * Returns the count information for a task of an adapter that has no key.
     * <p>
     * Example: curl http://127.0.0.1:8081/count/hbase/mytest_person2.yml
     *
     * @param type adapter type, e.g. hbase, es
     * @param task task name, i.e. the mapping file name such as mytest_person2.yml
     * @return see {@link #count(String, String, String)}
     */
    @GetMapping("/count/{type}/{task}")
    public Map<String, Object> count(@PathVariable String type, @PathVariable String task) {
        return count(type, null, task);
    }

    /**
     * Lists every configured destination together with its sync switch state.
     * <p>
     * Example: curl http://127.0.0.1:8081/destinations
     *
     * @return one map per destination with the keys "destination" and "status" ("on"/"off")
     */
    @GetMapping("/destinations")
    public List<Map<String, String>> destinations() {
        List<Map<String, String>> result = new ArrayList<>();
        Set<String> destinations = adapterCanalConfig.DESTINATIONS;
        for (String destination : destinations) {
            Map<String, String> resMap = new LinkedHashMap<>();
            resMap.put("destination", destination);
            resMap.put("status", switchLabel(syncSwitch.status(destination)));
            result.add(resMap);
        }
        return result;
    }

    /**
     * Turns the sync switch of a destination on or off.
     * <p>
     * Example: curl http://127.0.0.1:8081/syncSwitch/example/off -X PUT
     *
     * @param destination destination (instance) name
     * @param status requested state: "on" or "off"
     * @return a success result, or a result with code 50000 for any other status value
     */
    @PutMapping("/syncSwitch/{destination}/{status}")
    public Result etl(@PathVariable String destination, @PathVariable String status) {
        if ("on".equals(status)) {
            syncSwitch.on(destination);
            logger.info("#Destination: {} sync on", destination);
            return Result.createSuccess("實例: " + destination + " 開啓同步成功");
        } else if ("off".equals(status)) {
            syncSwitch.off(destination);
            logger.info("#Destination: {} sync off", destination);
            return Result.createSuccess("實例: " + destination + " 關閉同步成功");
        } else {
            Result result = new Result();
            result.setCode(50000);
            result.setMessage("實例: " + destination + " 操作失敗");
            return result;
        }
    }

    /**
     * Returns the sync switch state of a destination.
     * <p>
     * Example: curl http://127.0.0.1:8081/syncSwitch/example
     *
     * @param destination destination (instance) name
     * @return a map holding the state ("on"/"off") under "status" and, for backward
     *         compatibility, under the historical misspelled key "stauts"
     */
    @GetMapping("/syncSwitch/{destination}")
    public Map<String, String> etl(@PathVariable String destination) {
        String resStatus = switchLabel(syncSwitch.status(destination));
        Map<String, String> res = new LinkedHashMap<>();
        // "stauts" is a typo that existing clients may already read, so it is kept
        res.put("stauts", resStatus);
        res.put("status", resStatus);
        return res;
    }

    /** Maps a switch state to the label used in responses. */
    private static String switchLabel(boolean on) {
        return on ? "on" : "off";
    }
}

這裏我們只需關注如下這個方法,其中我加了一點註釋

    /**
     * Runs a full import for one task.
     * <p>
     * Example: curl http://127.0.0.1:8081/etl/rdb/oracle1/mytest_user.yml -X POST
     *
     * @param type   adapter type, e.g. hbase, es
     * @param key    adapter key
     * @param task   task name, i.e. the mapping file name such as mytest_user.yml
     * @param params values for the ETL where-condition; when absent everything is imported
     */
    @PostMapping("/etl/{type}/{key}/{task}")
    public EtlResult etl(@PathVariable String type, @PathVariable String key, @PathVariable String task,
                         @RequestParam(name = "params", required = false) String params) {
        // Look up the adapter instance by type and key
        OuterAdapter adapter = loader.getExtension(type, key);
        // Ask the adapter which destination the task belongs to
        String destination = adapter.getDestination(task);
        // Without a destination the task name itself is used, both for the lock and the switch
        String targetKey = destination != null ? destination : task;
        // Lock name: /sync-etl/ + type + "-" + destination (or task)
        String lockPath = ETL_LOCK_ZK_NODE + type + "-" + targetKey;

        if (!etlLock.tryLock(lockPath)) {
            // The lock is already held: report the failure instead of waiting
            EtlResult busy = new EtlResult();
            busy.setSucceeded(false);
            busy.setErrorMessage(task + " 有其他進程正在導入中, 請稍後再試");
            return busy;
        }
        try {
            // Record the switch state and turn the switch off for the duration of the import
            boolean wasOn = syncSwitch.status(targetKey);
            if (wasOn) {
                syncSwitch.off(targetKey);
            }
            try {
                // Several parameters are passed in one string separated by ';'
                List<String> paramArray = params == null ? null : Arrays.asList(params.trim().split(";"));
                return adapter.etl(task, paramArray);
            } finally {
                // Turn the switch back on only if it was on when the import started
                if (wasOn) {
                    syncSwitch.on(targetKey);
                }
            }
        } finally {
            // Always release the ETL lock
            etlLock.unlock(lockPath);
        }
    }

etlLock(重入鎖或curator實現的分佈式鎖)

此方法中的etlLock是一個自定義的EtlLock類,它會根據當前有無zookeeper自動選擇對應的鎖:如果有zookeeper則會使用curator作爲分佈式鎖,如果沒有(即單機環境)則會採用ReentrantLock作爲鎖

其中etlLock在初始化時會來進行判斷並確定當前的環境:

    /**
     * Picks the lock mode: distributed when the curator client provides a
     * {@code CuratorFramework}, local otherwise.
     */
    @PostConstruct
    public void init() {
        boolean curatorAvailable = curatorClient.getCurator() != null;
        mode = curatorAvailable ? Mode.DISTRIBUTED : Mode.LOCAL;
    }

上面的代碼中還有一處值得關注的地方,注意到關鍵字:syncSwitch

在此controller中,還有一處用到了syncSwitch的代碼爲destinations的請求代碼,它會返回所有destination當前的狀態:

    /**
     * Lists every configured destination together with its sync switch state.
     * <p>
     * Example: curl http://127.0.0.1:8081/destinations
     *
     * @return one map per destination with the keys "destination" and "status" ("on"/"off")
     */
    @GetMapping("/destinations")
    public List<Map<String, String>> destinations() {
        List<Map<String, String>> rows = new ArrayList<>();
        for (String name : adapterCanalConfig.DESTINATIONS) {
            String label = syncSwitch.status(name) ? "on" : "off";
            Map<String, String> row = new LinkedHashMap<>();
            row.put("destination", name);
            row.put("status", label);
            rows.add(row);
        }
        return rows;
    }

syncSwitch在這裏代表每個destination的當前狀態,其值可以理解爲當前destination是否可以進行操作,on代表可以進行操作(on 門是打開的),off代表當前不可以進行操作(off 門是關着的)

其SyncSwitch的實現原理與etlLock的實現原理有類似之處,它也會根據當前的環境來選擇用何種方式進行實現

    /**
     * Registers one {@code BooleanMutex}, initially {@code true}, per configured destination.
     * Without a curator client the mutexes go into the local map; with one, each mutex is
     * additionally passed to {@code initMutex} and {@code startListen} and stored in the
     * distributed map.
     */
    @PostConstruct
    public void init() {
        CuratorFramework curator = curatorClient.getCurator();
        if (curator == null) {
            // Local mode: in-memory mutexes only
            mode = Mode.LOCAL;
            LOCAL_LOCK.clear();
            for (String name : adapterCanalConfig.DESTINATIONS) {
                LOCAL_LOCK.put(name, new BooleanMutex(true));
            }
            return;
        }

        mode = Mode.DISTRIBUTED;
        DISTRIBUTED_LOCK.clear();
        for (String name : adapterCanalConfig.DESTINATIONS) {
            // Initialise the mutex, register it, then start listening for this destination
            BooleanMutex destinationMutex = new BooleanMutex(true);
            initMutex(curator, name, destinationMutex);
            DISTRIBUTED_LOCK.put(name, destinationMutex);
            startListen(name, destinationMutex);
        }
    }

其中BooleanMutex是一個基於AQS的實現,其中的set方法中的innerSetTrue和innerSetFalse可以參考下面Sync的代碼:

    /**
     * Sets the mutex to the given value by delegating to the internal {@code Sync}.
     *
     * @param mutex the new value; {@code true} calls {@code innerSetTrue()},
     *            {@code false} calls {@code innerSetFalse()}
     */
    public void set(Boolean mutex) {
        if (!mutex) {
            sync.innerSetFalse();
            return;
        }
        sync.innerSetTrue();
    }

BooleanMutex.Sync(基於AQS實現的互斥鎖)

Sync的代碼:

/**
     * Synchronization control for BooleanMutex. Uses AQS sync state to
     * represent run status
     */
    private final class Sync extends AbstractQueuedSynchronizer {

        private static final long serialVersionUID = 2559471934544126329L;
        /** AQS state value meaning TRUE */
        private static final int  TRUE             = 1;
        /** AQS state value meaning FALSE */
        private static final int  FALSE            = 2;

        /** Tells whether the given AQS state has the TRUE bit set. */
        private boolean isTrue(int value) {
            return (value & TRUE) != 0;
        }

        /**
         * AQS hook for shared acquisition. The argument is ignored; only the current
         * state decides.
         *
         * @return 1 (acquired) while the state is TRUE, -1 (caller must queue and wait)
         *         otherwise
         */
        @Override
        protected int tryAcquireShared(int state) {
            if (isTrue(getState())) {
                return 1;
            }
            return -1;
        }

        /**
         * AQS hook for shared release.
         *
         * @return always {@code true}, i.e. a release is always permitted
         */
        @Override
        protected boolean tryReleaseShared(int ignore) {
            return true;
        }

        /** Returns whether the current state is TRUE. */
        boolean innerState() {
            int current = getState();
            return isTrue(current);
        }

        /** Blocks, interruptibly, until the shared acquisition succeeds. */
        void innerGet() throws InterruptedException {
            acquireSharedInterruptibly(0);
        }

        /**
         * Like {@link #innerGet()} but gives up after the given time.
         *
         * @param nanosTimeout maximum time to wait, in nanoseconds
         * @throws TimeoutException when the acquisition did not succeed in time
         */
        void innerGet(long nanosTimeout) throws InterruptedException, TimeoutException {
            boolean acquired = tryAcquireSharedNanos(0, nanosTimeout);
            if (!acquired) {
                throw new TimeoutException();
            }
        }

        /**
         * Switches the state to TRUE. Nothing happens when it already is TRUE; otherwise
         * the CAS is retried until it succeeds and a shared release follows to wake up
         * blocked threads.
         */
        void innerSetTrue() {
            int observed;
            do {
                observed = getState();
                if (observed == TRUE) {
                    return; // already TRUE, nothing to do
                }
                // CAS guards against concurrent updates of the state
            } while (!compareAndSetState(observed, TRUE));
            releaseShared(0); // wake up the threads blocked in innerGet
        }

        /**
         * Switches the state to FALSE. Nothing happens when it already is FALSE; otherwise
         * the CAS is retried until it succeeds. No release is performed.
         */
        void innerSetFalse() {
            int observed;
            do {
                observed = getState();
                if (observed == FALSE) {
                    return; // already FALSE, nothing to do
                }
                // CAS guards against concurrent updates of the state
            } while (!compareAndSetState(observed, FALSE));
        }

    }

多線程執行數據同步

上面的代碼主要是對執行etl請求controller部分的代碼,接下來看下具體執行數據同步的細節實現

主要關注AbstractEtlService的protected EtlResult importData(String sql, List<String> params)方法

    /**
     * Imports all rows selected by {@code sql} through {@code executeSqlImport}.
     * <p>
     * The rows are counted first. With fewer than 10000 rows the import runs on the calling
     * thread; otherwise the query is split into {@code LIMIT offset,size} pages of
     * {@code CNT_PER_TASK} rows that are processed by a fixed thread pool sized to the
     * number of available processors.
     *
     * @param sql the select statement whose rows are imported
     * @param params values bound to the placeholders of the mapping's ETL condition; the
     *            condition is only appended when both it and {@code params} are non-null
     * @return the result; marked succeeded only when no error message was collected
     */
    protected EtlResult importData(String sql, List<String> params) {
        EtlResult etlResult = new EtlResult();
        AtomicLong impCount = new AtomicLong();
        // Shared with the worker tasks below, which may append concurrently, so the list
        // has to be thread-safe
        List<String> errMsg = new java.util.concurrent.CopyOnWriteArrayList<>();
        if (config == null) {
            logger.warn("{} mapping config is null, etl go end ", type);
            etlResult.setErrorMessage(type + "mapping config is null, etl go end ");
            return etlResult;
        }

        long start = System.currentTimeMillis();
        try {
            DruidDataSource dataSource = DatasourceConfig.DATA_SOURCES.get(config.getDataSourceKey());

            List<Object> values = new ArrayList<>();
            // Append the ETL condition, turning its {} placeholders into JDBC ? markers
            if (config.getMapping().getEtlCondition() != null && params != null) {
                String etlCondition = config.getMapping().getEtlCondition();
                for (String param : params) {
                    etlCondition = etlCondition.replace("{}", "?");
                    values.add(param);
                }

                sql += " " + etlCondition;
            }

            if (logger.isDebugEnabled()) {
                logger.debug("etl sql : {}", sql);
            }

            // Count the rows to decide between single- and multi-threaded import
            String countSql = "SELECT COUNT(1) FROM ( " + sql + ") _CNT ";
            long cnt = (Long) Util.sqlRS(dataSource, countSql, values, rs -> {
                Long count = null;
                try {
                    if (rs.next()) {
                        count = ((Number) rs.getObject(1)).longValue();
                    }
                } catch (Exception e) {
                    logger.error(e.getMessage(), e);
                }
                return count == null ? 0L : count;
            });

            // Use multiple threads once there are at least 10000 rows
            if (cnt >= 10000) {
                int threadCount = Runtime.getRuntime().availableProcessors();

                long size = CNT_PER_TASK;
                // Number of pages: cnt / size, rounded up
                long workerCnt = cnt / size + (cnt % size == 0 ? 0 : 1);

                if (logger.isDebugEnabled()) {
                    logger.debug("workerCnt {} for cnt {} threadCount {}", workerCnt, cnt, threadCount);
                }

                ExecutorService executor = Util.newFixedThreadPool(threadCount, 5000L);
                try {
                    List<Future<Boolean>> futures = new ArrayList<>();
                    for (long i = 0; i < workerCnt; i++) {
                        long offset = size * i;
                        String sqlFinal = sql + " LIMIT " + offset + "," + size;
                        Future<Boolean> future = executor.submit(() -> executeSqlImport(dataSource,
                            sqlFinal,
                            values,
                            config.getMapping(),
                            impCount,
                            errMsg));
                        futures.add(future);
                    }

                    // Wait for every page to finish
                    for (Future<Boolean> future : futures) {
                        future.get();
                    }
                } finally {
                    // Shut the pool down even when waiting fails, so its threads are not leaked
                    executor.shutdown();
                }
            } else {
                executeSqlImport(dataSource, sql, values, config.getMapping(), impCount, errMsg);
            }

            logger.info("數據全量導入完成, 一共導入 {} 條數據, 耗時: {}", impCount.get(), System.currentTimeMillis() - start);
            etlResult.setResultMessage("導入" + type + " 數據:" + impCount.get() + " 條");
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Keep the interrupt visible to the caller after it has been reported
                Thread.currentThread().interrupt();
            }
            logger.error(e.getMessage(), e);
            errMsg.add(type + " 數據導入異常 =>" + e.getMessage());
        }
        if (errMsg.isEmpty()) {
            etlResult.setSucceeded(true);
        } else {
            etlResult.setErrorMessage(Joiner.on("\n").join(errMsg));
        }
        return etlResult;
    }

從代碼可以看出,在執行數據同步前程序會先統計出需要同步數據的總數,如果總數大於等於10000條,則會根據當前系統的可用處理器數來創建一個固定大小的線程池,採用分批的方式進行查詢,也就是如下的代碼:

            // Use multiple threads once there are at least 10000 rows
            if (cnt >= 10000) {
                int threadCount = Runtime.getRuntime().availableProcessors();
                // Rows fetched per query (CNT_PER_TASK)
                long size = CNT_PER_TASK;
                // Number of tasks: cnt / size, plus one more when the division leaves a remainder
                long workerCnt = cnt / size + (cnt % size == 0 ? 0 : 1);

                if (logger.isDebugEnabled()) {
                    logger.debug("workerCnt {} for cnt {} threadCount {}", workerCnt, cnt, threadCount);
                }

                ExecutorService executor = Util.newFixedThreadPool(threadCount, 5000L);
                try {
                    List<Future<Boolean>> futures = new ArrayList<>();
                    for (long i = 0; i < workerCnt; i++) {
                        // Start offset of this page
                        long offset = size * i;
                        // Each task fetches one page of `size` rows
                        String sqlFinal = sql + " LIMIT " + offset + "," + size;
                        Future<Boolean> future = executor.submit(() -> executeSqlImport(dataSource,
                                sqlFinal,
                                values,
                                config.getMapping(),
                                impCount,
                                errMsg));
                        futures.add(future);
                    }

                    // Wait for every page to finish
                    for (Future<Boolean> future : futures) {
                        future.get();
                    }
                } finally {
                    // Shut the pool down even when waiting fails, so its threads are not leaked
                    executor.shutdown();
                }
            }

再來關注下executeSqlImport方法:

/**
 * Runs {@code sql} through {@code Util.sqlRS} and turns every row of the result set into
 * bulk actions that are sent to Elasticsearch in batches.
 *
 * @param ds data source the query is executed against
 * @param sql the select statement to run
 * @param values values bound to the statement's placeholders
 * @param adapterMapping mapping configuration; cast to {@code ESMapping}
 * @param impCount counter incremented once per processed row
 * @param errMsg receives a message when processing the result set fails
 * @return {@code true} when the query and import completed, {@code false} when an
 *         exception was thrown
 */
protected boolean executeSqlImport(DataSource ds, String sql, List<Object> values,
                                       AdapterConfig.AdapterMapping adapterMapping, AtomicLong impCount,
                                       List<String> errMsg) {
        try {
            ESMapping mapping = (ESMapping) adapterMapping;
            Util.sqlRS(ds, sql, values, rs -> {
                int count = 0;
                try {
                    ESBulkRequest esBulkRequest = this.esConnection.new ESBulkRequest();

                    long batchBegin = System.currentTimeMillis();
                    while (rs.next()) {
                        // Collect the document fields of the current row
                        Map<String, Object> esFieldData = new LinkedHashMap<>();
                        Object idVal = null;
                        for (FieldItem fieldItem : mapping.getSchemaItem().getSelectFields().values()) {

                            String fieldName = fieldItem.getFieldName();
                            // Fields listed in the mapping's skips are left out
                            if (mapping.getSkips().contains(fieldName)) {
                                continue;
                            }

                            // The _id field is kept aside as the document id and not put into the document body
                            if (fieldItem.getFieldName().equals(mapping.get_id())) {
                                idVal = esTemplate.getValFromRS(mapping, rs, fieldName, fieldName);
                            } else {
                                Object val = esTemplate.getValFromRS(mapping, rs, fieldName, fieldName);
                                esFieldData.put(Util.cleanColumn(fieldName), val);
                            }

                        }

                        // Build one relation object per configured relation field
                        if (!mapping.getRelations().isEmpty()) {
                            mapping.getRelations().forEach((relationField, relationMapping) -> {
                                Map<String, Object> relations = new HashMap<>();
                                relations.put("name", relationMapping.getName());
                                if (StringUtils.isNotEmpty(relationMapping.getParent())) {
                                    FieldItem parentFieldItem = mapping.getSchemaItem()
                                        .getSelectFields()
                                        .get(relationMapping.getParent());
                                    Object parentVal;
                                    try {
                                        parentVal = esTemplate.getValFromRS(mapping,
                                            rs,
                                            parentFieldItem.getFieldName(),
                                            parentFieldItem.getFieldName());
                                    } catch (SQLException e) {
                                        // A lambda cannot throw the checked exception, so wrap it
                                        throw new RuntimeException(e);
                                    }
                                    if (parentVal != null) {
                                        relations.put("parent", parentVal.toString());
                                        // Temporary entry; removed again below and used as the routing value
                                        esFieldData.put("$parent_routing", parentVal.toString());

                                    }
                                }
                                esFieldData.put(Util.cleanColumn(relationField), relations);
                            });
                        }

                        if (idVal != null) {
                            // Document id known: add a single upsert or index action
                            String parentVal = (String) esFieldData.remove("$parent_routing");
                            if (mapping.isUpsert()) {
                                ESUpdateRequest esUpdateRequest = this.esConnection.new ESUpdateRequest(
                                    mapping.get_index(),
                                    mapping.get_type(),
                                    idVal.toString()).setDoc(esFieldData).setDocAsUpsert(true);

                                if (StringUtils.isNotEmpty(parentVal)) {
                                    esUpdateRequest.setRouting(parentVal);
                                }

                                esBulkRequest.add(esUpdateRequest);
                            } else {
                                ESIndexRequest esIndexRequest = this.esConnection.new ESIndexRequest(mapping
                                    .get_index(), mapping.get_type(), idVal.toString()).setSource(esFieldData);
                                if (StringUtils.isNotEmpty(parentVal)) {
                                    esIndexRequest.setRouting(parentVal);
                                }
                                esBulkRequest.add(esIndexRequest);
                            }
                        } else {
                            // No _id value: search by the pk field and add an update for every hit
                            idVal = esFieldData.get(mapping.getPk());
                            ESSearchRequest esSearchRequest = this.esConnection.new ESSearchRequest(mapping.get_index(),
                                mapping.get_type()).setQuery(QueryBuilders.termQuery(mapping.getPk(), idVal))
                                    .size(10000);
                            SearchResponse response = esSearchRequest.getResponse();
                            for (SearchHit hit : response.getHits()) {
                                ESUpdateRequest esUpdateRequest = this.esConnection.new ESUpdateRequest(mapping
                                    .get_index(), mapping.get_type(), hit.getId()).setDoc(esFieldData);
                                esBulkRequest.add(esUpdateRequest);
                            }
                        }

                        // Send the bulk request whenever the number of pending actions is a
                        // positive multiple of the configured commit batch
                        if (esBulkRequest.numberOfActions() % mapping.getCommitBatch() == 0
                            && esBulkRequest.numberOfActions() > 0) {
                            long esBatchBegin = System.currentTimeMillis();
                            BulkResponse rp = esBulkRequest.bulk();
                            if (rp.hasFailures()) {
                                this.processFailBulkResponse(rp);
                            }

                            if (logger.isTraceEnabled()) {
                                logger.trace("全量數據批量導入批次耗時: {}, es執行時間: {}, 批次大小: {}, index; {}",
                                    (System.currentTimeMillis() - batchBegin),
                                    (System.currentTimeMillis() - esBatchBegin),
                                    esBulkRequest.numberOfActions(),
                                    mapping.get_index());
                            }
                            batchBegin = System.currentTimeMillis();
                            esBulkRequest.resetBulk();
                        }
                        count++;
                        impCount.incrementAndGet();
                    }

                    // Send whatever is still pending after the last row
                    if (esBulkRequest.numberOfActions() > 0) {
                        long esBatchBegin = System.currentTimeMillis();
                        BulkResponse rp = esBulkRequest.bulk();
                        if (rp.hasFailures()) {
                            this.processFailBulkResponse(rp);
                        }
                        if (logger.isTraceEnabled()) {
                            logger.trace("全量數據批量導入最後批次耗時: {}, es執行時間: {}, 批次大小: {}, index; {}",
                                (System.currentTimeMillis() - batchBegin),
                                (System.currentTimeMillis() - esBatchBegin),
                                esBulkRequest.numberOfActions(),
                                mapping.get_index());
                        }
                    }
                } catch (Exception e) {
                    // Record the failure for the caller, then rethrow so the outer catch returns false
                    logger.error(e.getMessage(), e);
                    errMsg.add(mapping.get_index() + " etl failed! ==>" + e.getMessage());
                    throw new RuntimeException(e);
                }
                return count;
            });

            return true;
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return false;
        }
    }

採用遊標流式查詢

此方法就是執行查詢並將數據同步到es的方法,其中看下sqlRS(DataSource ds, String sql, List<Object> values, Function<ResultSet, Object> fun)的實現細節:

    /**
     * Executes a query and hands the open result set to a callback.
     * <p>
     * The statement is forward-only and read-only with a fetch size of
     * {@code Integer.MIN_VALUE}. Connection, statement and result set are closed before
     * this method returns.
     *
     * @param ds data source to take the connection from
     * @param sql the query, with {@code ?} placeholders
     * @param values values bound to the placeholders in order; may be {@code null}
     * @param fun callback that consumes the result set
     * @return whatever {@code fun} returns
     * @throws RuntimeException wrapping any exception raised while querying or in {@code fun}
     */
    public static Object sqlRS(DataSource ds, String sql, List<Object> values, Function<ResultSet, Object> fun) {
        try (Connection connection = ds.getConnection();
             PreparedStatement statement = connection.prepareStatement(sql,
                 ResultSet.TYPE_FORWARD_ONLY,
                 ResultSet.CONCUR_READ_ONLY)) {
            statement.setFetchSize(Integer.MIN_VALUE);
            if (values != null) {
                // JDBC parameter indexes start at 1
                int position = 1;
                for (Object value : values) {
                    statement.setObject(position++, value);
                }
            }
            try (ResultSet resultSet = statement.executeQuery()) {
                return fun.apply(resultSet);
            }
        } catch (Exception e) {
            logger.error("sqlRs has error, sql: {} ", sql);
            throw new RuntimeException(e);
        }
    }

其中參數設爲ResultSet.TYPE_FORWARD_ONLY和fetchSize爲Integer.MIN_VALUE的原因是:

當statement設置以下屬性時,採用的是流數據接收方式,每次只從服務器接收部份數據,直到所有數據處理完畢,不會發生JVM OOM

setResultSetType(ResultSet.TYPE_FORWARD_ONLY);

setFetchSize(Integer.MIN_VALUE); 

        

總結

通過查看canal esAdapter的etl同步中的代碼知道了它有如下特點:

1.數據同步的controller入口處加入了鎖(根據當前環境會啓用jvm的鎖或zk的分佈式鎖),確保不會造成重複提交

2.會根據數據量的大小自動開啓多線程進行查詢,從而提高查詢效率

3.查詢時會採用遊標的方式流式進行查詢,避免oom

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章