datax源碼閱讀四:TaskGroupContainer

根據前面的源碼閱讀可以知道,JobContainer將所有的task分配到TaskGroup中執行,TaskGroup啓動5個線程去消費所有的task,具體實現如下:

/**
 * Runs the scheduling loop of this task group.
 *
 * <p>Reads the polling/report intervals, channel count and failover settings
 * from the configuration, registers a communication for every task config,
 * and then loops until all tasks are done:
 * <ol>
 *   <li>scan finished tasks: re-queue failed ones that may be retried, flag
 *       non-retryable failures and kills, record timing for successes;</li>
 *   <li>on a flagged failure/kill, report and throw;</li>
 *   <li>start queued tasks while fewer than {@code channelNumber} are running;</li>
 *   <li>leave the loop when the queue is empty, all running tasks are done
 *       and the collected state is SUCCEEDED;</li>
 *   <li>report periodically, then sleep for the check interval.</li>
 * </ol>
 * Any {@code Throwable} raised inside the loop is recorded on the collected
 * communication (unless one is already set), the state is set to FAILED and
 * reported, and the error is rethrown wrapped as a {@code DataXException}
 * with {@code FrameworkErrorCode.RUNTIME_ERROR}.
 */
public void start() {
    try {
        /**
         * Interval between status checks; kept short so that tasks are
         * dispatched to a free channel promptly.
         */
        int sleepIntervalInMillSec = this.configuration.getInt(
                CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_SLEEPINTERVAL, 100);
        /**
         * Interval between status reports; somewhat longer to avoid a flood of reports.
         */
        long reportIntervalInMillSec = this.configuration.getLong(
                CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_REPORTINTERVAL,
                10000);
        /**
         * Report performance statistics once every 2 minutes.
         * NOTE(review): no statement in this method is attached to this comment;
         * it looks like a leftover - confirm and remove.
         */
        // Number of channels; used below as the cap on concurrently running tasks
        int channelNumber = this.configuration.getInt(
                CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_CHANNEL);
        int taskMaxRetryTimes = this.configuration.getInt(
                CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXRETRYTIMES, 1);
        long taskRetryIntervalInMsec = this.configuration.getLong(
                CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_RETRYINTERVALINMSEC, 10000);
        long taskMaxWaitInMsec = this.configuration.getLong(CoreConstant.DATAX_CORE_CONTAINER_TASK_FAILOVER_MAXWAITINMSEC, 60000);
        List<Configuration> taskConfigs = this.configuration
                .getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
        if(LOG.isDebugEnabled()) {
            LOG.debug("taskGroup[{}]'s task configs[{}]", this.taskGroupId,
                    JSON.toJSONString(taskConfigs));
        }
        int taskCountInThisTaskGroup = taskConfigs.size();
        /*LOG.info(String.format(
                "taskGroupId=[%d] start [%d] channels for [%d] tasks.",
                this.taskGroupId, channelNumber, taskCountInThisTaskGroup));*/
        this.containerCommunicator.registerCommunication(taskConfigs);
        Map<Integer, Configuration> taskConfigMap = buildTaskConfigMap(taskConfigs); // taskId -> task configuration
        List<Configuration> taskQueue = buildRemainTasks(taskConfigs); // tasks waiting to run
        Map<Integer, TaskExecutor> taskFailedExecutorMap = new HashMap<Integer, TaskExecutor>(); // taskId -> executor of the last failed attempt
        List<TaskExecutor> runTasks = new ArrayList<TaskExecutor>(channelNumber); // tasks currently running
        Map<Integer, Long> taskStartTimeMap = new HashMap<Integer, Long>(); // taskId -> start time (ms)
        // Starts at 0 so that the first pass through step 5 reports immediately
        long lastReportTimeStamp = 0;
        Communication lastTaskGroupContainerCommunication = new Communication();
        while (true) {
            // 1. Inspect the state of every task
            boolean failedOrKilled = false;
            Map<Integer, Communication> communicationMap = containerCommunicator.getCommunicationMap();
            for(Map.Entry<Integer, Communication> entry : communicationMap.entrySet()){
                Integer taskId = entry.getKey();
                Communication taskCommunication = entry.getValue();
                // Only finished tasks need handling here
                if(!taskCommunication.isFinished()){
                    continue;
                }
                // NOTE(review): taskExecutor is dereferenced in the FAILED branch below;
                // confirm removeTask cannot return null for a task in that state.
                TaskExecutor taskExecutor = removeTask(runTasks, taskId);
                // It was removed from runTasks above, so remove it from the monitor as well
                taskMonitor.removeTask(taskId);
                // Failed: check whether the task supports failover and still has retries left
                if(taskCommunication.getState() == State.FAILED){
                    taskFailedExecutorMap.put(taskId, taskExecutor);
                    if(taskExecutor.supportFailOver() && taskExecutor.getAttemptCount() < taskMaxRetryTimes){
                        taskExecutor.shutdown(); // shut down the old executor
                        containerCommunicator.resetCommunication(taskId); // reset the task's state
                        Configuration taskConfig = taskConfigMap.get(taskId);
                        taskQueue.add(taskConfig); // put it back on the pending list
                    }else{
                        failedOrKilled = true;
                        break;
                    }
                }else if(taskCommunication.getState() == State.KILLED){
                    failedOrKilled = true;
                    break;
                }else if(taskCommunication.getState() == State.SUCCEEDED){
                    // The start time is removed once recorded, so the perf record
                    // is added only once per successful task
                    Long taskStartTime = taskStartTimeMap.get(taskId);
                    if(taskStartTime != null){
                        Long usedTime = System.currentTimeMillis() - taskStartTime;
                        /*LOG.info("taskGroup[{}] taskId[{}] is successed, used[{}]ms",
                                this.taskGroupId, taskId, usedTime);*/
                        // usedTime * 1000 * 1000 converts ms to the ns that PerfRecord stores; this is
                        // only simple bookkeeping used to print the longest-running tasks, hence the
                        // dedicated static method
                        PerfRecord.addPerfRecord(taskGroupId, taskId, PerfRecord.PHASE.TASK_TOTAL,taskStartTime, usedTime * 1000L * 1000L);
                        taskStartTimeMap.remove(taskId);
                        taskConfigMap.remove(taskId);
                    }
                }
            }
            // 2. If a task executor of this task group failed or was killed, report the error
            if (failedOrKilled) {
                lastTaskGroupContainerCommunication = reportTaskGroupCommunication(
                        lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                throw DataXException.asDataXException(
                        FrameworkErrorCode.PLUGIN_RUNTIME_ERROR, lastTaskGroupContainerCommunication.getThrowable());
            }
            // 3. There are tasks waiting and fewer tasks running than the channel limit
            Iterator<Configuration> iterator = taskQueue.iterator();
            while(iterator.hasNext() && runTasks.size() < channelNumber){
                Configuration taskConfig = iterator.next();
                Integer taskId = taskConfig.getInt(CoreConstant.TASK_ID);
                int attemptCount = 1;
                // Non-null only when this task is queued for a retry after a failure
                TaskExecutor lastExecutor = taskFailedExecutorMap.get(taskId);
                if(lastExecutor!=null){
                    attemptCount = lastExecutor.getAttemptCount() + 1;
                    long now = System.currentTimeMillis();
                    long failedTime = lastExecutor.getTimeStamp();
                    if(now - failedTime < taskRetryIntervalInMsec){  // retry interval not yet elapsed; leave it in the queue
                        continue;
                    }
                    if(!lastExecutor.isShutdown()){ // the previous failed attempt has not finished yet
                        if(now - failedTime > taskMaxWaitInMsec){
                            markCommunicationFailed(taskId);
                            reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                            throw DataXException.asDataXException(CommonErrorCode.WAIT_TIME_EXCEED, "task failover等待超時");
                        }else{
                            lastExecutor.shutdown(); // try shutting it down again
                            continue;
                        }
                    }else{
                        /*LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] has already shutdown",
                                this.taskGroupId, taskId, lastExecutor.getAttemptCount());*/
                    }
                }
                // NOTE(review): presumably the config is cloned when retries are enabled so
                // that a retry starts from an unmodified config - confirm
                Configuration taskConfigForRun = taskMaxRetryTimes > 1 ? taskConfig.clone() : taskConfig;
                TaskExecutor taskExecutor = new TaskExecutor(taskConfigForRun, attemptCount);
                taskStartTimeMap.put(taskId, System.currentTimeMillis());
                taskExecutor.doStart();
                // Remove from the pending queue only after the executor has been started
                iterator.remove();
                runTasks.add(taskExecutor);
                // The task was added to runTasks above, so register it with the monitor.
                taskMonitor.registerTask(taskId, this.containerCommunicator.getCommunication(taskId));
                taskFailedExecutorMap.remove(taskId);
                /*LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] is started",
                        this.taskGroupId, taskId, attemptCount);*/
            }
            // 4. Pending list empty, executors finished, collected state is SUCCEEDED ---> success
            if (taskQueue.isEmpty() && isAllTaskDone(runTasks) && containerCommunicator.collectState() == State.SUCCEEDED) {
                // Report once on success as well; otherwise, when tasks finish very quickly,
                // the collected information would be inaccurate
                lastTaskGroupContainerCommunication = reportTaskGroupCommunication(
                        lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                /*LOG.info("taskGroup[{}] completed it's tasks.", this.taskGroupId);*/
                break;
            }
            // 5. If more than the report interval has passed since the last report, report right away
            long now = System.currentTimeMillis();
            if (now - lastReportTimeStamp > reportIntervalInMillSec) {
                lastTaskGroupContainerCommunication = reportTaskGroupCommunication(
                        lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
                lastReportTimeStamp = now;
                // taskMonitor checks every running task once per reportIntervalInMillSec
                for(TaskExecutor taskExecutor:runTasks){
                  taskMonitor.report(taskExecutor.getTaskId(),this.containerCommunicator.getCommunication(taskExecutor.getTaskId()));
                    if(DATX_LOG_ENABLE){
                        LOG.info("Running queue capacity is :[{}], current length is:[{}]", taskExecutor.channel.getCapacity(), taskExecutor.channel.size());
                    }
                }
            }
            Thread.sleep(sleepIntervalInMillSec);
        }
        // 6. Report one final time
        reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
    } catch (Throwable e) {
        // Record the failure on the collected communication, keeping any throwable already set
        Communication nowTaskGroupContainerCommunication = this.containerCommunicator.collect();
        if (nowTaskGroupContainerCommunication.getThrowable() == null) {
            nowTaskGroupContainerCommunication.setThrowable(e);
        }
        nowTaskGroupContainerCommunication.setState(State.FAILED);
        this.containerCommunicator.report(nowTaskGroupContainerCommunication);
        throw DataXException.asDataXException(
                FrameworkErrorCode.RUNTIME_ERROR, e);
    }finally {
        if(!PerfTrace.getInstance().isJob()){
            // Finally print the average CPU usage and the GC statistics
            VMInfo vmInfo = VMInfo.getVmInfo();
            if (vmInfo != null) {
                vmInfo.getDelta(false);
                LOG.info(vmInfo.totalString());
            }
            LOG.info(PerfTrace.getInstance().summarizeNoException());
        }
    }
}

上述實現主要分爲以下幾個步驟:
  1、初始化task執行相關的狀態信息,分別是taskId->Configuration的map(taskConfigMap)、待運行的任務隊列taskQueue、運行失敗任務taskFailedExecutorMap、運行中的任務runTasks、任務開始時間taskStartTimeMap
  2、循環檢測所有任務的執行狀態
    1)判斷是否有失敗的task,如果有則放入失敗隊列(taskFailedExecutorMap)中,並查看當前的執行是否支持重跑和failOver,如果支持則重新放回待運行隊列taskQueue中;如果沒有失敗,則標記任務執行成功,並從狀態輪詢map中移除
    2)如果發現有失敗的任務,則彙報當前TaskGroup的狀態,並拋出異常
    3)查看當前執行隊列的長度,如果發現執行隊列還有通道,則構建TaskExecutor加入執行隊列,並從待運行移除
    4)檢查執行隊列和所有的任務狀態,如果所有的任務都執行成功,則彙報taskGroup的狀態並從循環中退出
    5)檢查當前時間是否超過彙報時間檢測,如果是,則彙報當前狀態
    6)當所有的執行完成從while中退出之後,再次全局彙報當前的任務狀態

至此,taskGroup中的所有執行完成。上述taskGroup的運行隊列只是負責對task任務進行調度,具體的執行還是由TaskExecutor負責實現,下面看看TaskExecutor的執行,代碼實現如下

  /**
   * Prepares one attempt of a task: validates the configuration, looks up the
   * task's registered Communication, instantiates the channel, and creates
   * (but does not start) the writer and reader threads.
   *
   * @param taskConf     configuration of this task; must contain both the
   *                     reader and the writer sections
   * @param attemptCount attempt number recorded on this executor
   */
  public TaskExecutor(Configuration taskConf, int attemptCount) {
        // Keep the configuration of this executor.
        this.taskConfig = taskConf;
        // Reader and writer plugin parameters are both mandatory.
        boolean readerAndWriterConfigured =
                this.taskConfig.getConfiguration(CoreConstant.JOB_READER) != null
                        && this.taskConfig.getConfiguration(CoreConstant.JOB_WRITER) != null;
        Validate.isTrue(readerAndWriterConfigured, "[reader|writer]的插件參數不能爲空!");

        // Identify the task and the attempt.
        this.taskId = this.taskConfig.getInt(CoreConstant.TASK_ID);
        this.attemptCount = attemptCount;

        // The Communication looked up by taskId is handed to the reader and
        // writer runners and to the channel, which uses it for statistics.
        this.taskCommunication = containerCommunicator.getCommunication(taskId);
        String notRegisteredMessage = String.format("taskId[%d]的Communication沒有註冊過", taskId);
        Validate.notNull(this.taskCommunication, notRegisteredMessage);

        this.channel = ClassUtil.instantiate(channelClazz, Channel.class, configuration);
        this.channel.setCommunication(this.taskCommunication);

        // Transformer parameters for this task.
        List<TransformerExecution> transformers = TransformerUtil.buildTransformerInfo(taskConfig);

        // Writer side: runner, named thread, and the writer plugin's class loader.
        // Setting the thread's contextClassLoader lets the plugin run with a
        // loader different from the main program's.
        writerRunner = (WriterRunner) generateRunner(PluginType.WRITER);
        String writerThreadName = String.format("%d-%d-%d-writer", jobId, taskGroupId, this.taskId);
        this.writerThread = new Thread(writerRunner, writerThreadName);
        String writerPluginName = this.taskConfig.getString(CoreConstant.JOB_WRITER_NAME);
        this.writerThread.setContextClassLoader(
                LoadUtil.getJarLoader(PluginType.WRITER, writerPluginName));

        // Reader side: same steps, with the transformers passed to the runner.
        readerRunner = (ReaderRunner) generateRunner(PluginType.READER, transformers);
        String readerThreadName = String.format("%d-%d-%d-reader", jobId, taskGroupId, this.taskId);
        this.readerThread = new Thread(readerRunner, readerThreadName);
        String readerPluginName = this.taskConfig.getString(CoreConstant.JOB_READER_NAME);
        this.readerThread.setContextClassLoader(
                LoadUtil.getJarLoader(PluginType.READER, readerPluginName));
    }

    /**
     * Starts the writer thread and then the reader thread, failing fast when
     * either side is already broken right after its start.
     *
     * @throws DataXException with {@code FrameworkErrorCode.RUNTIME_ERROR},
     *         wrapping the throwable stored on the task communication
     */
    public void doStart() {
        this.writerThread.start();

        // The writer cannot legitimately finish before the reader has started,
        // so a dead writer thread or a FAILED state here is an error.
        boolean writerBroken = !this.writerThread.isAlive()
                || this.taskCommunication.getState() == State.FAILED;
        if (writerBroken) {
            throw DataXException.asDataXException(
                    FrameworkErrorCode.RUNTIME_ERROR,
                    this.taskCommunication.getThrowable());
        }

        this.readerThread.start();

        // The reader may finish very quickly, so a dead reader thread is only
        // an error when the state is FAILED (reader crashed right at startup);
        // in that case raise the exception immediately.
        boolean readerCrashedOnStartup = !this.readerThread.isAlive()
                && this.taskCommunication.getState() == State.FAILED;
        if (readerCrashedOnStartup) {
            throw DataXException.asDataXException(
                    FrameworkErrorCode.RUNTIME_ERROR,
                    this.taskCommunication.getThrowable());
        }
    }
    

TaskExecutor構建的時候,生成一個reader、channel和writer,並創建兩個線程(在doStart中啓動),reader生產數據寫入channel,writer從channel中讀數據,任務執行完畢時,通過writer將任務狀態置爲成功

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章