Kylin Build Source Code Analysis

Preface

The build process is one of Kylin's core mechanisms. Understanding it helps with job tuning, troubleshooting, and grasping Kylin's overall design.

Sample Data

A fact table plus a dimension table:
[Figure: sample data of the fact table and dimension table]

Core Code

The BatchCubingJobBuilder2 class:

 public CubingJob build() {
        logger.info("MR_V2 new job to BUILD segment {}", seg);

        final CubingJob result = CubingJob.createBuildJob(seg, submitter, config);
        final String jobId = result.getId();
        final String cuboidRootPath = getCuboidRootPath(jobId);

       
        // Phase 1: create the flat table (and materialized view if enabled)
        inputSide.addStepPhase1_CreateFlatTable(result);

        // Phase 2: extract distinct column values
        result.addTask(createFactDistinctColumnsStep(jobId));

        if (isEnableUHCDictStep()) {
            result.addTask(createBuildUHCDictStep(jobId));
        }
        // Phase 3: build the dimension dictionaries
        result.addTask(createBuildDictionaryStep(jobId));
        
        result.addTask(createSaveStatisticsStep(jobId));

        // add materialize lookup tables if needed
        LookupMaterializeContext lookupMaterializeContext = addMaterializeLookupTableSteps(result);

        outputSide.addStepPhase2_BuildDictionary(result);

        if (seg.getCubeDesc().isShrunkenDictFromGlobalEnabled()) {
            result.addTask(createExtractDictionaryFromGlobalJob(jobId));
        }

        // Phase 3: Build Cube
        
        addLayerCubingSteps(result, jobId, cuboidRootPath); // layer cubing, only selected algorithm will execute
       
        addInMemCubingSteps(result, jobId, cuboidRootPath); // inmem cubing, only selected algorithm will execute
        outputSide.addStepPhase3_BuildCube(result);

        // Phase 4: Update Metadata & Cleanup
        // update cube metadata and drop the Hive intermediate table
        result.addTask(createUpdateCubeInfoAfterBuildStep(jobId, lookupMaterializeContext));
        inputSide.addStepPhase4_Cleanup(result);
        outputSide.addStepPhase4_Cleanup(result);
        
        // Set the task priority if specified
        result.setPriorityBasedOnPriorityOffset(priorityOffset);
        result.getTasks().forEach(task -> task.setPriorityBasedOnPriorityOffset(priorityOffset));

        return result;
    }
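Note that both the layer-cubing and the in-mem-cubing steps are added to the job; as the inline comments say, only the steps of the algorithm selected at run time actually execute, and the other algorithm's steps are skipped.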

Phase 1: Create an Intermediate Flat Table

The core steps:
1. Drop the old intermediate table if it exists.
2. Select the dimension and measure columns from the fact table.
3. Insert data by joining the fact table with the dimension table.

DROP TABLE IF EXISTS kylin_intermediate_te_cube_bc02a33c_5ac9_49c0_ada4_62e21b5746e2;


CREATE EXTERNAL TABLE IF NOT EXISTS kylin_intermediate_te_cube_bc02a33c_5ac9_49c0_ada4_62e21b5746e2
(
ORDER_TE_CITYID int
,ORDER_TE_MONEY int
)
STORED AS SEQUENCEFILE


INSERT OVERWRITE TABLE kylin_intermediate_te_cube_bc02a33c_5ac9_49c0_ada4_62e21b5746e2 SELECT
ORDER_TE.CITYID as ORDER_TE_CITYID
,ORDER_TE.MONEY as ORDER_TE_MONEY
FROM TE.ORDER_TE as ORDER_TE 
INNER JOIN TE.CITY as CITY
ON ORDER_TE.CITYID = CITY.CITYID
WHERE 1=1;

While the build is running, this Hive table can be found in the default database. Its contents, shown below, match the analysis above.
[Figure: contents of the intermediate flat table]
The corresponding task code lives in the HiveInputBase class:

 protected static AbstractExecutable createFlatHiveTableByLivyStep(String hiveInitStatements, String jobWorkingDir,
            String cubeName, IJoinedFlatTableDesc flatDesc) {
        //from hive to hive
        final String dropTableHql = JoinedFlatTable.generateDropTableStatement(flatDesc);
        final String createTableHql = JoinedFlatTable.generateCreateTableStatement(flatDesc, jobWorkingDir);
        String insertDataHqls = JoinedFlatTable.generateInsertDataStatement(flatDesc);

        CreateFlatHiveTableByLivyStep step = new CreateFlatHiveTableByLivyStep();
        step.setInitStatement(hiveInitStatements);
        // the drop-table, create-table and insert SQL shown above
        step.setCreateTableStatement(dropTableHql + createTableHql + insertDataHqls);
        CubingExecutableUtil.setCubeName(cubeName, step.getParams());
        step.setName(ExecutableConstants.STEP_NAME_CREATE_FLAT_HIVE_TABLE);
        return step;
    }

Phase 2: Extract Distinct Column Values

The flat table above has two columns, and this step computes the distinct values of each. The core code returns an MR job, which can be seen on YARN (a toy sketch of what the job computes follows the code):

public MapReduceExecutable createFactDistinctColumnsStep(String jobId) {
        MapReduceExecutable result = new MapReduceExecutable();
        result.setName(ExecutableConstants.STEP_NAME_FACT_DISTINCT_COLUMNS);
        result.setMapReduceJobClass(FactDistinctColumnsJob.class);
        StringBuilder cmd = new StringBuilder();
        appendMapReduceParameters(cmd);
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, getFactDistinctColumnsPath(jobId));
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
        appendExecCmdParameters(cmd, BatchConstants.ARG_STATS_OUTPUT, getStatisticsPath(jobId));
        appendExecCmdParameters(cmd, BatchConstants.ARG_STATS_SAMPLING_PERCENT, String.valueOf(config.getConfig().getCubingInMemSamplingPercent()));
        appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME, "Kylin_Fact_Distinct_Columns_" + seg.getRealization().getName() + "_Step");
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);
        result.setMapReduceParams(cmd.toString());
        result.setCounterSaveAs(CubingJob.SOURCE_RECORD_COUNT + "," + CubingJob.SOURCE_SIZE_BYTES);
        return result;
    }
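As a rough, non-MapReduce sketch of what this job computes: for every dimension column of the flat table, the set of its distinct values. The sample rows below are hypothetical and mirror the two-column flat table above.

import java.util.*;

public class FactDistinctSketch {
    public static void main(String[] args) {
        // hypothetical rows of the intermediate flat table: {CITYID, MONEY}
        List<String[]> rows = Arrays.asList(
                new String[] { "1", "100" },
                new String[] { "1", "200" },
                new String[] { "2", "100" });

        // one distinct-value set per column, analogous to the per-column
        // output files of the real FactDistinctColumnsJob
        List<Set<String>> distinct = new ArrayList<>();
        for (int c = 0; c < 2; c++) {
            distinct.add(new TreeSet<>());
        }
        for (String[] row : rows) {
            for (int c = 0; c < row.length; c++) {
                distinct.get(c).add(row[c]);
            }
        }

        System.out.println("CITYID distinct: " + distinct.get(0)); // [1, 2]
        System.out.println("MONEY  distinct: " + distinct.get(1)); // [100, 200]
    }
}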

Phase 3: Build the Dimension Dictionary

Core code:

public HadoopShellExecutable createBuildDictionaryStep(String jobId) {
        // build dictionary job
        HadoopShellExecutable buildDictionaryStep = new HadoopShellExecutable();
        buildDictionaryStep.setName(ExecutableConstants.STEP_NAME_BUILD_DICTIONARY);
        StringBuilder cmd = new StringBuilder();
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
        appendExecCmdParameters(cmd, BatchConstants.ARG_INPUT, getFactDistinctColumnsPath(jobId));
        appendExecCmdParameters(cmd, BatchConstants.ARG_DICT_PATH, getDictRootPath(jobId));
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);

        buildDictionaryStep.setJobParams(cmd.toString());
        buildDictionaryStep.setJobClass(CreateDictionaryJob.class);
        return buildDictionaryStep;
    }

An example is shown below. Since the dimension values are strings, Kylin encodes them as integer ids to reduce memory usage.
[Figure: dictionary-encoded dimension values]
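A minimal sketch of the encoding idea, using hypothetical city-name values; Kylin's actual implementation is a trie-based dictionary (TrieDictionary), not a flat map:

import java.util.*;

public class DictionarySketch {
    public static void main(String[] args) {
        // distinct values from the previous step, sorted
        SortedSet<String> values = new TreeSet<>(Arrays.asList("Beijing", "Shanghai", "Shenzhen"));

        // assign consecutive integer ids: value -> id (encode), id -> value (decode)
        Map<String, Integer> encode = new HashMap<>();
        List<String> decode = new ArrayList<>(values);
        for (int id = 0; id < decode.size(); id++) {
            encode.put(decode.get(id), id);
        }

        // row keys then store the small integer id instead of the string
        int id = encode.get("Shanghai");
        System.out.println(id + " -> " + decode.get(id)); // 1 -> Shanghai
    }
}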

Phase 4: Save Cuboid Statistics
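This step persists the cuboid row-count statistics sampled in Phase 2 to the metadata store; Kylin uses them later, for example to choose between the layer and in-mem cubing algorithms.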

Phase 5: Create HTable
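This step creates the HBase table that will store the segment; the region split keys are derived from the saved statistics.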

Phase 6: Build Base Cuboid

Step one: GROUP BY all dimensions to produce the measure values of the base cuboid.
Step two: drop one dimension at a time; for example, a 4-dimension input produces 3-dimension outputs, iterating until zero dimensions remain (a sketch of this spawning follows the scheduling code below).

        // Don't know statistics so that tree cuboid scheduler is not determined. Determine the maxLevel at runtime
        final int maxLevel = CuboidUtil.getLongestDepth(seg.getCuboidScheduler().getAllCuboidIds());
        // base cuboid step
        result.addTask(createBaseCuboidStep(getCuboidOutputPathsByLevel(cuboidRootPath, 0), jobId));
        // n dim cuboid steps
        for (int i = 1; i <= maxLevel; i++) {
            result.addTask(createNDimensionCuboidStep(getCuboidOutputPathsByLevel(cuboidRootPath, i - 1),
                    getCuboidOutputPathsByLevel(cuboidRootPath, i), i, jobId));
        }
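To make the "drop one dimension per level" iteration concrete, here is a minimal sketch that, like Kylin, treats a cuboid id as a bitmask over dimensions; the real CuboidScheduler additionally prunes combinations that aggregation groups rule out.

import java.util.*;

public class CuboidLayerSketch {
    // children of a cuboid: clear exactly one dimension bit of the parent
    static Set<Long> children(long parent) {
        Set<Long> result = new TreeSet<>();
        for (long bit = Long.highestOneBit(parent); bit != 0; bit >>= 1) {
            if ((parent & bit) != 0) {
                result.add(parent & ~bit);
            }
        }
        return result;
    }

    public static void main(String[] args) {
        long base = 0b1111; // base cuboid over 4 dimensions
        List<Long> level = Collections.singletonList(base);
        while (!level.isEmpty() && level.get(0) != 0) {
            Set<Long> next = new TreeSet<>();
            for (long cuboid : level) {
                next.addAll(children(cuboid));
            }
            System.out.println("next level: " + next); // first round: [7, 11, 13, 14]
            level = new ArrayList<>(next);
        }
    }
}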

The core computation: for records sharing the same key, the reducer aggregates the measure values.

 @Override
    public void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        aggs.reset();

        for (Text value : values) {
            if (vcounter++ % BatchConstants.NORMAL_RECORD_LOG_THRESHOLD == 0) {
                logger.info("Handling value with ordinal (This is not KV number!): " + vcounter);
            }
            codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), input);
            aggs.aggregate(input, needAggrMeasures);
        }
        aggs.collectStates(result);

        ByteBuffer valueBuf = codec.encode(result);

        outputValue.set(valueBuf.array(), 0, valueBuf.position());
        context.write(key, outputValue);
    }
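A toy analogue of the reduce step, with hypothetical keys and a plain SUM measure: all records sharing a cuboid+dimension key are folded into a single aggregate before being written out.

import java.util.*;

public class ReduceAggSketch {
    public static void main(String[] args) {
        // (key -> measure values) as the reducer sees them, already grouped by key
        Map<String, List<Integer>> grouped = new LinkedHashMap<>();
        grouped.put("cuboid=1111|CITYID=1", Arrays.asList(100, 200));
        grouped.put("cuboid=1111|CITYID=2", Arrays.asList(100));

        // the analogue of aggs.aggregate(...) + collectStates(...): SUM per key
        grouped.forEach((key, values) -> System.out.println(
                key + " -> " + values.stream().mapToInt(Integer::intValue).sum()));
    }
}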

Phase 7: Build Cube In-Mem

Build the cube in memory:

 protected void addInMemCubingSteps(final CubingJob result, String jobId, String cuboidRootPath) {
        // base cuboid job
        MapReduceExecutable cubeStep = new MapReduceExecutable();

        StringBuilder cmd = new StringBuilder();
        appendMapReduceParameters(cmd, JobEngineConfig.IN_MEM_JOB_CONF_SUFFIX);

        cubeStep.setName(ExecutableConstants.STEP_NAME_BUILD_IN_MEM_CUBE);

        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
        appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, cuboidRootPath);
        appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME, "Kylin_Cube_Builder_" + seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);
        if (seg.getCubeDesc().isShrunkenDictFromGlobalEnabled()) {
            appendExecCmdParameters(cmd, BatchConstants.ARG_SHRUNKEN_DICT_PATH, getShrunkenDictionaryPath(jobId));
        }

        cubeStep.setMapReduceParams(cmd.toString());
        cubeStep.setMapReduceJobClass(getInMemCuboidJob());
        result.addTask(cubeStep);
    }
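Unlike the layer-by-layer algorithm, in-mem ("fast") cubing computes all cuboids in memory inside each mapper and merges the partial cubes in the reducer; Kylin decides between the two algorithms using the statistics sampled in Phase 2.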

Phase 8: Convert Cuboid Data to HFile

Phase 9: Load HFile to HBase Table

    @Override
    public void addStepPhase3_BuildCube(DefaultChainedExecutable jobFlow) {
        jobFlow.addTask(steps.createConvertCuboidToHfileStep(jobFlow.getId()));
        jobFlow.addTask(steps.createBulkLoadStep(jobFlow.getId()));
    }
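The conversion job writes the cuboid data out as HFiles, and the bulk-load step then hands them to HBase, which adopts the files into its regions directly instead of replaying them through the normal write path.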

Phase 10: Update Cube Info

The cube build is now complete.

Phase 11: Hive Cleanup

Drop the Hive intermediate table created in Phase 1.
