Kylin Build Source Code Analysis

Preface

The build process is one of Kylin's core mechanisms. Understanding it helps with job tuning, with troubleshooting, and with grasping the ideas behind Kylin's design.

Sample Data

A fact table plus a dimension table
(Figure: sample fact table and dimension table)

Core Code

The BatchCubingJobBuilder2 class

 public CubingJob build() {
        logger.info("MR_V2 new job to BUILD segment {}", seg);

        final CubingJob result = CubingJob.createBuildJob(seg, submitter, config);
        final String jobId = result.getId();
        final String cuboidRootPath = getCuboidRootPath(jobId);

       
        // Step 1: create the flat table and materialized views
        inputSide.addStepPhase1_CreateFlatTable(result);

        // Step 2: extract distinct column values
        result.addTask(createFactDistinctColumnsStep(jobId));

        if (isEnableUHCDictStep()) {
            result.addTask(createBuildUHCDictStep(jobId));
        }
        // Step 3: build the dimension dictionaries
        result.addTask(createBuildDictionaryStep(jobId));
        
        result.addTask(createSaveStatisticsStep(jobId));

        // add materialize lookup tables if needed
        LookupMaterializeContext lookupMaterializeContext = addMaterializeLookupTableSteps(result);

        outputSide.addStepPhase2_BuildDictionary(result);

        if (seg.getCubeDesc().isShrunkenDictFromGlobalEnabled()) {
            result.addTask(createExtractDictionaryFromGlobalJob(jobId));
        }

        // Phase 3: Build Cube
        
        addLayerCubingSteps(result, jobId, cuboidRootPath); // layer cubing, only selected algorithm will execute
       
        addInMemCubingSteps(result, jobId, cuboidRootPath); // inmem cubing, only selected algorithm will execute
        outputSide.addStepPhase3_BuildCube(result);

        // Phase 4: Update Metadata & Cleanup
        // update cube metadata and clean up the intermediate Hive tables
        result.addTask(createUpdateCubeInfoAfterBuildStep(jobId, lookupMaterializeContext));
        inputSide.addStepPhase4_Cleanup(result);
        outputSide.addStepPhase4_Cleanup(result);
        
        // Set the task priority if specified
        result.setPriorityBasedOnPriorityOffset(priorityOffset);
        result.getTasks().forEach(task -> task.setPriorityBasedOnPriorityOffset(priorityOffset));

        return result;
    }
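
build() itself only assembles the job: each addTask call appends one executable step, and the job engine later runs the chain in order, stopping at the first failed step. Below is a minimal sketch of that chaining idea, assuming a simplified model; the class and method names are illustrative only, not Kylin's actual DefaultChainedExecutable API.

import java.util.ArrayList;
import java.util.List;

// Simplified sketch of a chained cubing job: tasks added via addTask() run strictly
// in order, and the chain stops at the first failed step. Names are illustrative.
public class ChainedJobSketch {
    interface Step {
        boolean run(); // return false to signal failure
    }

    private final List<Step> steps = new ArrayList<>();

    public void addTask(Step step) {
        steps.add(step);
    }

    public void execute() {
        for (Step step : steps) {
            if (!step.run()) {
                throw new IllegalStateException("Step failed, remaining steps are not executed");
            }
        }
    }

    public static void main(String[] args) {
        ChainedJobSketch job = new ChainedJobSketch();
        job.addTask(() -> { System.out.println("create flat table");  return true; });
        job.addTask(() -> { System.out.println("build dictionaries"); return true; });
        job.addTask(() -> { System.out.println("build cube");         return true; });
        job.execute();
    }
}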

Step 1: Create an Intermediate Flat Table

Core process:
Drop the table if it already exists
Create the table with the dimension and measure columns selected from the fact table
Insert the data by joining the fact table with the dimension table

DROP TABLE IF EXISTS kylin_intermediate_te_cube_bc02a33c_5ac9_49c0_ada4_62e21b5746e2;


CREATE EXTERNAL TABLE IF NOT EXISTS kylin_intermediate_te_cube_bc02a33c_5ac9_49c0_ada4_62e21b5746e2
(
ORDER_TE_CITYID int
,ORDER_TE_MONEY int
)
STORED AS SEQUENCEFILE


INSERT OVERWRITE TABLE kylin_intermediate_te_cube_bc02a33c_5ac9_49c0_ada4_62e21b5746e2 SELECT
ORDER_TE.CITYID as ORDER_TE_CITYID
,ORDER_TE.MONEY as ORDER_TE_MONEY
FROM TE.ORDER_TE as ORDER_TE 
INNER JOIN TE.CITY as CITY
ON ORDER_TE.CITYID = CITY.CITYID
WHERE 1=1;

While the build is running, this Hive table can be found in the default database; its data is shown below and matches the analysis above.
(Figure: contents of the intermediate Hive table)
Core code for this task, in the HiveInputBase class:

 protected static AbstractExecutable createFlatHiveTableByLivyStep(String hiveInitStatements, String jobWorkingDir,
            String cubeName, IJoinedFlatTableDesc flatDesc) {
        //from hive to hive
        final String dropTableHql = JoinedFlatTable.generateDropTableStatement(flatDesc);
        final String createTableHql = JoinedFlatTable.generateCreateTableStatement(flatDesc, jobWorkingDir);
        String insertDataHqls = JoinedFlatTable.generateInsertDataStatement(flatDesc);

        CreateFlatHiveTableByLivyStep step = new CreateFlatHiveTableByLivyStep();
        step.setInitStatement(hiveInitStatements);
        // concatenated drop-table, create-table and insert SQL
        step.setCreateTableStatement(dropTableHql + createTableHql + insertDataHqls);
        CubingExecutableUtil.setCubeName(cubeName, step.getParams());
        step.setName(ExecutableConstants.STEP_NAME_CREATE_FLAT_HIVE_TABLE);
        return step;
    }

Step 2: Extract Distinct Column Values from the Previous Step's Data

The table above has two columns, and this step computes the distinct values of each.
The core code below returns an MR job, which you can see running on YARN.

public MapReduceExecutable createFactDistinctColumnsStep(String jobId) {
        MapReduceExecutable result = new MapReduceExecutable();
        result.setName(ExecutableConstants.STEP_NAME_FACT_DISTINCT_COLUMNS);
        result.setMapReduceJobClass(FactDistinctColumnsJob.class);
        StringBuilder cmd = new StringBuilder();
        appendMapReduceParameters(cmd);
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, getFactDistinctColumnsPath(jobId));
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
        appendExecCmdParameters(cmd, BatchConstants.ARG_STATS_OUTPUT, getStatisticsPath(jobId));
        appendExecCmdParameters(cmd, BatchConstants.ARG_STATS_SAMPLING_PERCENT, String.valueOf(config.getConfig().getCubingInMemSamplingPercent()));
        appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME, "Kylin_Fact_Distinct_Columns_" + seg.getRealization().getName() + "_Step");
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);
        result.setMapReduceParams(cmd.toString());
        result.setCounterSaveAs(CubingJob.SOURCE_RECORD_COUNT + "," + CubingJob.SOURCE_SIZE_BYTES);
        return result;
    }
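
Conceptually, this step scans the flat table once and collects the distinct values of each dimension column; these become the input of the dictionary step, and cuboid statistics are sampled along the way. Below is a minimal single-process sketch of the per-column distinct idea, with hypothetical names and rows; the real work is done by the FactDistinctColumnsJob MapReduce job configured above.

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

// Minimal per-column distinct extraction over flat-table rows. Kylin does this as the
// FactDistinctColumnsJob MapReduce job; this sketch only shows what the step produces.
public class FactDistinctSketch {
    static Map<String, Set<String>> distinctPerColumn(String[] columns, List<String[]> rows) {
        Map<String, Set<String>> result = new LinkedHashMap<>();
        for (String column : columns) {
            result.put(column, new TreeSet<>()); // sorted, duplicates collapse
        }
        for (String[] row : rows) {
            for (int i = 0; i < columns.length; i++) {
                result.get(columns[i]).add(row[i]);
            }
        }
        return result;
    }

    public static void main(String[] args) {
        // hypothetical rows of the intermediate flat table (CITYID, MONEY)
        String[] columns = { "ORDER_TE_CITYID", "ORDER_TE_MONEY" };
        List<String[]> rows = Arrays.asList(
                new String[] { "1", "100" },
                new String[] { "1", "200" },
                new String[] { "2", "100" });
        System.out.println(distinctPerColumn(columns, rows));
        // {ORDER_TE_CITYID=[1, 2], ORDER_TE_MONEY=[100, 200]}
    }
}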

Step 3: Build the Dimension Dictionaries

Core code

public HadoopShellExecutable createBuildDictionaryStep(String jobId) {
        // base cuboid job
        HadoopShellExecutable buildDictionaryStep = new HadoopShellExecutable();
        buildDictionaryStep.setName(ExecutableConstants.STEP_NAME_BUILD_DICTIONARY);
        StringBuilder cmd = new StringBuilder();
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
        appendExecCmdParameters(cmd, BatchConstants.ARG_INPUT, getFactDistinctColumnsPath(jobId));
        appendExecCmdParameters(cmd, BatchConstants.ARG_DICT_PATH, getDictRootPath(jobId));
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);

        buildDictionaryStep.setJobParams(cmd.toString());
        buildDictionaryStep.setJobClass(CreateDictionaryJob.class);
        return buildDictionaryStep;
    }

An example is shown below: since the dimension values are strings, they are encoded as integer IDs, which reduces memory and storage usage.
(Figure: dimension dictionary example, string values mapped to integer IDs)
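
A dictionary maps each distinct dimension value to a small integer ID, so cuboid data stores compact IDs instead of raw strings. Below is a minimal sketch of that encode/decode contract, assuming a simple sorted list; Kylin's real dictionaries (such as TrieDictionary) are far more compact, but the contract is the same.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

// Minimal dictionary sketch: sorted distinct values get consecutive integer IDs.
// encode(value) -> id, decode(id) -> value.
public class DictionarySketch {
    private final List<String> idToValue;          // id -> original value
    private final Map<String, Integer> valueToId;  // original value -> id

    public DictionarySketch(Collection<String> distinctValues) {
        idToValue = new ArrayList<>(new TreeSet<>(distinctValues)); // sorted, deduplicated
        valueToId = new HashMap<>();
        for (int id = 0; id < idToValue.size(); id++) {
            valueToId.put(idToValue.get(id), id);
        }
    }

    public int encode(String value) { return valueToId.get(value); }
    public String decode(int id)    { return idToValue.get(id); }

    public static void main(String[] args) {
        // hypothetical city values
        DictionarySketch dict = new DictionarySketch(Arrays.asList("Beijing", "Shanghai", "Shenzhen"));
        System.out.println(dict.encode("Shanghai")); // 1
        System.out.println(dict.decode(2));          // Shenzhen
    }
}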

Step 4: Save Cuboid Statistics

Step 5: Create HTable

Step 6: Build Base Cuboid

The first step is a GROUP BY over all dimensions to compute the measure values; this is the base cuboid.
After that, each layer removes one more dimension: a 4-dimension input, for example, yields 3-dimension cuboids, and this repeats until zero dimensions remain.

        // Don't know statistics so that tree cuboid scheduler is not determined. Determine the maxLevel at runtime
        final int maxLevel = CuboidUtil.getLongestDepth(seg.getCuboidScheduler().getAllCuboidIds());
        // base cuboid step
        result.addTask(createBaseCuboidStep(getCuboidOutputPathsByLevel(cuboidRootPath, 0), jobId));
        // n dim cuboid steps
        for (int i = 1; i <= maxLevel; i++) {
            result.addTask(createNDimensionCuboidStep(getCuboidOutputPathsByLevel(cuboidRootPath, i - 1), getCuboidOutputPathsByLevel(cuboidRootPath, i), i, jobId));
        }
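
The level number here is the count of dimensions removed from the base cuboid: level 0 keeps all dimensions, and each following level drops one more. The small sketch below enumerates cuboids per level with bitmasks, one bit per dimension; it is illustrative only and not Kylin's CuboidScheduler.

import java.util.ArrayList;
import java.util.List;

// Enumerate cuboids by level for a small dimension count, one bit per dimension
// (1 = dimension kept). Level n means n dimensions have been dropped from the base
// cuboid, which mirrors the level-by-level MR steps above.
public class CuboidLevelSketch {
    public static void main(String[] args) {
        int dimCount = 3;                   // e.g. dimensions A, B, C
        int base = (1 << dimCount) - 1;     // base cuboid: all dimensions present (111)

        for (int level = 0; level <= dimCount; level++) {
            List<String> cuboids = new ArrayList<>();
            for (int mask = 0; mask <= base; mask++) {
                if (Integer.bitCount(mask) == dimCount - level) {
                    String bits = Integer.toBinaryString(mask);
                    while (bits.length() < dimCount) {
                        bits = "0" + bits;  // left-pad to dimCount bits
                    }
                    cuboids.add(bits);
                }
            }
            System.out.println("level " + level + ": " + cuboids);
        }
        // level 0: [111]
        // level 1: [011, 101, 110]
        // level 2: [001, 010, 100]
        // level 3: [000]
    }
}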

Core computation code: rows sharing the same key have their measure values aggregated.

 @Override
    public void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        aggs.reset();

        for (Text value : values) {
            if (vcounter++ % BatchConstants.NORMAL_RECORD_LOG_THRESHOLD == 0) {
                logger.info("Handling value with ordinal (This is not KV number!): " + vcounter);
            }
            codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), input);
            aggs.aggregate(input, needAggrMeasures);
        }
        aggs.collectStates(result);

        ByteBuffer valueBuf = codec.encode(result);

        outputValue.set(valueBuf.array(), 0, valueBuf.position());
        context.write(key, outputValue);
    }
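
Stripped of the codec and the pluggable MeasureAggregators, the reduce step is just a fold over all values that share a key. Below is a minimal, hypothetical sketch of that fold for a single SUM measure, for illustration only.

import java.util.Arrays;
import java.util.List;

// Stripped-down version of the reduce logic above, for a single SUM measure:
// all values arriving under the same key are folded into one aggregate.
public class SumReduceSketch {
    static long reduceForOneKey(List<Long> valuesForSameKey) {
        long agg = 0L;                    // aggs.reset()
        for (long value : valuesForSameKey) {
            agg += value;                 // aggs.aggregate(input, needAggrMeasures)
        }
        return agg;                       // aggs.collectStates(result)
    }

    public static void main(String[] args) {
        // e.g. three flat-table rows sharing the same dimension key, MONEY = 100, 200, 50
        System.out.println(reduceForOneKey(Arrays.asList(100L, 200L, 50L))); // 350
    }
}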

Step 7: Build Cube In-Mem

Builds the cube in memory.

 protected void addInMemCubingSteps(final CubingJob result, String jobId, String cuboidRootPath) {
        // base cuboid job
        MapReduceExecutable cubeStep = new MapReduceExecutable();

        StringBuilder cmd = new StringBuilder();
        appendMapReduceParameters(cmd, JobEngineConfig.IN_MEM_JOB_CONF_SUFFIX);

        cubeStep.setName(ExecutableConstants.STEP_NAME_BUILD_IN_MEM_CUBE);

        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
        appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, cuboidRootPath);
        appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME, "Kylin_Cube_Builder_" + seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);
        if (seg.getCubeDesc().isShrunkenDictFromGlobalEnabled()) {
            appendExecCmdParameters(cmd, BatchConstants.ARG_SHRUNKEN_DICT_PATH, getShrunkenDictionaryPath(jobId));
        }

        cubeStep.setMapReduceParams(cmd.toString());
        cubeStep.setMapReduceJobClass(getInMemCuboidJob());
        result.addTask(cubeStep);
    }
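
Note that both addLayerCubingSteps and addInMemCubingSteps are added to the same job; as the comments in build() say, only the selected algorithm will execute, and the algorithm is chosen from the cuboid statistics saved earlier. Below is a rough sketch of such a guard, with illustrative names only; the real check lives inside Kylin's cubing steps.

// Rough sketch of the "only selected algorithm will execute" idea: both sets of
// steps exist in the job, but a step whose algorithm was not chosen finishes as a no-op.
public class AlgorithmGuardSketch {
    enum CubingAlgorithm { LAYER, INMEM }

    static void runIfSelected(CubingAlgorithm chosen, CubingAlgorithm thisStep, Runnable work) {
        if (chosen != thisStep) {
            System.out.println("skip " + thisStep + " steps, selected algorithm is " + chosen);
            return;
        }
        work.run();
    }

    public static void main(String[] args) {
        CubingAlgorithm chosen = CubingAlgorithm.LAYER; // decided from the saved statistics
        runIfSelected(chosen, CubingAlgorithm.LAYER, () -> System.out.println("run layer cubing"));
        runIfSelected(chosen, CubingAlgorithm.INMEM, () -> System.out.println("run in-mem cubing"));
    }
}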

Step 8: Convert Cuboid Data to HFile

Step 9: Load HFile to HBase Table

    @Override
    public void addStepPhase3_BuildCube(DefaultChainedExecutable jobFlow) {
        jobFlow.addTask(steps.createConvertCuboidToHfileStep(jobFlow.getId()));
        jobFlow.addTask(steps.createBulkLoadStep(jobFlow.getId()));
    }

Step 10: Update Cube Info

The cube is now complete.

Step 11: Hive Cleanup

Drops the intermediate Hive tables.
