Preface
The build process is a core mechanism of Kylin. Understanding it helps with job tuning and troubleshooting, and gives insight into Kylin's design.
Sample Data
A fact table joined with a dimension table (in the example below, the fact table TE.ORDER_TE and the dimension table TE.CITY).
Core Code
The BatchCubingJobBuilder2 class:
public CubingJob build() {
    logger.info("MR_V2 new job to BUILD segment {}", seg);

    final CubingJob result = CubingJob.createBuildJob(seg, submitter, config);
    final String jobId = result.getId();
    final String cuboidRootPath = getCuboidRootPath(jobId);

    // Phase 1: create the intermediate flat table and materialized views
    inputSide.addStepPhase1_CreateFlatTable(result);

    // Phase 2: extract distinct column values
    result.addTask(createFactDistinctColumnsStep(jobId));

    if (isEnableUHCDictStep()) {
        result.addTask(createBuildUHCDictStep(jobId));
    }

    // Phase 3: build dimension dictionaries
    result.addTask(createBuildDictionaryStep(jobId));
    result.addTask(createSaveStatisticsStep(jobId));

    // add materialize lookup tables if needed
    LookupMaterializeContext lookupMaterializeContext = addMaterializeLookupTableSteps(result);

    outputSide.addStepPhase2_BuildDictionary(result);

    if (seg.getCubeDesc().isShrunkenDictFromGlobalEnabled()) {
        result.addTask(createExtractDictionaryFromGlobalJob(jobId));
    }

    // Phase 3: Build Cube
    addLayerCubingSteps(result, jobId, cuboidRootPath); // layer cubing, only selected algorithm will execute
    addInMemCubingSteps(result, jobId, cuboidRootPath); // inmem cubing, only selected algorithm will execute
    outputSide.addStepPhase3_BuildCube(result);

    // Phase 4: Update Metadata & Cleanup
    // update cube metadata and drop the intermediate Hive table
    result.addTask(createUpdateCubeInfoAfterBuildStep(jobId, lookupMaterializeContext));
    inputSide.addStepPhase4_Cleanup(result);
    outputSide.addStepPhase4_Cleanup(result);

    // Set the task priority if specified
    result.setPriorityBasedOnPriorityOffset(priorityOffset);
    result.getTasks().forEach(task -> task.setPriorityBasedOnPriorityOffset(priorityOffset));
    return result;
}
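The builder above only assembles a chain of steps; at run time the job engine executes them one after another. Below is a minimal sketch of that chained-job pattern (simplified names and a hypothetical class, not Kylin's actual CubingJob/DefaultChainedExecutable):

import java.util.*;

// A minimal sketch of the chained-job pattern used above (simplified names, not
// Kylin's actual DefaultChainedExecutable): the builder adds steps in order, and
// the job engine later runs each step one after another.
public class ChainedJobSketch {
    interface Executable { void execute(); }

    static class ChainedJob implements Executable {
        private final List<Executable> tasks = new ArrayList<>();
        void addTask(Executable task) { tasks.add(task); }
        @Override public void execute() {
            for (Executable task : tasks) {
                task.execute();    // steps run sequentially; later steps depend on earlier output
            }
        }
    }

    public static void main(String[] args) {
        ChainedJob job = new ChainedJob();
        job.addTask(() -> System.out.println("create flat table"));
        job.addTask(() -> System.out.println("fact distinct columns"));
        job.addTask(() -> System.out.println("build dictionary"));
        job.addTask(() -> System.out.println("build cube"));
        job.execute();
    }
}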
Phase 1: Create an Intermediate Flat Table
Core steps:
Drop the table if it already exists
Create the table, selecting the dimension and measure columns from the fact table
Insert data by joining the fact table with the dimension tables
DROP TABLE IF EXISTS kylin_intermediate_te_cube_bc02a33c_5ac9_49c0_ada4_62e21b5746e2;

CREATE EXTERNAL TABLE IF NOT EXISTS kylin_intermediate_te_cube_bc02a33c_5ac9_49c0_ada4_62e21b5746e2
(
    ORDER_TE_CITYID int
    ,ORDER_TE_MONEY int
)
STORED AS SEQUENCEFILE;

INSERT OVERWRITE TABLE kylin_intermediate_te_cube_bc02a33c_5ac9_49c0_ada4_62e21b5746e2 SELECT
    ORDER_TE.CITYID as ORDER_TE_CITYID
    ,ORDER_TE.MONEY as ORDER_TE_MONEY
FROM TE.ORDER_TE as ORDER_TE
INNER JOIN TE.CITY as CITY
ON ORDER_TE.CITYID = CITY.CITYID
WHERE 1=1;
During the build, this Hive table can be found in the default database, and its contents match the analysis above.
Core task code
It corresponds to the HiveInputBase class:
protected static AbstractExecutable createFlatHiveTableByLivyStep(String hiveInitStatements, String jobWorkingDir,
        String cubeName, IJoinedFlatTableDesc flatDesc) {
    // from hive to hive
    final String dropTableHql = JoinedFlatTable.generateDropTableStatement(flatDesc);
    final String createTableHql = JoinedFlatTable.generateCreateTableStatement(flatDesc, jobWorkingDir);
    String insertDataHqls = JoinedFlatTable.generateInsertDataStatement(flatDesc);

    CreateFlatHiveTableByLivyStep step = new CreateFlatHiveTableByLivyStep();
    step.setInitStatement(hiveInitStatements);
    // the drop-table, create-table and insert SQL shown above
    step.setCreateTableStatement(dropTableHql + createTableHql + insertDataHqls);
    CubingExecutableUtil.setCubeName(cubeName, step.getParams());
    step.setName(ExecutableConstants.STEP_NAME_CREATE_FLAT_HIVE_TABLE);
    return step;
}
Phase 2: Extract Distinct Column Values
The flat table above has two columns; distinct values are computed for each of them.
Core code: it returns an MR job, which can be seen on YARN. A conceptual sketch of the distinct operation follows the step definition below.
public MapReduceExecutable createFactDistinctColumnsStep(String jobId) {
    MapReduceExecutable result = new MapReduceExecutable();
    result.setName(ExecutableConstants.STEP_NAME_FACT_DISTINCT_COLUMNS);
    result.setMapReduceJobClass(FactDistinctColumnsJob.class);
    StringBuilder cmd = new StringBuilder();
    appendMapReduceParameters(cmd);
    appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
    appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, getFactDistinctColumnsPath(jobId));
    appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
    appendExecCmdParameters(cmd, BatchConstants.ARG_STATS_OUTPUT, getStatisticsPath(jobId));
    appendExecCmdParameters(cmd, BatchConstants.ARG_STATS_SAMPLING_PERCENT, String.valueOf(config.getConfig().getCubingInMemSamplingPercent()));
    appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME, "Kylin_Fact_Distinct_Columns_" + seg.getRealization().getName() + "_Step");
    appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);
    result.setMapReduceParams(cmd.toString());
    result.setCounterSaveAs(CubingJob.SOURCE_RECORD_COUNT + "," + CubingJob.SOURCE_SIZE_BYTES);
    return result;
}
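As mentioned above, this step conceptually scans every row of the flat table and collects the distinct values of each dimension column. Below is a minimal, self-contained sketch of that idea in plain Java (hypothetical rows; the real work is done by FactDistinctColumnsJob as a MapReduce job):

import java.util.*;

// A minimal sketch of the "fact distinct columns" idea: scan the flat-table rows
// and collect the distinct values of each column. The real step runs as the
// FactDistinctColumnsJob MapReduce job; this is only an illustration.
public class FactDistinctSketch {
    public static void main(String[] args) {
        String[] columns = {"ORDER_TE_CITYID", "ORDER_TE_MONEY"};
        // hypothetical flat-table rows: {CITYID, MONEY}
        int[][] rows = {{1, 100}, {1, 200}, {2, 100}, {3, 300}};

        Map<String, Set<Integer>> distinct = new LinkedHashMap<>();
        for (String col : columns) {
            distinct.put(col, new TreeSet<>());
        }
        for (int[] row : rows) {
            for (int i = 0; i < columns.length; i++) {
                distinct.get(columns[i]).add(row[i]);   // dedupe per column
            }
        }
        // ORDER_TE_CITYID -> [1, 2, 3], ORDER_TE_MONEY -> [100, 200, 300]
        distinct.forEach((col, values) -> System.out.println(col + " -> " + values));
    }
}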
Phase 3: Build Dimension Dictionaries
Core code:
public HadoopShellExecutable createBuildDictionaryStep(String jobId) {
    // base cuboid job
    HadoopShellExecutable buildDictionaryStep = new HadoopShellExecutable();
    buildDictionaryStep.setName(ExecutableConstants.STEP_NAME_BUILD_DICTIONARY);
    StringBuilder cmd = new StringBuilder();
    appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
    appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
    appendExecCmdParameters(cmd, BatchConstants.ARG_INPUT, getFactDistinctColumnsPath(jobId));
    appendExecCmdParameters(cmd, BatchConstants.ARG_DICT_PATH, getDictRootPath(jobId));
    appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);
    buildDictionaryStep.setJobParams(cmd.toString());
    buildDictionaryStep.setJobClass(CreateDictionaryJob.class);
    return buildDictionaryStep;
}
An example is shown below: because the dimension values in the cuboids are strings, the dictionary encodes them as integer codes, which reduces memory and storage usage.
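To make the encoding concrete, here is a minimal dictionary sketch (hypothetical values, not Kylin's actual trie-based dictionary implementation): the distinct values are sorted and each one is mapped to its ordinal, so cuboid rows can carry small integer ids instead of the original strings.

import java.util.*;

// A minimal sketch of dictionary encoding (not Kylin's actual dictionary classes):
// sort the distinct values and map each value to its ordinal, so cuboid rows
// store small integer ids instead of the original strings.
public class DictionarySketch {
    private final List<String> sortedValues;        // id -> value
    private final Map<String, Integer> valueToId;   // value -> id

    public DictionarySketch(Collection<String> distinctValues) {
        this.sortedValues = new ArrayList<>(new TreeSet<>(distinctValues));
        this.valueToId = new HashMap<>();
        for (int id = 0; id < sortedValues.size(); id++) {
            valueToId.put(sortedValues.get(id), id);
        }
    }

    public int encode(String value) { return valueToId.get(value); }
    public String decode(int id) { return sortedValues.get(id); }

    public static void main(String[] args) {
        // hypothetical city names as dimension values
        DictionarySketch dict = new DictionarySketch(Arrays.asList("Beijing", "Shanghai", "Shenzhen"));
        System.out.println(dict.encode("Shanghai")); // 1
        System.out.println(dict.decode(2));          // Shenzhen
    }
}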
Phase 4: Save Cuboid Statistics
Phase 5: Create HTable
Phase 6: Build Base Cuboid
Step one: GROUP BY all dimensions to compute the measure values; this produces the base cuboid.
Step two: drop one dimension at a time, so a 4-dimension input yields 3-dimension outputs; iterate level by level until zero dimensions remain (a conceptual sketch follows the code fragment below).
// Don't know statistics so that tree cuboid scheduler is not determined. Determine the maxLevel at runtime
final int maxLevel = CuboidUtil.getLongestDepth(seg.getCuboidScheduler().getAllCuboidIds());
// base cuboid step
result.addTask(createBaseCuboidStep(getCuboidOutputPathsByLevel(cuboidRootPath, 0), jobId));
// n dim cuboid steps
for (int i = 1; i <= maxLevel; i++) {
    result.addTask(createNDimensionCuboidStep(getCuboidOutputPathsByLevel(cuboidRootPath, i - 1),
            getCuboidOutputPathsByLevel(cuboidRootPath, i), i, jobId));
}
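As referenced above, here is a minimal sketch of the layer-by-layer idea (hypothetical data in plain Java, not the real MapReduce jobs): each level takes the previous level's cuboids, drops one dimension from the group-by key, and re-aggregates the measure.

import java.util.*;

// A minimal sketch of layer-by-layer cubing (hypothetical data, not the real MR jobs):
// start from the base cuboid (all dimensions), then at each level drop one dimension
// from the key and sum the measure for rows that now share the same key.
public class LayerCubingSketch {
    public static void main(String[] args) {
        // base cuboid: key = [CITYID, YEAR], value = SUM(MONEY)
        Map<List<String>, Long> level = new LinkedHashMap<>();
        level.put(Arrays.asList("1", "2020"), 300L);
        level.put(Arrays.asList("1", "2021"), 100L);
        level.put(Arrays.asList("2", "2020"), 200L);

        int dims = 2;
        for (int levelNo = 1; levelNo <= dims; levelNo++) {
            Map<List<String>, Long> next = new LinkedHashMap<>();
            for (Map.Entry<List<String>, Long> e : level.entrySet()) {
                // drop the last dimension of the key (the real cuboid scheduler decides which
                // dimensions to drop, and produces several child cuboids per level)
                List<String> smallerKey = new ArrayList<>(e.getKey().subList(0, e.getKey().size() - 1));
                next.merge(smallerKey, e.getValue(), Long::sum);
            }
            level = next;
            System.out.println("level " + levelNo + ": " + level);
        }
        // level 1: {[1]=400, [2]=200}   level 2: {[]=600}
    }
}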
Core aggregation code: the measures of records sharing the same key are aggregated.
@Override
public void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    aggs.reset();

    for (Text value : values) {
        if (vcounter++ % BatchConstants.NORMAL_RECORD_LOG_THRESHOLD == 0) {
            logger.info("Handling value with ordinal (This is not KV number!): " + vcounter);
        }
        codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), input);
        aggs.aggregate(input, needAggrMeasures);
    }
    aggs.collectStates(result);

    ByteBuffer valueBuf = codec.encode(result);

    outputValue.set(valueBuf.array(), 0, valueBuf.position());
    context.write(key, outputValue);
}
Phase 7: Build Cube In-Mem
This algorithm builds the cube in memory; a conceptual sketch follows the step definition below.
protected void addInMemCubingSteps(final CubingJob result, String jobId, String cuboidRootPath) {
    // base cuboid job
    MapReduceExecutable cubeStep = new MapReduceExecutable();
    StringBuilder cmd = new StringBuilder();
    appendMapReduceParameters(cmd, JobEngineConfig.IN_MEM_JOB_CONF_SUFFIX);

    cubeStep.setName(ExecutableConstants.STEP_NAME_BUILD_IN_MEM_CUBE);

    appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
    appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
    appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, cuboidRootPath);
    appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME, "Kylin_Cube_Builder_" + seg.getRealization().getName());
    appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);
    if (seg.getCubeDesc().isShrunkenDictFromGlobalEnabled()) {
        appendExecCmdParameters(cmd, BatchConstants.ARG_SHRUNKEN_DICT_PATH, getShrunkenDictionaryPath(jobId));
    }

    cubeStep.setMapReduceParams(cmd.toString());
    cubeStep.setMapReduceJobClass(getInMemCuboidJob());
    result.addTask(cubeStep);
}
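For contrast with the layer-by-layer algorithm, the sketch below illustrates the in-memory idea (hypothetical data in plain Java, not the real InMemCuboidJob): instead of writing one cuboid level to disk per MR job, every dimension combination is aggregated in memory in a single pass over the input rows.

import java.util.*;

// A minimal sketch of in-memory cubing (hypothetical data, not the real InMemCuboidJob):
// every dimension combination (cuboid) is aggregated in memory in one pass over the rows.
public class InMemCubingSketch {
    public static void main(String[] args) {
        String[] dims = {"CITYID", "YEAR"};
        // rows: {CITYID, YEAR, MONEY}
        String[][] rows = {{"1", "2020", "300"}, {"1", "2021", "100"}, {"2", "2020", "200"}};

        // cuboid id (bitmask of kept dimensions) -> (group-by key -> SUM(MONEY))
        Map<Integer, Map<List<String>, Long>> cube = new TreeMap<>();
        for (String[] row : rows) {
            long money = Long.parseLong(row[dims.length]);
            for (int cuboid = 0; cuboid < (1 << dims.length); cuboid++) {
                List<String> key = new ArrayList<>();
                for (int d = 0; d < dims.length; d++) {
                    if ((cuboid & (1 << d)) != 0) {
                        key.add(row[d]);            // keep only the dimensions in this cuboid
                    }
                }
                cube.computeIfAbsent(cuboid, c -> new LinkedHashMap<>())
                        .merge(key, money, Long::sum);
            }
        }
        cube.forEach((cuboid, agg) -> System.out.println("cuboid " + cuboid + ": " + agg));
    }
}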
Phase 8: Convert Cuboid Data to HFile
Phase 9: Load HFile to HBase Table
Both steps are added by the HBase output side in addStepPhase3_BuildCube:
@Override
public void addStepPhase3_BuildCube(DefaultChainedExecutable jobFlow) {
    jobFlow.addTask(steps.createConvertCuboidToHfileStep(jobFlow.getId()));
    jobFlow.addTask(steps.createBulkLoadStep(jobFlow.getId()));
}
Phase 10: Update Cube Info
The cube metadata is updated and the cube build is complete.
Phase 11: Hive Cleanup
Drop the intermediate Hive table.