【ClickHouse源碼】ReplicatedMergeTree之表創建流程

ReplicatedMergeTree之表創建流程

複製流是後臺進行的，是異步的，在ReplicatedMergeTree創建時會自動啓動很多個異步task，可以看下其構造函數

/// Constructs a replicated MergeTree storage.
///
/// Forwards the MergeTree parameters to the MergeTreeData base, expands macros in the
/// ZooKeeper path and replica name, creates the background tasks, loads the data parts and
/// then follows one of two paths: CREATE (not attach) or ATTACH.
///
/// Throws NO_ZOOKEEPER when a non-attach creation is attempted without ZooKeeper, and
/// INCORRECT_DATA when a non-attach creation finds data parts already present.
/// The table is left in readonly mode (is_readonly = true) and the constructor returns early
/// when attaching without ZooKeeper or when the /metadata node is missing in ZooKeeper.
StorageReplicatedMergeTree::StorageReplicatedMergeTree(
    const String & zookeeper_path_,
    const String & replica_name_,
    bool attach,
    const StorageID & table_id_,
    const String & relative_data_path_,
    const StorageInMemoryMetadata & metadata,
    Context & context_,
    const String & date_column_name,
    const MergingParams & merging_params_,
    std::unique_ptr<MergeTreeSettings> settings_,
    bool has_force_restore_data_flag)
    : MergeTreeData(table_id_,
                    relative_data_path_,
                    metadata,
                    context_,
                    date_column_name,
                    merging_params_,
                    std::move(settings_),
                    true,                   /// require_part_metadata
                    attach,
                    [this] (const std::string & name) { enqueuePartForCheck(name); })
    , zookeeper_path(global_context.getMacros()->expand(zookeeper_path_, table_id_.database_name, table_id_.table_name))
    , replica_name(global_context.getMacros()->expand(replica_name_, table_id_.database_name, table_id_.table_name))
    , reader(*this)
    , writer(*this)
    , merger_mutator(*this, global_context.getBackgroundPool().getNumberOfThreads())
    , queue(*this)
    , fetcher(*this)
    , cleanup_thread(*this)
    , part_check_thread(*this)
    , restarting_thread(*this)
{
    // Normalize zookeeper_path: drop a trailing '/' and make sure it starts with '/'.
    // Nothing is created in ZooKeeper at this point; only the path strings are prepared.
    if (!zookeeper_path.empty() && zookeeper_path.back() == '/')
        zookeeper_path.resize(zookeeper_path.size() - 1);

    if (!zookeeper_path.empty() && zookeeper_path.front() != '/')
        zookeeper_path = "/" + zookeeper_path;
    // Path of this replica under the table's ZooKeeper path.
    replica_path = zookeeper_path + "/replicas/" + replica_name;

    // Task in the schedule pool that runs queueUpdatingTask().
    queue_updating_task = global_context.getSchedulePool().createTask(getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::queueUpdatingTask)", [this]{ queueUpdatingTask(); });
    
    // Task in the schedule pool that runs mutationsUpdatingTask().
    mutations_updating_task = global_context.getSchedulePool().createTask(getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsUpdatingTask)", [this]{ mutationsUpdatingTask(); });

    // Task in the schedule pool that runs mergeSelectingTask().
    merge_selecting_task = global_context.getSchedulePool().createTask(getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mergeSelectingTask)", [this] { mergeSelectingTask(); });
    /// Keep this task deactivated for now; it is activated later if this replica becomes leader.
    merge_selecting_task->deactivate();

    // Task in the schedule pool that runs mutationsFinalizingTask().
    mutations_finalizing_task = global_context.getSchedulePool().createTask(getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsFinalizingTask)", [this] { mutationsFinalizingTask(); });

    // current_zookeeper stays unset when the context has no ZooKeeper.
    if (global_context.hasZooKeeper())
        current_zookeeper = global_context.getZooKeeper();

    bool skip_sanity_checks = false;

    // Decide whether sanity checks are skipped (forced data restore): either the
    // flags/force_restore_data node exists under the replica path in ZooKeeper (it is removed
    // once seen), or has_force_restore_data_flag was passed in by the caller.
    if (current_zookeeper && current_zookeeper->exists(replica_path + "/flags/force_restore_data"))
    {
        skip_sanity_checks = true;
        current_zookeeper->remove(replica_path + "/flags/force_restore_data");

        LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag "
            << replica_path << "/flags/force_restore_data).");
    }
    else if (has_force_restore_data_flag)
    {
        skip_sanity_checks = true;

        LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag force_restore_data).");
    }

    // Load the table's data parts, passing skip_sanity_checks through.
    // NOTE(review): the original note described this as "restoring data from other replicas";
    // loadDataParts presumably loads parts already present in local storage - confirm.
    loadDataParts(skip_sanity_checks);

    // No ZooKeeper available: current_zookeeper was not set above because
    // global_context.hasZooKeeper() returned false.
    if (!current_zookeeper)
    {
        if (!attach)
            throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER);

        /// Do not activate the replica. It will be readonly.
        LOG_ERROR(log, "No ZooKeeper: table will be in readonly mode.");
        is_readonly = true;
        return;
    }

    // ATTACH, but the table's /metadata node does not exist in ZooKeeper.
    if (attach && !current_zookeeper->exists(zookeeper_path + "/metadata"))
    {
        LOG_WARNING(log, "No metadata in ZooKeeper: table will be in readonly mode.");
        is_readonly = true;
        return;
    }

    // CREATE path (not ATTACH).
    if (!attach)
    {
        // Data parts already present is an error for CREATE.
        if (!getDataParts().empty())
            throw Exception("Data directory for table already containing data parts - probably it was unclean DROP table or manual intervention. You must either clear directory by hand or use ATTACH TABLE instead of CREATE TABLE if you need to use that parts.", ErrorCodes::INCORRECT_DATA);

        // No parts present: continue with creation.
        // Create the table if it does not exist yet.
        createTableIfNotExists();
        // Check the table structure (metadata and columns) against the table's ZooKeeper path.
        checkTableStructure(zookeeper_path);

        // Take metadata_version from the version of the shared /metadata node.
        Coordination::Stat metadata_stat;
        current_zookeeper->get(zookeeper_path + "/metadata", &metadata_stat);
        metadata_version = metadata_stat.version;
        // Create the replica node and its children in ZooKeeper.
        createReplica();
    }
    else
    {
        // ATTACH path.
        // Check the table structure against the replica's own path, then check the data parts.
        checkTableStructure(replica_path);
        checkParts(skip_sanity_checks);

        // Prefer the replica's stored metadata_version; if that node does not exist, fall back
        // to the version of the shared /metadata node.
        if (current_zookeeper->exists(replica_path + "/metadata_version"))
        {
            metadata_version = parse<int>(current_zookeeper->get(replica_path + "/metadata_version"));
        }
        else 
        {
            Coordination::Stat metadata_stat;
            current_zookeeper->get(zookeeper_path + "/metadata", &metadata_stat);
            metadata_version = metadata_stat.version;
        }
        // Clear old temporary directories.
        // NOTE(review): the original note said "temporary nodes"; the callee name refers to
        // directories, and the meaning of the 0 argument is not visible here - confirm.
        clearOldTemporaryDirectories(0);
    }
    // Create the quorum, mutations and other related nodes in ZooKeeper.
    createNewZooKeeperNodes();
    other_replicas_fixed_granularity = checkFixedGranualrityInZookeeper();
}

通過以上代碼可以知道在創建了ReplicatedMergeTree時，就創建了4個TaskHolder，可以類似理解成一個線程池的執行器，添加的task都是在後臺SchedulePool中執行的，其各部分主要作用是

queue_updating_task：負責跟蹤所有副本日誌中的更新並將其加載到queue中

mutations_updating_task：負責跟蹤所有副本日誌中的更新並將其加載到mutations中

merge_selecting_task：負責merge任務的選擇

mutations_finalizing_task：負責標記mutation任務的狀態爲done

以queue_updating_task爲例，在創建表時將queueUpdatingTask()添加到了queue_updating_task中，所以會執行這個方法，這個方法主要是實現在表創建後觸發一次數據clone，因爲可能同分片其他副本已經存在同樣表了，在新副本創建該表就要及時同步數據，保證數據一致。

/// Body of queue_updating_task: pulls new replication log entries into the replica's queue
/// and records the start/finish timestamps of the update.
/// The exception handlers are elided ("...") in this excerpt.
void StorageReplicatedMergeTree::queueUpdatingTask()
{
    // Record the start time only if no update is already marked as in progress, so the
    // timestamp is not overwritten while the flag is still set from an earlier run.
    if (!queue_update_in_progress)
    {
        last_queue_update_start_time.store(time(nullptr));
        queue_update_in_progress = true;
    }
    try
    {
        // Per the original note: read the log pointer, fetch the missing log entries
        // (log-xxxxx under the log node) and add them to the queue node.
        // The task's watch callback is passed along with the ZooKeeper handle.
        queue.pullLogsToQueue(getZooKeeper(), queue_updating_task->getWatchCallback());
        // Success: record the finish time and clear the in-progress flag.
        last_queue_update_finish_time.store(time(nullptr));
        queue_update_in_progress = false;
    }
    catch (const Coordination::Exception & e)
    {
        ...
    }
    catch (...)
    {
        ...
    }
}

上面的過程不會循環執行，只會執行一次，如果失敗會做一些異常處理，但是異常處理並不能說返回異常就可以了，還要能夠有恢復正常的措施，可以找到在dbms/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp中，就有類似邏輯，它的功能是重新啓動ReplicatedMergeTree的後臺線程，重新做表初始化及zk session的創建，它的run()方法就會進入到以下流程中，看到Restarting這個名字也可以表明是這個意思

/// Tries to bring the replica up: activates it, loads and refreshes the replication queue,
/// optionally enters leader election, and starts the background tasks and threads.
/// Returns true when every step completed; the exception handler is elided ("...") in this
/// excerpt, so the failure path is not shown here.
bool ReplicatedMergeTreeRestartingThread::tryStartup()
{
    try
    {
        removeFailedQuorumParts();
        activateReplica();

        const auto & zookeeper = storage.getZooKeeper();
        const auto storage_settings = storage.getSettings();

        storage.cloneReplicaIfNeeded(zookeeper);

        storage.queue.load(zookeeper);

        // Per the original note: the steps above are preparatory (e.g. marking the replica as
        // active and deciding whether data must be cloned). From here the log pointer is read,
        // the missing log entries (log-xxxxx under the log node) are fetched and added to the
        // queue node.
        storage.queue.pullLogsToQueue(zookeeper);
        storage.queue.removeCurrentPartsFromMutations();
        storage.last_queue_update_finish_time.store(time(nullptr));

        updateQuorumIfWeHavePart();

        // Leader election is entered only when the replicated_can_become_leader setting allows it.
        if (storage_settings->replicated_can_become_leader)
            storage.enterLeaderElection();
        else
            LOG_INFO(log, "Will not enter leader election because replicated_can_become_leader=0");

        // Reset the partial-shutdown state before the background work is started.
        storage.partial_shutdown_called = false;
        storage.partial_shutdown_event.reset();

        // Activate and schedule the background tasks, then start the cleanup and
        // part-check threads.
        storage.queue_updating_task->activateAndSchedule();
        storage.mutations_updating_task->activateAndSchedule();
        storage.mutations_finalizing_task->activateAndSchedule();
        storage.cleanup_thread.start();
        storage.part_check_thread.start();

        return true;
    }
    catch (...)
    {
        ...
    }
}

至此，ReplicatedMergeTree的創建過程就全部完成了，也只是ReplicatedMergeTree的第一步，這部分主要是做了如下幾個點：

創建相關的本地表路徑及detached和format_version.txt
創建了相關的zk節點
根據不同條件load表的數據

這裏可能會有疑惑，代碼裏沒有創建本地表相關的代碼，如何實現創建的？

原因在這裏

class StorageReplicatedMergeTree : public ext::shared_ptr_helper<StorageReplicatedMergeTree>, public MergeTreeData

StorageReplicatedMergeTree繼承了MergeTreeData，仔細看上面的構造方法也做了相應的傳參，所以可以看下如下方法

/// Constructs the MergeTree state shared by the MergeTree storages (excerpt: parts of the body
/// are elided with "...").
///
/// The visible part of the body prepares the local table directories: for every data path it
/// creates the directory tree and the "detached" subdirectory, and locates the table's
/// format_version.txt file.
///
/// Throws CORRUPTED_DATA when format_version.txt is found under more than one data path.
MergeTreeData::MergeTreeData(
    const String & database_,
    const String & table_,
    const ColumnsDescription & columns_,
    const IndicesDescription & indices_,
    const ConstraintsDescription & constraints_,
    Context & context_,
    const String & date_column_name,
    const ASTPtr & partition_by_ast_,
    const ASTPtr & order_by_ast_,
    const ASTPtr & primary_key_ast_,
    const ASTPtr & sample_by_ast_,
    const ASTPtr & ttl_table_ast_,
    const MergingParams & merging_params_,
    std::unique_ptr<MergeTreeSettings> storage_settings_,
    bool require_part_metadata_,
    bool attach,
    BrokenPartCallback broken_part_callback_)
    : global_context(context_)
    , merging_params(merging_params_)
    , partition_by_ast(partition_by_ast_)
    , sample_by_ast(sample_by_ast_)
    , ttl_table_ast(ttl_table_ast_)
    , require_part_metadata(require_part_metadata_)
    , database_name(database_)
    , table_name(table_)
    , broken_part_callback(broken_part_callback_)
    , log_name(database_name + "." + table_name)
    , log(&Logger::get(log_name))
    , storage_settings(std::move(storage_settings_))
    , storage_policy(context_.getStoragePolicy(getSettings()->storage_policy))
    , data_parts_by_info(data_parts_indexes.get<TagByInfo>())
    , data_parts_by_state_and_info(data_parts_indexes.get<TagByStateAndInfo>())
    , parts_mover(this)
{
    ...

    // This is where the local table directories are created.
    auto paths = getDataPaths();
    for (const String & path : paths)
    {
        Poco::File(path).createDirectories();
        Poco::File(path + "detached").createDirectory();

        // Build the candidate path once so the existence check, the error message and the
        // stored value always name the same file (the log used to say "format_file.txt").
        const String candidate_version_file = path + "format_version.txt";
        if (Poco::File{candidate_version_file}.exists())
        {
            // A version file was already found under an earlier data path: refuse to continue.
            if (!version_file_path.empty())
            {
                LOG_ERROR(log, "Duplication of version file " << version_file_path << " and " << candidate_version_file);
                throw Exception("Multiple format_version.txt file", ErrorCodes::CORRUPTED_DATA);
            }
            version_file_path = candidate_version_file;
        }
    }

    ...
}

這裏主要是爲了瞭解ReplicatedMergeTree的創建，所以略去了一些關於MergeTree的邏輯，但是ReplicatedMergeTree也是基於MergeTree的，有興趣可以自行了解下。

【ClickHouse源碼】ReplicatedMergeTree之表創建流程

ReplicatedMergeTree之表創建流程

基於 Nginx Ingress + 雲效 AppStack 實現灰度發佈

12款高效開源Wiki系統推薦，打造團隊知識管理利器

C語言--右移左移

一個開源且全面的C#算法實戰教程

dotnet 基於 DirectML 控制檯運行 Phi-3 模型

自定義MyBatis插件

一款.NET開源、功能強大、跨平臺的繪圖庫 - OxyPlot

常用的 Git 指令

鼠標控制軟件有可能和虛擬機軟件產生衝突

sm4加密工具類

【ClickHouse系列】ClickHouse集羣自動化搭建

Golang數據庫連接池運行原理（源碼解析）

【ClickHouse源碼】ReplicatedMergeTree之數據同步流程

【ClickHouse源碼】Distributed之表select流程

PostgreSQL目錄結構之global目錄

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結