【ClickHouse源碼】ReplicatedMergeTree之insert流程

ReplicatedMergeTree之insert流程

核心邏輯位於 dbms/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp 的 write 方法中:

void ReplicatedMergeTreeBlockOutputStream::write(const Block & block)
{
    last_block_is_duplicate = false;

    /// 判斷是否執行延遲插入
    storage.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event);
    // 獲取zk連接並判斷是否過期
    auto zookeeper = storage.getZooKeeper();
    assertSessionIsNotExpired(zookeeper);

    // 這裏主要是判斷是否設定了多副本提交驗證,並執行一些驗證和判斷邏輯
    if (quorum)
        checkQuorumPrecondition(zookeeper);
    // 根據最大塊大小限制(64K~1M),將block拆分爲多個,
    auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block);

    for (auto & current_block : part_blocks)
    {
        Stopwatch watch;

        // 將current_block寫入臨時part,並做checksum校驗
        MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block);

        String block_id;
        // 如果不重複
        if (deduplicate)
        {
            SipHash hash;
            part->checksums.computeTotalChecksumDataOnly(hash);
            union
            {
                char bytes[16];
                UInt64 words[2];
            } hash_value;
            hash.get128(hash_value.bytes);

            // 生成block_id
            block_id = part->info.partition_id + "_" + toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]);

            LOG_DEBUG(log, "Wrote block with ID '" << block_id << "', " << block.rows() << " rows");
        }
        else
        {
            LOG_DEBUG(log, "Wrote block with " << block.rows() << " rows");
        }

        try
        {
            // 提交part到zk
            commitPart(zookeeper, part, block_id);

            int error = (deduplicate && last_block_is_duplicate) ? ErrorCodes::INSERT_WAS_DEDUPLICATED : 0;
            PartLog::addNewPart(storage.global_context, part, watch.elapsed(), ExecutionStatus(error));
        }
        catch (...)
        {
            PartLog::addNewPart(storage.global_context, part, watch.elapsed(), ExecutionStatus::fromCurrentException(__PRETTY_FUNCTION__));
            throw;
        }
    }
}

再來看一下 commitPart 方法。這個方法比較長,請耐心閱讀:

/// Registers a freshly written temporary part in ZooKeeper and adds it to the local set of parts.
///
/// @param zookeeper  ZooKeeper session used for every request; checked for expiry on entry.
/// @param part       The temporary part. Its block range, level and name are assigned here.
/// @param block_id   Deduplication ID of the block; an empty string disables deduplication.
///
/// If the block turns out to be a duplicate, part->is_duplicate and last_block_is_duplicate are set
/// and the function returns without throwing.
///
/// @throws Exception with UNKNOWN_STATUS_OF_INSERT on connection loss or operation timeout,
///         UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE when the quorum status node already exists,
///         and UNEXPECTED_ZOOKEEPER_ERROR for every other ZooKeeper failure.
///
/// NOTE: the `......` markers below stand for quorum-related code that is elided from this listing.
void ReplicatedMergeTreeBlockOutputStream::commitPart(zkutil::ZooKeeperPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, const String & block_id)
{
    // Check the part's columns via storage.check() and make sure the ZooKeeper session is still alive.
    storage.check(part->getColumns());
    assertSessionIsNotExpired(zookeeper);

    /// Allocate a block number. When deduplicating, the path of the block_id node is passed along,
    /// and an empty lock returned from allocateBlockNumber() is treated below as "block already exists".
    bool deduplicate_block = !block_id.empty();
    String block_id_path = deduplicate_block ? storage.zookeeper_path + "/blocks/" + block_id : "";
    auto block_number_lock = storage.allocateBlockNumber(part->info.partition_id, zookeeper, block_id_path);

    // Duplicate: log it, flag the part and this stream as duplicate, bump the profile counter and stop.
    if (!block_number_lock)
    {
        LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it.");
        part->is_duplicate = true;
        last_block_is_duplicate = true;
        ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
        return;
    }

    // Not a duplicate: continue with the allocated block number.
    Int64 block_number = block_number_lock->getNumber();

    // A newly inserted part covers exactly one block number and is at level 0; derive its name from that info.
    part->info.min_block = block_number;
    part->info.max_block = block_number;
    part->info.level = 0;

    String part_name = part->getNewName(part->info);
    part->name = part_name;

    // Build the GET_PART log entry that is appended to the shared replication log below.
    StorageReplicatedMergeTree::LogEntry log_entry;
    log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART;
    log_entry.create_time = time(nullptr);
    log_entry.source_replica = storage.replica_name;
    log_entry.new_part_name = part_name;
    log_entry.quorum = quorum;
    log_entry.block_id = block_id;

    // Collect all ZooKeeper operations so that they can be sent as a single multi-request.
    Coordination::Requests ops;

    storage.getCommitPartOps(ops, part, block_id_path);

    /// Create the log entry as a persistent sequential node under <zookeeper_path>/log/log-.
    ops.emplace_back(zkutil::makeCreateRequest(
        storage.zookeeper_path + "/log/log-",
        log_entry.toString(),
        zkutil::CreateMode::PersistentSequential));

    // Releasing the block number lock is part of the same multi-request.
    block_number_lock->getUnlockOps(ops);

    // Quorum-specific operations; elided in this listing because they do not affect the main flow.
    if (quorum)
    {
        ......
    }

    // Wrap the local rename in a transaction so that it can be committed or rolled back
    // depending on the outcome of the ZooKeeper multi-request below.
    MergeTreeData::Transaction transaction(storage);   
    // Rename the temporary part to its final name and add it to the storage as part of the transaction.
    storage.renameTempPartAndAdd(part, nullptr, &transaction);

    Coordination::Responses responses;
    int32_t multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT

    if (multi_code == Coordination::ZOK)
    {
        transaction.commit();
        // Schedule the task that selects parts to merge.
        storage.merge_selecting_task->schedule();
        // The unlock operations were included in the multi-request above, so just mark the lock as released.
        block_number_lock->assumeUnlocked();
    }
    else if (multi_code == Coordination::ZCONNECTIONLOSS
        || multi_code == Coordination::ZOPERATIONTIMEOUT)
    {
        // Connection loss or timeout: the multi-request may or may not have been applied in ZooKeeper,
        // so the part is kept locally (commit) instead of being rolled back.
        transaction.commit();
        // Enqueue a check of this part, passing MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER.
        storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);

        // The outcome is unknown; report that to the client, which must retry.
        throw Exception("Unknown status, client must retry. Reason: " + String(Coordination::errorMessage(multi_code)),
            ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
    }
    else if (Coordination::isUserError(multi_code))
    {
        // Find out which operation of the multi-request failed first.
        String failed_op_path = zkutil::KeeperMultiException(multi_code, ops, responses).getPathForFirstFailedOp();

        if (multi_code == Coordination::ZNODEEXISTS && deduplicate_block && failed_op_path == block_id_path)
        {
            // The block_id node already exists, i.e. the same block was inserted in the meantime:
            // roll back the local part and treat the insert as a duplicate (no exception).
            LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it (removing part " << part->name << ")");

            part->is_duplicate = true;
            transaction.rollback();
            last_block_is_duplicate = true;
            ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
        }
        else if (multi_code == Coordination::ZNODEEXISTS && failed_op_path == quorum_info.status_path)
        {
            // The quorum status node already exists: another quorum insert has been started. Roll back and fail.
            // NOTE(review): quorum_info is presumably declared in the elided quorum section above - confirm.
            transaction.rollback();

            throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
        }
        else
        {
            // Any other user-level error is unexpected: roll back and fail.
            transaction.rollback();
            throw Exception("Unexpected logical error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
                            + zkutil::ZooKeeper::error2string(multi_code) + ", path " + failed_op_path,
                            ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
        }
    }
    else if (Coordination::isHardwareError(multi_code))
    {
        // Remaining errors classified by isHardwareError(); reported as an unrecoverable network error. Roll back and fail.
        transaction.rollback();
        throw Exception("Unrecoverable network error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
                        + zkutil::ZooKeeper::error2string(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
    }
    else
    {
        // Any error code not matched above: roll back and fail.
        transaction.rollback();
        throw Exception("Unexpected ZooKeeper error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
                        + zkutil::ZooKeeper::error2string(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
    }

    if (quorum)
    {
        // Wait for the quorum to be satisfied; the waiting code itself is elided in this listing.
        LOG_TRACE(log, "Waiting for quorum");

        String quorum_status_path = storage.zookeeper_path + "/quorum/status";

        ......
        
        LOG_TRACE(log, "Quorum satisfied");
    }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章