Merging and Flushing Data to Disk
This article traces how LevelDB data makes its way from memory into files on disk. As described in earlier articles, several situations can trigger LevelDB's merge-and-flush operation, i.e. a call to MaybeScheduleCompaction: opening the database (Open), reading data (Get), writing data (Write), and so on. This article analyzes the execution flow of that function.
Execution flow of MaybeScheduleCompaction
void DBImpl::MaybeScheduleCompaction() {
  mutex_.AssertHeld(); // verify the mutex is held
  if (background_compaction_scheduled_) { // a compaction is already scheduled
    // Already scheduled
  } else if (shutting_down_.load(std::memory_order_acquire)) { // shutting_down_ is set, the DB is closing
    // DB is being deleted; no more background compactions
  } else if (!bg_error_.ok()) { // a background error has occurred
    // Already got an error; no more changes
  } else if (imm_ == nullptr && manual_compaction_ == nullptr &&
             !versions_->NeedsCompaction()) { // no immutable memtable, no manual request, and no level needs compaction
    // No work to be done
  } else {
    background_compaction_scheduled_ = true; // mark a compaction as scheduled
    env_->Schedule(&DBImpl::BGWork, this); // queue BGWork on the background thread, passing this as its argument
  }
}
This function simply checks a few conditions and then decides whether env_->Schedule needs to be called.
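For context, here is a minimal, hypothetical sketch of one such call site, loosely modeled on DBImpl::MakeRoomForWrite in the write path: when the mutable memtable fills up, it is frozen into imm_, a fresh memtable is installed, and a compaction is scheduled. This is a simplified illustration, not the verbatim LevelDB code:
// Sketch (assumption): simplified from the write path's MakeRoomForWrite.
void DBImpl::FreezeMemTableSketch() {
  mutex_.AssertHeld();
  if (mem_->ApproximateMemoryUsage() > options_.write_buffer_size) {
    imm_ = mem_;                                // freeze the full memtable
    has_imm_.store(true, std::memory_order_release);
    mem_ = new MemTable(internal_comparator_);  // fresh mutable memtable
    mem_->Ref();
    MaybeScheduleCompaction();                  // schedule the flush
  }
}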
env_->Schedule: running work on a background thread
This article uses the POSIX platform, where the function in question is PosixEnv::Schedule:
void PosixEnv::Schedule(
    void (*background_work_function)(void* background_work_arg),
    void* background_work_arg) {
  background_work_mutex_.Lock(); // acquire the queue mutex
  // Start the background thread, if we haven't done so already.
  if (!started_background_thread_) { // has the background thread been started yet?
    started_background_thread_ = true; // if not, mark it started
    std::thread background_thread(PosixEnv::BackgroundThreadEntryPoint, this); // start the thread; it runs BackgroundThreadEntryPoint and waits for queued work
    background_thread.detach();
  }
  // If the queue is empty, the background thread may be waiting for work.
  if (background_work_queue_.empty()) { // signal the (possibly waiting) background thread
    background_work_cv_.Signal();
  }
  background_work_queue_.emplace(background_work_function, background_work_arg); // enqueue the function and its argument
  background_work_mutex_.Unlock();
}
void PosixEnv::BackgroundThreadMain() {
  while (true) {
    background_work_mutex_.Lock(); // acquire the queue mutex
    // Wait until there is work to be done.
    while (background_work_queue_.empty()) { // if the queue is empty, wait
      background_work_cv_.Wait();
    }
    assert(!background_work_queue_.empty()); // the queue must not be empty here
    auto background_work_function = background_work_queue_.front().function; // take the function at the head of the queue
    void* background_work_arg = background_work_queue_.front().arg; // and its argument
    background_work_queue_.pop(); // pop the entry
    background_work_mutex_.Unlock();
    background_work_function(background_work_arg); // run the callback outside the lock
  }
}
This shows that a dedicated background thread loops forever, waiting for work items to be pushed into the queue; whenever a callback is queued, the thread executes it. The callback queued here is BGWork.
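To make the pattern concrete, here is a minimal, self-contained sketch of the same single-consumer work queue using only the standard library. The names (WorkQueue, Post) are hypothetical; this illustrates the idea rather than reproducing LevelDB's code:
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>

class WorkQueue {
 public:
  // Start one detached consumer thread, mirroring PosixEnv's background thread.
  WorkQueue() { std::thread([this] { Run(); }).detach(); }

  void Post(std::function<void()> work) {
    std::lock_guard<std::mutex> lock(mu_);
    if (queue_.empty()) cv_.notify_one(); // consumer may be sleeping; same order as Schedule
    queue_.push(std::move(work));
  }

 private:
  void Run() {
    while (true) {
      std::unique_lock<std::mutex> lock(mu_);
      cv_.wait(lock, [this] { return !queue_.empty(); }); // sleep while empty
      std::function<void()> work = std::move(queue_.front());
      queue_.pop();
      lock.unlock(); // run the callback without holding the lock
      work();
    }
  }

  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> queue_;
};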
The BGWork function
void DBImpl::BGWork(void* db) {
  reinterpret_cast<DBImpl*>(db)->BackgroundCall(); // forward to the DBImpl's BackgroundCall
}
void DBImpl::BackgroundCall() {
  MutexLock l(&mutex_);
  assert(background_compaction_scheduled_);
  if (shutting_down_.load(std::memory_order_acquire)) { // is the DB shutting down?
    // No more background work when shutting down.
  } else if (!bg_error_.ok()) { // has a background error occurred?
    // No more background work after a background error.
  } else {
    BackgroundCompaction(); // perform the background compaction
  }
  background_compaction_scheduled_ = false; // clear the scheduled flag
  // Previous compaction may have produced too many files in a level,
  // so reschedule another compaction if needed.
  MaybeScheduleCompaction(); // check again; the compaction above may have created more work
  background_work_finished_signal_.SignalAll();
}
Barring shutdown or an error, this function reaches BackgroundCompaction, which does the actual work.
The BackgroundCompaction function
void DBImpl::BackgroundCompaction() {
  mutex_.AssertHeld();
  if (imm_ != nullptr) { // if there is an immutable memtable, flush it first
    CompactMemTable();
    return;
  }
  Compaction* c; // reaching here means there is no immutable memtable left to flush
  bool is_manual = (manual_compaction_ != nullptr); // normally nullptr
  InternalKey manual_end;
  if (is_manual) {
    ManualCompaction* m = manual_compaction_;
    c = versions_->CompactRange(m->level, m->begin, m->end); // pick the files in the requested range at this level
    m->done = (c == nullptr);
    if (c != nullptr) {
      manual_end = c->input(0, c->num_input_files(0) - 1)->largest; // largest key actually covered by this round
    }
    Log(options_.info_log,
        "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
        m->level, (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
        (m->end ? m->end->DebugString().c_str() : "(end)"),
        (m->done ? "(end)" : manual_end.DebugString().c_str()));
  } else {
    c = versions_->PickCompaction(); // pick the level/files most in need of compaction
  }
  Status status;
  if (c == nullptr) {
    // Nothing to do
  } else if (!is_manual && c->IsTrivialMove()) { // the file can simply be moved to the next level
    // Move file to next level
    assert(c->num_input_files(0) == 1);
    FileMetaData* f = c->input(0, 0);
    c->edit()->DeleteFile(c->level(), f->number); // remove the file from the current level
    c->edit()->AddFile(c->level() + 1, f->number, f->file_size, f->smallest,
                       f->largest); // and add it to the next level
    status = versions_->LogAndApply(c->edit(), &mutex_); // apply the edit and record it in the MANIFEST
    if (!status.ok()) {
      RecordBackgroundError(status); // record any error
    }
    VersionSet::LevelSummaryStorage tmp;
    Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
        static_cast<unsigned long long>(f->number), c->level() + 1,
        static_cast<unsigned long long>(f->file_size),
        status.ToString().c_str(), versions_->LevelSummary(&tmp));
  } else {
    CompactionState* compact = new CompactionState(c); // a real compaction: merge the input files
    status = DoCompactionWork(compact); // do the merge and write the output files
    if (!status.ok()) {
      RecordBackgroundError(status);
    }
    CleanupCompaction(compact); // release compaction state
    c->ReleaseInputs();
    DeleteObsoleteFiles(); // delete files that are no longer needed
  }
  delete c;
  if (status.ok()) {
    // Done
  } else if (shutting_down_.load(std::memory_order_acquire)) {
    // Ignore compaction errors found during shutting down
  } else {
    Log(options_.info_log, "Compaction error: %s", status.ToString().c_str());
  }
  if (is_manual) {
    ManualCompaction* m = manual_compaction_;
    if (!status.ok()) {
      m->done = true;
    }
    if (!m->done) {
      // We only compacted part of the requested range. Update *m
      // to the range that is left to be compacted.
      m->tmp_storage = manual_end;
      m->begin = &m->tmp_storage;
    }
    manual_compaction_ = nullptr;
  }
}
This function first flushes the immutable memtable to disk if one exists; the code below the imm_ != nullptr check only runs on a later invocation, once imm_ has been cleared. Let's look at CompactMemTable first:
void DBImpl::CompactMemTable() {
  mutex_.AssertHeld();
  assert(imm_ != nullptr); // there must be an immutable memtable
  // Save the contents of the memtable as a new Table
  VersionEdit edit;
  Version* base = versions_->current(); // get the current version
  base->Ref();
  Status s = WriteLevel0Table(imm_, &edit, base); // write the memtable out as a level-0 table
  base->Unref();
  if (s.ok() && shutting_down_.load(std::memory_order_acquire)) {
    s = Status::IOError("Deleting DB during memtable compaction");
  }
  // Replace immutable memtable with the generated Table
  if (s.ok()) {
    edit.SetPrevLogNumber(0); // no previous log remains
    edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed
    s = versions_->LogAndApply(&edit, &mutex_); // apply the edit and record it in the MANIFEST
  }
  if (s.ok()) {
    // Commit to the new state
    imm_->Unref();
    imm_ = nullptr; // clear the immutable memtable
    has_imm_.store(false, std::memory_order_release); // publish the change
    DeleteObsoleteFiles(); // delete files that are no longer needed
  } else {
    RecordBackgroundError(s); // record the failure
  }
}
The key steps here are triggering the level-0 write and then clearing the immutable memtable.
WriteLevel0Table: writing a level-0 table
Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
                                Version* base) {
  mutex_.AssertHeld();
  const uint64_t start_micros = env_->NowMicros(); // record the start time
  FileMetaData meta;
  meta.number = versions_->NewFileNumber(); // allocate a new file number (a simple incrementing counter)
  pending_outputs_.insert(meta.number);
  Iterator* iter = mem->NewIterator(); // iterator over the memtable
  Log(options_.info_log, "Level-0 table #%llu: started",
      (unsigned long long)meta.number);
  Status s;
  {
    mutex_.Unlock();
    s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); // BuildTable writes the data into the file
    mutex_.Lock();
  }
  Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
      (unsigned long long)meta.number, (unsigned long long)meta.file_size,
      s.ToString().c_str());
  delete iter;
  pending_outputs_.erase(meta.number);
  // Note that if file_size is zero, the file has been deleted and
  // should not be added to the manifest.
  int level = 0;
  if (s.ok() && meta.file_size > 0) { // if the file is non-empty
    const Slice min_user_key = meta.smallest.user_key(); // smallest key in the table
    const Slice max_user_key = meta.largest.user_key(); // largest key in the table
    if (base != nullptr) {
      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); // choose the level (may be >0 if nothing overlaps)
    }
    edit->AddFile(level, meta.number, meta.file_size, meta.smallest,
                  meta.largest); // record the new file at that level
  }
  CompactionStats stats;
  stats.micros = env_->NowMicros() - start_micros; // elapsed time for the write
  stats.bytes_written = meta.file_size;
  stats_[level].Add(stats); // accumulate per-level statistics
  return s;
}
The main writing work is delegated to BuildTable.
The BuildTable function
Status BuildTable(const std::string& dbname, Env* env, const Options& options,
                  TableCache* table_cache, Iterator* iter, FileMetaData* meta) {
  Status s;
  meta->file_size = 0;
  iter->SeekToFirst();
  std::string fname = TableFileName(dbname, meta->number); // derive the table file name
  if (iter->Valid()) {
    WritableFile* file;
    s = env->NewWritableFile(fname, &file); // create the output file
    if (!s.ok()) {
      return s; // bail out on failure
    }
    TableBuilder* builder = new TableBuilder(options, file); // create a TableBuilder for the file
    meta->smallest.DecodeFrom(iter->key()); // the first key is the smallest
    for (; iter->Valid(); iter->Next()) { // iterate over the memtable in order
      Slice key = iter->key();
      meta->largest.DecodeFrom(key); // the last key seen is the largest
      builder->Add(key, iter->value()); // add the key/value pair
    }
    // Finish and check for builder errors
    s = builder->Finish(); // write out the remaining blocks and footer
    if (s.ok()) {
      meta->file_size = builder->FileSize(); // record the file size
      assert(meta->file_size > 0);
    }
    delete builder;
    // Finish and check for file errors
    if (s.ok()) {
      s = file->Sync(); // sync the file to disk
    }
    if (s.ok()) {
      s = file->Close(); // close the file
    }
    delete file;
    file = nullptr;
    if (s.ok()) {
      // Verify that the table is usable
      Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number,
                                              meta->file_size); // opening it also loads it into the table cache
      s = it->status();
      delete it;
    }
  }
  // Check for input iterator errors
  if (!iter->status().ok()) { // propagate iterator errors
    s = iter->status();
  }
  if (s.ok() && meta->file_size > 0) {
    // Keep it
  } else {
    env->DeleteFile(fname);
  }
  return s;
}
This function uses a TableBuilder to write the key/value pairs, in sorted order, into the file; after writing, it syncs the file to disk and verifies the new table through the table cache (which also loads it into the cache). The TableBuilder Add and Finish flows are as follows.
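As an aside, here is a minimal, hedged sketch of driving TableBuilder directly; the file path is made up and error handling is trimmed. With the default comparator, plain user keys work, whereas BuildTable feeds it internal keys:
#include "leveldb/env.h"
#include "leveldb/options.h"
#include "leveldb/table_builder.h"

void BuildDemoTable() {
  leveldb::Options options;
  leveldb::WritableFile* file;
  leveldb::Status s = options.env->NewWritableFile("/tmp/demo.ldb", &file);
  if (!s.ok()) return;
  leveldb::TableBuilder builder(options, file);
  builder.Add("apple", "red");     // keys must be added in increasing order
  builder.Add("banana", "yellow");
  s = builder.Finish();            // writes filter/metaindex/index blocks + footer
  if (s.ok()) s = file->Sync();
  if (s.ok()) s = file->Close();
  delete file;
}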
void TableBuilder::Add(const Slice& key, const Slice& value) {
  Rep* r = rep_;
  assert(!r->closed);
  if (!ok()) return;
  if (r->num_entries > 0) {
    assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); // keys must arrive in strictly increasing order
  }
  if (r->pending_index_entry) {
    assert(r->data_block.empty());
    r->options.comparator->FindShortestSeparator(&r->last_key, key); // shorten last_key to a separator below key
    std::string handle_encoding;
    r->pending_handle.EncodeTo(&handle_encoding);
    r->index_block.Add(r->last_key, Slice(handle_encoding)); // index entry for the previous data block
    r->pending_index_entry = false;
  }
  if (r->filter_block != nullptr) { // add the key to the filter block
    r->filter_block->AddKey(key);
  }
  r->last_key.assign(key.data(), key.size()); // remember the last key added
  r->num_entries++;
  r->data_block.Add(key, value); // append the key/value pair to the current data block
  const size_t estimated_block_size = r->data_block.CurrentSizeEstimate();
  if (estimated_block_size >= r->options.block_size) {
    Flush();
  }
}
void TableBuilder::Flush() {
  Rep* r = rep_;
  assert(!r->closed);
  if (!ok()) return;
  if (r->data_block.empty()) return; // nothing to flush
  assert(!r->pending_index_entry);
  WriteBlock(&r->data_block, &r->pending_handle); // write the data block
  if (ok()) {
    r->pending_index_entry = true; // the next Add must first emit an index entry for this block
    r->status = r->file->Flush(); // flush the file buffer
  }
  if (r->filter_block != nullptr) {
    r->filter_block->StartBlock(r->offset); // start a new filter section at the current offset
  }
}
void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
  // File format contains a sequence of blocks where each block has:
  //    block_data: uint8[n]
  //    type: uint8
  //    crc: uint32
  assert(ok());
  Rep* r = rep_;
  Slice raw = block->Finish(); // finalize the block contents
  Slice block_contents;
  CompressionType type = r->options.compression; // the configured compression type
  // TODO(postrelease): Support more compression options: zlib?
  switch (type) {
    case kNoCompression:
      block_contents = raw;
      break;
    case kSnappyCompression: {
      std::string* compressed = &r->compressed_output;
      if (port::Snappy_Compress(raw.data(), raw.size(), compressed) &&
          compressed->size() < raw.size() - (raw.size() / 8u)) {
        block_contents = *compressed;
      } else {
        // Snappy not supported, or compressed less than 12.5%, so just
        // store uncompressed form
        block_contents = raw;
        type = kNoCompression;
      }
      break;
    }
  }
  WriteRawBlock(block_contents, type, handle); // write the (possibly compressed) contents plus the type
  r->compressed_output.clear();
  block->Reset();
}
void TableBuilder::WriteRawBlock(const Slice& block_contents,
                                 CompressionType type, BlockHandle* handle) {
  Rep* r = rep_;
  handle->set_offset(r->offset); // record where this block starts
  handle->set_size(block_contents.size());
  r->status = r->file->Append(block_contents); // append the block contents
  if (r->status.ok()) {
    char trailer[kBlockTrailerSize];
    trailer[0] = type;
    uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size()); // checksum over the contents
    crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type
    EncodeFixed32(trailer + 1, crc32c::Mask(crc));
    r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); // append the trailer (type + masked crc)
    if (r->status.ok()) {
      r->offset += block_contents.size() + kBlockTrailerSize; // advance past the block and its trailer
    }
  }
}
As the code shows, each block is optionally compressed before being written, and a checksum trailer is appended after it. Once all the key/value data has been written this way, Finish is called to append the trailing metadata:
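For reference, a sketch of the 5-byte trailer that WriteRawBlock appends after every block; the Mask formula below is my reading of leveldb's crc32c.h and should be treated as an assumption:
// Layout of one block on disk:
//   block_contents : n bytes (raw, or snappy-compressed)
//   trailer[0]     : compression type (kNoCompression = 0, kSnappyCompression = 1)
//   trailer[1..4]  : little-endian masked crc32c of contents + type byte
//
// Masking rotates the crc and adds a constant so that computing a crc over
// data that itself contains embedded crcs stays well-behaved:
inline uint32_t Mask(uint32_t crc) {
  return ((crc >> 15) | (crc << 17)) + 0xa282ead8ul; // rotate right 15 bits, add delta
}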
Status TableBuilder::Finish() {
  Rep* r = rep_;
  Flush();
  assert(!r->closed);
  r->closed = true;
  BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle;
  // Write filter block
  if (ok() && r->filter_block != nullptr) {
    WriteRawBlock(r->filter_block->Finish(), kNoCompression,
                  &filter_block_handle); // write the filter block, uncompressed
  }
  // Write metaindex block
  if (ok()) {
    BlockBuilder meta_index_block(&r->options); // block mapping meta block names to their handles
    if (r->filter_block != nullptr) {
      // Add mapping from "filter.Name" to location of filter data
      std::string key = "filter.";
      key.append(r->options.filter_policy->Name());
      std::string handle_encoding;
      filter_block_handle.EncodeTo(&handle_encoding);
      meta_index_block.Add(key, handle_encoding);
    }
    // TODO(postrelease): Add stats and other meta blocks
    WriteBlock(&meta_index_block, &metaindex_block_handle); // write the metaindex block
  }
  // Write index block
  if (ok()) { // write the index block
    if (r->pending_index_entry) {
      r->options.comparator->FindShortSuccessor(&r->last_key); // shorten the last key to a successor
      std::string handle_encoding;
      r->pending_handle.EncodeTo(&handle_encoding);
      r->index_block.Add(r->last_key, Slice(handle_encoding)); // index entry for the final data block
      r->pending_index_entry = false;
    }
    WriteBlock(&r->index_block, &index_block_handle); // write the index block
  }
  // Write footer
  if (ok()) {
    Footer footer;
    footer.set_metaindex_handle(metaindex_block_handle); // record the metaindex block handle
    footer.set_index_handle(index_block_handle); // record the index block handle
    std::string footer_encoding;
    footer.EncodeTo(&footer_encoding);
    r->status = r->file->Append(footer_encoding); // append the footer
    if (r->status.ok()) {
      r->offset += footer_encoding.size();
    }
  }
  return r->status;
}
At this point all of the table's data and metadata have been written to the file, and the basic flush flow is complete.
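Putting the pieces together, the resulting file layout (as documented in leveldb's doc/table_format.md) looks roughly like this:
<data block 1>
...
<data block N>
<filter block (meta block)>
<metaindex block>   // maps "filter.<policy name>" to the filter block's handle
<index block>       // maps a separator key per data block to that block's handle
<footer>            // fixed-size; holds the metaindex and index block handles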
Summary
This article sketched the basic flow of merging data and writing it to disk: a background thread waits on a queue and, whenever a callback is pushed in, executes the merge-and-flush work. Only the flush path itself was analyzed here; the details of how levels are chosen for compaction are extensive and may be covered in a follow-up article. As my knowledge is limited, corrections are welcome.