前面幾篇簡單介紹了LevelDB中的數據結構和存儲方式(log、MemTable、sstable)。這一章節來解讀一下LevelDB最核心的內容——合併操作與版本控制。
Compaction
按照LevelDB的框架思路,Compaction有兩種使用方式,一種是MemTable -> Level0,稱爲Minor Compaction,另一種是Level n -> Level n+1(例如Level0 -> Level1),稱爲Major Compaction,這裏會分別進行敘述,看看其具體實現。
整個函數調用過程如下圖所示。
Minor Compaction
這一部分的Compaction較爲簡單,其邏輯主要是:首先從VersionSet(即versions_)中得到當前的Version(這個在後面會敘述),然後直接將Immutable MemTable寫入level0層,並通過VersionEdit記錄這次變更。
//minor Compaction
void DBImpl::CompactMemTable() {
mutex_.AssertHeld();
assert(imm_ != nullptr);
// Save the contents of the memtable as a new Table
VersionEdit edit;
Version* base = versions_->current();
base->Ref();
//將Immutable MemTable寫入到Level0
Status s = WriteLevel0Table(imm_, &edit, base);
base->Unref();
if (s.ok() && shutting_down_.load(std::memory_order_acquire)) {
s = Status::IOError("Deleting DB during memtable compaction");
}
// Replace immutable memtable with the generated Table
//設置VersionEdit的參數
if (s.ok()) {
edit.SetPrevLogNumber(0);
edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed
s = versions_->LogAndApply(&edit, &mutex_);
}
//最後將Immutable 指針置空
if (s.ok()) {
// Commit to the new state
imm_->Unref();
imm_ = nullptr;
has_imm_.store(false, std::memory_order_release);
DeleteObsoleteFiles();
} else {
RecordBackgroundError(s);
}
}
接下來就是具體實現。WriteLevel0Table()裏面有一點需要注意:原理上,每次Minor Compaction只會將Immutable MemTable合併到level0,但源碼內部實際上會調用PickLevelForMemTableOutput()函數,根據新文件的最小key與最大key範圍選擇一個合適的level(不一定是level0)進行插入,這樣可以減少Compaction操作的次數。
// Builds a table file from the contents of `mem` and, if the file is
// non-empty, records it in `edit`.
//
//   mem:  memtable whose entries are iterated to build the file.
//   edit: receives an AddFile() entry for the new file on success.
//   base: if non-null, used to choose the output level through
//         PickLevelForMemTableOutput(); otherwise the file is placed in
//         level 0.
//
// Returns the status produced by BuildTable().
// REQUIRES: mutex_ is held on entry. The mutex is released while
// BuildTable() runs and re-acquired before returning.
Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
                                Version* base) {
  mutex_.AssertHeld();
  const uint64_t start_micros = env_->NowMicros();
  FileMetaData meta;
  meta.number = versions_->NewFileNumber();
  // Track the file number as a pending output while the file is being built.
  pending_outputs_.insert(meta.number);
  Iterator* iter = mem->NewIterator();
  Log(options_.info_log, "Level-0 table #%llu: started",
      (unsigned long long)meta.number);

  // BuildTable() writes the file; the mutex is dropped around the call.
  Status s;
  {
    mutex_.Unlock();
    s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
    mutex_.Lock();
  }

  // The size argument is cast to unsigned long long, so it is printed with
  // %llu to match the argument type.
  Log(options_.info_log, "Level-0 table #%llu: %llu bytes %s",
      (unsigned long long)meta.number, (unsigned long long)meta.file_size,
      s.ToString().c_str());
  delete iter;
  pending_outputs_.erase(meta.number);

  // Note that if file_size is zero, the file has been deleted and
  // should not be added to the manifest.
  int level = 0;
  if (s.ok() && meta.file_size > 0) {
    const Slice min_user_key = meta.smallest.user_key();
    const Slice max_user_key = meta.largest.user_key();
    if (base != nullptr) {
      // Let the base version pick the level for the new file (level 0 is
      // the default when no base version is supplied).
      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
    }
    // Record the new file in the VersionEdit.
    edit->AddFile(level, meta.number, meta.file_size, meta.smallest,
                  meta.largest);
  }

  // Account the elapsed time and bytes written against the chosen level.
  CompactionStats stats;
  stats.micros = env_->NowMicros() - start_micros;
  stats.bytes_written = meta.file_size;
  stats_[level].Add(stats);
  return s;
}
Major Compaction
這一部分是level n -> level n+1的關鍵。這裏先整理一下Major Compaction的思路。
Major compaction的過程如下:對多個文件採用多路歸併排序的方式,依次找出其中最小的Key記錄,也就是對多個文件中的所有記錄重新進行排序。之後採取一定的標準判斷(比如如果在低於L層中有key則說明有更新的數據,這個沒有保存價值)這個Key是否還需要保存,如果判斷沒有保存價值,那麼直接拋掉,如果覺得還需要繼續保存,那麼就將其寫入level L+1層中新生成的一個SSTable文件中。就這樣對KV數據一一處理,形成了一系列新的L+1層數據文件,之前的L層文件和L+1層參與compaction 的文件數據此時已經沒有意義了,所以全部刪除。這樣就完成了L層和L+1層文件記錄的合併過程。
// Performs the work of a major compaction described by `compact`: iterates
// over all input entries through one merged iterator, drops entries that are
// no longer needed, writes the remaining entries to output files, and finally
// installs the results. Returns the first error encountered, or OK.
// NOTE(review): presumably called with mutex_ held -- the mutex is unlocked
// before the main loop and re-locked before the results are installed.
Status DBImpl::DoCompactionWork(CompactionState* compact) {
const uint64_t start_micros = env_->NowMicros();
int64_t imm_micros = 0; // Micros spent doing imm_ compactions
...
// Choose the sequence number used as the drop threshold below: the oldest
// live snapshot if any exist, otherwise the last sequence number.
if (snapshots_.empty()) {
compact->smallest_snapshot = versions_->LastSequence();
} else {
compact->smallest_snapshot = snapshots_.oldest()->sequence_number();
}
// Iterator over the entries of all input files of this compaction.
Iterator* input = versions_->MakeInputIterator(compact->compaction);
// Release mutex while we're actually doing the compaction work
mutex_.Unlock();
input->SeekToFirst();
Status status;
ParsedInternalKey ikey;
std::string current_user_key;
bool has_current_user_key = false;
SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
while (input->Valid() && !shutting_down_.load(std::memory_order_acquire)) {
// Prioritize immutable compaction work
// (1) If an immutable memtable is pending, compact it first.
if (has_imm_.load(std::memory_order_relaxed)) {
const uint64_t imm_start = env_->NowMicros();
mutex_.Lock();
if (imm_ != nullptr) {
// Re-check under the lock; if the immutable memtable still exists,
// run the memtable compaction.
CompactMemTable();
// Wake up MakeRoomForWrite() if necessary.
background_work_finished_signal_.SignalAll();
}
mutex_.Unlock();
// Time spent here is subtracted from this compaction's stats later.
imm_micros += (env_->NowMicros() - imm_start);
}
Slice key = input->key();
// Per the original author's note, ShouldStopBefore() reports that the file
// currently being built (for level+1) overlaps too much with level+2; in
// that case the current output file is finished before this key is added.
if (compact->compaction->ShouldStopBefore(key) &&
compact->builder != nullptr) {
status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) {
break;
}
}
// Handle key/value, add to state, etc.
bool drop = false;
// Decode the internal key.
if (!ParseInternalKey(key, &ikey)) {
// Do not hide error keys
current_user_key.clear();
has_current_user_key = false;
last_sequence_for_key = kMaxSequenceNumber;
} else {
// Decoding succeeded.
if (!has_current_user_key ||
user_comparator()->Compare(ikey.user_key, Slice(current_user_key)) !=
0) {
// First occurrence of this user key
current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
has_current_user_key = true;
last_sequence_for_key = kMaxSequenceNumber;
}
if (last_sequence_for_key <= compact->smallest_snapshot) {
// Hidden by an newer entry for same user key
drop = true; // (A)
} else if (ikey.type == kTypeDeletion &&
ikey.sequence <= compact->smallest_snapshot &&
compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger sequence numbers
// (3) data in layers that are being compacted here and have
// smaller sequence numbers will be dropped in the next
// few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
drop = true;
}
last_sequence_for_key = ikey.sequence;
}
if (!drop) {
// Open output file if necessary
// (no builder exists on the first kept entry or right after an output
// file has been finished).
if (compact->builder == nullptr) {
status = OpenCompactionOutputFile(compact);
if (!status.ok()) {
break;
}
}
// The first entry added to an output file is its smallest key.
if (compact->builder->NumEntries() == 0) {
compact->current_output()->smallest.DecodeFrom(key);
}
// Every entry added so far becomes the current largest key.
compact->current_output()->largest.DecodeFrom(key);
compact->builder->Add(key, input->value());
// Close output file if it is big enough
// (its size has reached MaxOutputFileSize()).
if (compact->builder->FileSize() >=
compact->compaction->MaxOutputFileSize()) {
status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) {
break;
}
}
}
// Advance to the next input entry.
input->Next();
}
// Check the final state and finish any output file that is still open.
if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
status = Status::IOError("Deleting DB during compaction");
}
if (status.ok() && compact->builder != nullptr) {
status = FinishCompactionOutputFile(compact, input);
}
if (status.ok()) {
status = input->status();
}
delete input;
input = nullptr;
// Collect statistics for this compaction; time spent compacting the
// immutable memtable is excluded.
CompactionStats stats;
stats.micros = env_->NowMicros() - start_micros - imm_micros;
for (int which = 0; which < 2; which++) {
for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
// Total size of the files read by this compaction.
stats.bytes_read += compact->compaction->input(which, i)->file_size;
}
}
for (size_t i = 0; i < compact->outputs.size(); i++) {
// Total size of the files written by this compaction.
stats.bytes_written += compact->outputs[i].file_size;
}
mutex_.Lock();
// Stats are recorded against the level after the compaction's input level.
stats_[compact->compaction->level() + 1].Add(stats);
if (status.ok()) {
status = InstallCompactionResults(compact);
}
if (!status.ok()) {
RecordBackgroundError(status);
}
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log, "compacted to: %s", versions_->LevelSummary(&tmp));
return status;
}
Version版本控制
LevelDB的版本管理策略十分優秀,這也是其精髓。每當sstable發生改變、增加、刪除的時候,LevelDB就會生成一個新的Version,而管理不同的Version就需要一個版本控制策略。首先來介紹幾個相關文件:
- MANIFEST文件。記錄了當前版本與上次的版本不同之處,內容包括:增加了的sstable,刪除的sstable,當前版本日誌。
- CURRENT文件。文件中只有一行記錄就是當前系統版本(MANIFEST文件)
下面記錄四個類:FileMetaData,Version,VersionSet,VersionEdit。這幾個類實現了Version的記錄、切換功能。結構體之間的關係如下圖所示,這裏只分析裏面存儲的成員變量,成員函數的實現基本上就是對其進行操作。
FileMetaData是用來表示sstable的元數據。
// Metadata describing a single sstable file.
struct FileMetaData {
  // All scalar members are initialized, including `number`, so that a
  // default-constructed object never carries an indeterminate file number.
  FileMetaData() : refs(0), allowed_seeks(1 << 30), number(0), file_size(0) {}

  int refs;              // Reference count (per the original note: how many
                         // versions refer to this sstable)
  int allowed_seeks;     // Seeks allowed until compaction
  uint64_t number;       // File number of the sstable
  uint64_t file_size;    // File size in bytes
  InternalKey smallest;  // Smallest internal key served by table
  InternalKey largest;   // Largest internal key served by table
};
Version表示版本信息結構體。files_是一個二維數組,存儲全局信息,即每個level有哪些sstable的信息。
// Excerpt of the Version class showing its data members only; other members
// are elided with "...". Holds the per-level lists of file metadata together
// with the bookkeeping used to decide the next compaction.
class Version {
public:
...
private:
...
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version
// List of files per level
// (one vector of FileMetaData pointers for each level).
std::vector<FileMetaData*> files_[config::kNumLevels];
// Next file to compact based on seek stats.
FileMetaData* file_to_compact_;
// Level that file_to_compact_ belongs to.
int file_to_compact_level_;
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
// are initialized by Finalize().
double compaction_score_; // Compaction score of compaction_level_
int compaction_level_; // Level that should be compacted next
};
VersionSet是Version的集合。 裏面有一些db的屬性,還有Version雙向鏈表的頭結點,current指針指向當前版本。
// Excerpt of the VersionSet class showing its data members only; other
// members are elided with "...". Holds database-wide settings and counters,
// the manifest writer state, and the linked list of Version objects.
class VersionSet {
public:
...
private:
...
Env* const env_;
const std::string dbname_;
const Options* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
uint64_t last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
// Opened lazily
WritableFile* descriptor_file_;
log::Writer* descriptor_log_;
Version dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions_.prev_ (the current Version)
// Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey.
std::string compact_pointer_[config::kNumLevels];
};
VersionEdit表示每兩個Version之間的變化,也就是說OldVersion + VersionEdit = LatestVersion。
// Excerpt of the VersionEdit class; other members are elided with "...".
// Accumulates a set of changes: optional metadata fields (each guarded by a
// has_* flag), compaction pointers, files to delete and files to add.
class VersionEdit {
public:
...
// Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
// The metadata is copied into a FileMetaData and appended to new_files_
// together with its level.
void AddFile(int level, uint64_t file, uint64_t file_size,
const InternalKey& smallest, const InternalKey& largest) {
FileMetaData f;
f.number = file;
f.file_size = file_size;
f.smallest = smallest;
f.largest = largest;
new_files_.push_back(std::make_pair(level, f));
}
// Delete the specified "file" from the specified "level".
// The (level, file number) pair is inserted into deleted_files_.
void DeleteFile(int level, uint64_t file) {
deleted_files_.insert(std::make_pair(level, file));
}
private:
...
// Set of (level, file number) pairs.
typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
std::string comparator_;
uint64_t log_number_;
uint64_t prev_log_number_;
uint64_t next_file_number_;
SequenceNumber last_sequence_;
bool has_comparator_;
bool has_log_number_;
bool has_prev_log_number_;
bool has_next_file_number_;
bool has_last_sequence_;
std::vector<std::pair<int, InternalKey>> compact_pointers_;
DeletedFileSet deleted_files_; // Files to delete
std::vector<std::pair<int, FileMetaData>> new_files_; // Files to add
};
參考博客:
- http://catkang.github.io/2017/02/03/leveldb-version.html
- https://www.cnblogs.com/ym65536/p/11223407.html
- https://leveldb-handbook.readthedocs.io/zh/latest/version.html