前面几篇简单介绍了LevelDB中的数据结构和存储方式(log、MemTable、sstable)。这一章节来解读一下LevelDB最核心的内容——合并操作与版本控制。
Compaction
按照LevelDB的框架思路,Compaction有两种使用方式,一种是MemTable -> Level0,称为Minor Compaction,另一种是Level n -> Level n+1(例如Level0 -> Level1),称为Major Compaction,这里会分别进行叙述,看看其具体实现。
整个函数调用过程如下图所示。
Minor Compaction
这一部分的Compaction较为简单,其逻辑主要是:首先从VersionSet中得到当前的version(这个在后面会叙述),然后将Immutable MemTable写成一个新的sstable文件(通常位于level0层),最后通过VersionEdit记录这次变更。
//minor Compaction
void DBImpl::CompactMemTable() {
mutex_.AssertHeld();
assert(imm_ != nullptr);
// Save the contents of the memtable as a new Table
VersionEdit edit;
Version* base = versions_->current();
base->Ref();
//将Immutable MemTable写入到Level0
Status s = WriteLevel0Table(imm_, &edit, base);
base->Unref();
if (s.ok() && shutting_down_.load(std::memory_order_acquire)) {
s = Status::IOError("Deleting DB during memtable compaction");
}
// Replace immutable memtable with the generated Table
//设置VersionEdit的参数
if (s.ok()) {
edit.SetPrevLogNumber(0);
edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed
s = versions_->LogAndApply(&edit, &mutex_);
}
//最后将Immutable 指针置空
if (s.ok()) {
// Commit to the new state
imm_->Unref();
imm_ = nullptr;
has_imm_.store(false, std::memory_order_release);
DeleteObsoleteFiles();
} else {
RecordBackgroundError(s);
}
}
接下来就是具体实现,WriteLevel0Table()里面有一点需要注意:原理上,每次Minor Compaction只是将Immutable MemTable写入level0,但源码内部实际会调用PickLevelForMemTableOutput()函数,根据新文件的最小key与最大key选择一个可以放置该文件的最高level(一般情况下为level0)进行插入,这样可以减少Compaction操作的次数。
// Writes the contents of `mem` into a new table file and, when the file is
// non-empty, registers it in `edit`.
//
// mem:  memtable whose entries are written out.
// edit: receives an AddFile() record for the new table (only on success and
//       only if the file size is greater than zero).
// base: version used to pick the target level; when null the file is
//       recorded at level 0.
// Returns the status produced by BuildTable().
// REQUIRES: mutex_ is held; it is released while the table is being built
// and re-acquired afterwards.
Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
                                Version* base) {
  mutex_.AssertHeld();
  const uint64_t start_micros = env_->NowMicros();
  FileMetaData meta;
  meta.number = versions_->NewFileNumber();
  // Track the file number in pending_outputs_ for the duration of the build
  // (presumably so that it is not treated as obsolete -- confirm against
  // DeleteObsoleteFiles()).
  pending_outputs_.insert(meta.number);
  Iterator* iter = mem->NewIterator();
  Log(options_.info_log, "Level-0 table #%llu: started",
      static_cast<unsigned long long>(meta.number));

  // BuildTable() performs the actual file write; do it without the mutex.
  Status s;
  {
    mutex_.Unlock();
    s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
    mutex_.Lock();
  }

  // Both numeric arguments are unsigned long long, so both use %llu (the
  // previous %lld did not match the unsigned argument type).
  Log(options_.info_log, "Level-0 table #%llu: %llu bytes %s",
      static_cast<unsigned long long>(meta.number),
      static_cast<unsigned long long>(meta.file_size), s.ToString().c_str());
  delete iter;
  pending_outputs_.erase(meta.number);

  // Note that if file_size is zero, the file has been deleted and
  // should not be added to the manifest.
  int level = 0;
  if (s.ok() && meta.file_size > 0) {
    const Slice min_user_key = meta.smallest.user_key();
    const Slice max_user_key = meta.largest.user_key();
    if (base != nullptr) {
      // Let the base version choose the level for the new table; it stays
      // at level 0 when no base version is supplied.
      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
    }
    // Record the new file in the VersionEdit.
    edit->AddFile(level, meta.number, meta.file_size, meta.smallest,
                  meta.largest);
  }

  // Account the elapsed time and bytes written against the chosen level.
  CompactionStats stats;
  stats.micros = env_->NowMicros() - start_micros;
  stats.bytes_written = meta.file_size;
  stats_[level].Add(stats);
  return s;
}
Major Compaction
这一部分是level n -> level n+1的关键。这里先整理一下Major Compaction的思路。
Major compaction的过程如下:对多个文件采用多路归并排序的方式,依次找出其中最小的Key记录,也就是对多个文件中的所有记录重新进行排序。之后采取一定的标准判断(比如如果在低于L层中有key则说明有更新的数据,这个没有保存价值)这个Key是否还需要保存,如果判断没有保存价值,那么直接抛掉,如果觉得还需要继续保存,那么就将其写入level L+1层中新生成的一个SSTable文件中。就这样对KV数据一一处理,形成了一系列新的L+1层数据文件,之前的L层文件和L+1层参与compaction 的文件数据此时已经没有意义了,所以全部删除。这样就完成了L层和L+1层文件记录的合并过程。
// Runs the compaction described by `compact`: walks the input iterator,
// drops entries that are hidden by newer entries or are obsolete deletion
// markers, writes the remaining entries to output files, records statistics,
// and finally installs the results via InstallCompactionResults().
// Returns the resulting status; a non-ok status is also passed to
// RecordBackgroundError().
// NOTE(review): mutex_ is unlocked before the main loop and locked again
// afterwards, so the caller presumably holds it on entry -- confirm.
// The "..." line below marks code elided from this excerpt.
Status DBImpl::DoCompactionWork(CompactionState* compact) {
const uint64_t start_micros = env_->NowMicros();
int64_t imm_micros = 0; // Micros spent doing imm_ compactions
...
// Pick the smallest snapshot sequence number: the last sequence when there
// are no snapshots, otherwise the sequence number of the oldest snapshot.
if (snapshots_.empty()) {
compact->smallest_snapshot = versions_->LastSequence();
} else {
compact->smallest_snapshot = snapshots_.oldest()->sequence_number();
}
// Build an iterator over the compaction inputs.
Iterator* input = versions_->MakeInputIterator(compact->compaction);
// Release mutex while we're actually doing the compaction work
mutex_.Unlock();
input->SeekToFirst();
Status status;
ParsedInternalKey ikey;
std::string current_user_key;
bool has_current_user_key = false;
SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
while (input->Valid() && !shutting_down_.load(std::memory_order_acquire)) {
// Prioritize immutable compaction work
// (1) If an immutable memtable is pending, compact it first.
if (has_imm_.load(std::memory_order_relaxed)) {
const uint64_t imm_start = env_->NowMicros();
mutex_.Lock();
if (imm_ != nullptr) {
// imm_ is still set now that the mutex is held, so compact it.
CompactMemTable();
// Wake up MakeRoomForWrite() if necessary.
background_work_finished_signal_.SignalAll();
}
mutex_.Unlock();
imm_micros += (env_->NowMicros() - imm_start);
}
Slice key = input->key();
// If the compaction says to stop before this key and an output file is
// open, finish the current output file first. (Per the original note, this
// happens when the output would overlap too much with level+2 -- confirm
// against ShouldStopBefore().)
if (compact->compaction->ShouldStopBefore(key) &&
compact->builder != nullptr) {
status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) {
break;
}
}
// Handle key/value, add to state, etc.
bool drop = false;
// Parse the internal key.
if (!ParseInternalKey(key, &ikey)) {
// Do not hide error keys
current_user_key.clear();
has_current_user_key = false;
last_sequence_for_key = kMaxSequenceNumber;
} else {
// The key parsed successfully.
if (!has_current_user_key ||
user_comparator()->Compare(ikey.user_key, Slice(current_user_key)) !=
0) {
// First occurrence of this user key
current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
has_current_user_key = true;
last_sequence_for_key = kMaxSequenceNumber;
}
if (last_sequence_for_key <= compact->smallest_snapshot) {
// Hidden by an newer entry for same user key
drop = true; // (A)
} else if (ikey.type == kTypeDeletion &&
ikey.sequence <= compact->smallest_snapshot &&
compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger sequence numbers
// (3) data in layers that are being compacted here and have
// smaller sequence numbers will be dropped in the next
// few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
drop = true;
}
last_sequence_for_key = ikey.sequence;
}
if (!drop) {
// Open output file if necessary
// No builder is open (first output, or the previous output file was just
// finished), so open a new output file.
if (compact->builder == nullptr) {
status = OpenCompactionOutputFile(compact);
if (!status.ok()) {
break;
}
}
if (compact->builder->NumEntries() == 0) {
compact->current_output()->smallest.DecodeFrom(key);
}
// Every key that is written becomes the output's largest key so far.
compact->current_output()->largest.DecodeFrom(key);
compact->builder->Add(key, input->value());
// Close output file if it is big enough
// Finish the output file once it reaches the maximum output file size.
if (compact->builder->FileSize() >=
compact->compaction->MaxOutputFileSize()) {
status = FinishCompactionOutputFile(compact, input);
if (!status.ok()) {
break;
}
}
}
// Advance to the next input entry.
input->Next();
}
// Report shutdown as an error, then finish any output file that is still
// open.
if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
status = Status::IOError("Deleting DB during compaction");
}
if (status.ok() && compact->builder != nullptr) {
status = FinishCompactionOutputFile(compact, input);
}
if (status.ok()) {
status = input->status();
}
delete input;
input = nullptr;
// Collect statistics for this compaction; time spent compacting imm_ is
// excluded from the elapsed time.
CompactionStats stats;
stats.micros = env_->NowMicros() - start_micros - imm_micros;
for (int which = 0; which < 2; which++) {
for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
// Total size of the files read by this compaction.
stats.bytes_read += compact->compaction->input(which, i)->file_size;
}
}
for (size_t i = 0; i < compact->outputs.size(); i++) {
// Total size of the files written by this compaction.
stats.bytes_written += compact->outputs[i].file_size;
}
mutex_.Lock();
stats_[compact->compaction->level() + 1].Add(stats);
if (status.ok()) {
status = InstallCompactionResults(compact);
}
if (!status.ok()) {
RecordBackgroundError(status);
}
VersionSet::LevelSummaryStorage tmp;
Log(options_.info_log, "compacted to: %s", versions_->LevelSummary(&tmp));
return status;
}
Version版本控制
LevelDB的版本管理策略十分优秀,这也是其精髓。每当sstable发生改变、增加、删除时,LevelDB就会生成一个新的Version,而管理这些不同的Version就需要一个版本控制策略。首先来介绍几个相关文件:
- MANIFEST文件。记录了当前版本与上次的版本不同之处,内容包括:增加了的sstable,删除的sstable,当前版本日志。
- CURRENT文件。文件中只有一行记录就是当前系统版本(MANIFEST文件)
下面记录四个类:FileMetaData,Version,VersionSet,VersionEdit。这几个类实现了Version的记录、切换功能。结构体之间的关系如下图所示,这里只分析里面存储的成员变量,成员函数的实现基本上就是对其进行操作。
FileMetaData是用来表示sstable的元数据的。
// Metadata describing a single sstable file.
struct FileMetaData {
  // `number` is zero-initialized together with the other scalar members so
  // that a default-constructed object never holds an indeterminate value.
  FileMetaData() : refs(0), allowed_seeks(1 << 30), number(0), file_size(0) {}

  int refs;              // Reference count (presumably the number of
                         // Versions that refer to this sstable -- confirm)
  int allowed_seeks;     // Seeks allowed until compaction
  uint64_t number;       // File number of the sstable
  uint64_t file_size;    // File size in bytes
  InternalKey smallest;  // Smallest internal key served by table
  InternalKey largest;   // Largest internal key served by table
};
Version表示版本信息结构体。files_是一个二维数组,存储全局信息,即每个level有哪些sstable的信息。
// A Version holds, for every level, the list of sstable metadata entries,
// plus the bookkeeping that identifies what should be compacted next.
// Versions are linked together in a doubly-linked list (next_/prev_).
// The "..." lines mark members elided from this excerpt.
class Version {
public:
...
private:
...
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version
// List of files per level
// (one vector of FileMetaData pointers for each level)
std::vector<FileMetaData*> files_[config::kNumLevels];
// Next file to compact based on seek stats.
FileMetaData* file_to_compact_;
// Level of the file referenced by file_to_compact_.
int file_to_compact_level_;
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
// are initialized by Finalize().
double compaction_score_; // Compaction score (see the note above)
int compaction_level_; // Level that should be compacted next
};
VersionSet是Version的集合。 里面有一些db的属性,还有Version双向链表的头结点,current指针指向当前版本。
// VersionSet is the collection of Versions. It stores database-wide
// attributes (file numbers, sequence number, log numbers), the head of the
// doubly-linked list of Versions, and a pointer to the current Version.
// The "..." lines mark members elided from this excerpt.
class VersionSet {
public:
...
private:
...
Env* const env_;
const std::string dbname_;
const Options* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
uint64_t last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
// Opened lazily
WritableFile* descriptor_file_;
log::Writer* descriptor_log_;
Version dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions_.prev_ (points at the current Version)
// Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey.
std::string compact_pointer_[config::kNumLevels];
};
VersionEdit表示每两个Version之间的变化,也就是说OldVersion + VersionEdit = LatestVersion。
// VersionEdit describes the difference between two Versions: the files that
// were added, the files that were deleted, and optional updates to values
// such as the log number, next file number and last sequence.
// The "..." lines mark members elided from this excerpt.
class VersionEdit {
public:
...
// Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
void AddFile(int level, uint64_t file, uint64_t file_size,
const InternalKey& smallest, const InternalKey& largest) {
// Build the metadata record and queue it, paired with its level.
FileMetaData f;
f.number = file;
f.file_size = file_size;
f.smallest = smallest;
f.largest = largest;
new_files_.push_back(std::make_pair(level, f));
}
// Delete the specified "file" from the specified "level".
void DeleteFile(int level, uint64_t file) {
deleted_files_.insert(std::make_pair(level, file));
}
private:
...
// Set of (level, file number) pairs identifying deleted files.
typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
std::string comparator_;
uint64_t log_number_;
uint64_t prev_log_number_;
uint64_t next_file_number_;
SequenceNumber last_sequence_;
bool has_comparator_;
bool has_log_number_;
bool has_prev_log_number_;
bool has_next_file_number_;
bool has_last_sequence_;
std::vector<std::pair<int, InternalKey>> compact_pointers_;
DeletedFileSet deleted_files_; // Files deleted by this edit
std::vector<std::pair<int, FileMetaData>> new_files_; // Files added by this edit
};
参考博客:
- http://catkang.github.io/2017/02/03/leveldb-version.html
- https://www.cnblogs.com/ym65536/p/11223407.html
- https://leveldb-handbook.readthedocs.io/zh/latest/version.html