推荐结合 leveldb-handbook 阅读源码
数据库每次启动时,都会有一个recover的过程,简要地来说,就是利用Manifest信息重新构建一个最新的version。
过程如下:
- 利用Current文件读取最近使用的manifest文件;
- 创建一个空的version,并利用manifest文件中的session record依次作apply操作,还原出一个最新的version,注意manifest的第一条session record是一个version的快照,后续的session record记录的都是增量的变化;
- 将非current文件指向的其他过期的manifest文件删除;
- 将新建的version作为当前数据库的version;
注意,随着leveldb运行时间的增长,一个manifest中包含的session record会越来越多,故leveldb在每次启动时都会重新创建一个manifest文件,并将第一条session record中记录当前version的快照状态。其他过期的manifest文件会在下次启动的recover流程中进行删除。leveldb通过这种方式,来控制manifest文件的大小,但是数据库本身没有重启,manifest还是会一直增长。
Current
由于每次启动,都会新建一个Manifest文件,因此leveldb当中可能会存在多个manifest文件。因此需要一个额外的current文件来指示当前系统使用的到底是哪个manifest文件。
该文件中只有一个内容,即当前使用的manifest文件的文件名。
Recover 恢复
Status DBImpl::Recover(VersionEdit* edit, bool* save_manifest) {
mutex_.AssertHeld();
// 创建目录,因为可能之前已经创建过,所以不需要.
env_->CreateDir(dbname_);
assert(db_lock_ == nullptr);
// 创建一个锁文件,就是db路径下的LOCK
Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
if (!s.ok()) {
return s;
}
// 检测CURRENT 文件是否存在
if (!env_->FileExists(CurrentFileName(dbname_))) {
if (options_.create_if_missing) { // 文件不存在则创建db
s = NewDB();
if (!s.ok()) {
return s;
}
} else {
return Status::InvalidArgument(
dbname_, "does not exist (create_if_missing is false)");
}
} else {
if (options_.error_if_exists) {
return Status::InvalidArgument(dbname_,
"exists (error_if_exists is true)");
}
}
// 根据manifest文件恢复 version
s = versions_->Recover(save_manifest);
if (!s.ok()) {
return s;
}
SequenceNumber max_sequence(0);
// Recover from all newer log files than the ones named in the
// descriptor (new log files may have been added by the previous
// incarnation without registering them in the descriptor).
//
// Note that PrevLogNumber() is no longer used, but we pay
// attention to it in case we are recovering a database
// produced by an older version of leveldb.
const uint64_t min_log = versions_->LogNumber();
const uint64_t prev_log = versions_->PrevLogNumber();
std::vector<std::string> filenames;
s = env_->GetChildren(dbname_, &filenames);
if (!s.ok()) {
return s;
}
std::set<uint64_t> expected;
versions_->AddLiveFiles(&expected);
uint64_t number;
FileType type;
std::vector<uint64_t> logs;
// 恢复所有时间上较新的log文件
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
expected.erase(number);
if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
logs.push_back(number);
}
}
// manifest文件里有的,在db目录下没有,丢失了
if (!expected.empty()) {
char buf[50];
snprintf(buf, sizeof(buf), "%d missing files; e.g.",
static_cast<int>(expected.size()));
return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
}
// 按log文件产生的顺序恢复db
std::sort(logs.begin(), logs.end());
for (size_t i = 0; i < logs.size(); i++) {
s = RecoverLogFile(logs[i], (i == logs.size() - 1), save_manifest, edit,
&max_sequence);
if (!s.ok()) {
return s;
}
// The previous incarnation may not have written any MANIFEST
// records after allocating this log number. So we manually
// update the file number allocation counter in VersionSet.
versions_->MarkFileNumberUsed(logs[i]);
}
if (versions_->LastSequence() < max_sequence) {
versions_->SetLastSequence(max_sequence);
}
return Status::OK();
}
按照log时间顺序恢复,这里用到log的Reader类,可参考:leveldb源码学习之日志 log 读取
Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
bool* save_manifest, VersionEdit* edit,
SequenceNumber* max_sequence) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
const char* fname;
Status* status; // null if options_.paranoid_checks==false
void Corruption(size_t bytes, const Status& s) override {
Log(info_log, "%s%s: dropping %d bytes; %s",
(this->status == nullptr ? "(ignoring error) " : ""), fname,
static_cast<int>(bytes), s.ToString().c_str());
if (this->status != nullptr && this->status->ok()) *this->status = s;
}
};
mutex_.AssertHeld();
// Open the log file
std::string fname = LogFileName(dbname_, log_number);
SequentialFile* file;
Status status = env_->NewSequentialFile(fname, &file);
if (!status.ok()) {
MaybeIgnoreError(&status);
return status;
}
// 创建日志的 reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.fname = fname.c_str();
reporter.status = (options_.paranoid_checks ? &status : nullptr);
// 这里让 log::Reader 做校验和检验即使 paranoid_checks==false 这样终端会导致整个提交被忽略,防止错误信息传播.
log::Reader reader(file, &reporter, true /*校验和*/, 0 /*initial_offset*/);
Log(options_.info_log, "Recovering log #%llu",
(unsigned long long)log_number);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
int compactions = 0;
MemTable* mem = nullptr;
// 读取一条日志记录
while (reader.ReadRecord(&record, &scratch) && status.ok()) {
if (record.size() < 12) {
reporter.Corruption(record.size(),
Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
if (mem == nullptr) {
mem = new MemTable(internal_comparator_);
mem->Ref();
}
// 往memtable 插入key
status = WriteBatchInternal::InsertInto(&batch, mem);
MaybeIgnoreError(&status);
if (!status.ok()) {
break;
}
// 更新序列号为 批量恢复后的序号
const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
WriteBatchInternal::Count(&batch) - 1;
if (last_seq > *max_sequence) {
*max_sequence = last_seq;
}
// 内存使用大于阈值
if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
compactions++;
*save_manifest = true;
status = WriteLevel0Table(mem, edit, nullptr); // 做一次minor 压缩,将当前memtable写入第0层sstable
mem->Unref();
mem = nullptr;
if (!status.ok()) {
// Reflect errors immediately so that conditions like full
// file-systems cause the DB::Open() to fail.
break;
}
}
}
delete file;
// 是否复用上一个log文件 .
if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
assert(logfile_ == nullptr);
assert(log_ == nullptr);
assert(mem_ == nullptr);
uint64_t lfile_size;
// 当上一个日志文件fname没有遇到memtable压缩时,接着复用整个log文件,新的修改追加到后面
if (env_->GetFileSize(fname, &lfile_size).ok() &&
env_->NewAppendableFile(fname, &logfile_).ok()) {
Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
log_ = new log::Writer(logfile_, lfile_size);
logfile_number_ = log_number;
if (mem != nullptr) {
mem_ = mem;
mem = nullptr;
} else {
// 当日志存在但是为空时,mem 可能为空.
mem_ = new MemTable(internal_comparator_);
mem_->Ref();
}
}
}
// 恢复日志后的memtable没有做压缩,这里压缩
if (mem != nullptr) {
if (status.ok()) {
*save_manifest = true;
status = WriteLevel0Table(mem, edit, nullptr);
}
mem->Unref();
}
return status;
}