leveldb源码学习之 DBImpl 类(恢复日志到memtable:Recover)

推荐结合 leveldb-handbook 阅读源码

数据库每次启动时,都会有一个recover的过程,简要地来说,就是利用Manifest信息重新构建一个最新的version。

过程如下:

  1. 利用Current文件读取最近使用的manifest文件;
  2. 创建一个空的version,并利用manifest文件中的session record依次作apply操作,还原出一个最新的version,注意manifest的第一条session record是一个version的快照,后续的session record记录的都是增量的变化;
  3. 将非current文件指向的其他过期的manifest文件删除;
  4. 将新建的version作为当前数据库的version;

注意,随着leveldb运行时间的增长,一个manifest中包含的session record会越来越多,故leveldb在每次启动时都会重新创建一个manifest文件,并将第一条session record中记录当前version的快照状态。其他过期的manifest文件会在下次启动的recover流程中进行删除。leveldb通过这种方式,来控制manifest文件的大小,但是数据库本身没有重启,manifest还是会一直增长。

Current

由于每次启动,都会新建一个Manifest文件,因此leveldb当中可能会存在多个manifest文件。因此需要一个额外的current文件来指示当前系统使用的到底是哪个manifest文件。

该文件中只有一个内容,即当前使用的manifest文件的文件名。

 

Recover 恢复

Status DBImpl::Recover(VersionEdit* edit, bool* save_manifest) {
  mutex_.AssertHeld();

  // 创建目录,因为可能之前已经创建过,所以不需要.
  env_->CreateDir(dbname_);
  assert(db_lock_ == nullptr);
  // 创建一个锁文件,就是db路径下的LOCK
  Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
  if (!s.ok()) {
    return s;
  }

  // 检测CURRENT 文件是否存在
  if (!env_->FileExists(CurrentFileName(dbname_))) {
    if (options_.create_if_missing) { // 文件不存在则创建db
      s = NewDB();
      if (!s.ok()) {
        return s;
      }
    } else {
      return Status::InvalidArgument(
          dbname_, "does not exist (create_if_missing is false)");
    }
  } else {
    if (options_.error_if_exists) {
      return Status::InvalidArgument(dbname_,
                                     "exists (error_if_exists is true)");
    }
  }

  // 根据manifest文件恢复 version
  s = versions_->Recover(save_manifest); 
  if (!s.ok()) {
    return s;
  }
  SequenceNumber max_sequence(0);

  // Recover from all newer log files than the ones named in the
  // descriptor (new log files may have been added by the previous
  // incarnation without registering them in the descriptor).
  //
  // Note that PrevLogNumber() is no longer used, but we pay
  // attention to it in case we are recovering a database
  // produced by an older version of leveldb.
  const uint64_t min_log = versions_->LogNumber();
  const uint64_t prev_log = versions_->PrevLogNumber();
  std::vector<std::string> filenames;
  s = env_->GetChildren(dbname_, &filenames);
  if (!s.ok()) {
    return s;
  }
  std::set<uint64_t> expected;
  versions_->AddLiveFiles(&expected);
  uint64_t number;
  FileType type;
  std::vector<uint64_t> logs;
  // 恢复所有时间上较新的log文件
  for (size_t i = 0; i < filenames.size(); i++) {
    if (ParseFileName(filenames[i], &number, &type)) {
      expected.erase(number);
      if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
        logs.push_back(number);
    }
  }
  // manifest文件里有的,在db目录下没有,丢失了
  if (!expected.empty()) {
    char buf[50];
    snprintf(buf, sizeof(buf), "%d missing files; e.g.",
             static_cast<int>(expected.size()));
    return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
  }

  // 按log文件产生的顺序恢复db
  std::sort(logs.begin(), logs.end());
  for (size_t i = 0; i < logs.size(); i++) {
    s = RecoverLogFile(logs[i], (i == logs.size() - 1), save_manifest, edit,
                       &max_sequence);
    if (!s.ok()) {
      return s;
    }

    // The previous incarnation may not have written any MANIFEST
    // records after allocating this log number.  So we manually
    // update the file number allocation counter in VersionSet.
    versions_->MarkFileNumberUsed(logs[i]);
  }

  if (versions_->LastSequence() < max_sequence) {
    versions_->SetLastSequence(max_sequence);
  }

  return Status::OK();
}

按照log时间顺序恢复,这里用到log的Reader类,可参考:leveldb源码学习之日志 log 读取

Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
                              bool* save_manifest, VersionEdit* edit,
                              SequenceNumber* max_sequence) {
  struct LogReporter : public log::Reader::Reporter {
    Env* env;
    Logger* info_log;
    const char* fname;
    Status* status;  // null if options_.paranoid_checks==false
    void Corruption(size_t bytes, const Status& s) override {
      Log(info_log, "%s%s: dropping %d bytes; %s",
          (this->status == nullptr ? "(ignoring error) " : ""), fname,
          static_cast<int>(bytes), s.ToString().c_str());
      if (this->status != nullptr && this->status->ok()) *this->status = s;
    }
  };
 
  mutex_.AssertHeld();
 
  // Open the log file
  std::string fname = LogFileName(dbname_, log_number);
  SequentialFile* file;
  Status status = env_->NewSequentialFile(fname, &file);
  if (!status.ok()) {
    MaybeIgnoreError(&status);
    return status;
  }
 
  // 创建日志的 reader.
  LogReporter reporter;
  reporter.env = env_;
  reporter.info_log = options_.info_log;
  reporter.fname = fname.c_str();
  reporter.status = (options_.paranoid_checks ? &status : nullptr);
  // 这里让 log::Reader 做校验和检验即使 paranoid_checks==false 这样终端会导致整个提交被忽略,防止错误信息传播.
  log::Reader reader(file, &reporter, true /*校验和*/, 0 /*initial_offset*/);
  Log(options_.info_log, "Recovering log #%llu",
      (unsigned long long)log_number);
 
  // Read all the records and add to a memtable
  std::string scratch;
  Slice record;
  WriteBatch batch;
  int compactions = 0;
  MemTable* mem = nullptr;
  // 读取一条日志记录
  while (reader.ReadRecord(&record, &scratch) && status.ok()) {
    if (record.size() < 12) {
      reporter.Corruption(record.size(),
                          Status::Corruption("log record too small"));
      continue;
    }
    WriteBatchInternal::SetContents(&batch, record);
 
    if (mem == nullptr) {
      mem = new MemTable(internal_comparator_);
      mem->Ref();
    }
	// 往memtable 插入key
    status = WriteBatchInternal::InsertInto(&batch, mem);
    MaybeIgnoreError(&status);
    if (!status.ok()) {
      break;
    }
	// 更新序列号为 批量恢复后的序号
    const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
                                    WriteBatchInternal::Count(&batch) - 1;
    if (last_seq > *max_sequence) {
      *max_sequence = last_seq;
    }
 
	// 内存使用大于阈值
    if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
      compactions++;
      *save_manifest = true;
      status = WriteLevel0Table(mem, edit, nullptr); // 做一次minor 压缩,将当前memtable写入第0层sstable
      mem->Unref();
      mem = nullptr;
      if (!status.ok()) {
        // Reflect errors immediately so that conditions like full
        // file-systems cause the DB::Open() to fail.
        break;
      }
    }
  }
 
  delete file;
 
  // 是否复用上一个log文件 .
  if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
    assert(logfile_ == nullptr);
    assert(log_ == nullptr);
    assert(mem_ == nullptr);
    uint64_t lfile_size;
	 // 当上一个日志文件fname没有遇到memtable压缩时,接着复用整个log文件,新的修改追加到后面
    if (env_->GetFileSize(fname, &lfile_size).ok() &&
        env_->NewAppendableFile(fname, &logfile_).ok()) {
      Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
      log_ = new log::Writer(logfile_, lfile_size);
      logfile_number_ = log_number;
      if (mem != nullptr) {
        mem_ = mem;
        mem = nullptr;
      } else {
        // 当日志存在但是为空时,mem 可能为空.
        mem_ = new MemTable(internal_comparator_);
        mem_->Ref();
      }
    }
  }
 
  // 恢复日志后的memtable没有做压缩,这里压缩
  if (mem != nullptr) {
    if (status.ok()) {
      *save_manifest = true;
      status = WriteLevel0Table(mem, edit, nullptr);
    }
    mem->Unref();
  }
 
  return status;
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章