leveldb數據的插入與獲取
leveldb提供的數據的交互接口如下;
// Set the database entry for "key" to "value". Returns OK on success,
// and a non-OK status on error.
// Note: consider setting options.sync = true.
//
// Declared pure virtual, but DB also supplies a default body (DB::Put)
// that wraps the pair in a WriteBatch and forwards to Write().
virtual Status Put(const WriteOptions& options, const Slice& key,
const Slice& value) = 0;
// Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database.
// Note: consider setting options.sync = true.
//
// Declared pure virtual, but DB also supplies a default body (DB::Delete)
// that records the deletion in a WriteBatch and forwards to Write().
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
// Apply the specified updates to the database.
// Returns OK on success, non-OK on failure.
// Note: consider setting options.sync = true.
//
// DBImpl::Write checks "updates" against nullptr and, per its own comment,
// treats a null batch as a request made for compactions.
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
// If the database contains an entry for "key" store the
// corresponding value in *value and return OK.
//
// If there is no entry for "key" leave *value unchanged and return
// a status for which Status::IsNotFound() returns true.
//
// May return some other Status on an error.
virtual Status Get(const ReadOptions& options, const Slice& key,
std::string* value) = 0;
主要提供了Write,Delete,Get和Put等操作接口,本文就來分析一下這些操作的具體流程。
Put和Delete操作
概述:在上一篇分析打開流程時提到,DB::Open函數在完成打開操作之後會初始化一個db對象,其本質是實現了DB接口的子類DBImpl的實例,所以在調用db->Get或者db->Put的時候,其實調用的是DBImpl的對應方法,而DBImpl的實現形式類似於如下;
// Convenience method: DBImpl adds no logic of its own here and forwards
// straight to the default implementation provided by the DB base class.
Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
return DB::Put(o, key, val);
}
這裏通過DB::限定名直接調用了父類DB提供的默認實現(Put在DB中聲明爲純虛函數,但同時提供了默認實現,並不是靜態方法)。DB::Put和DB::Delete的默認實現如下;
// Default implementations of convenience methods that subclasses of DB
// can call if they wish
//
// Put: builds a one-entry WriteBatch holding (key, value) and hands it to
// Write(), returning whatever status Write() produces.
Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
WriteBatch batch;
batch.Put(key, value);
return Write(opt, &batch);
}
// Default implementation of Delete: builds a one-entry WriteBatch holding a
// deletion record for "key" and hands it to Write(), returning whatever
// status Write() produces.
Status DB::Delete(const WriteOptions& opt, const Slice& key) {
WriteBatch batch;
batch.Delete(key);
return Write(opt, &batch);
}
從代碼執行過程可以看出,都是先通過一個WriteBatch來先保存操作的流程,然後調用Write方法,將對應的batch去執行。
WriteBatch類
// WriteBatch accumulates a sequence of Put/Delete operations in a single
// serialized buffer (rep_). The buffered operations can be replayed in
// order through Iterate().
class LEVELDB_EXPORT WriteBatch {
public:
// Callback interface used by Iterate(): one call per buffered operation.
class LEVELDB_EXPORT Handler {
public:
virtual ~Handler();
virtual void Put(const Slice& key, const Slice& value) = 0;
virtual void Delete(const Slice& key) = 0;
};
WriteBatch();
// Intentionally copyable.
WriteBatch(const WriteBatch&) = default;
WriteBatch& operator=(const WriteBatch&) = default;
~WriteBatch();
// Store the mapping "key->value" in the database.
void Put(const Slice& key, const Slice& value); // buffer an insertion
// If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(const Slice& key); // buffer a deletion
// Clear all updates buffered in this batch.
void Clear();
// The size of the database changes caused by this batch.
//
// This number is tied to implementation details, and may change across
// releases. It is intended for LevelDB usage metrics.
size_t ApproximateSize() const;
// Copies the operations in "source" to this batch.
//
// This runs in O(source size) time. However, the constant factor is better
// than calling Iterate() over the source batch with a Handler that replicates
// the operations into this batch.
void Append(const WriteBatch& source); // append another batch's operations
// Support for iterating over the contents of a batch.
Status Iterate(Handler* handler) const;
private:
friend class WriteBatchInternal;
std::string rep_; // See comment in write_batch.cc for the format of rep_
};
} // namespace leveldb
該類的具體實現如下;
// Size in bytes of the fixed header at the front of rep_: an 8-byte
// sequence number at offset 0 (read/written as a fixed 64-bit value by
// Sequence/SetSequence) followed by a 4-byte count at offset 8
// (read/written as a fixed 32-bit value by Count/SetCount).
static const size_t kHeader = 12;
// A new batch starts out as just the header, via Clear().
WriteBatch::WriteBatch() { Clear(); }
WriteBatch::~WriteBatch() = default;
WriteBatch::Handler::~Handler() = default;
// Drops every buffered operation and leaves rep_ holding only a header of
// kHeader zero bytes (so both the sequence number and the count read as 0).
void WriteBatch::Clear() {
  rep_.assign(kHeader, '\0');
}
// Returns the current length of the serialized buffer rep_ in bytes
// (this includes the fixed-size header).
size_t WriteBatch::ApproximateSize() const { return rep_.size(); }
// Replays every operation stored in rep_ against `handler`, in the order
// the operations were buffered.
//
// Returns Status::Corruption when the buffer is shorter than the header,
// when a record cannot be decoded, when a record carries an unknown tag, or
// when the number of decoded records differs from the count stored in the
// header. Returns Status::OK() otherwise.
Status WriteBatch::Iterate(Handler* handler) const {
  Slice input(rep_);
  if (input.size() < kHeader) {
    return Status::Corruption("malformed WriteBatch (too small)");
  }
  // Skip the sequence-number/count header; what remains is the records.
  input.remove_prefix(kHeader);

  Slice key, value;
  int records_seen = 0;
  while (!input.empty()) {
    ++records_seen;

    // Each record begins with a one-byte tag naming the operation.
    const char tag = input[0];
    input.remove_prefix(1);

    switch (tag) {
      case kTypeValue:
        // Insertion: a length-prefixed key followed by a length-prefixed value.
        if (!GetLengthPrefixedSlice(&input, &key) ||
            !GetLengthPrefixedSlice(&input, &value)) {
          return Status::Corruption("bad WriteBatch Put");
        }
        handler->Put(key, value);
        break;
      case kTypeDeletion:
        // Deletion: only a length-prefixed key.
        if (!GetLengthPrefixedSlice(&input, &key)) {
          return Status::Corruption("bad WriteBatch Delete");
        }
        handler->Delete(key);
        break;
      default:
        return Status::Corruption("unknown WriteBatch tag");
    }
  }

  // The header's count must agree with what was actually decoded.
  if (records_seen != WriteBatchInternal::Count(this)) {
    return Status::Corruption("WriteBatch has wrong count");
  }
  return Status::OK();
}
// Returns the number of operations recorded in the batch header: the fixed
// 32-bit value stored at byte offset 8 of rep_.
int WriteBatchInternal::Count(const WriteBatch* b) {
return DecodeFixed32(b->rep_.data() + 8); // count field lives at offset 8
}
// Overwrites the operation count in the batch header: writes n as a fixed
// 32-bit value at byte offset 8 of rep_.
void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
EncodeFixed32(&b->rep_[8], n); // count field lives at offset 8
}
// Returns the sequence number recorded in the batch header: the fixed
// 64-bit value stored at the very start of rep_.
SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
return SequenceNumber(DecodeFixed64(b->rep_.data())); // sequence field lives at offset 0
}
// Overwrites the sequence number in the batch header: writes seq as a fixed
// 64-bit value at the very start of rep_.
void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
EncodeFixed64(&b->rep_[0], seq); // sequence field lives at offset 0
}
// Buffers an insertion of (key, value) at the end of the batch.
void WriteBatch::Put(const Slice& key, const Slice& value) {
  // Bump the operation count kept in the header first.
  const int updated_count = WriteBatchInternal::Count(this) + 1;
  WriteBatchInternal::SetCount(this, updated_count);

  // Record layout: one tag byte, then length-prefixed key and value.
  rep_.push_back(static_cast<char>(kTypeValue));
  PutLengthPrefixedSlice(&rep_, key);
  PutLengthPrefixedSlice(&rep_, value);
}
// Buffers a deletion of `key` at the end of the batch.
void WriteBatch::Delete(const Slice& key) {
  // Bump the operation count kept in the header first.
  const int updated_count = WriteBatchInternal::Count(this) + 1;
  WriteBatchInternal::SetCount(this, updated_count);

  // Record layout: one tag byte, then the length-prefixed key (no value).
  rep_.push_back(static_cast<char>(kTypeDeletion));
  PutLengthPrefixedSlice(&rep_, key);
}
// Appends every operation buffered in `source` to this batch by delegating
// to WriteBatchInternal::Append.
void WriteBatch::Append(const WriteBatch& source) {
WriteBatchInternal::Append(this, &source); // forward to the internal helper
}
namespace {
class MemTableInserter : public WriteBatch::Handler { // MemTable插入類
public:
SequenceNumber sequence_;
MemTable* mem_;
void Put(const Slice& key, const Slice& value) override { // 添加內容
mem_->Add(sequence_, kTypeValue, key, value); // 添加序列號 插入類型 key value
sequence_++;
}
void Delete(const Slice& key) override {
mem_->Add(sequence_, kTypeDeletion, key, Slice()); // 添加內容 序列號 刪除類型 key 空的value
sequence_++;
}
};
} // namespace
// Applies every operation in batch `b` to `memtable`. The first operation
// uses the sequence number stored in the batch header and each subsequent
// operation uses the next one. Returns the status produced by Iterate().
Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* memtable) {
  MemTableInserter inserter;
  inserter.mem_ = memtable;
  inserter.sequence_ = WriteBatchInternal::Sequence(b);
  return b->Iterate(&inserter);
}
// Replaces the whole serialized representation of `b` with a copy of
// `contents`. `contents` must be at least as long as the header.
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
  const size_t length = contents.size();
  assert(length >= kHeader);
  b->rep_.assign(contents.data(), length);
}
// Appends the operations of `src` to `dst`: the header count of `dst` grows
// by the count of `src`, and the record bytes of `src` (everything after its
// header) are copied onto the end of `dst`.
void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
  SetCount(dst, Count(dst) + Count(src));
  assert(src->rep_.size() >= kHeader);

  // Skip src's own header; only the records are carried over.
  const std::string& from = src->rep_;
  dst->rep_.append(from.data() + kHeader, from.size() - kHeader);
}
} // namespace leveldb
該類主要負責把Put、Delete等操作依次序列化到rep_中,並通過Iterate配合Handler(例如MemTableInserter)把這些操作回放到memtable裏,同時對外暴露了添加、刪除、追加等操作接口。
Write函數
// Applies `updates` to the database.
//
// The call is wrapped in a Writer and queued on writers_. Only the writer at
// the front of the queue does real work: it may merge the batches of the
// writers queued behind it into one group (BuildBatchGroup), appends that
// group to the log, optionally syncs the log file, and applies it to mem_.
// The other writers covered by the group are then marked done with the same
// status and woken up.
//
// `updates` may be nullptr; in that case no batch is logged or applied, and
// MakeRoomForWrite is called with force == true.
//
// Returns the status of the write (or the status assigned by the writer that
// handled this batch as part of its group).
Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
  Writer w(&mutex_);
  w.batch = updates;
  w.sync = options.sync;
  w.done = false;

  MutexLock l(&mutex_);
  writers_.push_back(&w);
  // Wait until either another writer finished this batch on our behalf or we
  // have reached the front of the queue.
  while (!w.done && &w != writers_.front()) {
    w.cv.Wait();
  }
  if (w.done) {
    return w.status;
  }

  // May temporarily unlock and wait.
  Status status = MakeRoomForWrite(updates == nullptr);
  uint64_t last_sequence = versions_->LastSequence();
  Writer* last_writer = &w;
  if (status.ok() && updates != nullptr) {  // nullptr batch is for compactions
    // Named write_batch (not `updates`) so the group batch does not shadow
    // the function parameter.
    WriteBatch* write_batch = BuildBatchGroup(&last_writer);
    WriteBatchInternal::SetSequence(write_batch, last_sequence + 1);
    last_sequence += WriteBatchInternal::Count(write_batch);

    // Add to log and apply to memtable.  We can release the lock
    // during this phase since &w is currently responsible for logging
    // and protects against concurrent loggers and concurrent writes
    // into mem_.
    {
      mutex_.Unlock();  // release the lock while doing log I/O
      status = log_->AddRecord(WriteBatchInternal::Contents(write_batch));
      bool sync_error = false;
      if (status.ok() && options.sync) {
        status = logfile_->Sync();
        if (!status.ok()) {
          sync_error = true;
        }
      }
      if (status.ok()) {
        status = WriteBatchInternal::InsertInto(write_batch, mem_);
      }
      mutex_.Lock();  // re-acquire the lock before touching shared state
      if (sync_error) {
        // The state of the log file is indeterminate: the log record we
        // just added may or may not show up when the DB is re-opened.
        // So we force the DB into a mode where all future writes fail.
        RecordBackgroundError(status);
      }
    }
    // The group was assembled in the shared scratch batch: reset it for the
    // next group.
    if (write_batch == tmp_batch_) tmp_batch_->Clear();

    versions_->SetLastSequence(last_sequence);
  }

  // Pop every writer covered by this group. Writers other than ourselves
  // receive the shared status and are woken up.
  while (true) {
    Writer* ready = writers_.front();
    writers_.pop_front();
    if (ready != &w) {
      ready->status = status;
      ready->done = true;
      ready->cv.Signal();
    }
    if (ready == last_writer) break;
  }

  // Notify new head of write queue
  if (!writers_.empty()) {
    writers_.front()->cv.Signal();
  }

  return status;
}
Write函數主要就是將數據封裝成一個writer,然後將該writer壓入一個隊列中,如果隊列中還有未完成的操作則進入等待,因爲可以將多個操作合併在一起執行;如果隊列中沒有其他數據或者阻塞的線程被喚醒,則先檢查該writer是否已經被執行完成,因爲壓入隊列的數據有可能已經被其他writer批量執行完成;如果還沒有完成,則首先去檢查當前是否還有空間可用(MakeRoomForWrite),如果有空間並且updates不爲空,則用BuildBatchGroup把隊列中的多個操作數據合併在一起,然後先通過AddRecord將要操作的數據追加到日誌中,再通過InsertInto添加到MemTable中,基本執行邏輯如上所述。
MakeRoomForWrite檢查空間
// Loops until the current memtable can accept a write, an error occurs, or
// a fresh memtable (and log file) has been switched in.
//
// force: when true, the "room in current memtable" shortcut is skipped and
// the delay branch is disabled; it is reset to false once a new memtable has
// been installed.
//
// Returns a non-OK status if a background error was recorded (bg_error_) or
// the new log file could not be created; OK otherwise.
//
// REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue
Status DBImpl::MakeRoomForWrite(bool force) {
mutex_.AssertHeld(); // caller must already hold mutex_
assert(!writers_.empty());
bool allow_delay = !force;
Status s;
while (true) {
if (!bg_error_.ok()) { // a background error was recorded: stop and return it
// Yield previous error
s = bg_error_;
break;
} else if (allow_delay && versions_->NumLevelFiles(0) >=
config::kL0_SlowdownWritesTrigger) { // delay still allowed and the level-0 file count reached the slowdown trigger
// We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several
// seconds when we hit the hard limit, start delaying each
// individual write by 1ms to reduce latency variance. Also,
// this delay hands over some CPU to the compaction thread in
// case it is sharing the same core as the writer.
mutex_.Unlock();
env_->SleepForMicroseconds(1000); // sleep 1000 microseconds (1 ms), with mutex_ released
allow_delay = false; // Do not delay a single write more than once
mutex_.Lock();
} else if (!force &&
(mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { // not forced and the memtable's approximate usage is within write_buffer_size
// There is room in current memtable
break;
} else if (imm_ != nullptr) { // imm_ is still set
// We have filled up the current memtable, but the previous
// one is still being compacted, so we wait.
Log(options_.info_log, "Current memtable full; waiting...\n");
background_work_finished_signal_.Wait(); // block on the signal, then re-evaluate the loop
} else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
// There are too many level-0 files.
Log(options_.info_log, "Too many L0 files; waiting...\n"); // level-0 file count reached the stop trigger
background_work_finished_signal_.Wait();
} else {
// Attempt to switch to a new memtable and trigger compaction of old
assert(versions_->PrevLogNumber() == 0); // expects no previous log number to be recorded
uint64_t new_log_number = versions_->NewFileNumber(); // allocate a file number for the new log
WritableFile* lfile = nullptr;
s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); // create the new log file
if (!s.ok()) { // creation failed
// Avoid chewing through file number space in a tight loop.
versions_->ReuseFileNumber(new_log_number); // hand the unused file number back
break;
}
delete log_;
delete logfile_;
logfile_ = lfile;
logfile_number_ = new_log_number; // remember the new log file and its number
log_ = new log::Writer(lfile); // log writer over the new file
imm_ = mem_; // the current memtable becomes imm_
has_imm_.store(true, std::memory_order_release); // flag that imm_ is now set (release store)
mem_ = new MemTable(internal_comparator_); // start a fresh memtable
mem_->Ref();
force = false; // Do not force another compaction if have room
MaybeScheduleCompaction(); // give compaction of the old memtable a chance to be scheduled
}
}
return s;
}
檢查是否還有空間,如果空間不夠則重新生成新的memtable空間來裝載數據。
BuildBatchGroup合併操作內容
// Merges the batch of the front writer with the batches of the writers
// queued behind it, as far as the limits below allow, and returns the batch
// to be written. *last_writer is set to the last writer covered by the
// returned batch.
//
// If nothing is merged, the front writer's own batch is returned unchanged.
// Otherwise the merged operations are assembled in tmp_batch_ so that the
// caller's batch is left untouched.
//
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-null batch
WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
  mutex_.AssertHeld();
  assert(!writers_.empty());

  Writer* const leader = writers_.front();
  WriteBatch* group = leader->batch;
  assert(group != nullptr);

  size_t total_bytes = WriteBatchInternal::ByteSize(leader->batch);

  // Allow the group to grow up to a maximum size, but if the
  // original write is small, limit the growth so we do not slow
  // down the small write too much.
  const size_t byte_limit =
      (total_bytes <= (128 << 10)) ? total_bytes + (128 << 10) : (1 << 20);

  *last_writer = leader;
  auto it = writers_.begin();
  ++it;  // the leader itself is already accounted for
  while (it != writers_.end()) {
    Writer* const follower = *it;

    // Do not include a sync write into a batch handled by a non-sync write.
    if (follower->sync && !leader->sync) {
      break;
    }

    if (follower->batch != nullptr) {
      total_bytes += WriteBatchInternal::ByteSize(follower->batch);
      // Do not make batch too big
      if (total_bytes > byte_limit) {
        break;
      }

      // On the first merge, switch to the temporary batch instead of
      // disturbing the caller's batch, and seed it with the leader's data.
      if (group == leader->batch) {
        group = tmp_batch_;
        assert(WriteBatchInternal::Count(group) == 0);
        WriteBatchInternal::Append(group, leader->batch);
      }
      WriteBatchInternal::Append(group, follower->batch);
    }

    *last_writer = follower;
    ++it;
  }
  return group;
}
主要就是合併對應的操作數據,把隊列中排在後面的writer的batch依次append到同一個batch中(受最大字節數和sync標誌的限制),以此可以提高單次的操作效率。
總結
有關數據的基本的Put和Delete的流程,基本上就是通過WriteBatch類,將插入操作和刪除操作都做成了write的操作,以此提高了寫入的效率;刪除時並不直接移除數據,而是以刪除類型(kTypeDeletion)記錄的形式追加到數據中。本文只是從基本代碼流程上分析了Put和Delete對應的操作。由於本人才疏學淺,如有錯誤請批評指正。