針對Log文件的作用及格式介紹系列文章中有介紹,可點此處查看Log文件介紹說明。
所有的寫操作都是先成功地append到Log日誌中,然後再更新內存中的memtable。
這樣做有如下優點:
- 可以將隨機的寫IO變成append,極大的提高寫磁盤速度;
- 防止節點宕機導致內存中的數據丟失,數據丟失對系統來說是個災難。
在寫KV記錄之前,會先調用MakeRoomForWrite
來決定是否切換到新的日誌文件,所以在寫入的過程中是不需要關注文件切換的。接下來介紹Log模塊的讀寫流程及結構。
一、文件結構
- log_format.h:描述Log格式及Record類型。
- log_reader.h、log_reader.cc:讀模塊實現。
- log_writer.h、log_writer.cc:寫模塊實現。
二、格式信息
結構字段
- 一共有四種有效的Record類型(kFullType、kFirstType、kMiddleType、kLastType);kZeroType是爲預分配文件保留的。
- 每個Block爲32KB
- 每個Record頭大小爲4 + 2 + 1 = 7個字節。
// Constants describing the on-disk log format: record types, block size and
// per-record header size.
namespace log {
// Type tag stored in the header of every physical record.
enum RecordType {
// Zero is reserved for preallocated files
kZeroType = 0,
// A logical record that fits in a single physical record.
kFullType = 1,
// For fragments
// First, middle and last pieces of a logical record that was split up.
kFirstType = 2,
kMiddleType = 3,
kLastType = 4
};
// Largest valid RecordType value.
static const int kMaxRecordType = kLastType;
// Size of one log block in bytes (32 KiB).
static const int kBlockSize = 32768;
// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
static const int kHeaderSize = 4 + 2 + 1;
} // namespace log
構造格式
三、寫流程
1.類關係圖
2.源碼
log_writer.h
namespace leveldb {
class WritableFile;
namespace log {
class Writer {
public:
<!實例一個Writer,傳入的參數*dest要爲空,且在寫期間,*dest要保持存活>
// Create a writer that will append data to "*dest".
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(WritableFile* dest);
// Create a writer that will append data to "*dest".
// "*dest" must have initial length "dest_length".
// "*dest" must remain live while this Writer is in use.
Writer(WritableFile* dest, uint64_t dest_length);
Writer(const Writer&) = delete;
Writer& operator=(const Writer&) = delete;
~Writer();
<!寫一個Record到文件中>
Status AddRecord(const Slice& slice);
private:
<!實際寫>
Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
<!Log文件>
WritableFile* dest_;
<!位於當前block的哪個位置>
int block_offset_; // Current offset in block
<!提前計算好的Type對應的CRC值,減少使用過程中的計算>
// crc32c values for all supported record types. These are
// pre-computed to reduce the overhead of computing the crc of the
// record type stored in the header.
uint32_t type_crc_[kMaxRecordType + 1];
};
} // namespace log
} // namespace leveldb
log_writer.cc
namespace leveldb {
namespace log {
<!計算RecordType的CRC32值>
// Fills type_crc[0..kMaxRecordType] with the crc32c of each record type's
// single type byte.
static void InitTypeCrc(uint32_t* type_crc) {
  int type = 0;
  while (type <= kMaxRecordType) {
    const char type_byte = static_cast<char>(type);
    type_crc[type] = crc32c::Value(&type_byte, 1);
    ++type;
  }
}
// Creates a writer for an initially empty "*dest": identical to the
// two-argument constructor with a starting length of zero.
Writer::Writer(WritableFile* dest) : Writer(dest, 0) {}
// Creates a writer that continues appending to "*dest", which already holds
// "dest_length" bytes. The offset inside the current block is the file length
// modulo the block size.
Writer::Writer(WritableFile* dest, uint64_t dest_length)
    : dest_(dest),
      block_offset_(static_cast<int>(dest_length % kBlockSize)) {
  InitTypeCrc(type_crc_);
}
<!指定默認析構函數>
// Compiler-generated destructor; the destination file pointer is not deleted.
Writer::~Writer() = default;
<!寫Record流程>
// Appends "slice" to the log as one logical record and returns the status of
// the last write that was attempted.
//
// The file is written in blocks of kBlockSize bytes, each holding a sequence
// of physical records that start with a kHeaderSize-byte header. With L bytes
// left in the current block and N payload bytes still to write:
//   1. L >= N + kHeaderSize: the remaining payload fits. It is emitted as a
//      kFullType record if nothing has been emitted yet, otherwise as
//      kLastType.
//   2. kHeaderSize <= L < N + kHeaderSize: the block cannot hold everything.
//      The next L - kHeaderSize payload bytes (possibly zero, i.e. only a
//      header) are emitted as kFirstType or kMiddleType and the rest
//      continues in the following block.
//   3. L < kHeaderSize: the tail of the block is filled with zero bytes and
//      writing moves on to a fresh block.
// An empty slice still goes through the loop once and produces a single
// zero-length record.
Status Writer::AddRecord(const Slice& slice) {
  const char* ptr = slice.data();
  size_t left = slice.size();

  Status s;
  bool begin = true;
  do {
    const int leftover = kBlockSize - block_offset_;
    assert(leftover >= 0);
    if (leftover < kHeaderSize) {
      // Switch to a new block
      if (leftover > 0) {
        // Fill the trailer (literal below relies on kHeaderSize being 7)
        static_assert(kHeaderSize == 7, "");
        s = dest_->Append(
            Slice("\x00\x00\x00\x00\x00\x00", static_cast<size_t>(leftover)));
        if (!s.ok()) {
          // The padding did not make it to the file. Resetting block_offset_
          // and continuing would desynchronize the writer's block accounting
          // from the file contents, so surface the error instead.
          return s;
        }
      }
      block_offset_ = 0;
    }

    // Invariant: we never leave < kHeaderSize bytes in a block.
    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);

    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
    const size_t fragment_length = (left < avail) ? left : avail;

    // The record type depends on whether this fragment is the first and/or
    // the last piece of the logical record.
    RecordType type;
    const bool end = (left == fragment_length);
    if (begin && end) {
      type = kFullType;
    } else if (begin) {
      type = kFirstType;
    } else if (end) {
      type = kLastType;
    } else {
      type = kMiddleType;
    }

    s = EmitPhysicalRecord(type, ptr, fragment_length);
    ptr += fragment_length;
    left -= fragment_length;
    begin = false;
  } while (s.ok() && left > 0);
  return s;
}
<!實際寫實現:
1、格式化打包頭;
2、CRC校驗計算;
3、先寫頭、再寫Payload,寫成功之後flush一下;
4、將block_offset_位置重新計算下。
>
// Writes a single physical record of type "t" whose payload is the "length"
// bytes starting at "ptr":
//   1. compute the masked crc over the type byte and the payload;
//   2. pack the header (crc, 2-byte length with the low byte first, type);
//   3. append the header, then the payload, then flush;
//   4. advance block_offset_ past the record, whether or not the writes
//      succeeded.
// Returns the status of the first step that failed, or OK.
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr,
                                  size_t length) {
  assert(length <= 0xffff);  // Must fit in two bytes
  assert(block_offset_ + kHeaderSize + length <= kBlockSize);

  // The crc covers the record type and the payload; it is masked before
  // being stored.
  const uint32_t stored_crc =
      crc32c::Mask(crc32c::Extend(type_crc_[t], ptr, length));

  char header[kHeaderSize];
  EncodeFixed32(header, stored_crc);
  header[4] = static_cast<char>(length & 0xff);
  header[5] = static_cast<char>(length >> 8);
  header[6] = static_cast<char>(t);

  // Each step runs only if every previous one succeeded.
  Status status = dest_->Append(Slice(header, kHeaderSize));
  if (status.ok()) {
    status = dest_->Append(Slice(ptr, length));
  }
  if (status.ok()) {
    status = dest_->Flush();
  }

  block_offset_ += kHeaderSize + length;
  return status;
}
} // namespace log
} // namespace leveldb
四、讀流程
1.類關係圖
2.源碼
log_reader.h
namespace leveldb {
<!順序讀取文件的抽象封裝類>
class SequentialFile;
namespace log {
class Reader {
public:
<!負責上報錯誤類>
// Interface for reporting errors.
class Reporter {
public:
virtual ~Reporter();
// Some corruption was detected. "size" is the approximate number
// of bytes dropped due to the corruption.
virtual void Corruption(size_t bytes, const Status& status) = 0;
};
// Create a reader that will return log records from "*file".
// "*file" must remain live while this Reader is in use.
//
// If "reporter" is non-null, it is notified whenever some data is
// dropped due to a detected corruption. "*reporter" must remain
// live while this Reader is in use.
//
// If "checksum" is true, verify checksums if available.
//
// The Reader will start reading at the first record located at physical
// position >= initial_offset within the file.
<!
1.file: 要讀取的Log文件封裝。
2.reporter: 錯誤上報類。
3.checksum: 是否check校驗。
4.initial_offset:開始讀取數據偏移位置。
>
Reader(SequentialFile* file, Reporter* reporter, bool checksum,
uint64_t initial_offset);
<!禁止拷貝構造和賦值構造>
Reader(const Reader&) = delete;
Reader& operator=(const Reader&) = delete;
~Reader();
// Read the next record into *record. Returns true if read
// successfully, false if we hit end of the input. May use
// "*scratch" as temporary storage. The contents filled in *record
// will only be valid until the next mutating operation on this
// reader or the next mutation to *scratch.
<!
1.讀取一個Record記錄,成功返回true,失敗返回false。
2.讀取的數據在*record參數中,傳入的*scratch用於臨時內部臨時存儲使用。
>
bool ReadRecord(Slice* record, std::string* scratch);
// Returns the physical offset of the last record returned by ReadRecord.
//
// Undefined before the first call to ReadRecord.
<!返回最近一次讀取Record的偏移位,也就是這個Record的起始位>
uint64_t LastRecordOffset();
private:
// Extend record types with the following special values
<!
擴展兩種類型用於錯誤表示。
1.kEof表示到達文件尾。
2.kBadRecord表示以下三種錯誤:
1)CRC校驗失敗、
2)讀取長度爲0、
3)讀取的內存在initial_offset之外,比方說從64位置開始讀而Record在31~63之間。
>
enum {
kEof = kMaxRecordType + 1,
// Returned whenever we find an invalid physical record.
// Currently there are three situations in which this happens:
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
// * The record is a 0-length record (No drop is reported)
// * The record is below constructor's initial_offset (No drop is reported)
kBadRecord = kMaxRecordType + 2
};
// Skips all blocks that are completely before "initial_offset_".
//
// Returns true on success. Handles reporting.
<!跳到起始位置initial_offset處開始讀取>
bool SkipToInitialBlock();
// Return type, or one of the preceding special values
<!讀取一個Record>
unsigned int ReadPhysicalRecord(Slice* result);
// Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation.
<!上報錯誤和丟棄>
void ReportCorruption(uint64_t bytes, const char* reason);
void ReportDrop(uint64_t bytes, const Status& reason);
SequentialFile* const file_;
Reporter* const reporter_;
bool const checksum_;
<!32kb大小數據存儲空間,用於從文件中讀取一個Block>
char* const backing_store_;
<!將從文件讀取到的數據封裝爲一個Slice,用buffer_來表示>
Slice buffer_;
<!當讀取的文件數據大小小於kBlockSize,表示讀取到文件尾,將eof_置位true>
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
<!最近一次讀取Record的偏移位,也就是這個Record的起始位>
// Offset of the last record returned by ReadRecord.
uint64_t last_record_offset_;
<!讀取的Buffer尾部的偏移位>
// Offset of the first location past the end of buffer_.
uint64_t end_of_buffer_offset_;
<!開始讀取數據位置>
// Offset at which to start looking for the first record to return
uint64_t const initial_offset_;
<!是否重新開始讀取Record>
<!在初始讀取位置initial_offset > 0的情況下,resyncing_才爲true,
因爲初始位置如果不是從0開始,首次讀取到的Record的type是kMiddleType和
kLastType的話,則不是一個完整的record,所以要丟棄重新讀取。
>
// True if we are resynchronizing after a seek (initial_offset_ > 0). In
// particular, a run of kMiddleType and kLastType records can be silently
// skipped in this mode
bool resyncing_;
};
} // namespace log
} // namespace leveldb
log_reader.cc
namespace log {
<!指定下默認析構函數>
// Out-of-line definition of the Reporter interface's virtual destructor.
Reader::Reporter::~Reporter() = default;
<!實例化時,做如下事情:
1、賦值下讀取文件、異常上報程序;
2、是否執行數據校驗(checksum_爲true,則校驗);
3、申請一塊32KB大小的內存用於讀取block;
4、Slice(buffer_)初始化;
5、上次讀取的record偏移位爲0;
6、讀取的一個buffer尾部偏移位爲0;
7、初始化讀取Record位置。
8、重讀取標誌(resyncing_)
>
// Builds a reader over "file" that reports through "reporter" and verifies
// checksums when "checksum" is true. Allocates a kBlockSize-byte backing
// buffer (released in the destructor), default-constructs buffer_, clears the
// EOF flag, zeroes both tracked offsets, records initial_offset, and enables
// resynchronization mode when initial_offset is greater than zero.
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
uint64_t initial_offset)
: file_(file),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
buffer_(),
eof_(false),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset),
resyncing_(initial_offset > 0) {}
<!析構時,釋放內存>
// Frees the block buffer that the constructor allocated with new[].
Reader::~Reader() { delete[] backing_store_; }
<!根據initial_offset跳轉到第一個Block處>
bool Reader::SkipToInitialBlock() {
const size_t offset_in_block = initial_offset_ % kBlockSize;
uint64_t block_start_location = initial_offset_ - offset_in_block;
<!寫數據時,會有個最後6字節的0x00填充位,也就是trailer
如果最後求到的餘的位置落在這6字節範圍內,直接跳過一個32KB
的Block,進行讀取。
>
// Don't search a block if we'd be in the trailer
if (offset_in_block > kBlockSize - 6) {
block_start_location += kBlockSize;
}
<!跳轉到的開始讀取位置指定爲Buffer的尾部偏移位>
end_of_buffer_offset_ = block_start_location;
<!跳轉到第一個包含初始Record的Block處,如果異常就報錯>
// Skip to start of first block that can contain the initial record
if (block_start_location > 0) {
Status skip_status = file_->Skip(block_start_location);
if (!skip_status.ok()) {
ReportDrop(block_start_location, skip_status);
return false;
}
}
return true;
}
<!讀取Record實現>
// Reads the next logical record into *record, reassembling it from physical
// records when it was fragmented ("*scratch" holds the assembled bytes).
// Returns true when a record was produced and false on end of input or when
// skipping to the initial block fails.
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
  // While the last returned record lies before the requested start offset,
  // skip everything in between and go straight to the block to read from.
  if (last_record_offset_ < initial_offset_ && !SkipToInitialBlock()) {
    return false;
  }

  scratch->clear();
  record->clear();

  // On entry we cannot be in the middle of a fragmented record.
  bool assembling = false;

  // Offset of the logical record that we're reading.
  // 0 is a dummy value to make compilers happy.
  uint64_t logical_start = 0;

  Slice piece;
  while (true) {
    // Fetch one physical record together with its type.
    const unsigned int record_type = ReadPhysicalRecord(&piece);

    // ReadPhysicalRecord may have only had an empty trailer remaining in its
    // internal buffer. Calculate the offset of the next physical record now
    // that it has returned, properly accounting for its header size. This is
    // the starting offset of the physical record that was just read.
    const uint64_t physical_start =
        end_of_buffer_offset_ - buffer_.size() - kHeaderSize - piece.size();

    // resyncing_ is true when initial_offset > 0. In that mode:
    //   - a kMiddleType record is missing its kFirstType, so it is skipped
    //     and resynchronization continues;
    //   - a kLastType record is missing its earlier fragments, so it is
    //     skipped too, but resynchronization ends;
    //   - any other type ends resynchronization and is handled normally.
    if (resyncing_) {
      if (record_type == kMiddleType) {
        continue;
      }
      resyncing_ = false;
      if (record_type == kLastType) {
        continue;
      }
    }

    switch (record_type) {
      case kFullType:
        // Handle bug in earlier versions of log::Writer where
        // it could emit an empty kFirstType record at the tail end
        // of a block followed by a kFullType or kFirstType record
        // at the beginning of the next block. Only a non-empty partial
        // record counts as corruption.
        if (assembling && !scratch->empty()) {
          ReportCorruption(scratch->size(), "partial record without end(1)");
        }
        // Remember where this record starts and hand it back directly.
        logical_start = physical_start;
        scratch->clear();
        *record = piece;
        last_record_offset_ = logical_start;
        return true;

      case kFirstType:
        // Same writer bug as above: a kFirstType may follow an empty
        // kFirstType left at the end of the previous block.
        if (assembling && !scratch->empty()) {
          ReportCorruption(scratch->size(), "partial record without end(2)");
        }
        // Start assembling a record made of first/middle/last fragments.
        logical_start = physical_start;
        scratch->assign(piece.data(), piece.size());
        assembling = true;
        break;

      case kMiddleType:
        // A middle fragment is only expected while a record is being
        // assembled; otherwise its start is missing.
        if (assembling) {
          scratch->append(piece.data(), piece.size());
        } else {
          ReportCorruption(piece.size(),
                           "missing start of fragmented record(1)");
        }
        break;

      case kLastType:
        if (assembling) {
          // The final fragment completes the record; return the assembled
          // bytes and record where the logical record started.
          scratch->append(piece.data(), piece.size());
          *record = Slice(*scratch);
          last_record_offset_ = logical_start;
          return true;
        }
        ReportCorruption(piece.size(),
                         "missing start of fragmented record(2)");
        break;

      case kEof:
        if (assembling) {
          // This can be caused by the writer dying immediately after
          // writing a physical record but before completing the next; don't
          // treat it as a corruption, just ignore the entire logical record.
          scratch->clear();
        }
        return false;

      case kBadRecord:
        if (assembling) {
          ReportCorruption(scratch->size(), "error in middle of record");
          assembling = false;
          scratch->clear();
        }
        break;

      default: {
        char buf[40];
        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
        // Drop this fragment plus whatever had been assembled so far.
        const size_t dropped =
            piece.size() + (assembling ? scratch->size() : 0);
        ReportCorruption(dropped, buf);
        assembling = false;
        scratch->clear();
        break;
      }
    }
  }
  return false;
}
<!返回最近讀取Record的偏移位>
// Returns the stored offset of the last record handed out by ReadRecord.
uint64_t Reader::LastRecordOffset() { return last_record_offset_; }
void Reader::ReportCorruption(uint64_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason));
}
// Forwards a drop of "bytes" bytes to the reporter, if there is one. Drops
// whose computed start position lies before initial_offset_ are not reported.
void Reader::ReportDrop(uint64_t bytes, const Status& reason) {
  if (reporter_ == nullptr) {
    return;
  }
  // Position of the dropped region: current read position minus its size.
  const uint64_t drop_start = end_of_buffer_offset_ - buffer_.size() - bytes;
  if (drop_start < initial_offset_) {
    return;
  }
  reporter_->Corruption(static_cast<size_t>(bytes), reason);
}
// Reads the next physical record from the file. On success stores the payload
// in *result and returns the record's type; otherwise returns kEof or
// kBadRecord.
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
  // Loop until a complete physical record (or a terminal condition) is found.
  while (true) {
    if (buffer_.size() < kHeaderSize) {
      // Fewer bytes than a header remain in the buffer.
      if (eof_) {
        // Note that if buffer_ is non-empty, we have a truncated header at the
        // end of the file, which can be caused by the writer crashing in the
        // middle of writing the header. Instead of considering this an error,
        // just report EOF.
        buffer_.clear();
        return kEof;
      }

      // Last read was a full read, so this is a trailer to skip: discard the
      // padding and read the next block, advancing end_of_buffer_offset_ by
      // the number of bytes obtained.
      buffer_.clear();
      Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
      end_of_buffer_offset_ += buffer_.size();
      if (!status.ok()) {
        // A failed read is reported as a dropped block and treated as EOF.
        buffer_.clear();
        ReportDrop(kBlockSize, status);
        eof_ = true;
        return kEof;
      }
      if (buffer_.size() < kBlockSize) {
        // A short read means the end of the file was reached; the next
        // iteration decides whether enough bytes remain for a header.
        eof_ = true;
      }
      continue;
    }

    // Parse the header
    const char* header = buffer_.data();
    const uint32_t length_lo = static_cast<uint32_t>(header[4]) & 0xff;
    const uint32_t length_hi = static_cast<uint32_t>(header[5]) & 0xff;
    const uint32_t length = length_lo | (length_hi << 8);
    const unsigned int type = header[6];

    // The header claims more payload than the buffer actually holds.
    if (kHeaderSize + length > buffer_.size()) {
      const size_t drop_size = buffer_.size();
      buffer_.clear();
      if (eof_) {
        // If the end of the file has been reached without reading |length|
        // bytes of payload, assume the writer died in the middle of writing
        // the record. Don't report a corruption.
        return kEof;
      }
      ReportCorruption(drop_size, "bad record length");
      return kBadRecord;
    }

    if (type == kZeroType && length == 0) {
      // Skip zero length record without reporting any drops since
      // such records are produced by the mmap based writing code in
      // env_posix.cc that preallocates file regions.
      buffer_.clear();
      return kBadRecord;
    }

    // Check crc (computed over the type byte followed by the payload)
    if (checksum_) {
      const uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
      const uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
      if (actual_crc != expected_crc) {
        // Drop the rest of the buffer since "length" itself may have
        // been corrupted and if we trust it, we could find some
        // fragment of a real log record that just happens to look
        // like a valid log record.
        const size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "checksum mismatch");
        return kBadRecord;
      }
    }

    // Consume this record (header and payload) from the buffer.
    buffer_.remove_prefix(kHeaderSize + length);

    // Skip physical record that started before initial_offset_. The record's
    // start is the current read position minus the record's total size.
    const uint64_t record_start =
        end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length;
    if (record_start < initial_offset_) {
      result->clear();
      return kBadRecord;
    }

    // Hand back the payload of a complete physical record.
    *result = Slice(header + kHeaderSize, length);
    return type;
  }
}
} // namespace log
參考鏈接:
https://blog.csdn.net/weixin_36145588/article/details/76423194