日志文件是LevelDB架构中最重要的部分之一。LSM-Tree会将写入的数据先存放在内存中的Memtable里,当Memtable的大小达到临界值时,就将其变成只读的Immutable Memtable,随后由后台进程将其flush到磁盘上的sstable。如果在这个过程中系统发生错误,内存中尚未落盘的数据就会丢失,这时候就需要依靠日志文件进行恢复。因此,LevelDB在将数据写入内存之前,会先将其记录到日志文件中。
日志格式
每一条日志记录由头部和数据组成,格式如下:头部共7字节,依次为校验和(4字节)、长度(2字节)、类型(1字节),头部之后是数据。每一个block包含若干条记录,block的大小默认为32KB。type有四种类型:FULL、FIRST、MIDDLE、LAST,FULL表示一条记录完整地存放在同一个block中,FIRST、MIDDLE、LAST分别表示一条记录被拆分到多个block之后的第一段、中间段和最后一段。
// Type tag stored in the last byte of every physical record header.
enum RecordType {
  // Zero is reserved: the reader skips zero-type, zero-length records
  // (left behind by file preallocation) without reporting a drop.
  kZeroType = 0,

  // The whole logical record fits in a single physical record.
  kFullType = 1,

  // Fragments of a logical record that is split across several blocks.
  kFirstType = 2,
  kMiddleType = 3,
  kLastType = 4
};

// Largest valid RecordType value.
static const int kMaxRecordType = kLastType;

// Size of one log block in bytes (32 KiB).
static const int kBlockSize = 32 * 1024;

// Physical record header: checksum (4 bytes), length (2 bytes), type (1 byte).
static const int kHeaderSize = 4 + 2 + 1;
日志的读过程
LevelDB整体的读过程如下图。可以看到查询过程要从内存中查,如果没有再到磁盘上查找。步骤分成三步:
- 先从内存中MemTable中查找,找到返回
- 然后从Immutable Memtable中查找,找到返回
- 如果内存中没有找到,再从磁盘中sstable中查找
接下来看Reader的源码实现。Reader类定义。
// Reads logical records back out of a log file, reassembling records that
// the writer split into several physical fragments.
class Reader {
 public:
  // Interface through which the reader reports data it had to drop.
  class Reporter {
   public:
    virtual ~Reporter();

    // Called when "bytes" bytes were dropped; "status" describes the reason.
    virtual void Corruption(size_t bytes, const Status& status) = 0;
  };

  // Creates a reader that returns log records from "*file".
  // "*file" must stay alive for as long as this Reader is in use.
  //
  // A non-null "reporter" is notified whenever data is dropped because of a
  // detected corruption; "*reporter" must also outlive this Reader.
  //
  // When "checksum" is true, record checksums are verified if available.
  //
  // Reading starts at the first record whose physical position in the file
  // is >= initial_offset.
  Reader(SequentialFile* file, Reporter* reporter, bool checksum,
         uint64_t initial_offset);

  // Not copyable.
  Reader(const Reader&) = delete;
  Reader& operator=(const Reader&) = delete;

  ~Reader();

  // Reads the next record into *record, using *scratch as temporary storage.
  // Returns true when a record was read successfully.
  bool ReadRecord(Slice* record, std::string* scratch);

  // Returns the offset of the last record returned by ReadRecord.
  // Must only be called after ReadRecord.
  uint64_t LastRecordOffset();

 private:
  // Extra result codes of ReadPhysicalRecord, placed above the RecordType
  // values.
  enum {
    kEof = kMaxRecordType + 1,
    // Returned whenever an invalid physical record is found.
    // Currently this happens in three situations:
    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
    // * The record is a 0-length record (no drop is reported)
    // * The record is below the constructor's initial_offset (no drop is
    //   reported)
    kBadRecord = kMaxRecordType + 2
  };

  // Jumps ahead to initial_offset_. ReadRecord gives up and returns false
  // when this returns false.
  bool SkipToInitialBlock();

  // Reads one physical record. Returns its type, or kEof / kBadRecord.
  unsigned int ReadPhysicalRecord(Slice* result);

  // Hand the reason for a problem to reporter_.
  void ReportCorruption(uint64_t bytes, const char* reason);
  void ReportDrop(uint64_t bytes, const Status& reason);

  SequentialFile* const file_;  // File being read.
  Reporter* const reporter_;    // Receiver of error reports.
  bool const checksum_;         // Whether checksums are verified.
  char* const backing_store_;   // Scratch storage passed to file_->Read().
  Slice buffer_;                // Data read from the file, not yet consumed.
  bool eof_;  // Last Read() indicated EOF by returning < kBlockSize

  // Offset of the last record returned by ReadRecord.
  uint64_t last_record_offset_;
  // File offset just past the end of buffer_.
  uint64_t end_of_buffer_offset_;
  // Offset at which to start looking for the first record to return.
  uint64_t const initial_offset_;

  // While set, ReadRecord skips kMiddleType and kLastType fragments; it is
  // cleared as soon as a kLastType or any other non-middle type is seen.
  bool resyncing_;
};
ReadRecord函数将记录放到record中。它循环调用ReadPhysicalRecord,每次读取一条物理记录;如果一条记录被拆分在不同的block中,则先将各个片段依次追加到scratch中,直到读到kLastType,才将整个scratch放到record中返回。
// Reads the next logical record into *record.
//
// A record stored as a single kFullType fragment is returned directly. A
// record split into kFirstType / kMiddleType / kLastType fragments is
// accumulated in *scratch and returned once the last fragment arrives, in
// which case *record refers to *scratch.
//
// Returns true when a record was produced and false at end of file (or when
// SkipToInitialBlock fails). Damaged or unexpected fragments are reported
// through ReportCorruption and skipped.
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
  // Nothing at or past initial_offset_ has been returned yet: skip ahead.
  if (last_record_offset_ < initial_offset_ && !SkipToInitialBlock()) {
    return false;
  }

  scratch->clear();
  record->clear();

  // True while fragments of a multi-fragment record are being collected.
  bool assembling = false;
  // Offset of the first fragment of the logical record being collected.
  // The initial 0 is a dummy value that keeps compilers quiet.
  uint64_t logical_start = 0;

  Slice piece;
  for (;;) {
    const unsigned int kind = ReadPhysicalRecord(&piece);

    // ReadPhysicalRecord may have had only an empty trailer left in its
    // internal buffer, so the offset of the fragment it just returned is
    // computed after the call, accounting for the header size.
    const uint64_t piece_start =
        end_of_buffer_offset_ - buffer_.size() - kHeaderSize - piece.size();

    if (resyncing_) {
      // Middle fragments are skipped without leaving resync mode.
      if (kind == kMiddleType) {
        continue;
      }
      // Anything else ends resync mode; a trailing last fragment is still
      // skipped because its beginning was never seen.
      resyncing_ = false;
      if (kind == kLastType) {
        continue;
      }
    }

    switch (kind) {
      case kFullType:
        // Earlier versions of log::Writer could emit an empty kFirstType
        // record at the tail of a block, followed by a kFullType or
        // kFirstType record at the start of the next block. Only a non-empty
        // partial record is therefore treated as corruption.
        if (assembling && !scratch->empty()) {
          ReportCorruption(scratch->size(), "partial record without end(1)");
        }
        scratch->clear();
        *record = piece;
        last_record_offset_ = piece_start;
        return true;

      case kFirstType:
        // Same tolerance for the old writer bug as in the kFullType case.
        if (assembling && !scratch->empty()) {
          ReportCorruption(scratch->size(), "partial record without end(2)");
        }
        logical_start = piece_start;
        scratch->assign(piece.data(), piece.size());
        assembling = true;
        break;

      case kMiddleType:
        if (assembling) {
          scratch->append(piece.data(), piece.size());
        } else {
          ReportCorruption(piece.size(),
                           "missing start of fragmented record(1)");
        }
        break;

      case kLastType:
        if (assembling) {
          scratch->append(piece.data(), piece.size());
          *record = Slice(*scratch);
          last_record_offset_ = logical_start;
          return true;
        }
        ReportCorruption(piece.size(),
                         "missing start of fragmented record(2)");
        break;

      case kEof:
        if (assembling) {
          // The writer may have died right after writing one physical record
          // and before completing the next. This is not treated as
          // corruption; the incomplete logical record is simply ignored.
          scratch->clear();
        }
        return false;

      case kBadRecord:
        if (assembling) {
          ReportCorruption(scratch->size(), "error in middle of record");
          assembling = false;
          scratch->clear();
        }
        break;

      default: {
        char message[40];
        snprintf(message, sizeof(message), "unknown record type %u", kind);
        ReportCorruption(
            (piece.size() + (assembling ? scratch->size() : 0)), message);
        assembling = false;
        scratch->clear();
        break;
      }
    }
  }
  return false;
}
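ReadPhysicalRecord函数负责从文件中读取一条物理记录:当缓冲区中剩余的数据不足一个头部时,先从文件读入下一个block;然后解析头部中的长度和类型,并按需校验crc;最后把头部之后的数据放到result中,并返回记录的类型(或者kEof、kBadRecord)。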
// Reads the next physical record from the file.
//
// On success *result is set to the record payload (the bytes following the
// 7-byte header) and the record's type byte is returned. Returns kEof when
// no further record can be read, and kBadRecord when the record had to be
// skipped (bad length, preallocated zero record, checksum mismatch, or a
// record that starts before initial_offset_).
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
  for (;;) {
    if (buffer_.size() < kHeaderSize) {
      if (eof_) {
        // A non-empty buffer_ here is a truncated header at the end of the
        // file, which can be caused by the writer crashing while writing the
        // header. This is reported as EOF rather than as an error.
        buffer_.clear();
        return kEof;
      }

      // The last read was a full block, so what is left is a trailer to
      // skip. Read the next block from the file into buffer_.
      buffer_.clear();
      Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
      end_of_buffer_offset_ += buffer_.size();
      if (!status.ok()) {
        buffer_.clear();
        ReportDrop(kBlockSize, status);
        eof_ = true;
        return kEof;
      }
      if (buffer_.size() < kBlockSize) {
        // A short read means the end of the file has been reached.
        eof_ = true;
      }
      // The read succeeded; re-check the buffer from the top of the loop.
      continue;
    }

    // Header layout: bytes [0,3] crc, bytes [4,5] length, byte [6] type.
    const char* header = buffer_.data();
    const uint32_t lo = static_cast<uint32_t>(header[4]) & 0xff;
    const uint32_t hi = static_cast<uint32_t>(header[5]) & 0xff;
    // NOTE(review): header[6] is a plain char converted without masking, so
    // where char is signed a type byte >= 0x80 becomes a very large value.
    const unsigned int type = header[6];
    const uint32_t length = lo | (hi << 8);

    if (kHeaderSize + length > buffer_.size()) {
      const size_t dropped = buffer_.size();
      buffer_.clear();
      if (eof_) {
        // The end of the file was reached without reading |length| bytes of
        // payload: assume the writer died in the middle of writing the
        // record and do not report a corruption.
        return kEof;
      }
      ReportCorruption(dropped, "bad record length");
      return kBadRecord;
    }

    if (type == kZeroType && length == 0) {
      // Zero-length records are skipped without reporting any drop, since
      // they are produced by the mmap-based writing code in env_posix.cc
      // that preallocates file regions.
      buffer_.clear();
      return kBadRecord;
    }

    if (checksum_) {
      // The stored crc covers the type byte and the payload.
      const uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
      const uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
      if (actual_crc != expected_crc) {
        // Drop the rest of the buffer: "length" itself may be corrupted, and
        // trusting it could land on a fragment of a real log record that
        // merely happens to look like a valid record.
        const size_t dropped = buffer_.size();
        buffer_.clear();
        ReportCorruption(dropped, "checksum mismatch");
        return kBadRecord;
      }
    }

    // Consume this record from the buffer.
    buffer_.remove_prefix(kHeaderSize + length);

    // Skip a physical record that started before initial_offset_.
    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
        initial_offset_) {
      result->clear();
      return kBadRecord;
    }

    // Return everything after the header as the payload.
    *result = Slice(header + kHeaderSize, length);
    return type;
  }
}
日志的写过程
LevelDB整体的写过程如下图。可以看到整体分成两个步骤,首先是将写操作写入log,然后才是写入内存Memtable中。
日志写类的实现如下。首先了解一下,Slice是一个简单的结构,用来包含一个指向额外存储空间的指针和一个大小,可以和string相互转换。
// Appends records to a log file, splitting them into physical fragments
// that never cross a block boundary.
class Writer {
 public:
  // Creates a writer that appends data to "*dest".
  explicit Writer(WritableFile* dest);

  // Creates a writer that appends data to "*dest".
  // NOTE(review): dest_length is presumably the current length of "*dest";
  // confirm against the constructor definition.
  Writer(WritableFile* dest, uint64_t dest_length);

  // Not copyable.
  Writer(const Writer&) = delete;
  Writer& operator=(const Writer&) = delete;

  ~Writer();

  // Public entry point: writes the contents of "slice" to the log as one
  // logical record.
  Status AddRecord(const Slice& slice);

 private:
  // Writes a single physical record (header plus payload) to dest_.
  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);

  // Destination file. WritableFile is an abstract class that the user
  // derives from and implements.
  WritableFile* dest_;

  // Current offset within the block being written.
  int block_offset_;

  // One crc value per record type; EmitPhysicalRecord extends
  // type_crc_[type] over the payload to obtain the record checksum.
  uint32_t type_crc_[kMaxRecordType + 1];
};
AddRecord函数首先判断不同记录的类型,然后调用EmitPhysicalRecord函数将其写入物理磁盘
// Appends "slice" to the log as one logical record.
//
// The record is cut into fragments so that no fragment crosses a block
// boundary; each fragment is written by EmitPhysicalRecord with the type
// kFullType, kFirstType, kMiddleType or kLastType. An empty slice still
// produces one zero-length kFullType record.
//
// Returns the status of the first failing write, including a failure to
// write the zero padding at the end of a block; otherwise returns the status
// of the last fragment written.
Status Writer::AddRecord(const Slice& slice) {
  const char* ptr = slice.data();
  size_t left = slice.size();

  Status s;
  bool begin = true;
  do {
    // Space remaining in the current block.
    const int leftover = kBlockSize - block_offset_;
    assert(leftover >= 0);
    if (leftover < kHeaderSize) {
      // Not even a header fits any more: switch to a new block.
      if (leftover > 0) {
        // Fill the rest of this block with \x00 bytes. The literal holds six
        // zero bytes, which is enough because leftover < kHeaderSize == 7.
        static_assert(kHeaderSize == 7, "");
        s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
        if (!s.ok()) {
          // Propagate the failure instead of silently continuing, and leave
          // block_offset_ untouched since the trailer was not written.
          break;
        }
      }
      block_offset_ = 0;
    }

    // Invariant: at least a header always fits in the current block.
    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);

    // Payload bytes that still fit in the current block.
    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
    const size_t fragment_length = (left < avail) ? left : avail;

    // True when this fragment carries the final bytes of the record.
    const bool end = (left == fragment_length);

    RecordType type;
    if (begin && end) {
      // The whole record is written in one go.
      type = kFullType;
    } else if (begin) {
      // First fragment: the record does not fit into this block.
      type = kFirstType;
    } else if (end) {
      // Final fragment of the record.
      type = kLastType;
    } else {
      // A fragment in between.
      type = kMiddleType;
    }

    // Write the fragment to the underlying file.
    s = EmitPhysicalRecord(type, ptr, fragment_length);
    ptr += fragment_length;
    left -= fragment_length;
    begin = false;
  } while (s.ok() && left > 0);
  return s;
}
EmitPhysicalRecord函数先构造7字节的记录头部(crc、长度、类型),然后调用WritableFile::Append()(由用户继承WritableFile后实现)依次写入头部和数据,最后调用Flush()将数据写到底层磁盘。
// Writes one physical record to dest_: a 7-byte header followed by "length"
// payload bytes starting at "ptr", then flushes dest_.
//
// "length" must fit in two bytes and the record must fit in the current
// block. block_offset_ is advanced by the record size whether or not the
// writes succeed. Returns the status of the first failing operation.
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr,
                                  size_t length) {
  assert(length <= 0xffff);
  assert(block_offset_ + kHeaderSize + length <= kBlockSize);

  // Header layout: bytes [0,3] crc, bytes [4,5] length (low byte first),
  // byte [6] type.
  char header[kHeaderSize];
  header[4] = static_cast<char>(length & 0xff);
  header[5] = static_cast<char>(length >> 8);
  header[6] = static_cast<char>(t);

  // The checksum extends the per-type crc over the payload; it is masked
  // before being stored.
  const uint32_t masked_crc =
      crc32c::Mask(crc32c::Extend(type_crc_[t], ptr, length));
  EncodeFixed32(header, masked_crc);

  // Header first, then payload, then flush; stop at the first failure.
  Status s = dest_->Append(Slice(header, kHeaderSize));
  if (s.ok()) {
    s = dest_->Append(Slice(ptr, length));
  }
  if (s.ok()) {
    s = dest_->Flush();
  }

  block_offset_ += kHeaderSize + length;
  return s;
}
参考博客:
- https://leveldb-handbook.readthedocs.io/zh/latest/rwopt.html
- https://www.jianshu.com/p/d1bb2e2ceb4c