本文链接： https://blog.csdn.net/puliao4167/article/details/102678826

日志文件是LevelDB架构中非常重要的一个部分。LSM-Tree先将写入的数据存放在内存中的Memtable，当Memtable的大小到达临界值时，就将其变成只读的Immutable Memtable，随后由后台线程将其flush到磁盘上的sstable。如果在这个过程中系统发生错误，内存中尚未落盘的数据就会丢失，这时候就需要依靠日志文件进行恢复。因此LevelDB在将数据写入内存之前，会先将其记录在日志文件中。

日志格式

每一条日志记录由下面格式组成，每一个block包含若干条记录，block的大小默认为32KB。type有四种类型：FULL、FIRST、MIDDLE、LAST。FULL表示一条记录完整地存放在同一个块内（一个块中仍然可以包含多条记录）；FIRST、MIDDLE、LAST分别表示一条被拆分到多个块中的记录的第一个分片、中间分片和最后一个分片。
日志格式

// Type tag stored in the header of every physical log record.
enum RecordType {
  // Zero is reserved for preallocated files.
  kZeroType = 0,
  // The whole logical record fits in a single physical record.
  kFullType = 1,
  // Fragment types for a logical record that is split across multiple blocks.
  kFirstType = 2,
  kMiddleType = 3,
  kLastType = 4
};
// Largest record type value currently defined.
static const int kMaxRecordType = kLastType;
// Size of a log block in bytes (32 KiB).
static const int kBlockSize = 32768;
// Record header: checksum (4 bytes), length (2 bytes), type (1 byte).
static const int kHeaderSize = 4 + 2 + 1;

日志的读过程

LevelDB整体的读过程如下图。可以看到查询过程要从内存中查，如果没有再到磁盘上查找。步骤分成三步：

先从内存中MemTable中查找，找到返回
然后从Immutable Memtable中查找，找到返回
如果内存中没有找到，再从磁盘中sstable中查找

接下来看Reader的源码实现。Reader类定义。

// Reads logical records back from a log file written in the block/record
// format described by RecordType, kBlockSize and kHeaderSize.
class Reader {
 public:
  // Abstract interface used to report errors.
  class Reporter {
   public:
    virtual ~Reporter();
    // Called when "bytes" bytes are dropped; "status" describes the reason.
    virtual void Corruption(size_t bytes, const Status& status) = 0;
  };

  // Create a reader that will return log records from "*file".
  // "*file" must remain live while this Reader is in use.
  //
  // If "reporter" is non-null, it is notified whenever some data is
  // dropped due to a detected corruption.  "*reporter" must remain
  // live while this Reader is in use.
  //
  // If "checksum" is true, verify checksums if available.
  //
  // The Reader will start reading at the first record located at physical
  // position >= initial_offset within the file.
  Reader(SequentialFile* file, Reporter* reporter, bool checksum,
         uint64_t initial_offset);
  Reader(const Reader&) = delete;
  Reader& operator=(const Reader&) = delete;
  ~Reader();

  // Read the next record into *record. *scratch is used as temporary
  // storage. Returns true if a record was read successfully.
  bool ReadRecord(Slice* record, std::string* scratch);

  // Returns the offset of the last record returned by ReadRecord.
  // Must be called after ReadRecord.
  uint64_t LastRecordOffset();

 private:
  // Extra result codes of ReadPhysicalRecord, placed just above the valid
  // RecordType values.
  enum {
    kEof = kMaxRecordType + 1,
    // Returned whenever we find an invalid physical record.
    // Currently there are three situations in which this happens:
    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
    // * The record is a 0-length record (No drop is reported)
    // * The record is below constructor's initial_offset (No drop is reported)
    kBadRecord = kMaxRecordType + 2
  };

  // Skip ahead to the block containing initial_offset_.
  // Returns false on failure (ReadRecord then returns false as well).
  bool SkipToInitialBlock();

  // Read one physical record. Returns its type, or kEof / kBadRecord.
  unsigned int ReadPhysicalRecord(Slice* result);

  // Report dropped bytes, together with the reason, to the reporter.
  void ReportCorruption(uint64_t bytes, const char* reason);
  void ReportDrop(uint64_t bytes, const Status& reason);

  SequentialFile* const file_;      // File being read
  Reporter* const reporter_;        // Error reporter (may be null)
  bool const checksum_;             // Whether to verify checksums
  char* const backing_store_;       // Scratch storage handed to file_->Read()
  Slice buffer_;                    // Unconsumed part of the last block read
  bool eof_;  // Last Read() indicated EOF by returning < kBlockSize

  uint64_t last_record_offset_;     // Offset of the last record returned

  uint64_t end_of_buffer_offset_;   // File offset just past the end of buffer_

  uint64_t const initial_offset_;   // Offset at which to start looking for records

  // While true, ReadRecord skips kMiddleType / kLastType fragments; it is
  // cleared once a kLastType or any other record type is seen.
  bool resyncing_;
};

ReadRecord函数将一条完整的逻辑记录放到record中。它循环调用ReadPhysicalRecord，每次取得一条物理记录（分片）；如果一条记录被拆分到不同的block中，则先将各个分片依次追加到scratch中，直到读到kLastType分片时，才将整个scratch作为record返回。

// Reads the next logical record into *record, reassembling it from one or
// more physical records. *scratch holds the accumulated payload of a
// fragmented record; for an unfragmented (kFullType) record, *record is set
// directly to the fragment returned by ReadPhysicalRecord.
// Returns true when a record was produced; returns false on kEof or when
// SkipToInitialBlock() fails.
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
  if (last_record_offset_ < initial_offset_) {
    if (!SkipToInitialBlock()) {
      return false;
    }
  }
  // Reset the output arguments.
  scratch->clear();
  record->clear();
  bool in_fragmented_record = false;
  // Record offset of the logical record that we're reading
  // 0 is a dummy value to make compilers happy
  uint64_t prospective_record_offset = 0;

  Slice fragment;
  while (true) {
    const unsigned int record_type = ReadPhysicalRecord(&fragment);

    // ReadPhysicalRecord may have only had an empty trailer remaining in its
    // internal buffer. Calculate the offset of the next physical record now
    // that it has returned, properly accounting for its header size.
    uint64_t physical_record_offset =
        end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();

    // While resyncing, skip kMiddleType fragments and the first kLastType
    // fragment; any other type ends resyncing and is handled by the switch.
    if (resyncing_) {
      if (record_type == kMiddleType) {
        continue;
      } else if (record_type == kLastType) {
        resyncing_ = false;
        continue;
      } else {
        resyncing_ = false;
      }
    }

    switch (record_type) {
      case kFullType:
        if (in_fragmented_record) {
          // Handle bug in earlier versions of log::Writer where
          // it could emit an empty kFirstType record at the tail end
          // of a block followed by a kFullType or kFirstType record
          // at the beginning of the next block.
          if (!scratch->empty()) {
            ReportCorruption(scratch->size(), "partial record without end(1)");
          }
        }
        prospective_record_offset = physical_record_offset;
        scratch->clear();
        *record = fragment;
        last_record_offset_ = prospective_record_offset;
        return true;

      case kFirstType:
        if (in_fragmented_record) {
          // Handle bug in earlier versions of log::Writer where
          // it could emit an empty kFirstType record at the tail end
          // of a block followed by a kFullType or kFirstType record
          // at the beginning of the next block.
          if (!scratch->empty()) {
            ReportCorruption(scratch->size(), "partial record without end(2)");
          }
        }
        // Start a new fragmented record: remember where it began and seed
        // scratch with the first fragment.
        prospective_record_offset = physical_record_offset;
        scratch->assign(fragment.data(), fragment.size());
        in_fragmented_record = true;
        break;

      case kMiddleType:
        if (!in_fragmented_record) {
          ReportCorruption(fragment.size(),
                           "missing start of fragmented record(1)");
        } else {
          scratch->append(fragment.data(), fragment.size());
        }
        break;

      case kLastType:
        if (!in_fragmented_record) {
          ReportCorruption(fragment.size(),
                           "missing start of fragmented record(2)");
        } else {
          // Final fragment: the assembled record lives in *scratch.
          scratch->append(fragment.data(), fragment.size());
          *record = Slice(*scratch);
          last_record_offset_ = prospective_record_offset;
          return true;
        }
        break;

      case kEof:
        if (in_fragmented_record) {
          // This can be caused by the writer dying immediately after
          // writing a physical record but before completing the next; don't
          // treat it as a corruption, just ignore the entire logical record.
          scratch->clear();
        }
        return false;

      case kBadRecord:
        if (in_fragmented_record) {
          ReportCorruption(scratch->size(), "error in middle of record");
          in_fragmented_record = false;
          scratch->clear();
        }
        break;

      default: {
        // Unknown type: drop this fragment plus anything accumulated so far.
        char buf[40];
        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
        ReportCorruption(
            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
            buf);
        in_fragmented_record = false;
        scratch->clear();
        break;
      }
    }
  }
  return false;
}

ReadPhysicalRecord函数负责读取一条物理记录：当缓冲区中剩余的数据不足一个记录头时，从文件中再读取一个block到缓冲区；然后解析记录头（crc、长度、类型），校验通过后把记录的数据部分放到result中，并返回该记录的类型。

// Reads one physical record from buffer_, refilling buffer_ from file_ one
// block (kBlockSize bytes) at a time when fewer than kHeaderSize bytes remain.
// On success, *result is set to the record payload (pointing into the read
// buffer) and the type byte from the header is returned. Otherwise returns
// kEof or kBadRecord.
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
  while (true) {
    if (buffer_.size() < kHeaderSize) {
      if (!eof_) {
        // Last read was a full read, so this is a trailer to skip
        buffer_.clear();
        // Read one block from the file into buffer_.
        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
        end_of_buffer_offset_ += buffer_.size();
        if (!status.ok()) {
          buffer_.clear();
          ReportDrop(kBlockSize, status);
          eof_ = true;
          return kEof;
        } else if (buffer_.size() < kBlockSize) {
          // A short read marks the end of the file.
          eof_ = true;
        }
        continue;  // The block was read successfully; loop back to parse it.
      } else {
        // Note that if buffer_ is non-empty, we have a truncated header at the
        // end of the file, which can be caused by the writer crashing in the
        // middle of writing the header. Instead of considering this an error,
        // just report EOF.
        buffer_.clear();
        return kEof;
      }
    }

    const char* header = buffer_.data();
    // Header layout: bytes [0,3] crc, bytes [4,5] length, byte [6] type.
    // Parse the length (low byte first).
    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
    // Parse the type.
    const unsigned int type = header[6];
    const uint32_t length = a | (b << 8);
    if (kHeaderSize + length > buffer_.size()) {
      size_t drop_size = buffer_.size();
      buffer_.clear();
      if (!eof_) {
        ReportCorruption(drop_size, "bad record length");
        return kBadRecord;
      }
      // If the end of the file has been reached without reading |length| bytes
      // of payload, assume the writer died in the middle of writing the record.
      // Don't report a corruption.
      return kEof;
    }

    if (type == kZeroType && length == 0) {
      // Skip zero length record without reporting any drops since
      // such records are produced by the mmap based writing code in
      // env_posix.cc that preallocates file regions.
      buffer_.clear();
      return kBadRecord;
    }

    // Check crc
    if (checksum_) {
      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
      // The checksum covers the type byte (header[6]) and the payload.
      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
      if (actual_crc != expected_crc) {
        // Drop the rest of the buffer since "length" itself may have
        // been corrupted and if we trust it, we could find some
        // fragment of a real log record that just happens to look
        // like a valid log record.
        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "checksum mismatch");
        return kBadRecord;
      }
    }

    // Consume this record (header + payload) from the buffer.
    buffer_.remove_prefix(kHeaderSize + length);

    // Skip physical record that started before initial_offset_
    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
        initial_offset_) {
      result->clear();
      return kBadRecord;
    }
    // Return everything after the header (the payload) through *result.
    *result = Slice(header + kHeaderSize, length);
    return type;
  }
}

日志的写过程

LevelDB整体的写过程如下图。可以看到整体分成两个步骤，首先是将写操作写入log，然后才是写入内存Memtable中。
写入过程
日志写类的实现如下。首先了解一下，Slice是一个简单的结构，用来包含一个指向额外存储空间的指针和一个大小，可以和string相互转换。

// Appends records to a log file, splitting them into physical records
// (fragments) that fit inside fixed-size blocks.
class Writer {
 public:
  // Create a writer that will append data to "*dest".
  explicit Writer(WritableFile* dest);
  // Same as above, additionally taking "dest_length".
  // NOTE(review): presumably the current length of "*dest", used to derive
  // the initial block offset -- confirm against the constructor definition.
  Writer(WritableFile* dest, uint64_t dest_length);
  Writer(const Writer&) = delete;
  Writer& operator=(const Writer&) = delete;
  ~Writer();
  
  // Public entry point: append the contents of "slice" to the log as one
  // logical record.
  Status AddRecord(const Slice& slice);

 private:
  // Write a single physical record (header + payload) to dest_.
  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
  // Destination file. WritableFile is an abstract class (per the original
  // author's note); a concrete implementation is supplied by the caller.
  WritableFile* dest_;  
  // Current offset within the block being written.
  int block_offset_;  
  // Per-type CRC values, indexed by RecordType; used as the starting CRC
  // when EmitPhysicalRecord checksums a record.
  uint32_t type_crc_[kMaxRecordType + 1];
};

AddRecord函数首先判断不同记录的类型，然后调用EmitPhysicalRecord函数将其写入物理磁盘

// Appends "slice" to the log as one logical record, splitting it into as many
// physical fragments as needed so that no fragment crosses a block boundary.
// Returns the status of the last EmitPhysicalRecord call; stops at the first
// fragment that fails. An empty slice still produces one zero-length
// kFullType record.
Status Writer::AddRecord(const Slice& slice) {
  const char* cursor = slice.data();
  size_t remaining = slice.size();

  Status result;
  bool is_first_fragment = true;
  for (;;) {
    // Bytes still unused in the current block.
    const int room_in_block = kBlockSize - block_offset_;
    assert(room_in_block >= 0);
    if (room_in_block < kHeaderSize) {
      // Not even a header fits: zero-fill the tail and start a new block.
      if (room_in_block > 0) {
        // The literal below supplies at most kHeaderSize - 1 == 6 bytes.
        static_assert(kHeaderSize == 7, "");
        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", room_in_block));
      }
      block_offset_ = 0;
    }

    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
    // Payload bytes that fit in this block after the header.
    const size_t payload_capacity = kBlockSize - block_offset_ - kHeaderSize;
    const size_t chunk =
        (payload_capacity <= remaining) ? payload_capacity : remaining;

    // This fragment is the last one iff it consumes everything that is left.
    const bool is_last_fragment = (remaining == chunk);

    // first+last -> FULL, first only -> FIRST, last only -> LAST,
    // neither -> MIDDLE.
    RecordType type = kMiddleType;
    if (is_first_fragment) {
      type = is_last_fragment ? kFullType : kFirstType;
    } else if (is_last_fragment) {
      type = kLastType;
    }

    // Write the fragment to the underlying file.
    result = EmitPhysicalRecord(type, cursor, chunk);
    cursor += chunk;
    remaining -= chunk;
    is_first_fragment = false;

    if (!result.ok() || remaining == 0) {
      break;
    }
  }
  return result;
}

EmitPhysicalRecord函数先组装7字节的记录头（crc、长度、类型），然后调用WritableFile::Append()（由具体的WritableFile实现提供）依次写入记录头和数据，最后调用Flush()将数据刷新到底层文件。

// Writes one physical record -- a kHeaderSize-byte header followed by
// "length" payload bytes starting at "ptr" -- to dest_ and flushes it.
// "length" must fit in two bytes and the record must fit in the current
// block. block_offset_ is advanced by the record size whether or not the
// writes succeed. Returns the first non-OK status, or OK.
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr,
                                  size_t length) {
  assert(length <= 0xffff);
  assert(block_offset_ + kHeaderSize + length <= kBlockSize);

  // Checksum: start from the CRC stored for this record type, extend it over
  // the payload, then mask it for storage.
  const uint32_t masked_crc =
      crc32c::Mask(crc32c::Extend(type_crc_[t], ptr, length));

  // Header layout: bytes [0,3] crc, bytes [4,5] length (low byte first),
  // byte [6] type.
  char header[kHeaderSize];
  EncodeFixed32(header, masked_crc);
  header[4] = static_cast<char>(length & 0xff);
  header[5] = static_cast<char>(length >> 8);
  header[6] = static_cast<char>(t);

  // Header, then payload, then flush; each step runs only if the previous
  // one succeeded.
  Status status = dest_->Append(Slice(header, kHeaderSize));
  if (status.ok()) {
    status = dest_->Append(Slice(ptr, length));
  }
  if (status.ok()) {
    status = dest_->Flush();
  }

  block_offset_ += kHeaderSize + length;
  return status;
}

参考博客：

https://leveldb-handbook.readthedocs.io/zh/latest/rwopt.html
https://www.jianshu.com/p/d1bb2e2ceb4c

LevelDB源码解读——Log日志文件

日志格式

日志的读过程

日志的写过程

猜你喜欢