// dragonfly/server/snapshot.cc

// Copyright 2022, Roman Gershman. All rights reserved.
// See LICENSE for licensing terms.
//
#include "server/snapshot.h"
extern "C" {
#include "redis/object.h"
}
#include <absl/strings/str_cat.h>
#include "base/logging.h"
#include "server/rdb_save.h"
#include "util/fiber_sched_algo.h"
#include "util/proactor_base.h"
namespace dfly {
using namespace std;
using namespace util;
using namespace chrono_literals;
namespace this_fiber = ::boost::this_fiber;
using boost::fibers::fiber;
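// Rough usage sketch (illustrative only; the owning shard code lives outside this file, and the
// exact call sites and variable names below are assumptions): a SliceSnapshot is created per
// shard slice, started with the shard's next version, and drained by a consumer on the other
// side of the channel, e.g.:
//
//   SliceSnapshot snapshot(&prime_table, &expire_table, &channel);  // hypothetical names
//   snapshot.Start(next_version);
//   // ... a consumer fiber pops serialized blobs from `channel` and writes them to the RDB sink
//   snapshot.Join();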
SliceSnapshot::SliceSnapshot(PrimeTable* prime, ExpireTable* et, StringChannel* dest)
: prime_table_(prime), dest_(dest) {
}
SliceSnapshot::~SliceSnapshot() {
}
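// Starts the snapshot: entries with a version strictly below `version` will be serialized.
// Allocates the in-memory string file and the RDB serializer, then launches the fiber that
// performs the traversal.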
void SliceSnapshot::Start(uint64_t version) {
DCHECK(!fb_.joinable());
VLOG(1) << "DbSaver::Start - saving entries with version less than " << version;
sfile_.reset(new io::StringFile);
rdb_serializer_.reset(new RdbSerializer(sfile_.get()));
snapshot_version_ = version;
fb_ = fiber([this] { FiberFunc(); });
}
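// Blocks until the serialization fiber finishes.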
void SliceSnapshot::Join() {
fb_.join();
}
static_assert(sizeof(PrimeTable::const_iterator) == 16);
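// The iterator is passed by value below; the assert above documents that it stays small
// (presumably so that copies remain cheap).
// Serializes a single entry (key and value) into the serializer's in-memory buffer.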
void SliceSnapshot::SerializeCb(MainIterator it) {
error_code ec;
string tmp;
auto key = it->first.GetSlice(&tmp);
// TODO: fetch expire.
if (it->second.ObjType() == OBJ_STRING) {
ec = rdb_serializer_->SaveKeyVal(key, it->second.ToString(), 0);
} else {
robj* obj = it->second.AsRObj();
ec = rdb_serializer_->SaveKeyVal(key, obj, 0);
}
CHECK(!ec); // we write to StringFile.
++serialized_;
}
// Serializes all the entries with version less than snapshot_version_.
void SliceSnapshot::FiberFunc() {
this_fiber::properties<FiberProps>().set_name(
absl::StrCat("SliceSnapshot", ProactorBase::GetIndex()));
uint64_t cursor = 0;
static_assert(PHYSICAL_LEN > PrimeTable::kPhysicalBucketNum);
uint64_t last_yield = 0;
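// Each Traverse call covers one logical bucket and returns the next cursor; a zero cursor
// means the whole table has been covered.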
do {
// Traverse a single logical bucket but do not update its versions.
// We cannot update the version of an individual entry because entries in the same bucket
// share part of the version. Therefore we save first, and then update the version in one
// atomic sweep.
uint64_t next = prime_table_->Traverse(cursor, [this](auto it) { this->SaveCb(move(it)); });
cursor = next;
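// The physical-bucket mask is only meaningful within a single Traverse step, so clear it
// before moving on to the next chunk.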
physical_mask_.reset();
// Flush if needed.
FlushSfile(false);
if (serialized_ >= last_yield + 100) {
DVLOG(2) << "Before sleep " << this_fiber::properties<FiberProps>().name();
this_fiber::yield();
last_yield = serialized_;
DVLOG(2) << "After sleep";
// Flush in case other fibers (write commands that pushed previous values) filled the file.
FlushSfile(false);
}
} while (cursor > 0);
DVLOG(1) << "after loop " << this_fiber::properties<FiberProps>().name();
FlushSfile(true);
dest_->StartClosing();
VLOG(1) << "Exit RdbProducer fiber with " << serialized_ << " serialized";
}
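// Pushes the bytes accumulated in sfile_ to the destination channel. When `force` is false,
// small buffers (< 4KB) are kept around to avoid pushing tiny blobs; when `force` is true,
// whatever remains is flushed and pushed. Returns true if anything was pushed.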
bool SliceSnapshot::FlushSfile(bool force) {
if (force) {
auto ec = rdb_serializer_->FlushMem();
CHECK(!ec);
if (sfile_->val.empty())
return false;
} else {
if (sfile_->val.size() < 4096) {
return false;
}
// Make sure we flush everything from the memory buffer in order to preserve the atomicity of
// key/value serializations.
auto ec = rdb_serializer_->FlushMem();
CHECK(!ec); // stringfile always succeeds.
}
VLOG(2) << "FlushSfile " << sfile_->val.size() << " bytes";
string tmp = std::move(sfile_->val); // important to move before pushing!
dest_->Push(std::move(tmp));
return true;
}
// The algorithm is to go over all the buckets and serialize entries that
// have version < snapshot_version_. In order to serialize each entry exactly once we update its
// version to snapshot_version_ once it has been serialized.
// Due to how bucket versions work we cannot update individual entries - doing so may affect
// their neighbours in the bucket. Instead we handle serialization at physical bucket granularity.
// To further complicate things, Table::Traverse covers a logical bucket that may consist of
// several physical buckets. The reason for this complication is that we need to guarantee
// a stable traversal during prime table mutations. PrimeTable::Traverse guarantees an atomic
// traversal of a single logical bucket, and it also guarantees 100% coverage of all items
// that existed when the traversal started and survived until it finished.
//
// It's important that the callback runs atomically, so we avoid any I/O work inside it.
// Instead, we flush the string file in the traversal loop in FiberFunc.
bool SliceSnapshot::SaveCb(MainIterator it) {
// If we have already touched this physical bucket - skip it.
// We must make sure we call TraverseBucket exactly once for each physical bucket.
// This test comes first because it's likely to be the fastest one:
// physical_mask_ is likely to be loaded in L1 and bucket_id() does not require accessing the
// prime_table.
if (physical_mask_.test(it.bucket_id())) {
return false;
}
uint64_t v = it.GetVersion();
if (v >= snapshot_version_) {
// The entry has either already been serialized or was added after snapshotting started.
DVLOG(2) << "Skipped " << it.segment_id() << ":" << it.bucket_id() << ":" << it.slot_id()
<< " at " << v;
++skipped_;
return false;
}
physical_mask_.set(it.bucket_id());
// Both traversals below execute atomically.
// Traverse the physical bucket and write its entries into the string file.
prime_table_->TraverseBucket(it, [this](auto entry_it) { this->SerializeCb(move(entry_it)); });
// Theoretically we could merge version_cb into the traversal above, but then we would need
// to give up on the DCHECK.
auto version_cb = [this](MainIterator entry_it) {
DCHECK_LE(entry_it.GetVersion(), snapshot_version_);
DVLOG(3) << "Bumping up version " << entry_it.bucket_id() << ":" << entry_it.slot_id();
entry_it.SetVersion(snapshot_version_);
};
prime_table_->TraverseBucket(it, version_cb);
return false;
}
} // namespace dfly