// Copyright 2022, Roman Gershman. All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/snapshot.h"

extern "C" {
#include "redis/object.h"
}

#include <absl/strings/str_cat.h>

#include "base/logging.h"
#include "server/rdb_save.h"
#include "util/fiber_sched_algo.h"
#include "util/proactor_base.h"

namespace dfly {

using namespace std;
using namespace util;
using namespace chrono_literals;
namespace this_fiber = ::boost::this_fiber;
using boost::fibers::fiber;

SliceSnapshot::SliceSnapshot(PrimeTable* prime, ExpireTable* et, StringChannel* dest)
    : prime_table_(prime), dest_(dest) {
}

SliceSnapshot::~SliceSnapshot() {
}

void SliceSnapshot::Start(uint64_t version) {
  DCHECK(!fb_.joinable());

  VLOG(1) << "DbSaver::Start - saving entries with version less than " << version;

  sfile_.reset(new io::StringFile);
  rdb_serializer_.reset(new RdbSerializer(sfile_.get()));
  snapshot_version_ = version;
  fb_ = fiber([this] { FiberFunc(); });
}

void SliceSnapshot::Join() {
  fb_.join();
}

static_assert(sizeof(PrimeTable::const_iterator) == 16);

void SliceSnapshot::SerializeCb(MainIterator it) {
  error_code ec;

  string tmp;
  auto key = it->first.GetSlice(&tmp);

  // TODO: fetch expire.
  if (it->second.ObjType() == OBJ_STRING) {
    ec = rdb_serializer_->SaveKeyVal(key, it->second.ToString(), 0);
  } else {
    robj* obj = it->second.AsRObj();
    ec = rdb_serializer_->SaveKeyVal(key, obj, 0);
  }
  CHECK(!ec);  // We write to a StringFile, so this never fails.
  ++serialized_;
}

// Serializes all the entries with version less than snapshot_version_.
void SliceSnapshot::FiberFunc() {
  this_fiber::properties<FiberProps>().set_name(
      absl::StrCat("SliceSnapshot", ProactorBase::GetIndex()));

  uint64_t cursor = 0;
  static_assert(PHYSICAL_LEN > PrimeTable::kPhysicalBucketNum);

  uint64_t last_yield = 0;
  do {
    // Traverse a single logical bucket but do not update its versions.
    // We cannot update a version right away because entries in the same bucket share part of
    // the version. Therefore we save first, and then update the version in one atomic swipe.
    uint64_t next = prime_table_->Traverse(cursor, [this](auto it) { this->SaveCb(move(it)); });

    cursor = next;
    physical_mask_.reset();

    // Flush if needed.
    FlushSfile(false);

    if (serialized_ >= last_yield + 100) {
      DVLOG(2) << "Before sleep " << this_fiber::properties<FiberProps>().name();
      this_fiber::yield();
      last_yield = serialized_;
      DVLOG(2) << "After sleep";

      // Flush in case other fibers (write commands that pushed previous values) filled the file.
      FlushSfile(false);
    }
  } while (cursor > 0);

  DVLOG(1) << "after loop " << this_fiber::properties<FiberProps>().name();
  FlushSfile(true);

  dest_->StartClosing();

  VLOG(1) << "Exit RdbProducer fiber with " << serialized_ << " serialized";
}

bool SliceSnapshot::FlushSfile(bool force) {
  if (force) {
    auto ec = rdb_serializer_->FlushMem();
    CHECK(!ec);
    if (sfile_->val.empty())
      return false;
  } else {
    if (sfile_->val.size() < 4096) {
      return false;
    }

    // Make sure we flush everything from the memory buffer in order to preserve the atomicity of
    // key/value serializations.
    auto ec = rdb_serializer_->FlushMem();
    CHECK(!ec);  // Writing to a StringFile always succeeds.
  }

  VLOG(2) << "FlushSfile " << sfile_->val.size() << " bytes";

  string tmp = std::move(sfile_->val);  // Important to move before pushing!
  dest_->Push(std::move(tmp));

  return true;
}
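// The block below is a purely illustrative, self-contained toy of the version-stamping
// scheme described in the comment and implemented in SaveCb() further down. The names
// (ToyBucket, SnapshotToyPass) are hypothetical and the block is excluded from the build;
// it only demonstrates the invariant that a bucket is serialized exactly once, because
// stamping it with the snapshot version prevents a second serialization on later passes.
#if 0
#include <cstdint>
#include <string>
#include <vector>

struct ToyBucket {
  uint64_t version = 0;            // Entries in a bucket share this version.
  std::vector<std::string> items;  // Toy stand-in for the bucket's entries.
};

// Serializes every bucket whose version predates the snapshot, then stamps it with
// snapshot_version so later visits skip it. Buckets already at or above snapshot_version
// were either serialized before or were created/mutated after the snapshot started.
void SnapshotToyPass(std::vector<ToyBucket>& table, uint64_t snapshot_version,
                     std::vector<std::string>* out) {
  for (ToyBucket& bucket : table) {
    if (bucket.version >= snapshot_version)
      continue;  // Already serialized, or newer than the snapshot.
    for (const std::string& item : bucket.items)
      out->push_back(item);             // "Serialize" the whole bucket atomically.
    bucket.version = snapshot_version;  // Mark it so it is never serialized twice.
  }
}
#endif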
// The algorithm is to go over all the buckets and serialize entries that have
// version < snapshot_version_. In order to serialize each entry exactly once, we update its
// version to snapshot_version_ once it has been serialized.
// Due to how bucket versions work we can not update individual entries - they may affect their
// neighbours in the bucket. Instead we handle serialization at physical bucket granularity.
// To further complicate things, Table::Traverse covers a logical bucket that may comprise
// several physical buckets. The reason for this complication is that we need to guarantee
// a stable traversal during prime table mutations. PrimeTable::Traverse guarantees an atomic
// traversal of a single logical bucket, and it also guarantees 100% coverage of all items
// that existed when the traversal started and survived until it finished.
//
// It's important that the callback runs atomically, so we avoid any I/O work inside it.
// Instead, we flush our string file from the traversal loop in FiberFunc.
bool SliceSnapshot::SaveCb(MainIterator it) {
  // If we have already covered this physical bucket, skip it.
  // We must make sure we call TraverseBucket exactly once for each physical bucket.
  // This test comes first because it is likely the fastest one: physical_mask_ is likely to be
  // loaded in L1 and bucket_id() does not require accessing the prime table.
  if (physical_mask_.test(it.bucket_id())) {
    return false;
  }

  uint64_t v = it.GetVersion();
  if (v >= snapshot_version_) {
    // The entry has either been serialized already or was added after snapshotting started.
    DVLOG(2) << "Skipped " << it.segment_id() << ":" << it.bucket_id() << ":" << it.slot_id()
             << " at " << v;
    ++skipped_;
    return false;
  }

  physical_mask_.set(it.bucket_id());

  // Both traversals below execute atomically.
  // Traverse the physical bucket and write its entries into the string file.
  prime_table_->TraverseBucket(it, [this](auto entry_it) { this->SerializeCb(move(entry_it)); });

  // Theoretically we could merge version_cb into the traversal above, but then we would need
  // to give up on the DCHECK.
  auto version_cb = [this](MainIterator entry_it) {
    DCHECK_LE(entry_it.GetVersion(), snapshot_version_);
    DVLOG(3) << "Bumping up version " << entry_it.bucket_id() << ":" << entry_it.slot_id();

    entry_it.SetVersion(snapshot_version_);
  };

  prime_table_->TraverseBucket(it, version_cb);

  return false;
}

}  // namespace dfly