chore(server): rdb save can now save into tcp socket directly. (#317)

In more detail, RdbSaver uses AlignedBuffer that writes into io::Sink in chunks of 4KB.
It's great for the direct file I/O, but bad for sockets that receive blocks of 4KB with garbage
at the end. I improved the code around this and actually simplified the logic, so now AlignedBuffer
is just another Sink that is passed into serializer when writing into files. When sending to
sockets a socket sink is passed instead.

Also many other unrelated changes grouped into this pretty big cr.
1. dashtable readability improvements.
2. Move methods from facade::ConnectionContext - into facade::Service,
   make ConnectionContext a dumb object.
3. Optionally allow journal to be memory only (not backed up by a disk)
   by using a ring buffer to store last k entries in each journal slice. Also renamed
   journal_shard into journal_slice because journal has presence in each DF thread and not
   only in its shards.
4. Introduce journal::Entry that will consolidate any store change that happens in the thread.
5. Introduce GetRandomHex utility function.
6. Introduce two hooks: ServerFamily::OnClose that is called when a connection is closed,
   and ServerFamily::BreakOnShutdown that is called when the process exits and any background
   fibers need to break early.
7. Pull some noisy info logs out of rdb_load class.
8. Snapshot class now has the ability to subscribe to journal changes, thus it can include concurrent changes into the snapshot.
   Currently only journal::Op::VAL is supported (it's part of RDB format anyway).

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2022-09-20 01:09:03 -07:00 committed by GitHub
parent 1733af4cf6
commit 0a1b5eb297
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
33 changed files with 573 additions and 318 deletions

View File

@ -716,12 +716,12 @@ template <typename U, typename V, typename EvictionPolicy>
auto DashTable<_Key, _Value, Policy>::InsertInternal(U&& key, V&& value, EvictionPolicy& ev)
-> std::pair<iterator, bool> {
uint64_t key_hash = DoHash(key);
uint32_t seg_id = SegmentId(key_hash);
uint32_t target_seg_id = SegmentId(key_hash);
while (true) {
// Keep last global_depth_ msb bits of the hash.
assert(seg_id < segment_.size());
SegmentType* target = segment_[seg_id];
assert(target_seg_id < segment_.size());
SegmentType* target = segment_[target_seg_id];
// Load heap allocated segment data - to avoid TLB miss when accessing the bucket.
__builtin_prefetch(target, 0, 1);
@ -731,12 +731,12 @@ auto DashTable<_Key, _Value, Policy>::InsertInternal(U&& key, V&& value, Evictio
if (res) { // success
++size_;
return std::make_pair(iterator{this, seg_id, it.index, it.slot}, true);
return std::make_pair(iterator{this, target_seg_id, it.index, it.slot}, true);
}
/*duplicate insert, insertion failure*/
if (it.found()) {
return std::make_pair(iterator{this, seg_id, it.index, it.slot}, false);
return std::make_pair(iterator{this, target_seg_id, it.index, it.slot}, false);
}
// At this point we must split the segment.
@ -749,12 +749,12 @@ auto DashTable<_Key, _Value, Policy>::InsertInternal(U&& key, V&& value, Evictio
hotspot.key_hash = key_hash;
for (unsigned j = 0; j < HotspotBuckets::kRegularBuckets; ++j) {
hotspot.probes.by_type.regular_buckets[j] = bucket_iterator{this, seg_id, bid[j]};
hotspot.probes.by_type.regular_buckets[j] = bucket_iterator{this, target_seg_id, bid[j]};
}
for (unsigned i = 0; i < Policy::kStashBucketNum; ++i) {
hotspot.probes.by_type.stash_buckets[i] =
bucket_iterator{this, seg_id, uint8_t(kLogicalBucketNum + i), 0};
bucket_iterator{this, target_seg_id, uint8_t(kLogicalBucketNum + i), 0};
}
hotspot.num_buckets = HotspotBuckets::kNumBuckets;
@ -770,7 +770,7 @@ auto DashTable<_Key, _Value, Policy>::InsertInternal(U&& key, V&& value, Evictio
/*unsigned start = (bid[HotspotBuckets::kNumBuckets - 1] + 1) % kLogicalBucketNum;
for (unsigned i = 0; i < HotspotBuckets::kNumBuckets; ++i) {
uint8_t id = (start + i) % kLogicalBucketNum;
buckets.probes.arr[i] = bucket_iterator{this, seg_id, id};
buckets.probes.arr[i] = bucket_iterator{this, target_seg_id, id};
}
garbage_collected_ += ev.GarbageCollect(buckets, this);
*/
@ -804,12 +804,12 @@ auto DashTable<_Key, _Value, Policy>::InsertInternal(U&& key, V&& value, Evictio
if (target->local_depth() == global_depth_) {
IncreaseDepth(global_depth_ + 1);
seg_id = SegmentId(key_hash);
assert(seg_id < segment_.size() && segment_[seg_id] == target);
target_seg_id = SegmentId(key_hash);
assert(target_seg_id < segment_.size() && segment_[target_seg_id] == target);
}
ev.RecordSplit(target);
Split(seg_id);
Split(target_seg_id);
}
return std::make_pair(iterator{}, false);

View File

@ -1220,6 +1220,8 @@ void Segment<Key, Value, Policy>::Split(HFunc&& hfn, Segment* dest_right) {
auto it = dest_right->InsertUniq(std::forward<Key_t>(Key(bid, slot)),
std::forward<Value_t>(Value(bid, slot)), hash);
(void)it;
assert(it.index != kNanBid);
if constexpr (USE_VERSION) {
// Update the version in the destination bucket.
uint64_t ver = stash.GetVersion();

View File

@ -17,8 +17,8 @@ class ConnectionContext {
public:
ConnectionContext(::io::Sink* stream, Connection* owner);
// We won't have any virtual methods, probably. However, since we allocate derived class,
// we need to declare a virtual d-tor so we could delete them inside Connection.
// We won't have any virtual methods, probably. However, since we allocate a derived class,
// we need to declare a virtual d-tor, so we could properly delete it from Connection code.
virtual ~ConnectionContext() {}
Connection* owner() {
@ -51,10 +51,6 @@ class ConnectionContext {
bool authenticated: 1;
bool force_dispatch: 1; // whether we should route all requests to the dispatch fiber.
virtual void OnClose() {}
virtual std::string GetContextInfo() const { return std::string{}; }
private:
Connection* owner_;
Protocol protocol_ = Protocol::REDIS;

View File

@ -300,7 +300,7 @@ string Connection::GetClientInfo() const {
absl::StrAppend(&res, " age=", now - creation_time_, " idle=", now - last_interaction_);
absl::StrAppend(&res, " phase=", phase_, " ");
if (cc_) {
absl::StrAppend(&res, cc_->GetContextInfo());
absl::StrAppend(&res, service_->GetContextInfo(cc_.get()));
}
return res;
@ -374,7 +374,7 @@ void Connection::ConnectionFlow(FiberSocketBase* peer) {
VLOG(1) << "Before dispatch_fb.join()";
dispatch_fb.join();
VLOG(1) << "After dispatch_fb.join()";
cc_->OnClose();
service_->OnClose(cc_.get());
stats->read_buf_capacity -= io_buf_.Capacity();

View File

@ -32,6 +32,13 @@ class ServiceInterface {
virtual void ConfigureHttpHandlers(util::HttpListenerBase* base) {
}
// Hook that receives the connection context when a connection is being closed
// (Connection::ConnectionFlow calls it after joining the dispatch fiber).
// The default implementation does nothing.
virtual void OnClose(ConnectionContext* cntx) {
}
// Returns a textual description of the given connection context;
// Connection::GetClientInfo appends it to the client info string.
// The default implementation returns an empty string.
virtual std::string GetContextInfo(ConnectionContext* cntx) {
return {};
}
};
} // namespace facade

View File

@ -27,7 +27,7 @@
#define MAXMEMORY_NO_EVICTION (7<<8)
#define CONFIG_RUN_ID_SIZE 40
#define CONFIG_RUN_ID_SIZE 40U
#define EVPOOL_CACHED_SDS_SIZE 255
#define EVPOOL_SIZE 16

View File

@ -2,7 +2,7 @@ add_executable(dragonfly dfly_main.cc)
cxx_link(dragonfly base dragonfly_lib)
add_library(dfly_transaction db_slice.cc engine_shard_set.cc blocking_controller.cc common.cc
io_mgr.cc journal/journal.cc journal/journal_shard.cc table.cc
io_mgr.cc journal/journal.cc journal/journal_slice.cc table.cc
tiered_storage.cc transaction.cc)
cxx_link(dfly_transaction uring_fiber_lib dfly_core strings_lib)

View File

@ -5,6 +5,7 @@
#pragma once
#include <absl/strings/ascii.h>
#include <absl/strings/str_cat.h>
#include <absl/types/span.h>
#include <string_view>
@ -131,4 +132,28 @@ extern unsigned kernel_version;
const char* GlobalStateName(GlobalState gs);
// Builds a string of exactly `len` hexadecimal characters out of the 64-bit values
// produced by `gen`. Every call to gen() contributes up to 16 characters (2 chars per byte);
// the last value is truncated when `len` is not a multiple of 16.
// `gen()` must return uint64_t (enforced at compile time).
template <typename RandGen> std::string GetRandomHex(RandGen& gen, size_t len) {
  static_assert(std::is_same<uint64_t, decltype(gen())>::value);

  constexpr size_t kCharsPerWord = 16;  // length of a zero-padded hex uint64_t.
  std::string res(len, '\0');

  for (size_t pos = 0; pos < len;) {
    absl::AlphaNum an(absl::Hex(gen(), absl::kZeroPad16));
    const size_t left = len - pos;
    const size_t count = left < kCharsPerWord ? left : kCharsPerWord;
    for (size_t j = 0; j < count; ++j) {
      res[pos++] = an.Piece()[j];
    }
  }

  return res;
}
} // namespace dfly

View File

@ -211,45 +211,6 @@ void ConnectionContext::SendSubscriptionChangedResponse(string_view action,
(*this)->SendLong(count);
}
void ConnectionContext::OnClose() {
if (!conn_state.exec_info.watched_keys.empty()) {
shard_set->RunBriefInParallel([this](EngineShard* shard) {
return shard->db_slice().UnregisterConnectionWatches(&conn_state.exec_info);
});
}
if (!conn_state.subscribe_info)
return;
if (!conn_state.subscribe_info->channels.empty()) {
auto token = conn_state.subscribe_info->borrow_token;
UnsubscribeAll(false);
// Check that all borrowers finished processing
token.Wait();
}
if (conn_state.subscribe_info) {
DCHECK(!conn_state.subscribe_info->patterns.empty());
auto token = conn_state.subscribe_info->borrow_token;
PUnsubscribeAll(false);
// Check that all borrowers finished processing
token.Wait();
DCHECK(!conn_state.subscribe_info);
}
}
string ConnectionContext::GetContextInfo() const {
char buf[16] = {0};
unsigned index = 0;
if (async_dispatch)
buf[index++] = 'a';
if (conn_closing)
buf[index++] = 't';
return index ? absl::StrCat("flags:", buf) : string();
}
void ConnectionState::ExecInfo::Clear() {
state = EXEC_INACTIVE;
body.clear();

View File

@ -83,8 +83,11 @@ struct ConnectionState {
// For set op - it's the flag value we are storing along with the value.
// For get op - we use it as a mask of MCGetMask values.
uint32_t memcache_flag = 0;
// If it's a replication client - then it holds positive sync session id.
uint32_t sync_session_id = 0;
// If this server is master, and this connection is from a secondary replica,
// then it holds positive sync session id.
uint32_t repl_session_id = 0;
uint32_t repl_threadid = kuint32max;
ExecInfo exec_info;
std::optional<ScriptInfo> script_info;
@ -97,8 +100,6 @@ class ConnectionContext : public facade::ConnectionContext {
: facade::ConnectionContext(stream, owner) {
}
void OnClose() override;
struct DebugInfo {
uint32_t shards_count = 0;
TxClock clock = 0;
@ -123,8 +124,6 @@ class ConnectionContext : public facade::ConnectionContext {
bool is_replicating = false;
std::string GetContextInfo() const override;
private:
void SendSubscriptionChangedResponse(std::string_view action,
std::optional<std::string_view> topic,

View File

@ -67,6 +67,7 @@ class PrimeEvictionPolicy {
can_evict_(can_evict) {
}
// A hook function that is called every time a segment is full and requires splitting.
void RecordSplit(PrimeTable::Segment_t* segment) {
mem_budget_ -= PrimeTable::kSegBytes;
DVLOG(1) << "split: " << segment->SlowSize() << "/" << segment->capacity();

View File

@ -113,6 +113,23 @@ void DflyCmd::Run(CmdArgList args, ConnectionContext* cntx) {
rb->SendError(kSyntaxErr);
}
// Connection-close hook for DFLY replication connections.
// Looks up the sync session that belongs to the closing connection (if it has one)
// under mu_. The handling of a found session is still a placeholder: it is only logged.
void DflyCmd::OnClose(ConnectionContext* cntx) {
  // Never assigned in this function yet, so the join below is currently not taken.
  boost::fibers::fiber repl_fb;

  const auto& state = cntx->conn_state;
  const bool has_repl_session = state.repl_session_id > 0 && state.repl_threadid != kuint32max;

  if (has_repl_session) {
    unique_lock lk(mu_);

    auto it = sync_info_.find(state.repl_session_id);
    if (it != sync_info_.end()) {
      VLOG(1) << "Found tbd: " << state.repl_session_id;
    }
  }

  if (repl_fb.joinable()) {
    repl_fb.join();
  }
}
void DflyCmd::HandleJournal(CmdArgList args, ConnectionContext* cntx) {
DCHECK_GE(args.size(), 3u);
ToUpper(&args[2]);
@ -127,7 +144,26 @@ void DflyCmd::HandleJournal(CmdArgList args, ConnectionContext* cntx) {
journal::Journal* journal = ServerState::tlocal()->journal();
if (!journal) {
string dir = absl::GetFlag(FLAGS_dir);
sf_->journal()->StartLogging(dir);
atomic_uint32_t created{0};
auto* pool = shard_set->pool();
auto open_cb = [&](auto* pb) {
auto ec = sf_->journal()->OpenInThread(true, dir);
if (ec) {
LOG(ERROR) << "Could not create journal " << ec;
} else {
created.fetch_add(1, memory_order_relaxed);
}
};
pool->AwaitFiberOnAll(open_cb);
if (created.load(memory_order_acquire) != pool->size()) {
LOG(FATAL) << "TBD / revert";
}
// We can not use transaction distribution mechanism because we must open journal for all
// threads and not only for shards.
trans->Schedule();
auto barrier_cb = [](Transaction* t, EngineShard* shard) { return OpStatus::OK; };
trans->Execute(barrier_cb, true);
@ -165,4 +201,8 @@ uint32_t DflyCmd::AllocateSyncSession() {
return it->first;
}
// Shutdown hook. At this point it only emits a verbose log line.
void DflyCmd::BreakOnShutdown() {
VLOG(1) << "BreakOnShutdown";
}
} // namespace dfly

View File

@ -29,6 +29,11 @@ class DflyCmd {
uint32_t AllocateSyncSession();
void OnClose(ConnectionContext* cntx);
// stops all background processes so we could exit in orderly manner.
void BreakOnShutdown();
private:
void HandleJournal(CmdArgList args, ConnectionContext* cntx);

View File

@ -8,7 +8,7 @@
#include "base/logging.h"
#include "server/engine_shard_set.h"
#include "server/journal/journal_shard.h"
#include "server/journal/journal_slice.h"
#include "server/server_state.h"
namespace dfly {
@ -21,43 +21,33 @@ namespace fibers = boost::fibers;
namespace {
thread_local JournalShard journal_shard;
// Present in all threads (not only in shard threads).
thread_local JournalSlice journal_slice;
} // namespace
Journal::Journal() {
}
error_code Journal::StartLogging(std::string_view dir) {
if (journal_shard.IsOpen()) {
return error_code{};
error_code Journal::OpenInThread(bool persistent, string_view dir) {
journal_slice.Init(unsigned(ProactorBase::GetIndex()));
error_code ec;
if (persistent) {
ec = journal_slice.Open(dir);
if (ec) {
return ec;
}
}
auto* pool = shard_set->pool();
atomic_uint32_t created{0};
lock_guard lk(state_mu_);
auto open_cb = [&](auto* pb) {
auto ec = journal_shard.Open(dir, unsigned(ProactorBase::GetIndex()));
if (ec) {
LOG(FATAL) << "Could not create journal " << ec; // TODO
} else {
created.fetch_add(1, memory_order_relaxed);
ServerState::tlocal()->set_journal(this);
EngineShard* shard = EngineShard::tlocal();
if (shard) {
shard->set_journal(this);
}
}
};
pool->AwaitFiberOnAll(open_cb);
if (created.load(memory_order_acquire) != pool->size()) {
LOG(FATAL) << "TBD / revert";
}
return error_code{};
return ec;
}
error_code Journal::Close() {
@ -76,7 +66,7 @@ error_code Journal::Close() {
shard->set_journal(nullptr);
}
auto ec = journal_shard.Close();
auto ec = journal_slice.Close();
if (ec) {
lock_guard lk2(ec_mu);
@ -89,21 +79,30 @@ error_code Journal::Close() {
return res;
}
// Registers `cb` with the journal slice of the calling thread (journal_slice is
// thread_local) and returns the id that must later be passed to Unregister().
// `cb` is taken by value, so it is moved into the slice instead of being copied again.
uint32_t Journal::RegisterOnChange(ChangeCallback cb) {
  return journal_slice.RegisterOnChange(std::move(cb));
}
// Removes the change callback registered under `id` from the journal slice of the
// calling thread (journal_slice is thread_local).
void Journal::Unregister(uint32_t id) {
journal_slice.Unregister(id);
}
bool Journal::SchedStartTx(TxId txid, unsigned num_keys, unsigned num_shards) {
if (!journal_shard.IsOpen() || lameduck_.load(memory_order_relaxed))
if (!journal_slice.IsOpen() || lameduck_.load(memory_order_relaxed))
return false;
journal_shard.AddLogRecord(txid, unsigned(Op::SCHED));
// TODO: to complete the metadata.
journal_slice.AddLogRecord(Entry::Sched(txid));
return true;
}
LSN Journal::GetLsn() const {
return journal_shard.cur_lsn();
return journal_slice.cur_lsn();
}
bool Journal::EnterLameDuck() {
if (!journal_shard.IsOpen()) {
if (!journal_slice.IsOpen()) {
return false;
}
@ -112,15 +111,17 @@ bool Journal::EnterLameDuck() {
return res;
}
// Appends `entry` to the journal slice of the calling thread (journal_slice is thread_local).
void Journal::RecordEntry(const Entry& entry) {
journal_slice.AddLogRecord(entry);
}
/*
void Journal::OpArgs(TxId txid, Op opcode, Span keys) {
DCHECK(journal_shard.IsOpen());
DCHECK(journal_slice.IsOpen());
journal_shard.AddLogRecord(txid, unsigned(opcode));
}
void Journal::RecordEntry(TxId txid, const PrimeKey& key, const PrimeValue& pval) {
journal_shard.AddLogRecord(txid, unsigned(Op::VAL));
journal_slice.AddLogRecord(txid, opcode);
}
*/
} // namespace journal
} // namespace dfly

View File

@ -4,8 +4,7 @@
#pragma once
#include "server/common.h"
#include "server/table.h"
#include "server/journal/types.h"
#include "util/proactor_pool.h"
namespace dfly {
@ -14,17 +13,6 @@ class Transaction;
namespace journal {
enum class Op : uint8_t {
NOOP = 0,
LOCK = 1,
UNLOCK = 2,
LOCK_SHARD = 3,
UNLOCK_SHARD = 4,
SCHED = 5,
VAL = 10,
DEL,
MSET,
};
class Journal {
public:
@ -32,8 +20,6 @@ class Journal {
Journal();
std::error_code StartLogging(std::string_view dir);
// Returns true if journal has been active and changed its state to lameduck mode
// and false otherwise.
bool EnterLameDuck(); // still logs ongoing transactions but refuses to start new ones.
@ -41,10 +27,20 @@ class Journal {
// Requires: journal is in lameduck mode.
std::error_code Close();
// Opens journal inside a Dragonfly thread. Must be called in each thread.
std::error_code OpenInThread(bool persistent, std::string_view dir);
//******* The following functions must be called in the context of the owning shard *********//
uint32_t RegisterOnChange(ChangeCallback cb);
void Unregister(uint32_t id);
// Returns true if transaction was scheduled, false if journal is inactive
// or in lameduck mode and does not log new transactions.
bool SchedStartTx(TxId txid, unsigned num_keys, unsigned num_shards);
/*
void AddCmd(TxId txid, Op opcode, Span args) {
OpArgs(txid, opcode, args);
}
@ -56,13 +52,12 @@ class Journal {
void Unlock(TxId txid, Span keys) {
OpArgs(txid, Op::UNLOCK, keys);
}
*/
LSN GetLsn() const;
void RecordEntry(TxId txid, const PrimeKey& key, const PrimeValue& pval);
void RecordEntry(const Entry& entry);
private:
void OpArgs(TxId id, Op opcode, Span keys);
mutable boost::fibers::mutex state_mu_;

View File

@ -2,12 +2,11 @@
// See LICENSE for licensing terms.
//
#include "server/journal/journal_shard.h"
#include <fcntl.h>
#include "server/journal/journal_slice.h"
#include <absl/container/inlined_vector.h>
#include <absl/strings/str_cat.h>
#include <fcntl.h>
#include <filesystem>
@ -35,17 +34,30 @@ string ShardName(std::string_view base, unsigned index) {
CHECK(!__ec$) << "Error: " << __ec$ << " " << __ec$.message() << " for " << #x; \
} while (false)
struct JournalSlice::RingItem {
LSN lsn;
TxId txid;
Op opcode;
};
JournalShard::JournalShard() {
JournalSlice::JournalSlice() {
}
JournalShard::~JournalShard() {
JournalSlice::~JournalSlice() {
CHECK(!shard_file_);
}
std::error_code JournalShard::Open(const std::string_view dir, unsigned index) {
// Prepares the in-memory part of the slice: remembers the slice index and creates
// the ring buffer. Calling this function multiple times is allowed; only the first
// call has any effect.
void JournalSlice::Init(unsigned index) {
  if (!ring_buffer_) {
    slice_index_ = index;
    ring_buffer_.emplace(128);  // TODO: to make it configurable
  }
}
std::error_code JournalSlice::Open(std::string_view dir) {
CHECK(!shard_file_);
DCHECK_NE(slice_index_, UINT32_MAX);
fs::path dir_path;
@ -65,7 +77,8 @@ std::error_code JournalShard::Open(const std::string_view dir, unsigned index) {
}
// LOG(INFO) << int(dir_status.type());
}
dir_path.append(ShardName("journal", index));
dir_path.append(ShardName("journal", slice_index_));
shard_path_ = dir_path;
// For file integrity guidelines see:
@ -81,15 +94,14 @@ std::error_code JournalShard::Open(const std::string_view dir, unsigned index) {
DVLOG(1) << "Opened journal " << shard_path_;
shard_file_ = std::move(res).value();
shard_index_ = index;
file_offset_ = 0;
status_ec_.clear();
return error_code{};
}
error_code JournalShard::Close() {
VLOG(1) << "JournalShard::Close";
error_code JournalSlice::Close() {
VLOG(1) << "JournalSlice::Close";
CHECK(shard_file_);
lameduck_ = true;
@ -103,13 +115,44 @@ error_code JournalShard::Close() {
return ec;
}
void JournalShard::AddLogRecord(TxId txid, unsigned opcode) {
string line = absl::StrCat(lsn_, " ", txid, " ", opcode, "\n");
error_code ec = shard_file_->Write(io::Buffer(line), file_offset_, 0);
CHECK_EC(ec);
file_offset_ += line.size();
// Records `entry` in this slice under the current LSN:
//  1. notifies every registered change callback,
//  2. stores {lsn, txid, opcode} in the ring buffer,
//  3. if a journal file is open, appends a text line "<lsn> <txid> <opcode>\n" to it.
// The LSN is advanced afterwards. Requires Init() to have been called.
void JournalSlice::AddLogRecord(const Entry& entry) {
  DCHECK(ring_buffer_);

  // Subscribers observe the entry before it is stored.
  for (const auto& id_and_cb : change_cb_arr_) {
    id_and_cb.second(entry);
  }

  RingItem item{lsn_, entry.txid, entry.opcode};
  VLOG(1) << "Writing item " << item.lsn;
  ring_buffer_->EmplaceOrOverride(move(item));

  // The file is absent when the journal is memory only.
  if (shard_file_) {
    string line = absl::StrCat(lsn_, " ", entry.txid, " ", entry.opcode, "\n");
    error_code ec = shard_file_->Write(io::Buffer(line), file_offset_, 0);
    CHECK_EC(ec);
    file_offset_ += line.size();
  }

  ++lsn_;
}
// Adds `cb` to the list of callbacks invoked by AddLogRecord and returns the id
// assigned to it. Ids are taken from a per-slice counter that is advanced on every
// registration.
uint32_t JournalSlice::RegisterOnChange(ChangeCallback cb) {
  const uint32_t assigned_id = next_cb_id_;
  ++next_cb_id_;

  change_cb_arr_.emplace_back(assigned_id, std::move(cb));
  return assigned_id;
}
// Removes the first callback registered under `id`. Does nothing when no such
// callback exists.
void JournalSlice::Unregister(uint32_t id) {
  auto it = change_cb_arr_.begin();
  const auto last = change_cb_arr_.end();

  while (it != last && it->first != id) {
    ++it;
  }

  if (it != last) {
    change_cb_arr_.erase(it);
  }
}
} // namespace journal
} // namespace dfly

View File

@ -9,18 +9,23 @@
#include <optional>
#include <string_view>
#include "base/ring_buffer.h"
#include "server/common.h"
#include "server/journal/types.h"
#include "util/uring/uring_file.h"
namespace dfly {
namespace journal {
class JournalShard {
// Journal slice is present for both shards and io threads.
class JournalSlice {
public:
JournalShard();
~JournalShard();
JournalSlice();
~JournalSlice();
std::error_code Open(const std::string_view dir, unsigned index);
void Init(unsigned index);
std::error_code Open(std::string_view dir);
std::error_code Close();
@ -32,20 +37,30 @@ class JournalShard {
return status_ec_;
}
// Whether the file-based journaling is open.
bool IsOpen() const {
return bool(shard_file_);
}
void AddLogRecord(TxId txid, unsigned opcode);
void AddLogRecord(const Entry& entry);
uint32_t RegisterOnChange(ChangeCallback cb);
void Unregister(uint32_t);
private:
struct RingItem;
std::string shard_path_;
std::unique_ptr<util::uring::LinuxFile> shard_file_;
std::optional<base::RingBuffer<RingItem>> ring_buffer_;
std::vector<std::pair<uint32_t, ChangeCallback>> change_cb_arr_;
size_t file_offset_ = 0;
LSN lsn_ = 1;
unsigned shard_index_ = -1;
uint32_t slice_index_ = UINT32_MAX;
uint32_t next_cb_id_ = 1;
std::error_code status_ec_;

View File

@ -0,0 +1,50 @@
// Copyright 2022, Roman Gershman. All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once
#include "server/common.h"
#include "server/table.h"
namespace dfly {
namespace journal {
// Opcode of a journal entry. Stored in a single byte.
enum class Op : uint8_t {
NOOP = 0,
LOCK = 1,
UNLOCK = 2,
LOCK_SHARD = 3,
UNLOCK_SHARD = 4,
SCHED = 5,  // produced by Entry::Sched().
VAL = 10,   // produced by the Entry constructor that takes a PrimeValue.
DEL,        // implicitly 11.
MSET,       // implicitly 12.
};
// TODO: to pass all the attributes like ttl, stickiness etc.
// A single journal record.
// `key` is a string_view and `pval_ptr` is a raw pointer: the entry does not own
// the data they refer to, so that data must outlive every use of the entry.
struct Entry {
  // Generic entry without an attached value.
  Entry(Op op, DbIndex did, TxId tid, std::string_view skey)
      : opcode(op), db_ind(did), txid(tid), key(skey) {
  }

  // Op::VAL entry that refers to `pval`.
  Entry(DbIndex did, TxId tid, std::string_view skey, const PrimeValue& pval)
      : opcode(Op::VAL), db_ind(did), txid(tid), key(skey), pval_ptr(&pval) {
  }

  // Op::SCHED entry for transaction `tid`, with db index 0 and an empty key.
  static Entry Sched(TxId tid) {
    return Entry(Op::SCHED, 0, tid, std::string_view{});
  }

  Op opcode;
  DbIndex db_ind;
  TxId txid;
  std::string_view key;
  const PrimeValue* pval_ptr = nullptr;
  uint64_t expire_ms = 0;  // 0 means no expiry.
};
using ChangeCallback = std::function<void(const Entry&)>;
} // namespace journal
} // namespace dfly

View File

@ -51,9 +51,10 @@ string GetString(EngineShard* shard, const PrimeValue& pv) {
return res;
}
inline void RecordJournal(const OpArgs& op_args, const PrimeKey& pkey, const PrimeKey& pvalue) {
inline void RecordJournal(const OpArgs& op_args, string_view key, const PrimeKey& pvalue) {
if (op_args.shard->journal()) {
op_args.shard->journal()->RecordEntry(op_args.txid, pkey, pvalue);
journal::Entry entry{op_args.db_ind, op_args.txid, key, pvalue};
op_args.shard->journal()->RecordEntry(entry);
}
}
@ -63,7 +64,7 @@ void SetString(const OpArgs& op_args, string_view key, const string& value) {
db_slice.PreUpdate(op_args.db_ind, it_output);
it_output->second.SetString(value);
db_slice.PostUpdate(op_args.db_ind, it_output, key);
RecordJournal(op_args, it_output->first, it_output->second);
RecordJournal(op_args, key, it_output->second);
}
string JsonType(const json& val) {

View File

@ -675,10 +675,11 @@ facade::ConnectionContext* Service::CreateContext(util::FiberSocketBase* peer,
// a bit of a hack. I set up breaker callback here for the owner.
// Should work though it's confusing to have it here.
owner->RegisterOnBreak([res](uint32_t) {
owner->RegisterOnBreak([res, this](uint32_t) {
if (res->transaction) {
res->transaction->BreakOnClose();
res->transaction->BreakOnShutdown();
}
this->server_family().BreakOnShutdown();
});
return res;
@ -1060,7 +1061,7 @@ void Service::Publish(CmdArgList args, ConnectionContext* cntx) {
// How do we know that subscribers did not disappear after we fetched them?
// Each subscriber object holds a borrow_token.
// ConnectionContext::OnClose does not reset subscribe_info before all tokens are returned.
// OnClose does not reset subscribe_info before all tokens are returned.
vector<ChannelSlice::Subscriber> subscriber_arr = shard_set->Await(sid, std::move(cb));
atomic_uint32_t published{0};
@ -1249,6 +1250,45 @@ void Service::ConfigureHttpHandlers(util::HttpListenerBase* base) {
base->RegisterCb("/txz", TxTable);
}
// Connection-close hook. Releases the PUBSUB state held by the connection context
// (channel subscriptions first, then pattern subscriptions) and finally forwards the
// call to ServerFamily::OnClose.
void Service::OnClose(facade::ConnectionContext* cntx) {
// NOTE(review): assumes cntx is always a dfly::ConnectionContext - confirm that every
// context handled by this service is created by Service::CreateContext.
ConnectionContext* server_cntx = static_cast<ConnectionContext*>(cntx);
ConnectionState& conn_state = server_cntx->conn_state;
if (conn_state.subscribe_info) { // Clean-ups related to PUBSUB
if (!conn_state.subscribe_info->channels.empty()) {
// The token is copied before unsubscribing so that it can still be waited on afterwards.
auto token = conn_state.subscribe_info->borrow_token;
server_cntx->UnsubscribeAll(false);
// Check that all borrowers finished processing.
// token is increased in channel_slice (the publisher side).
token.Wait();
}
// Re-checked on purpose: presumably UnsubscribeAll() resets subscribe_info when no
// subscriptions are left - TODO confirm. If it is still set, patterns must be non-empty.
if (conn_state.subscribe_info) {
DCHECK(!conn_state.subscribe_info->patterns.empty());
auto token = conn_state.subscribe_info->borrow_token;
server_cntx->PUnsubscribeAll(false);
// Check that all borrowers finished processing
token.Wait();
DCHECK(!conn_state.subscribe_info);
}
}
server_family_.OnClose(server_cntx);
}
// Describes the state flags of a connection context as "flags:<letters>", where
// 'a' stands for async_dispatch and 't' for conn_closing. Returns an empty string
// when neither flag is set.
string Service::GetContextInfo(facade::ConnectionContext* cntx) {
  string flags;

  if (cntx->async_dispatch) {
    flags.push_back('a');
  }
  if (cntx->conn_closing) {
    flags.push_back('t');
  }

  if (flags.empty()) {
    return string();
  }
  return absl::StrCat("flags:", flags);
}
using ServiceFunc = void (Service::*)(CmdArgList, ConnectionContext* cntx);
#define HFUNC(x) SetHandler(&Service::x)

View File

@ -82,6 +82,8 @@ class Service : public facade::ServiceInterface {
GlobalState SwitchState(GlobalState from , GlobalState to);
void ConfigureHttpHandlers(util::HttpListenerBase* base) final;
void OnClose(facade::ConnectionContext* cntx) final;
std::string GetContextInfo(facade::ConnectionContext* cntx) final;
private:
static void Quit(CmdArgList args, ConnectionContext* cntx);

View File

@ -1004,9 +1004,8 @@ error_code RdbLoader::Load(io::Source* src) {
bc.Wait(); // wait for sentinels to report.
absl::Duration dur = absl::Now() - start;
double seconds = double(absl::ToInt64Milliseconds(dur)) / 1000;
LOG(INFO) << "Done loading RDB, keys loaded: " << keys_loaded;
LOG(INFO) << "Loading finished after " << strings::HumanReadableElapsedTime(seconds);
load_time_ = double(absl::ToInt64Milliseconds(dur)) / 1000;
keys_loaded_ = keys_loaded;
return kOk;
}

View File

@ -34,10 +34,20 @@ class RdbLoader {
// Returns the bytes currently held in the input side of the internal memory buffer.
::io::Bytes Leftover() const {
return mem_buf_.InputBuffer();
}
// Returns the value of the bytes_read_ counter.
size_t bytes_read() const {
return bytes_read_;
}
// Returns the number of keys loaded, as recorded at the end of Load(); 0 before that.
size_t keys_loaded() const {
return keys_loaded_;
}
// Returns the duration of the load in seconds, as recorded at the end of Load()
// with millisecond resolution; 0 before that.
double load_time() const {
return load_time_;
}
private:
using MutableBytes = ::io::MutableBytes;
struct ObjSettings;
@ -49,8 +59,8 @@ class RdbLoader {
struct LoadTrace;
using RdbVariant = std::variant<long long, base::PODArray<char>, LzfString,
std::unique_ptr<LoadTrace>>;
using RdbVariant =
std::variant<long long, base::PODArray<char>, LzfString, std::unique_ptr<LoadTrace>>;
struct OpaqueObj {
RdbVariant obj;
int rdb_type;
@ -164,6 +174,9 @@ class RdbLoader {
::io::Source* src_ = nullptr;
size_t bytes_read_ = 0;
size_t source_limit_ = SIZE_MAX;
size_t keys_loaded_ = 0;
double load_time_ = 0;
DbIndex cur_db_index_ = 0;
::boost::fibers::mutex mu_;

View File

@ -2,13 +2,14 @@
// See LICENSE for licensing terms.
//
#include "core/string_set.h"
#include "server/rdb_save.h"
#include <absl/cleanup/cleanup.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_format.h>
#include "core/string_set.h"
extern "C" {
#include "redis/intset.h"
#include "redis/listpack.h"
@ -159,10 +160,6 @@ constexpr size_t kAmask = 4_KB - 1;
RdbSerializer::RdbSerializer(io::Sink* s) : sink_(s), mem_buf_{4_KB}, tmp_buf_(nullptr) {
}
RdbSerializer::RdbSerializer(AlignedBuffer* aligned_buf) : RdbSerializer((io::Sink*)nullptr) {
aligned_buf_ = aligned_buf;
}
RdbSerializer::~RdbSerializer() {
}
@ -311,7 +308,7 @@ error_code RdbSerializer::SaveSetObject(const PrimeValue& obj) {
RETURN_ON_ERR(SaveString(string_view{ele, sdslen(ele)}));
}
} else if (obj.Encoding() == kEncodingStrMap2) {
StringSet *set = (StringSet*)obj.RObjPtr();
StringSet* set = (StringSet*)obj.RObjPtr();
RETURN_ON_ERR(SaveLen(set->Size()));
@ -593,22 +590,14 @@ error_code RdbSerializer::WriteRaw(const io::Bytes& buf) {
io::Bytes ib = mem_buf_.InputBuffer();
if (ib.empty()) {
if (sink_) {
return sink_->Write(buf);
} else {
return aligned_buf_->Write(buf);
}
} else {
if (sink_) {
iovec v[2] = {{.iov_base = const_cast<uint8_t*>(ib.data()), .iov_len = ib.size()},
{.iov_base = const_cast<uint8_t*>(buf.data()), .iov_len = buf.size()}};
RETURN_ON_ERR(sink_->Write(v, ABSL_ARRAYSIZE(v)));
} else {
RETURN_ON_ERR(aligned_buf_->Write(ib));
RETURN_ON_ERR(aligned_buf_->Write(buf));
}
mem_buf_.ConsumeInput(ib.size());
return sink_->Write(buf);
}
// else
iovec v[2] = {{.iov_base = const_cast<uint8_t*>(ib.data()), .iov_len = ib.size()},
{.iov_base = const_cast<uint8_t*>(buf.data()), .iov_len = buf.size()}};
RETURN_ON_ERR(sink_->Write(v, ABSL_ARRAYSIZE(v)));
mem_buf_.ConsumeInput(ib.size());
return error_code{};
}
@ -620,11 +609,7 @@ error_code RdbSerializer::FlushMem() {
DVLOG(2) << "FlushMem " << sz << " bytes";
// interrupt point.
if (sink_) {
RETURN_ON_ERR(sink_->Write(mem_buf_.InputBuffer()));
} else {
RETURN_ON_ERR(aligned_buf_->Write(mem_buf_.InputBuffer()));
}
RETURN_ON_ERR(sink_->Write(mem_buf_.InputBuffer()));
mem_buf_.ConsumeInput(sz);
return error_code{};
@ -705,37 +690,37 @@ AlignedBuffer::~AlignedBuffer() {
mi_free(aligned_buf_);
}
// TODO: maybe to derive AlignedBuffer from Sink?
std::error_code AlignedBuffer::Write(io::Bytes record) {
if (buf_offs_ + record.size() < capacity_) {
memcpy(aligned_buf_ + buf_offs_, record.data(), record.size());
buf_offs_ += record.size();
return error_code{};
io::Result<size_t> AlignedBuffer::WriteSome(const iovec* v, uint32_t len) {
size_t total_len = 0;
uint32_t vindx = 0;
for (; vindx < len; ++vindx) {
auto item = v[vindx];
total_len += item.iov_len;
while (buf_offs_ + item.iov_len > capacity_) {
size_t to_write = capacity_ - buf_offs_;
memcpy(aligned_buf_ + buf_offs_, item.iov_base, to_write);
iovec ivec{.iov_base = aligned_buf_, .iov_len = capacity_};
error_code ec = upstream_->Write(&ivec, 1);
if (ec)
return nonstd::make_unexpected(ec);
item.iov_len -= to_write;
item.iov_base = reinterpret_cast<char*>(item.iov_base) + to_write;
buf_offs_ = 0;
}
DCHECK_GT(item.iov_len, 0u);
memcpy(aligned_buf_ + buf_offs_, item.iov_base, item.iov_len);
buf_offs_ += item.iov_len;
}
memcpy(aligned_buf_ + buf_offs_, record.data(), capacity_ - buf_offs_);
size_t record_offs = capacity_ - buf_offs_;
buf_offs_ = 0;
size_t needed;
do {
iovec ivec{.iov_base = aligned_buf_, .iov_len = capacity_};
RETURN_ON_ERR(upstream_->Write(&ivec, 1));
needed = record.size() - record_offs;
if (needed < capacity_)
break;
memcpy(aligned_buf_, record.data() + record_offs, capacity_);
record_offs += capacity_;
} while (true);
if (needed) {
memcpy(aligned_buf_, record.data() + record_offs, needed);
buf_offs_ = needed;
}
return error_code{};
return total_len;
}
// Note that it may write more than AlignedBuffer has at this point since it rounds up the length
// to the nearest page boundary.
error_code AlignedBuffer::Flush() {
size_t len = (buf_offs_ + kAmask) & (~kAmask);
iovec ivec{.iov_base = aligned_buf_, .iov_len = len};
@ -748,7 +733,7 @@ class RdbSaver::Impl {
public:
// We pass K=sz to say how many producers are pushing data in order to maintain
// correct closing semantics - channel is closing when K producers marked it as closed.
Impl(unsigned producers_len, io::Sink* sink);
Impl(bool align_writes, unsigned producers_len, io::Sink* sink);
error_code SaveAuxFieldStrStr(string_view key, string_view val);
@ -758,10 +743,13 @@ class RdbSaver::Impl {
error_code ConsumeChannel();
void StartSnapshotting(EngineShard* shard);
void StartSnapshotting(bool include_journal_changes, EngineShard* shard);
error_code Flush() {
return aligned_buf_.Flush();
if (aligned_buf_)
return aligned_buf_->Flush();
return error_code{};
}
size_t Size() const {
@ -771,20 +759,23 @@ class RdbSaver::Impl {
void FillFreqMap(RdbTypeFreqMap* dest) const;
private:
AlignedBuffer aligned_buf_;
io::Sink* sink_;
// used for serializing non-body components in the calling fiber.
RdbSerializer meta_serializer_;
vector<unique_ptr<SliceSnapshot>> shard_snapshots_;
SliceSnapshot::RecordChannel channel_;
std::optional<AlignedBuffer> aligned_buf_;
};
// We pass K=sz to say how many producers are pushing data in order to maintain
// correct closing semantics - channel is closing when K producers marked it as closed.
RdbSaver::Impl::Impl(unsigned producers_len, io::Sink* sink)
: aligned_buf_(kBufLen, sink), meta_serializer_(&aligned_buf_),
RdbSaver::Impl::Impl(bool align_writes, unsigned producers_len, io::Sink* sink)
: sink_(sink), meta_serializer_(sink),
shard_snapshots_(producers_len), channel_{128, producers_len} {
if (align_writes) {
aligned_buf_.emplace(kBufLen, sink);
meta_serializer_.set_sink(&aligned_buf_.value());
}
}
error_code RdbSaver::Impl::SaveAuxFieldStrStr(string_view key, string_view val) {
@ -799,8 +790,6 @@ error_code RdbSaver::Impl::SaveAuxFieldStrStr(string_view key, string_view val)
error_code RdbSaver::Impl::ConsumeChannel() {
error_code io_error;
// we can not exit on io-error since we spawn fibers that push data.
// TODO: we may signal them to stop processing and exit asap in case of the error.
uint8_t buf[16];
size_t channel_bytes = 0;
SliceSnapshot::DbRecord record;
@ -808,6 +797,9 @@ error_code RdbSaver::Impl::ConsumeChannel() {
buf[0] = RDB_OPCODE_SELECTDB;
// we can not exit on io-error since we spawn fibers that push data.
// TODO: we may signal them to stop processing and exit asap in case of the error.
auto& channel = channel_;
while (channel.Pop(record)) {
if (io_error)
@ -816,9 +808,13 @@ error_code RdbSaver::Impl::ConsumeChannel() {
do {
if (record.db_index != last_db_index) {
unsigned enclen = SerializeLen(record.db_index, buf + 1);
char* str = (char*)buf;
string_view str{(char*)buf, enclen + 1};
io_error = aligned_buf_.Write(string_view{str, enclen + 1});
if (aligned_buf_) {
io_error = aligned_buf_->Write(str);
} else {
io_error = sink_->Write(io::Buffer(str));
}
if (io_error)
break;
last_db_index = record.db_index;
@ -826,7 +822,12 @@ error_code RdbSaver::Impl::ConsumeChannel() {
DVLOG(2) << "Pulled " << record.id;
channel_bytes += record.value.size();
io_error = aligned_buf_.Write(record.value);
if (aligned_buf_) {
io_error = aligned_buf_->Write(record.value);
} else {
io_error = sink_->Write(io::Buffer(record.value));
}
record.value.clear();
} while (!io_error && channel.TryPop(record));
} // while (channel.pop)
@ -844,10 +845,10 @@ error_code RdbSaver::Impl::ConsumeChannel() {
return io_error;
}
void RdbSaver::Impl::StartSnapshotting(EngineShard* shard) {
void RdbSaver::Impl::StartSnapshotting(bool include_journal_changes, EngineShard* shard) {
auto s = make_unique<SliceSnapshot>(&shard->db_slice(), &channel_);
s->Start();
s->Start(include_journal_changes);
// For single shard configuration, we maintain only one snapshot,
// so we do not have to map it via shard_id.
@ -863,10 +864,10 @@ void RdbSaver::Impl::FillFreqMap(RdbTypeFreqMap* dest) const {
}
}
RdbSaver::RdbSaver(::io::Sink* sink, bool single_shard) {
RdbSaver::RdbSaver(::io::Sink* sink, bool single_shard, bool align_writes) {
CHECK_NOTNULL(sink);
impl_.reset(new Impl(single_shard ? 1 : shard_set->size(), sink));
impl_.reset(new Impl(align_writes, single_shard ? 1 : shard_set->size(), sink));
}
RdbSaver::~RdbSaver() {
@ -904,8 +905,8 @@ error_code RdbSaver::SaveBody(RdbTypeFreqMap* freq_map) {
return error_code{};
}
void RdbSaver::StartSnapshotInShard(EngineShard* shard) {
impl_->StartSnapshotting(shard);
void RdbSaver::StartSnapshotInShard(bool include_journal_changes, EngineShard* shard) {
impl_->StartSnapshotting(include_journal_changes, shard);
}
error_code RdbSaver::SaveAux(const StringVec& lua_scripts) {

View File

@ -40,21 +40,24 @@ class LinuxWriteWrapper : public io::Sink {
off_t offset_ = 0;
};
class AlignedBuffer {
class AlignedBuffer : public ::io::Sink {
public:
using io::Sink::Write;
AlignedBuffer(size_t cap, ::io::Sink* upstream);
~AlignedBuffer();
// TODO: maybe to derive AlignedBuffer from Sink?
std::error_code Write(std::string_view buf) {
return Write(io::Buffer(buf));
}
std::error_code Write(io::Bytes buf);
io::Result<size_t> WriteSome(const iovec* v, uint32_t len) final;
std::error_code Flush();
::io::Sink* upstream() { return upstream_;}
::io::Sink* upstream() {
return upstream_;
}
private:
size_t capacity_;
@ -70,7 +73,9 @@ class RdbSaver {
// to snapshot all the datastore shards.
// single_shard - false, means we capture all the data using a single RdbSaver instance
// (corresponds to legacy, redis compatible mode)
explicit RdbSaver(::io::Sink* sink, bool single_shard);
// if align_writes is true - writes data in aligned chunks of 4KB to fit direct I/O requirements.
explicit RdbSaver(::io::Sink* sink, bool single_shard, bool align_writes);
~RdbSaver();
std::error_code SaveHeader(const StringVec& lua_scripts);
@ -81,7 +86,8 @@ class RdbSaver {
std::error_code SaveBody(RdbTypeFreqMap* freq_map);
// Initiates the serialization in the shard's thread.
void StartSnapshotInShard(EngineShard* shard);
// TODO: implement break functionality to allow stopping early.
void StartSnapshotInShard(bool include_journal_changes, EngineShard* shard);
private:
class Impl;
@ -94,13 +100,12 @@ class RdbSaver {
std::unique_ptr<Impl> impl_;
};
// TODO: it does not make sense that RdbSerializer will buffer into unaligned
// mem_buf_ and then flush it into the next level. We should probably use AlignedBuffer
// directly.
class RdbSerializer {
public:
// TODO: for the aligned case, it does not make sense that RdbSerializer buffers into the
// unaligned mem_buf_ and then flushes it into the next level. We should probably use
// AlignedBuffer directly.
RdbSerializer(::io::Sink* s);
RdbSerializer(AlignedBuffer* aligned_buf);
~RdbSerializer();
@ -117,6 +122,7 @@ class RdbSerializer {
// Must be called in the thread to which `it` belongs.
// Returns the serialized rdb_type or the error.
// expire_ms = 0 means no expiry.
io::Result<uint8_t> SaveEntry(const PrimeKey& pk, const PrimeValue& pv, uint64_t expire_ms);
std::error_code WriteRaw(const ::io::Bytes& buf);
std::error_code SaveString(std::string_view val);
@ -143,8 +149,7 @@ class RdbSerializer {
std::error_code SaveStreamPEL(rax* pel, bool nacks);
std::error_code SaveStreamConsumers(streamCG* cg);
::io::Sink* sink_ = nullptr;
AlignedBuffer* aligned_buf_ = nullptr;
::io::Sink* sink_;
std::unique_ptr<LZF_HSLOT[]> lzf_;
base::IoBuf mem_buf_;

View File

@ -8,7 +8,6 @@
#include <absl/random/random.h> // for master_id_ generation.
#include <absl/strings/match.h>
#include <absl/strings/str_join.h>
#include <sys/resource.h>
#include <chrono>
@ -71,6 +70,8 @@ using util::http::StringResponse;
namespace {
const auto kRdbWriteFlags = O_CREAT | O_WRONLY | O_TRUNC | O_CLOEXEC | O_DIRECT;
using EngineFunc = void (ServerFamily::*)(CmdArgList args, ConnectionContext* cntx);
inline CommandId::Handler HandlerFunc(ServerFamily* se, EngineFunc f) {
@ -152,7 +153,7 @@ bool IsValidSaveScheduleNibble(string_view time, unsigned int max) {
class RdbSnapshot {
public:
RdbSnapshot(bool single_shard, uring::LinuxFile* fl)
: file_(fl), linux_sink_(fl), saver_(&linux_sink_, single_shard) {
: file_(fl), linux_sink_(fl), saver_(&linux_sink_, single_shard, kRdbWriteFlags & O_DIRECT) {
}
error_code Start(const StringVec& lua_scripts);
@ -191,7 +192,7 @@ error_code RdbSnapshot::Close() {
}
void RdbSnapshot::StartInShard(EngineShard* shard) {
saver_.StartSnapshotInShard(shard);
saver_.StartSnapshotInShard(false, shard);
started_ = true;
}
@ -279,19 +280,9 @@ ServerFamily::ServerFamily(Service* service) : service_(*service) {
journal_.reset(new journal::Journal);
{
// TODO: if we start using random generator in more places, we should probably
// refactor this code.
absl::InsecureBitGen eng;
absl::uniform_int_distribution<uint32_t> ud;
absl::AlphaNum a1(absl::Hex(eng(), absl::kZeroPad16));
absl::AlphaNum a2(absl::Hex(eng(), absl::kZeroPad16));
absl::AlphaNum a3(absl::Hex(ud(eng), absl::kZeroPad8));
absl::StrAppend(&master_id_, a1, a2, a3);
size_t constexpr kConfigRunIdSize = CONFIG_RUN_ID_SIZE;
DCHECK_EQ(kConfigRunIdSize, master_id_.size());
master_id_ = GetRandomHex(eng, CONFIG_RUN_ID_SIZE);
DCHECK_EQ(CONFIG_RUN_ID_SIZE, master_id_.size());
}
}
@ -472,6 +463,11 @@ error_code ServerFamily::LoadRdb(const std::string& rdb_file) {
RdbLoader loader(script_mgr());
ec = loader.Load(&fs);
if (!ec) {
LOG(INFO) << "Done loading RDB, keys loaded: " << loader.keys_loaded();
LOG(INFO) << "Loading finished after "
<< strings::HumanReadableElapsedTime(loader.load_time());
}
} else {
ec = res.error();
}
@ -556,8 +552,8 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
&resp->body());
AppendMetricWithoutLabels("memory_used_peak_bytes", "", used_mem_peak.load(memory_order_relaxed),
MetricType::GAUGE, &resp->body());
AppendMetricWithoutLabels("comitted_memory", "", GetMallocCurrentCommitted(),
MetricType::GAUGE, &resp->body());
AppendMetricWithoutLabels("comitted_memory", "", GetMallocCurrentCommitted(), MetricType::GAUGE,
&resp->body());
AppendMetricWithoutLabels("memory_max_bytes", "", max_memory_limit, MetricType::GAUGE,
&resp->body());
@ -618,6 +614,10 @@ void ServerFamily::PauseReplication(bool pause) {
}
}
// Hook invoked when a connection is closed.
// Forwards the closing connection's context to dfly_cmd_ so it can react to the close.
void ServerFamily::OnClose(ConnectionContext* cntx) {
dfly_cmd_->OnClose(cntx);
}
void ServerFamily::StatsMC(std::string_view section, facade::ConnectionContext* cntx) {
if (!section.empty()) {
return cntx->reply_builder()->SendError("");
@ -697,7 +697,6 @@ error_code ServerFamily::DoSave(bool new_version, Transaction* trans, string* er
service_.SwitchState(GlobalState::SAVING, GlobalState::ACTIVE);
};
const auto kFlags = O_CREAT | O_WRONLY | O_TRUNC | O_CLOEXEC | O_DIRECT;
auto start = absl::Now();
shared_ptr<LastSaveInfo> save_info;
StringVec lua_scripts = script_mgr_->GetLuaScripts();
@ -745,7 +744,7 @@ error_code ServerFamily::DoSave(bool new_version, Transaction* trans, string* er
abs_path += shard_file;
VLOG(1) << "Saving to " << abs_path;
auto res = uring::OpenLinux(abs_path.generic_string(), kFlags, 0666);
auto res = uring::OpenLinux(abs_path.generic_string(), kRdbWriteFlags, 0666);
if (res) {
snapshots[sid].reset(new RdbSnapshot{true, res.value().release()});
@ -773,7 +772,7 @@ error_code ServerFamily::DoSave(bool new_version, Transaction* trans, string* er
ExtendFilename(now, -1, &filename);
path += filename;
auto res = uring::OpenLinux(path.generic_string(), kFlags, 0666);
auto res = uring::OpenLinux(path.generic_string(), kRdbWriteFlags, 0666);
if (!res) {
return res.error();
}
@ -862,6 +861,10 @@ void ServerFamily::DbSize(CmdArgList args, ConnectionContext* cntx) {
return (*cntx)->SendLong(num_keys.load(memory_order_relaxed));
}
// Hook invoked when the process exits and background fibers need to break early.
// Delegates to dfly_cmd_->BreakOnShutdown().
void ServerFamily::BreakOnShutdown() {
dfly_cmd_->BreakOnShutdown();
}
void ServerFamily::FlushDb(CmdArgList args, ConnectionContext* cntx) {
DCHECK(cntx->transaction);
DoFlush(cntx->transaction, cntx->transaction->db_index());
@ -910,7 +913,9 @@ void ServerFamily::Client(CmdArgList args, ConnectionContext* cntx) {
if (sub_cmd == "SETNAME" && args.size() == 3) {
cntx->owner()->SetName(ArgS(args, 2));
return (*cntx)->SendOk();
} else if (sub_cmd == "LIST") {
}
if (sub_cmd == "LIST") {
vector<string> client_info;
fibers::mutex mu;
auto cb = [&](util::Connection* conn) {
@ -1377,14 +1382,16 @@ void ServerFamily::ReplConf(CmdArgList args, ConnectionContext* cntx) {
if (cmd == "CAPA") {
if (arg == "dragonfly" && args.size() == 3 && i == 1) {
uint32_t sid = dfly_cmd_->AllocateSyncSession();
cntx->owner()->SetName(absl::StrCat("repl_ctrl_", sid));
string sync_id = absl::StrCat("SYNC", sid);
cntx->conn_state.sync_session_id = sid;
cntx->conn_state.repl_session_id = sid;
// The response for 'capa dragonfly' is: <masterid> <syncid> <numthreads>
(*cntx)->StartArray(3);
(*cntx)->SendSimpleString(master_id_);
(*cntx)->SendSimpleString(sync_id);
(*cntx)->SendLong(shard_set->size());
(*cntx)->SendLong(shard_set->pool()->size());
return;
}
} else {
@ -1487,8 +1494,10 @@ void ServerFamily::Register(CommandRegistry* registry) {
<< CI{"REPLICAOF", kReplicaOpts, 3, 0, 0, 0}.HFUNC(ReplicaOf)
<< CI{"REPLCONF", CO::ADMIN | CO::LOADING, -1, 0, 0, 0}.HFUNC(ReplConf)
<< CI{"ROLE", CO::LOADING | CO::FAST | CO::NOSCRIPT, 1, 0, 0, 0}.HFUNC(Role)
<< CI{"SYNC", CO::ADMIN | CO::GLOBAL_TRANS, 1, 0, 0, 0}.HFUNC(Sync)
<< CI{"PSYNC", CO::ADMIN | CO::GLOBAL_TRANS, 3, 0, 0, 0}.HFUNC(Psync)
// We won't support DF->REDIS replication for now, hence we do not need to support
// these commands.
// << CI{"SYNC", CO::ADMIN | CO::GLOBAL_TRANS, 1, 0, 0, 0}.HFUNC(Sync)
// << CI{"PSYNC", CO::ADMIN | CO::GLOBAL_TRANS, 3, 0, 0, 0}.HFUNC(Psync)
<< CI{"SCRIPT", CO::NOSCRIPT, -2, 0, 0, 0}.HFUNC(Script)
<< CI{"DFLY", CO::ADMIN | CO::GLOBAL_TRANS, -2, 0, 0, 0}.HFUNC(Dfly);
}

View File

@ -102,6 +102,10 @@ class ServerFamily {
return journal_.get();
}
void OnClose(ConnectionContext* cntx);
void BreakOnShutdown();
private:
uint32_t shard_count() const {
return shard_set->size();

View File

@ -22,8 +22,9 @@ class Journal;
// Present in every server thread. This class differs from EngineShard. The latter manages
// state around engine shards while the former represents coordinator/connection state.
// There may be threads that handle engine shards but not IO, there may be threads that handle IO
// but not engine shards and there can be threads that handle both. This class is present only
// for threads that handle IO and manage incoming connections.
// but not engine shards and there can be threads that handle both.
// Instances of ServerState are present only for threads that handle
// IO and manage incoming connections.
class ServerState { // public struct - to allow initialization.
ServerState(const ServerState&) = delete;
void operator=(const ServerState&) = delete;

View File

@ -13,6 +13,8 @@ extern "C" {
#include "base/logging.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/journal/journal.h"
#include "server/rdb_save.h"
#include "util/fiber_sched_algo.h"
#include "util/proactor_base.h"
@ -25,15 +27,14 @@ using namespace chrono_literals;
namespace this_fiber = ::boost::this_fiber;
using boost::fibers::fiber;
SliceSnapshot::SliceSnapshot(DbSlice* slice, RecordChannel* dest)
: db_slice_(slice), dest_(dest) {
SliceSnapshot::SliceSnapshot(DbSlice* slice, RecordChannel* dest) : db_slice_(slice), dest_(dest) {
db_array_ = slice->databases();
}
SliceSnapshot::~SliceSnapshot() {
}
void SliceSnapshot::Start() {
void SliceSnapshot::Start(bool include_journal_changes) {
DCHECK(!fb_.joinable());
auto on_change = [this](DbIndex db_index, const DbSlice::ChangeReq& req) {
@ -42,6 +43,14 @@ void SliceSnapshot::Start() {
snapshot_version_ = db_slice_->RegisterOnChange(move(on_change));
VLOG(1) << "DbSaver::Start - saving entries with version less than " << snapshot_version_;
if (include_journal_changes) {
auto* journal = db_slice_->shard_owner()->journal();
DCHECK(journal);
journal_cb_id_ = journal->RegisterOnChange(
[this](const journal::Entry& e) { OnJournalEntry(e); });
}
sfile_.reset(new io::StringFile);
rdb_serializer_.reset(new RdbSerializer(sfile_.get()));
@ -49,6 +58,8 @@ void SliceSnapshot::Start() {
fb_ = fiber([this] {
FiberFunc();
db_slice_->UnregisterOnChange(snapshot_version_);
if (journal_cb_id_)
db_slice_->shard_owner()->journal()->Unregister(journal_cb_id_);
});
}
@ -141,15 +152,8 @@ bool SliceSnapshot::FlushSfile(bool force) {
}
VLOG(2) << "FlushSfile " << sfile_->val.size() << " bytes";
string tmp = std::move(sfile_->val); // important to move before pushing!
channel_bytes_ += tmp.size();
DbRecord rec{.db_index = savecb_current_db_,
.id = rec_id_,
.num_records = num_records_in_blob_,
.value = std::move(tmp)};
DVLOG(2) << "Pushed " << rec_id_;
++rec_id_;
num_records_in_blob_ = 0;
DbRecord rec = GetDbRecord(savecb_current_db_, std::move(sfile_->val), num_records_in_blob_);
num_records_in_blob_ = 0; // We can not move this line after the push, because Push is blocking.
dest_->Push(std::move(rec));
return true;
@ -206,6 +210,32 @@ void SliceSnapshot::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req)
}
}
// Journal subscriber callback: folds a journal change into the snapshot stream.
// Only journal::Op::VAL entries are accepted (enforced with CHECK).
//
// An entry that targets a database other than savecb_current_db_ is serialized
// into a private StringFile and pushed to dest_ as its own single-record
// DbRecord. An entry for savecb_current_db_ is appended to the shared
// rdb_serializer_ and counted in num_records_in_blob_.
void SliceSnapshot::OnJournalEntry(const journal::Entry& entry) {
  CHECK(journal::Op::VAL == entry.opcode);

  PrimeKey pkey{entry.key};

  if (entry.db_ind != savecb_current_db_) {
    // Different database: emit a standalone blob holding exactly one record.
    io::StringFile sfile;
    RdbSerializer tmp_serializer(&sfile);

    io::Result<uint8_t> res =
        tmp_serializer.SaveEntry(pkey, *entry.pval_ptr, entry.expire_ms);
    CHECK(res);  // we write to StringFile.

    error_code ec = tmp_serializer.FlushMem();
    CHECK(!ec && !sfile.val.empty());

    dest_->Push(GetDbRecord(entry.db_ind, std::move(sfile.val), 1));
    return;
  }

  // Same database as the blob being accumulated: append to it.
  ++num_records_in_blob_;
  io::Result<uint8_t> res =
      rdb_serializer_->SaveEntry(pkey, *entry.pval_ptr, entry.expire_ms);
  CHECK(res);  // we write to StringFile.
}
unsigned SliceSnapshot::SerializePhysicalBucket(DbIndex db_index, PrimeTable::bucket_iterator it) {
DCHECK_LT(it.GetVersion(), snapshot_version_);
@ -234,17 +264,19 @@ unsigned SliceSnapshot::SerializePhysicalBucket(DbIndex db_index, PrimeTable::bu
error_code ec = tmp_serializer.FlushMem();
CHECK(!ec && !sfile.val.empty());
string tmp = std::move(sfile.val);
channel_bytes_ += tmp.size();
DbRecord rec{
.db_index = db_index, .id = rec_id_, .num_records = result, .value = std::move(tmp)};
DVLOG(2) << "Pushed " << rec_id_;
++rec_id_;
dest_->Push(std::move(rec));
dest_->Push(GetDbRecord(db_index, std::move(sfile.val), result));
}
return result;
}
auto SliceSnapshot::GetDbRecord(DbIndex db_index, std::string value, unsigned num_records)
-> DbRecord {
channel_bytes_ += value.size();
auto id = rec_id_++;
DVLOG(2) << "Pushed " << id;
return DbRecord{
.db_index = db_index, .id = id, .num_records = num_records, .value = std::move(value)};
}
} // namespace dfly

View File

@ -13,6 +13,10 @@
namespace dfly {
// Forward declaration only: this header refers to journal::Entry solely by
// reference (see OnJournalEntry), so the full definition is not needed here.
namespace journal {
struct Entry;
} // namespace journal
class RdbSerializer;
class SliceSnapshot {
@ -32,7 +36,7 @@ class SliceSnapshot {
SliceSnapshot(DbSlice* slice, RecordChannel* dest);
~SliceSnapshot();
void Start();
void Start(bool include_journal_changes);
void Join();
uint64_t snapshot_version() const {
@ -59,10 +63,12 @@ class SliceSnapshot {
bool SaveCb(PrimeIterator it);
void OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req);
void OnJournalEntry(const journal::Entry& entry);
// Returns number of entries serialized.
// Updates the version of the bucket to snapshot version.
unsigned SerializePhysicalBucket(DbIndex db_index, PrimeTable::bucket_iterator it);
DbRecord GetDbRecord(DbIndex db_index, std::string value, unsigned num_records);
::boost::fibers::fiber fb_;
@ -82,6 +88,7 @@ class SliceSnapshot {
size_t serialized_ = 0, skipped_ = 0, side_saved_ = 0, savecb_calls_ = 0;
uint64_t rec_id_ = 0;
uint32_t num_records_in_blob_ = 0;
uint32_t journal_cb_id_ = 0;
};
} // namespace dfly

View File

@ -64,9 +64,10 @@ string_view GetSlice(EngineShard* shard, const PrimeValue& pv, string* tmp) {
return pv.GetSlice(tmp);
}
inline void RecordJournal(const OpArgs& op_args, const PrimeKey& pkey, const PrimeKey& pvalue) {
inline void RecordJournal(const OpArgs& op_args, string_view key, const PrimeKey& pvalue) {
if (op_args.shard->journal()) {
op_args.shard->journal()->RecordEntry(op_args.txid, pkey, pvalue);
journal::Entry entry{op_args.db_ind, op_args.txid, key, pvalue};
op_args.shard->journal()->RecordEntry(entry);
}
}
@ -104,7 +105,7 @@ OpResult<uint32_t> OpSetRange(const OpArgs& op_args, string_view key, size_t sta
memcpy(s.data() + start, value.data(), value.size());
it->second.SetString(s);
db_slice.PostUpdate(op_args.db_ind, it, key, !added);
RecordJournal(op_args, it->first, it->second);
RecordJournal(op_args, key, it->second);
return it->second.Size();
}
@ -141,8 +142,8 @@ OpResult<string> OpGetRange(const OpArgs& op_args, string_view key, int32_t star
return string(slice.substr(start, end - start + 1));
};
size_t ExtendExisting(const OpArgs& op_args, PrimeIterator it, string_view key, string_view val,
bool prepend) {
size_t ExtendExisting(const OpArgs& op_args, PrimeIterator it, string_view key,
string_view val, bool prepend) {
string tmp, new_val;
auto* shard = op_args.shard;
string_view slice = GetSlice(shard, it->second, &tmp);
@ -155,7 +156,7 @@ size_t ExtendExisting(const OpArgs& op_args, PrimeIterator it, string_view key,
db_slice.PreUpdate(op_args.db_ind, it);
it->second.SetString(new_val);
db_slice.PostUpdate(op_args.db_ind, it, key, true);
RecordJournal(op_args, it->first, it->second);
RecordJournal(op_args, key, it->second);
return new_val.size();
}
@ -169,7 +170,7 @@ OpResult<uint32_t> ExtendOrSet(const OpArgs& op_args, string_view key, string_vi
if (inserted) {
it->second.SetString(val);
db_slice.PostUpdate(op_args.db_ind, it, key, false);
RecordJournal(op_args, it->first, it->second);
RecordJournal(op_args, key, it->second);
return val.size();
}
@ -180,7 +181,7 @@ OpResult<uint32_t> ExtendOrSet(const OpArgs& op_args, string_view key, string_vi
return ExtendExisting(op_args, it, key, val, prepend);
}
OpResult<bool> ExtendOrSkip(const OpArgs& op_args, std::string_view key, std::string_view val,
OpResult<bool> ExtendOrSkip(const OpArgs& op_args, string_view key, string_view val,
bool prepend) {
auto& db_slice = op_args.shard->db_slice();
OpResult<PrimeIterator> it_res = db_slice.Find(op_args.db_ind, key, OBJ_STRING);
@ -201,7 +202,7 @@ OpResult<string> OpGet(const OpArgs& op_args, string_view key) {
return GetString(op_args.shard, pv);
}
OpResult<double> OpIncrFloat(const OpArgs& op_args, std::string_view key, double val) {
OpResult<double> OpIncrFloat(const OpArgs& op_args, string_view key, double val) {
auto& db_slice = op_args.shard->db_slice();
auto [it, inserted] = db_slice.AddOrFind(op_args.db_ind, key);
@ -211,7 +212,7 @@ OpResult<double> OpIncrFloat(const OpArgs& op_args, std::string_view key, double
char* str = RedisReplyBuilder::FormatDouble(val, buf, sizeof(buf));
it->second.SetString(str);
db_slice.PostUpdate(op_args.db_ind, it, key, false);
RecordJournal(op_args, it->first, it->second);
RecordJournal(op_args, key, it->second);
return val;
}
@ -243,13 +244,13 @@ OpResult<double> OpIncrFloat(const OpArgs& op_args, std::string_view key, double
db_slice.PreUpdate(op_args.db_ind, it);
it->second.SetString(str);
db_slice.PostUpdate(op_args.db_ind, it, key, true);
RecordJournal(op_args, it->first, it->second);
RecordJournal(op_args, key, it->second);
return base;
}
// if skip_on_missing - returns KEY_NOTFOUND.
OpResult<int64_t> OpIncrBy(const OpArgs& op_args, std::string_view key, int64_t incr,
OpResult<int64_t> OpIncrBy(const OpArgs& op_args, string_view key, int64_t incr,
bool skip_on_missing) {
auto& db_slice = op_args.shard->db_slice();
@ -270,7 +271,7 @@ OpResult<int64_t> OpIncrBy(const OpArgs& op_args, std::string_view key, int64_t
return OpStatus::OUT_OF_MEMORY;
}
RecordJournal(op_args, it->first, it->second);
RecordJournal(op_args, key, it->second);
return incr;
}
@ -295,7 +296,7 @@ OpResult<int64_t> OpIncrBy(const OpArgs& op_args, std::string_view key, int64_t
db_slice.PreUpdate(op_args.db_ind, it);
it->second.SetInt(new_val);
db_slice.PostUpdate(op_args.db_ind, it, key);
RecordJournal(op_args, it->first, it->second);
RecordJournal(op_args, key, it->second);
return new_val;
}
@ -393,7 +394,7 @@ OpStatus SetCmd::Set(const SetParams& params, string_view key, string_view value
}
}
RecordJournal(op_args_, it->first, it->second);
RecordJournal(op_args_, key, it->second);
return OpStatus::OK;
}
@ -447,7 +448,7 @@ OpStatus SetCmd::SetExisting(const SetParams& params, PrimeIterator it, ExpireIt
}
db_slice.PostUpdate(params.db_index, it, key);
RecordJournal(op_args_, it->first, it->second);
RecordJournal(op_args_, key, it->second);
return OpStatus::OK;
}
@ -572,7 +573,7 @@ void StringFamily::SetNx(CmdArgList args, ConnectionContext* cntx) {
void StringFamily::Get(CmdArgList args, ConnectionContext* cntx) {
get_qps.Inc();
std::string_view key = ArgS(args, 1);
string_view key = ArgS(args, 1);
auto cb = [&](Transaction* t, EngineShard* shard) { return OpGet(t->GetOpArgs(shard), key); };
@ -596,8 +597,8 @@ void StringFamily::Get(CmdArgList args, ConnectionContext* cntx) {
}
void StringFamily::GetSet(CmdArgList args, ConnectionContext* cntx) {
std::string_view key = ArgS(args, 1);
std::string_view value = ArgS(args, 2);
string_view key = ArgS(args, 1);
string_view value = ArgS(args, 2);
std::optional<string> prev_val;
SetCmd::SetParams sparams{cntx->db_index()};
@ -624,15 +625,15 @@ void StringFamily::GetSet(CmdArgList args, ConnectionContext* cntx) {
}
void StringFamily::Incr(CmdArgList args, ConnectionContext* cntx) {
std::string_view key = ArgS(args, 1);
string_view key = ArgS(args, 1);
return IncrByGeneric(key, 1, cntx);
}
void StringFamily::IncrBy(CmdArgList args, ConnectionContext* cntx) {
DCHECK_EQ(3u, args.size());
std::string_view key = ArgS(args, 1);
std::string_view sval = ArgS(args, 2);
string_view key = ArgS(args, 1);
string_view sval = ArgS(args, 2);
int64_t val;
if (!absl::SimpleAtoi(sval, &val)) {
@ -642,8 +643,8 @@ void StringFamily::IncrBy(CmdArgList args, ConnectionContext* cntx) {
}
void StringFamily::IncrByFloat(CmdArgList args, ConnectionContext* cntx) {
std::string_view key = ArgS(args, 1);
std::string_view sval = ArgS(args, 2);
string_view key = ArgS(args, 1);
string_view sval = ArgS(args, 2);
double val;
if (!absl::SimpleAtod(sval, &val)) {
@ -666,13 +667,13 @@ void StringFamily::IncrByFloat(CmdArgList args, ConnectionContext* cntx) {
}
void StringFamily::Decr(CmdArgList args, ConnectionContext* cntx) {
std::string_view key = ArgS(args, 1);
string_view key = ArgS(args, 1);
return IncrByGeneric(key, -1, cntx);
}
void StringFamily::DecrBy(CmdArgList args, ConnectionContext* cntx) {
std::string_view key = ArgS(args, 1);
std::string_view sval = ArgS(args, 2);
string_view key = ArgS(args, 1);
string_view sval = ArgS(args, 2);
int64_t val;
if (!absl::SimpleAtoi(sval, &val)) {
@ -693,7 +694,7 @@ void StringFamily::Prepend(CmdArgList args, ConnectionContext* cntx) {
ExtendGeneric(std::move(args), true, cntx);
}
void StringFamily::IncrByGeneric(std::string_view key, int64_t val, ConnectionContext* cntx) {
void StringFamily::IncrByGeneric(string_view key, int64_t val, ConnectionContext* cntx) {
bool skip_on_missing = cntx->protocol() == Protocol::MEMCACHE;
auto cb = [&](Transaction* t, EngineShard* shard) {
@ -725,8 +726,8 @@ void StringFamily::IncrByGeneric(std::string_view key, int64_t val, ConnectionCo
}
void StringFamily::ExtendGeneric(CmdArgList args, bool prepend, ConnectionContext* cntx) {
std::string_view key = ArgS(args, 1);
std::string_view sval = ArgS(args, 2);
string_view key = ArgS(args, 1);
string_view sval = ArgS(args, 2);
if (cntx->protocol() == Protocol::REDIS) {
auto cb = [&](Transaction* t, EngineShard* shard) {

View File

@ -1160,7 +1160,7 @@ bool Transaction::NotifySuspended(TxId committed_txid, ShardId sid) {
return false;
}
void Transaction::BreakOnClose() {
void Transaction::BreakOnShutdown() {
if (coordinator_state_ & COORD_BLOCKED) {
coordinator_state_ |= COORD_CANCELLED;
blocking_ec_.notify();

View File

@ -169,7 +169,7 @@ class Transaction {
// this transaction has been awakened.
bool NotifySuspended(TxId committed_ts, ShardId sid);
void BreakOnClose();
void BreakOnShutdown();
// Called by EngineShard when performing Execute over the tx queue.
// Returns true if transaction should be kept in the queue.