// Copyright 2022, Roman Gershman.  All rights reserved.
// See LICENSE for licensing terms.
//

#include "server/engine_shard_set.h"

extern "C" {
#include "redis/zmalloc.h"
}

#include "base/logging.h"
#include "server/transaction.h"
#include "util/fiber_sched_algo.h"
#include "util/varz.h"

namespace dfly {

using namespace std;
using namespace util;
namespace this_fiber = ::boost::this_fiber;
namespace fibers = ::boost::fibers;

thread_local EngineShard* EngineShard::shard_ = nullptr;
constexpr size_t kQueueLen = 64;
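
// A single entry in a key's watch queue. The intrusive_ptr keeps the blocked
// transaction alive for as long as it sits in the queue.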
struct WatchItem {
  ::boost::intrusive_ptr<Transaction> trans;

  WatchItem(Transaction* t) : trans(t) {
  }
};
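
// Per-key FIFO of transactions blocked on that key. A queue starts SUSPENDED
// and becomes ACTIVE once one of its transactions has been notified
// (see NotifyWatchQueue below).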
struct EngineShard::WatchQueue {
  deque<WatchItem> items;
  TxId notify_txid = UINT64_MAX;

  // Updated by both coordinator and shard threads but at different times.
  enum State { SUSPENDED, ACTIVE } state = SUSPENDED;

  void Suspend() {
    state = SUSPENDED;
    notify_txid = UINT64_MAX;
  }
};

bool EngineShard::DbWatchTable::RemoveEntry(WatchQueueMap::iterator it) {
  DVLOG(1) << "Erasing watchqueue key " << it->first;

  awakened_keys.erase(it->first);
  queue_map.erase(it);

  return queue_map.empty();
}
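
// Each shard runs a dedicated fiber that drains queue_, so tasks posted to the
// shard execute serially on its thread.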
EngineShard::EngineShard(util::ProactorBase* pb, bool update_db_time, mi_heap_t* heap)
    : queue_(kQueueLen), txq_([](const Transaction* t) { return t->txid(); }), mi_resource_(heap),
      db_slice_(pb->GetIndex(), this) {
  fiber_q_ = fibers::fiber([this, index = pb->GetIndex()] {
    this_fiber::properties<FiberProps>().set_name(absl::StrCat("shard_queue", index));
    queue_.Run();
  });
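
  // Assuming AddPeriodic() takes its period in milliseconds, the expire clock
  // below is refreshed roughly once per millisecond.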
  if (update_db_time) {
    periodic_task_ = pb->AddPeriodic(1, [] {
      auto* shard = EngineShard::tlocal();
      DCHECK(shard);
      // absl::GetCurrentTimeNanos() returns the current time since the Unix Epoch.
      shard->db_slice().UpdateExpireClock(absl::GetCurrentTimeNanos() / 1000000);
    });
  }

  tmp_str = sdsempty();
}

EngineShard::~EngineShard() {
  queue_.Shutdown();
  fiber_q_.join();
  sdsfree(tmp_str);
  if (periodic_task_) {
    ProactorBase::me()->CancelPeriodic(periodic_task_);
  }
}

void EngineShard::InitThreadLocal(ProactorBase* pb, bool update_db_time) {
  CHECK(shard_ == nullptr) << pb->GetIndex();

  init_zmalloc_threadlocal();
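
  // Construct the shard in place inside the thread's mimalloc backing heap so
  // that its memory is attributed to this thread. It is destroyed manually in
  // DestroyThreadLocal() below.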
  mi_heap_t* tlh = mi_heap_get_backing();
  void* ptr = mi_heap_malloc_aligned(tlh, sizeof(EngineShard), alignof(EngineShard));
  shard_ = new (ptr) EngineShard(pb, update_db_time, tlh);
}

void EngineShard::DestroyThreadLocal() {
  if (!shard_)
    return;

  uint32_t index = shard_->db_slice_.shard_id();
  shard_->~EngineShard();
  mi_free(shard_);
  shard_ = nullptr;

  VLOG(1) << "Shard reset " << index;
}

// Called by Transaction::ExecuteAsync() in order to run transaction tasks.
// Only runs in its own thread.
void EngineShard::PollExecution(const char* context, Transaction* trans) {
  DVLOG(1) << "PollExecution " << context << " " << (trans ? trans->DebugId() : "");
  ShardId sid = shard_id();

  uint16_t trans_mask = trans ? trans->GetLocalMask(sid) : 0;
  if (trans_mask & Transaction::AWAKED_Q) {
    DCHECK(continuation_trans_ == nullptr);

    CHECK_EQ(committed_txid_, trans->notify_txid()) << "TBD";
    bool keep = trans->RunInShard(this);
    if (keep)
      return;
  }

  if (continuation_trans_) {
    if (trans == continuation_trans_)
      trans = nullptr;

    if (continuation_trans_->IsArmedInShard(sid)) {
      bool to_keep = continuation_trans_->RunInShard(this);
      DVLOG(1) << "RunContTrans: " << continuation_trans_->DebugId() << " keep: " << to_keep;
      if (!to_keep) {
        continuation_trans_ = nullptr;
        OnTxFinish();
      }
    }
  }

  bool has_awaked_trans = HasAwakedTransaction();
  Transaction* head = nullptr;
  string dbg_id;

  if (continuation_trans_ == nullptr && !has_awaked_trans) {
    while (!txq_.Empty()) {
      auto val = txq_.Front();
      head = absl::get<Transaction*>(val);

      // The fact that Tx is in the queue already means that the coordinator fiber will
      // not progress; hence it's enough to test for run_count and check the local_mask.
      bool is_armed = head->IsArmedInShard(sid);
      if (!is_armed)
        break;

      // It could be that head is processed and unblocks a multi-hop transaction.
      // The transaction will schedule again and will arm another callback.
      // Then we would reach an invalid state by running trans after this loop,
      // which is not what we want.
      // This function should not process two different callbacks for the same transaction.
      // Hence we make sure to reset trans if it has been processed via the tx-queue.
      if (head == trans)
        trans = nullptr;
      TxId txid = head->txid();

      // committed_txid_ is strictly increasing when processed via TxQueue.
      DCHECK_LT(committed_txid_, txid);

      // We update committed_txid_ before calling RunInShard() to avoid cases where
      // a transaction stalls the execution with IO while another fiber queries this shard for
      // committed_txid_ (for example, during scheduling).
      committed_txid_ = txid;
      if (VLOG_IS_ON(2)) {
        dbg_id = head->DebugId();
      }

      bool keep = head->RunInShard(this);
      // We should not access head from this point on, since the RunInShard() callback
      // decrements its refcount.
      DLOG_IF(INFO, !dbg_id.empty()) << "RunHead " << dbg_id << ", keep " << keep;

      if (keep) {
        continuation_trans_ = head;
        break;
      }

      OnTxFinish();
    }  // while(!txq_.Empty())
  } else {  // if (continuation_trans_ == nullptr && !has_awaked_trans)
    DVLOG(1) << "Skipped TxQueue " << continuation_trans_ << " " << has_awaked_trans;
  }

  // For SUSPENDED_Q - if the transaction has not been notified, it will still be
  // in the watch queue. We need to unblock its Execute() call by running a no-op.
  if (trans_mask & Transaction::SUSPENDED_Q) {
    TxId notify_txid = trans->notify_txid();
    DCHECK(HasResultConverged(notify_txid));
    trans->RunNoop(this);
    return;
  }

  // If trans is out of order, i.e. it locks keys that previous transactions have not locked,
  // then it may be that other transactions touch those keys, but they are necessarily ordered
  // after trans in the queue; hence it's safe to run trans out of order.
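  // Example: if the queue holds tx A (keys {x}) ahead of tx B (keys {y}), then B may run
  // before A, because any queued transaction touching y is necessarily ordered after B.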
  if (trans && (trans_mask & Transaction::OUT_OF_ORDER)) {
    DCHECK(trans != head);
    DCHECK(!trans->IsMulti());  // Multi and global transactions can not be OOO.
    DCHECK(trans_mask & Transaction::ARMED);

    dbg_id.clear();

    if (VLOG_IS_ON(1)) {
      dbg_id = trans->DebugId();
    }
    ++stats_.ooo_runs;

    bool keep = trans->RunInShard(this);
    DLOG_IF(INFO, !dbg_id.empty()) << "Eager run " << sid << ", " << dbg_id << ", keep " << keep;

    // Should be enforced via Schedule(). TODO: remove this check once the code is mature.
    CHECK(!keep) << "Multi-hop transactions can not be OOO.";
  }
}

// Internal function called from ProcessAwakened().
// Marks the queue as active and notifies the first transaction in the queue.
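// Transactions for which NotifySuspended() returns false are popped from the
// queue; the first one that accepts the notification is returned.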
Transaction* EngineShard::NotifyWatchQueue(WatchQueue* wq) {
  wq->state = WatchQueue::ACTIVE;

  auto& q = wq->items;
  ShardId sid = shard_id();

  do {
    const WatchItem& wi = q.front();
    Transaction* head = wi.trans.get();

    if (head->NotifySuspended(committed_txid_, sid)) {
      wq->notify_txid = committed_txid_;
      return head;
    }

    q.pop_front();
  } while (!q.empty());

  return nullptr;
}

// Processes potentially awakened keys and verifies that these are indeed
// awakened, to eliminate false positives.
// In addition, optionally removes completed_t from the watch queues.
void EngineShard::ProcessAwakened(Transaction* completed_t) {
  for (DbIndex index : awakened_indices_) {
    DbWatchTable& wt = watched_dbs_[index];

    for (auto key : wt.awakened_keys) {
      string_view sv_key = static_cast<string_view>(key);
      auto [it, exp_it] = db_slice_.FindExt(index, sv_key);  // Double-check we still have the item.
      if (!IsValid(it))
        continue;

      auto w_it = wt.queue_map.find(sv_key);
      CHECK(w_it != wt.queue_map.end());
      DVLOG(1) << "NotifyWatchQueue " << key;
      Transaction* t2 = NotifyWatchQueue(w_it->second.get());
      if (t2) {
        awakened_transactions_.insert(t2);
      }
    }
    wt.awakened_keys.clear();
  }
  awakened_indices_.clear();

  if (!completed_t)
    return;

  auto dbit = watched_dbs_.find(completed_t->db_index());
  if (dbit == watched_dbs_.end())
    return;

  DbWatchTable& wt = dbit->second;
  KeyLockArgs lock_args = completed_t->GetLockArgs(shard_id());

  for (size_t i = 0; i < lock_args.args.size(); i += lock_args.key_step) {
    string_view key = lock_args.args[i];
    auto w_it = wt.queue_map.find(key);

    if (w_it == wt.queue_map.end() || w_it->second->state != WatchQueue::ACTIVE)
      continue;

    WatchQueue& wq = *w_it->second;

    DCHECK_LE(wq.notify_txid, committed_txid_);

    auto& queue = wq.items;
    DCHECK(!queue.empty());  // Since the queue is active.

    if (queue.front().trans == completed_t) {
      queue.pop_front();

      while (!queue.empty()) {
        const WatchItem& bi = queue.front();
        Transaction* head = bi.trans.get();

        if (head->NotifySuspended(wq.notify_txid, shard_id()))
          break;
        queue.pop_front();
      }

      if (queue.empty()) {
        wt.RemoveEntry(w_it);
      }
    }
  }

  if (wt.queue_map.empty()) {
    watched_dbs_.erase(dbit);
  }
  awakened_transactions_.erase(completed_t);
}
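
// Registers `me` as blocked on `key` in this shard, creating the key's watch
// queue on first use.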
void EngineShard::AddWatched(string_view key, Transaction* me) {
  DbWatchTable& wt = watched_dbs_[me->db_index()];
  auto [res, inserted] = wt.queue_map.emplace(key, nullptr);
  if (inserted) {
    res->second.reset(new WatchQueue);
  }

  res->second->items.emplace_back(me);
}

// Runs in O(N) time, where N is the length of the key's watch queue.
bool EngineShard::RemovedWatched(string_view key, Transaction* me) {
  auto dbit = watched_dbs_.find(me->db_index());
  CHECK(dbit != watched_dbs_.end());

  DbWatchTable& wt = dbit->second;
  auto watch_it = wt.queue_map.find(key);
  CHECK(watch_it != wt.queue_map.end());

  WatchQueue& wq = *watch_it->second;
  for (auto j = wq.items.begin(); j != wq.items.end(); ++j) {
    if (j->trans == me) {
      wq.items.erase(j);
      if (wq.items.empty()) {
        if (wt.RemoveEntry(watch_it)) {
          watched_dbs_.erase(dbit);
        }
      }
      return true;
    }
  }

  LOG(FATAL) << "should not happen";

  return false;
}
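
// Pops transactions marked with EXPIRED_Q from the front of each relevant watch
// queue, erasing queues (and, eventually, the whole db entry) that become empty.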
void EngineShard::GCWatched(const KeyLockArgs& largs) {
  auto dbit = watched_dbs_.find(largs.db_index);
  CHECK(dbit != watched_dbs_.end());

  DbWatchTable& wt = dbit->second;

  for (size_t i = 0; i < largs.args.size(); i += largs.key_step) {
    string_view key = largs.args[i];
    auto watch_it = wt.queue_map.find(key);
    CHECK(watch_it != wt.queue_map.end());

    WatchQueue& wq = *watch_it->second;
    DCHECK(!wq.items.empty());
    do {
      auto local_mask = wq.items.front().trans->GetLocalMask(shard_id());
      if ((local_mask & Transaction::EXPIRED_Q) == 0) {
        break;
      }
      wq.items.pop_front();
    } while (!wq.items.empty());

    if (wq.items.empty()) {
      if (wt.RemoveEntry(watch_it)) {
        watched_dbs_.erase(dbit);
        return;
      }
    }
  }
}

// Called from commands like lpush.
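// Marks the key as awakened so that the next ProcessAwakened() call on this
// shard will attempt to notify the transactions watching it.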
void EngineShard::AwakeWatched(DbIndex db_index, string_view db_key) {
  auto it = watched_dbs_.find(db_index);
  if (it == watched_dbs_.end())
    return;

  DbWatchTable& wt = it->second;
  DCHECK(!wt.queue_map.empty());

  auto wit = wt.queue_map.find(db_key);

  if (wit == wt.queue_map.end())
    return;  // Nobody watches this key.

  string_view key = wit->first;

  // This key has already been awakened.
  if (wt.awakened_keys.find(key) != wt.awakened_keys.end())
    return;

  wt.awakened_keys.insert(wit->first);
  awakened_indices_.insert(db_index);
}
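
// Called when a multi transaction concludes: drops it as the current
// continuation and re-checks convergence waiters via OnTxFinish().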
void EngineShard::ShutdownMulti(Transaction* multi) {
  if (continuation_trans_ == multi) {
    continuation_trans_ = nullptr;
  }
  OnTxFinish();
}
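
// Registers `t` to be notified (via NotifyConvergence) once this shard's state
// has provably converged for `notifyid` (see HasResultConverged() below).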
void EngineShard::WaitForConvergence(TxId notifyid, Transaction* t) {
  DVLOG(1) << "ConvergeNotification " << t->DebugId() << " at notify " << notifyid;
  waiting_convergence_.emplace(notifyid, t);
}

void EngineShard::OnTxFinish() {
  DCHECK(continuation_trans_ == nullptr);  // By definition of OnTxFinish.

  if (waiting_convergence_.empty())
    return;

  if (txq_.Empty()) {
    for (const auto& k_v : waiting_convergence_) {
      NotifyConvergence(k_v.second);
    }
    waiting_convergence_.clear();
    return;
  }

  TxId txq_score = txq_.HeadScore();
  do {
    auto tx_waiting = waiting_convergence_.begin();

    // Instead of taking the map key, we use the up-to-date notify_txid,
    // which could have advanced in the meantime. Not important though.
    TxId notifyid = tx_waiting->second->notify_txid();
    if (notifyid > committed_txid_ && txq_score <= tx_waiting->first)
      break;
    auto nh = waiting_convergence_.extract(tx_waiting);
    NotifyConvergence(nh.mapped());
  } while (!waiting_convergence_.empty());
}

void EngineShard::NotifyConvergence(Transaction* tx) {
  LOG(FATAL) << "TBD";
}

// There are several cases that constitute proof of convergence for this shard:
// 1. txq_ is empty - anything that is going to be scheduled will be scheduled
//    with txid > notifyid.
// 2. committed_txid_ > notifyid - similarly, this shard can not affect the result with
//    timestamp notifyid.
// 3. committed_txid_ == notifyid: if a transaction is still in progress
//    (continuation_trans_ != nullptr), it can still affect the result, hence we require
//    continuation_trans_ to be null, which indicates a converged result @notifyid.
// 4. Finally, with committed_txid_ < notifyid and continuation_trans_ == nullptr,
//    we can check whether the next in line (HeadScore) is after notifyid; in that case we
//    can also conclude that the result has converged for this shard.
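// For example, committed_txid_ == 7 and notifyid == 5 converges by case 2,
// regardless of the queue contents.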
bool EngineShard::HasResultConverged(TxId notifyid) const {
  return txq_.Empty() || committed_txid_ > notifyid ||
         (continuation_trans_ == nullptr &&
          (committed_txid_ == notifyid || txq_.HeadScore() > notifyid));
}
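
// Sizes the shard-queue routing table only; the shards themselves are created
// later, one per proactor thread, via InitThreadLocal().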
void EngineShardSet::Init(uint32_t sz) {
  CHECK_EQ(0u, size());

  shard_queue_.resize(sz);
}

void EngineShardSet::InitThreadLocal(ProactorBase* pb, bool update_db_time) {
  EngineShard::InitThreadLocal(pb, update_db_time);
  EngineShard* es = EngineShard::tlocal();
  shard_queue_[es->shard_id()] = es->GetFiberQueue();
}

}  // namespace dfly