From b50428d1a63c02299dfd23dac0e57639efc63fb5 Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Sat, 23 Apr 2022 22:03:12 +0300 Subject: [PATCH] More work on tiered storage. 1. External allocator now allocates blocks that are multiples of 8KB and are aligned to 8KB. This allows it to be used with O_DIRECT files. 4KB was enough but since we can group small strings within a single block anyway, we can increase minimal block sizes. 2. Fix constants dependencies within external allocator. Improve code structure. 3. Limit string offloading to string sizes below 2MB. Currently we do not support external allocations of larger sizes. --- src/core/external_alloc.cc | 332 ++++++++++++++++++++------------ src/core/external_alloc.h | 32 ++- src/core/external_alloc_test.cc | 38 +++- src/core/generate_bin_sizes.py | 12 +- src/server/rdb_load.cc | 1 - src/server/rdb_save.cc | 1 - src/server/string_family.cc | 2 +- src/server/tiered_storage.cc | 20 +- 8 files changed, 281 insertions(+), 157 deletions(-) diff --git a/src/core/external_alloc.cc b/src/core/external_alloc.cc index cf07911..0304dfc 100644 --- a/src/core/external_alloc.cc +++ b/src/core/external_alloc.cc @@ -19,73 +19,84 @@ using BinIdx = uint8_t; namespace { -constexpr inline unsigned long long operator""_MB(unsigned long long x) { - return x << 20U; +constexpr inline size_t divup(size_t num, size_t div) { + return (num + div - 1) / div; } -constexpr inline unsigned long long operator""_KB(unsigned long long x) { - return x << 10U; +constexpr inline size_t wsize_from_size(size_t size) { + return divup(size, sizeof(uintptr_t)); } -constexpr size_t kMediumObjSize = 1_MB; -constexpr size_t kSmallPageShift = 20; -constexpr size_t kMediumPageShift = 23; +constexpr size_t kMinBlockSize = ExternalAllocator::kMinBlockSize; + +constexpr size_t kSmallPageShift = 21; +constexpr size_t kMediumPageShift = 24; +constexpr size_t kSmallPageSize = 1UL << kSmallPageShift; // 2MB +constexpr size_t kMediumPageSize = 1UL << kMediumPageShift;
// 16MB +constexpr size_t kMediumObjMaxSize = kMediumPageSize / 8; + constexpr size_t kSegmentAlignment = 256_MB; constexpr size_t kSegmentDefaultSize = 256_MB; -constexpr inline size_t wsize_from_size(size_t size) { - return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); -} +constexpr unsigned kNumBins = detail::kNumFreePages; +constexpr unsigned kLargeSizeBin = kNumBins - 1; +constexpr unsigned kMaxPagesInSegment = kSegmentDefaultSize / kSmallPageSize; +constexpr unsigned kSegDescrAlignment = 8_KB; -// TODO: we may want to round it up to the nearst 512 multiplier so that all the allocated -// blocks will be multipliers of 4kb. -constexpr size_t kBinLens[detail::kNumSizeBins] = { - 512, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048, 2560, 3072, - 3584, 4096, 5120, 6144, 7168, 8192, 10240, 12288, 14336, 16384, 20480, 24576, - 28672, 32768, 40960, 49152, 57344, 65536, 81920, 98304, 114688, 131072, UINT64_MAX}; +constexpr size_t kBinWordLens[kNumBins] = { + 1024, 1024 * 2, 1024 * 3, 4096, 5120, 6144, 7168, 8192, 10240, 12288, + 14336, 16384, 20480, 24576, 28672, 32768, 40960, 49152, 57344, 65536, + 81920, 98304, 114688, 131072, 163840, 196608, 229376, 262144, UINT64_MAX}; -static_assert(kBinLens[detail::kLargeSizeBin] == UINT64_MAX); +static_assert(kBinWordLens[kLargeSizeBin - 1] * 8 == kMediumObjMaxSize); +static_assert(kBinWordLens[kLargeSizeBin] == UINT64_MAX); constexpr inline BinIdx ToBinIdx(size_t size) { + // first 4 bins are multiples of kMinBlockSize. + if (size < ExternalAllocator::kMinBlockSize * 4) { + return size <= ExternalAllocator::kMinBlockSize ? 0 + : (size - 1) / ExternalAllocator::kMinBlockSize; + } + + if (size > kMediumObjMaxSize) { + return kLargeSizeBin; + } + size_t wsize = wsize_from_size(size); - if (wsize <= 512) { - return 1; - } - - if (wsize > kMediumObjSize) { - return detail::kLargeSizeBin; - } - // to correct rounding up of size to words that the last word will be within the range.
--wsize; // find the highest bit uint8_t b = 63 - __builtin_clzl(wsize); - return (b << 2) + ((wsize >> (b - 2)) & 3) - 34; + return (b << 2) + ((wsize >> (b - 2)) & 3) - 44; } -static_assert(ToBinIdx(4096) == 1); -static_assert(ToBinIdx(4097) == 2); -static_assert(ToBinIdx(5120) == 2); -static_assert(ToBinIdx(5121) == 3); -static_assert(ToBinIdx(6144) == 3); -static_assert(ToBinIdx(6145) == 4); +static_assert(ToBinIdx(kMinBlockSize) == 0); +static_assert(ToBinIdx(kMinBlockSize * 2) == 1); +static_assert(ToBinIdx(kMinBlockSize * 3) == 2); +static_assert(ToBinIdx(kMinBlockSize * 4) == 3); +static_assert(ToBinIdx(kMinBlockSize * 5) == 4); +static_assert(ToBinIdx(kMinBlockSize * 6) == 5); +static_assert(ToBinIdx(kMinBlockSize * 6 + 1) == 6); +static_assert(ToBinIdx(kMinBlockSize * 7) == 6); -PageClass ClassFromBlockSize(size_t sz) { - if (sz <= 128_KB) +// we preserve 8:1 ratio, i.e. each page can host at least 8 blocks within its class. +PageClass ClassFromSize(size_t size) { + if (size <= kSmallPageSize / 8) return PageClass::SMALL_P; - if (sz <= 1_MB) + if (size <= kMediumPageSize / 8) return PageClass::MEDIUM_P; return PageClass::LARGE_P; } size_t ToBlockSize(BinIdx idx) { - return kBinLens[idx] * 8; + return kBinWordLens[idx] * 8; } -unsigned NumPagesInClass(PageClass pc) { +// num pages in a segment of that class. +unsigned NumPagesInSegment(PageClass pc) { switch (pc) { case PageClass::SMALL_P: return kSegmentDefaultSize >> kSmallPageShift; @@ -93,35 +104,35 @@ unsigned NumPagesInClass(PageClass pc) { return kSegmentDefaultSize >> kMediumPageShift; break; case PageClass::LARGE_P: - DLOG(FATAL) << "TBD"; + return 1; break; } + // unreachable. return 0; } -} // namespace +}; // namespace /* - block 4Kb or more, page - 1MB (256 blocks) or bigger. + block 8Kb or more, page - 2MB (256 blocks) or bigger. Block sizes grow exponentially - by factor ~1.25. See MI_PAGE_QUEUES_EMPTY definition for sizes example. 
*/ namespace detail { + struct Page { std::bitset<256> free_blocks; // bitmask of free blocks (32 bytes). uint8_t id; // index inside the Segment.pages array. - // need some mapping function to map from block_size to real_block_size given Page class. + // need some mapping function to map from block_size to real_block_size given Page class. BinIdx block_size_bin; uint8_t segment_inuse : 1; // true if segment allocated this page. - uint8_t reserved1; + uint8_t reserved[3]; - // number of available free blocks. Note: we could get rid of this field and use - // free_blocks.count instead. - uint16_t available; - uint8_t reserved2[2]; + // can be computed via free_blocks.count(). + uint16_t available; // in number of blocks. // We can not use c'tor because we use the trick in segment where we allocate more pages // than SegmentDescr declares. @@ -132,23 +143,29 @@ struct Page { id = new_id; } - void Init(PageClass pc, BinIdx bin_id) { - DCHECK_EQ(available, 0); - DCHECK(segment_inuse); - - size_t page_size = 1UL << (pc == PageClass::SMALL_P ? kSmallPageShift : kMediumPageShift); - - block_size_bin = bin_id; - available = page_size / ToBlockSize(bin_id); - - free_blocks.reset(); - for (unsigned i = 0; i < available; ++i) { - free_blocks.set(i, true); - } - } + void Init(PageClass pc, BinIdx bin_id); }; -static_assert(sizeof(std::bitset<256>) == 32); +static_assert(sizeof(Page) * kMaxPagesInSegment + 128 < kSegDescrAlignment); + +void Page::Init(PageClass pc, BinIdx bin_id) { + DCHECK_EQ(available, 0); + DCHECK(segment_inuse); + + block_size_bin = bin_id; + if (pc == PageClass::LARGE_P) { + available = 1; + } else { + size_t page_size = (pc == PageClass::SMALL_P) ? kSmallPageSize : kMediumPageSize; + available = page_size / ToBlockSize(bin_id); + } + + free_blocks.reset(); + for (unsigned i = 0; i < available; ++i) { + free_blocks.set(i, true); + } +} + } // namespace detail // @@ -165,7 +182,7 @@ static_assert(sizeof(std::bitset<256>) == 32); * page_start. 
segment.pages[page_id].block_size gives us the block size and that in turn gives us * block id within the page. We can also know block_size if the originally allocated size is provided by using round_up function that was used to allocate the block. - * SegmentDescr be aligned by 16KB boundaries - ToSegDescr relies on that. + * SegmentDescr must be aligned to kSegDescrAlignment boundaries - ToSegDescr relies on that. */ class ExternalAllocator::SegmentDescr { SegmentDescr(const SegmentDescr&) = delete; @@ -175,30 +192,33 @@ class ExternalAllocator::SegmentDescr { public: explicit SegmentDescr(PageClass pc, size_t offs, uint16_t capacity); - Page* FindPageSegment(); + Page* FindPageSegment() { + return pi_.FindPageSegment(); + } Page* GetPage(unsigned i) { - return pages_ + i; + return pi_.pages + i; } size_t BlockOffset(const Page* page, unsigned blockpos) { - return offset_ + page->id * (1 << page_shift_) + ToBlockSize(page->block_size_bin) * blockpos; + return offset_ + page->id * (1 << pi_.page_shift) + + ToBlockSize(page->block_size_bin) * blockpos; } bool HasFreePages() const { - return capacity_ > used_; + return pi_.capacity > pi_.used; } unsigned capacity() const { - return capacity_; + return pi_.capacity; } unsigned used() const { - return used_; + return pi_.used; } unsigned page_shift() const { - return page_shift_; + return pi_.page_shift; } PageClass page_class() const { @@ -207,48 +227,89 @@ class ExternalAllocator::SegmentDescr { SegmentDescr *next, *prev; + // Links seg before this. + void LinkBefore(SegmentDescr* seg) { + seg->next = this; + seg->prev = prev; + this->prev->next = seg; + this->prev = seg; + } + + // detaches this from the circular list. + // returns next if the list has more than 1 element + // returns null otherwise.
+ SegmentDescr* Detach() { + if (next == this) + return nullptr; + + next->prev = prev; + prev->next = next; + + SegmentDescr* res = next; + next = prev = this; + return res; + } + private: - uint64_t offset_; - uint16_t capacity_, used_; - + uint64_t offset_; // size_ - relevant for large segments. PageClass page_class_; - uint8_t page_shift_; - Page pages_[1]; // must be the last field. Can be 1-256 pages. + struct PageInfo { + uint16_t capacity, used; // in number of pages. + uint8_t page_shift; + Page pages[0]; // must be the last field. Can be 1-256 pages. + + PageInfo(uint16_t c) : capacity(c), used(0), page_shift(0) { + } + + auto FindPageSegment() -> Page* { + for (uint32_t i = 0; i < capacity; ++i) { + if (!pages[i].segment_inuse) { + pages[i].segment_inuse = 1; + ++used; + return pages + i; + } + } + + LOG(DFATAL) << "Should not reach here"; + + return nullptr; + } + }; + + struct LargeInfo { + size_t seg_size; + }; + + union { + PageInfo pi_; + LargeInfo li_; + }; }; ExternalAllocator::SegmentDescr::SegmentDescr(PageClass pc, size_t offs, uint16_t capacity) - : offset_(offs), capacity_(capacity), used_(0), page_class_(pc), page_shift_(kSmallPageShift) { + : offset_(offs), page_class_(pc), pi_(capacity) { + constexpr size_t kDescrSize = sizeof(SegmentDescr); + (void)kDescrSize; + next = prev = this; DCHECK(pc != PageClass::LARGE_P); if (pc == PageClass::MEDIUM_P) - page_shift_ = kMediumPageShift; + pi_.page_shift = kMediumPageShift; + else + pi_.page_shift = kSmallPageShift; for (unsigned i = 0; i < capacity; ++i) { - pages_[i].Reset(i); + pi_.pages[i].Reset(i); } } -auto ExternalAllocator::SegmentDescr::FindPageSegment() -> Page* { - for (uint32_t i = 0; i < capacity_; ++i) { - if (!pages_[i].segment_inuse) { - pages_[i].segment_inuse = 1; - ++used_; - return pages_ + i; - } - } - - LOG(DFATAL) << "Should not reach here"; - - return nullptr; -} - static detail::Page empty_page; ExternalAllocator::ExternalAllocator() { - std::fill(sq_, sq_ + 3, nullptr); 
- std::fill(free_pages_, free_pages_ + detail::kNumSizeBins, &empty_page); + std::fill(sq_, sq_ + ABSL_ARRAYSIZE(sq_), nullptr); + std::fill(free_pages_, free_pages_ + detail::kNumFreePages, &empty_page); } ExternalAllocator::~ExternalAllocator() { @@ -262,16 +323,21 @@ int64_t ExternalAllocator::Malloc(size_t sz) { Page* page = free_pages_[bin_idx]; if (page->available == 0) { // empty page. - PageClass pc = ClassFromBlockSize(sz); - CHECK_NE(pc, PageClass::LARGE_P) << "not supported, TBD"; + PageClass pc = ClassFromSize(sz); - size_t seg_size = 0; - page = FindPage(pc, &seg_size); - if (!page) - return -int64_t(seg_size); + if (pc == PageClass::LARGE_P) { + size_t req_seg_size = 0; + page = FindLargePage(sz, &req_seg_size); + if (!page) + return -int64_t(req_seg_size); + } else { + page = FindPage(pc); + if (!page) + return -int64_t(kSegmentDefaultSize); + free_pages_[bin_idx] = page; + } page->Init(pc, bin_idx); - free_pages_[bin_idx] = page; } DCHECK(page->available); @@ -322,16 +388,16 @@ void ExternalAllocator::AddStorage(size_t offset, size_t size) { size_t idx = offset / 256_MB; CHECK_LE(segments_.size(), idx); - auto [it, added] = added_segs_.emplace(offset, size); + auto [it, added] = segm_intervals_.emplace(offset, size); CHECK(added); - if (it != added_segs_.begin()) { + if (it != segm_intervals_.begin()) { auto prev = it; --prev; CHECK_LE(prev->first + prev->second, offset); } auto next = it; ++next; - if (next != added_segs_.end()) { + if (next != segm_intervals_.end()) { CHECK_LE(offset + size, next->first); } @@ -340,7 +406,19 @@ void ExternalAllocator::AddStorage(size_t offset, size_t size) { size_t ExternalAllocator::GoodSize(size_t sz) { uint8_t bin_idx = ToBinIdx(sz); - return ToBlockSize(bin_idx); + if (bin_idx < kLargeSizeBin) + return ToBlockSize(bin_idx); + + return divup(sz, 4_KB) * 4_KB; +} + +detail::PageClass ExternalAllocator::PageClassFromOffset(size_t offset) const { + size_t idx = offset / 256_MB; + CHECK_LT(idx, segments_.size()); + 
CHECK(segments_[idx]); + + SegmentDescr* seg = segments_[idx]; + return seg->page_class(); } /** @@ -356,7 +434,7 @@ size_t ExternalAllocator::GoodSize(size_t sz) { */ // private functions -auto ExternalAllocator::FindPage(PageClass pc, size_t* seg_size) -> Page* { +auto ExternalAllocator::FindPage(PageClass pc) -> Page* { DCHECK_NE(pc, PageClass::LARGE_P); SegmentDescr* seg = sq_[pc]; @@ -367,33 +445,28 @@ auto ExternalAllocator::FindPage(PageClass pc, size_t* seg_size) -> Page* { } // remove head. - SegmentDescr* next = seg->next; - if (next == seg->prev) { - sq_[pc] = nullptr; - DCHECK(next == seg); + SegmentDescr* next = seg->Detach(); + sq_[pc] = next; + if (next == nullptr) { break; } - - sq_[pc] = next; - next->prev = seg->prev; - seg->prev->next = next; - seg->next = seg->prev = seg; seg = next; } } - if (!added_segs_.empty()) { - unsigned num_pages = NumPagesInClass(pc); + if (!segm_intervals_.empty()) { + unsigned num_pages = NumPagesInSegment(pc); - auto it = added_segs_.begin(); + auto it = segm_intervals_.begin(); size_t seg_idx = it->first / kSegmentAlignment; CHECK_LE(segments_.size(), seg_idx); segments_.resize(seg_idx + 1); - void* ptr = mi_malloc_aligned(sizeof(SegmentDescr) + (num_pages - 1) * sizeof(Page), 16_KB); + void* ptr = + mi_malloc_aligned(sizeof(SegmentDescr) + num_pages * sizeof(Page), kSegDescrAlignment); SegmentDescr* seg = new (ptr) SegmentDescr(pc, it->first, num_pages); segments_[seg_idx] = seg; - added_segs_.erase(it); + segm_intervals_.erase(it); DCHECK(sq_[pc] == NULL); DCHECK(seg->next == seg->prev && seg == seg->next); @@ -402,7 +475,14 @@ auto ExternalAllocator::FindPage(PageClass pc, size_t* seg_size) -> Page* { return seg->FindPageSegment(); } - *seg_size = kSegmentDefaultSize; + return nullptr; +} + +auto ExternalAllocator::FindLargePage(size_t size, size_t* segment_size) -> Page* { + LOG(FATAL) << "TBD"; + // size_t aligned_blocks = divup(size, 4_KB); + // size_t offset = GetLargeInterval(aligned_blocks); + // return 
nullptr; } @@ -429,19 +509,17 @@ void ExternalAllocator::FreePage(Page* page, SegmentDescr* owner, size_t block_s if (sq == nullptr) { sq = owner; } else { - SegmentDescr* last = sq->prev; - last->next = owner; - owner->prev = last; - owner->next = sq; - sq->prev = owner; + sq->LinkBefore(owner); } } - --owner->used_; + --owner->pi_.used; } inline auto ExternalAllocator::ToSegDescr(Page* page) -> SegmentDescr* { uintptr_t ptr = (uintptr_t)page; - uintptr_t seg_ptr = ptr & ~uintptr_t(16_KB - 1); // align to 16KB boundary. + + // find SegDescr boundary. + uintptr_t seg_ptr = ptr & ~uintptr_t(kSegDescrAlignment - 1); SegmentDescr* res = reinterpret_cast(seg_ptr); DCHECK(res->GetPage(page->id) == page); diff --git a/src/core/external_alloc.h b/src/core/external_alloc.h index 8db5a2d..ce53bea 100644 --- a/src/core/external_alloc.h +++ b/src/core/external_alloc.h @@ -11,6 +11,14 @@ namespace dfly { +constexpr inline unsigned long long operator""_MB(unsigned long long x) { + return x << 20U; +} + +constexpr inline unsigned long long operator""_KB(unsigned long long x) { + return x << 10U; +} + /** * * An external allocator inspired by mimalloc. Its goal is to maintain a state machine for @@ -26,12 +34,11 @@ namespace dfly { namespace detail { class Page; -constexpr unsigned kLargeSizeBin = 34; -constexpr unsigned kNumSizeBins = kLargeSizeBin + 1; +constexpr unsigned kNumFreePages = 29; /** - * pages classes can be SMALL, MEDIUM or LARGE. SMALL (1MB) for block sizes upto 128KB. - * MEDIUM (8MB) for block sizes upto 1MB. LARGE - blocks larger than 1MB. + * pages classes can be SMALL, MEDIUM or LARGE. SMALL (2MB) for block sizes upto 256KB. + * MEDIUM (16MB) for block sizes 256KB-2MB. Anything else is LARGE. 
* */ enum PageClass : uint8_t { @@ -48,6 +55,7 @@ class ExternalAllocator { public: static constexpr size_t kExtAlignment = 1ULL << 28; // 256 MB + static constexpr size_t kMinBlockSize = 1 << 13; // 8KB ExternalAllocator(); ~ExternalAllocator(); @@ -78,25 +86,33 @@ class ExternalAllocator { return allocated_bytes_; } + // accessors useful for tests. + detail::PageClass PageClassFromOffset(size_t offset) const; + private: class SegmentDescr; using Page = detail::Page; - Page* FindPage(detail::PageClass sc, size_t* seg_size); + // Returns a page if there is a segment of that class. + // Returns NULL if no page is found. + Page* FindPage(detail::PageClass sc); + + Page* FindLargePage(size_t size, size_t* segment_size); SegmentDescr* GetNewSegment(detail::PageClass sc); void FreePage(Page* page, SegmentDescr* owner, size_t block_size); static SegmentDescr* ToSegDescr(Page*); - SegmentDescr* sq_[3]; // map: PageClass -> free Segment. - Page* free_pages_[detail::kNumSizeBins]; + SegmentDescr* sq_[2]; // map: PageClass -> free Segment. + Page* free_pages_[detail::kNumFreePages]; // A segment for each 256MB range. To get a segment id from the offset, shift right by 28. std::vector segments_; // weird queue to support AddStorage interface. We can not instantiate segment // until we know its class and that we know only when a page is demanded. - absl::btree_map added_segs_; + // sorted map of offset -> size. + absl::btree_map segm_intervals_; size_t capacity_ = 0; // in bytes. 
size_t allocated_bytes_ = 0; diff --git a/src/core/external_alloc_test.cc b/src/core/external_alloc_test.cc index 7b648be..ab00d74 100644 --- a/src/core/external_alloc_test.cc +++ b/src/core/external_alloc_test.cc @@ -22,7 +22,7 @@ class ExternalAllocatorTest : public ::testing::Test { ExternalAllocator ext_alloc_; }; -constexpr int64_t kSegSize = 1 << 28; +constexpr int64_t kSegSize = 256_MB; std::map AllocateFully(ExternalAllocator* alloc) { std::map ranges; @@ -30,7 +30,7 @@ std::map AllocateFully(ExternalAllocator* alloc) { int64_t res = 0; while (res >= 0) { for (unsigned j = 1; j < 5; ++j) { - size_t sz = 4000 * j; + size_t sz = 8000 * j; res = alloc->Malloc(sz); if (res < 0) break; @@ -43,21 +43,24 @@ std::map AllocateFully(ExternalAllocator* alloc) { return ranges; } +constexpr size_t kMinBlockSize = ExternalAllocator::kMinBlockSize; + TEST_F(ExternalAllocatorTest, Basic) { int64_t res = ext_alloc_.Malloc(128); EXPECT_EQ(-kSegSize, res); ext_alloc_.AddStorage(0, kSegSize); - EXPECT_EQ(0, ext_alloc_.Malloc(4000)); - EXPECT_EQ(4096, ext_alloc_.Malloc(4096)); - EXPECT_EQ(1048576, ext_alloc_.Malloc(8192)); // another page. + EXPECT_EQ(0, ext_alloc_.Malloc(4000)); // page0: 1 + EXPECT_EQ(kMinBlockSize, ext_alloc_.Malloc(4_KB)); // page0: 2 + size_t offset2 = ext_alloc_.Malloc(8193); // page1: 1 + EXPECT_GT(offset2, 1_MB); // another page. - ext_alloc_.Free(1048576, 8192); // should return the page to the segment. - EXPECT_EQ(1048576, ext_alloc_.Malloc(1 << 14)); // another page. + ext_alloc_.Free(offset2, 8193); // should return the page to the segment. + EXPECT_EQ(offset2, ext_alloc_.Malloc(16_KB)); // another page. 
page1: 1 - ext_alloc_.Free(0, 4000); - ext_alloc_.Free(4096, 4096); - EXPECT_EQ(0, ext_alloc_.Malloc(4097)); + ext_alloc_.Free(0, 4000); // page0: 1 + ext_alloc_.Free(kMinBlockSize, 4_KB); // page0: 0 + EXPECT_EQ(0, ext_alloc_.Malloc(8_KB)); // page0 } TEST_F(ExternalAllocatorTest, Invariants) { @@ -83,4 +86,19 @@ TEST_F(ExternalAllocatorTest, Invariants) { } } +TEST_F(ExternalAllocatorTest, Classes) { + ext_alloc_.AddStorage(0, kSegSize); + off_t offs1 = ext_alloc_.Malloc(256_KB); + EXPECT_EQ(detail::SMALL_P, ext_alloc_.PageClassFromOffset(offs1)); + off_t offs2 = ext_alloc_.Malloc(256_KB + 1); + EXPECT_EQ(offs2, -kSegSize); + + ext_alloc_.AddStorage(kSegSize, kSegSize); + offs2 = ext_alloc_.Malloc(256_KB + 1); + EXPECT_EQ(detail::MEDIUM_P, ext_alloc_.PageClassFromOffset(offs2)); + off_t offs3 = ext_alloc_.Malloc(2_MB); + EXPECT_EQ(detail::MEDIUM_P, ext_alloc_.PageClassFromOffset(offs3)); + EXPECT_EQ(2_MB, ExternalAllocator::GoodSize(2_MB)); +} + } // namespace dfly \ No newline at end of file diff --git a/src/core/generate_bin_sizes.py b/src/core/generate_bin_sizes.py index abe5c48..80c76e5 100755 --- a/src/core/generate_bin_sizes.py +++ b/src/core/generate_bin_sizes.py @@ -4,18 +4,24 @@ import argparse import random from array import array +# We print in 64 bit words. 
+ALIGN = 1 << 10 # 1KB alignment + + def main(): parser = argparse.ArgumentParser(description='') parser.add_argument('-n', type=int, dest='num', - help='number of numbers', default=9) + help='number of quadruplets', default=9) args = parser.parse_args() - size = 512 - print ('{512, ', end=' ') + size = 4096 + print ('{1024, 1024*2, 1024*3, ', end=' ') + # print ('{', end=' ') for i in range(args.num): incr = size // 4 for j in range(4): + assert size % 1024 == 0, size print (f'{size}, ', end=' ') size += incr if i % 2 == 1: diff --git a/src/server/rdb_load.cc b/src/server/rdb_load.cc index cd67463..17ea30d 100644 --- a/src/server/rdb_load.cc +++ b/src/server/rdb_load.cc @@ -37,7 +37,6 @@ using base::IoBuf; using nonstd::make_unexpected; using namespace util; using rdb::errc; -using facade::operator""_KB; namespace { class error_category : public std::error_category { diff --git a/src/server/rdb_save.cc b/src/server/rdb_save.cc index 27f4230..43b578c 100644 --- a/src/server/rdb_save.cc +++ b/src/server/rdb_save.cc @@ -30,7 +30,6 @@ using namespace std; using base::IoBuf; using io::Bytes; using nonstd::make_unexpected; -using facade::operator""_KB; namespace { diff --git a/src/server/string_family.cc b/src/server/string_family.cc index 95949f2..c6cef39 100644 --- a/src/server/string_family.cc +++ b/src/server/string_family.cc @@ -101,7 +101,7 @@ OpResult SetCmd::Set(const SetParams& params, std::string_view key, std::s EngineShard* shard = db_slice_->shard_owner(); if (shard->tiered_storage()) { // external storage enabled. 
- if (value.size() >= 64) { + if (value.size() >= 64 && value.size() < 2_MB) { shard->tiered_storage()->UnloadItem(params.db_index, it); } } diff --git a/src/server/tiered_storage.cc b/src/server/tiered_storage.cc index acf7b5c..b9bfbd4 100644 --- a/src/server/tiered_storage.cc +++ b/src/server/tiered_storage.cc @@ -165,7 +165,7 @@ error_code TieredStorage::UnloadItem(DbIndex db_index, PrimeIterator it) { db->pending_upload[it.bucket_cursor().value()] += blob_len; size_t grow_size = 0; - if (!io_mgr_.grow_pending() && pending_unload_bytes_ > 4080) { + if (!io_mgr_.grow_pending() && pending_unload_bytes_ >= ExternalAllocator::kMinBlockSize) { grow_size = SerializePendingItems(); } @@ -198,7 +198,8 @@ size_t TieredStorage::SerializePendingItems() { constexpr size_t kArrLen = 64; PrimeTable::iterator iters[kArrLen]; - unsigned count = 0; + unsigned iter_count = 0; + bool break_early = false; auto is_good = [](const PrimeValue& pv) { return pv.ObjType() == OBJ_STRING && !pv.IsExternal() && pv.Size() >= 64 && !pv.HasIoPending(); @@ -206,8 +207,8 @@ size_t TieredStorage::SerializePendingItems() { auto tr_cb = [&](PrimeTable::iterator it) { if (is_good(it->second)) { - CHECK_LT(count, kArrLen); - iters[count++] = it; + CHECK_LT(iter_count, kArrLen); + iters[iter_count++] = it; } }; @@ -234,7 +235,7 @@ size_t TieredStorage::SerializePendingItems() { PrimeTable::cursor curs(cursor_val); db_slice_.GetTables(db_ind).first->Traverse(curs, tr_cb); - for (unsigned j = 0; j < count; ++j) { + for (unsigned j = 0; j < iter_count; ++j) { PrimeIterator it = iters[j]; size_t item_size = it->second.Size(); DCHECK_GT(item_size, 0u); @@ -249,6 +250,11 @@ size_t TieredStorage::SerializePendingItems() { open_block_size = 0; } + if (pending_unload_bytes_ < unsigned(0.8 * ExternalAllocator::kMinBlockSize)) { + break_early = true; + break; + } + DCHECK_EQ(0u, open_block_size); int64_t res = alloc_.Malloc(item_size); if (res < 0) { @@ -275,10 +281,12 @@ size_t 
TieredStorage::SerializePendingItems() { block_offset += item_size; // saved into opened block. pending_unload_bytes_ -= item_size; } - count = 0; + iter_count = 0; db->pending_upload.erase(cursor_val); } // sorted_cursors + if (break_early) + break; DCHECK(db->pending_upload.empty()); } // db_arr