Skip to content

Commit db11be5

Browse files
maxkozlovsky and claude committed
[dual-timeline 1/N] Add secondary timeline metadata, root offset ring, and lifecycle management
Introduce the foundational infrastructure for dual-timeline support:

Types (category/mpt/detail/timeline.hpp):
- timeline_id enum (primary/secondary) and NUM_TIMELINES constant
- timeline_compaction_state struct with per-timeline compaction boundary

Metadata (category/mpt/detail/db_metadata.hpp):
- secondary_timeline_header_t carved from future_variables_unused: version_lower_bound_, next_version_, active_ flag
- Total db_metadata size unchanged (528512 bytes) for backward compatibility

UpdateAux (trie.hpp, trie.cpp, update_aux.cpp):
- Per-timeline compaction state array (timeline_[NUM_TIMELINES]) with tl(timeline_id) accessor, replacing bare member fields
- Secondary root offset ring mapped from cnv chunk 0's unused space (65536 entries, 512KB per copy)
- root_offsets_delegator parameterized on timeline_id
- timeline_active(), get_root_offset_at_version(v, tid), db_history_{min,max}_version(tid), version_is_valid_ondisk(v, tid)
- Lifecycle: activate_secondary_timeline (initializes compaction from primary), deactivate_secondary_timeline, promote_secondary_to_primary

Tests:
- db_metadata_test: layout, field offsets, zero-init semantics, read/write round-trip, memcpy survival, no overlap with consensus
- timeline_test: enum values, default construction, trivial copyability
- update_aux_test: secondary ring mapping, activate/deactivate lifecycle, ring push/read, promote, per-timeline version queries, version_is_valid_ondisk per-timeline

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f81fee5 commit db11be5

File tree

9 files changed

+1111
-67
lines changed

9 files changed

+1111
-67
lines changed

category/mpt/db_metadata_context.cpp

Lines changed: 91 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,50 @@ DbMetadataContext::DbMetadataContext(AsyncIO &io)
105105
}
106106
}
107107

108-
// Version mismatch detection
108+
// Migration: MONAD007 -> MONAD008. Old code left future_variables_unused
109+
// filled with 0xff; under the new layout those bytes overlap the newly
110+
// introduced secondary_timeline header and would spuriously mark it
111+
// active_=1. Zero the header and bump the magic. Each copy is migrated
112+
// independently: if a crash strands one copy at MONAD007, on restart the
113+
// magic-validation step above restores from the MONAD008 copy when that
114+
// side is valid, otherwise the per-copy migration loop below picks up
115+
// whichever copies still carry MONAD007. Either path converges to both
116+
// copies at MONAD008.
117+
auto const is_previous_magic = [](detail::db_metadata const *m) {
118+
return 0 == memcmp(
119+
m->magic,
120+
detail::db_metadata::PREVIOUS_MAGIC,
121+
detail::db_metadata::MAGIC_STRING_LEN);
122+
};
123+
if (is_previous_magic(copies_[0].main) ||
124+
is_previous_magic(copies_[1].main)) {
125+
if (!can_write_to_map_) {
126+
MONAD_ABORT_PRINTF(
127+
"Detected pre-dual-timeline DB (magic=%s), which requires "
128+
"writable mapping to migrate to %s. Open with write access.",
129+
detail::db_metadata::PREVIOUS_MAGIC,
130+
detail::db_metadata::MAGIC);
131+
}
132+
for (auto &copy : copies_) {
133+
auto *const m = copy.main;
134+
if (!is_previous_magic(m)) {
135+
continue;
136+
}
137+
auto const g = m->hold_dirty();
138+
memset(&m->secondary_timeline, 0, sizeof(m->secondary_timeline));
139+
std::atomic_signal_fence(std::memory_order_seq_cst);
140+
memcpy(
141+
m->magic,
142+
detail::db_metadata::MAGIC,
143+
detail::db_metadata::MAGIC_STRING_LEN);
144+
}
145+
LOG_INFO(
146+
"Migrated DB metadata from {} to {}.",
147+
detail::db_metadata::PREVIOUS_MAGIC,
148+
detail::db_metadata::MAGIC);
149+
}
150+
151+
// Version mismatch detection (for any other version that we don't migrate)
109152
constexpr unsigned magic_version_len = 3;
110153
constexpr unsigned magic_prefix_len =
111154
detail::db_metadata::MAGIC_STRING_LEN - magic_version_len;
@@ -235,6 +278,7 @@ DbMetadataContext::DbMetadataContext(AsyncIO &io)
235278
else {
236279
// Existing pool: map root offsets immediately
237280
map_root_offsets();
281+
map_secondary_root_offsets();
238282
}
239283
}
240284
#if defined(__GNUC__) && !defined(__clang__)
@@ -306,6 +350,39 @@ void DbMetadataContext::map_root_offsets()
306350
copies_[0].root_offsets.size());
307351
}
308352

353+
void DbMetadataContext::map_secondary_root_offsets()
354+
{
355+
// Map the secondary timeline's root offset ring from cnv chunk 0's
356+
// unused space, immediately after db_metadata (one copy per half-chunk).
357+
// The assertion below verifies the chosen ring size fits.
358+
static constexpr size_t SECONDARY_RING_CAPACITY = 65536; // 2^16
359+
static constexpr size_t secondary_ring_bytes =
360+
SECONDARY_RING_CAPACITY * sizeof(chunk_offset_t);
361+
362+
auto &cnv_chunk = io_->storage_pool().chunk(storage_pool::cnv, 0);
363+
auto const fdr = cnv_chunk.read_fd();
364+
auto const fdw = cnv_chunk.write_fd(0);
365+
auto const secondary_offset = round_up_align<CPU_PAGE_BITS>(db_map_size_);
366+
MONAD_ASSERT(
367+
secondary_offset + secondary_ring_bytes <= cnv_chunk.capacity() / 2);
368+
369+
for (unsigned i = 0; i < 2; i++) {
370+
auto const file_offset =
371+
fdr.second + i * (cnv_chunk.capacity() / 2) + secondary_offset;
372+
auto *ptr = ::mmap(
373+
nullptr,
374+
secondary_ring_bytes,
375+
prot_,
376+
mapflags_,
377+
(can_write_to_map_ ? fdw : fdr).first,
378+
off_t(file_offset));
379+
MONAD_ASSERT(ptr != MAP_FAILED);
380+
copies_[i].secondary_root_offsets = {
381+
start_lifetime_as<chunk_offset_t>((chunk_offset_t *)ptr),
382+
SECONDARY_RING_CAPACITY};
383+
}
384+
}
385+
309386
// Version metadata getters
310387

311388
uint64_t DbMetadataContext::get_latest_finalized_version() const noexcept
@@ -573,13 +650,18 @@ chunk_offset_t DbMetadataContext::get_root_offset_at_version(
573650

574651
DbMetadataContext::~DbMetadataContext()
575652
{
576-
// munmap root_offsets
577653
for (auto &copy : copies_) {
578654
if (copy.root_offsets.data() != nullptr) {
579655
(void)::munmap(
580656
copy.root_offsets.data(), copy.root_offsets.size_bytes());
581657
copy.root_offsets = {};
582658
}
659+
if (copy.secondary_root_offsets.data() != nullptr) {
660+
(void)::munmap(
661+
copy.secondary_root_offsets.data(),
662+
copy.secondary_root_offsets.size_bytes());
663+
copy.secondary_root_offsets = {};
664+
}
583665
}
584666
// munmap db_metadata
585667
if (copies_[0].main != nullptr) {
@@ -711,6 +793,13 @@ void DbMetadataContext::init_new_pool(
711793
detail::db_metadata::MAGIC_STRING_LEN);
712794

713795
map_root_offsets();
796+
map_secondary_root_offsets();
797+
for (auto &copy : copies_) {
798+
std::fill(
799+
copy.secondary_root_offsets.begin(),
800+
copy.secondary_root_offsets.end(),
801+
INVALID_OFFSET);
802+
}
714803
// Set history length, MUST be after root offsets are mapped
715804
if (history_len.has_value()) {
716805
update_history_length_metadata(*history_len);

category/mpt/db_metadata_context.hpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <category/core/detail/start_lifetime_as_polyfill.hpp>
2323
#include <category/mpt/config.hpp>
2424
#include <category/mpt/detail/db_metadata.hpp>
25+
#include <category/mpt/detail/timeline.hpp>
2526
#include <category/mpt/util.hpp>
2627

2728
#include <atomic>
@@ -48,6 +49,7 @@ class DbMetadataContext
4849
{
4950
detail::db_metadata *main{nullptr};
5051
std::span<chunk_offset_t> root_offsets;
52+
std::span<chunk_offset_t> secondary_root_offsets;
5153
};
5254

5355
// Construct and mmap metadata from the given AsyncIO's storage pool.
@@ -133,6 +135,28 @@ class DbMetadataContext
133135
root_offsets_chunks_.size());
134136
}
135137

138+
root_offsets_delegator(
139+
uint64_t &version_lower_bound, uint64_t &next_version,
140+
std::span<chunk_offset_t> root_offsets_chunks)
141+
: version_lower_bound_(version_lower_bound)
142+
, next_version_(next_version)
143+
, root_offsets_chunks_(root_offsets_chunks)
144+
{
145+
MONAD_ASSERT_PRINTF(
146+
root_offsets_chunks_.size() == 0 ||
147+
root_offsets_chunks_.size() ==
148+
1ULL
149+
<< (63 - std::countl_zero(
150+
root_offsets_chunks_.size())),
151+
"root offsets chunks size is %lu, not a power of 2",
152+
root_offsets_chunks_.size());
153+
}
154+
155+
bool empty() const noexcept
156+
{
157+
return root_offsets_chunks_.empty();
158+
}
159+
136160
size_t capacity() const noexcept
137161
{
138162
return root_offsets_chunks_.size();
@@ -210,6 +234,32 @@ class DbMetadataContext
210234
return root_offsets_delegator{&copies_[which]};
211235
}
212236

237+
/// Returns a root offsets delegator for timeline `tid` of metadata copy
/// `which`. The primary timeline is built from the copy itself; the
/// secondary timeline is built from the secondary_timeline header fields
/// and the copy's secondary root offsets span.
auto root_offsets(timeline_id const tid, unsigned const which = 0) const
{
    // Same delegator type as the single-argument overload returns.
    using delegator = decltype(root_offsets(0));
    auto const &copy = copies_[which];
    if (tid != timeline_id::primary) {
        auto &header = copy.main->secondary_timeline;
        return delegator{
            header.version_lower_bound_,
            header.next_version_,
            copy.secondary_root_offsets};
    }
    return delegator{&copy};
}
249+
250+
/// Returns the secondary root offsets span held by metadata copy `which`.
std::span<chunk_offset_t>
secondary_root_offsets(unsigned const which = 0) const noexcept
{
    auto const &copy = copies_[which];
    return copy.secondary_root_offsets;
}
255+
256+
void swap_root_offsets_spans() noexcept
257+
{
258+
for (auto &copy : copies_) {
259+
std::swap(copy.root_offsets, copy.secondary_root_offsets);
260+
}
261+
}
262+
213263
// Version metadata getters/setters
214264
uint64_t get_latest_finalized_version() const noexcept;
215265
void set_latest_finalized_version(uint64_t version) noexcept;
@@ -287,6 +337,7 @@ class DbMetadataContext
287337
// Map root_offsets from cnv chunks. Called by the constructor for existing
288338
// pools, and by UpdateAux::init() after writing magic for new pools.
289339
void map_root_offsets();
340+
void map_secondary_root_offsets();
290341

291342
detail::db_metadata *main_mutable(unsigned const which = 0) noexcept
292343
{

category/mpt/detail/db_metadata.hpp

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,11 @@ namespace detail
6363
// For the memory map of the first conventional chunk
6464
struct db_metadata
6565
{
66-
static constexpr char const *MAGIC = "MONAD007";
66+
static constexpr char const *MAGIC = "MONAD008";
6767
static constexpr unsigned MAGIC_STRING_LEN = 8;
68+
// Previous magic supported via on-the-fly migration (see
69+
// DbMetadataContext constructor).
70+
static constexpr char const *PREVIOUS_MAGIC = "MONAD007";
6871

6972
friend class MONAD_MPT_NAMESPACE::DbMetadataContext;
7073
friend class MONAD_MPT_NAMESPACE::UpdateAux;
@@ -156,8 +159,23 @@ namespace detail
156159
bytes32_t latest_voted_block_id;
157160
bytes32_t latest_proposed_block_id;
158161

162+
// Header of the secondary timeline's root offsets ring. Only the header
// is kept here; the ring entries live in the unused space of cnv chunk 0
// (after db_metadata) rather than in separate cnv_chunks. A zero active_
// means the secondary timeline is inactive, and the remaining fields are
// defined only while active_ is nonzero.
struct secondary_timeline_header_t
{
    uint64_t version_lower_bound_;
    uint64_t next_version_;
    uint8_t active_; // nonzero = active
    uint8_t reserved_[23]; // brings the header to 40 bytes
} secondary_timeline;

// The header size is subtracted from the padding that follows, so it
// must not drift.
static_assert(40 == sizeof(secondary_timeline_header_t));
175+
159176
// padding for adding future atomics without requiring DB reset
160-
uint8_t future_variables_unused[4032];
177+
uint8_t
178+
future_variables_unused[4032 - sizeof(secondary_timeline_header_t)];
161179

162180
// used to know if the metadata was being
163181
// updated when the process suddenly exited
@@ -508,13 +526,18 @@ namespace detail
508526
dest->root_offsets.next_version_ = 0; // INVALID_BLOCK_NUM
509527
auto const old_next_version = intr->root_offsets.next_version_;
510528
intr->root_offsets.next_version_ = 0; // INVALID_BLOCK_NUM
529+
dest->secondary_timeline.next_version_ = 0;
530+
auto const old_secondary_next = intr->secondary_timeline.next_version_;
531+
intr->secondary_timeline.next_version_ = 0;
511532
atomic_memcpy((void *)dest, buffer, sizeof(db_metadata));
512533
atomic_memcpy(
513534
((std::byte *)dest) + sizeof(db_metadata),
514535
((std::byte const *)src) + sizeof(db_metadata),
515536
bytes - sizeof(db_metadata));
516537
std::atomic_ref<uint64_t>(dest->root_offsets.next_version_)
517538
.store(old_next_version, std::memory_order_release);
539+
std::atomic_ref<uint64_t>(dest->secondary_timeline.next_version_)
540+
.store(old_secondary_next, std::memory_order_release);
518541
};
519542
}
520543

category/mpt/detail/timeline.hpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
// Copyright (C) 2025 Category Labs, Inc.
2+
//
3+
// This program is free software: you can redistribute it and/or modify
4+
// it under the terms of the GNU General Public License as published by
5+
// the Free Software Foundation, either version 3 of the License, or
6+
// (at your option) any later version.
7+
//
8+
// This program is distributed in the hope that it will be useful,
9+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
10+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11+
// GNU General Public License for more details.
12+
//
13+
// You should have received a copy of the GNU General Public License
14+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
15+
16+
#pragma once
17+
18+
#include <category/mpt/config.hpp>
19+
#include <category/mpt/util.hpp>
20+
21+
#include <cstdint>
22+
#include <type_traits>
23+
24+
MONAD_MPT_NAMESPACE_BEGIN
25+
26+
/// Selects one of the two timelines. The enumerator values are contiguous
/// from zero, which the static_assert below ties to NUM_TIMELINES.
enum class timeline_id : uint8_t
{
    primary = 0,
    secondary = 1
};

/// Number of timelines. Declared `inline` so that every translation unit
/// including this header shares a single definition instead of getting its
/// own internal-linkage copy.
inline constexpr unsigned NUM_TIMELINES = 2;
static_assert(
    static_cast<unsigned>(timeline_id::secondary) + 1 == NUM_TIMELINES);
35+
36+
/// Per-timeline compaction state. Each timeline maintains its own compaction
37+
/// boundary and stride. Disk growth tracking remains global since both
38+
/// timelines share the same fast/slow append rings.
39+
struct timeline_compaction_state
40+
{
41+
compact_offset_pair compact_offsets{
42+
MIN_COMPACT_VIRTUAL_OFFSET, MIN_COMPACT_VIRTUAL_OFFSET};
43+
compact_virtual_chunk_offset_t compact_offset_range_fast_{
44+
MIN_COMPACT_VIRTUAL_OFFSET};
45+
compact_virtual_chunk_offset_t compact_offset_range_slow_{
46+
MIN_COMPACT_VIRTUAL_OFFSET};
47+
int64_t curr_upsert_auto_expire_version{0};
48+
};
49+
50+
static_assert(std::is_trivially_copyable_v<timeline_compaction_state>);
51+
52+
MONAD_MPT_NAMESPACE_END

0 commit comments

Comments
 (0)