Skip to content

Commit 95df8df

Browse files
maxkozlovskyclaude authored and committed
[dual-timeline 1/N] Add secondary timeline metadata, root offset ring, and lifecycle management
Introduce the foundational infrastructure for dual-timeline support:

Types (category/mpt/detail/timeline.hpp):
- timeline_id enum (primary/secondary) and NUM_TIMELINES constant
- timeline_compaction_state struct with per-timeline compaction boundary

Metadata (category/mpt/detail/db_metadata.hpp):
- secondary_timeline_header_t carved from future_variables_unused:
  version_lower_bound_, next_version_, active_ flag
- Total db_metadata size unchanged (528512 bytes) for backward compat

UpdateAux (trie.hpp, trie.cpp, update_aux.cpp):
- Per-timeline compaction state array (timeline_[NUM_TIMELINES]) with
  tl(timeline_id) accessor, replacing bare member fields
- Secondary root offset ring mapped from cnv chunk 0's unused space
  (65536 entries, 512KB per copy)
- root_offsets_delegator parameterized on timeline_id
- timeline_active(), get_root_offset_at_version(v, tid),
  db_history_{min,max}_version(tid), version_is_valid_ondisk(v, tid)
- Lifecycle: activate_secondary_timeline (initializes compaction from
  primary), deactivate_secondary_timeline, promote_secondary_to_primary

Tests:
- db_metadata_test: layout, field offsets, zero-init semantics, read/write
  round-trip, memcpy survival, no overlap with consensus
- timeline_test: enum values, default construction, trivial copyability
- update_aux_test: secondary ring mapping, activate/deactivate lifecycle,
  ring push/read, promote, per-timeline version queries,
  version_is_valid_ondisk per-timeline

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f81fee5 commit 95df8df

File tree

9 files changed

+1181
-67
lines changed

9 files changed

+1181
-67
lines changed

category/mpt/db_metadata_context.cpp

Lines changed: 230 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <category/mpt/config.hpp>
2424
#include <category/mpt/db_metadata_context.hpp>
2525
#include <category/mpt/detail/db_metadata.hpp>
26+
#include <category/mpt/detail/timeline.hpp>
2627
#include <category/mpt/trie.hpp>
2728
#include <category/mpt/util.hpp>
2829

@@ -105,7 +106,50 @@ DbMetadataContext::DbMetadataContext(AsyncIO &io)
105106
}
106107
}
107108

108-
// Version mismatch detection
109+
// Migration: MONAD007 -> MONAD008. Old code left future_variables_unused
110+
// filled with 0xff; under the new layout those bytes overlap the newly
111+
// introduced secondary_timeline header and would spuriously mark it
112+
// active_=1. Zero the header and bump the magic. Each copy is migrated
113+
// independently: if a crash strands one copy at MONAD007, on restart the
114+
// magic-validation step above restores from the MONAD008 copy when that
115+
// side is valid, otherwise the per-copy migration loop below picks up
116+
// whichever copies still carry MONAD007. Either path converges to both
117+
// copies at MONAD008.
118+
auto const is_previous_magic = [](detail::db_metadata const *m) {
119+
return 0 == memcmp(
120+
m->magic,
121+
detail::db_metadata::PREVIOUS_MAGIC,
122+
detail::db_metadata::MAGIC_STRING_LEN);
123+
};
124+
if (is_previous_magic(copies_[0].main) ||
125+
is_previous_magic(copies_[1].main)) {
126+
if (!can_write_to_map_) {
127+
MONAD_ABORT_PRINTF(
128+
"Detected pre-dual-timeline DB (magic=%s), which requires "
129+
"writable mapping to migrate to %s. Open with write access.",
130+
detail::db_metadata::PREVIOUS_MAGIC,
131+
detail::db_metadata::MAGIC);
132+
}
133+
for (auto const &copy : copies_) {
134+
auto *const m = copy.main;
135+
if (!is_previous_magic(m)) {
136+
continue;
137+
}
138+
auto const g = m->hold_dirty();
139+
memset(&m->secondary_timeline, 0, sizeof(m->secondary_timeline));
140+
std::atomic_signal_fence(std::memory_order_seq_cst);
141+
memcpy(
142+
m->magic,
143+
detail::db_metadata::MAGIC,
144+
detail::db_metadata::MAGIC_STRING_LEN);
145+
}
146+
LOG_INFO(
147+
"Migrated DB metadata from {} to {}.",
148+
detail::db_metadata::PREVIOUS_MAGIC,
149+
detail::db_metadata::MAGIC);
150+
}
151+
152+
// Version mismatch detection (for any other version that we don't migrate)
109153
constexpr unsigned magic_version_len = 3;
110154
constexpr unsigned magic_prefix_len =
111155
detail::db_metadata::MAGIC_STRING_LEN - magic_version_len;
@@ -235,6 +279,7 @@ DbMetadataContext::DbMetadataContext(AsyncIO &io)
235279
else {
236280
// Existing pool: map root offsets immediately
237281
map_root_offsets();
282+
map_secondary_root_offsets();
238283
}
239284
}
240285
#if defined(__GNUC__) && !defined(__clang__)
@@ -306,6 +351,39 @@ void DbMetadataContext::map_root_offsets()
306351
copies_[0].root_offsets.size());
307352
}
308353

354+
void DbMetadataContext::map_secondary_root_offsets()
355+
{
356+
// Map the secondary timeline's root offset ring from cnv chunk 0's
357+
// unused space, immediately after db_metadata (one copy per half-chunk).
358+
// The assertion below verifies the chosen ring size fits.
359+
static constexpr size_t SECONDARY_RING_CAPACITY = 65536; // 2^16
360+
static constexpr size_t secondary_ring_bytes =
361+
SECONDARY_RING_CAPACITY * sizeof(chunk_offset_t);
362+
363+
auto &cnv_chunk = io_->storage_pool().chunk(storage_pool::cnv, 0);
364+
auto const fdr = cnv_chunk.read_fd();
365+
auto const fdw = cnv_chunk.write_fd(0);
366+
auto const secondary_offset = round_up_align<CPU_PAGE_BITS>(db_map_size_);
367+
MONAD_ASSERT(
368+
secondary_offset + secondary_ring_bytes <= cnv_chunk.capacity() / 2);
369+
370+
for (unsigned i = 0; i < 2; i++) {
371+
auto const file_offset =
372+
fdr.second + i * (cnv_chunk.capacity() / 2) + secondary_offset;
373+
auto *ptr = ::mmap(
374+
nullptr,
375+
secondary_ring_bytes,
376+
prot_,
377+
mapflags_,
378+
(can_write_to_map_ ? fdw : fdr).first,
379+
off_t(file_offset));
380+
MONAD_ASSERT(ptr != MAP_FAILED);
381+
copies_[i].secondary_root_offsets = {
382+
start_lifetime_as<chunk_offset_t>((chunk_offset_t *)ptr),
383+
SECONDARY_RING_CAPACITY};
384+
}
385+
}
386+
309387
// Version metadata getters
310388

311389
uint64_t DbMetadataContext::get_latest_finalized_version() const noexcept
@@ -571,15 +649,158 @@ chunk_offset_t DbMetadataContext::get_root_offset_at_version(
571649
return INVALID_OFFSET;
572650
}
573651

652+
bool DbMetadataContext::timeline_active(timeline_id const tid) const noexcept
{
    // The primary timeline is always live. The secondary is live iff the
    // active_ byte in copy 0's header reads nonzero; the acquire load
    // pairs with the release stores made during activation so the version
    // fields published before active_=1 are visible to this reader.
    if (tid != timeline_id::primary) {
        auto const *const flag = start_lifetime_as<std::atomic<uint8_t> const>(
            &copies_[0].main->secondary_timeline.active_);
        return flag->load(std::memory_order_acquire) != 0;
    }
    return true;
}
661+
662+
chunk_offset_t DbMetadataContext::get_root_offset_at_version(
    uint64_t const version, timeline_id const tid) const noexcept
{
    // Timeline-aware root lookup. The primary delegates to the plain
    // overload; the secondary consults its own ring and only answers for
    // versions inside [max(wrap floor, header floor), newest].
    if (!timeline_active(tid)) {
        return INVALID_OFFSET;
    }
    if (tid == timeline_id::primary) {
        return get_root_offset_at_version(version);
    }
    auto const ring = root_offsets(tid);
    if (ring.empty()) {
        return INVALID_OFFSET;
    }
    auto const newest = ring.max_version();
    if (newest == INVALID_BLOCK_NUM || version > newest) {
        return INVALID_OFFSET;
    }
    // Oldest version the ring can still hold before entries wrap around.
    uint64_t const wrap_floor =
        newest >= ring.capacity() ? newest - ring.capacity() + 1 : 0;
    // Floor advertised by the secondary header's version_lower_bound_.
    uint64_t const header_floor =
        start_lifetime_as<std::atomic_uint64_t const>(
            &copies_[0].main->secondary_timeline.version_lower_bound_)
            ->load(std::memory_order_acquire);
    if (version < std::max(wrap_floor, header_floor)) {
        return INVALID_OFFSET;
    }
    return ring[version];
}
692+
693+
uint64_t
694+
DbMetadataContext::db_history_max_version(timeline_id const tid) const noexcept
695+
{
696+
if (!timeline_active(tid)) {
697+
return INVALID_BLOCK_NUM;
698+
}
699+
if (tid == timeline_id::primary) {
700+
return db_history_max_version();
701+
}
702+
return root_offsets(tid).max_version();
703+
}
704+
705+
uint64_t DbMetadataContext::db_history_min_valid_version(
    timeline_id const tid) const noexcept
{
    // Oldest version still resolvable on the given timeline, or
    // INVALID_BLOCK_NUM when the timeline is inactive or holds no history.
    // The primary delegates to the timeline-unaware overload.
    if (!timeline_active(tid)) {
        return INVALID_BLOCK_NUM;
    }
    if (tid == timeline_id::primary) {
        return db_history_min_valid_version();
    }
    auto const ro = root_offsets(tid);
    if (ro.empty()) {
        return INVALID_BLOCK_NUM;
    }
    auto const max_version = ro.max_version();
    if (max_version == INVALID_BLOCK_NUM) {
        return INVALID_BLOCK_NUM;
    }
    uint64_t const header_lower_bound =
        start_lifetime_as<std::atomic_uint64_t const>(
            &copies_[0].main->secondary_timeline.version_lower_bound_)
            ->load(std::memory_order_acquire);
    // Clamp by the ring's wrap horizon so the reported minimum agrees
    // with get_root_offset_at_version(), which rejects versions older
    // than max_version - capacity + 1 even when the header's lower bound
    // has not yet been advanced past them. Without the clamp this could
    // advertise versions whose lookups return INVALID_OFFSET.
    uint64_t const capacity_min_version =
        max_version >= ro.capacity() ? max_version - ro.capacity() + 1 : 0;
    return std::max(capacity_min_version, header_lower_bound);
}
722+
723+
void DbMetadataContext::activate_secondary_header(uint64_t const fork_version)
{
    // Publish the secondary timeline header, forking history at
    // fork_version. Preconditions: the secondary must currently be
    // inactive and its root offset ring must already be mapped.
    MONAD_ASSERT(!timeline_active(timeline_id::secondary));
    MONAD_ASSERT(!copies_[0].secondary_root_offsets.empty());

    // Wipe both ring copies to INVALID_OFFSET before publishing the
    // header, so no reader can pair active_=1 with stale offsets left
    // over from a previous activation.
    for (auto const &copy : copies_) {
        std::fill(
            copy.secondary_root_offsets.begin(),
            copy.secondary_root_offsets.end(),
            INVALID_OFFSET);
    }
    // Release-order stores with active_ last: a reader that observes
    // active_=1 is guaranteed to see the populated version fields.
    // Both version_lower_bound_ and next_version_ start at the fork
    // version (the secondary has no history of its own yet).
    for (auto const &copy : copies_) {
        auto *const m = copy.main;
        // hold_dirty() guard — presumably flags this metadata copy as
        // dirty for the duration of the mutation; verify against
        // db_metadata's definition.
        auto const g = m->hold_dirty();
        start_lifetime_as<std::atomic_uint64_t>(
            &m->secondary_timeline.version_lower_bound_)
            ->store(fork_version, std::memory_order_release);
        start_lifetime_as<std::atomic_uint64_t>(
            &m->secondary_timeline.next_version_)
            ->store(fork_version, std::memory_order_release);
        start_lifetime_as<std::atomic<uint8_t>>(&m->secondary_timeline.active_)
            ->store(1, std::memory_order_release);
    }
    LOG_INFO("Activated secondary timeline at fork version {}", fork_version);
}
750+
751+
void DbMetadataContext::deactivate_secondary_header()
752+
{
753+
MONAD_ASSERT(timeline_active(timeline_id::secondary));
754+
// Clear active_ only. Version fields are defined only when active_!=0,
755+
// so a reader that observes active_=0 ignores them — leaving them
756+
// unchanged avoids the torn-read hazard where a racing reader sees
757+
// stale active_=1 alongside newly-zeroed version fields.
758+
for (auto const &copy : copies_) {
759+
auto *const m = copy.main;
760+
auto const g = m->hold_dirty();
761+
start_lifetime_as<std::atomic<uint8_t>>(&m->secondary_timeline.active_)
762+
->store(0, std::memory_order_release);
763+
}
764+
LOG_INFO("Deactivated secondary timeline");
765+
}
766+
767+
void DbMetadataContext::unsafe_swap_primary_secondary_headers() noexcept
{
    // Swap the primary and secondary headers' version fields (the
    // promote step), then swap the in-memory root offset spans to match.
    // "unsafe": each field is swapped individually with acquire loads and
    // release stores, so the exchange is NOT atomic as a unit — a
    // concurrent reader could observe one field swapped and the other
    // not. Callers are presumably expected to exclude concurrent readers
    // for the duration; verify at the call sites.
    MONAD_ASSERT(timeline_active(timeline_id::secondary));
    auto const atomic_swap = [](uint64_t &a, uint64_t &b) {
        auto *const pa = start_lifetime_as<std::atomic_uint64_t>(&a);
        auto *const pb = start_lifetime_as<std::atomic_uint64_t>(&b);
        auto const va = pa->load(std::memory_order_acquire);
        auto const vb = pb->load(std::memory_order_acquire);
        pa->store(vb, std::memory_order_release);
        pb->store(va, std::memory_order_release);
    };
    // Both metadata copies are updated under their dirty guard; the same
    // two fields (version_lower_bound_, next_version_) exist on each side.
    for (auto const &copy : copies_) {
        auto *const m = copy.main;
        auto const g = m->hold_dirty();
        atomic_swap(
            m->root_offsets.version_lower_bound_,
            m->secondary_timeline.version_lower_bound_);
        atomic_swap(
            m->root_offsets.next_version_, m->secondary_timeline.next_version_);
    }
    swap_root_offsets_spans();
}
789+
574790
DbMetadataContext::~DbMetadataContext()
575791
{
576-
// munmap root_offsets
577792
for (auto &copy : copies_) {
578793
if (copy.root_offsets.data() != nullptr) {
579794
(void)::munmap(
580795
copy.root_offsets.data(), copy.root_offsets.size_bytes());
581796
copy.root_offsets = {};
582797
}
798+
if (copy.secondary_root_offsets.data() != nullptr) {
799+
(void)::munmap(
800+
copy.secondary_root_offsets.data(),
801+
copy.secondary_root_offsets.size_bytes());
802+
copy.secondary_root_offsets = {};
803+
}
583804
}
584805
// munmap db_metadata
585806
if (copies_[0].main != nullptr) {
@@ -711,6 +932,13 @@ void DbMetadataContext::init_new_pool(
711932
detail::db_metadata::MAGIC_STRING_LEN);
712933

713934
map_root_offsets();
935+
map_secondary_root_offsets();
936+
for (auto const &copy : copies_) {
937+
std::fill(
938+
copy.secondary_root_offsets.begin(),
939+
copy.secondary_root_offsets.end(),
940+
INVALID_OFFSET);
941+
}
714942
// Set history length, MUST be after root offsets are mapped
715943
if (history_len.has_value()) {
716944
update_history_length_metadata(*history_len);

0 commit comments

Comments
 (0)