|
23 | 23 | #include <category/mpt/config.hpp> |
24 | 24 | #include <category/mpt/db_metadata_context.hpp> |
25 | 25 | #include <category/mpt/detail/db_metadata.hpp> |
| 26 | +#include <category/mpt/detail/timeline.hpp> |
26 | 27 | #include <category/mpt/trie.hpp> |
27 | 28 | #include <category/mpt/util.hpp> |
28 | 29 |
|
@@ -105,7 +106,50 @@ DbMetadataContext::DbMetadataContext(AsyncIO &io) |
105 | 106 | } |
106 | 107 | } |
107 | 108 |
|
108 | | - // Version mismatch detection |
| 109 | + // Migration: MONAD007 -> MONAD008. Old code left future_variables_unused |
| 110 | + // filled with 0xff; under the new layout those bytes overlap the newly |
| 111 | + // introduced secondary_timeline header and would spuriously mark it |
| 112 | + // active_=1. Zero the header and bump the magic. Each copy is migrated |
| 113 | + // independently: if a crash strands one copy at MONAD007, on restart the |
| 114 | + // magic-validation step above restores from the MONAD008 copy when that |
| 115 | + // side is valid, otherwise the per-copy migration loop below picks up |
| 116 | + // whichever copies still carry MONAD007. Either path converges to both |
| 117 | + // copies at MONAD008. |
| 118 | + auto const is_previous_magic = [](detail::db_metadata const *m) { |
| 119 | + return 0 == memcmp( |
| 120 | + m->magic, |
| 121 | + detail::db_metadata::PREVIOUS_MAGIC, |
| 122 | + detail::db_metadata::MAGIC_STRING_LEN); |
| 123 | + }; |
| 124 | + if (is_previous_magic(copies_[0].main) || |
| 125 | + is_previous_magic(copies_[1].main)) { |
| 126 | + if (!can_write_to_map_) { |
| 127 | + MONAD_ABORT_PRINTF( |
| 128 | + "Detected pre-dual-timeline DB (magic=%s), which requires " |
| 129 | + "writable mapping to migrate to %s. Open with write access.", |
| 130 | + detail::db_metadata::PREVIOUS_MAGIC, |
| 131 | + detail::db_metadata::MAGIC); |
| 132 | + } |
| 133 | + for (auto const © : copies_) { |
| 134 | + auto *const m = copy.main; |
| 135 | + if (!is_previous_magic(m)) { |
| 136 | + continue; |
| 137 | + } |
| 138 | + auto const g = m->hold_dirty(); |
| 139 | + memset(&m->secondary_timeline, 0, sizeof(m->secondary_timeline)); |
| 140 | + std::atomic_signal_fence(std::memory_order_seq_cst); |
| 141 | + memcpy( |
| 142 | + m->magic, |
| 143 | + detail::db_metadata::MAGIC, |
| 144 | + detail::db_metadata::MAGIC_STRING_LEN); |
| 145 | + } |
| 146 | + LOG_INFO( |
| 147 | + "Migrated DB metadata from {} to {}.", |
| 148 | + detail::db_metadata::PREVIOUS_MAGIC, |
| 149 | + detail::db_metadata::MAGIC); |
| 150 | + } |
| 151 | + |
| 152 | + // Version mismatch detection (for any other version that we don't migrate) |
109 | 153 | constexpr unsigned magic_version_len = 3; |
110 | 154 | constexpr unsigned magic_prefix_len = |
111 | 155 | detail::db_metadata::MAGIC_STRING_LEN - magic_version_len; |
@@ -235,6 +279,7 @@ DbMetadataContext::DbMetadataContext(AsyncIO &io) |
235 | 279 | else { |
236 | 280 | // Existing pool: map root offsets immediately |
237 | 281 | map_root_offsets(); |
| 282 | + map_secondary_root_offsets(); |
238 | 283 | } |
239 | 284 | } |
240 | 285 | #if defined(__GNUC__) && !defined(__clang__) |
@@ -306,6 +351,39 @@ void DbMetadataContext::map_root_offsets() |
306 | 351 | copies_[0].root_offsets.size()); |
307 | 352 | } |
308 | 353 |
|
| 354 | +void DbMetadataContext::map_secondary_root_offsets() |
| 355 | +{ |
| 356 | + // Map the secondary timeline's root offset ring from cnv chunk 0's |
| 357 | + // unused space, immediately after db_metadata (one copy per half-chunk). |
| 358 | + // The assertion below verifies the chosen ring size fits. |
| 359 | + static constexpr size_t SECONDARY_RING_CAPACITY = 65536; // 2^16 |
| 360 | + static constexpr size_t secondary_ring_bytes = |
| 361 | + SECONDARY_RING_CAPACITY * sizeof(chunk_offset_t); |
| 362 | + |
| 363 | + auto &cnv_chunk = io_->storage_pool().chunk(storage_pool::cnv, 0); |
| 364 | + auto const fdr = cnv_chunk.read_fd(); |
| 365 | + auto const fdw = cnv_chunk.write_fd(0); |
| 366 | + auto const secondary_offset = round_up_align<CPU_PAGE_BITS>(db_map_size_); |
| 367 | + MONAD_ASSERT( |
| 368 | + secondary_offset + secondary_ring_bytes <= cnv_chunk.capacity() / 2); |
| 369 | + |
| 370 | + for (unsigned i = 0; i < 2; i++) { |
| 371 | + auto const file_offset = |
| 372 | + fdr.second + i * (cnv_chunk.capacity() / 2) + secondary_offset; |
| 373 | + auto *ptr = ::mmap( |
| 374 | + nullptr, |
| 375 | + secondary_ring_bytes, |
| 376 | + prot_, |
| 377 | + mapflags_, |
| 378 | + (can_write_to_map_ ? fdw : fdr).first, |
| 379 | + off_t(file_offset)); |
| 380 | + MONAD_ASSERT(ptr != MAP_FAILED); |
| 381 | + copies_[i].secondary_root_offsets = { |
| 382 | + start_lifetime_as<chunk_offset_t>((chunk_offset_t *)ptr), |
| 383 | + SECONDARY_RING_CAPACITY}; |
| 384 | + } |
| 385 | +} |
| 386 | + |
309 | 387 | // Version metadata getters |
310 | 388 |
|
311 | 389 | uint64_t DbMetadataContext::get_latest_finalized_version() const noexcept |
@@ -571,15 +649,158 @@ chunk_offset_t DbMetadataContext::get_root_offset_at_version( |
571 | 649 | return INVALID_OFFSET; |
572 | 650 | } |
573 | 651 |
|
| 652 | +bool DbMetadataContext::timeline_active(timeline_id const tid) const noexcept |
| 653 | +{ |
| 654 | + if (tid == timeline_id::primary) { |
| 655 | + return true; |
| 656 | + } |
| 657 | + return start_lifetime_as<std::atomic<uint8_t> const>( |
| 658 | + &copies_[0].main->secondary_timeline.active_) |
| 659 | + ->load(std::memory_order_acquire) != 0; |
| 660 | +} |
| 661 | + |
| 662 | +chunk_offset_t DbMetadataContext::get_root_offset_at_version( |
| 663 | + uint64_t const version, timeline_id const tid) const noexcept |
| 664 | +{ |
| 665 | + if (!timeline_active(tid)) { |
| 666 | + return INVALID_OFFSET; |
| 667 | + } |
| 668 | + if (tid == timeline_id::primary) { |
| 669 | + return get_root_offset_at_version(version); |
| 670 | + } |
| 671 | + auto const ro = root_offsets(tid); |
| 672 | + if (ro.empty()) { |
| 673 | + return INVALID_OFFSET; |
| 674 | + } |
| 675 | + auto const max_version = ro.max_version(); |
| 676 | + if (max_version == INVALID_BLOCK_NUM || version > max_version) { |
| 677 | + return INVALID_OFFSET; |
| 678 | + } |
| 679 | + // Secondary's valid range is bounded by ring capacity (wrap) and by the |
| 680 | + // header's version_lower_bound_. |
| 681 | + auto const capacity_min_version = |
| 682 | + max_version >= ro.capacity() ? max_version - ro.capacity() + 1 : 0; |
| 683 | + auto const header_lower_bound = |
| 684 | + start_lifetime_as<std::atomic_uint64_t const>( |
| 685 | + &copies_[0].main->secondary_timeline.version_lower_bound_) |
| 686 | + ->load(std::memory_order_acquire); |
| 687 | + if (version < std::max(capacity_min_version, header_lower_bound)) { |
| 688 | + return INVALID_OFFSET; |
| 689 | + } |
| 690 | + return ro[version]; |
| 691 | +} |
| 692 | + |
| 693 | +uint64_t |
| 694 | +DbMetadataContext::db_history_max_version(timeline_id const tid) const noexcept |
| 695 | +{ |
| 696 | + if (!timeline_active(tid)) { |
| 697 | + return INVALID_BLOCK_NUM; |
| 698 | + } |
| 699 | + if (tid == timeline_id::primary) { |
| 700 | + return db_history_max_version(); |
| 701 | + } |
| 702 | + return root_offsets(tid).max_version(); |
| 703 | +} |
| 704 | + |
| 705 | +uint64_t DbMetadataContext::db_history_min_valid_version( |
| 706 | + timeline_id const tid) const noexcept |
| 707 | +{ |
| 708 | + if (!timeline_active(tid)) { |
| 709 | + return INVALID_BLOCK_NUM; |
| 710 | + } |
| 711 | + if (tid == timeline_id::primary) { |
| 712 | + return db_history_min_valid_version(); |
| 713 | + } |
| 714 | + auto const ro = root_offsets(tid); |
| 715 | + if (ro.empty() || ro.max_version() == INVALID_BLOCK_NUM) { |
| 716 | + return INVALID_BLOCK_NUM; |
| 717 | + } |
| 718 | + return start_lifetime_as<std::atomic_uint64_t const>( |
| 719 | + &copies_[0].main->secondary_timeline.version_lower_bound_) |
| 720 | + ->load(std::memory_order_acquire); |
| 721 | +} |
| 722 | + |
| 723 | +void DbMetadataContext::activate_secondary_header(uint64_t const fork_version) |
| 724 | +{ |
| 725 | + MONAD_ASSERT(!timeline_active(timeline_id::secondary)); |
| 726 | + MONAD_ASSERT(!copies_[0].secondary_root_offsets.empty()); |
| 727 | + |
| 728 | + for (auto const © : copies_) { |
| 729 | + std::fill( |
| 730 | + copy.secondary_root_offsets.begin(), |
| 731 | + copy.secondary_root_offsets.end(), |
| 732 | + INVALID_OFFSET); |
| 733 | + } |
| 734 | + // Release-order stores with active_ last: a reader that observes |
| 735 | + // active_=1 is guaranteed to see the populated version fields. |
| 736 | + for (auto const © : copies_) { |
| 737 | + auto *const m = copy.main; |
| 738 | + auto const g = m->hold_dirty(); |
| 739 | + start_lifetime_as<std::atomic_uint64_t>( |
| 740 | + &m->secondary_timeline.version_lower_bound_) |
| 741 | + ->store(fork_version, std::memory_order_release); |
| 742 | + start_lifetime_as<std::atomic_uint64_t>( |
| 743 | + &m->secondary_timeline.next_version_) |
| 744 | + ->store(fork_version, std::memory_order_release); |
| 745 | + start_lifetime_as<std::atomic<uint8_t>>(&m->secondary_timeline.active_) |
| 746 | + ->store(1, std::memory_order_release); |
| 747 | + } |
| 748 | + LOG_INFO("Activated secondary timeline at fork version {}", fork_version); |
| 749 | +} |
| 750 | + |
| 751 | +void DbMetadataContext::deactivate_secondary_header() |
| 752 | +{ |
| 753 | + MONAD_ASSERT(timeline_active(timeline_id::secondary)); |
| 754 | + // Clear active_ only. Version fields are defined only when active_!=0, |
| 755 | + // so a reader that observes active_=0 ignores them — leaving them |
| 756 | + // unchanged avoids the torn-read hazard where a racing reader sees |
| 757 | + // stale active_=1 alongside newly-zeroed version fields. |
| 758 | + for (auto const © : copies_) { |
| 759 | + auto *const m = copy.main; |
| 760 | + auto const g = m->hold_dirty(); |
| 761 | + start_lifetime_as<std::atomic<uint8_t>>(&m->secondary_timeline.active_) |
| 762 | + ->store(0, std::memory_order_release); |
| 763 | + } |
| 764 | + LOG_INFO("Deactivated secondary timeline"); |
| 765 | +} |
| 766 | + |
| 767 | +void DbMetadataContext::unsafe_swap_primary_secondary_headers() noexcept |
| 768 | +{ |
| 769 | + MONAD_ASSERT(timeline_active(timeline_id::secondary)); |
| 770 | + auto const atomic_swap = [](uint64_t &a, uint64_t &b) { |
| 771 | + auto *const pa = start_lifetime_as<std::atomic_uint64_t>(&a); |
| 772 | + auto *const pb = start_lifetime_as<std::atomic_uint64_t>(&b); |
| 773 | + auto const va = pa->load(std::memory_order_acquire); |
| 774 | + auto const vb = pb->load(std::memory_order_acquire); |
| 775 | + pa->store(vb, std::memory_order_release); |
| 776 | + pb->store(va, std::memory_order_release); |
| 777 | + }; |
| 778 | + for (auto const © : copies_) { |
| 779 | + auto *const m = copy.main; |
| 780 | + auto const g = m->hold_dirty(); |
| 781 | + atomic_swap( |
| 782 | + m->root_offsets.version_lower_bound_, |
| 783 | + m->secondary_timeline.version_lower_bound_); |
| 784 | + atomic_swap( |
| 785 | + m->root_offsets.next_version_, m->secondary_timeline.next_version_); |
| 786 | + } |
| 787 | + swap_root_offsets_spans(); |
| 788 | +} |
| 789 | + |
574 | 790 | DbMetadataContext::~DbMetadataContext() |
575 | 791 | { |
576 | | - // munmap root_offsets |
577 | 792 | for (auto © : copies_) { |
578 | 793 | if (copy.root_offsets.data() != nullptr) { |
579 | 794 | (void)::munmap( |
580 | 795 | copy.root_offsets.data(), copy.root_offsets.size_bytes()); |
581 | 796 | copy.root_offsets = {}; |
582 | 797 | } |
| 798 | + if (copy.secondary_root_offsets.data() != nullptr) { |
| 799 | + (void)::munmap( |
| 800 | + copy.secondary_root_offsets.data(), |
| 801 | + copy.secondary_root_offsets.size_bytes()); |
| 802 | + copy.secondary_root_offsets = {}; |
| 803 | + } |
583 | 804 | } |
584 | 805 | // munmap db_metadata |
585 | 806 | if (copies_[0].main != nullptr) { |
@@ -711,6 +932,13 @@ void DbMetadataContext::init_new_pool( |
711 | 932 | detail::db_metadata::MAGIC_STRING_LEN); |
712 | 933 |
|
713 | 934 | map_root_offsets(); |
| 935 | + map_secondary_root_offsets(); |
| 936 | + for (auto const © : copies_) { |
| 937 | + std::fill( |
| 938 | + copy.secondary_root_offsets.begin(), |
| 939 | + copy.secondary_root_offsets.end(), |
| 940 | + INVALID_OFFSET); |
| 941 | + } |
714 | 942 | // Set history length, MUST be after root offsets are mapped |
715 | 943 | if (history_len.has_value()) { |
716 | 944 | update_history_length_metadata(*history_len); |
|
0 commit comments