From 0b9e30a27489707fda75b7f76e8e98b03236d6ce Mon Sep 17 00:00:00 2001 From: Mathieu Baudet <1105398+ma2bd@users.noreply.github.com> Date: Wed, 24 Jun 2026 10:53:13 +0200 Subject: [PATCH 1/7] Batch validator catch-up on missing cross-chain bundles via an aggregated MissingCrossChainUpdates error. --- linera-chain/src/chain.rs | 22 +++-- linera-chain/src/lib.rs | 15 +-- linera-core/src/client/mod.rs | 31 +++--- linera-core/src/node.rs | 24 ++--- linera-core/src/unit_tests/client_tests.rs | 84 +++++++++++++++++ linera-core/src/unit_tests/worker_tests.rs | 94 +++++++++++++++++++ linera-core/src/updater.rs | 59 ++++++++---- .../tests/snapshots/format__format.yaml.snap | 11 ++- 8 files changed, 273 insertions(+), 67 deletions(-) diff --git a/linera-chain/src/chain.rs b/linera-chain/src/chain.rs index b6849f12dcbf..50e71ae50ba7 100644 --- a/linera-chain/src/chain.rs +++ b/linera-chain/src/chain.rs @@ -664,6 +664,10 @@ where } let origins = bundles_by_origin.keys().copied().collect::>(); let inboxes = self.inboxes.try_load_entries_mut(&origins).await?; + // When the bundles must already be present (block proposals), collect *every* missing + // `(origin, height)` rather than bailing on the first, so the caller can be told the + // full set of cross-chain updates to fetch in a single round-trip. + let mut missing_bundles = Vec::new(); for ((origin, bundles), mut inbox) in bundles_by_origin.into_iter().zip(inboxes) { tracing::trace!( "Removing [{}] from inbox for {origin}", @@ -679,15 +683,8 @@ where .remove_bundle(bundle) .await .map_err(|error| (chain_id, origin, error))?; - if must_be_present { - ensure!( - was_present, - ChainError::MissingCrossChainUpdate { - chain_id, - origin, - height: bundle.height, - } - ); + if must_be_present && !was_present { + missing_bundles.push((origin, bundle.height)); } } inbox.observe_size_metric(); @@ -695,6 +692,13 @@ where self.nonempty_inboxes.get_mut().remove(&origin); } } + ensure!( + missing_bundles.is_empty(), + ChainError::MissingCrossChainUpdates { + chain_id, + bundles: missing_bundles, + } + ); Ok(()) } diff --git a/linera-chain/src/lib.rs b/linera-chain/src/lib.rs index c12cf059e437..2b99b0a0f2ce 100644 --- a/linera-chain/src/lib.rs +++ b/linera-chain/src/lib.rs @@ -53,13 +53,16 @@ pub enum ChainError { #[error("The chain being queried is not active {0}")] InactiveChain(ChainId), #[error( - "Cannot vote for block proposal of chain {chain_id} because a message \ - from chain {origin} at height {height} has not been received yet" + "Cannot vote for block proposal of chain {chain_id} because {} cross-chain message \ + bundle(s) have not been received yet", + bundles.len() )] - MissingCrossChainUpdate { + MissingCrossChainUpdates { chain_id: ChainId, - origin: ChainId, - height: BlockHeight, + /// The missing incoming message bundles, as `(origin chain, height)` pairs that must + /// all be received before this block can be validated. The validator reports every + /// missing bundle at once so the client can fetch them in a single round. + bundles: Vec<(ChainId, BlockHeight)>, }, #[error( "Message in block proposed to {chain_id} does not match the previously received messages from \ @@ -216,7 +219,7 @@ impl ChainError { | ChainError::RoundDoesNotTimeOut | ChainError::NotTimedOutYet(_) | ChainError::CheckpointPreconditionFailed(_) - | ChainError::MissingCrossChainUpdate { .. } => false, + | ChainError::MissingCrossChainUpdates { .. } => false, ChainError::ViewError(_) | ChainError::UnexpectedMessage { .. } | ChainError::InboxGapDetected { .. } diff --git a/linera-core/src/client/mod.rs b/linera-core/src/client/mod.rs index db6cd3169244..a3ecfb1a0baa 100644 --- a/linera-core/src/client/mod.rs +++ b/linera-core/src/client/mod.rs @@ -2211,20 +2211,25 @@ impl Client { } } while let LocalNodeError::WorkerError(WorkerError::ChainError(chain_err)) = &err { - if let ChainError::MissingCrossChainUpdate { - chain_id, - origin, - height, - } = &**chain_err + if let ChainError::MissingCrossChainUpdates { chain_id, bundles } = &**chain_err { - self.download_sender_block_with_sending_ancestors( - *chain_id, - *origin, - *height, - remote_node, - ) - .await?; - // Retry + let chain_id = *chain_id; + // Clone to end the borrow of `err` before we reassign it below. + let bundles = bundles.clone(); + if bundles.is_empty() { + break; + } + // Download every missing sender block the validator reported, in one + // pass, then retry once. + for (origin, height) in bundles { + self.download_sender_block_with_sending_ancestors( + chain_id, + origin, + height, + remote_node, + ) + .await?; + } if let Err(new_err) = self .local_node .handle_block_proposal(proposal.clone()) diff --git a/linera-core/src/node.rs b/linera-core/src/node.rs index 95a9713f3e57..433bbe575e4a 100644 --- a/linera-core/src/node.rs +++ b/linera-core/src/node.rs @@ -276,13 +276,13 @@ pub enum NodeError { // This error must be normalized during conversions. #[error( - "Cannot vote for block proposal of chain {chain_id} because a message \ - from chain {origin} at height {height} has not been received yet" + "Validator is missing {} cross-chain message bundle(s) to validate the block for \ + chain {chain_id}", + bundles.len() )] - MissingCrossChainUpdate { + MissingCrossChainUpdates { chain_id: ChainId, - origin: ChainId, - height: BlockHeight, + bundles: Vec<(ChainId, BlockHeight)>, }, #[error("Blobs not found: {0:?}")] @@ -382,7 +382,7 @@ impl NodeError { NodeError::BlobsNotFound(_) | NodeError::BlocksNotFound(_) | NodeError::EventsNotFound(_) - | NodeError::MissingCrossChainUpdate { .. } + | NodeError::MissingCrossChainUpdates { .. } | NodeError::WrongRound(_) | NodeError::UnexpectedBlockHeight { .. } | NodeError::InactiveChain(_) @@ -473,15 +473,9 @@ impl From for NodeError { impl From for NodeError { fn from(error: ChainError) -> Self { match error { - ChainError::MissingCrossChainUpdate { - chain_id, - origin, - height, - } => Self::MissingCrossChainUpdate { - chain_id, - origin, - height, - }, + ChainError::MissingCrossChainUpdates { chain_id, bundles } => { + Self::MissingCrossChainUpdates { chain_id, bundles } + } ChainError::InactiveChain(chain_id) => Self::InactiveChain(chain_id), ChainError::ExecutionError(execution_error, context) => match *execution_error { ExecutionError::BlobsNotFound(blob_ids) => Self::BlobsNotFound(blob_ids), diff --git a/linera-core/src/unit_tests/client_tests.rs b/linera-core/src/unit_tests/client_tests.rs index 26cedb49f331..485c4b92d7f0 100644 --- a/linera-core/src/unit_tests/client_tests.rs +++ b/linera-core/src/unit_tests/client_tests.rs @@ -1132,6 +1132,90 @@ where Ok(()) } +#[test_case(MemoryStorageBuilder::default(); "memory")] +#[cfg_attr(feature = "storage-service", test_case(ServiceStorageBuilder::new(); "storage_service"))] +#[cfg_attr(feature = "rocksdb", test_case(RocksDbStorageBuilder::new().await; "rocks_db"))] +#[cfg_attr(feature = "scylladb", test_case(ScyllaDbStorageBuilder::default(); "scylla_db"))] +#[test_log::test(tokio::test)] +async fn test_proposal_batches_missing_cross_chain_update_catch_up( + storage_builder: B, +) -> anyhow::Result<()> +where + B: StorageBuilder, +{ + // A block that consumes incoming bundles from several sender chains is proposed to a + // validator that is behind on *all* of those senders. The validator must report every + // missing sender at once (as a single `MissingCrossChainUpdates`) and the client must catch + // them up in a single batch, instead of the one-rejection-and-round-trip-per-sender loop + // that serialized into multi-minute stalls on busy hub chains. This exercises the + // client-side batching/termination logic in `updater.rs`. + let signer = InMemorySigner::new(None); + // Zero default-faulty validators so the only unavailable validators are the ones this test + // deliberately takes offline below; the quorum is still three out of four. + let mut builder = TestBuilder::new(storage_builder, 4, 0, signer).await?; + + // Three independent sender chains and one recipient ("hub") chain. The committee has four + // validators with a quorum of three. + let sender1 = builder.add_root_chain(1, Amount::from_tokens(2)).await?; + let sender2 = builder.add_root_chain(2, Amount::from_tokens(2)).await?; + let sender3 = builder.add_root_chain(3, Amount::from_tokens(2)).await?; + let recipient = builder.add_root_chain(4, Amount::ZERO).await?; + let recipient_id = recipient.chain_id(); + + // Phase A: validator 3 is unavailable while the senders transfer to the recipient. In the + // single-process harness a validator only delivers a cross-chain message to the recipient's + // inbox when it handles the sender's certificate, so validator 3 ends up missing both the + // sender blocks and the resulting bundles in the recipient's inbox. The transfers still + // reach the quorum {0, 1, 2}. + builder.set_fault_type([3], FaultType::OfflineWithInfo); + for sender in [&sender1, &sender2, &sender3] { + sender + .transfer_to_account( + AccountOwner::CHAIN, + Amount::ONE, + Account::chain(recipient_id), + ) + .await + .unwrap_ok_committed(); + } + + // The recipient learns about all three incoming bundles (and preprocesses the sender blocks + // into its local storage) from the honest quorum. + recipient.synchronize_from_validators().await?; + + // Phase B: validator 3 comes back, but validator 2 — one of the validators that *does* have + // the sender blocks — goes offline. The recipient's block now needs validator 3's vote to + // reach a quorum {0, 1, 3}, so the updater cannot route around validator 3: it must catch it + // up on every missing sender before validator 3 can accept the proposal. + builder.set_fault_type([3], FaultType::Honest); + builder.set_fault_type([2], FaultType::Offline); + + // Consume all three bundles in a single block. Validator 3 rejects the proposal with an + // aggregated `MissingCrossChainUpdates` listing all three senders; the client syncs them in + // one batch and the block is confirmed. + recipient.process_inbox().await?; + + assert_eq!( + recipient.local_balance().await.unwrap(), + Amount::from_tokens(3), + ); + // A single block consumed all three bundles (height 1, not 3). + assert_eq!( + recipient.chain_info().await?.next_block_height, + BlockHeight::from(1), + ); + assert!(recipient.pending_proposal().await.is_none()); + + // The previously-lagging validator 3 was caught up and voted: the block reached the quorum + // {0, 1, 3} (validator 2 is offline and is skipped by the check). + builder + .check_that_validators_have_certificate(recipient_id, BlockHeight::ZERO, 3) + .await + .unwrap(); + + Ok(()) +} + #[test_case(MemoryStorageBuilder::default(); "memory")] #[cfg_attr(feature = "storage-service", test_case(ServiceStorageBuilder::new(); "storage_service"))] #[cfg_attr(feature = "rocksdb", test_case(RocksDbStorageBuilder::new().await; "rocks_db"))] diff --git a/linera-core/src/unit_tests/worker_tests.rs b/linera-core/src/unit_tests/worker_tests.rs index b8d220028616..c180f66b23cf 100644 --- a/linera-core/src/unit_tests/worker_tests.rs +++ b/linera-core/src/unit_tests/worker_tests.rs @@ -1640,6 +1640,100 @@ where Ok(()) } +#[test_case(MemoryStorageBuilder::default(); "memory")] +#[cfg_attr(feature = "rocksdb", test_case(RocksDbStorageBuilder::new().await; "rocks_db"))] +#[cfg_attr(feature = "scylladb", test_case(ScyllaDbStorageBuilder::default(); "scylla_db"))] +#[test_log::test(tokio::test)] +async fn test_handle_block_proposal_reports_all_missing_bundles( + mut storage_builder: B, +) -> anyhow::Result<()> +where + B: StorageBuilder, +{ + // A proposal that consumes two cross-chain bundles whose messages were never delivered to + // the recipient's inbox must be rejected with a single `MissingCrossChainUpdates` listing + // *both* missing senders, rather than bailing on the first. This lets the client fetch them + // all in one round-trip. + let mut signer = InMemorySigner::new(None); + let sender_public_key = signer.generate_new(); + let sender_owner = sender_public_key.into(); + let recipient_public_key = signer.generate_new(); + let recipient_owner = recipient_public_key.into(); + let mut env = TestEnvironment::new(&mut storage_builder, false, false).await?; + let chain_1 = env + .add_root_chain(1, sender_owner, Amount::from_tokens(6)) + .await + .id(); + let chain_2 = env + .add_root_chain(2, recipient_owner, Amount::ZERO) + .await + .id(); + + // The sender produces two blocks that credit the recipient, but we never deliver the + // resulting messages to the recipient's inbox. + let proposal0 = make_first_block(chain_1) + .with_simple_transfer(chain_2, Amount::ONE) + .with_simple_transfer(chain_2, Amount::from_tokens(2)) + .with_authenticated_owner(Some(sender_owner)); + let certificate0 = env.execute_proposal(proposal0, vec![]).await?; + let proposal1 = make_child_block(&certificate0.clone().into_value()) + .with_simple_transfer(chain_2, Amount::from_tokens(3)) + .with_authenticated_owner(Some(sender_owner)); + let certificate1 = env.execute_proposal(proposal1, vec![]).await?; + + // The recipient proposes a block consuming both not-yet-received bundles. + let block_proposal = + make_first_block(chain_2) + .with_incoming_bundle(IncomingBundle { + origin: chain_1, + bundle: MessageBundle { + certificate_hash: certificate0.hash(), + height: BlockHeight::from(0), + timestamp: Timestamp::from(0), + transaction_index: 1, + messages: vec![system_credit_message(Amount::from_tokens(2)) + .to_posted(MessageKind::Tracked)], + }, + action: MessageAction::Accept, + }) + .with_incoming_bundle(IncomingBundle { + origin: chain_1, + bundle: MessageBundle { + certificate_hash: certificate1.hash(), + height: BlockHeight::from(1), + timestamp: Timestamp::from(0), + transaction_index: 0, + messages: vec![system_credit_message(Amount::from_tokens(3)) + .to_posted(MessageKind::Tracked)], + }, + action: MessageAction::Accept, + }) + .with_authenticated_owner(Some(recipient_owner)) + .into_first_proposal(recipient_owner, &signer) + .await + .unwrap(); + let error = env + .worker() + .handle_block_proposal(block_proposal) + .await + .0 + .unwrap_err(); + let WorkerError::ChainError(chain_error) = error else { + panic!("unexpected error: {error}"); + }; + match *chain_error { + ChainError::MissingCrossChainUpdates { bundles, .. } => assert_eq!( + bundles, + vec![ + (chain_1, BlockHeight::from(0)), + (chain_1, BlockHeight::from(1)) + ], + ), + other => panic!("expected MissingCrossChainUpdates, got {other}"), + } + Ok(()) +} + #[test_case(MemoryStorageBuilder::default(); "memory")] #[cfg_attr(feature = "rocksdb", test_case(RocksDbStorageBuilder::new().await; "rocks_db"))] #[cfg_attr(feature = "scylladb", test_case(ScyllaDbStorageBuilder::default(); "scylla_db"))] diff --git a/linera-core/src/updater.rs b/linera-core/src/updater.rs index d9db3ebd3fe4..f8ccc3f4e316 100644 --- a/linera-core/src/updater.rs +++ b/linera-core/src/updater.rs @@ -526,29 +526,50 @@ where ) .await?; } - Err(NodeError::MissingCrossChainUpdate { - chain_id, - origin, - height, - }) if chain_id == proposal.content.block.chain_id - && sent_cross_chain_updates - .get(&origin) - .is_none_or(|h| *h < height) => - { + // The validator reports every missing cross-chain bundle in a single + // `MissingCrossChainUpdates`. Sync all the origin chains in one batch instead of + // discovering each one through a separate rejection and round-trip. Some received + // certificates may be missing for this validator (e.g. to create the chain or + // make the balance sufficient), so we synchronize them now and retry. + Err(NodeError::MissingCrossChainUpdates { + chain_id: dependencies_chain_id, + bundles, + }) if dependencies_chain_id == proposal.content.block.chain_id => { tracing::debug!( remote_node = %self.remote_node.address(), - chain_id = %origin, - "Missing cross-chain update; sending chain to validator.", + %chain_id, + bundles = bundles.len(), + "validator reported missing cross-chain updates; syncing them in one batch", ); - sent_cross_chain_updates.insert(origin, height); - // Some received certificates may be missing for this validator - // (e.g. to create the chain or make the balance sufficient) so we are going to - // synchronize them now and retry. - self.send_chain_information( - origin, - height.try_add_one()?, + // Sync each origin chain up to the needed height. The + // `sent_cross_chain_updates` guard prevents re-sync loops. + let mut origin_heights: BTreeMap = BTreeMap::new(); + for (origin, height) in bundles { + if sent_cross_chain_updates + .get(&origin) + .is_some_and(|sent| *sent >= height) + { + continue; + } + sent_cross_chain_updates.insert(origin, height); + let target = height.try_add_one()?; + let entry = origin_heights.entry(origin).or_insert(target); + *entry = (*entry).max(target); + } + // Guard against an infinite loop if the validator keeps reporting bundles + // we have already synced. + ensure!( + !origin_heights.is_empty(), + NodeError::ResponseHandlingError { + error: format!( + "validator repeatedly reported already-synced cross-chain \ + updates for chain {dependencies_chain_id}" + ), + } + ); + self.send_chain_info_up_to_heights( + origin_heights, CrossChainMessageDelivery::Blocking, - None, ) .await?; } diff --git a/linera-rpc/tests/snapshots/format__format.yaml.snap b/linera-rpc/tests/snapshots/format__format.yaml.snap index 27b3e0f99337..161bf9d9fd5f 100644 --- a/linera-rpc/tests/snapshots/format__format.yaml.snap +++ b/linera-rpc/tests/snapshots/format__format.yaml.snap @@ -725,14 +725,15 @@ NodeError: - found_block_height: TYPENAME: BlockHeight 8: - MissingCrossChainUpdate: + MissingCrossChainUpdates: STRUCT: - chain_id: TYPENAME: ChainId - - origin: - TYPENAME: ChainId - - height: - TYPENAME: BlockHeight + - bundles: + SEQ: + TUPLE: + - TYPENAME: ChainId + - TYPENAME: BlockHeight 9: BlobsNotFound: NEWTYPE: From 15fdc5f9f17aca3493b82bff89e51073dad24d62 Mon Sep 17 00:00:00 2001 From: Mathieu Baudet <1105398+ma2bd@users.noreply.github.com> Date: Wed, 24 Jun 2026 11:44:44 +0200 Subject: [PATCH 2/7] Handle MissingCrossChainUpdates in one pass instead of incrementally re-syncing. --- linera-core/src/client/mod.rs | 48 +++++++++++++++++------------------ linera-core/src/updater.rs | 45 ++++++++++++++------------------ 2 files changed, 42 insertions(+), 51 deletions(-) diff --git a/linera-core/src/client/mod.rs b/linera-core/src/client/mod.rs index a3ecfb1a0baa..c6f5675aefd1 100644 --- a/linera-core/src/client/mod.rs +++ b/linera-core/src/client/mod.rs @@ -2210,37 +2210,35 @@ impl Client { continue; } } - while let LocalNodeError::WorkerError(WorkerError::ChainError(chain_err)) = &err { + // The local node reports *every* missing sender in a single + // `MissingCrossChainUpdates`, so we download them all in one pass and retry once; + // there is no need to iterate the discovery one sender at a time. + if let LocalNodeError::WorkerError(WorkerError::ChainError(chain_err)) = &err { if let ChainError::MissingCrossChainUpdates { chain_id, bundles } = &**chain_err { let chain_id = *chain_id; // Clone to end the borrow of `err` before we reassign it below. let bundles = bundles.clone(); - if bundles.is_empty() { - break; - } - // Download every missing sender block the validator reported, in one - // pass, then retry once. - for (origin, height) in bundles { - self.download_sender_block_with_sending_ancestors( - chain_id, - origin, - height, - remote_node, - ) - .await?; - } - if let Err(new_err) = self - .local_node - .handle_block_proposal(proposal.clone()) - .await - { - err = new_err; - } else { - continue 'proposal_loop; + if !bundles.is_empty() { + for (origin, height) in bundles { + self.download_sender_block_with_sending_ancestors( + chain_id, + origin, + height, + remote_node, + ) + .await?; + } + if let Err(new_err) = self + .local_node + .handle_block_proposal(proposal.clone()) + .await + { + err = new_err; + } else { + continue 'proposal_loop; + } } - } else { - break; } } diff --git a/linera-core/src/updater.rs b/linera-core/src/updater.rs index f8ccc3f4e316..cf8aee31c2b9 100644 --- a/linera-core/src/updater.rs +++ b/linera-core/src/updater.rs @@ -479,7 +479,7 @@ where clock_skew_sender: mpsc::UnboundedSender, ) -> Result, chain_client::Error> { let chain_id = proposal.content.block.chain_id; - let mut sent_cross_chain_updates = BTreeMap::new(); + let mut synced_cross_chain_updates = false; let mut publisher_chain_ids_sent = BTreeSet::new(); let storage = self.client.local_node.storage_client(); loop { @@ -526,47 +526,40 @@ where ) .await?; } - // The validator reports every missing cross-chain bundle in a single - // `MissingCrossChainUpdates`. Sync all the origin chains in one batch instead of - // discovering each one through a separate rejection and round-trip. Some received - // certificates may be missing for this validator (e.g. to create the chain or - // make the balance sufficient), so we synchronize them now and retry. + // The validator reports *every* missing cross-chain bundle in a single + // `MissingCrossChainUpdates`, so we sync all of them at once and retry. Some + // received certificates may be missing for this validator (e.g. to create the + // chain or make the balance sufficient). If it still reports missing bundles + // after we synced the whole set, retrying would not make progress, so we surface + // the error instead of looping. Err(NodeError::MissingCrossChainUpdates { chain_id: dependencies_chain_id, bundles, }) if dependencies_chain_id == proposal.content.block.chain_id => { + ensure!( + !synced_cross_chain_updates, + NodeError::ResponseHandlingError { + error: format!( + "validator still reports missing cross-chain updates for chain \ + {dependencies_chain_id} after they were all synced" + ), + } + ); + synced_cross_chain_updates = true; tracing::debug!( remote_node = %self.remote_node.address(), %chain_id, bundles = bundles.len(), "validator reported missing cross-chain updates; syncing them in one batch", ); - // Sync each origin chain up to the needed height. The - // `sent_cross_chain_updates` guard prevents re-sync loops. + // Sync each reported origin chain up to the needed height, collapsing any + // duplicate origins to the highest height. let mut origin_heights: BTreeMap = BTreeMap::new(); for (origin, height) in bundles { - if sent_cross_chain_updates - .get(&origin) - .is_some_and(|sent| *sent >= height) - { - continue; - } - sent_cross_chain_updates.insert(origin, height); let target = height.try_add_one()?; let entry = origin_heights.entry(origin).or_insert(target); *entry = (*entry).max(target); } - // Guard against an infinite loop if the validator keeps reporting bundles - // we have already synced. - ensure!( - !origin_heights.is_empty(), - NodeError::ResponseHandlingError { - error: format!( - "validator repeatedly reported already-synced cross-chain \ - updates for chain {dependencies_chain_id}" - ), - } - ); self.send_chain_info_up_to_heights( origin_heights, CrossChainMessageDelivery::Blocking, From 1ccb3351a79de21c690c0077e8b51b50180722d5 Mon Sep 17 00:00:00 2001 From: Mathieu Baudet <1105398+ma2bd@users.noreply.github.com> Date: Fri, 26 Jun 2026 16:48:37 +0200 Subject: [PATCH 3/7] Catch up missing cross-chain senders by sending only the needed blocks instead of whole chains. --- linera-core/src/client/mod.rs | 57 ++++++++++++++++++++--------------- linera-core/src/updater.rs | 24 +++++++-------- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/linera-core/src/client/mod.rs b/linera-core/src/client/mod.rs index c6f5675aefd1..b65f509829a1 100644 --- a/linera-core/src/client/mod.rs +++ b/linera-core/src/client/mod.rs @@ -2210,34 +2210,43 @@ impl Client { continue; } } - // The local node reports *every* missing sender in a single - // `MissingCrossChainUpdates`, so we download them all in one pass and retry once; - // there is no need to iterate the discovery one sender at a time. + // The local node reports every missing sender bundle in a single + // `MissingCrossChainUpdates`, so we download them all in one pass and retry once. if let LocalNodeError::WorkerError(WorkerError::ChainError(chain_err)) = &err { if let ChainError::MissingCrossChainUpdates { chain_id, bundles } = &**chain_err { let chain_id = *chain_id; - // Clone to end the borrow of `err` before we reassign it below. - let bundles = bundles.clone(); - if !bundles.is_empty() { - for (origin, height) in bundles { - self.download_sender_block_with_sending_ancestors( - chain_id, - origin, - height, - remote_node, - ) - .await?; - } - if let Err(new_err) = self - .local_node - .handle_block_proposal(proposal.clone()) - .await - { - err = new_err; - } else { - continue 'proposal_loop; - } + // `download_sender_block_with_sending_ancestors` walks each origin's + // message-bearing blocks back from the given height, so the highest missing + // height per origin subsumes the lower ones. Deduplicate to that (also + // ending the borrow of `err` so we can reassign it below), then download the + // independent origins concurrently, bounded by `max_joined_tasks`. + let mut origin_heights: BTreeMap = BTreeMap::new(); + for (origin, height) in bundles { + let entry = origin_heights.entry(*origin).or_insert(*height); + *entry = (*entry).max(*height); + } + stream::iter(origin_heights.into_iter().map(|(origin, height)| { + self.download_sender_block_with_sending_ancestors( + chain_id, + origin, + height, + remote_node, + ) + })) + .buffer_unordered(self.options.max_joined_tasks) + .collect::>() + .await + .into_iter() + .collect::>()?; + if let Err(new_err) = self + .local_node + .handle_block_proposal(proposal.clone()) + .await + { + err = new_err; + } else { + continue 'proposal_loop; } } } diff --git a/linera-core/src/updater.rs b/linera-core/src/updater.rs index cf8aee31c2b9..721a5bffd512 100644 --- a/linera-core/src/updater.rs +++ b/linera-core/src/updater.rs @@ -527,11 +527,9 @@ where .await?; } // The validator reports *every* missing cross-chain bundle in a single - // `MissingCrossChainUpdates`, so we sync all of them at once and retry. Some - // received certificates may be missing for this validator (e.g. to create the - // chain or make the balance sufficient). If it still reports missing bundles - // after we synced the whole set, retrying would not make progress, so we surface - // the error instead of looping. + // `MissingCrossChainUpdates`, so we send all of them at once and retry. If it + // still reports missing bundles after we synced the whole set, retrying would not + // make progress, so we surface the error instead of looping. Err(NodeError::MissingCrossChainUpdates { chain_id: dependencies_chain_id, bundles, @@ -552,15 +550,17 @@ where bundles = bundles.len(), "validator reported missing cross-chain updates; syncing them in one batch", ); - // Sync each reported origin chain up to the needed height, collapsing any - // duplicate origins to the highest height. - let mut origin_heights: BTreeMap = BTreeMap::new(); + // The reported bundles are exactly the sender blocks this proposal consumes + // that the validator is missing (the recipient's inbox reports each absent + // bundle). Send precisely those blocks, sparsely, rather than back-filling each + // origin's whole prefix. Grouping into a `BTreeSet` per origin delivers them in + // ascending height order, which the inbox requires. + let mut origin_heights: BTreeMap> = + BTreeMap::new(); for (origin, height) in bundles { - let target = height.try_add_one()?; - let entry = origin_heights.entry(origin).or_insert(target); - *entry = (*entry).max(target); + origin_heights.entry(origin).or_default().insert(height); } - self.send_chain_info_up_to_heights( + self.send_chain_info_at_heights( origin_heights, CrossChainMessageDelivery::Blocking, ) From b59ad54ca0b62861b8b85ce8974a5adfea5a11c5 Mon Sep 17 00:00:00 2001 From: Mathieu Baudet <1105398+ma2bd@users.noreply.github.com> Date: Fri, 26 Jun 2026 19:04:47 +0200 Subject: [PATCH 4/7] Revert sparse validator catch-up to the contiguous sync; sparse push breaks gap delivery (test_update_validator_sender_gaps). --- linera-core/src/updater.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/linera-core/src/updater.rs b/linera-core/src/updater.rs index 721a5bffd512..cf8aee31c2b9 100644 --- a/linera-core/src/updater.rs +++ b/linera-core/src/updater.rs @@ -527,9 +527,11 @@ where .await?; } // The validator reports *every* missing cross-chain bundle in a single - // `MissingCrossChainUpdates`, so we send all of them at once and retry. If it - // still reports missing bundles after we synced the whole set, retrying would not - // make progress, so we surface the error instead of looping. + // `MissingCrossChainUpdates`, so we sync all of them at once and retry. Some + // received certificates may be missing for this validator (e.g. to create the + // chain or make the balance sufficient). If it still reports missing bundles + // after we synced the whole set, retrying would not make progress, so we surface + // the error instead of looping. Err(NodeError::MissingCrossChainUpdates { chain_id: dependencies_chain_id, bundles, @@ -550,17 +552,15 @@ where bundles = bundles.len(), "validator reported missing cross-chain updates; syncing them in one batch", ); - // The reported bundles are exactly the sender blocks this proposal consumes - // that the validator is missing (the recipient's inbox reports each absent - // bundle). Send precisely those blocks, sparsely, rather than back-filling each - // origin's whole prefix. Grouping into a `BTreeSet` per origin delivers them in - // ascending height order, which the inbox requires. - let mut origin_heights: BTreeMap> = - BTreeMap::new(); + // Sync each reported origin chain up to the needed height, collapsing any + // duplicate origins to the highest height. + let mut origin_heights: BTreeMap = BTreeMap::new(); for (origin, height) in bundles { - origin_heights.entry(origin).or_default().insert(height); + let target = height.try_add_one()?; + let entry = origin_heights.entry(origin).or_insert(target); + *entry = (*entry).max(target); } - self.send_chain_info_at_heights( + self.send_chain_info_up_to_heights( origin_heights, CrossChainMessageDelivery::Blocking, ) From 211074fbf68b013c3ff7368c4e9d04a9829265aa Mon Sep 17 00:00:00 2001 From: Mathieu Baudet <1105398+ma2bd@users.noreply.github.com> Date: Fri, 26 Jun 2026 19:47:14 +0200 Subject: [PATCH 5/7] Add a regression test for catching up a lagging validator across a sender-chain gap. --- linera-core/src/unit_tests/client_tests.rs | 71 ++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/linera-core/src/unit_tests/client_tests.rs b/linera-core/src/unit_tests/client_tests.rs index 485c4b92d7f0..59cc9ade4484 100644 --- a/linera-core/src/unit_tests/client_tests.rs +++ b/linera-core/src/unit_tests/client_tests.rs @@ -1216,6 +1216,77 @@ where Ok(()) } +#[test_case(MemoryStorageBuilder::default(); "memory")] +#[test_log::test(tokio::test)] +async fn test_proposal_catch_up_with_sender_gap(storage_builder: B) -> anyhow::Result<()> +where + B: StorageBuilder, +{ + // A lagging validator must be caught up on a sender that has a *gap* from the recipient's + // perspective: sender block 0 messages the recipient, block 1 does not, block 2 does. The + // recipient consumes those two bundles in two separate blocks, so the second proposal only + // references sender block 2 — whose bundle can only be scheduled once block 0 is executed. + let signer = InMemorySigner::new(None); + let mut builder = TestBuilder::new(storage_builder, 4, 0, signer).await?; + let sender = builder.add_root_chain(1, Amount::from_tokens(10)).await?; + let recipient = builder.add_root_chain(2, Amount::ZERO).await?; + let sender_id = sender.chain_id(); + let recipient_id = recipient.chain_id(); + + // Validator 3 is offline while the sender builds a gapped chain and the recipient consumes + // the first bundle. + builder.set_fault_type([3], FaultType::OfflineWithInfo); + + // Sender block 0 -> recipient. + sender + .transfer_to_account( + AccountOwner::CHAIN, + Amount::ONE, + Account::chain(recipient_id), + ) + .await + .unwrap_ok_committed(); + recipient.synchronize_from_validators().await?; + recipient.process_inbox().await?; // recipient block 0 consumes sender block 0's bundle + + // Sender block 1 -> itself (no message to the recipient: the gap), block 2 -> recipient. + sender + .transfer_to_account(AccountOwner::CHAIN, Amount::ONE, Account::chain(sender_id)) + .await + .unwrap_ok_committed(); + sender + .transfer_to_account( + AccountOwner::CHAIN, + Amount::from_tokens(3), + Account::chain(recipient_id), + ) + .await + .unwrap_ok_committed(); + recipient.synchronize_from_validators().await?; + + // Validator 3 comes back; validator 2 goes offline so the recipient's next block needs + // validator 3's vote — the updater must catch it up on the sender across the gap. + builder.set_fault_type([3], FaultType::Honest); + builder.set_fault_type([2], FaultType::Offline); + + recipient.process_inbox().await?; // recipient block 1 consumes sender block 2's bundle + + assert_eq!( + recipient.local_balance().await.unwrap(), + Amount::from_tokens(4), + ); + assert_eq!( + recipient.chain_info().await?.next_block_height, + BlockHeight::from(2), + ); + builder + .check_that_validators_have_certificate(recipient_id, BlockHeight::from(1), 3) + .await + .unwrap(); + + Ok(()) +} + #[test_case(MemoryStorageBuilder::default(); "memory")] #[cfg_attr(feature = "storage-service", test_case(ServiceStorageBuilder::new(); "storage_service"))] #[cfg_attr(feature = "rocksdb", test_case(RocksDbStorageBuilder::new().await; "rocks_db"))] From 8dada6fcfd9e3601f54a360900e3955946522b72 Mon Sep 17 00:00:00 2001 From: Mathieu Baudet <1105398+ma2bd@users.noreply.github.com> Date: Fri, 26 Jun 2026 20:08:49 +0200 Subject: [PATCH 6/7] Document that validator catch-up sends only locally-held certificates, leaving gaps intentionally. --- linera-core/src/updater.rs | 60 +++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/linera-core/src/updater.rs b/linera-core/src/updater.rs index cf8aee31c2b9..0b565d2c57e7 100644 --- a/linera-core/src/updater.rs +++ b/linera-core/src/updater.rs @@ -706,15 +706,31 @@ where /// Sends chain information to bring a validator up to date with a specific chain. /// /// This method performs a two-phase synchronization: - /// 1. **Height synchronization**: Ensures the validator has all blocks up to `target_block_height`. + /// 1. **Height synchronization**: sends the certificates we hold locally for the range + /// `[validator_next_height, target_block_height)`, in order. /// 2. **Round synchronization**: If heights match, ensures the validator has proposals/certificates /// for the current consensus round. /// + /// Only certificates that are actually in our local storage are sent; heights we don't have are + /// silently skipped (see [`Self::read_certificates_for_heights`]). This is deliberate and is what + /// makes the "leave gaps on the validator side" behavior (#4181) work: a chain we merely *receive* + /// from is stored only at its message-bearing heights, so we push exactly those. The validator + /// executes the contiguous prefix and preprocesses any block that sits above a gap — enough to + /// deliver that block's cross-chain bundles without our ever having to send the intervening + /// non-message blocks (which we don't have anyway). + /// + /// Because our local storage is guaranteed to hold every block we needed to build a proposal (a + /// bundle can only be consumed after its ordered message-bearing predecessors were downloaded), + /// this is the reliable way to catch a validator up. Deriving the set to send from a + /// `MissingCrossChainUpdates` error instead is *not* reliable: that error lists only the bundles + /// the current proposal is missing and omits already-consumed ancestors, which the validator + /// still needs executed before it can schedule a later gap block's bundle. + /// /// # Height Sync Strategy /// - For existing chains (target_block_height > 0): /// * Optimistically sends the last certificate first (often that's all that's missing). - /// * Falls back to full chain query if the validator needs more context. - /// * Sends any additional missing certificates in order. + /// * Falls back to a full chain query if the validator needs more context. + /// * Sends any additional locally-held certificates in order. /// - For new chains (target_block_height == 0): /// * Sends the chain description and dependencies first. /// * Then queries the validator's state. @@ -774,9 +790,13 @@ where self.sync_consensus_round(remote_round, &manager).await } - /// Synchronizes a validator to a specific block height by sending missing certificates. + /// Synchronizes a validator to a specific block height by sending the certificates we hold. /// - /// Uses an optimistic approach: sends the last certificate first, then fills in any gaps. + /// Uses an optimistic approach: sends the last certificate first, then, based on the + /// validator's reported height, sends the earlier certificates in the range. Only the heights + /// we actually have in local storage are sent — any we're missing are silently skipped rather + /// than treated as an error, which is what leaves genuine gaps on the validator (see + /// [`Self::send_chain_information`] for why that is both safe and intended). /// /// Returns the [`ChainInfo`] from the validator after synchronization. async fn sync_chain_height( @@ -849,7 +869,12 @@ where Ok(info) } - /// Reads certificates for the given heights from storage. + /// Reads certificates for the given heights from local storage. + /// + /// Heights we don't have are silently dropped: the returned vector contains only the + /// certificates actually present, so callers naturally skip any block we never downloaded + /// (e.g. a sender's non-message-bearing blocks). Callers must not assume the result covers + /// every requested height. async fn read_certificates_for_heights( &self, chain_id: ChainId, @@ -1055,11 +1080,19 @@ where .await } - /// Sends chain information for specific heights on multiple chains. + /// Sends the blocks at exactly the specified heights on multiple chains. /// - /// Unlike `send_chain_info_up_to_heights`, this method only sends the blocks at the - /// specified heights, not all blocks up to those heights. This is more efficient for - /// sparse chains where only specific blocks are needed. + /// Unlike [`Self::send_chain_info_up_to_heights`], this sends *only* the blocks at the given + /// heights, not the locally-held prefix leading up to them. Use it only when the required + /// blocks are fully self-describing to the validator — e.g. bringing over the specific blocks + /// that carry a set of blobs. + /// + /// Do **not** use it to catch a validator up on missing cross-chain message bundles. A block + /// that sits above a gap is only preprocessed, and its bundle is scheduled only once its + /// `previous_message_block` has been executed on the validator; sending just the reported + /// height omits that ancestor and the bundle never gets delivered. Use + /// [`Self::send_chain_info_up_to_heights`] there, which pushes the whole locally-held + /// message-bearing prefix. async fn send_chain_info_at_heights( &self, chain_heights: impl IntoIterator)>, @@ -1094,6 +1127,13 @@ where Ok(()) } + /// Brings a validator up to each given `(chain, height)` by pushing the locally-held prefix + /// of that chain (via [`Self::send_chain_information`]), for all chains concurrently. + /// + /// This is the correct primitive for catching a validator up on missing cross-chain bundles: + /// it pushes every message-bearing block we hold up to the target, so the validator executes + /// the ancestors it needs and preprocesses gap blocks. See [`Self::send_chain_info_at_heights`] + /// for why sending only the exact reported heights is not enough. async fn send_chain_info_up_to_heights( &self, chain_heights: impl IntoIterator, From 8a45eb791ac1f5ab2503144114e03a055797a04f Mon Sep 17 00:00:00 2001 From: Mathieu Baudet <1105398+ma2bd@users.noreply.github.com> Date: Fri, 26 Jun 2026 20:11:32 +0200 Subject: [PATCH 7/7] Trim validator catch-up doc comments to the essentials. --- linera-core/src/updater.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/linera-core/src/updater.rs b/linera-core/src/updater.rs index 0b565d2c57e7..3b4e4be1a22d 100644 --- a/linera-core/src/updater.rs +++ b/linera-core/src/updater.rs @@ -1086,13 +1086,6 @@ where /// heights, not the locally-held prefix leading up to them. Use it only when the required /// blocks are fully self-describing to the validator — e.g. bringing over the specific blocks /// that carry a set of blobs. - /// - /// Do **not** use it to catch a validator up on missing cross-chain message bundles. A block - /// that sits above a gap is only preprocessed, and its bundle is scheduled only once its - /// `previous_message_block` has been executed on the validator; sending just the reported - /// height omits that ancestor and the bundle never gets delivered. Use - /// [`Self::send_chain_info_up_to_heights`] there, which pushes the whole locally-held - /// message-bearing prefix. async fn send_chain_info_at_heights( &self, chain_heights: impl IntoIterator)>, @@ -1129,11 +1122,6 @@ where /// Brings a validator up to each given `(chain, height)` by pushing the locally-held prefix /// of that chain (via [`Self::send_chain_information`]), for all chains concurrently. - /// - /// This is the correct primitive for catching a validator up on missing cross-chain bundles: - /// it pushes every message-bearing block we hold up to the target, so the validator executes - /// the ancestors it needs and preprocesses gap blocks. See [`Self::send_chain_info_at_heights`] - /// for why sending only the exact reported heights is not enough. async fn send_chain_info_up_to_heights( &self, chain_heights: impl IntoIterator,