From c342b57571dc0d974abf60a2d0987bd4afec95c6 Mon Sep 17 00:00:00 2001 From: Andre da Silva <2917611+ndr-ds@users.noreply.github.com> Date: Thu, 25 Jun 2026 19:34:59 -0300 Subject: [PATCH 1/2] Add observability for message and block rejections (#6473) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have almost no observability when messages or blocks are rejected. Most rejection paths are silent (no log, no metric), so rejection rates can't be measured reliably and silent rejections are invisible in production. Closes #6459. Add logging at every currently-silent rejection decision site, plus rate-correct Prometheus counters, reusing the `error_type()` labelling convention established in #5951. - **Message rejections:** logs at the policy decision (`IncomingBundle::apply_policy`, INFO), the closed-chain auto-reject (`prepare_chain_info_response`, INFO), and the validator-side execution of a rejection (`BlockExecutionTracker`, DEBUG — see Log levels). The execution-failure path already logged. No new message counter — the rejection rate stays on the existing commit-time pair `rejected_bundle_count` / `incoming_bundle_count`; the reason (which can't be recovered from a committed block) lives in the logs. - **Validator proposal rejections:** `block_proposals_received_total` (unconditional) + `block_proposals_rejected_total{error_type}` at the worker's `handle_block_proposal`, plus a DEBUG log (see Log levels). `error_type` reuses `WorkerError::error_type()`, surfacing `WorkerError::InvalidOwner`, `ChainError::WrongRound`, etc. - **Client staging failures:** `block_staging_failures_total{error_type}` + INFO log in `ChainClient::execute_block`, reusing `execute_block_latency`'s count as the denominator. Adds `error_type()` to the client `Error` and `LocalNodeError`, delegating into `WorkerError` / `ChainError` exactly like `WorkerError::error_type` from Each rate lives within one observation point + process role, so numerator and denominator emit from the same site and metric names can't be summed across points. The three low-volume decision sites are INFO; the two high-volume sites (proposal rejected, message rejected) are DEBUG. Rationale from prod (`testnet-conway`, `rate(linera_server_request_error{method_name="handle_block_proposal"}[1h])`): - Proposal rejections run at **~10/sec cluster-wide** (~1.7/sec per shard), but **~92% is routine consensus churn** — `UnexpectedBlockHeight` (4.0/s), `InvalidTimestamp` (3.7/s), `MissingCrossChainUpdate` (1.6/s). The genuinely adversarial rejections (`InvalidOwner`, `InvalidSigner`, `WrongRound`, …) sum to **<0.05/sec**. This is why kept these at DEBUG. - The message-rejection site fires **per message × per executing node** and re-runs during confirmed-block execution and catch-up sync, so a syncing node would replay the entire rejection history into Loki as a burst. At INFO these two would emit ~10/sec of almost-entirely-liveness noise, contradicting the issue's own goal of making rejection data usable. DEBUG keeps them opt-in. The per-`reason` rates are fully preserved in the metrics (`block_proposals_rejected_total{error_type}`, etc.) — the logs are just the detail layer. - `cargo clippy --all-targets --all-features -- -D warnings` — clean. - `cargo clippy --no-default-features -- -D warnings` (linera-core, linera-chain) — clean. - `cargo doc --all-features` — clean. - New unit tests covering `error_type()` delegation and prefix fallback (`LocalNodeError -> WorkerError`, `Error -> LocalNodeError`, the `ChainClientError::` / `LocalNodeError::` fallbacks) — pass. - **NOT yet verified end-to-end on a running network.** Metric registration/increments and log emission are validated by inspection and compilation only; they follow the existing metric/log patterns in these files. A local-net run confirming the counters appear on `/metrics` is still pending. - These changes follow the usual release cycle (observability only; no protocol or storage format change). Recommend backporting to the latest `testnet` branch, since that is where the rejection-visibility gap is felt operationally — though this is not a bug fix. - Closes #6459 - Builds on #5951 (`error_type` labelling for server/proxy error metrics) - [reviewer checklist](https://github.com/linera-io/linera-protocol/blob/main/CONTRIBUTING.md#reviewer-checklist) --- linera-chain/src/block_tracker.rs | 7 ++- linera-chain/src/data_types/mod.rs | 6 ++- linera-core/src/chain_worker/state.rs | 39 ++++++++++++++-- linera-core/src/client/chain_client/mod.rs | 54 +++++++++++++++++++++- linera-core/src/client/mod.rs | 14 +++++- linera-core/src/local_node.rs | 37 ++++++++++++++- 6 files changed, 148 insertions(+), 9 deletions(-) diff --git a/linera-chain/src/block_tracker.rs b/linera-chain/src/block_tracker.rs index 51f05247c2cc..a5ead8187ffd 100644 --- a/linera-chain/src/block_tracker.rs +++ b/linera-chain/src/block_tracker.rs @@ -18,7 +18,7 @@ use linera_execution::{ FLAG_FREE_REJECT, }; use linera_views::context::Context; -use tracing::instrument; +use tracing::{debug, instrument}; #[cfg(with_metrics)] use crate::chain::metrics; @@ -245,6 +245,11 @@ impl<'resources, 'blobs> BlockExecutionTracker<'resources, 'blobs> { posted_message: Box::new(posted_message.clone()), } ); + debug!( + chain_id = %self.chain_id, + origin = %incoming_bundle.origin, + "Rejecting incoming message" + ); let mut actor = ExecutionStateActor::new(chain, txn_tracker, self.resource_controller); if posted_message.is_tracked() { diff --git a/linera-chain/src/data_types/mod.rs b/linera-chain/src/data_types/mod.rs index e104cfb22747..166d44325768 100644 --- a/linera-chain/src/data_types/mod.rs +++ b/linera-chain/src/data_types/mod.rs @@ -27,7 +27,7 @@ use linera_base::{ }; use linera_execution::{committee::Committee, Message, MessageKind, Operation, OutgoingMessage}; use serde::{Deserialize, Serialize}; -use tracing::instrument; +use tracing::{info, instrument}; use crate::{ block::{Block, ValidatedBlock}, @@ -308,6 +308,10 @@ impl IncomingBundle { if self.bundle.is_skippable() { return None; } else if !self.bundle.is_protected() { + info!( + origin = %self.origin, + "Rejecting incoming message bundle due to the message policy" + ); self.action = MessageAction::Reject; } } diff --git a/linera-core/src/chain_worker/state.rs b/linera-core/src/chain_worker/state.rs index 5535f7937ec6..350ff4e71c72 100644 --- a/linera-core/src/chain_worker/state.rs +++ b/linera-core/src/chain_worker/state.rs @@ -63,9 +63,9 @@ mod metrics { use linera_base::prometheus_util::{ exponential_bucket_interval, exponential_bucket_latencies, register_histogram, - register_histogram_vec, + register_histogram_vec, register_int_counter, register_int_counter_vec, }; - use prometheus::{Histogram, HistogramVec}; + use prometheus::{Histogram, HistogramVec, IntCounter, IntCounterVec}; pub static CREATE_NETWORK_ACTIONS_LATENCY: LazyLock = LazyLock::new(|| { register_histogram( @@ -83,6 +83,21 @@ mod metrics { exponential_bucket_interval(1.0, 10_000.0), ) }); + + pub static BLOCK_PROPOSALS_RECEIVED_TOTAL: LazyLock = LazyLock::new(|| { + register_int_counter( + "block_proposals_received_total", + "Total number of block proposals received by the worker", + ) + }); + + pub static BLOCK_PROPOSALS_REJECTED_TOTAL: LazyLock = LazyLock::new(|| { + register_int_counter_vec( + "block_proposals_rejected_total", + "Total number of block proposals rejected by the worker, labelled by error type", + &["error_type"], + ) + }); } /// The state of the chain worker. @@ -2109,10 +2124,20 @@ where &mut self, proposal: BlockProposal, ) -> (Result, NetworkActions) { + #[cfg(with_metrics)] + metrics::BLOCK_PROPOSALS_RECEIVED_TOTAL.inc(); + let chain_id = proposal.content.block.chain_id; + let height = proposal.content.block.height; let old_round = self.chain.manager.current_round(); match self.try_handle_block_proposal(proposal).await { Ok((response, actions)) => (Ok(response), actions), Err(err) => { + let error_type = err.error_type(); + #[cfg(with_metrics)] + metrics::BLOCK_PROPOSALS_REJECTED_TOTAL + .with_label_values(&[error_type.as_str()]) + .inc(); + debug!(%chain_id, %height, %error_type, "Block proposal rejected"); // Even on error, the manager's `current_round` may have advanced // (the `HasIncompatibleConfirmedVote` recovery path calls // `update_signed_proposal`). Surface the resulting `NewRound` @@ -2383,7 +2408,8 @@ where metrics::NUM_INBOXES .with_label_values(&[]) .observe(origins_and_inboxes.len() as f64); - let action = if *self.chain.execution_state.system.closed.get() { + let is_closed = *self.chain.execution_state.system.closed.get(); + let action = if is_closed { MessageAction::Reject } else { MessageAction::Accept @@ -2397,6 +2423,13 @@ where }); } } + if is_closed && !bundles.is_empty() { + info!( + chain_id = %self.chain.chain_id(), + count = bundles.len(), + "Auto-rejecting all incoming message bundles because the chain is closed" + ); + } info.requested_pending_message_bundles = bundles; } let hashes = self diff --git a/linera-core/src/client/chain_client/mod.rs b/linera-core/src/client/chain_client/mod.rs index 7b1f468ed219..9beb07f7282a 100644 --- a/linera-core/src/client/chain_client/mod.rs +++ b/linera-core/src/client/chain_client/mod.rs @@ -247,7 +247,7 @@ impl Clone for ChainClient { } /// Error type for [`ChainClient`]. -#[derive(Debug, Error)] +#[derive(Debug, Error, strum::IntoStaticStr)] #[allow(missing_docs)] pub enum Error { #[error("Local node operation failed: {0}")] @@ -369,6 +369,20 @@ impl Error { pub fn signer_failure(err: impl signer::Error + 'static) -> Self { Self::Signer(Box::new(err)) } + + /// Returns the qualified error variant name for the `error_type` metric label, + /// delegating to the wrapped error's `error_type()` so the underlying worker or + /// chain error name is surfaced rather than just the outer variant. + pub fn error_type(&self) -> String { + match self { + Error::LocalNodeError(local_node_error) => local_node_error.error_type(), + Error::ChainError(chain_error) => chain_error.error_type(), + other => { + let variant: &'static str = other.into(); + format!("ChainClientError::{variant}") + } + } + } } impl ChainClient { @@ -1458,6 +1472,23 @@ impl ChainClient { #[cfg(with_metrics)] let _latency = super::metrics::EXECUTE_BLOCK_LATENCY.measure_latency(); + let result = self.try_execute_block(operations, blobs).await; + if let Err(error) = &result { + let error_type = error.error_type(); + #[cfg(with_metrics)] + super::metrics::BLOCK_STAGING_FAILURES_TOTAL + .with_label_values(&[error_type.as_str()]) + .inc(); + info!(chain_id = %self.chain_id, %error_type, "Block staging failed"); + } + result + } + + async fn try_execute_block( + &self, + operations: Vec, + blobs: Vec, + ) -> Result, Error> { let mutex = self.proposal_mutex(); let lock_start = linera_base::time::Instant::now(); let mut proposal_guard = mutex.lock_owned().await; @@ -3539,3 +3570,24 @@ impl ChainClient { .unwrap(); } } + +#[cfg(test)] +mod tests { + use super::{Error, LocalNodeError}; + + #[test] + fn error_type_delegates_to_local_node_error() { + assert_eq!( + Error::LocalNodeError(LocalNodeError::InvalidChainInfoResponse).error_type(), + "LocalNodeError::InvalidChainInfoResponse" + ); + } + + #[test] + fn error_type_falls_back_to_chain_client_variant() { + assert_eq!( + Error::WalletSynchronizationError.error_type(), + "ChainClientError::WalletSynchronizationError" + ); + } +} diff --git a/linera-core/src/client/mod.rs b/linera-core/src/client/mod.rs index 468964e11ab7..13c28c80289d 100644 --- a/linera-core/src/client/mod.rs +++ b/linera-core/src/client/mod.rs @@ -77,8 +77,10 @@ mod validator_trackers; mod metrics { use std::sync::LazyLock; - use linera_base::prometheus_util::{exponential_bucket_latencies, register_histogram_vec}; - use prometheus::HistogramVec; + use linera_base::prometheus_util::{ + exponential_bucket_latencies, register_histogram_vec, register_int_counter_vec, + }; + use prometheus::{HistogramVec, IntCounterVec}; pub static PROCESS_INBOX_WITHOUT_PREPARE_LATENCY: LazyLock = LazyLock::new(|| { @@ -125,6 +127,14 @@ mod metrics { exponential_bucket_latencies(10_000.0), ) }); + + pub static BLOCK_STAGING_FAILURES_TOTAL: LazyLock = LazyLock::new(|| { + register_int_counter_vec( + "block_staging_failures_total", + "Total number of client block staging (execute_block) failures, labelled by error type", + &["error_type"], + ) + }); } /// Default number of certificates to download in a single batch. diff --git a/linera-core/src/local_node.rs b/linera-core/src/local_node.rs index 9d870bd2d92e..d77412a48bdd 100644 --- a/linera-core/src/local_node.rs +++ b/linera-core/src/local_node.rs @@ -48,7 +48,7 @@ where } /// Error type for the operations on a local node. -#[derive(Debug, Error)] +#[derive(Debug, Error, strum::IntoStaticStr)] #[allow(missing_docs)] pub enum LocalNodeError { #[error(transparent)] @@ -73,6 +73,20 @@ pub enum LocalNodeError { EventsNotFound(Vec), } +impl LocalNodeError { + /// Returns the qualified error variant name for the `error_type` metric label, + /// delegating to [`WorkerError::error_type`] for wrapped worker errors. + pub fn error_type(&self) -> String { + match self { + LocalNodeError::WorkerError(worker_error) => worker_error.error_type(), + other => { + let variant: &'static str = other.into(); + format!("LocalNodeError::{variant}") + } + } + } +} + impl From for LocalNodeError { fn from(error: WorkerError) -> Self { match error { @@ -480,3 +494,24 @@ where Ok(self.node.state.get_manager_seed(chain_id).await?) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn error_type_delegates_to_worker_error() { + assert_eq!( + LocalNodeError::WorkerError(WorkerError::InvalidOwner).error_type(), + "WorkerError::InvalidOwner" + ); + } + + #[test] + fn error_type_falls_back_to_local_node_variant() { + assert_eq!( + LocalNodeError::InvalidChainInfoResponse.error_type(), + "LocalNodeError::InvalidChainInfoResponse" + ); + } +} From b90db49f11d080219c28eb4636e4cf95cbbc0cea Mon Sep 17 00:00:00 2001 From: Andre da Silva Date: Wed, 1 Jul 2026 18:03:35 +0000 Subject: [PATCH 2/2] Drop redundant execution-site rejection log in block_tracker --- linera-chain/src/block_tracker.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/linera-chain/src/block_tracker.rs b/linera-chain/src/block_tracker.rs index a5ead8187ffd..51f05247c2cc 100644 --- a/linera-chain/src/block_tracker.rs +++ b/linera-chain/src/block_tracker.rs @@ -18,7 +18,7 @@ use linera_execution::{ FLAG_FREE_REJECT, }; use linera_views::context::Context; -use tracing::{debug, instrument}; +use tracing::instrument; #[cfg(with_metrics)] use crate::chain::metrics; @@ -245,11 +245,6 @@ impl<'resources, 'blobs> BlockExecutionTracker<'resources, 'blobs> { posted_message: Box::new(posted_message.clone()), } ); - debug!( - chain_id = %self.chain_id, - origin = %incoming_bundle.origin, - "Rejecting incoming message" - ); let mut actor = ExecutionStateActor::new(chain, txn_tracker, self.resource_controller); if posted_message.is_tracked() {