From c47511b0d86a9d06e90aaa0c8432e802bf7f223f Mon Sep 17 00:00:00 2001 From: Jeff Date: Wed, 15 Jan 2025 22:18:00 +0800 Subject: [PATCH] feat: don't crash relayer due to flaky RPCs (#5115) ### Description - instead of crashing relayer if any RPC calls fail during `build_mailboxes` and `build_validator_announces`, we just log the error and metric - relayer will continue running without those chains ### Drive-by changes ### Related issues Fixes https://github.com/hyperlane-xyz/hyperlane-monorepo/issues/4887 ### Backward compatibility Are these changes backward compatible? Are there any infrastructure implications, e.g. changes that would prohibit deploying older commits using this infra tooling? Yes ### Testing Added unittests to test ensure functions run correctly and log errors in metrics --------- Co-authored-by: Danil Nemirovsky Co-authored-by: Trevor Porter --- rust/main/Cargo.lock | 2 + rust/main/agents/relayer/Cargo.toml | 2 + rust/main/agents/relayer/src/relayer.rs | 297 +++++++++++++++++- rust/main/agents/relayer/src/settings/mod.rs | 2 +- .../src/metrics/agent_metrics.rs | 2 +- rust/main/hyperlane-base/src/settings/base.rs | 12 +- 6 files changed, 295 insertions(+), 22 deletions(-) diff --git a/rust/main/Cargo.lock b/rust/main/Cargo.lock index 6a6f44e6c9..da153bf564 100644 --- a/rust/main/Cargo.lock +++ b/rust/main/Cargo.lock @@ -7101,6 +7101,7 @@ dependencies = [ "dhat", "ethers", "ethers-contract", + "ethers-prometheus", "eyre", "futures", "futures-util", @@ -7126,6 +7127,7 @@ dependencies = [ "tokio-test", "tracing", "tracing-futures", + "tracing-test", "typetag", ] diff --git a/rust/main/agents/relayer/Cargo.toml b/rust/main/agents/relayer/Cargo.toml index 5a891d912c..ac0e198945 100644 --- a/rust/main/agents/relayer/Cargo.toml +++ b/rust/main/agents/relayer/Cargo.toml @@ -56,9 +56,11 @@ hyperlane-ethereum = { path = "../../chains/hyperlane-ethereum" } once_cell.workspace = true mockall.workspace = true tokio-test.workspace = true +tracing-test.workspace = true hyperlane-test = { path = "../../hyperlane-test" } hyperlane-base = { path = "../../hyperlane-base", features = ["test-utils"] } hyperlane-core = { path = "../../hyperlane-core", features = ["agent", "async", "test-utils"] } +ethers-prometheus = { path = "../../ethers-prometheus", features = ["serde"] } [features] default = ["color-eyre", "oneline-errors"] diff --git a/rust/main/agents/relayer/src/relayer.rs b/rust/main/agents/relayer/src/relayer.rs index d91117c1d7..e05549fb61 100644 --- a/rust/main/agents/relayer/src/relayer.rs +++ b/rust/main/agents/relayer/src/relayer.rs @@ -18,8 +18,8 @@ use hyperlane_base::{ }; use hyperlane_core::{ rpc_clients::call_and_retry_n_times, ChainCommunicationError, ContractSyncCursor, - HyperlaneDomain, HyperlaneMessage, InterchainGasPayment, MerkleTreeInsertion, QueueOperation, - H512, U256, + HyperlaneDomain, HyperlaneMessage, InterchainGasPayment, Mailbox, MerkleTreeInsertion, + QueueOperation, ValidatorAnnounce, H512, U256, }; use tokio::{ sync::{ @@ -135,12 +135,10 @@ impl BaseAgent for Relayer { .map(|origin| (origin.clone(), HyperlaneRocksDB::new(origin, db.clone()))) .collect::>(); - let mailboxes = settings - .build_mailboxes(settings.destination_chains.iter(), &core_metrics) - .await?; - let validator_announces = settings - .build_validator_announces(settings.origin_chains.iter(), &core_metrics) - .await?; + let mailboxes = Self::build_mailboxes(&settings, &core_metrics, &chain_metrics).await; + + let validator_announces = + Self::build_validator_announces(&settings, &core_metrics, &chain_metrics).await; let contract_sync_metrics = Arc::new(ContractSyncMetrics::new(&core_metrics)); @@ -236,7 +234,9 @@ impl BaseAgent for Relayer { let mut msg_ctxs = HashMap::new(); let mut destination_chains = HashMap::new(); - for destination in &settings.destination_chains { + + // only iterate through destination chains that were successfully instantiated + for (destination, dest_mailbox) in mailboxes.iter() { let destination_chain_setup = core.settings.chain_setup(destination).unwrap().clone(); destination_chains.insert(destination.clone(), destination_chain_setup.clone()); let transaction_gas_limit: Option = @@ -246,18 +246,19 @@ impl BaseAgent for Relayer { transaction_gas_limit }; - for origin in &settings.origin_chains { + // only iterate through origin chains that were successfully instantiated + for (origin, validator_announce) in validator_announces.iter() { let db = dbs.get(origin).unwrap().clone(); let metadata_builder = BaseMetadataBuilder::new( origin.clone(), destination_chain_setup.clone(), prover_syncs[origin].clone(), - validator_announces[origin].clone(), + validator_announce.clone(), settings.allow_local_checkpoint_syncers, core.metrics.clone(), db, IsmAwareAppContextClassifier::new( - mailboxes[destination].clone(), + dest_mailbox.clone(), settings.metric_app_contexts.clone(), ), ); @@ -268,7 +269,7 @@ impl BaseAgent for Relayer { destination: destination.id(), }, Arc::new(MessageContext { - destination_mailbox: mailboxes[destination].clone(), + destination_mailbox: dest_mailbox.clone(), origin_db: dbs.get(origin).unwrap().clone(), metadata_builder: Arc::new(metadata_builder), origin_gas_payment_enforcer: gas_payment_enforcers[origin].clone(), @@ -620,7 +621,275 @@ impl Relayer { })) .instrument(span) } + + /// Helper function to build and return a hashmap of mailboxes. + /// Any chains that fail to build mailbox will not be included + /// in the hashmap. Errors will be logged and chain metrics + /// will be updated for chains that fail to build mailbox. + pub async fn build_mailboxes( + settings: &RelayerSettings, + core_metrics: &CoreMetrics, + chain_metrics: &ChainMetrics, + ) -> HashMap> { + settings + .build_mailboxes(settings.destination_chains.iter(), core_metrics) + .await + .into_iter() + .filter_map(|(origin, mailbox_res)| match mailbox_res { + Ok(mailbox) => Some((origin, mailbox)), + Err(err) => { + error!(?err, origin=?origin, "Critical error when building mailbox"); + chain_metrics.set_critical_error(origin.name(), true); + None + } + }) + .collect() + } + + /// Helper function to build and return a hashmap of validator announces. + /// Any chains that fail to build validator announce will not be included + /// in the hashmap. Errors will be logged and chain metrics + /// will be updated for chains that fail to build validator announce. + pub async fn build_validator_announces( + settings: &RelayerSettings, + core_metrics: &CoreMetrics, + chain_metrics: &ChainMetrics, + ) -> HashMap> { + settings + .build_validator_announces(settings.origin_chains.iter(), core_metrics) + .await + .into_iter() + .filter_map(|(origin, mailbox_res)| match mailbox_res { + Ok(mailbox) => Some((origin, mailbox)), + Err(err) => { + error!(?err, origin=?origin, "Critical error when building validator announce"); + chain_metrics.set_critical_error(origin.name(), true); + None + } + }) + .collect() + } } #[cfg(test)] -mod test {} +mod test { + use std::{ + collections::{HashMap, HashSet}, + path::PathBuf, + }; + + use crate::settings::{matching_list::MatchingList, RelayerSettings}; + use ethers::utils::hex; + use ethers_prometheus::middleware::PrometheusMiddlewareConf; + use hyperlane_base::{ + settings::{ + ChainConf, ChainConnectionConf, CoreContractAddresses, IndexSettings, Settings, + TracingConfig, + }, + ChainMetrics, CoreMetrics, BLOCK_HEIGHT_HELP, BLOCK_HEIGHT_LABELS, CRITICAL_ERROR_HELP, + CRITICAL_ERROR_LABELS, + }; + use hyperlane_core::{ + config::OperationBatchConfig, HyperlaneDomain, IndexMode, KnownHyperlaneDomain, + ReorgPeriod, H256, + }; + use hyperlane_ethereum as h_eth; + use prometheus::{opts, IntGaugeVec, Registry}; + use reqwest::Url; + + use super::Relayer; + + /// Builds a test RelayerSetting + fn generate_test_relayer_settings() -> RelayerSettings { + let chains = [( + "arbitrum".to_string(), + ChainConf { + domain: HyperlaneDomain::Known(KnownHyperlaneDomain::Arbitrum), + signer: None, + reorg_period: ReorgPeriod::None, + addresses: CoreContractAddresses { + mailbox: H256::from_slice( + hex::decode( + "000000000000000000000000598facE78a4302f11E3de0bee1894Da0b2Cb71F8", + ) + .unwrap() + .as_slice(), + ), + interchain_gas_paymaster: H256::from_slice( + hex::decode( + "000000000000000000000000c756cFc1b7d0d4646589EDf10eD54b201237F5e8", + ) + .unwrap() + .as_slice(), + ), + validator_announce: H256::from_slice( + hex::decode( + "0000000000000000000000001b33611fCc073aB0737011d5512EF673Bff74962", + ) + .unwrap() + .as_slice(), + ), + merkle_tree_hook: H256::from_slice( + hex::decode( + "000000000000000000000000AD34A66Bf6dB18E858F6B686557075568c6E031C", + ) + .unwrap() + .as_slice(), + ), + }, + connection: ChainConnectionConf::Ethereum(h_eth::ConnectionConf { + rpc_connection: h_eth::RpcConnectionConf::Http { + url: Url::parse("https://sepolia-rollup.arbitrum.io/rpc").unwrap(), + }, + transaction_overrides: h_eth::TransactionOverrides { + gas_price: None, + gas_limit: None, + max_fee_per_gas: None, + max_priority_fee_per_gas: None, + }, + operation_batch: OperationBatchConfig { + batch_contract_address: None, + max_batch_size: 1, + }, + }), + metrics_conf: PrometheusMiddlewareConf { + contracts: HashMap::new(), + chain: None, + }, + index: IndexSettings { + from: 0, + chunk_size: 1, + mode: IndexMode::Block, + }, + }, + )]; + + RelayerSettings { + base: Settings { + chains: chains.into_iter().collect(), + metrics_port: 5000, + tracing: TracingConfig::default(), + }, + db: PathBuf::new(), + origin_chains: [ + HyperlaneDomain::Known(KnownHyperlaneDomain::Arbitrum), + HyperlaneDomain::Known(KnownHyperlaneDomain::Ethereum), + HyperlaneDomain::Known(KnownHyperlaneDomain::Optimism), + ] + .into_iter() + .collect(), + destination_chains: [ + HyperlaneDomain::Known(KnownHyperlaneDomain::Arbitrum), + HyperlaneDomain::Known(KnownHyperlaneDomain::Ethereum), + HyperlaneDomain::Known(KnownHyperlaneDomain::Optimism), + ] + .into_iter() + .collect(), + gas_payment_enforcement: Vec::new(), + whitelist: MatchingList::default(), + blacklist: MatchingList::default(), + address_blacklist: Vec::new(), + transaction_gas_limit: None, + skip_transaction_gas_limit_for: HashSet::new(), + allow_local_checkpoint_syncers: true, + metric_app_contexts: Vec::new(), + } + } + + #[tokio::test] + #[tracing_test::traced_test] + async fn test_failed_build_mailboxes() { + let settings = generate_test_relayer_settings(); + + let registry = Registry::new(); + let core_metrics = CoreMetrics::new("relayer", 4000, registry).unwrap(); + let chain_metrics = ChainMetrics { + block_height: IntGaugeVec::new( + opts!("block_height", BLOCK_HEIGHT_HELP), + BLOCK_HEIGHT_LABELS, + ) + .unwrap(), + gas_price: None, + critical_error: IntGaugeVec::new( + opts!("critical_error", CRITICAL_ERROR_HELP), + CRITICAL_ERROR_LABELS, + ) + .unwrap(), + }; + + let mailboxes = Relayer::build_mailboxes(&settings, &core_metrics, &chain_metrics).await; + + assert_eq!(mailboxes.len(), 1); + assert!(mailboxes.contains_key(&HyperlaneDomain::Known(KnownHyperlaneDomain::Arbitrum))); + + // Arbitrum chain should not have any errors because it's ChainConf exists + let metric = chain_metrics + .critical_error + .get_metric_with_label_values(&["arbitrum"]) + .unwrap(); + assert_eq!(metric.get(), 0); + + // Ethereum chain should error because it is missing ChainConf + let metric = chain_metrics + .critical_error + .get_metric_with_label_values(&["ethereum"]) + .unwrap(); + assert_eq!(metric.get(), 1); + + // Optimism chain should error because it is missing ChainConf + let metric = chain_metrics + .critical_error + .get_metric_with_label_values(&["optimism"]) + .unwrap(); + assert_eq!(metric.get(), 1); + } + + #[tokio::test] + #[tracing_test::traced_test] + async fn test_failed_build_validator_announces() { + let settings = generate_test_relayer_settings(); + + let registry = Registry::new(); + let core_metrics = CoreMetrics::new("relayer", 4000, registry).unwrap(); + let chain_metrics = ChainMetrics { + block_height: IntGaugeVec::new( + opts!("block_height", BLOCK_HEIGHT_HELP), + BLOCK_HEIGHT_LABELS, + ) + .unwrap(), + gas_price: None, + critical_error: IntGaugeVec::new( + opts!("critical_error", CRITICAL_ERROR_HELP), + CRITICAL_ERROR_LABELS, + ) + .unwrap(), + }; + + let mailboxes = + Relayer::build_validator_announces(&settings, &core_metrics, &chain_metrics).await; + + assert_eq!(mailboxes.len(), 1); + assert!(mailboxes.contains_key(&HyperlaneDomain::Known(KnownHyperlaneDomain::Arbitrum))); + + // Arbitrum chain should not have any errors because it's ChainConf exists + let metric = chain_metrics + .critical_error + .get_metric_with_label_values(&["arbitrum"]) + .unwrap(); + assert_eq!(metric.get(), 0); + + // Ethereum chain should error because it is missing ChainConf + let metric = chain_metrics + .critical_error + .get_metric_with_label_values(&["ethereum"]) + .unwrap(); + assert_eq!(metric.get(), 1); + + // Optimism chain should error because it is missing ChainConf + let metric = chain_metrics + .critical_error + .get_metric_with_label_values(&["optimism"]) + .unwrap(); + assert_eq!(metric.get(), 1); + } +} diff --git a/rust/main/agents/relayer/src/settings/mod.rs b/rust/main/agents/relayer/src/settings/mod.rs index 2d3d9375de..a8a5c1ca4c 100644 --- a/rust/main/agents/relayer/src/settings/mod.rs +++ b/rust/main/agents/relayer/src/settings/mod.rs @@ -33,7 +33,7 @@ pub struct RelayerSettings { #[as_mut] #[deref] #[deref_mut] - base: Settings, + pub base: Settings, /// Database path pub db: PathBuf, diff --git a/rust/main/hyperlane-base/src/metrics/agent_metrics.rs b/rust/main/hyperlane-base/src/metrics/agent_metrics.rs index ad1320488f..2f4862d4aa 100644 --- a/rust/main/hyperlane-base/src/metrics/agent_metrics.rs +++ b/rust/main/hyperlane-base/src/metrics/agent_metrics.rs @@ -94,7 +94,7 @@ pub struct ChainMetrics { pub gas_price: Option, /// Boolean marker for critical errors on a chain, signalling loss of liveness. - critical_error: IntGaugeVec, + pub critical_error: IntGaugeVec, } impl ChainMetrics { diff --git a/rust/main/hyperlane-base/src/settings/base.rs b/rust/main/hyperlane-base/src/settings/base.rs index 7024d67cc6..d298a3c94f 100644 --- a/rust/main/hyperlane-base/src/settings/base.rs +++ b/rust/main/hyperlane-base/src/settings/base.rs @@ -1,9 +1,9 @@ use std::{collections::HashMap, fmt::Debug, hash::Hash, sync::Arc}; use eyre::{eyre, Context, Result}; -use futures_util::future::try_join_all; +use futures_util::future::join_all; use hyperlane_core::{ - HyperlaneChain, HyperlaneDomain, HyperlaneLogStore, HyperlaneProvider, + HyperlaneDomain, HyperlaneLogStore, HyperlaneProvider, HyperlaneSequenceAwareIndexerStoreReader, HyperlaneWatermarkedLogStore, InterchainGasPaymaster, Mailbox, MerkleTreeHook, MultisigIsm, SequenceAwareIndexer, ValidatorAnnounce, H256, }; @@ -132,11 +132,11 @@ macro_rules! build_contract_fns { &self, domains: impl Iterator, metrics: &CoreMetrics, - ) -> Result>> { - try_join_all(domains.map(|d| self.$singular(d, metrics))) - .await? + ) -> HashMap>> { + join_all(domains.map(|d| async { (d.clone(), self.$singular(d, metrics).await) })) + .await .into_iter() - .map(|i| Ok((i.domain().clone(), Arc::from(i)))) + .map(|(d, future)| (d, future.map(|f| Arc::from(f)))) .collect() } };