From daca1c7466956eefd0b49d3b5ab687b9f1b4f90c Mon Sep 17 00:00:00 2001
From: ad hoc
Date: Mon, 2 Sep 2024 14:18:47 +0200
Subject: [PATCH] introduce deadlock monitor

---
 libsql-server/src/lib.rs  | 73 +++++++++++++++++++++++++++++++++++++++
 libsql-server/src/main.rs |  5 +++
 2 files changed, 78 insertions(+)

diff --git a/libsql-server/src/lib.rs b/libsql-server/src/lib.rs
index 971879bfd6..480d110d67 100644
--- a/libsql-server/src/lib.rs
+++ b/libsql-server/src/lib.rs
@@ -176,6 +176,7 @@ pub struct Server,
     pub migrate_bottomless: bool,
+    pub enable_deadlock_monitor: bool,
 }
 
 impl Default for Server {
@@ -201,6 +202,7 @@ impl Default for Server {
             storage_server_address: Default::default(),
             connector: None,
             migrate_bottomless: false,
+            enable_deadlock_monitor: false,
         }
     }
 }
@@ -410,6 +412,72 @@ fn init_version_file(db_path: &Path) -> anyhow::Result<()> {
     Ok(())
 }
 
+/// The deadlock watcher monitors the main tokio runtime for deadlocks by sending a `Ping` to a
+/// task within it, and waiting for the matching `Pong`. If the runtime fails to respond in due
+/// time, the watcher logs an error; it does not terminate the process.
+///
+/// The watcher runs on its own thread, driven by a dedicated current-thread runtime, so that it
+/// keeps making progress while the monitored runtime is stuck. The responder task is spawned
+/// with `tokio::spawn`, so this function must be called from within the runtime to monitor.
+fn install_deadlock_monitor() {
+    use tokio::sync::mpsc::error::TrySendError;
+
+    // this is a very generous deadline for the main runtime to respond
+    const PONG_DEADLINE: Duration = Duration::from_secs(5);
+    // pause between the end of a check and the next ping
+    const PING_INTERVAL: Duration = Duration::from_secs(1);
+
+    struct Ping;
+    struct Pong;
+
+    let (sender, mut receiver) = tokio::sync::mpsc::channel(1);
+
+    std::thread::spawn(move || {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_time()
+            .build()
+            .expect("failed to build the deadlock monitor runtime");
+        rt.block_on(async move {
+            loop {
+                let (snd, ret) = tokio::sync::oneshot::channel();
+                match sender.try_send((snd, Ping)) {
+                    Ok(()) => match tokio::time::timeout(PONG_DEADLINE, ret).await {
+                        Ok(Ok(Pong)) => (),
+                        // the ping was dropped unanswered: the responder task is gone
+                        Ok(Err(_)) => break,
+                        Err(_) => {
+                            tracing::error!(
+                                "main runtime failed to respond within deadlines, deadlock detected"
+                            );
+                        }
+                    },
+                    // A ping that already missed its deadline is still queued, so the main
+                    // runtime has not recovered yet. Unwrapping here would panic and silently
+                    // kill the monitor right after its first report.
+                    Err(TrySendError::Full(_)) => {
+                        tracing::error!("main runtime is still unresponsive, deadlock detected");
+                    }
+                    // the responder task is gone, there is nothing left to monitor
+                    Err(TrySendError::Closed(_)) => break,
+                }
+
+                tokio::time::sleep(PING_INTERVAL).await;
+            }
+
+            tracing::warn!("deadlock watcher exited")
+        })
+    });
+
+    tokio::spawn(async move {
+        while let Some((ret, Ping)) = receiver.recv().await {
+            // the watcher may have given up on this ping already: a failed send is harmless
+            let _ = ret.send(Pong);
+        }
+
+        tracing::warn!("deadlock monitor exited")
+    });
+}
+
 impl Server
 where
     C: Connector,
@@ -501,6 +569,11 @@ where
         static INIT: std::sync::Once = std::sync::Once::new();
         let mut task_manager = TaskManager::new();
 
+        if self.enable_deadlock_monitor {
+            install_deadlock_monitor();
+            tracing::info!("deadlock monitor installed");
+        }
+
         if std::env::var("LIBSQL_SQLITE_MIMALLOC").is_ok() {
             setup_sqlite_alloc();
         }
diff --git a/libsql-server/src/main.rs b/libsql-server/src/main.rs
index 48e42e73e0..038d34b22d 100644
--- a/libsql-server/src/main.rs
+++ b/libsql-server/src/main.rs
@@ -272,6 +272,10 @@ struct Cli {
         requires = "enable_bottomless_replication"
     )]
     migrate_bottomless: bool,
+
+    /// Enables the main runtime deadlock monitor: if the main runtime deadlocks, logs an error
+    #[clap(long)]
+    enable_deadlock_monitor: bool,
 }
 
 #[derive(clap::Subcommand, Debug)]
@@ -671,6 +675,7 @@ async fn build_server(config: &Cli) -> anyhow::Result {
         storage_server_address: config.storage_server_address.clone(),
         connector: Some(https),
         migrate_bottomless: config.migrate_bottomless,
+        enable_deadlock_monitor: config.enable_deadlock_monitor,
     })
 }