Skip to content

Commit c055656

Browse files
authored
Merge pull request #1717 from tursodatabase/deadlock-watcher
introduce deadlock monitor
2 parents 73a2913 + daca1c7 commit c055656

2 files changed

Lines changed: 63 additions & 0 deletions

File tree

libsql-server/src/lib.rs

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ pub struct Server<C = HttpConnector, A = AddrIncoming, D = HttpsConnector<HttpCo
176176
pub storage_server_address: String,
177177
pub connector: Option<D>,
178178
pub migrate_bottomless: bool,
179+
pub enable_deadlock_monitor: bool,
179180
}
180181

181182
impl<C, A, D> Default for Server<C, A, D> {
@@ -201,6 +202,7 @@ impl<C, A, D> Default for Server<C, A, D> {
201202
storage_server_address: Default::default(),
202203
connector: None,
203204
migrate_bottomless: false,
205+
enable_deadlock_monitor: false,
204206
}
205207
}
206208
}
@@ -410,6 +412,57 @@ fn init_version_file(db_path: &Path) -> anyhow::Result<()> {
410412
Ok(())
411413
}
412414

415+
/// The deadlock watcher monitors the main tokio runtime for deadlock by sending Ping to a task
416+
/// within it, and waiting for pongs. If the runtime fails to respond in due time, the watcher
417+
/// exits the process.
418+
fn install_deadlock_monitor() {
419+
// this is a very generous deadline for the main runtime to respond
420+
const PONG_DEADLINE: Duration = Duration::from_secs(5);
421+
422+
struct Ping;
423+
struct Pong;
424+
425+
let (sender, mut receiver) = tokio::sync::mpsc::channel(1);
426+
427+
std::thread::spawn(move || {
428+
let rt = tokio::runtime::Builder::new_current_thread()
429+
.enable_time()
430+
.build()
431+
.unwrap();
432+
rt.block_on(async move {
433+
loop {
434+
let (snd, ret) = tokio::sync::oneshot::channel();
435+
sender.try_send((snd, Ping)).unwrap();
436+
match tokio::time::timeout(PONG_DEADLINE, ret).await {
437+
Ok(Ok(Pong)) => (),
438+
Err(_) => {
439+
tracing::error!(
440+
"main runtime failed to respond within deadlines, deadlock detected"
441+
);
442+
// std::process::exit(1);
443+
}
444+
_ => (),
445+
}
446+
447+
tokio::time::sleep(Duration::from_secs(1)).await;
448+
}
449+
})
450+
});
451+
452+
tokio::spawn(async move {
453+
loop {
454+
match receiver.recv().await {
455+
Some((ret, Ping)) => {
456+
let _ = ret.send(Pong);
457+
}
458+
None => break,
459+
}
460+
}
461+
462+
tracing::warn!("deadlock monitor exited")
463+
});
464+
}
465+
413466
impl<C, A, D> Server<C, A, D>
414467
where
415468
C: Connector,
@@ -501,6 +554,11 @@ where
501554
static INIT: std::sync::Once = std::sync::Once::new();
502555
let mut task_manager = TaskManager::new();
503556

557+
if self.enable_deadlock_monitor {
558+
install_deadlock_monitor();
559+
tracing::info!("deadlock monitor installed");
560+
}
561+
504562
if std::env::var("LIBSQL_SQLITE_MIMALLOC").is_ok() {
505563
setup_sqlite_alloc();
506564
}

libsql-server/src/main.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,10 @@ struct Cli {
272272
requires = "enable_bottomless_replication"
273273
)]
274274
migrate_bottomless: bool,
275+
276+
/// Enables the main runtime deadlock monitor: if the main runtime deadlocks, logs an error
277+
#[clap(long)]
278+
enable_deadlock_monitor: bool,
275279
}
276280

277281
#[derive(clap::Subcommand, Debug)]
@@ -671,6 +675,7 @@ async fn build_server(config: &Cli) -> anyhow::Result<Server> {
671675
storage_server_address: config.storage_server_address.clone(),
672676
connector: Some(https),
673677
migrate_bottomless: config.migrate_bottomless,
678+
enable_deadlock_monitor: config.enable_deadlock_monitor,
674679
})
675680
}
676681

0 commit comments

Comments
 (0)