From 6c520a08fade3a1299bff10034a2f71995bc7051 Mon Sep 17 00:00:00 2001 From: Rain Date: Wed, 11 Dec 2024 02:56:19 +0000 Subject: [PATCH 1/4] [nextest-runner] make DispatcherContext implement Clone and Debug This will help test out various state changes to the context. --- Cargo.lock | 12 ++++++++++++ Cargo.toml | 1 + nextest-runner/Cargo.toml | 1 + nextest-runner/src/runner/dispatcher.rs | 11 +++++++---- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e7a53f40259..dffcd67c251 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -750,6 +750,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "derive-where" +version = "1.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62d671cc41a825ebabc75757b62d3d168c577f9149b2d49ece1dad1f72119d25" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "dialoguer" version = "0.11.0" @@ -1808,6 +1819,7 @@ dependencies = [ "console-subscriber", "crossterm", "debug-ignore", + "derive-where", "duct", "dunce", "fixture-data", diff --git a/Cargo.toml b/Cargo.toml index 6af0e74c419..064cdd33648 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ cp_r = "0.5.2" crossterm = { version = "0.28.1", features = ["event-stream"] } dialoguer = "0.11.0" debug-ignore = "1.0.5" +derive-where = "1.2.7" duct = "0.13.7" dunce = "1.0.5" enable-ansi-support = "0.2.1" diff --git a/nextest-runner/Cargo.toml b/nextest-runner/Cargo.toml index ed41a9d6489..1212275f133 100644 --- a/nextest-runner/Cargo.toml +++ b/nextest-runner/Cargo.toml @@ -27,6 +27,7 @@ cfg-if.workspace = true chrono.workspace = true crossterm.workspace = true debug-ignore.workspace = true +derive-where.workspace = true duct.workspace = true future-queue.workspace = true futures.workspace = true diff --git a/nextest-runner/src/runner/dispatcher.rs b/nextest-runner/src/runner/dispatcher.rs index ef862f3e4cc..674c92a19ca 100644 --- a/nextest-runner/src/runner/dispatcher.rs +++ b/nextest-runner/src/runner/dispatcher.rs @@ -21,6 +21,7 @@ use crate::{ time::StopwatchStart, }; use chrono::Local; +use debug_ignore::DebugIgnore; use quick_junit::ReportUuid; use std::{ collections::BTreeMap, @@ -38,8 +39,10 @@ use tracing::debug; /// /// This struct is responsible for coordinating events from the outside world /// and communicating with the executor. +#[derive(Clone)] +#[derive_where::derive_where(Debug)] pub(super) struct DispatcherContext<'a, F> { - callback: F, + callback: DebugIgnore, run_id: ReportUuid, profile_name: String, cli_args: Vec, @@ -67,7 +70,7 @@ where max_fail: MaxFail, ) -> Self { Self { - callback, + callback: DebugIgnore(callback), run_id, stopwatch: crate::time::stopwatch(), profile_name: profile_name.to_owned(), @@ -754,7 +757,7 @@ where } } -#[derive(Debug)] +#[derive(Clone, Debug)] struct ContextSetupScript<'a> { id: ScriptId, // Store these details primarily for debugging. @@ -767,7 +770,7 @@ struct ContextSetupScript<'a> { req_tx: UnboundedSender>, } -#[derive(Debug)] +#[derive(Clone, Debug)] struct ContextTestInstance<'a> { // Store the instance primarily for debugging. #[expect(dead_code)] From c6fbc0f5672e76575ed1d068a6b39123b90431c4 Mon Sep 17 00:00:00 2001 From: Rain Date: Wed, 11 Dec 2024 03:48:18 +0000 Subject: [PATCH 2/4] [integration-tests] add an ignored tests which causes a stuck signal --- Cargo.lock | 1 + integration-tests/Cargo.toml | 1 + integration-tests/tests/integration/main.rs | 1 + .../tests/integration/stuck_signal.rs | 17 +++++++++++++++++ 4 files changed, 20 insertions(+) create mode 100644 integration-tests/tests/integration/stuck_signal.rs diff --git a/Cargo.lock b/Cargo.lock index dffcd67c251..b76e12f4fda 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1497,6 +1497,7 @@ dependencies = [ "serde_json", "sha2", "target-spec", + "tokio", "whoami", ] diff --git a/integration-tests/Cargo.toml b/integration-tests/Cargo.toml index 65a16e7ac1e..518a7a4f87f 100644 --- a/integration-tests/Cargo.toml +++ b/integration-tests/Cargo.toml @@ -52,6 +52,7 @@ nextest-metadata.workspace = true pathdiff.workspace = true regex.workspace = true target-spec.workspace = true +tokio.workspace = true # These platforms are supported by num_threads. # https://docs.rs/num_threads/0.1.7/src/num_threads/lib.rs.html#5-8 diff --git a/integration-tests/tests/integration/main.rs b/integration-tests/tests/integration/main.rs index 7f14b7b7b6e..25ef476ef1d 100644 --- a/integration-tests/tests/integration/main.rs +++ b/integration-tests/tests/integration/main.rs @@ -32,6 +32,7 @@ use std::{borrow::Cow, fs::File, io::Write}; use target_spec::Platform; mod fixtures; +mod stuck_signal; mod temp_project; use crate::temp_project::{create_uds, UdsStatus}; diff --git a/integration-tests/tests/integration/stuck_signal.rs b/integration-tests/tests/integration/stuck_signal.rs new file mode 100644 index 00000000000..b9e4f978f0c --- /dev/null +++ b/integration-tests/tests/integration/stuck_signal.rs @@ -0,0 +1,17 @@ +// Copyright (c) The nextest Contributors +// SPDX-License-Identifier: MIT OR Apache-2.0 + +//! A test that gets stuck in a signal handler. +//! +//! Meant mostly for debugging. We should also likely have a fixture which does +//! this, but it's hard to do that without pulling in extra dependencies. + +#[ignore] +#[tokio::test] +async fn test_stuck_signal() { + // This test installs a signal handler that gets stuck. Feel free to tweak + // the loop as needed. + loop { + tokio::signal::ctrl_c().await.expect("received signal"); + } +} From f11b78de65048e58f97f845e3b20e2affd7d3b2d Mon Sep 17 00:00:00 2001 From: Rain Date: Wed, 11 Dec 2024 02:51:14 +0000 Subject: [PATCH 3/4] [nextest-runner] ensure that the second signal always results in SIGKILL It was always the intention to do this, but this broke at some point :( Unbreak it and add tests. --- nextest-runner/src/reporter/aggregator.rs | 2 +- nextest-runner/src/reporter/displayer.rs | 53 +++++- nextest-runner/src/reporter/events.rs | 16 ++ nextest-runner/src/runner/dispatcher.rs | 207 +++++++++++----------- nextest-runner/src/signal.rs | 13 ++ 5 files changed, 177 insertions(+), 114 deletions(-) diff --git a/nextest-runner/src/reporter/aggregator.rs b/nextest-runner/src/reporter/aggregator.rs index c1c013b41f2..3f84be26cd7 100644 --- a/nextest-runner/src/reporter/aggregator.rs +++ b/nextest-runner/src/reporter/aggregator.rs @@ -202,7 +202,7 @@ impl<'cfg> MetadataJunit<'cfg> { // // testsuite.add_testcase(testcase); } - TestEventKind::RunBeginCancel { .. } => {} + TestEventKind::RunBeginCancel { .. } | TestEventKind::RunBeginKill { .. } => {} TestEventKind::RunFinished { run_id, start_time, diff --git a/nextest-runner/src/reporter/displayer.rs b/nextest-runner/src/reporter/displayer.rs index 9db2c8a8509..1252ca5bcf8 100644 --- a/nextest-runner/src/reporter/displayer.rs +++ b/nextest-runner/src/reporter/displayer.rs @@ -460,8 +460,9 @@ impl ProgressBarState { // continue to output to it. self.hidden_run_paused = false; } - TestEventKind::RunBeginCancel { .. } => { - progress_bar.set_prefix(progress_bar_cancel_prefix(styles)); + TestEventKind::RunBeginCancel { reason, .. } + | TestEventKind::RunBeginKill { reason, .. } => { + progress_bar.set_prefix(progress_bar_cancel_prefix(*reason, styles)); } _ => {} } @@ -540,8 +541,16 @@ impl<'a> TestReporter<'a> { } } -fn progress_bar_cancel_prefix(styles: &Styles) -> String { - format!("{:>12}", "Cancelling".style(styles.fail)) +fn progress_bar_cancel_prefix(reason: CancelReason, styles: &Styles) -> String { + let status = match reason { + CancelReason::SetupScriptFailure + | CancelReason::TestFailure + | CancelReason::ReportError + | CancelReason::Signal + | CancelReason::Interrupt => "Cancelling", + CancelReason::SecondSignal => "Killing", + }; + format!("{:>12}", status.style(styles.fail)) } fn progress_bar_prefix( @@ -549,8 +558,8 @@ fn progress_bar_prefix( cancel_reason: Option, styles: &Styles, ) -> String { - if cancel_reason.is_some() { - return progress_bar_cancel_prefix(styles); + if let Some(reason) = cancel_reason { + return progress_bar_cancel_prefix(reason, styles); } let style = if run_stats.has_failures() { @@ -972,6 +981,38 @@ impl<'a> TestReporterImpl<'a> { } writeln!(writer)?; } + TestEventKind::RunBeginKill { + setup_scripts_running, + running, + reason, + } => { + self.cancel_status = self.cancel_status.max(Some(*reason)); + + write!( + writer, + "{:>12} due to {}", + "Killing".style(self.styles.fail), + reason.to_static_str().style(self.styles.fail) + )?; + + // At the moment, we can have either setup scripts or tests running, but not both. + if *setup_scripts_running > 0 { + let s = plural::setup_scripts_str(*setup_scripts_running); + write!( + writer, + ": {} {s} still running", + setup_scripts_running.style(self.styles.count), + )?; + } else if *running > 0 { + let tests_str = plural::tests_str(*running); + write!( + writer, + ": {} {tests_str} still running", + running.style(self.styles.count), + )?; + } + writeln!(writer)?; + } TestEventKind::RunPaused { setup_scripts_running, running, diff --git a/nextest-runner/src/reporter/events.rs b/nextest-runner/src/reporter/events.rs index 89695301c66..9931671cbbc 100644 --- a/nextest-runner/src/reporter/events.rs +++ b/nextest-runner/src/reporter/events.rs @@ -257,6 +257,18 @@ pub enum TestEventKind<'a> { reason: CancelReason, }, + /// A forcible kill was requested due to receiving a signal. + RunBeginKill { + /// The number of setup scripts still running. + setup_scripts_running: usize, + + /// The number of tests still running. + running: usize, + + /// The reason this run was killed. + reason: CancelReason, + }, + /// A SIGTSTP event was received and the run was paused. RunPaused { /// The number of setup scripts running. @@ -777,6 +789,9 @@ pub enum CancelReason { /// An interrupt (on Unix, Ctrl-C) was received. Interrupt, + + /// A second signal was received, and the run is being forcibly killed. + SecondSignal, } impl CancelReason { @@ -787,6 +802,7 @@ impl CancelReason { CancelReason::ReportError => "reporting error", CancelReason::Signal => "signal", CancelReason::Interrupt => "interrupt", + CancelReason::SecondSignal => "second signal", } } } diff --git a/nextest-runner/src/runner/dispatcher.rs b/nextest-runner/src/runner/dispatcher.rs index 674c92a19ca..98dba26e6d5 100644 --- a/nextest-runner/src/runner/dispatcher.rs +++ b/nextest-runner/src/runner/dispatcher.rs @@ -635,14 +635,7 @@ where SignalEvent::Shutdown(event) => { let signal_count = self.increment_signal_count(); let req = signal_count.to_request(event); - - let cancel_reason = match event { - #[cfg(unix)] - ShutdownEvent::Hangup | ShutdownEvent::Term | ShutdownEvent::Quit => { - CancelReason::Signal - } - ShutdownEvent::Interrupt => CancelReason::Interrupt, - }; + let cancel_reason = event_to_cancel_reason(event); self.begin_cancel(cancel_reason, CancelEvent::Signal(req)) } @@ -728,8 +721,19 @@ where /// /// Returns the corresponding `HandleEventResponse`. fn begin_cancel(&mut self, reason: CancelReason, event: CancelEvent) -> HandleEventResponse { - // TODO: combine reason and event? - if self.cancel_state < Some(reason) { + // TODO: combine reason and event? The Twice block ignoring the event + // seems to indicate a data modeling issue. + if event == CancelEvent::Signal(ShutdownRequest::Twice) { + // Forcibly kill child processes in the case of a second shutdown + // signal. + self.basic_callback(TestEventKind::RunBeginKill { + setup_scripts_running: self.setup_scripts_running(), + running: self.running(), + // This is always a second signal. + reason: CancelReason::SecondSignal, + }); + HandleEventResponse::Cancel(event) + } else if self.cancel_state < Some(reason) { self.cancel_state = Some(reason); self.basic_callback(TestEventKind::RunBeginCancel { setup_scripts_running: self.setup_scripts_running(), @@ -757,6 +761,14 @@ where } } +fn event_to_cancel_reason(event: ShutdownEvent) -> CancelReason { + match event { + #[cfg(unix)] + ShutdownEvent::Hangup | ShutdownEvent::Term | ShutdownEvent::Quit => CancelReason::Signal, + ShutdownEvent::Interrupt => CancelReason::Interrupt, + } +} + #[derive(Clone, Debug)] struct ContextSetupScript<'a> { id: ScriptId, @@ -877,123 +889,104 @@ mod tests { "expected report" ); { - let events = events.lock().unwrap(); + let mut events = events.lock().unwrap(); assert_eq!(events.len(), 1, "expected 1 event"); + let event = events.pop().unwrap(); let TestEventKind::RunBeginCancel { setup_scripts_running, running, reason, - } = &events[0].kind + } = event.kind else { - panic!("expected RunBeginCancel event, found {:?}", events[0].kind); + panic!("expected RunBeginCancel event, found {:?}", event.kind); }; - assert_eq!( - *setup_scripts_running, 0, - "expected 0 setup scripts running" - ); - assert_eq!(*running, 0, "expected 0 tests running"); - assert_eq!(*reason, CancelReason::ReportError, "expected report error"); + assert_eq!(setup_scripts_running, 0, "expected 0 setup scripts running"); + assert_eq!(running, 0, "expected 0 tests running"); + assert_eq!(reason, CancelReason::ReportError, "expected report error"); } // Send another report error, ensuring it's ignored. let response = cx.handle_event(InternalEvent::ReportCancel); - assert_noop(response, &events, 1); - - // Cancel with a signal. - #[cfg(unix)] - let signal_event_count = { - let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown( - ShutdownEvent::Hangup, - ))); - assert_eq!( - response, - HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Once( - ShutdownEvent::Hangup - ))), - "expected cancellation" - ); - { - let events = events.lock().unwrap(); - assert_eq!(events.len(), 2, "expected 2 events"); - let TestEventKind::RunBeginCancel { - setup_scripts_running, - running, - reason, - } = &events[1].kind - else { - panic!("expected RunBeginCancel event, found {:?}", events[1].kind); - }; + assert_noop(response, &events); + + // The rules: + // * Any one signal will cause that signal. + // * Any two signals received will cause a SIGKILL. + // * After a signal is received, any less-important cancel-worthy events + // are ignored. + // + // Interestingly, this state machine appears to function on Windows too + // (though of course the only variant is an Interrupt so this only runs + // one iteration.) Should it be different? No compelling reason to be + // yet. + for sig1 in ShutdownEvent::ALL_VARIANTS { + for sig2 in ShutdownEvent::ALL_VARIANTS { + eprintln!("** testing {:?} -> {:?}", sig1, sig2); + // Separate test for each signal to avoid mixing up state. + let mut cx = cx.clone(); + + // First signal. + let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown(*sig1))); assert_eq!( - *setup_scripts_running, 0, - "expected 0 setup scripts running" + response, + HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Once(*sig1))), + "expected Once" ); - assert_eq!(*running, 0, "expected 0 tests running"); - assert_eq!(*reason, CancelReason::Signal, "expected signal"); - } - - // Send another signal, ensuring it's ignored. - let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown( - ShutdownEvent::Hangup, - ))); - assert_noop(response, &events, 2); - - // Send a report error, ensuring it's ignored. - let response = cx.handle_event(InternalEvent::ReportCancel); - assert_noop(response, &events, 2); + { + let mut events = events.lock().unwrap(); + assert_eq!(events.len(), 1, "expected 1 event"); + let event = events.pop().unwrap(); + let TestEventKind::RunBeginCancel { + setup_scripts_running, + running, + reason, + } = event.kind + else { + panic!("expected RunBeginCancel event, found {:?}", event.kind); + }; + assert_eq!(setup_scripts_running, 0, "expected 0 setup scripts running"); + assert_eq!(running, 0, "expected 0 tests running"); + assert_eq!(reason, event_to_cancel_reason(*sig1), "expected signal"); + } - 1 - }; - #[cfg(not(unix))] - let signal_event_count = 0; + // Another report error, ensuring it's ignored. + let response = cx.handle_event(InternalEvent::ReportCancel); + assert_noop(response, &events); - // Cancel with an interrupt. - let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown( - ShutdownEvent::Interrupt, - ))); - assert_eq!( - response, - if signal_event_count == 0 { - // On Windows, this is the first signal. - HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Once( - ShutdownEvent::Interrupt, - ))) - } else { - // On Unix, this is the second signal. - HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Twice)) - }, - "expected cancellation" - ); - { - let events = events.lock().unwrap(); - assert_eq!(events.len(), 2 + signal_event_count, "expected event count"); - let TestEventKind::RunBeginCancel { - setup_scripts_running, - running, - reason, - } = &events[1 + signal_event_count].kind - else { - panic!( - "expected RunBeginCancel event, found {:?}", - events[1 + signal_event_count].kind + // Second signal. + let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown(*sig2))); + assert_eq!( + response, + HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Twice)), + "expected kill" ); - }; - assert_eq!( - *setup_scripts_running, 0, - "expected 0 setup scripts running" - ); - assert_eq!(*running, 0, "expected 0 tests running"); - assert_eq!(*reason, CancelReason::Interrupt, "expected interrupt"); + { + let mut events = events.lock().unwrap(); + assert_eq!(events.len(), 1, "expected 1 events"); + let event = events.pop().unwrap(); + let TestEventKind::RunBeginKill { + setup_scripts_running, + running, + reason, + } = event.kind + else { + panic!("expected RunBeginKill event, found {:?}", event.kind); + }; + assert_eq!(setup_scripts_running, 0, "expected 0 setup scripts running"); + assert_eq!(running, 0, "expected 0 tests running"); + assert_eq!(reason, CancelReason::SecondSignal, "expected second signal"); + } + + // Another report error, ensuring it's ignored. + let response = cx.handle_event(InternalEvent::ReportCancel); + assert_noop(response, &events); + } } } #[track_caller] - fn assert_noop( - response: HandleEventResponse, - events: &Mutex>>, - event_count: usize, - ) { + fn assert_noop(response: HandleEventResponse, events: &Mutex>>) { assert_eq!(response, HandleEventResponse::None, "expected no response"); - let events = events.lock().unwrap(); - assert_eq!(events.len(), event_count, "expected no new events"); + assert_eq!(events.lock().unwrap().len(), 0, "expected no new events"); } } diff --git a/nextest-runner/src/signal.rs b/nextest-runner/src/signal.rs index 003bb19bea3..7a9fadd3897 100644 --- a/nextest-runner/src/signal.rs +++ b/nextest-runner/src/signal.rs @@ -238,6 +238,19 @@ pub(crate) enum ShutdownEvent { Interrupt, } +impl ShutdownEvent { + #[cfg(test)] + pub(crate) const ALL_VARIANTS: &'static [Self] = &[ + #[cfg(unix)] + Self::Hangup, + #[cfg(unix)] + Self::Term, + #[cfg(unix)] + Self::Quit, + Self::Interrupt, + ]; +} + // A signal event to query information about tests. #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub(crate) enum SignalInfoEvent { From aee7e0630ea4330c44cdc0939462528d848352db Mon Sep 17 00:00:00 2001 From: Rain Date: Wed, 11 Dec 2024 02:51:14 +0000 Subject: [PATCH 4/4] [nextest-runner] spawn a task for each test Previously we'd use the same task for all tests -- that turns out to be fine in most cases, but doesn't quite provide per-test isolation, and is slower in repos like clap that have lots of small tests. Switch to using the same task instead. It's cool that async-scoped kind of works here, but non-static tasks are such a pain to work with in general. This makes the clap tests significantly faster. On `fc55ad08`, runtime on my desktop (Ryzen 7950X/Linux) goes from 0.36 seconds to 0.24. --- nextest-runner/src/list/test_list.rs | 8 +- nextest-runner/src/runner/dispatcher.rs | 8 +- nextest-runner/src/runner/imp.rs | 117 +++++++++++++++---- nextest-runner/src/runner/internal_events.rs | 23 +++- 4 files changed, 129 insertions(+), 27 deletions(-) diff --git a/nextest-runner/src/list/test_list.rs b/nextest-runner/src/list/test_list.rs index 43636de11c5..edf3dd9918e 100644 --- a/nextest-runner/src/list/test_list.rs +++ b/nextest-runner/src/list/test_list.rs @@ -31,7 +31,7 @@ use owo_colors::OwoColorize; use std::{ collections::{BTreeMap, BTreeSet}, ffi::{OsStr, OsString}, - io, + fmt, io, path::PathBuf, sync::{Arc, OnceLock}, }; @@ -1070,6 +1070,12 @@ pub struct TestInstanceId<'a> { pub test_name: &'a str, } +impl fmt::Display for TestInstanceId<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} {}", self.binary_id, self.test_name) + } +} + /// Context required for test execution. #[derive(Clone, Debug)] pub struct TestExecuteContext<'a> { diff --git a/nextest-runner/src/runner/dispatcher.rs b/nextest-runner/src/runner/dispatcher.rs index 98dba26e6d5..23eb90a181d 100644 --- a/nextest-runner/src/runner/dispatcher.rs +++ b/nextest-runner/src/runner/dispatcher.rs @@ -7,7 +7,7 @@ //! receives events from the executor and from other inputs (e.g. signal and //! input handling), and sends events to the reporter. -use super::{RunUnitRequest, ShutdownRequest}; +use super::{RunUnitRequest, RunnerTaskState, ShutdownRequest}; use crate::{ config::{MaxFail, ScriptConfig, ScriptId}, input::{InputEvent, InputHandler}, @@ -104,7 +104,7 @@ where report_cancel_rx: oneshot::Receiver<()>, cancelled_ref: &AtomicBool, cancellation_sender: broadcast::Sender<()>, - ) { + ) -> RunnerTaskState { let mut report_cancel_rx = std::pin::pin!(report_cancel_rx); let mut signals_done = false; @@ -118,7 +118,7 @@ where Some(event) => InternalEvent::Executor(event), None => { // All runs have been completed. - break; + break RunnerTaskState::finished_no_children(); } } }, @@ -705,6 +705,8 @@ where if self.disable_signal_3_times_panic { SignalCount::Twice } else { + // TODO: a panic here won't currently lead to other + // tasks being cancelled. This should be fixed. panic!("Signaled 3 times, exiting immediately"); } } diff --git a/nextest-runner/src/runner/imp.rs b/nextest-runner/src/runner/imp.rs index 4dd59558dd6..93f263b9329 100644 --- a/nextest-runner/src/runner/imp.rs +++ b/nextest-runner/src/runner/imp.rs @@ -1,7 +1,7 @@ // Copyright (c) The nextest Contributors // SPDX-License-Identifier: MIT OR Apache-2.0 -use super::{DispatcherContext, ExecutorContext}; +use super::{DispatcherContext, ExecutorContext, RunnerTaskState}; use crate::{ config::{ EvaluatableProfile, MaxFail, RetryPolicy, SetupScriptExecuteData, TestGroup, TestThreads, @@ -30,7 +30,7 @@ use tokio::{ sync::{broadcast, mpsc::unbounded_channel, oneshot}, task::JoinError, }; -use tracing::debug; +use tracing::{debug, warn}; /// Test runner options. #[derive(Debug, Default)] @@ -298,11 +298,13 @@ impl<'a> TestRunnerInner<'a> { cancelled_ref, cancellation_sender.clone(), ); - scope.spawn_cancellable(dispatcher_fut, || ()); + scope.spawn_cancellable(dispatcher_fut, || RunnerTaskState::Cancelled); let (script_tx, mut script_rx) = unbounded_channel::>(); let script_resp_tx = resp_tx.clone(); let run_scripts_fut = async move { + // Since script tasks are run serially, we just reuse the one + // script task. let script_data = executor_cx_ref .run_setup_scripts(script_resp_tx, cancelled_ref) .await; @@ -310,8 +312,9 @@ impl<'a> TestRunnerInner<'a> { // The dispatcher has shut down, so we should too. debug!("script_tx.send failed, shutting down"); } + RunnerTaskState::finished_no_children() }; - scope.spawn_cancellable(run_scripts_fut, || ()); + scope.spawn_cancellable(run_scripts_fut, || RunnerTaskState::Cancelled); let Some(script_data) = script_rx.blocking_recv() else { // Most likely the harness is shutting down, so we should too. @@ -337,33 +340,107 @@ impl<'a> TestRunnerInner<'a> { TestGroup::Global => None, TestGroup::Custom(name) => Some(name.clone()), }; - - let fut = executor_cx_ref.run_test_instance( - test_instance, - settings, - resp_tx.clone(), - cancelled_ref, - cancellation_sender.subscribe(), - setup_script_data.clone(), - ); + let cancel_rx = cancellation_sender.subscribe(); + let resp_tx = resp_tx.clone(); + let setup_script_data = setup_script_data.clone(); + + // Use a separate Tokio task for each test. For repos with + // lots of small tests, this has been observed to be much + // faster than using a single task for all tests (what we + // used to do). It also provides some degree of per-test + // isolation. + let fut = async move { + // SAFETY: Within an outer scope_and_block (which we + // have here), scope_and_collect is safe as long as the + // returned future isn't forgotten. We're not forgetting + // it below -- we're running it to completion + // immediately. + // + // But recursive scoped calls really feel like pushing + // against the limits of async-scoped. For example, + // there's no way built into async-scoped to propagate a + // cancellation signal from the outer scope to the inner + // scope. (But there could be, right? That seems + // solvable via channels. And we could likely do our own + // channels here.) + let ((), mut ret) = unsafe { + TokioScope::scope_and_collect(move |scope| { + scope.spawn(executor_cx_ref.run_test_instance( + test_instance, + settings, + resp_tx.clone(), + cancelled_ref, + cancel_rx, + setup_script_data, + )) + }) + } + .await; + + // If no future was started, that's really strange. + // Worth at least logging. + let Some(result) = ret.pop() else { + warn!( + "no task was started for test instance: {}", + test_instance.id() + ); + return None; + }; + match result { + Ok(()) => None, + Err(join_error) => Some(join_error), + } + }; (threads_required, test_group, fut) }) - // future_queue_grouped means tests are spawned in order but returned in - // any order. + // future_queue_grouped means tests are spawned in the order + // defined, but returned in any order. .future_queue_grouped(self.test_threads, groups) - .collect::<()>(); - - scope.spawn_cancellable(run_tests_fut, || ()); + // Drop the None values. + .filter_map(std::future::ready) + .collect::>() + // Interestingly, using a more idiomatic `async move { + // run_tests_fut.await ... }` block causes Rust 1.83 to complain + // about a weird lifetime mismatch. FutureExt::map as used below + // does not. + .map(|child_join_errors| RunnerTaskState::Finished { child_join_errors }); + + scope.spawn_cancellable(run_tests_fut, || RunnerTaskState::Cancelled); }); dispatcher_cx.run_finished(); - // Were there any join errors? + // Were there any join errors in tasks? + // + // If one of the tasks panics, we likely end up stuck because the + // dispatcher, which is spawned in the same async-scoped block, doesn't + // get relayed the panic immediately. That should probably be fixed at + // some point. + let mut cancelled_count = 0; let join_errors = results .into_iter() - .filter_map(|r| r.err()) + .flat_map(|r| { + match r { + Ok(RunnerTaskState::Finished { child_join_errors }) => child_join_errors, + // Largely ignore cancelled tasks since it most likely means + // shutdown -- we don't cancel tasks manually. + Ok(RunnerTaskState::Cancelled) => { + cancelled_count += 1; + Vec::new() + } + Err(join_error) => vec![join_error], + } + }) .collect::>(); + + if cancelled_count > 0 { + debug!( + "{} tasks were cancelled -- this \ + generally should only happen due to panics", + cancelled_count + ); + } if !join_errors.is_empty() { return Err(join_errors); } diff --git a/nextest-runner/src/runner/internal_events.rs b/nextest-runner/src/runner/internal_events.rs index 35d211dcb12..9a994f224e7 100644 --- a/nextest-runner/src/runner/internal_events.rs +++ b/nextest-runner/src/runner/internal_events.rs @@ -24,9 +24,12 @@ use crate::{ }; use nextest_metadata::MismatchReason; use std::time::Duration; -use tokio::sync::{ - mpsc::{UnboundedReceiver, UnboundedSender}, - oneshot, +use tokio::{ + sync::{ + mpsc::{UnboundedReceiver, UnboundedSender}, + oneshot, + }, + task::JoinError, }; /// An internal event. @@ -230,3 +233,17 @@ pub(super) enum InternalTerminateReason { Timeout, Signal(ShutdownRequest), } + +pub(super) enum RunnerTaskState { + Finished { child_join_errors: Vec }, + Cancelled, +} + +impl RunnerTaskState { + /// Mark a runner task as finished and having not run any children. + pub(super) fn finished_no_children() -> Self { + Self::Finished { + child_join_errors: Vec::new(), + } + } +}