From 6c520a08fade3a1299bff10034a2f71995bc7051 Mon Sep 17 00:00:00 2001
From: Rain <rain@sunshowers.io>
Date: Wed, 11 Dec 2024 02:56:19 +0000
Subject: [PATCH 1/4] [nextest-runner] make DispatcherContext implement Clone
 and Debug

This will help test out various state changes to the context.
---
 Cargo.lock                              | 12 ++++++++++++
 Cargo.toml                              |  1 +
 nextest-runner/Cargo.toml               |  1 +
 nextest-runner/src/runner/dispatcher.rs | 11 +++++++----
 4 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index e7a53f40259..dffcd67c251 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -750,6 +750,17 @@ dependencies = [
  "zeroize",
 ]
 
+[[package]]
+name = "derive-where"
+version = "1.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62d671cc41a825ebabc75757b62d3d168c577f9149b2d49ece1dad1f72119d25"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "dialoguer"
 version = "0.11.0"
@@ -1808,6 +1819,7 @@ dependencies = [
  "console-subscriber",
  "crossterm",
  "debug-ignore",
+ "derive-where",
  "duct",
  "dunce",
  "fixture-data",
diff --git a/Cargo.toml b/Cargo.toml
index 6af0e74c419..064cdd33648 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -49,6 +49,7 @@ cp_r = "0.5.2"
 crossterm = { version = "0.28.1", features = ["event-stream"] }
 dialoguer = "0.11.0"
 debug-ignore = "1.0.5"
+derive-where = "1.2.7"
 duct = "0.13.7"
 dunce = "1.0.5"
 enable-ansi-support = "0.2.1"
diff --git a/nextest-runner/Cargo.toml b/nextest-runner/Cargo.toml
index ed41a9d6489..1212275f133 100644
--- a/nextest-runner/Cargo.toml
+++ b/nextest-runner/Cargo.toml
@@ -27,6 +27,7 @@ cfg-if.workspace = true
 chrono.workspace = true
 crossterm.workspace = true
 debug-ignore.workspace = true
+derive-where.workspace = true
 duct.workspace = true
 future-queue.workspace = true
 futures.workspace = true
diff --git a/nextest-runner/src/runner/dispatcher.rs b/nextest-runner/src/runner/dispatcher.rs
index ef862f3e4cc..674c92a19ca 100644
--- a/nextest-runner/src/runner/dispatcher.rs
+++ b/nextest-runner/src/runner/dispatcher.rs
@@ -21,6 +21,7 @@ use crate::{
     time::StopwatchStart,
 };
 use chrono::Local;
+use debug_ignore::DebugIgnore;
 use quick_junit::ReportUuid;
 use std::{
     collections::BTreeMap,
@@ -38,8 +39,10 @@ use tracing::debug;
 ///
 /// This struct is responsible for coordinating events from the outside world
 /// and communicating with the executor.
+#[derive(Clone)]
+#[derive_where::derive_where(Debug)]
 pub(super) struct DispatcherContext<'a, F> {
-    callback: F,
+    callback: DebugIgnore<F>,
     run_id: ReportUuid,
     profile_name: String,
     cli_args: Vec<String>,
@@ -67,7 +70,7 @@ where
         max_fail: MaxFail,
     ) -> Self {
         Self {
-            callback,
+            callback: DebugIgnore(callback),
             run_id,
             stopwatch: crate::time::stopwatch(),
             profile_name: profile_name.to_owned(),
@@ -754,7 +757,7 @@ where
     }
 }
 
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 struct ContextSetupScript<'a> {
     id: ScriptId,
     // Store these details primarily for debugging.
@@ -767,7 +770,7 @@ struct ContextSetupScript<'a> {
     req_tx: UnboundedSender<RunUnitRequest<'a>>,
 }
 
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 struct ContextTestInstance<'a> {
     // Store the instance primarily for debugging.
     #[expect(dead_code)]

From c6fbc0f5672e76575ed1d068a6b39123b90431c4 Mon Sep 17 00:00:00 2001
From: Rain <rain@sunshowers.io>
Date: Wed, 11 Dec 2024 03:48:18 +0000
Subject: [PATCH 2/4] [integration-tests] add an ignored tests which causes a
 stuck signal

---
 Cargo.lock                                      |  1 +
 integration-tests/Cargo.toml                    |  1 +
 integration-tests/tests/integration/main.rs     |  1 +
 .../tests/integration/stuck_signal.rs           | 17 +++++++++++++++++
 4 files changed, 20 insertions(+)
 create mode 100644 integration-tests/tests/integration/stuck_signal.rs

diff --git a/Cargo.lock b/Cargo.lock
index dffcd67c251..b76e12f4fda 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1497,6 +1497,7 @@ dependencies = [
  "serde_json",
  "sha2",
  "target-spec",
+ "tokio",
  "whoami",
 ]
 
diff --git a/integration-tests/Cargo.toml b/integration-tests/Cargo.toml
index 65a16e7ac1e..518a7a4f87f 100644
--- a/integration-tests/Cargo.toml
+++ b/integration-tests/Cargo.toml
@@ -52,6 +52,7 @@ nextest-metadata.workspace = true
 pathdiff.workspace = true
 regex.workspace = true
 target-spec.workspace = true
+tokio.workspace = true
 
 # These platforms are supported by num_threads.
 # https://docs.rs/num_threads/0.1.7/src/num_threads/lib.rs.html#5-8
diff --git a/integration-tests/tests/integration/main.rs b/integration-tests/tests/integration/main.rs
index 7f14b7b7b6e..25ef476ef1d 100644
--- a/integration-tests/tests/integration/main.rs
+++ b/integration-tests/tests/integration/main.rs
@@ -32,6 +32,7 @@ use std::{borrow::Cow, fs::File, io::Write};
 use target_spec::Platform;
 
 mod fixtures;
+mod stuck_signal;
 mod temp_project;
 
 use crate::temp_project::{create_uds, UdsStatus};
diff --git a/integration-tests/tests/integration/stuck_signal.rs b/integration-tests/tests/integration/stuck_signal.rs
new file mode 100644
index 00000000000..b9e4f978f0c
--- /dev/null
+++ b/integration-tests/tests/integration/stuck_signal.rs
@@ -0,0 +1,17 @@
+// Copyright (c) The nextest Contributors
+// SPDX-License-Identifier: MIT OR Apache-2.0
+
+//! A test that gets stuck in a signal handler.
+//!
+//! Meant mostly for debugging. We should also likely have a fixture which does
+//! this, but it's hard to do that without pulling in extra dependencies.
+
+#[ignore]
+#[tokio::test]
+async fn test_stuck_signal() {
+    // This test installs a signal handler that gets stuck. Feel free to tweak
+    // the loop as needed.
+    loop {
+        tokio::signal::ctrl_c().await.expect("received signal");
+    }
+}

From f11b78de65048e58f97f845e3b20e2affd7d3b2d Mon Sep 17 00:00:00 2001
From: Rain <rain@sunshowers.io>
Date: Wed, 11 Dec 2024 02:51:14 +0000
Subject: [PATCH 3/4] [nextest-runner] ensure that the second signal always
 results in SIGKILL

It was always the intention to do this, but this broke at some point :( Unbreak
it and add tests.
---
 nextest-runner/src/reporter/aggregator.rs |   2 +-
 nextest-runner/src/reporter/displayer.rs  |  53 +++++-
 nextest-runner/src/reporter/events.rs     |  16 ++
 nextest-runner/src/runner/dispatcher.rs   | 207 +++++++++++-----------
 nextest-runner/src/signal.rs              |  13 ++
 5 files changed, 177 insertions(+), 114 deletions(-)

diff --git a/nextest-runner/src/reporter/aggregator.rs b/nextest-runner/src/reporter/aggregator.rs
index c1c013b41f2..3f84be26cd7 100644
--- a/nextest-runner/src/reporter/aggregator.rs
+++ b/nextest-runner/src/reporter/aggregator.rs
@@ -202,7 +202,7 @@ impl<'cfg> MetadataJunit<'cfg> {
                 //
                 // testsuite.add_testcase(testcase);
             }
-            TestEventKind::RunBeginCancel { .. } => {}
+            TestEventKind::RunBeginCancel { .. } | TestEventKind::RunBeginKill { .. } => {}
             TestEventKind::RunFinished {
                 run_id,
                 start_time,
diff --git a/nextest-runner/src/reporter/displayer.rs b/nextest-runner/src/reporter/displayer.rs
index 9db2c8a8509..1252ca5bcf8 100644
--- a/nextest-runner/src/reporter/displayer.rs
+++ b/nextest-runner/src/reporter/displayer.rs
@@ -460,8 +460,9 @@ impl ProgressBarState {
                 // continue to output to it.
                 self.hidden_run_paused = false;
             }
-            TestEventKind::RunBeginCancel { .. } => {
-                progress_bar.set_prefix(progress_bar_cancel_prefix(styles));
+            TestEventKind::RunBeginCancel { reason, .. }
+            | TestEventKind::RunBeginKill { reason, .. } => {
+                progress_bar.set_prefix(progress_bar_cancel_prefix(*reason, styles));
             }
             _ => {}
         }
@@ -540,8 +541,16 @@ impl<'a> TestReporter<'a> {
     }
 }
 
-fn progress_bar_cancel_prefix(styles: &Styles) -> String {
-    format!("{:>12}", "Cancelling".style(styles.fail))
+fn progress_bar_cancel_prefix(reason: CancelReason, styles: &Styles) -> String {
+    let status = match reason {
+        CancelReason::SetupScriptFailure
+        | CancelReason::TestFailure
+        | CancelReason::ReportError
+        | CancelReason::Signal
+        | CancelReason::Interrupt => "Cancelling",
+        CancelReason::SecondSignal => "Killing",
+    };
+    format!("{:>12}", status.style(styles.fail))
 }
 
 fn progress_bar_prefix(
@@ -549,8 +558,8 @@ fn progress_bar_prefix(
     cancel_reason: Option<CancelReason>,
     styles: &Styles,
 ) -> String {
-    if cancel_reason.is_some() {
-        return progress_bar_cancel_prefix(styles);
+    if let Some(reason) = cancel_reason {
+        return progress_bar_cancel_prefix(reason, styles);
     }
 
     let style = if run_stats.has_failures() {
@@ -972,6 +981,38 @@ impl<'a> TestReporterImpl<'a> {
                 }
                 writeln!(writer)?;
             }
+            TestEventKind::RunBeginKill {
+                setup_scripts_running,
+                running,
+                reason,
+            } => {
+                self.cancel_status = self.cancel_status.max(Some(*reason));
+
+                write!(
+                    writer,
+                    "{:>12} due to {}",
+                    "Killing".style(self.styles.fail),
+                    reason.to_static_str().style(self.styles.fail)
+                )?;
+
+                // At the moment, we can have either setup scripts or tests running, but not both.
+                if *setup_scripts_running > 0 {
+                    let s = plural::setup_scripts_str(*setup_scripts_running);
+                    write!(
+                        writer,
+                        ": {} {s} still running",
+                        setup_scripts_running.style(self.styles.count),
+                    )?;
+                } else if *running > 0 {
+                    let tests_str = plural::tests_str(*running);
+                    write!(
+                        writer,
+                        ": {} {tests_str} still running",
+                        running.style(self.styles.count),
+                    )?;
+                }
+                writeln!(writer)?;
+            }
             TestEventKind::RunPaused {
                 setup_scripts_running,
                 running,
diff --git a/nextest-runner/src/reporter/events.rs b/nextest-runner/src/reporter/events.rs
index 89695301c66..9931671cbbc 100644
--- a/nextest-runner/src/reporter/events.rs
+++ b/nextest-runner/src/reporter/events.rs
@@ -257,6 +257,18 @@ pub enum TestEventKind<'a> {
         reason: CancelReason,
     },
 
+    /// A forcible kill was requested due to receiving a signal.
+    RunBeginKill {
+        /// The number of setup scripts still running.
+        setup_scripts_running: usize,
+
+        /// The number of tests still running.
+        running: usize,
+
+        /// The reason this run was killed.
+        reason: CancelReason,
+    },
+
     /// A SIGTSTP event was received and the run was paused.
     RunPaused {
         /// The number of setup scripts running.
@@ -777,6 +789,9 @@ pub enum CancelReason {
 
     /// An interrupt (on Unix, Ctrl-C) was received.
     Interrupt,
+
+    /// A second signal was received, and the run is being forcibly killed.
+    SecondSignal,
 }
 
 impl CancelReason {
@@ -787,6 +802,7 @@ impl CancelReason {
             CancelReason::ReportError => "reporting error",
             CancelReason::Signal => "signal",
             CancelReason::Interrupt => "interrupt",
+            CancelReason::SecondSignal => "second signal",
         }
     }
 }
diff --git a/nextest-runner/src/runner/dispatcher.rs b/nextest-runner/src/runner/dispatcher.rs
index 674c92a19ca..98dba26e6d5 100644
--- a/nextest-runner/src/runner/dispatcher.rs
+++ b/nextest-runner/src/runner/dispatcher.rs
@@ -635,14 +635,7 @@ where
             SignalEvent::Shutdown(event) => {
                 let signal_count = self.increment_signal_count();
                 let req = signal_count.to_request(event);
-
-                let cancel_reason = match event {
-                    #[cfg(unix)]
-                    ShutdownEvent::Hangup | ShutdownEvent::Term | ShutdownEvent::Quit => {
-                        CancelReason::Signal
-                    }
-                    ShutdownEvent::Interrupt => CancelReason::Interrupt,
-                };
+                let cancel_reason = event_to_cancel_reason(event);
 
                 self.begin_cancel(cancel_reason, CancelEvent::Signal(req))
             }
@@ -728,8 +721,19 @@ where
     ///
     /// Returns the corresponding `HandleEventResponse`.
     fn begin_cancel(&mut self, reason: CancelReason, event: CancelEvent) -> HandleEventResponse {
-        // TODO: combine reason and event?
-        if self.cancel_state < Some(reason) {
+        // TODO: combine reason and event? The Twice block ignoring the event
+        // seems to indicate a data modeling issue.
+        if event == CancelEvent::Signal(ShutdownRequest::Twice) {
+            // Forcibly kill child processes in the case of a second shutdown
+            // signal.
+            self.basic_callback(TestEventKind::RunBeginKill {
+                setup_scripts_running: self.setup_scripts_running(),
+                running: self.running(),
+                // This is always a second signal.
+                reason: CancelReason::SecondSignal,
+            });
+            HandleEventResponse::Cancel(event)
+        } else if self.cancel_state < Some(reason) {
             self.cancel_state = Some(reason);
             self.basic_callback(TestEventKind::RunBeginCancel {
                 setup_scripts_running: self.setup_scripts_running(),
@@ -757,6 +761,14 @@ where
     }
 }
 
+fn event_to_cancel_reason(event: ShutdownEvent) -> CancelReason {
+    match event {
+        #[cfg(unix)]
+        ShutdownEvent::Hangup | ShutdownEvent::Term | ShutdownEvent::Quit => CancelReason::Signal,
+        ShutdownEvent::Interrupt => CancelReason::Interrupt,
+    }
+}
+
 #[derive(Clone, Debug)]
 struct ContextSetupScript<'a> {
     id: ScriptId,
@@ -877,123 +889,104 @@ mod tests {
             "expected report"
         );
         {
-            let events = events.lock().unwrap();
+            let mut events = events.lock().unwrap();
             assert_eq!(events.len(), 1, "expected 1 event");
+            let event = events.pop().unwrap();
             let TestEventKind::RunBeginCancel {
                 setup_scripts_running,
                 running,
                 reason,
-            } = &events[0].kind
+            } = event.kind
             else {
-                panic!("expected RunBeginCancel event, found {:?}", events[0].kind);
+                panic!("expected RunBeginCancel event, found {:?}", event.kind);
             };
-            assert_eq!(
-                *setup_scripts_running, 0,
-                "expected 0 setup scripts running"
-            );
-            assert_eq!(*running, 0, "expected 0 tests running");
-            assert_eq!(*reason, CancelReason::ReportError, "expected report error");
+            assert_eq!(setup_scripts_running, 0, "expected 0 setup scripts running");
+            assert_eq!(running, 0, "expected 0 tests running");
+            assert_eq!(reason, CancelReason::ReportError, "expected report error");
         }
 
         // Send another report error, ensuring it's ignored.
         let response = cx.handle_event(InternalEvent::ReportCancel);
-        assert_noop(response, &events, 1);
-
-        // Cancel with a signal.
-        #[cfg(unix)]
-        let signal_event_count = {
-            let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown(
-                ShutdownEvent::Hangup,
-            )));
-            assert_eq!(
-                response,
-                HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Once(
-                    ShutdownEvent::Hangup
-                ))),
-                "expected cancellation"
-            );
-            {
-                let events = events.lock().unwrap();
-                assert_eq!(events.len(), 2, "expected 2 events");
-                let TestEventKind::RunBeginCancel {
-                    setup_scripts_running,
-                    running,
-                    reason,
-                } = &events[1].kind
-                else {
-                    panic!("expected RunBeginCancel event, found {:?}", events[1].kind);
-                };
+        assert_noop(response, &events);
+
+        // The rules:
+        // * Any one signal will cause that signal.
+        // * Any two signals received will cause a SIGKILL.
+        // * After a signal is received, any less-important cancel-worthy events
+        //   are ignored.
+        //
+        // Interestingly, this state machine appears to function on Windows too
+        // (though of course the only variant is an Interrupt so this only runs
+        // one iteration.) Should it be different? No compelling reason to be
+        // yet.
+        for sig1 in ShutdownEvent::ALL_VARIANTS {
+            for sig2 in ShutdownEvent::ALL_VARIANTS {
+                eprintln!("** testing {:?} -> {:?}", sig1, sig2);
+                // Separate test for each signal to avoid mixing up state.
+                let mut cx = cx.clone();
+
+                // First signal.
+                let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown(*sig1)));
                 assert_eq!(
-                    *setup_scripts_running, 0,
-                    "expected 0 setup scripts running"
+                    response,
+                    HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Once(*sig1))),
+                    "expected Once"
                 );
-                assert_eq!(*running, 0, "expected 0 tests running");
-                assert_eq!(*reason, CancelReason::Signal, "expected signal");
-            }
-
-            // Send another signal, ensuring it's ignored.
-            let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown(
-                ShutdownEvent::Hangup,
-            )));
-            assert_noop(response, &events, 2);
-
-            // Send a report error, ensuring it's ignored.
-            let response = cx.handle_event(InternalEvent::ReportCancel);
-            assert_noop(response, &events, 2);
+                {
+                    let mut events = events.lock().unwrap();
+                    assert_eq!(events.len(), 1, "expected 1 event");
+                    let event = events.pop().unwrap();
+                    let TestEventKind::RunBeginCancel {
+                        setup_scripts_running,
+                        running,
+                        reason,
+                    } = event.kind
+                    else {
+                        panic!("expected RunBeginCancel event, found {:?}", event.kind);
+                    };
+                    assert_eq!(setup_scripts_running, 0, "expected 0 setup scripts running");
+                    assert_eq!(running, 0, "expected 0 tests running");
+                    assert_eq!(reason, event_to_cancel_reason(*sig1), "expected signal");
+                }
 
-            1
-        };
-        #[cfg(not(unix))]
-        let signal_event_count = 0;
+                // Another report error, ensuring it's ignored.
+                let response = cx.handle_event(InternalEvent::ReportCancel);
+                assert_noop(response, &events);
 
-        // Cancel with an interrupt.
-        let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown(
-            ShutdownEvent::Interrupt,
-        )));
-        assert_eq!(
-            response,
-            if signal_event_count == 0 {
-                // On Windows, this is the first signal.
-                HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Once(
-                    ShutdownEvent::Interrupt,
-                )))
-            } else {
-                // On Unix, this is the second signal.
-                HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Twice))
-            },
-            "expected cancellation"
-        );
-        {
-            let events = events.lock().unwrap();
-            assert_eq!(events.len(), 2 + signal_event_count, "expected event count");
-            let TestEventKind::RunBeginCancel {
-                setup_scripts_running,
-                running,
-                reason,
-            } = &events[1 + signal_event_count].kind
-            else {
-                panic!(
-                    "expected RunBeginCancel event, found {:?}",
-                    events[1 + signal_event_count].kind
+                // Second signal.
+                let response = cx.handle_event(InternalEvent::Signal(SignalEvent::Shutdown(*sig2)));
+                assert_eq!(
+                    response,
+                    HandleEventResponse::Cancel(CancelEvent::Signal(ShutdownRequest::Twice)),
+                    "expected kill"
                 );
-            };
-            assert_eq!(
-                *setup_scripts_running, 0,
-                "expected 0 setup scripts running"
-            );
-            assert_eq!(*running, 0, "expected 0 tests running");
-            assert_eq!(*reason, CancelReason::Interrupt, "expected interrupt");
+                {
+                    let mut events = events.lock().unwrap();
+                    assert_eq!(events.len(), 1, "expected 1 events");
+                    let event = events.pop().unwrap();
+                    let TestEventKind::RunBeginKill {
+                        setup_scripts_running,
+                        running,
+                        reason,
+                    } = event.kind
+                    else {
+                        panic!("expected RunBeginKill event, found {:?}", event.kind);
+                    };
+                    assert_eq!(setup_scripts_running, 0, "expected 0 setup scripts running");
+                    assert_eq!(running, 0, "expected 0 tests running");
+                    assert_eq!(reason, CancelReason::SecondSignal, "expected second signal");
+                }
+
+                // Another report error, ensuring it's ignored.
+                let response = cx.handle_event(InternalEvent::ReportCancel);
+                assert_noop(response, &events);
+            }
         }
     }
 
     #[track_caller]
-    fn assert_noop(
-        response: HandleEventResponse,
-        events: &Mutex<Vec<TestEvent<'_>>>,
-        event_count: usize,
-    ) {
+    fn assert_noop(response: HandleEventResponse, events: &Mutex<Vec<TestEvent<'_>>>) {
         assert_eq!(response, HandleEventResponse::None, "expected no response");
-        let events = events.lock().unwrap();
-        assert_eq!(events.len(), event_count, "expected no new events");
+        assert_eq!(events.lock().unwrap().len(), 0, "expected no new events");
     }
 }
diff --git a/nextest-runner/src/signal.rs b/nextest-runner/src/signal.rs
index 003bb19bea3..7a9fadd3897 100644
--- a/nextest-runner/src/signal.rs
+++ b/nextest-runner/src/signal.rs
@@ -238,6 +238,19 @@ pub(crate) enum ShutdownEvent {
     Interrupt,
 }
 
+impl ShutdownEvent {
+    #[cfg(test)]
+    pub(crate) const ALL_VARIANTS: &'static [Self] = &[
+        #[cfg(unix)]
+        Self::Hangup,
+        #[cfg(unix)]
+        Self::Term,
+        #[cfg(unix)]
+        Self::Quit,
+        Self::Interrupt,
+    ];
+}
+
 // A signal event to query information about tests.
 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
 pub(crate) enum SignalInfoEvent {

From aee7e0630ea4330c44cdc0939462528d848352db Mon Sep 17 00:00:00 2001
From: Rain <rain@sunshowers.io>
Date: Wed, 11 Dec 2024 02:51:14 +0000
Subject: [PATCH 4/4] [nextest-runner] spawn a task for each test

Previously we'd use the same task for all tests -- that turns out to be fine in
most cases, but doesn't quite provide per-test isolation, and is slower in
repos like clap that have lots of small tests. Switch to using the same task
instead.

It's cool that async-scoped kind of works here, but non-static tasks are such
a pain to work with in general.

This makes the clap tests significantly faster. On `fc55ad08`, runtime on my
desktop (Ryzen 7950X/Linux) goes from 0.36 seconds to 0.24.
---
 nextest-runner/src/list/test_list.rs         |   8 +-
 nextest-runner/src/runner/dispatcher.rs      |   8 +-
 nextest-runner/src/runner/imp.rs             | 117 +++++++++++++++----
 nextest-runner/src/runner/internal_events.rs |  23 +++-
 4 files changed, 129 insertions(+), 27 deletions(-)

diff --git a/nextest-runner/src/list/test_list.rs b/nextest-runner/src/list/test_list.rs
index 43636de11c5..edf3dd9918e 100644
--- a/nextest-runner/src/list/test_list.rs
+++ b/nextest-runner/src/list/test_list.rs
@@ -31,7 +31,7 @@ use owo_colors::OwoColorize;
 use std::{
     collections::{BTreeMap, BTreeSet},
     ffi::{OsStr, OsString},
-    io,
+    fmt, io,
     path::PathBuf,
     sync::{Arc, OnceLock},
 };
@@ -1070,6 +1070,12 @@ pub struct TestInstanceId<'a> {
     pub test_name: &'a str,
 }
 
+impl fmt::Display for TestInstanceId<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{} {}", self.binary_id, self.test_name)
+    }
+}
+
 /// Context required for test execution.
 #[derive(Clone, Debug)]
 pub struct TestExecuteContext<'a> {
diff --git a/nextest-runner/src/runner/dispatcher.rs b/nextest-runner/src/runner/dispatcher.rs
index 98dba26e6d5..23eb90a181d 100644
--- a/nextest-runner/src/runner/dispatcher.rs
+++ b/nextest-runner/src/runner/dispatcher.rs
@@ -7,7 +7,7 @@
 //! receives events from the executor and from other inputs (e.g. signal and
 //! input handling), and sends events to the reporter.
 
-use super::{RunUnitRequest, ShutdownRequest};
+use super::{RunUnitRequest, RunnerTaskState, ShutdownRequest};
 use crate::{
     config::{MaxFail, ScriptConfig, ScriptId},
     input::{InputEvent, InputHandler},
@@ -104,7 +104,7 @@ where
         report_cancel_rx: oneshot::Receiver<()>,
         cancelled_ref: &AtomicBool,
         cancellation_sender: broadcast::Sender<()>,
-    ) {
+    ) -> RunnerTaskState {
         let mut report_cancel_rx = std::pin::pin!(report_cancel_rx);
 
         let mut signals_done = false;
@@ -118,7 +118,7 @@ where
                         Some(event) => InternalEvent::Executor(event),
                         None => {
                             // All runs have been completed.
-                            break;
+                            break RunnerTaskState::finished_no_children();
                         }
                     }
                 },
@@ -705,6 +705,8 @@ where
                     if self.disable_signal_3_times_panic {
                         SignalCount::Twice
                     } else {
+                        // TODO: a panic here won't currently lead to other
+                        // tasks being cancelled. This should be fixed.
                         panic!("Signaled 3 times, exiting immediately");
                     }
                 }
diff --git a/nextest-runner/src/runner/imp.rs b/nextest-runner/src/runner/imp.rs
index 4dd59558dd6..93f263b9329 100644
--- a/nextest-runner/src/runner/imp.rs
+++ b/nextest-runner/src/runner/imp.rs
@@ -1,7 +1,7 @@
 // Copyright (c) The nextest Contributors
 // SPDX-License-Identifier: MIT OR Apache-2.0
 
-use super::{DispatcherContext, ExecutorContext};
+use super::{DispatcherContext, ExecutorContext, RunnerTaskState};
 use crate::{
     config::{
         EvaluatableProfile, MaxFail, RetryPolicy, SetupScriptExecuteData, TestGroup, TestThreads,
@@ -30,7 +30,7 @@ use tokio::{
     sync::{broadcast, mpsc::unbounded_channel, oneshot},
     task::JoinError,
 };
-use tracing::debug;
+use tracing::{debug, warn};
 
 /// Test runner options.
 #[derive(Debug, Default)]
@@ -298,11 +298,13 @@ impl<'a> TestRunnerInner<'a> {
                 cancelled_ref,
                 cancellation_sender.clone(),
             );
-            scope.spawn_cancellable(dispatcher_fut, || ());
+            scope.spawn_cancellable(dispatcher_fut, || RunnerTaskState::Cancelled);
 
             let (script_tx, mut script_rx) = unbounded_channel::<SetupScriptExecuteData<'a>>();
             let script_resp_tx = resp_tx.clone();
             let run_scripts_fut = async move {
+                // Since script tasks are run serially, we just reuse the one
+                // script task.
                 let script_data = executor_cx_ref
                     .run_setup_scripts(script_resp_tx, cancelled_ref)
                     .await;
@@ -310,8 +312,9 @@ impl<'a> TestRunnerInner<'a> {
                     // The dispatcher has shut down, so we should too.
                     debug!("script_tx.send failed, shutting down");
                 }
+                RunnerTaskState::finished_no_children()
             };
-            scope.spawn_cancellable(run_scripts_fut, || ());
+            scope.spawn_cancellable(run_scripts_fut, || RunnerTaskState::Cancelled);
 
             let Some(script_data) = script_rx.blocking_recv() else {
                 // Most likely the harness is shutting down, so we should too.
@@ -337,33 +340,107 @@ impl<'a> TestRunnerInner<'a> {
                         TestGroup::Global => None,
                         TestGroup::Custom(name) => Some(name.clone()),
                     };
-
-                    let fut = executor_cx_ref.run_test_instance(
-                        test_instance,
-                        settings,
-                        resp_tx.clone(),
-                        cancelled_ref,
-                        cancellation_sender.subscribe(),
-                        setup_script_data.clone(),
-                    );
+                    let cancel_rx = cancellation_sender.subscribe();
+                    let resp_tx = resp_tx.clone();
+                    let setup_script_data = setup_script_data.clone();
+
+                    // Use a separate Tokio task for each test. For repos with
+                    // lots of small tests, this has been observed to be much
+                    // faster than using a single task for all tests (what we
+                    // used to do). It also provides some degree of per-test
+                    // isolation.
+                    let fut = async move {
+                        // SAFETY: Within an outer scope_and_block (which we
+                        // have here), scope_and_collect is safe as long as the
+                        // returned future isn't forgotten. We're not forgetting
+                        // it below -- we're running it to completion
+                        // immediately.
+                        //
+                        // But recursive scoped calls really feel like pushing
+                        // against the limits of async-scoped. For example,
+                        // there's no way built into async-scoped to propagate a
+                        // cancellation signal from the outer scope to the inner
+                        // scope. (But there could be, right? That seems
+                        // solvable via channels. And we could likely do our own
+                        // channels here.)
+                        let ((), mut ret) = unsafe {
+                            TokioScope::scope_and_collect(move |scope| {
+                                scope.spawn(executor_cx_ref.run_test_instance(
+                                    test_instance,
+                                    settings,
+                                    resp_tx.clone(),
+                                    cancelled_ref,
+                                    cancel_rx,
+                                    setup_script_data,
+                                ))
+                            })
+                        }
+                        .await;
+
+                        // If no future was started, that's really strange.
+                        // Worth at least logging.
+                        let Some(result) = ret.pop() else {
+                            warn!(
+                                "no task was started for test instance: {}",
+                                test_instance.id()
+                            );
+                            return None;
+                        };
+                        match result {
+                            Ok(()) => None,
+                            Err(join_error) => Some(join_error),
+                        }
+                    };
 
                     (threads_required, test_group, fut)
                 })
-                // future_queue_grouped means tests are spawned in order but returned in
-                // any order.
+                // future_queue_grouped means tests are spawned in the order
+                // defined, but returned in any order.
                 .future_queue_grouped(self.test_threads, groups)
-                .collect::<()>();
-
-            scope.spawn_cancellable(run_tests_fut, || ());
+                // Drop the None values.
+                .filter_map(std::future::ready)
+                .collect::<Vec<_>>()
+                // Interestingly, using a more idiomatic `async move {
+                // run_tests_fut.await ... }` block causes Rust 1.83 to complain
+                // about a weird lifetime mismatch. FutureExt::map as used below
+                // does not.
+                .map(|child_join_errors| RunnerTaskState::Finished { child_join_errors });
+
+            scope.spawn_cancellable(run_tests_fut, || RunnerTaskState::Cancelled);
         });
 
         dispatcher_cx.run_finished();
 
-        // Were there any join errors?
+        // Were there any join errors in tasks?
+        //
+        // If one of the tasks panics, we likely end up stuck because the
+        // dispatcher, which is spawned in the same async-scoped block, doesn't
+        // get relayed the panic immediately. That should probably be fixed at
+        // some point.
+        let mut cancelled_count = 0;
         let join_errors = results
             .into_iter()
-            .filter_map(|r| r.err())
+            .flat_map(|r| {
+                match r {
+                    Ok(RunnerTaskState::Finished { child_join_errors }) => child_join_errors,
+                    // Largely ignore cancelled tasks since it most likely means
+                    // shutdown -- we don't cancel tasks manually.
+                    Ok(RunnerTaskState::Cancelled) => {
+                        cancelled_count += 1;
+                        Vec::new()
+                    }
+                    Err(join_error) => vec![join_error],
+                }
+            })
             .collect::<Vec<_>>();
+
+        if cancelled_count > 0 {
+            debug!(
+                "{} tasks were cancelled -- this \
+                 generally should only happen due to panics",
+                cancelled_count
+            );
+        }
         if !join_errors.is_empty() {
             return Err(join_errors);
         }
diff --git a/nextest-runner/src/runner/internal_events.rs b/nextest-runner/src/runner/internal_events.rs
index 35d211dcb12..9a994f224e7 100644
--- a/nextest-runner/src/runner/internal_events.rs
+++ b/nextest-runner/src/runner/internal_events.rs
@@ -24,9 +24,12 @@ use crate::{
 };
 use nextest_metadata::MismatchReason;
 use std::time::Duration;
-use tokio::sync::{
-    mpsc::{UnboundedReceiver, UnboundedSender},
-    oneshot,
+use tokio::{
+    sync::{
+        mpsc::{UnboundedReceiver, UnboundedSender},
+        oneshot,
+    },
+    task::JoinError,
 };
 
 /// An internal event.
@@ -230,3 +233,17 @@ pub(super) enum InternalTerminateReason {
     Timeout,
     Signal(ShutdownRequest),
 }
+
+pub(super) enum RunnerTaskState {
+    Finished { child_join_errors: Vec<JoinError> },
+    Cancelled,
+}
+
+impl RunnerTaskState {
+    /// Mark a runner task as finished and having not run any children.
+    pub(super) fn finished_no_children() -> Self {
+        Self::Finished {
+            child_join_errors: Vec::new(),
+        }
+    }
+}