From 365f63040688d680680540d7fa56b9e6d548c2e2 Mon Sep 17 00:00:00 2001
From: Rain <rain@sunshowers.io>
Date: Sat, 14 Dec 2024 21:15:20 +0000
Subject: [PATCH] [nextest-runner] use grace period on receiving Windows Ctrl-C

Previously on Windows, we would not do anything at all on receiving Ctrl-C,
letting tests exit on their own.

With this change, in case of Ctrl-C, nextest will apply the same grace period
that it does on Unix. By default, nextest will now wait 10 seconds before
calling `TerminateJobObject` on the test. Like on Unix, a double Ctrl-C will
kill the test immediately.

The behavior in case of timeouts is unchanged -- call `TerminateJobObject`
immediately.
---
 nextest-runner/src/reporter/displayer.rs      |  13 +-
 nextest-runner/src/reporter/events.rs         |  10 ++
 nextest-runner/src/runner/executor.rs         |   1 -
 nextest-runner/src/runner/windows.rs          | 124 ++++++++++++++++--
 .../design/architecture/signal-handling.md    |   4 +
 5 files changed, 137 insertions(+), 15 deletions(-)

diff --git a/nextest-runner/src/reporter/displayer.rs b/nextest-runner/src/reporter/displayer.rs
index 1252ca5bcf8..43aa5380731 100644
--- a/nextest-runner/src/reporter/displayer.rs
+++ b/nextest-runner/src/reporter/displayer.rs
@@ -1661,7 +1661,6 @@ impl<'a> TestReporterImpl<'a> {
         state: &UnitTerminatingState,
         writer: &mut dyn Write,
     ) -> io::Result<()> {
-        #[cfg_attr(not(any(unix, test)), expect(unused_variables))]
         let UnitTerminatingState {
             pid,
             time_taken,
@@ -1707,6 +1706,18 @@ impl<'a> TestReporterImpl<'a> {
                     "note".style(self.styles.count),
                 )?;
             }
+            #[cfg(windows)]
+            UnitTerminateMethod::Wait => {
+                writeln!(
+                    writer,
+                    "{}:   waiting for {} to exit on its own; spent {:.3?}s, will terminate \
+                     job object after another {:.3?}s",
+                    "note".style(self.styles.count),
+                    kind,
+                    waiting_duration.as_secs_f64(),
+                    remaining.as_secs_f64(),
+                )?;
+            }
             #[cfg(test)]
             UnitTerminateMethod::Fake => {
                 // This is only used in tests.
diff --git a/nextest-runner/src/reporter/events.rs b/nextest-runner/src/reporter/events.rs
index 9ceaca23740..30b6e1bbf7c 100644
--- a/nextest-runner/src/reporter/events.rs
+++ b/nextest-runner/src/reporter/events.rs
@@ -1063,6 +1063,16 @@ pub enum UnitTerminateMethod {
     #[cfg(windows)]
     JobObject,
 
+    /// The unit is being waited on to exit. A termination signal will be sent
+    /// if it doesn't exit within the grace period.
+    ///
+    /// On Windows, this occurs when nextest receives Ctrl-C. In that case, it
+    /// is assumed that tests will also receive Ctrl-C and exit on their own. If
+    /// tests do not exit within the grace period configured for them, their
+    /// corresponding job objects will be terminated.
+    #[cfg(windows)]
+    Wait,
+
     /// A fake method used for testing.
     #[cfg(test)]
     Fake,
diff --git a/nextest-runner/src/runner/executor.rs b/nextest-runner/src/runner/executor.rs
index 01f38a4d2dc..0b5ab20413b 100644
--- a/nextest-runner/src/runner/executor.rs
+++ b/nextest-runner/src/runner/executor.rs
@@ -857,7 +857,6 @@ pub(super) struct UnitContext<'a> {
 }
 
 impl<'a> UnitContext<'a> {
-    #[cfg_attr(not(unix), expect(dead_code))]
     pub(super) fn packet(&self) -> &UnitPacket<'a> {
         &self.packet
     }
diff --git a/nextest-runner/src/runner/windows.rs b/nextest-runner/src/runner/windows.rs
index dc9087923e4..4e00140b0c3 100644
--- a/nextest-runner/src/runner/windows.rs
+++ b/nextest-runner/src/runner/windows.rs
@@ -3,7 +3,12 @@
 
 use crate::{
     errors::ConfigureHandleInheritanceError,
-    runner::{InternalTerminateReason, RunUnitRequest, UnitContext},
+    reporter::events::{UnitState, UnitTerminateMethod, UnitTerminateReason, UnitTerminatingState},
+    runner::{
+        InternalTerminateReason, RunUnitQuery, RunUnitRequest, ShutdownRequest, SignalRequest,
+        UnitContext,
+    },
+    signal::ShutdownEvent,
     test_command::ChildAccumulator,
     time::StopwatchStart,
 };
@@ -73,21 +78,78 @@ pub(super) fn assign_process_to_job(
 }
 
 #[expect(clippy::too_many_arguments)]
-pub(super) async fn terminate_child(
-    _cx: &UnitContext<'_>,
+pub(super) async fn terminate_child<'a>(
+    cx: &UnitContext<'a>,
     child: &mut Child,
-    _child_acc: &mut ChildAccumulator,
+    child_acc: &mut ChildAccumulator,
     reason: InternalTerminateReason,
-    _stopwatch: &mut StopwatchStart,
-    _req_rx: &mut UnboundedReceiver<RunUnitRequest<'_>>,
+    stopwatch: &mut StopwatchStart,
+    req_rx: &mut UnboundedReceiver<RunUnitRequest<'a>>,
     job: Option<&Job>,
-    _grace_period: Duration,
+    grace_period: Duration,
 ) {
-    // Ignore signal events since Windows propagates them to child processes (this may change if
-    // we start assigning processes to groups on Windows).
-    if !matches!(reason, InternalTerminateReason::Timeout) {
-        return;
-    }
+    let Some(pid) = child.id() else { return };
+    let (term_reason, term_method) = to_terminate_reason_and_method(&reason, grace_period);
+
+    let child_exited = match term_method {
+        UnitTerminateMethod::Wait => {
+            let waiting_stopwatch = crate::time::stopwatch();
+
+            loop {
+                tokio::select! {
+                    () = child_acc.fill_buf(), if !child_acc.fds.is_done() => {}
+                    _ = child.wait() => {
+                        // The process exited.
+                        break true;
+                    }
+                    recv = req_rx.recv() => {
+                        // The sender stays open longer than the whole loop, and
+                        // the buffer is big enough for all messages ever sent
+                        // through this channel, so a RecvError should never
+                        // happen.
+                        let req = recv.expect("a RecvError should never happen here");
+
+                        match req {
+                            RunUnitRequest::Signal(SignalRequest::Shutdown(_)) => {
+                                // Receiving a shutdown signal while waiting for
+                                // the process to exit always means kill
+                                // immediately -- go to the next step.
+                                break false;
+                            }
+                            RunUnitRequest::Query(RunUnitQuery::GetInfo(sender)) => {
+                                let waiting_snapshot = waiting_stopwatch.snapshot();
+                                _ = sender.send(
+                                    cx.info_response(
+                                        UnitState::Terminating(UnitTerminatingState {
+                                            pid,
+                                            time_taken: stopwatch.snapshot().active,
+                                            reason: term_reason,
+                                            method: term_method,
+                                            waiting_duration: waiting_snapshot.active,
+                                            remaining: grace_period
+                                                .checked_sub(waiting_snapshot.active)
+                                                .unwrap_or_default(),
+                                        }),
+                                        child_acc.snapshot_in_progress(cx.packet().kind().waiting_on_message()),
+                                    )
+                                );
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        UnitTerminateMethod::JobObject => {
+            // The process is killed by the job object.
+            false
+        }
+        #[cfg(test)]
+        UnitTerminateMethod::Fake => {
+            unreachable!("fake method is only used for reporter tests");
+        }
+    };
+
+    // In any case, always call TerminateJobObject.
     if let Some(job) = job {
         let handle = job.handle();
         unsafe {
@@ -96,6 +158,42 @@ pub(super) async fn terminate_child(
             _ = TerminateJobObject(handle as _, 1);
         }
     }
+
     // Start killing the process directly for good measure.
-    let _ = child.start_kill();
+    if !child_exited {
+        let _ = child.start_kill();
+    }
+}
+
+fn to_terminate_reason_and_method(
+    reason: &InternalTerminateReason,
+    grace_period: Duration,
+) -> (UnitTerminateReason, UnitTerminateMethod) {
+    match reason {
+        InternalTerminateReason::Timeout => (
+            UnitTerminateReason::Timeout,
+            // The grace period is currently ignored for timeouts --
+            // TerminateJobObject is immediately called.
+            UnitTerminateMethod::JobObject,
+        ),
+        InternalTerminateReason::Signal(req) => (
+            // The only signals we support on Windows are interrupts.
+            UnitTerminateReason::Interrupt,
+            shutdown_terminate_method(*req, grace_period),
+        ),
+    }
+}
+
+fn shutdown_terminate_method(req: ShutdownRequest, grace_period: Duration) -> UnitTerminateMethod {
+    if grace_period.is_zero() {
+        return UnitTerminateMethod::JobObject;
+    }
+
+    match req {
+        // In case of interrupt events, wait for the grace period to elapse
+        // before terminating the job. We're assuming that if nextest got an
+        // interrupt, child processes did as well.
+        ShutdownRequest::Once(ShutdownEvent::Interrupt) => UnitTerminateMethod::Wait,
+        ShutdownRequest::Twice => UnitTerminateMethod::JobObject,
+    }
 }
diff --git a/site/src/docs/design/architecture/signal-handling.md b/site/src/docs/design/architecture/signal-handling.md
index 010d0973682..779bea08677 100644
--- a/site/src/docs/design/architecture/signal-handling.md
+++ b/site/src/docs/design/architecture/signal-handling.md
@@ -224,6 +224,10 @@ Unlike process groups, job objects form a tree. If something else runs nextest
 within a job object and then calls `TerminateJobObject`, both nextest and all
 its child processes are terminated.
 
+When a test times out, nextest calls `TerminateJobObject` on the job object
+associated with the test immediately. In the future, it would be interesting
+to send a Ctrl-C (or maybe a `WM_CLOSE`?) to the test process first.
+
 [job objects]: https://learn.microsoft.com/en-us/windows/win32/procthread/job-objects
 [terminate-job-object]: https://docs.microsoft.com/en-us/windows/win32/api/jobapi2/nf-jobapi2-terminatejobobject