From 365f63040688d680680540d7fa56b9e6d548c2e2 Mon Sep 17 00:00:00 2001 From: Rain Date: Sat, 14 Dec 2024 21:15:20 +0000 Subject: [PATCH] [nextest-runner] use grace period on receiving Windows Ctrl-C Previously on Windows, we would not do anything at all on receiving Ctrl-C, letting tests exit on their own. With this change, in case of Ctrl-C, nextest will apply the same grace period that it does on Unix. By default, nextest will now wait 10 seconds before calling `TerminateJobObject` on the test. Like on Unix, a double Ctrl-C will kill the test immediately. The behavior in case of timeouts is unchanged -- call `TerminateJobObject` immediately. --- nextest-runner/src/reporter/displayer.rs | 13 +- nextest-runner/src/reporter/events.rs | 10 ++ nextest-runner/src/runner/executor.rs | 1 - nextest-runner/src/runner/windows.rs | 124 ++++++++++++++++-- .../design/architecture/signal-handling.md | 4 + 5 files changed, 137 insertions(+), 15 deletions(-) diff --git a/nextest-runner/src/reporter/displayer.rs b/nextest-runner/src/reporter/displayer.rs index 1252ca5bcf8..43aa5380731 100644 --- a/nextest-runner/src/reporter/displayer.rs +++ b/nextest-runner/src/reporter/displayer.rs @@ -1661,7 +1661,6 @@ impl<'a> TestReporterImpl<'a> { state: &UnitTerminatingState, writer: &mut dyn Write, ) -> io::Result<()> { - #[cfg_attr(not(any(unix, test)), expect(unused_variables))] let UnitTerminatingState { pid, time_taken, @@ -1707,6 +1706,18 @@ impl<'a> TestReporterImpl<'a> { "note".style(self.styles.count), )?; } + #[cfg(windows)] + UnitTerminateMethod::Wait => { + writeln!( + writer, + "{}: waiting for {} to exit on its own; spent {:.3?}s, will terminate \ + job object after another {:.3?}s", + "note".style(self.styles.count), + kind, + waiting_duration.as_secs_f64(), + remaining.as_secs_f64(), + )?; + } #[cfg(test)] UnitTerminateMethod::Fake => { // This is only used in tests. diff --git a/nextest-runner/src/reporter/events.rs b/nextest-runner/src/reporter/events.rs index 9ceaca23740..30b6e1bbf7c 100644 --- a/nextest-runner/src/reporter/events.rs +++ b/nextest-runner/src/reporter/events.rs @@ -1063,6 +1063,16 @@ pub enum UnitTerminateMethod { #[cfg(windows)] JobObject, + /// The unit is being waited on to exit. A termination signal will be sent + /// if it doesn't exit within the grace period. + /// + /// On Windows, this occurs when nextest receives Ctrl-C. In that case, it + /// is assumed that tests will also receive Ctrl-C and exit on their own. If + /// tests do not exit within the grace period configured for them, their + /// corresponding job objects will be terminated. + #[cfg(windows)] + Wait, + /// A fake method used for testing. #[cfg(test)] Fake, diff --git a/nextest-runner/src/runner/executor.rs b/nextest-runner/src/runner/executor.rs index 01f38a4d2dc..0b5ab20413b 100644 --- a/nextest-runner/src/runner/executor.rs +++ b/nextest-runner/src/runner/executor.rs @@ -857,7 +857,6 @@ pub(super) struct UnitContext<'a> { } impl<'a> UnitContext<'a> { - #[cfg_attr(not(unix), expect(dead_code))] pub(super) fn packet(&self) -> &UnitPacket<'a> { &self.packet } diff --git a/nextest-runner/src/runner/windows.rs b/nextest-runner/src/runner/windows.rs index dc9087923e4..4e00140b0c3 100644 --- a/nextest-runner/src/runner/windows.rs +++ b/nextest-runner/src/runner/windows.rs @@ -3,7 +3,12 @@ use crate::{ errors::ConfigureHandleInheritanceError, - runner::{InternalTerminateReason, RunUnitRequest, UnitContext}, + reporter::events::{UnitState, UnitTerminateMethod, UnitTerminateReason, UnitTerminatingState}, + runner::{ + InternalTerminateReason, RunUnitQuery, RunUnitRequest, ShutdownRequest, SignalRequest, + UnitContext, + }, + signal::ShutdownEvent, test_command::ChildAccumulator, time::StopwatchStart, }; @@ -73,21 +78,78 @@ pub(super) fn assign_process_to_job( } #[expect(clippy::too_many_arguments)] -pub(super) async fn terminate_child( - _cx: &UnitContext<'_>, +pub(super) async fn terminate_child<'a>( + cx: &UnitContext<'a>, child: &mut Child, - _child_acc: &mut ChildAccumulator, + child_acc: &mut ChildAccumulator, reason: InternalTerminateReason, - _stopwatch: &mut StopwatchStart, - _req_rx: &mut UnboundedReceiver>, + stopwatch: &mut StopwatchStart, + req_rx: &mut UnboundedReceiver>, job: Option<&Job>, - _grace_period: Duration, + grace_period: Duration, ) { - // Ignore signal events since Windows propagates them to child processes (this may change if - // we start assigning processes to groups on Windows). - if !matches!(reason, InternalTerminateReason::Timeout) { - return; - } + let Some(pid) = child.id() else { return }; + let (term_reason, term_method) = to_terminate_reason_and_method(&reason, grace_period); + + let child_exited = match term_method { + UnitTerminateMethod::Wait => { + let waiting_stopwatch = crate::time::stopwatch(); + + loop { + tokio::select! { + () = child_acc.fill_buf(), if !child_acc.fds.is_done() => {} + _ = child.wait() => { + // The process exited. + break true; + } + recv = req_rx.recv() => { + // The sender stays open longer than the whole loop, and + // the buffer is big enough for all messages ever sent + // through this channel, so a RecvError should never + // happen. + let req = recv.expect("a RecvError should never happen here"); + + match req { + RunUnitRequest::Signal(SignalRequest::Shutdown(_)) => { + // Receiving a shutdown signal while waiting for + // the process to exit always means kill + // immediately -- go to the next step. + break false; + } + RunUnitRequest::Query(RunUnitQuery::GetInfo(sender)) => { + let waiting_snapshot = waiting_stopwatch.snapshot(); + _ = sender.send( + cx.info_response( + UnitState::Terminating(UnitTerminatingState { + pid, + time_taken: stopwatch.snapshot().active, + reason: term_reason, + method: term_method, + waiting_duration: waiting_snapshot.active, + remaining: grace_period + .checked_sub(waiting_snapshot.active) + .unwrap_or_default(), + }), + child_acc.snapshot_in_progress(cx.packet().kind().waiting_on_message()), + ) + ); + } + } + } + } + } + } + UnitTerminateMethod::JobObject => { + // The process is killed by the job object. + false + } + #[cfg(test)] + UnitTerminateMethod::Fake => { + unreachable!("fake method is only used for reporter tests"); + } + }; + + // In any case, always call TerminateJobObject. if let Some(job) = job { let handle = job.handle(); unsafe { @@ -96,6 +158,42 @@ pub(super) async fn terminate_child( _ = TerminateJobObject(handle as _, 1); } } + // Start killing the process directly for good measure. - let _ = child.start_kill(); + if !child_exited { + let _ = child.start_kill(); + } +} + +fn to_terminate_reason_and_method( + reason: &InternalTerminateReason, + grace_period: Duration, +) -> (UnitTerminateReason, UnitTerminateMethod) { + match reason { + InternalTerminateReason::Timeout => ( + UnitTerminateReason::Timeout, + // The grace period is currently ignored for timeouts -- + // TerminateJobObject is immediately called. + UnitTerminateMethod::JobObject, + ), + InternalTerminateReason::Signal(req) => ( + // The only signals we support on Windows are interrupts. + UnitTerminateReason::Interrupt, + shutdown_terminate_method(*req, grace_period), + ), + } +} + +fn shutdown_terminate_method(req: ShutdownRequest, grace_period: Duration) -> UnitTerminateMethod { + if grace_period.is_zero() { + return UnitTerminateMethod::JobObject; + } + + match req { + // In case of interrupt events, wait for the grace period to elapse + // before terminating the job. We're assuming that if nextest got an + // interrupt, child processes did as well. + ShutdownRequest::Once(ShutdownEvent::Interrupt) => UnitTerminateMethod::Wait, + ShutdownRequest::Twice => UnitTerminateMethod::JobObject, + } } diff --git a/site/src/docs/design/architecture/signal-handling.md b/site/src/docs/design/architecture/signal-handling.md index 010d0973682..779bea08677 100644 --- a/site/src/docs/design/architecture/signal-handling.md +++ b/site/src/docs/design/architecture/signal-handling.md @@ -224,6 +224,10 @@ Unlike process groups, job objects form a tree. If something else runs nextest within a job object and then calls `TerminateJobObject`, both nextest and all its child processes are terminated. +When a test times out, nextest calls `TerminateJobObject` on the job object +associated with the test immediately. In the future, it would be interesting +to send a Ctrl-C (or maybe a `WM_CLOSE`?) to the test process first. + [job objects]: https://learn.microsoft.com/en-us/windows/win32/procthread/job-objects [terminate-job-object]: https://docs.microsoft.com/en-us/windows/win32/api/jobapi2/nf-jobapi2-terminatejobobject