From aeb2067e8120c1ff480625c00b9571db8d01d5a4 Mon Sep 17 00:00:00 2001 From: Jim Blandy Date: Tue, 16 Jul 2024 20:43:33 -0700 Subject: [PATCH] [core] Make `poll(Wait)` not hang after bad command submission. Add `wgpu_core::device::Device::last_successful_submission_index`, which records the fence value that `Maintain::Wait` should actually wait for. See comments for details. Fixes #5969. --- tests/tests/poll.rs | 34 ++++++++++++++++ wgpu-core/src/device/queue.rs | 7 +++- wgpu-core/src/device/resource.rs | 67 ++++++++++++++++++++++---------- 3 files changed, 87 insertions(+), 21 deletions(-) diff --git a/tests/tests/poll.rs b/tests/tests/poll.rs index 740618f23c..6b86436f7a 100644 --- a/tests/tests/poll.rs +++ b/tests/tests/poll.rs @@ -125,3 +125,37 @@ static WAIT_OUT_OF_ORDER: GpuTestConfiguration = .await .panic_on_timeout(); }); + +/// Submit a command buffer to the wrong device. A wait poll shouldn't hang. +/// +/// We can't catch panics on Wasm, since they get reported directly to the +/// console. +#[gpu_test] +static WAIT_AFTER_BAD_SUBMISSION: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters(wgpu_test::TestParameters::default().skip(wgpu_test::FailureCase::webgl2())) + .run_async(wait_after_bad_submission); + +async fn wait_after_bad_submission(ctx: TestingContext) { + let (device2, queue2) = + wgpu_test::initialize_device(&ctx.adapter, ctx.device_features, ctx.device_limits.clone()) + .await; + + let command_buffer1 = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor::default()) + .finish(); + + // This should panic, since the command buffer belongs to the wrong + // device, and queue submission errors seem to be fatal errors? + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + queue2.submit([command_buffer1]); + })); + assert!(result.is_err()); + + // This should not hang. + // + // Specifically, the failed submission should not cause a new fence value to + // be allocated that will not be signalled until further work is + // successfully submitted, causing a greater fence value to be signalled. + device2.poll(wgpu::Maintain::Wait); +} diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 291fb6456f..1c7b787428 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -1069,7 +1069,7 @@ impl Global { let fence = fence_guard.as_mut().unwrap(); let submit_index = device .active_submission_index - .fetch_add(1, Ordering::Relaxed) + .fetch_add(1, Ordering::SeqCst) + 1; let mut active_executions = Vec::new(); @@ -1392,6 +1392,11 @@ impl Global { ) .map_err(DeviceError::from)?; } + + // Advance the successful submission index. + device + .last_successful_submission_index + .fetch_max(submit_index, Ordering::SeqCst); } profiling::scope!("cleanup"); diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs index 7030f3c6fe..195c6c7e6a 100644 --- a/wgpu-core/src/device/resource.rs +++ b/wgpu-core/src/device/resource.rs @@ -88,7 +88,27 @@ pub struct Device { label: String, pub(crate) command_allocator: command::CommandAllocator, + + /// The index of the last command submission that was attempted. + /// + /// Note that `fence` may never be signalled with this value, if the command + /// submission failed. If you need to wait for everything running on a + /// `Queue` to complete, wait for [`last_successful_submission_index`]. + /// + /// [`last_successful_submission_index`]: Device::last_successful_submission_index pub(crate) active_submission_index: hal::AtomicFenceValue, + + /// The index of the last successful submission to this device's + /// [`hal::Queue`]. + /// + /// Unlike [`active_submission_index`], which is incremented each time + /// submission is attempted, this is updated only when submission succeeds, + /// so waiting for this value won't hang waiting for work that was never + /// submitted. + /// + /// [`active_submission_index`]: Device::active_submission_index + pub(crate) last_successful_submission_index: hal::AtomicFenceValue, + // NOTE: if both are needed, the `snatchable_lock` must be consistently acquired before the // `fence` lock to avoid deadlocks. pub(crate) fence: RwLock>, @@ -257,6 +277,7 @@ impl Device { label: desc.label.to_string(), command_allocator, active_submission_index: AtomicU64::new(0), + last_successful_submission_index: AtomicU64::new(0), fence: RwLock::new(rank::DEVICE_FENCE, Some(fence)), snatchable_lock: unsafe { SnatchLock::new(rank::DEVICE_SNATCHABLE_LOCK) }, valid: AtomicBool::new(true), @@ -387,37 +408,41 @@ impl Device { profiling::scope!("Device::maintain"); let fence = fence_guard.as_ref().unwrap(); - let last_done_index = if maintain.is_wait() { - let index_to_wait_for = match maintain { - wgt::Maintain::WaitForSubmissionIndex(submission_index) => { - // We don't need to check to see if the queue id matches - // as we already checked this from inside the poll call. - submission_index.index - } - _ => self.active_submission_index.load(Ordering::Relaxed), - }; - unsafe { + + // Determine which submission index `maintain` represents. + let submission_index = match maintain { + wgt::Maintain::WaitForSubmissionIndex(submission_index) => { + // We don't need to check to see if the queue id matches + // as we already checked this from inside the poll call. + submission_index.index + } + wgt::Maintain::Wait => self + .last_successful_submission_index + .load(Ordering::Acquire), + wgt::Maintain::Poll => unsafe { self.raw .as_ref() .unwrap() - .wait(fence, index_to_wait_for, CLEANUP_WAIT_MS) + .get_fence_value(fence) .map_err(DeviceError::from)? - }; - index_to_wait_for - } else { + }, + }; + + // If necessary, wait for that submission to complete. + if maintain.is_wait() { unsafe { self.raw .as_ref() .unwrap() - .get_fence_value(fence) + .wait(fence, submission_index, CLEANUP_WAIT_MS) .map_err(DeviceError::from)? - } - }; - log::info!("Device::maintain: last done index {last_done_index}"); + }; + } + log::info!("Device::maintain: waiting for submission index {submission_index}"); let mut life_tracker = self.lock_life(); let submission_closures = - life_tracker.triage_submissions(last_done_index, &self.command_allocator); + life_tracker.triage_submissions(submission_index, &self.command_allocator); life_tracker.triage_mapped(); @@ -3586,7 +3611,9 @@ impl Device { /// Wait for idle and remove resources that we can, before we die. pub(crate) fn prepare_to_die(&self) { self.pending_writes.lock().as_mut().unwrap().deactivate(); - let current_index = self.active_submission_index.load(Ordering::Relaxed); + let current_index = self + .last_successful_submission_index + .load(Ordering::Acquire); if let Err(error) = unsafe { let fence = self.fence.read(); let fence = fence.as_ref().unwrap();