[core] Make poll(Wait) not hang after bad command submission.
Add `wgpu_core::device::Device::last_successful_submission_index`,
which records the fence value that `Maintain::Wait` should actually
wait for. See comments for details.

Fixes #5969.
jimblandy committed Jul 17, 2024
1 parent 2bc328c commit aeb2067
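
In outline, the fix keeps two atomic counters on the device: one bumped for
every submission attempt, and one advanced only when a submission actually
succeeds, so a blocking poll always waits on a fence value that will
eventually be signalled. Below is a minimal sketch of that shape, using
hypothetical names rather than wgpu's actual types:

use std::sync::atomic::{AtomicU64, Ordering};

/// Sketch only; wgpu's real code lives in `Device` and `Global::queue_submit`.
struct SubmissionIndices {
    /// Incremented for every submission attempt.
    active: AtomicU64,
    /// Advanced only when a submission succeeds.
    last_successful: AtomicU64,
}

impl SubmissionIndices {
    fn submit(&self, do_submit: impl FnOnce() -> Result<(), ()>) -> Result<u64, ()> {
        let index = self.active.fetch_add(1, Ordering::SeqCst) + 1;
        // If submission fails, `last_successful` is left alone, so a later
        // blocking poll never waits on a fence value that no one will signal.
        do_submit()?;
        self.last_successful.fetch_max(index, Ordering::SeqCst);
        Ok(index)
    }

    /// The fence value that a `Maintain::Wait`-style poll can safely block on.
    fn wait_target(&self) -> u64 {
        self.last_successful.load(Ordering::Acquire)
    }
}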
Showing 3 changed files with 87 additions and 21 deletions.
tests/tests/poll.rs (34 additions, 0 deletions)

@@ -125,3 +125,37 @@ static WAIT_OUT_OF_ORDER: GpuTestConfiguration =
            .await
            .panic_on_timeout();
    });
+
+/// Submit a command buffer to the wrong device. A wait poll shouldn't hang.
+///
+/// We can't catch panics on Wasm, since they get reported directly to the
+/// console.
+#[gpu_test]
+static WAIT_AFTER_BAD_SUBMISSION: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(wgpu_test::TestParameters::default().skip(wgpu_test::FailureCase::webgl2()))
+    .run_async(wait_after_bad_submission);
+
+async fn wait_after_bad_submission(ctx: TestingContext) {
+    let (device2, queue2) =
+        wgpu_test::initialize_device(&ctx.adapter, ctx.device_features, ctx.device_limits.clone())
+            .await;
+
+    let command_buffer1 = ctx
+        .device
+        .create_command_encoder(&CommandEncoderDescriptor::default())
+        .finish();
+
+    // This should panic, since the command buffer belongs to the wrong
+    // device, and queue submission errors seem to be fatal errors?
+    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+        queue2.submit([command_buffer1]);
+    }));
+    assert!(result.is_err());
+
+    // This should not hang.
+    //
+    // Specifically, the failed submission should not cause a new fence value to
+    // be allocated that will not be signalled until further work is
+    // successfully submitted, causing a greater fence value to be signalled.
+    device2.poll(wgpu::Maintain::Wait);
+}
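
The panic assertion above relies on `std::panic::catch_unwind`. As a
standalone sketch of that pattern (the `assert_panics` helper is hypothetical,
not part of the test suite):

use std::panic::{catch_unwind, AssertUnwindSafe};

/// Run `f` and assert that it panics, as the test does for the bad submission.
/// `AssertUnwindSafe` is needed because the captured state (here, a queue
/// reference) is not automatically `UnwindSafe`.
fn assert_panics(f: impl FnOnce()) {
    let result = catch_unwind(AssertUnwindSafe(f));
    assert!(result.is_err(), "expected the closure to panic");
}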
wgpu-core/src/device/queue.rs (6 additions, 1 deletion)

@@ -1069,7 +1069,7 @@ impl Global {
            let fence = fence_guard.as_mut().unwrap();
            let submit_index = device
                .active_submission_index
-                .fetch_add(1, Ordering::Relaxed)
+                .fetch_add(1, Ordering::SeqCst)
                + 1;
            let mut active_executions = Vec::new();

@@ -1392,6 +1392,11 @@ impl Global {
                )
                .map_err(DeviceError::from)?;
            }
+
+            // Advance the successful submission index.
+            device
+                .last_successful_submission_index
+                .fetch_max(submit_index, Ordering::SeqCst);
        }

        profiling::scope!("cleanup");
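
A side note on the `fetch_max` above: a plain `store` would also work while
submissions are fully serialized, but `fetch_max` keeps the recorded index
monotonic even if the bookkeeping for two successful submissions were ever to
race. A small sketch, separate from the commit itself:

use std::sync::atomic::{AtomicU64, Ordering};

fn record_success(last_successful: &AtomicU64, submit_index: u64) {
    // `fetch_max` never lowers the value, so a stale update cannot roll the
    // index back past a later successful submission.
    last_successful.fetch_max(submit_index, Ordering::SeqCst);
}

fn main() {
    let last = AtomicU64::new(0);
    // Suppose submission 2's bookkeeping happens to run before submission 1's.
    record_success(&last, 2);
    record_success(&last, 1); // no effect; the index stays at 2
    assert_eq!(last.load(Ordering::SeqCst), 2);
}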
wgpu-core/src/device/resource.rs (47 additions, 20 deletions)

@@ -88,7 +88,27 @@ pub struct Device<A: HalApi> {
    label: String,

    pub(crate) command_allocator: command::CommandAllocator<A>,
+
+    /// The index of the last command submission that was attempted.
+    ///
+    /// Note that `fence` may never be signalled with this value, if the command
+    /// submission failed. If you need to wait for everything running on a
+    /// `Queue` to complete, wait for [`last_successful_submission_index`].
+    ///
+    /// [`last_successful_submission_index`]: Device::last_successful_submission_index
    pub(crate) active_submission_index: hal::AtomicFenceValue,
+
+    /// The index of the last successful submission to this device's
+    /// [`hal::Queue`].
+    ///
+    /// Unlike [`active_submission_index`], which is incremented each time
+    /// submission is attempted, this is updated only when submission succeeds,
+    /// so waiting for this value won't hang waiting for work that was never
+    /// submitted.
+    ///
+    /// [`active_submission_index`]: Device::active_submission_index
+    pub(crate) last_successful_submission_index: hal::AtomicFenceValue,
+
    // NOTE: if both are needed, the `snatchable_lock` must be consistently acquired before the
    // `fence` lock to avoid deadlocks.
    pub(crate) fence: RwLock<Option<A::Fence>>,
@@ -257,6 +277,7 @@ impl<A: HalApi> Device<A> {
            label: desc.label.to_string(),
            command_allocator,
            active_submission_index: AtomicU64::new(0),
+            last_successful_submission_index: AtomicU64::new(0),
            fence: RwLock::new(rank::DEVICE_FENCE, Some(fence)),
            snatchable_lock: unsafe { SnatchLock::new(rank::DEVICE_SNATCHABLE_LOCK) },
            valid: AtomicBool::new(true),
@@ -387,37 +408,41 @@ impl<A: HalApi> Device<A> {
        profiling::scope!("Device::maintain");

        let fence = fence_guard.as_ref().unwrap();
-        let last_done_index = if maintain.is_wait() {
-            let index_to_wait_for = match maintain {
-                wgt::Maintain::WaitForSubmissionIndex(submission_index) => {
-                    // We don't need to check to see if the queue id matches
-                    // as we already checked this from inside the poll call.
-                    submission_index.index
-                }
-                _ => self.active_submission_index.load(Ordering::Relaxed),
-            };
-            unsafe {
+
+        // Determine which submission index `maintain` represents.
+        let submission_index = match maintain {
+            wgt::Maintain::WaitForSubmissionIndex(submission_index) => {
+                // We don't need to check to see if the queue id matches
+                // as we already checked this from inside the poll call.
+                submission_index.index
+            }
+            wgt::Maintain::Wait => self
+                .last_successful_submission_index
+                .load(Ordering::Acquire),
+            wgt::Maintain::Poll => unsafe {
                self.raw
                    .as_ref()
                    .unwrap()
-                    .wait(fence, index_to_wait_for, CLEANUP_WAIT_MS)
+                    .get_fence_value(fence)
                    .map_err(DeviceError::from)?
-            };
-            index_to_wait_for
-        } else {
+            },
+        };
+
+        // If necessary, wait for that submission to complete.
+        if maintain.is_wait() {
            unsafe {
                self.raw
                    .as_ref()
                    .unwrap()
-                    .get_fence_value(fence)
+                    .wait(fence, submission_index, CLEANUP_WAIT_MS)
                    .map_err(DeviceError::from)?
-            }
-        };
-        log::info!("Device::maintain: last done index {last_done_index}");
+            };
+        }
+        log::info!("Device::maintain: waiting for submission index {submission_index}");

        let mut life_tracker = self.lock_life();
        let submission_closures =
-            life_tracker.triage_submissions(last_done_index, &self.command_allocator);
+            life_tracker.triage_submissions(submission_index, &self.command_allocator);

        life_tracker.triage_mapped();
@@ -3586,7 +3611,9 @@ impl<A: HalApi> Device<A> {
    /// Wait for idle and remove resources that we can, before we die.
    pub(crate) fn prepare_to_die(&self) {
        self.pending_writes.lock().as_mut().unwrap().deactivate();
-        let current_index = self.active_submission_index.load(Ordering::Relaxed);
+        let current_index = self
+            .last_successful_submission_index
+            .load(Ordering::Acquire);
        if let Err(error) = unsafe {
            let fence = self.fence.read();
            let fence = fence.as_ref().unwrap();
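
From the API user's side, the guarantee this commit restores is that a
blocking poll terminates even when the most recent submission failed. A
sketch of that user-visible behavior (assumes an already-initialized wgpu
`Device`; not part of the commit):

fn flush_and_wait(device: &wgpu::Device) {
    // Even if an earlier `queue.submit(...)` panicked, this now returns once
    // all successfully submitted work is done, instead of hanging on a fence
    // value that will never be signalled.
    device.poll(wgpu::Maintain::Wait);
}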
