From de809c8f96ba18084873355a917fbdfc5426ea90 Mon Sep 17 00:00:00 2001 From: Vladas Zakrevskis <146100@gmail.com> Date: Wed, 29 May 2024 19:00:32 +0100 Subject: [PATCH 1/9] Fix missing family check flag (#5754) Co-authored-by: Jim Blandy Co-authored-by: Xiaopeng Li Co-authored-by: Connor Fitzgerald Co-authored-by: Samson <16504129+sagudev@users.noreply.github.com> Co-authored-by: Valaphee The Meerkat <32491319+valaphee@users.noreply.github.com> Co-authored-by: Andreas Reich --- CHANGELOG.md | 4 ++++ wgpu-hal/src/metal/adapter.rs | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a13590d0b..ff3eb46b99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -96,6 +96,10 @@ By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410) - Ensure render pipelines have at least 1 target. By @ErichDonGubler in [#5715](https://github.com/gfx-rs/wgpu/pull/5715) +#### Metal + +- Fix unrecognized selector crash on iOS 12. By @vladasz in [#5744](https://github.com/gfx-rs/wgpu/pull/5744). + #### Vulkan - Fix enablement of subgroup ops extension on Vulkan devices that don't support Vulkan 1.3. By @cwfitzgerald in [#5624](https://github.com/gfx-rs/wgpu/pull/5624). diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs index 2f84be8859..0ffe37f5e7 100644 --- a/wgpu-hal/src/metal/adapter.rs +++ b/wgpu-hal/src/metal/adapter.rs @@ -736,7 +736,9 @@ impl super::PrivateCapabilities { 4 }, // Per https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf - max_color_attachment_bytes_per_sample: if device.supports_family(MTLGPUFamily::Apple4) { + max_color_attachment_bytes_per_sample: if family_check + && device.supports_family(MTLGPUFamily::Apple4) + { 64 } else { 32 From 23307e1dc355df3686547c48e9d1523105faa735 Mon Sep 17 00:00:00 2001 From: Valaphee The Meerkat <32491319+valaphee@users.noreply.github.com> Date: Wed, 29 May 2024 20:01:32 +0200 Subject: [PATCH 2/9] gles: Return the version as driver_info (#5753) --- CHANGELOG.md | 9 +++++---- wgpu-hal/src/gles/adapter.rs | 24 ++---------------------- 2 files changed, 7 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff3eb46b99..9ed0ded2e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -92,7 +92,7 @@ By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410) ### Bug Fixes -### General +#### General - Ensure render pipelines have at least 1 target. 
By @ErichDonGubler in [#5715](https://github.com/gfx-rs/wgpu/pull/5715) @@ -106,9 +106,10 @@ By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410) #### GLES / OpenGL -- Fix regression on OpenGL (EGL) where non-sRGB still used sRGB [#5642](https://github.com/gfx-rs/wgpu/pull/5642) -- Fix `ClearColorF`, `ClearColorU` and `ClearColorI` commands being issued before `SetDrawColorBuffers` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) -- Replace `glClear` with `glClearBufferF` because `glDrawBuffers` requires that the ith buffer must be `COLOR_ATTACHMENTi` or `NONE` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) +- Fix regression on OpenGL (EGL) where non-sRGB still used sRGB [#5642](https://github.com/gfx-rs/wgpu/pull/5642) +- Fix `ClearColorF`, `ClearColorU` and `ClearColorI` commands being issued before `SetDrawColorBuffers` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) +- Replace `glClear` with `glClearBufferF` because `glDrawBuffers` requires that the ith buffer must be `COLOR_ATTACHMENTi` or `NONE` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) +- Return the unmodified version in driver_info. By @Valaphee in [#5753](https://github.com/gfx-rs/wgpu/pull/5753) ## v0.20.0 (2024-04-28) diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs index 03c026aa23..926b5afbcb 100644 --- a/wgpu-hal/src/gles/adapter.rs +++ b/wgpu-hal/src/gles/adapter.rs @@ -179,33 +179,13 @@ impl super::Adapter { 0 }; - let driver; - let driver_info; - if version.starts_with("WebGL ") || version.starts_with("OpenGL ") { - let es_sig = " ES"; - match version.find(es_sig) { - Some(pos) => { - driver = version[..pos + es_sig.len()].to_owned(); - driver_info = version[pos + es_sig.len() + 1..].to_owned(); - } - None => { - let pos = version.find(' ').unwrap(); - driver = version[..pos].to_owned(); - driver_info = version[pos + 1..].to_owned(); - } - } - } else { - driver = "OpenGL".to_owned(); - driver_info = version; - } - wgt::AdapterInfo { name: renderer_orig, vendor: vendor_id, device: 0, device_type: inferred_device_type, - driver, - driver_info, + driver: "".to_owned(), + driver_info: version, backend: wgt::Backend::Gl, } } From 071fb14e159749241b810ada3ee2e620f15d915e Mon Sep 17 00:00:00 2001 From: Douglas Dwyer Date: Wed, 29 May 2024 15:33:04 -0400 Subject: [PATCH 3/9] Add support for pipeline-overridable constants in web backend (#5688) * Add support for pipeline-overridable constants in WebGPU * Add utility function for setting constants map * Panic on failure to set constants map --------- Co-authored-by: Andreas Reich --- CHANGELOG.md | 4 ++++ wgpu/src/backend/webgpu.rs | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ed0ded2e4..392ccc9b44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -111,6 +111,10 @@ By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410) - Replace `glClear` with `glClearBufferF` because `glDrawBuffers` requires that the ith buffer must be `COLOR_ATTACHMENTi` or `NONE` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) - Return the unmodified version in driver_info. 
By @Valaphee in [#5753](https://github.com/gfx-rs/wgpu/pull/5753)
+#### WebGPU
+
+- Added support for pipeline-overridable constants to the WebGPU backend by @DouglasDwyer in [#5688](https://github.com/gfx-rs/wgpu/pull/5688)
+
 ## v0.20.0 (2024-04-28)
 
 ### Major Changes
 
diff --git a/wgpu/src/backend/webgpu.rs b/wgpu/src/backend/webgpu.rs
index 9d316e76fb..948c707b78 100644
--- a/wgpu/src/backend/webgpu.rs
+++ b/wgpu/src/backend/webgpu.rs
@@ -7,6 +7,7 @@ use js_sys::Promise;
 use std::{
     any::Any,
     cell::RefCell,
+    collections::HashMap,
     fmt,
     future::Future,
     marker::PhantomData,
@@ -1876,6 +1877,10 @@ impl crate::context::Context for ContextWebGpu {
         let module: &<ContextWebGpu as crate::context::Context>::ShaderModuleData =
             downcast_ref(desc.vertex.module.data.as_ref());
         let mut mapped_vertex_state = webgpu_sys::GpuVertexState::new(&module.0.module);
+        insert_constants_map(
+            &mapped_vertex_state,
+            desc.vertex.compilation_options.constants,
+        );
         mapped_vertex_state.entry_point(desc.vertex.entry_point);
 
         let buffers = desc
@@ -1952,6 +1957,7 @@ impl crate::context::Context for ContextWebGpu {
                 downcast_ref(frag.module.data.as_ref());
             let mut mapped_fragment_desc =
                 webgpu_sys::GpuFragmentState::new(&module.0.module, &targets);
+            insert_constants_map(&mapped_fragment_desc, frag.compilation_options.constants);
             mapped_fragment_desc.entry_point(frag.entry_point);
             mapped_desc.fragment(&mapped_fragment_desc);
         }
@@ -1978,6 +1984,7 @@ impl crate::context::Context for ContextWebGpu {
             downcast_ref(desc.module.data.as_ref());
         let mut mapped_compute_stage =
             webgpu_sys::GpuProgrammableStage::new(&shader_module.0.module);
+        insert_constants_map(&mapped_compute_stage, desc.compilation_options.constants);
         mapped_compute_stage.entry_point(desc.entry_point);
         let auto_layout = wasm_bindgen::JsValue::from(webgpu_sys::GpuAutoLayoutMode::Auto);
         let mut mapped_desc = webgpu_sys::GpuComputePipelineDescriptor::new(
@@ -1994,6 +2001,7 @@ impl crate::context::Context for ContextWebGpu {
         if let Some(label) = desc.label {
             mapped_desc.label(label);
         }
+
         create_identified(device_data.0.create_compute_pipeline(&mapped_desc))
     }
 
@@ -3824,3 +3832,29 @@ impl Drop for BufferMappedRange {
         }
     }
 }
+
+/// Adds the constants map to the given pipeline descriptor if the map is nonempty.
+/// Panics if the map cannot be set.
+///
+/// This function is necessary because the constants array is not currently
+/// exposed by `wasm-bindgen`. See the following issues for details:
+/// - [gfx-rs/wgpu#5688](https://github.com/gfx-rs/wgpu/pull/5688)
+/// - [rustwasm/wasm-bindgen#3587](https://github.com/rustwasm/wasm-bindgen/issues/3587)
+fn insert_constants_map(target: &JsValue, map: &HashMap<String, f64>) {
+    if !map.is_empty() {
+        js_sys::Reflect::set(target, &"constants".into(), &hashmap_to_jsvalue(map))
+            .expect("Setting the values in a Javascript pipeline descriptor should never fail");
+    }
+}
+
+/// Converts a hashmap to a Javascript object.
+fn hashmap_to_jsvalue(map: &HashMap<String, f64>) -> JsValue {
+    let obj = js_sys::Object::new();
+
+    for (k, v) in map.iter() {
+        js_sys::Reflect::set(&obj, &k.into(), &(*v).into())
+            .expect("Setting the values in a Javascript map should never fail");
+    }
+
+    JsValue::from(obj)
+}

From 588950110af8aca278516ec15d33ef6b7b66588c Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 30 May 2024 00:43:24 +0200
Subject: [PATCH 4/9] Remove lifetime dependency of `ComputePass` on its
 parent command encoder (#5620)

* lift encoder->computepass lifetime constraint and add now-failing test
* compute passes now take an `Arc` to their parent command encoder, removing the compile-time dependency on it
* command encoder now goes into a locked state while a compute pass is open
* changelog entry
* share most of the code between get_encoder and lock_encoder
---
 CHANGELOG.md                                  |   8 +-
 deno_webgpu/command_encoder.rs                |   5 +-
 ...ownership.rs => compute_pass_ownership.rs} |  49 +++-
 tests/tests/encoder.rs                        | 230 +++++++++++++++++-
 tests/tests/root.rs                           |   2 +-
 wgpu-core/src/command/clear.rs                |  10 +-
 wgpu-core/src/command/compute.rs              |  87 +++++--
 wgpu-core/src/command/mod.rs                  | 106 +++++++-
 wgpu-core/src/command/render.rs               |   2 +-
 wgpu-core/src/registry.rs                     |  12 +-
 wgpu/src/backend/wgpu_core.rs                 |  20 +-
 wgpu/src/lib.rs                               |  47 ++--
 12 files changed, 490 insertions(+), 88 deletions(-)
 rename tests/tests/{compute_pass_resource_ownership.rs => compute_pass_ownership.rs} (77%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 392ccc9b44..23370791cd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -47,7 +47,13 @@ TODO(wumpf): This is still work in progress. Should write a bit more about it. A
 `wgpu::ComputePass` recording methods (e.g. `wgpu::ComputePass::set_pipeline`) no longer impose a lifetime constraint on passed-in resources.
-By @wumpf in [#5569](https://github.com/gfx-rs/wgpu/pull/5569), [#5575](https://github.com/gfx-rs/wgpu/pull/5575).
+Furthermore, `wgpu::ComputePass` no longer has a lifetime dependency on its parent `wgpu::CommandEncoder`.
+⚠️ As long as a `wgpu::ComputePass` is pending for a given `wgpu::CommandEncoder`, creating another compute or render pass on that encoder is an error and invalidates the `wgpu::CommandEncoder`.
+Previously, this was statically enforced by a lifetime constraint.
+TODO(wumpf): There was some discussion on whether to make this lifetime constraint opt-in or opt-out (entirely on the `wgpu` side, with no changes to `wgpu-core`).
+Lifting this lifetime dependency is very useful for library authors, but opens up an easy way for incorrect use.
+
+By @wumpf in [#5569](https://github.com/gfx-rs/wgpu/pull/5569), [#5575](https://github.com/gfx-rs/wgpu/pull/5575), [#5620](https://github.com/gfx-rs/wgpu/pull/5620).
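A minimal sketch of the resulting usage (an illustration based on the description above, not code from this PR; assumes an existing `device` and `pipeline`):

```rust
let mut encoder =
    device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default());
cpass.set_pipeline(&pipeline);
// While `cpass` is alive, the encoder is locked: recording on `encoder` or
// calling `encoder.finish()` is a validation error that invalidates it.
drop(cpass); // ends the pass and unlocks the encoder
let _command_buffer = encoder.finish();
```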
#### Querying shader compilation errors diff --git a/deno_webgpu/command_encoder.rs b/deno_webgpu/command_encoder.rs index b82fba92ea..552b084171 100644 --- a/deno_webgpu/command_encoder.rs +++ b/deno_webgpu/command_encoder.rs @@ -261,15 +261,14 @@ pub fn op_webgpu_command_encoder_begin_compute_pass( timestamp_writes: timestamp_writes.as_ref(), }; - let compute_pass = gfx_select!(command_encoder => instance.command_encoder_create_compute_pass_dyn(*command_encoder, &descriptor)); - + let (compute_pass, error) = gfx_select!(command_encoder => instance.command_encoder_create_compute_pass_dyn(*command_encoder, &descriptor)); let rid = state .resource_table .add(super::compute_pass::WebGpuComputePass(RefCell::new( compute_pass, ))); - Ok(WebGpuResult::rid(rid)) + Ok(WebGpuResult::rid_err(rid, error)) } #[op2] diff --git a/tests/tests/compute_pass_resource_ownership.rs b/tests/tests/compute_pass_ownership.rs similarity index 77% rename from tests/tests/compute_pass_resource_ownership.rs rename to tests/tests/compute_pass_ownership.rs index 4d48c2ad9e..9988accd62 100644 --- a/tests/tests/compute_pass_resource_ownership.rs +++ b/tests/tests/compute_pass_ownership.rs @@ -1,9 +1,6 @@ //! Tests that compute passes take ownership of resources that are associated with. //! I.e. once a resource is passed in to a compute pass, it can be dropped. //! -//! TODO: Test doesn't check on timestamp writes & pipeline statistics queries yet. -//! (Not important as long as they are lifetime constrained to the command encoder, -//! but once we lift this constraint, we should add tests for this as well!) //! TODO: Also should test resource ownership for: //! * write_timestamp //! * begin_pipeline_statistics_query @@ -11,7 +8,7 @@ use std::num::NonZeroU64; use wgpu::util::DeviceExt as _; -use wgpu_test::{gpu_test, GpuTestConfiguration, TestParameters, TestingContext}; +use wgpu_test::{gpu_test, valid, GpuTestConfiguration, TestParameters, TestingContext}; const SHADER_SRC: &str = " @group(0) @binding(0) @@ -75,6 +72,50 @@ async fn compute_pass_resource_ownership(ctx: TestingContext) { assert_eq!(floats, [2.0, 4.0, 6.0, 8.0]); } +#[gpu_test] +static COMPUTE_PASS_KEEP_ENCODER_ALIVE: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters(TestParameters::default().test_features_limits()) + .run_async(compute_pass_keep_encoder_alive); + +async fn compute_pass_keep_encoder_alive(ctx: TestingContext) { + let ResourceSetup { + gpu_buffer: _, + cpu_buffer: _, + buffer_size: _, + indirect_buffer, + bind_group, + pipeline, + } = resource_setup(&ctx); + + let mut encoder = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("encoder"), + }); + + let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("compute_pass"), + timestamp_writes: None, + }); + + // Now drop the encoder - it is kept alive by the compute pass. + drop(encoder); + ctx.async_poll(wgpu::Maintain::wait()) + .await + .panic_on_timeout(); + + // Record some draw commands. + cpass.set_pipeline(&pipeline); + cpass.set_bind_group(0, &bind_group, &[]); + cpass.dispatch_workgroups_indirect(&indirect_buffer, 0); + + // Dropping the pass will still execute the pass, even though there's no way to submit it. + // Ideally, this would log an error, but the encoder is not dropped until the compute pass is dropped, + // making this a valid operation. + // (If instead the encoder was explicitly destroyed or finished, this would be an error.) 
+ valid(&ctx.device, || drop(cpass)); +} + // Setup ------------------------------------------------------------ struct ResourceSetup { diff --git a/tests/tests/encoder.rs b/tests/tests/encoder.rs index 83f575c4c8..efdde7a539 100644 --- a/tests/tests/encoder.rs +++ b/tests/tests/encoder.rs @@ -1,4 +1,8 @@ -use wgpu_test::{fail, gpu_test, FailureCase, GpuTestConfiguration, TestParameters}; +use wgpu::util::DeviceExt; +use wgpu::CommandEncoder; +use wgpu_test::{ + fail, gpu_test, FailureCase, GpuTestConfiguration, TestParameters, TestingContext, +}; #[gpu_test] static DROP_ENCODER: GpuTestConfiguration = GpuTestConfiguration::new().run_sync(|ctx| { @@ -72,3 +76,227 @@ static DROP_ENCODER_AFTER_ERROR: GpuTestConfiguration = GpuTestConfiguration::ne // The encoder is still open! drop(encoder); }); + +// TODO: This should also apply to render passes once the lifetime bound is lifted. +#[gpu_test] +static ENCODER_OPERATIONS_FAIL_WHILE_COMPUTE_PASS_ALIVE: GpuTestConfiguration = + GpuTestConfiguration::new() + .parameters(TestParameters::default().features( + wgpu::Features::CLEAR_TEXTURE + | wgpu::Features::TIMESTAMP_QUERY + | wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS, + )) + .run_sync(encoder_operations_fail_while_compute_pass_alive); + +fn encoder_operations_fail_while_compute_pass_alive(ctx: TestingContext) { + let buffer_source = ctx + .device + .create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: None, + contents: &[0u8; 4], + usage: wgpu::BufferUsages::COPY_SRC, + }); + let buffer_dest = ctx + .device + .create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: None, + contents: &[0u8; 4], + usage: wgpu::BufferUsages::COPY_DST, + }); + + let texture_desc = wgpu::TextureDescriptor { + label: None, + size: wgpu::Extent3d { + width: 1, + height: 1, + depth_or_array_layers: 1, + }, + mip_level_count: 1, + sample_count: 1, + dimension: wgpu::TextureDimension::D2, + format: wgpu::TextureFormat::Rgba8Unorm, + usage: wgpu::TextureUsages::COPY_DST, + view_formats: &[], + }; + let texture_dst = ctx.device.create_texture(&texture_desc); + let texture_src = ctx.device.create_texture(&wgpu::TextureDescriptor { + usage: wgpu::TextureUsages::COPY_SRC, + ..texture_desc + }); + let query_set = ctx.device.create_query_set(&wgpu::QuerySetDescriptor { + count: 1, + ty: wgpu::QueryType::Timestamp, + label: None, + }); + + #[allow(clippy::type_complexity)] + let recording_ops: Vec<(_, Box)> = vec![ + ( + "begin_compute_pass", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default()); + }), + ), + ( + "begin_render_pass", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.begin_render_pass(&wgpu::RenderPassDescriptor::default()); + }), + ), + ( + "copy_buffer_to_buffer", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.copy_buffer_to_buffer(&buffer_source, 0, &buffer_dest, 0, 4); + }), + ), + ( + "copy_buffer_to_texture", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.copy_buffer_to_texture( + wgpu::ImageCopyBuffer { + buffer: &buffer_source, + layout: wgpu::ImageDataLayout { + offset: 0, + bytes_per_row: Some(4), + rows_per_image: None, + }, + }, + texture_dst.as_image_copy(), + texture_dst.size(), + ); + }), + ), + ( + "copy_texture_to_buffer", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.copy_texture_to_buffer( + wgpu::ImageCopyTexture { + texture: &texture_src, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::ImageCopyBuffer { + 
buffer: &buffer_dest, + layout: wgpu::ImageDataLayout { + offset: 0, + bytes_per_row: Some(4), + rows_per_image: None, + }, + }, + texture_dst.size(), + ); + }), + ), + ( + "copy_texture_to_texture", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.copy_texture_to_texture( + wgpu::ImageCopyTexture { + texture: &texture_src, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::ImageCopyTexture { + texture: &texture_dst, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + texture_dst.size(), + ); + }), + ), + ( + "clear_texture", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.clear_texture(&texture_dst, &wgpu::ImageSubresourceRange::default()); + }), + ), + ( + "clear_buffer", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.clear_buffer(&buffer_dest, 0, None); + }), + ), + ( + "insert_debug_marker", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.insert_debug_marker("marker"); + }), + ), + ( + "push_debug_group", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.push_debug_group("marker"); + }), + ), + ( + "pop_debug_group", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.pop_debug_group(); + }), + ), + ( + "resolve_query_set", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.resolve_query_set(&query_set, 0..1, &buffer_dest, 0); + }), + ), + ( + "write_timestamp", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.write_timestamp(&query_set, 0); + }), + ), + ]; + + for (op_name, op) in recording_ops.iter() { + let mut encoder = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor::default()); + + let pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default()); + + ctx.device.push_error_scope(wgpu::ErrorFilter::Validation); + + log::info!("Testing operation {} on a locked command encoder", op_name); + fail( + &ctx.device, + || op(&mut encoder), + Some("Command encoder is locked"), + ); + + // Drop the pass - this also fails now since the encoder is invalid: + fail( + &ctx.device, + || drop(pass), + Some("Command encoder is invalid"), + ); + // Also, it's not possible to create a new pass on the encoder: + fail( + &ctx.device, + || encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default()), + Some("Command encoder is invalid"), + ); + } + + // Test encoder finishing separately since it consumes the encoder and doesn't fit above pattern. 
+ { + let mut encoder = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor::default()); + let pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default()); + fail( + &ctx.device, + || encoder.finish(), + Some("Command encoder is locked"), + ); + fail( + &ctx.device, + || drop(pass), + Some("Command encoder is invalid"), + ); + } +} diff --git a/tests/tests/root.rs b/tests/tests/root.rs index 29f894ede9..1cb5b56c7c 100644 --- a/tests/tests/root.rs +++ b/tests/tests/root.rs @@ -11,7 +11,7 @@ mod buffer; mod buffer_copy; mod buffer_usages; mod clear_texture; -mod compute_pass_resource_ownership; +mod compute_pass_ownership; mod create_surface_error; mod device; mod encoder; diff --git a/wgpu-core/src/command/clear.rs b/wgpu-core/src/command/clear.rs index faff177928..9ef0f24d47 100644 --- a/wgpu-core/src/command/clear.rs +++ b/wgpu-core/src/command/clear.rs @@ -26,8 +26,6 @@ use wgt::{math::align_to, BufferAddress, BufferUsages, ImageSubresourceRange, Te pub enum ClearError { #[error("To use clear_texture the CLEAR_TEXTURE feature needs to be enabled")] MissingClearTextureFeature, - #[error("Command encoder {0:?} is invalid")] - InvalidCommandEncoder(CommandEncoderId), #[error("Device {0:?} is invalid")] InvalidDevice(DeviceId), #[error("Buffer {0:?} is invalid or destroyed")] @@ -74,6 +72,8 @@ whereas subesource range specified start {subresource_base_array_layer} and coun }, #[error(transparent)] Device(#[from] DeviceError), + #[error(transparent)] + CommandEncoderError(#[from] super::CommandEncoderError), } impl Global { @@ -89,8 +89,7 @@ impl Global { let hub = A::hub(self); - let cmd_buf = CommandBuffer::get_encoder(hub, command_encoder_id) - .map_err(|_| ClearError::InvalidCommandEncoder(command_encoder_id))?; + let cmd_buf = CommandBuffer::get_encoder(hub, command_encoder_id)?; let mut cmd_buf_data = cmd_buf.data.lock(); let cmd_buf_data = cmd_buf_data.as_mut().unwrap(); @@ -183,8 +182,7 @@ impl Global { let hub = A::hub(self); - let cmd_buf = CommandBuffer::get_encoder(hub, command_encoder_id) - .map_err(|_| ClearError::InvalidCommandEncoder(command_encoder_id))?; + let cmd_buf = CommandBuffer::get_encoder(hub, command_encoder_id)?; let mut cmd_buf_data = cmd_buf.data.lock(); let cmd_buf_data = cmd_buf_data.as_mut().unwrap(); diff --git a/wgpu-core/src/command/compute.rs b/wgpu-core/src/command/compute.rs index 08609d9e51..5f463e179d 100644 --- a/wgpu-core/src/command/compute.rs +++ b/wgpu-core/src/command/compute.rs @@ -13,7 +13,7 @@ use crate::{ global::Global, hal_api::HalApi, hal_label, - id::{self, DeviceId}, + id::{self}, init_tracker::MemoryInitKind, resource::{self, Resource}, snatch::SnatchGuard, @@ -34,14 +34,20 @@ use wgt::{BufferAddress, DynamicOffset}; use std::sync::Arc; use std::{fmt, mem, str}; +use super::DynComputePass; + pub struct ComputePass { /// All pass data & records is stored here. /// - /// If this is `None`, the pass has been ended and can no longer be used. + /// If this is `None`, the pass is in the 'ended' state and can no longer be used. /// Any attempt to record more commands will result in a validation error. base: Option>>, - parent_id: id::CommandEncoderId, + /// Parent command buffer that this pass records commands into. + /// + /// If it is none, this pass is invalid and any operation on it will return an error. + parent: Option>>, + timestamp_writes: Option, // Resource binding dedupe state. 
@@ -50,10 +56,11 @@ pub struct ComputePass { } impl ComputePass { - fn new(parent_id: id::CommandEncoderId, desc: &ComputePassDescriptor) -> Self { + /// If the parent command buffer is invalid, the returned pass will be invalid. + fn new(parent: Option>>, desc: &ComputePassDescriptor) -> Self { Self { - base: Some(BasePass::>::new(&desc.label)), - parent_id, + base: Some(BasePass::new(&desc.label)), + parent, timestamp_writes: desc.timestamp_writes.cloned(), current_bind_groups: BindGroupStateChange::new(), @@ -62,8 +69,8 @@ impl ComputePass { } #[inline] - pub fn parent_id(&self) -> id::CommandEncoderId { - self.parent_id + pub fn parent_id(&self) -> Option { + self.parent.as_ref().map(|cmd_buf| cmd_buf.as_info().id()) } #[inline] @@ -84,7 +91,7 @@ impl ComputePass { impl fmt::Debug for ComputePass { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "ComputePass {{ encoder_id: {:?} }}", self.parent_id) + write!(f, "ComputePass {{ parent: {:?} }}", self.parent_id()) } } @@ -129,10 +136,12 @@ pub enum ComputePassErrorInner { Device(#[from] DeviceError), #[error(transparent)] Encoder(#[from] CommandEncoderError), + #[error("Parent encoder is invalid")] + InvalidParentEncoder, #[error("Bind group at index {0:?} is invalid")] InvalidBindGroup(u32), #[error("Device {0:?} is invalid")] - InvalidDevice(DeviceId), + InvalidDevice(id::DeviceId), #[error("Bind group index {index} is greater than the device's requested `max_bind_group` limit {max}")] BindGroupIndexOutOfRange { index: u32, max: u32 }, #[error("Compute pipeline {0:?} is invalid")] @@ -292,31 +301,55 @@ impl<'a, A: HalApi> State<'a, A> { // Running the compute pass. impl Global { + /// Creates a compute pass. + /// + /// If creation fails, an invalid pass is returned. + /// Any operation on an invalid pass will return an error. + /// + /// If successful, puts the encoder into the [`CommandEncoderStatus::Locked`] state. pub fn command_encoder_create_compute_pass( &self, - parent_id: id::CommandEncoderId, + encoder_id: id::CommandEncoderId, desc: &ComputePassDescriptor, - ) -> ComputePass { - ComputePass::new(parent_id, desc) + ) -> (ComputePass, Option) { + let hub = A::hub(self); + + match CommandBuffer::lock_encoder(hub, encoder_id) { + Ok(cmd_buf) => (ComputePass::new(Some(cmd_buf), desc), None), + Err(err) => (ComputePass::new(None, desc), Some(err)), + } } + /// Creates a type erased compute pass. + /// + /// If creation fails, an invalid pass is returned. + /// Any operation on an invalid pass will return an error. 
pub fn command_encoder_create_compute_pass_dyn( &self, - parent_id: id::CommandEncoderId, + encoder_id: id::CommandEncoderId, desc: &ComputePassDescriptor, - ) -> Box { - Box::new(ComputePass::::new(parent_id, desc)) + ) -> (Box, Option) { + let (pass, err) = self.command_encoder_create_compute_pass::(encoder_id, desc); + (Box::new(pass), err) } pub fn compute_pass_end( &self, pass: &mut ComputePass, ) -> Result<(), ComputePassError> { - let base = pass.base.take().ok_or(ComputePassError { - scope: PassErrorScope::Pass(pass.parent_id), - inner: ComputePassErrorInner::PassEnded, - })?; - self.compute_pass_end_impl(pass.parent_id, base, pass.timestamp_writes.as_ref()) + let scope = PassErrorScope::Pass(pass.parent_id()); + let Some(parent) = pass.parent.as_ref() else { + return Err(ComputePassErrorInner::InvalidParentEncoder).map_pass_err(scope); + }; + + parent.unlock_encoder().map_pass_err(scope)?; + + let base = pass + .base + .take() + .ok_or(ComputePassErrorInner::PassEnded) + .map_pass_err(scope)?; + self.compute_pass_end_impl(parent, base, pass.timestamp_writes.as_ref()) } #[doc(hidden)] @@ -326,10 +359,14 @@ impl Global { base: BasePass, timestamp_writes: Option<&ComputePassTimestampWrites>, ) -> Result<(), ComputePassError> { + let hub = A::hub(self); + + let cmd_buf = CommandBuffer::get_encoder(hub, encoder_id) + .map_pass_err(PassErrorScope::PassEncoder(encoder_id))?; let commands = ComputeCommand::resolve_compute_command_ids(A::hub(self), &base.commands)?; self.compute_pass_end_impl::( - encoder_id, + &cmd_buf, BasePass { label: base.label, commands, @@ -343,17 +380,15 @@ impl Global { fn compute_pass_end_impl( &self, - encoder_id: id::CommandEncoderId, + cmd_buf: &CommandBuffer, base: BasePass>, timestamp_writes: Option<&ComputePassTimestampWrites>, ) -> Result<(), ComputePassError> { profiling::scope!("CommandEncoder::run_compute_pass"); - let pass_scope = PassErrorScope::Pass(encoder_id); + let pass_scope = PassErrorScope::Pass(Some(cmd_buf.as_info().id())); let hub = A::hub(self); - let cmd_buf: Arc> = - CommandBuffer::get_encoder(hub, encoder_id).map_pass_err(pass_scope)?; let device = &cmd_buf.device; if !device.is_valid() { return Err(ComputePassErrorInner::InvalidDevice( diff --git a/wgpu-core/src/command/mod.rs b/wgpu-core/src/command/mod.rs index bfb9276057..20a6bdfae1 100644 --- a/wgpu-core/src/command/mod.rs +++ b/wgpu-core/src/command/mod.rs @@ -25,7 +25,6 @@ use self::memory_init::CommandBufferTextureMemoryActions; use crate::device::{Device, DeviceError}; use crate::error::{ErrorFormatter, PrettyError}; use crate::hub::Hub; -use crate::id::CommandBufferId; use crate::lock::{rank, Mutex}; use crate::snatch::SnatchGuard; @@ -51,10 +50,23 @@ pub(crate) enum CommandEncoderStatus { /// [`compute_pass_end`] require the encoder to be in this /// state. /// + /// This corresponds to WebGPU's "open" state. + /// See + /// /// [`command_encoder_clear_buffer`]: Global::command_encoder_clear_buffer /// [`compute_pass_end`]: Global::compute_pass_end Recording, + /// Locked by a render or compute pass. + /// + /// This state is entered when a render/compute pass is created, + /// and exited when the pass is ended. + /// + /// As long as the command encoder is locked, any command building operation on it will fail + /// and put the encoder into the [`CommandEncoderStatus::Error`] state. + /// See + Locked, + /// Command recording is complete, and the buffer is ready for submission. 
/// /// [`Global::command_encoder_finish`] transitions a @@ -410,6 +422,38 @@ impl CommandBuffer { } impl CommandBuffer { + fn get_encoder_impl( + hub: &Hub, + id: id::CommandEncoderId, + lock_on_acquire: bool, + ) -> Result, CommandEncoderError> { + let storage = hub.command_buffers.read(); + match storage.get(id.into_command_buffer_id()) { + Ok(cmd_buf) => { + let mut cmd_buf_data = cmd_buf.data.lock(); + let cmd_buf_data = cmd_buf_data.as_mut().unwrap(); + match cmd_buf_data.status { + CommandEncoderStatus::Recording => { + if lock_on_acquire { + cmd_buf_data.status = CommandEncoderStatus::Locked; + } + Ok(cmd_buf.clone()) + } + CommandEncoderStatus::Locked => { + // Any operation on a locked encoder is required to put it into the invalid/error state. + // See https://www.w3.org/TR/webgpu/#encoder-state-locked + cmd_buf_data.encoder.discard(); + cmd_buf_data.status = CommandEncoderStatus::Error; + Err(CommandEncoderError::Locked) + } + CommandEncoderStatus::Finished => Err(CommandEncoderError::NotRecording), + CommandEncoderStatus::Error => Err(CommandEncoderError::Invalid), + } + } + Err(_) => Err(CommandEncoderError::Invalid), + } + } + /// Return the [`CommandBuffer`] for `id`, for recording new commands. /// /// In `wgpu_core`, the [`CommandBuffer`] type serves both as encoder and @@ -420,14 +464,37 @@ impl CommandBuffer { hub: &Hub, id: id::CommandEncoderId, ) -> Result, CommandEncoderError> { - let storage = hub.command_buffers.read(); - match storage.get(id.into_command_buffer_id()) { - Ok(cmd_buf) => match cmd_buf.data.lock().as_ref().unwrap().status { - CommandEncoderStatus::Recording => Ok(cmd_buf.clone()), - CommandEncoderStatus::Finished => Err(CommandEncoderError::NotRecording), - CommandEncoderStatus::Error => Err(CommandEncoderError::Invalid), - }, - Err(_) => Err(CommandEncoderError::Invalid), + let lock_on_acquire = false; + Self::get_encoder_impl(hub, id, lock_on_acquire) + } + + /// Return the [`CommandBuffer`] for `id` and if successful puts it into the [`CommandEncoderStatus::Locked`] state. + /// + /// See [`CommandBuffer::get_encoder`]. + /// Call [`CommandBuffer::unlock_encoder`] to put the [`CommandBuffer`] back into the [`CommandEncoderStatus::Recording`] state. + fn lock_encoder( + hub: &Hub, + id: id::CommandEncoderId, + ) -> Result, CommandEncoderError> { + let lock_on_acquire = true; + Self::get_encoder_impl(hub, id, lock_on_acquire) + } + + /// Unlocks the [`CommandBuffer`] for `id` and puts it back into the [`CommandEncoderStatus::Recording`] state. + /// + /// This function is the counterpart to [`CommandBuffer::lock_encoder`]. + /// It is only valid to call this function if the encoder is in the [`CommandEncoderStatus::Locked`] state. + fn unlock_encoder(&self) -> Result<(), CommandEncoderError> { + let mut data_lock = self.data.lock(); + let status = &mut data_lock.as_mut().unwrap().status; + match *status { + CommandEncoderStatus::Recording => Err(CommandEncoderError::Invalid), + CommandEncoderStatus::Locked => { + *status = CommandEncoderStatus::Recording; + Ok(()) + } + CommandEncoderStatus::Finished => Err(CommandEncoderError::Invalid), + CommandEncoderStatus::Error => Err(CommandEncoderError::Invalid), } } @@ -564,6 +631,8 @@ pub enum CommandEncoderError { NotRecording, #[error(transparent)] Device(#[from] DeviceError), + #[error("Command encoder is locked by a previously created render/compute pass. 
Before recording any new commands, the pass must be ended.")] + Locked, } impl Global { @@ -571,7 +640,7 @@ impl Global { &self, encoder_id: id::CommandEncoderId, _desc: &wgt::CommandBufferDescriptor { entry_point: final_entry_point_name.as_ref(), constants: desc.stage.constants.as_ref(), zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory, + vertex_pulling_transform: false, }, cache: cache.as_ref().and_then(|it| it.raw.as_ref()), }; @@ -3165,6 +3166,7 @@ impl Device { entry_point: &vertex_entry_point_name, constants: stage_desc.constants.as_ref(), zero_initialize_workgroup_memory: stage_desc.zero_initialize_workgroup_memory, + vertex_pulling_transform: stage_desc.vertex_pulling_transform, } }; @@ -3228,6 +3230,7 @@ impl Device { zero_initialize_workgroup_memory: fragment_state .stage .zero_initialize_workgroup_memory, + vertex_pulling_transform: false, }) } None => None, diff --git a/wgpu-core/src/pipeline.rs b/wgpu-core/src/pipeline.rs index ee8f8668c3..f3e7dbacb2 100644 --- a/wgpu-core/src/pipeline.rs +++ b/wgpu-core/src/pipeline.rs @@ -166,6 +166,8 @@ pub struct ProgrammableStageDescriptor<'a> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, + /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. + pub vertex_pulling_transform: bool, } /// Number of implicit bind groups derived at pipeline creation. diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs index ee59fa2590..560aa6f8c6 100644 --- a/wgpu-hal/examples/halmark/main.rs +++ b/wgpu-hal/examples/halmark/main.rs @@ -254,6 +254,7 @@ impl Example { entry_point: "vs_main", constants: &constants, zero_initialize_workgroup_memory: true, + vertex_pulling_transform: false, }, vertex_buffers: &[], fragment_stage: Some(hal::ProgrammableStage { @@ -261,6 +262,7 @@ impl Example { entry_point: "fs_main", constants: &constants, zero_initialize_workgroup_memory: true, + vertex_pulling_transform: false, }), primitive: wgt::PrimitiveState { topology: wgt::PrimitiveTopology::TriangleStrip, diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs index 8f404dc4d2..90f0e6fc50 100644 --- a/wgpu-hal/examples/ray-traced-triangle/main.rs +++ b/wgpu-hal/examples/ray-traced-triangle/main.rs @@ -373,6 +373,7 @@ impl Example { entry_point: "main", constants: &Default::default(), zero_initialize_workgroup_memory: true, + vertex_pulling_transform: false, }, cache: None, }) diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs index 35b9ea0d0a..da3834bcb0 100644 --- a/wgpu-hal/src/lib.rs +++ b/wgpu-hal/src/lib.rs @@ -1714,6 +1714,8 @@ pub struct ProgrammableStage<'a, A: Api> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, + /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. 
+ pub vertex_pulling_transform: bool, } // Rust gets confused about the impl requirements for `A` @@ -1724,6 +1726,7 @@ impl Clone for ProgrammableStage<'_, A> { entry_point: self.entry_point, constants: self.constants, zero_initialize_workgroup_memory: self.zero_initialize_workgroup_memory, + vertex_pulling_transform: self.vertex_pulling_transform, } } } diff --git a/wgpu-hal/src/metal/command.rs b/wgpu-hal/src/metal/command.rs index 341712c323..fb9c7e9c0e 100644 --- a/wgpu-hal/src/metal/command.rs +++ b/wgpu-hal/src/metal/command.rs @@ -16,6 +16,7 @@ impl Default for super::CommandState { raw_wg_size: metal::MTLSize::new(0, 0, 0), stage_infos: Default::default(), storage_buffer_length_map: Default::default(), + vertex_buffer_size_map: Default::default(), work_group_memory_sizes: Vec::new(), push_constants: Vec::new(), pending_timer_queries: Vec::new(), @@ -137,6 +138,7 @@ impl super::CommandEncoder { impl super::CommandState { fn reset(&mut self) { self.storage_buffer_length_map.clear(); + self.vertex_buffer_size_map.clear(); self.stage_infos.vs.clear(); self.stage_infos.fs.clear(); self.stage_infos.cs.clear(); @@ -160,6 +162,15 @@ impl super::CommandState { .unwrap_or_default() })); + // Extend with the sizes of the mapped vertex buffers, in the order + // they were added to the map. + result_sizes.extend(stage_info.vertex_buffer_mappings.iter().map(|vbm| { + self.vertex_buffer_size_map + .get(&(vbm.id as u64)) + .map(|size| u32::try_from(size.get()).unwrap_or(u32::MAX)) + .unwrap_or_default() + })); + if !result_sizes.is_empty() { Some((slot as _, result_sizes)) } else { @@ -927,6 +938,27 @@ impl crate::CommandEncoder for super::CommandEncoder { let buffer_index = self.shared.private_caps.max_vertex_buffers as u64 - 1 - index as u64; let encoder = self.state.render.as_ref().unwrap(); encoder.set_vertex_buffer(buffer_index, Some(&binding.buffer.raw), binding.offset); + + let buffer_size = binding.resolve_size(); + if buffer_size > 0 { + self.state.vertex_buffer_size_map.insert( + buffer_index, + std::num::NonZeroU64::new(buffer_size).unwrap(), + ); + } else { + self.state.vertex_buffer_size_map.remove(&buffer_index); + } + + if let Some((index, sizes)) = self + .state + .make_sizes_buffer_update(naga::ShaderStage::Vertex, &mut self.temp.binding_sizes) + { + encoder.set_vertex_bytes( + index as _, + (sizes.len() * WORD_SIZE) as u64, + sizes.as_ptr() as _, + ); + } } unsafe fn set_viewport(&mut self, rect: &crate::Rect, depth_range: Range) { diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index 81ab5dbdb6..77ea8a0d86 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -59,10 +59,48 @@ fn create_depth_stencil_desc(state: &wgt::DepthStencilState) -> metal::DepthSten desc } +const fn convert_vertex_format_to_naga(format: wgt::VertexFormat) -> naga::back::msl::VertexFormat { + match format { + wgt::VertexFormat::Uint8x2 => naga::back::msl::VertexFormat::Uint8x2, + wgt::VertexFormat::Uint8x4 => naga::back::msl::VertexFormat::Uint8x4, + wgt::VertexFormat::Sint8x2 => naga::back::msl::VertexFormat::Sint8x2, + wgt::VertexFormat::Sint8x4 => naga::back::msl::VertexFormat::Sint8x4, + wgt::VertexFormat::Unorm8x2 => naga::back::msl::VertexFormat::Unorm8x2, + wgt::VertexFormat::Unorm8x4 => naga::back::msl::VertexFormat::Unorm8x4, + wgt::VertexFormat::Snorm8x2 => naga::back::msl::VertexFormat::Snorm8x2, + wgt::VertexFormat::Snorm8x4 => naga::back::msl::VertexFormat::Snorm8x4, + wgt::VertexFormat::Uint16x2 => naga::back::msl::VertexFormat::Uint16x2, + 
wgt::VertexFormat::Uint16x4 => naga::back::msl::VertexFormat::Uint16x4, + wgt::VertexFormat::Sint16x2 => naga::back::msl::VertexFormat::Sint16x2, + wgt::VertexFormat::Sint16x4 => naga::back::msl::VertexFormat::Sint16x4, + wgt::VertexFormat::Unorm16x2 => naga::back::msl::VertexFormat::Unorm16x2, + wgt::VertexFormat::Unorm16x4 => naga::back::msl::VertexFormat::Unorm16x4, + wgt::VertexFormat::Snorm16x2 => naga::back::msl::VertexFormat::Snorm16x2, + wgt::VertexFormat::Snorm16x4 => naga::back::msl::VertexFormat::Snorm16x4, + wgt::VertexFormat::Float16x2 => naga::back::msl::VertexFormat::Float16x2, + wgt::VertexFormat::Float16x4 => naga::back::msl::VertexFormat::Float16x4, + wgt::VertexFormat::Float32 => naga::back::msl::VertexFormat::Float32, + wgt::VertexFormat::Float32x2 => naga::back::msl::VertexFormat::Float32x2, + wgt::VertexFormat::Float32x3 => naga::back::msl::VertexFormat::Float32x3, + wgt::VertexFormat::Float32x4 => naga::back::msl::VertexFormat::Float32x4, + wgt::VertexFormat::Uint32 => naga::back::msl::VertexFormat::Uint32, + wgt::VertexFormat::Uint32x2 => naga::back::msl::VertexFormat::Uint32x2, + wgt::VertexFormat::Uint32x3 => naga::back::msl::VertexFormat::Uint32x3, + wgt::VertexFormat::Uint32x4 => naga::back::msl::VertexFormat::Uint32x4, + wgt::VertexFormat::Sint32 => naga::back::msl::VertexFormat::Sint32, + wgt::VertexFormat::Sint32x2 => naga::back::msl::VertexFormat::Sint32x2, + wgt::VertexFormat::Sint32x3 => naga::back::msl::VertexFormat::Sint32x3, + wgt::VertexFormat::Sint32x4 => naga::back::msl::VertexFormat::Sint32x4, + wgt::VertexFormat::Unorm10_10_10_2 => naga::back::msl::VertexFormat::Unorm10_10_10_2, + _ => unimplemented!(), + } +} + impl super::Device { fn load_shader( &self, stage: &crate::ProgrammableStage, + vertex_buffer_mappings: &[naga::back::msl::VertexBufferMapping], layout: &super::PipelineLayout, primitive_class: metal::MTLPrimitiveTopologyClass, naga_stage: naga::ShaderStage, @@ -120,6 +158,8 @@ impl super::Device { metal::MTLPrimitiveTopologyClass::Point => true, _ => false, }, + vertex_pulling_transform: stage.vertex_pulling_transform, + vertex_buffer_mappings: vertex_buffer_mappings.to_vec(), }; let (source, info) = @@ -548,7 +588,7 @@ impl crate::Device for super::Device { pc_buffer: Option, pc_limit: u32, sizes_buffer: Option, - sizes_count: u8, + need_sizes_buffer: bool, resources: naga::back::msl::BindingMap, } @@ -558,7 +598,7 @@ impl crate::Device for super::Device { pc_buffer: None, pc_limit: 0, sizes_buffer: None, - sizes_count: 0, + need_sizes_buffer: false, resources: Default::default(), }); let mut bind_group_infos = arrayvec::ArrayVec::new(); @@ -603,7 +643,7 @@ impl crate::Device for super::Device { { for info in stage_data.iter_mut() { if entry.visibility.contains(map_naga_stage(info.stage)) { - info.sizes_count += 1; + info.need_sizes_buffer = true; } } } @@ -661,11 +701,13 @@ impl crate::Device for super::Device { // Finally, make sure we fit the limits for info in stage_data.iter_mut() { - // handle the sizes buffer assignment and shader overrides - if info.sizes_count != 0 { + if info.need_sizes_buffer || info.stage == naga::ShaderStage::Vertex { + // Set aside space for the sizes_buffer, which is required + // for variable-length buffers, or to support vertex pulling. 
info.sizes_buffer = Some(info.counters.buffers); info.counters.buffers += 1; } + if info.counters.buffers > self.shared.private_caps.max_buffers_per_stage || info.counters.textures > self.shared.private_caps.max_textures_per_stage || info.counters.samplers > self.shared.private_caps.max_samplers_per_stage @@ -832,8 +874,38 @@ impl crate::Device for super::Device { // Vertex shader let (vs_lib, vs_info) = { + let mut vertex_buffer_mappings = Vec::::new(); + for (i, vbl) in desc.vertex_buffers.iter().enumerate() { + let mut attributes = Vec::::new(); + for attribute in vbl.attributes.iter() { + attributes.push(naga::back::msl::AttributeMapping { + shader_location: attribute.shader_location, + offset: attribute.offset as u32, + format: convert_vertex_format_to_naga(attribute.format), + }); + } + + vertex_buffer_mappings.push(naga::back::msl::VertexBufferMapping { + id: self.shared.private_caps.max_vertex_buffers - 1 - i as u32, + stride: if vbl.array_stride > 0 { + vbl.array_stride.try_into().unwrap() + } else { + vbl.attributes + .iter() + .map(|attribute| attribute.offset + attribute.format.size()) + .max() + .unwrap_or(0) + .try_into() + .unwrap() + }, + indexed_by_vertex: (vbl.step_mode == wgt::VertexStepMode::Vertex {}), + attributes, + }); + } + let vs = self.load_shader( &desc.vertex_stage, + &vertex_buffer_mappings, desc.layout, primitive_class, naga::ShaderStage::Vertex, @@ -851,6 +923,7 @@ impl crate::Device for super::Device { push_constants: desc.layout.push_constants_infos.vs, sizes_slot: desc.layout.per_stage_map.vs.sizes_buffer, sized_bindings: vs.sized_bindings, + vertex_buffer_mappings, }; (vs.library, info) @@ -861,6 +934,7 @@ impl crate::Device for super::Device { Some(ref stage) => { let fs = self.load_shader( stage, + &[], desc.layout, primitive_class, naga::ShaderStage::Fragment, @@ -878,6 +952,7 @@ impl crate::Device for super::Device { push_constants: desc.layout.push_constants_infos.fs, sizes_slot: desc.layout.per_stage_map.fs.sizes_buffer, sized_bindings: fs.sized_bindings, + vertex_buffer_mappings: vec![], }; (Some(fs.library), Some(info)) @@ -1053,6 +1128,7 @@ impl crate::Device for super::Device { let cs = self.load_shader( &desc.stage, + &[], desc.layout, metal::MTLPrimitiveTopologyClass::Unspecified, naga::ShaderStage::Compute, @@ -1070,6 +1146,7 @@ impl crate::Device for super::Device { push_constants: desc.layout.push_constants_infos.cs, sizes_slot: desc.layout.per_stage_map.cs.sizes_buffer, sized_bindings: cs.sized_bindings, + vertex_buffer_mappings: vec![], }; if let Some(name) = desc.label { diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs index a5ea63b035..ce8e015924 100644 --- a/wgpu-hal/src/metal/mod.rs +++ b/wgpu-hal/src/metal/mod.rs @@ -466,6 +466,15 @@ impl Buffer { } } +impl crate::BufferBinding<'_, Api> { + fn resolve_size(&self) -> wgt::BufferAddress { + match self.size { + Some(size) => size.get(), + None => self.buffer.size - self.offset, + } + } +} + #[derive(Debug)] pub struct Texture { raw: metal::Texture, @@ -690,6 +699,9 @@ struct PipelineStageInfo { /// /// See `device::CompiledShader::sized_bindings` for more details. sized_bindings: Vec, + + /// Info on all bound vertex buffers. 
+ vertex_buffer_mappings: Vec, } impl PipelineStageInfo { @@ -697,6 +709,7 @@ impl PipelineStageInfo { self.push_constants = None; self.sizes_slot = None; self.sized_bindings.clear(); + self.vertex_buffer_mappings.clear(); } fn assign_from(&mut self, other: &Self) { @@ -704,6 +717,9 @@ impl PipelineStageInfo { self.sizes_slot = other.sizes_slot; self.sized_bindings.clear(); self.sized_bindings.extend_from_slice(&other.sized_bindings); + self.vertex_buffer_mappings.clear(); + self.vertex_buffer_mappings + .extend_from_slice(&other.vertex_buffer_mappings); } } @@ -821,6 +837,8 @@ struct CommandState { /// [`ResourceBinding`]: naga::ResourceBinding storage_buffer_length_map: rustc_hash::FxHashMap, + vertex_buffer_size_map: rustc_hash::FxHashMap, + work_group_memory_sizes: Vec, push_constants: Vec, diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs index 5ed055f2be..d5210900bb 100644 --- a/wgpu/src/backend/wgpu_core.rs +++ b/wgpu/src/backend/wgpu_core.rs @@ -1189,6 +1189,10 @@ impl crate::Context for ContextWgpuCore { .vertex .compilation_options .zero_initialize_workgroup_memory, + vertex_pulling_transform: desc + .vertex + .compilation_options + .vertex_pulling_transform, }, buffers: Borrowed(&vertex_buffers), }, @@ -1203,6 +1207,7 @@ impl crate::Context for ContextWgpuCore { zero_initialize_workgroup_memory: frag .compilation_options .zero_initialize_workgroup_memory, + vertex_pulling_transform: false, }, targets: Borrowed(frag.targets), }), @@ -1256,6 +1261,7 @@ impl crate::Context for ContextWgpuCore { zero_initialize_workgroup_memory: desc .compilation_options .zero_initialize_workgroup_memory, + vertex_pulling_transform: false, }, cache: desc.cache.map(|c| c.id.into()), }; diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs index 00130a99c2..e94ae27fe8 100644 --- a/wgpu/src/lib.rs +++ b/wgpu/src/lib.rs @@ -1987,6 +1987,8 @@ pub struct PipelineCompilationOptions<'a> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, + /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. + pub vertex_pulling_transform: bool, } impl<'a> Default for PipelineCompilationOptions<'a> { @@ -2000,6 +2002,7 @@ impl<'a> Default for PipelineCompilationOptions<'a> { Self { constants, zero_initialize_workgroup_memory: true, + vertex_pulling_transform: false, } } } From c7458638d14921c7562e4197ddeefa17be413587 Mon Sep 17 00:00:00 2001 From: Connor Fitzgerald Date: Thu, 30 May 2024 16:53:34 -0400 Subject: [PATCH 8/9] [hal/vk] Rework Submission and Surface Synchronization (#5681) Fix two major synchronization issues in `wgpu_val::vulkan`: - Properly order queue command buffer submissions. Due to Mesa bugs, two semaphores are required even though the Vulkan spec says that only one should be necessary. - Properly manage surface texture acquisition and presentation: - Acquiring a surface texture can return while the presentation engine is still displaying the texture. Applications must wait for a semaphore to be signaled before using the acquired texture. - Presenting a surface texture requires a semaphore to ensure that drawing is complete before presentation occurs. 
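A minimal sketch of the per-frame flow this implies for hal users (based on the
halmark example changes below; `ctx` bundles the command encoder, fence, and
fence value as in that example):

    // The fence is handed to acquire so the backend can make sure the
    // presentation engine is done with the texture before it is reused.
    let surface_tex = unsafe {
        surface
            .acquire_texture(None, &ctx.fence)
            .unwrap()
            .unwrap()
            .texture
    };
    // ... record commands and end the encoder ...
    let cmd_buf = unsafe { ctx.encoder.end_encoding().unwrap() };
    unsafe {
        // submit() now always signals a fence; listing the surface texture
        // lets the backend insert the acquire/present semaphores.
        queue
            .submit(&[&cmd_buf], &[&surface_tex], (&mut ctx.fence, ctx.fence_value))
            .unwrap();
        queue.present(&surface, surface_tex).unwrap();
    }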
Co-authored-by: Jim Blandy --- wgpu-core/src/device/queue.rs | 2 +- wgpu-core/src/present.rs | 15 +- wgpu-hal/examples/halmark/main.rs | 73 ++-- wgpu-hal/examples/raw-gles.rs | 3 +- wgpu-hal/examples/ray-traced-triangle/main.rs | 73 ++-- wgpu-hal/src/dx12/mod.rs | 11 +- wgpu-hal/src/empty.rs | 3 +- wgpu-hal/src/gles/egl.rs | 1 + wgpu-hal/src/gles/queue.rs | 12 +- wgpu-hal/src/gles/web.rs | 1 + wgpu-hal/src/gles/wgl.rs | 1 + wgpu-hal/src/lib.rs | 113 ++++- wgpu-hal/src/metal/mod.rs | 55 ++- wgpu-hal/src/metal/surface.rs | 1 + wgpu-hal/src/vulkan/adapter.rs | 20 +- wgpu-hal/src/vulkan/device.rs | 140 +++--- wgpu-hal/src/vulkan/instance.rs | 71 ++- wgpu-hal/src/vulkan/mod.rs | 408 +++++++++++++++--- 18 files changed, 699 insertions(+), 304 deletions(-) diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 168b36843b..8eb46f0aa9 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -1499,7 +1499,7 @@ impl Global { .raw .as_ref() .unwrap() - .submit(&refs, &submit_surface_textures, Some((fence, submit_index))) + .submit(&refs, &submit_surface_textures, (fence, submit_index)) .map_err(DeviceError::from)?; } diff --git a/wgpu-core/src/present.rs b/wgpu-core/src/present.rs index 053f7fdb24..7f5939feb0 100644 --- a/wgpu-core/src/present.rs +++ b/wgpu-core/src/present.rs @@ -154,17 +154,20 @@ impl Global { parent_id: surface_id, }); } - #[cfg(not(feature = "trace"))] - let _ = device; + + let fence_guard = device.fence.read(); + let fence = fence_guard.as_ref().unwrap(); let suf = A::surface_as_hal(surface.as_ref()); let (texture_id, status) = match unsafe { - suf.unwrap() - .acquire_texture(Some(std::time::Duration::from_millis( - FRAME_TIMEOUT_MS as u64, - ))) + suf.unwrap().acquire_texture( + Some(std::time::Duration::from_millis(FRAME_TIMEOUT_MS as u64)), + fence, + ) } { Ok(Some(ast)) => { + drop(fence_guard); + let texture_desc = wgt::TextureDescriptor { label: (), size: wgt::Extent3d { diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs index 560aa6f8c6..81474f233d 100644 --- a/wgpu-hal/examples/halmark/main.rs +++ b/wgpu-hal/examples/halmark/main.rs @@ -22,7 +22,6 @@ const MAX_BUNNIES: usize = 1 << 20; const BUNNY_SIZE: f32 = 0.15 * 256.0; const GRAVITY: f32 = -9.8 * 100.0; const MAX_VELOCITY: f32 = 750.0; -const COMMAND_BUFFER_PER_CONTEXT: usize = 100; const DESIRED_MAX_LATENCY: u32 = 2; #[repr(C)] @@ -498,7 +497,7 @@ impl Example { let mut fence = device.create_fence().unwrap(); let init_cmd = cmd_encoder.end_encoding().unwrap(); queue - .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value))) + .submit(&[&init_cmd], &[], (&mut fence, init_fence_value)) .unwrap(); device.wait(&fence, init_fence_value, !0).unwrap(); device.destroy_buffer(staging_buffer); @@ -550,7 +549,7 @@ impl Example { { let ctx = &mut self.contexts[self.context_index]; self.queue - .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value))) + .submit(&[], &[], (&mut ctx.fence, ctx.fence_value)) .unwrap(); } @@ -650,7 +649,13 @@ impl Example { let ctx = &mut self.contexts[self.context_index]; - let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture }; + let surface_tex = unsafe { + self.surface + .acquire_texture(None, &ctx.fence) + .unwrap() + .unwrap() + .texture + }; let target_barrier0 = hal::TextureBarrier { texture: surface_tex.borrow(), @@ -718,7 +723,6 @@ impl Example { } ctx.frames_recorded += 1; - let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT; let target_barrier1 = 
hal::TextureBarrier { texture: surface_tex.borrow(), @@ -732,45 +736,42 @@ impl Example { unsafe { let cmd_buf = ctx.encoder.end_encoding().unwrap(); - let fence_param = if do_fence { - Some((&mut ctx.fence, ctx.fence_value)) - } else { - None - }; self.queue - .submit(&[&cmd_buf], &[&surface_tex], fence_param) + .submit( + &[&cmd_buf], + &[&surface_tex], + (&mut ctx.fence, ctx.fence_value), + ) .unwrap(); self.queue.present(&self.surface, surface_tex).unwrap(); ctx.used_cmd_bufs.push(cmd_buf); ctx.used_views.push(surface_tex_view); }; - if do_fence { - log::debug!("Context switch from {}", self.context_index); - let old_fence_value = ctx.fence_value; - if self.contexts.len() == 1 { - let hal_desc = hal::CommandEncoderDescriptor { - label: None, - queue: &self.queue, - }; - self.contexts.push(unsafe { - ExecutionContext { - encoder: self.device.create_command_encoder(&hal_desc).unwrap(), - fence: self.device.create_fence().unwrap(), - fence_value: 0, - used_views: Vec::new(), - used_cmd_bufs: Vec::new(), - frames_recorded: 0, - } - }); - } - self.context_index = (self.context_index + 1) % self.contexts.len(); - let next = &mut self.contexts[self.context_index]; - unsafe { - next.wait_and_clear(&self.device); - } - next.fence_value = old_fence_value + 1; + log::debug!("Context switch from {}", self.context_index); + let old_fence_value = ctx.fence_value; + if self.contexts.len() == 1 { + let hal_desc = hal::CommandEncoderDescriptor { + label: None, + queue: &self.queue, + }; + self.contexts.push(unsafe { + ExecutionContext { + encoder: self.device.create_command_encoder(&hal_desc).unwrap(), + fence: self.device.create_fence().unwrap(), + fence_value: 0, + used_views: Vec::new(), + used_cmd_bufs: Vec::new(), + frames_recorded: 0, + } + }); + } + self.context_index = (self.context_index + 1) % self.contexts.len(); + let next = &mut self.contexts[self.context_index]; + unsafe { + next.wait_and_clear(&self.device); } + next.fence_value = old_fence_value + 1; } } diff --git a/wgpu-hal/examples/raw-gles.rs b/wgpu-hal/examples/raw-gles.rs index 342100e1cb..675a518694 100644 --- a/wgpu-hal/examples/raw-gles.rs +++ b/wgpu-hal/examples/raw-gles.rs @@ -156,6 +156,7 @@ fn fill_screen(exposed: &hal::ExposedAdapter, width: u32, height }) .unwrap() }; + let mut fence = unsafe { od.device.create_fence().unwrap() }; let rp_desc = hal::RenderPassDescriptor { label: None, extent: wgt::Extent3d { @@ -183,6 +184,6 @@ fn fill_screen(exposed: &hal::ExposedAdapter, width: u32, height encoder.begin_render_pass(&rp_desc); encoder.end_render_pass(); let cmd_buf = encoder.end_encoding().unwrap(); - od.queue.submit(&[&cmd_buf], &[], None).unwrap(); + od.queue.submit(&[&cmd_buf], &[], (&mut fence, 0)).unwrap(); } } diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs index 90f0e6fc50..cf0e146ec9 100644 --- a/wgpu-hal/examples/ray-traced-triangle/main.rs +++ b/wgpu-hal/examples/ray-traced-triangle/main.rs @@ -13,7 +13,6 @@ use std::{ }; use winit::window::WindowButtons; -const COMMAND_BUFFER_PER_CONTEXT: usize = 100; const DESIRED_MAX_LATENCY: u32 = 2; /// [D3D12_RAYTRACING_INSTANCE_DESC](https://microsoft.github.io/DirectX-Specs/d3d/Raytracing.html#d3d12_raytracing_instance_desc) @@ -759,7 +758,7 @@ impl Example { let mut fence = device.create_fence().unwrap(); let init_cmd = cmd_encoder.end_encoding().unwrap(); queue - .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value))) + .submit(&[&init_cmd], &[], (&mut fence, init_fence_value)) .unwrap(); 
         device.wait(&fence, init_fence_value, !0).unwrap();
         cmd_encoder.reset_all(iter::once(init_cmd));
@@ -808,7 +807,13 @@ impl Example {
     fn render(&mut self) {
         let ctx = &mut self.contexts[self.context_index];

-        let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture };
+        let surface_tex = unsafe {
+            self.surface
+                .acquire_texture(None, &ctx.fence)
+                .unwrap()
+                .unwrap()
+                .texture
+        };

         let target_barrier0 = hal::TextureBarrier {
             texture: surface_tex.borrow(),
@@ -909,7 +914,6 @@ impl Example {
         }

         ctx.frames_recorded += 1;
-        let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT;

         let target_barrier1 = hal::TextureBarrier {
             texture: surface_tex.borrow(),
@@ -959,45 +963,42 @@ impl Example {

         unsafe {
             let cmd_buf = ctx.encoder.end_encoding().unwrap();
-            let fence_param = if do_fence {
-                Some((&mut ctx.fence, ctx.fence_value))
-            } else {
-                None
-            };
             self.queue
-                .submit(&[&cmd_buf], &[&surface_tex], fence_param)
+                .submit(
+                    &[&cmd_buf],
+                    &[&surface_tex],
+                    (&mut ctx.fence, ctx.fence_value),
+                )
                 .unwrap();
             self.queue.present(&self.surface, surface_tex).unwrap();
             ctx.used_cmd_bufs.push(cmd_buf);
             ctx.used_views.push(surface_tex_view);
         };

-        if do_fence {
-            log::info!("Context switch from {}", self.context_index);
-            let old_fence_value = ctx.fence_value;
-            if self.contexts.len() == 1 {
-                let hal_desc = hal::CommandEncoderDescriptor {
-                    label: None,
-                    queue: &self.queue,
-                };
-                self.contexts.push(unsafe {
-                    ExecutionContext {
-                        encoder: self.device.create_command_encoder(&hal_desc).unwrap(),
-                        fence: self.device.create_fence().unwrap(),
-                        fence_value: 0,
-                        used_views: Vec::new(),
-                        used_cmd_bufs: Vec::new(),
-                        frames_recorded: 0,
-                    }
-                });
-            }
-            self.context_index = (self.context_index + 1) % self.contexts.len();
-            let next = &mut self.contexts[self.context_index];
-            unsafe {
-                next.wait_and_clear(&self.device);
-            }
-            next.fence_value = old_fence_value + 1;
+        log::info!("Context switch from {}", self.context_index);
+        let old_fence_value = ctx.fence_value;
+        if self.contexts.len() == 1 {
+            let hal_desc = hal::CommandEncoderDescriptor {
+                label: None,
+                queue: &self.queue,
+            };
+            self.contexts.push(unsafe {
+                ExecutionContext {
+                    encoder: self.device.create_command_encoder(&hal_desc).unwrap(),
+                    fence: self.device.create_fence().unwrap(),
+                    fence_value: 0,
+                    used_views: Vec::new(),
+                    used_cmd_bufs: Vec::new(),
+                    frames_recorded: 0,
+                }
+            });
         }
+        self.context_index = (self.context_index + 1) % self.contexts.len();
+        let next = &mut self.contexts[self.context_index];
+        unsafe {
+            next.wait_and_clear(&self.device);
+        }
+        next.fence_value = old_fence_value + 1;
     }

     fn exit(mut self) {
@@ -1005,7 +1006,7 @@ impl Example {
         {
             let ctx = &mut self.contexts[self.context_index];
             self.queue
-                .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value)))
+                .submit(&[], &[], (&mut ctx.fence, ctx.fence_value))
                 .unwrap();
         }

diff --git a/wgpu-hal/src/dx12/mod.rs b/wgpu-hal/src/dx12/mod.rs
index 99800e87c9..9d5f62f915 100644
--- a/wgpu-hal/src/dx12/mod.rs
+++ b/wgpu-hal/src/dx12/mod.rs
@@ -857,6 +857,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        _fence: &Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<Api>>, crate::SurfaceError> {
         let mut swapchain = self.swap_chain.write();
         let sc = swapchain.as_mut().unwrap();
@@ -895,7 +896,7 @@ impl crate::Queue for Queue {
         &self,
         command_buffers: &[&CommandBuffer],
         _surface_textures: &[&Texture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
     ) -> Result<(),
 crate::DeviceError> {
         let mut temp_lists = self.temp_lists.lock();
         temp_lists.clear();
@@ -908,11 +909,9 @@ impl crate::Queue for Queue {
             self.raw.execute_command_lists(&temp_lists);
         }

-        if let Some((fence, value)) = signal_fence {
-            self.raw
-                .signal(&fence.raw, value)
-                .into_device_result("Signal fence")?;
-        }
+        self.raw
+            .signal(&signal_fence.raw, signal_value)
+            .into_device_result("Signal fence")?;

         // Note the lack of synchronization here between the main Direct queue
         // and the dedicated presentation queue. This is automatically handled

diff --git a/wgpu-hal/src/empty.rs b/wgpu-hal/src/empty.rs
index f1986f7705..8cba9d063f 100644
--- a/wgpu-hal/src/empty.rs
+++ b/wgpu-hal/src/empty.rs
@@ -75,6 +75,7 @@ impl crate::Surface for Context {
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        fence: &Resource,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<Api>>, crate::SurfaceError> {
         Ok(None)
     }
@@ -114,7 +115,7 @@ impl crate::Queue for Context {
         &self,
         command_buffers: &[&Resource],
         surface_textures: &[&Resource],
-        signal_fence: Option<(&mut Resource, crate::FenceValue)>,
+        signal_fence: (&mut Resource, crate::FenceValue),
     ) -> DeviceResult<()> {
         Ok(())
     }

diff --git a/wgpu-hal/src/gles/egl.rs b/wgpu-hal/src/gles/egl.rs
index 5ddf9b48b5..07cd8e835d 100644
--- a/wgpu-hal/src/gles/egl.rs
+++ b/wgpu-hal/src/gles/egl.rs
@@ -1432,6 +1432,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let swapchain = self.swapchain.read();
         let sc = swapchain.as_ref().unwrap();

diff --git a/wgpu-hal/src/gles/queue.rs b/wgpu-hal/src/gles/queue.rs
index f6b55a449a..95eff36d57 100644
--- a/wgpu-hal/src/gles/queue.rs
+++ b/wgpu-hal/src/gles/queue.rs
@@ -1740,7 +1740,7 @@ impl crate::Queue for super::Queue {
         &self,
         command_buffers: &[&super::CommandBuffer],
         _surface_textures: &[&super::Texture],
-        signal_fence: Option<(&mut super::Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut super::Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         let shared = Arc::clone(&self.shared);
         let gl = &shared.context.lock();
@@ -1774,12 +1774,10 @@ impl crate::Queue for super::Queue {
             }
         }

-        if let Some((fence, value)) = signal_fence {
-            fence.maintain(gl);
-            let sync = unsafe { gl.fence_sync(glow::SYNC_GPU_COMMANDS_COMPLETE, 0) }
-                .map_err(|_| crate::DeviceError::OutOfMemory)?;
-            fence.pending.push((value, sync));
-        }
+        signal_fence.maintain(gl);
+        let sync = unsafe { gl.fence_sync(glow::SYNC_GPU_COMMANDS_COMPLETE, 0) }
+            .map_err(|_| crate::DeviceError::OutOfMemory)?;
+        signal_fence.pending.push((signal_value, sync));

         Ok(())
     }

diff --git a/wgpu-hal/src/gles/web.rs b/wgpu-hal/src/gles/web.rs
index ab2ccef8b6..081f7da5d1 100644
--- a/wgpu-hal/src/gles/web.rs
+++ b/wgpu-hal/src/gles/web.rs
@@ -427,6 +427,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let swapchain = self.swapchain.read();
         let sc = swapchain.as_ref().unwrap();

diff --git a/wgpu-hal/src/gles/wgl.rs b/wgpu-hal/src/gles/wgl.rs
index aae70478b4..1111d98f83 100644
--- a/wgpu-hal/src/gles/wgl.rs
+++ b/wgpu-hal/src/gles/wgl.rs
@@ -798,6 +798,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>,
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let swapchain = self.swapchain.read();
         let sc = swapchain.as_ref().unwrap();
diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs
index da3834bcb0..e81fad403f 100644
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@@ -459,44 +459,101 @@ pub trait Instance: Sized + WasmNotSendSync {
 pub trait Surface: WasmNotSendSync {
     type A: Api;

-    /// Configures the surface to use the given device.
+    /// Configure `self` to use `device`.
     ///
     /// # Safety
     ///
-    /// - All gpu work that uses the surface must have been completed.
+    /// - All GPU work using `self` must have been completed.
     /// - All [`AcquiredSurfaceTexture`]s must have been destroyed.
     /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed.
-    /// - All surfaces created using other devices must have been unconfigured before this call.
+    /// - The surface `self` must not currently be configured to use any other [`Device`].
     unsafe fn configure(
         &self,
         device: &<Self::A as Api>::Device,
         config: &SurfaceConfiguration,
     ) -> Result<(), SurfaceError>;

-    /// Unconfigures the surface on the given device.
+    /// Unconfigure `self` on `device`.
     ///
     /// # Safety
    ///
-    /// - All gpu work that uses the surface must have been completed.
+    /// - All GPU work that uses `surface` must have been completed.
     /// - All [`AcquiredSurfaceTexture`]s must have been destroyed.
     /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed.
-    /// - The surface must have been configured on the given device.
+    /// - The surface `self` must have been configured on `device`.
     unsafe fn unconfigure(&self, device: &<Self::A as Api>::Device);

-    /// Returns the next texture to be presented by the swapchain for drawing
+    /// Return the next texture to be presented by `self`, for the caller to draw on.
     ///
-    /// A `timeout` of `None` means to wait indefinitely, with no timeout.
+    /// On success, return an [`AcquiredSurfaceTexture`] representing the
+    /// texture into which the caller should draw the image to be displayed on
+    /// `self`.
+    ///
+    /// If `timeout` elapses before `self` has a texture ready to be acquired,
+    /// return `Ok(None)`. If `timeout` is `None`, wait indefinitely, with no
+    /// timeout.
+    ///
+    /// # Using an [`AcquiredSurfaceTexture`]
+    ///
+    /// On success, this function returns an [`AcquiredSurfaceTexture`] whose
+    /// [`texture`] field is a [`SurfaceTexture`] from which the caller can
+    /// [`borrow`] a [`Texture`] to draw on. The [`AcquiredSurfaceTexture`] also
+    /// carries some metadata about that [`SurfaceTexture`].
+    ///
+    /// All calls to [`Queue::submit`] that draw on that [`Texture`] must also
+    /// include the [`SurfaceTexture`] in the `surface_textures` argument.
+    ///
+    /// When you are done drawing on the texture, you can display it on `self`
+    /// by passing the [`SurfaceTexture`] and `self` to [`Queue::present`].
+    ///
+    /// If you do not wish to display the texture, you must pass the
+    /// [`SurfaceTexture`] to [`self.discard_texture`], so that it can be reused
+    /// by future acquisitions.
     ///
     /// # Portability
     ///
-    /// Some backends can't support a timeout when acquiring a texture and
-    /// the timeout will be ignored.
+    /// Some backends can't support a timeout when acquiring a texture. On these
+    /// backends, `timeout` is ignored.
     ///
-    /// Returns `None` on timing out.
+    /// # Safety
+    ///
+    /// - The surface `self` must currently be configured on some [`Device`].
+    ///
+    /// - The `fence` argument must be the same [`Fence`] passed to all calls to
+    ///   [`Queue::submit`] that used [`Texture`]s acquired from this surface.
+    ///
+    /// - You may only have one texture acquired from `self` at a time. When
+    ///   `acquire_texture` returns `Ok(Some(ast))`, you must pass the returned
+    ///   [`SurfaceTexture`] `ast.texture` to either [`Queue::present`] or
+    ///   [`Surface::discard_texture`] before calling `acquire_texture` again.
+    ///
+    /// [`texture`]: AcquiredSurfaceTexture::texture
+    /// [`SurfaceTexture`]: Api::SurfaceTexture
+    /// [`borrow`]: std::borrow::Borrow::borrow
+    /// [`Texture`]: Api::Texture
+    /// [`Fence`]: Api::Fence
+    /// [`self.discard_texture`]: Surface::discard_texture
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        fence: &<Self::A as Api>::Fence,
     ) -> Result<Option<AcquiredSurfaceTexture<Self::A>>, SurfaceError>;
+
+    /// Relinquish an acquired texture without presenting it.
+    ///
+    /// After this call, the texture underlying [`SurfaceTexture`] may be
+    /// returned by subsequent calls to [`self.acquire_texture`].
+    ///
+    /// # Safety
+    ///
+    /// - The surface `self` must currently be configured on some [`Device`].
+    ///
+    /// - `texture` must be a [`SurfaceTexture`] returned by a call to
+    ///   [`self.acquire_texture`] that has not yet been passed to
+    ///   [`Queue::present`].
+    ///
+    /// [`SurfaceTexture`]: Api::SurfaceTexture
+    /// [`self.acquire_texture`]: Surface::acquire_texture
     unsafe fn discard_texture(&self, texture: <Self::A as Api>::SurfaceTexture);
 }

@@ -762,19 +819,23 @@ pub trait Queue: WasmNotSendSync {
     /// Submit `command_buffers` for execution on GPU.
     ///
-    /// If `signal_fence` is `Some(fence, value)`, update `fence` to `value`
-    /// when the operation is complete. See [`Fence`] for details.
+    /// Update `fence` to `value` when the operation is complete. See
+    /// [`Fence`] for details.
+    ///
+    /// A `wgpu_hal` queue is "single threaded": all command buffers are
+    /// executed in the order they're submitted, with each buffer able to see
+    /// previous buffers' results. Specifically:
     ///
-    /// If two calls to `submit` on a single `Queue` occur in a particular order
-    /// (that is, they happen on the same thread, or on two threads that have
-    /// synchronized to establish an ordering), then the first submission's
-    /// commands all complete execution before any of the second submission's
-    /// commands begin. All results produced by one submission are visible to
-    /// the next.
+    /// - If two calls to `submit` on a single `Queue` occur in a particular
+    ///   order (that is, they happen on the same thread, or on two threads that
+    ///   have synchronized to establish an ordering), then the first
+    ///   submission's commands all complete execution before any of the second
+    ///   submission's commands begin. All results produced by one submission
+    ///   are visible to the next.
     ///
-    /// Within a submission, command buffers execute in the order in which they
-    /// appear in `command_buffers`. All results produced by one buffer are
-    /// visible to the next.
+    /// - Within a submission, command buffers execute in the order in which they
+    ///   appear in `command_buffers`. All results produced by one buffer are
+    ///   visible to the next.
     ///
     /// If two calls to `submit` on a single `Queue` from different threads are
     /// not synchronized to occur in a particular order, they must pass distinct
@@ -803,10 +864,16 @@ pub trait Queue: WasmNotSendSync {
     /// - Every [`SurfaceTexture`][st] that any command in `command_buffers`
     ///   writes to must appear in the `surface_textures` argument.
     ///
+    /// - No [`SurfaceTexture`][st] may appear in the `surface_textures`
+    ///   argument more than once.
+    ///
     /// - Each [`SurfaceTexture`][st] in `surface_textures` must be configured
     ///   for use with the [`Device`][d] associated with this [`Queue`],
     ///   typically by calling [`Surface::configure`].
     ///
+    /// - All calls to this function that include a given [`SurfaceTexture`][st]
+    ///   in `surface_textures` must use the same [`Fence`].
+    ///
     /// [`Fence`]: Api::Fence
     /// [cb]: Api::CommandBuffer
     /// [ce]: Api::CommandEncoder
@@ -819,7 +886,7 @@ pub trait Queue: WasmNotSendSync {
         &self,
         command_buffers: &[&<Self::A as Api>::CommandBuffer],
         surface_textures: &[&<Self::A as Api>::SurfaceTexture],
-        signal_fence: Option<(&mut <Self::A as Api>::Fence, FenceValue)>,
+        signal_fence: (&mut <Self::A as Api>::Fence, FenceValue),
     ) -> Result<(), DeviceError>;
     unsafe fn present(
         &self,
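
The new `Surface::acquire_texture` / `Queue::submit` contract above is easiest to see end to end. The following is an editorial sketch of one frame under the new signatures, not part of the patch itself: `draw_frame` and its parameters are hypothetical names, and error plumbing is simplified.

```rust
use wgpu_hal as hal;
use hal::{Api, Queue as _, Surface as _};

/// Hypothetical single-frame driver for any `hal::Api` backend.
unsafe fn draw_frame<A: Api>(
    surface: &A::Surface,
    queue: &A::Queue,
    fence: &mut A::Fence,
    fence_value: hal::FenceValue,
    cmd_buf: &A::CommandBuffer,
) -> Result<(), hal::SurfaceError> {
    // Acquire the next swapchain texture, passing the same fence that
    // `submit` below will signal. Backends may block here until earlier
    // submissions that drew to this image have finished.
    let ast = surface
        .acquire_texture(None, fence)?
        .expect("timeout is None, so acquisition should not time out");

    // Any submission that draws to the acquired texture must list it in
    // `surface_textures`, and must signal the fence that was passed to
    // `acquire_texture`.
    queue
        .submit(&[cmd_buf], &[&ast.texture], (fence, fence_value))
        .map_err(hal::SurfaceError::Device)?;

    // Hand the texture to the presentation engine. If we decided not to
    // show this frame, we would have to call
    // `surface.discard_texture(ast.texture)` instead of presenting.
    queue.present(surface, ast.texture)
}
```

The key invariant is that the fence passed to `acquire_texture` is the same fence every submission touching that surface signals; that is what lets a backend block inside `acquire_texture` until the image is truly reusable.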
diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs
index ce8e015924..1867d7de44 100644
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@@ -377,38 +377,37 @@ impl crate::Queue for Queue {
         &self,
         command_buffers: &[&CommandBuffer],
         _surface_textures: &[&SurfaceTexture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         objc::rc::autoreleasepool(|| {
-            let extra_command_buffer = match signal_fence {
-                Some((fence, value)) => {
-                    let completed_value = Arc::clone(&fence.completed_value);
-                    let block = block::ConcreteBlock::new(move |_cmd_buf| {
-                        completed_value.store(value, atomic::Ordering::Release);
-                    })
-                    .copy();
-
-                    let raw = match command_buffers.last() {
-                        Some(&cmd_buf) => cmd_buf.raw.to_owned(),
-                        None => {
-                            let queue = self.raw.lock();
-                            queue
-                                .new_command_buffer_with_unretained_references()
-                                .to_owned()
-                        }
-                    };
-                    raw.set_label("(wgpu internal) Signal");
-                    raw.add_completed_handler(&block);
-
-                    fence.maintain();
-                    fence.pending_command_buffers.push((value, raw.to_owned()));
-                    // only return an extra one if it's extra
-                    match command_buffers.last() {
-                        Some(_) => None,
-                        None => Some(raw),
+            let extra_command_buffer = {
+                let completed_value = Arc::clone(&signal_fence.completed_value);
+                let block = block::ConcreteBlock::new(move |_cmd_buf| {
+                    completed_value.store(signal_value, atomic::Ordering::Release);
+                })
+                .copy();
+
+                let raw = match command_buffers.last() {
+                    Some(&cmd_buf) => cmd_buf.raw.to_owned(),
+                    None => {
+                        let queue = self.raw.lock();
+                        queue
+                            .new_command_buffer_with_unretained_references()
+                            .to_owned()
                     }
+                };
+                raw.set_label("(wgpu internal) Signal");
+                raw.add_completed_handler(&block);
+
+                signal_fence.maintain();
+                signal_fence
+                    .pending_command_buffers
+                    .push((signal_value, raw.to_owned()));
+                // only return an extra one if it's extra
+                match command_buffers.last() {
+                    Some(_) => None,
+                    None => Some(raw),
                 }
-                None => None,
             };

             for cmd_buffer in command_buffers {

diff --git a/wgpu-hal/src/metal/surface.rs b/wgpu-hal/src/metal/surface.rs
index e1eb6d5b23..1a11056609 100644
--- a/wgpu-hal/src/metal/surface.rs
+++ b/wgpu-hal/src/metal/surface.rs
@@ -242,6 +242,7 @@ impl crate::Surface for super::Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let render_layer = self.render_layer.lock();
         let (drawable, texture) = match autoreleasepool(|| {

diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index 6df999084f..fe2a6f9707 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -3,11 +3,7 @@ use super::conv;
 use ash::{amd, ext, khr, vk};
 use parking_lot::Mutex;

-use std::{
-    collections::BTreeMap,
-    ffi::CStr,
-    sync::{atomic::AtomicIsize, Arc},
-};
+use std::{collections::BTreeMap, ffi::CStr, sync::Arc};

 fn depth_stencil_required_flags() -> vk::FormatFeatureFlags {
     vk::FormatFeatureFlags::SAMPLED_IMAGE | vk::FormatFeatureFlags::DEPTH_STENCIL_ATTACHMENT
@@ -1783,21 +1779,15 @@ impl super::Adapter {
             render_passes: Mutex::new(Default::default()),
             framebuffers: Mutex::new(Default::default()),
         });
-        let mut relay_semaphores = [vk::Semaphore::null(); 2];
-        for sem in relay_semaphores.iter_mut() {
-            unsafe {
-                *sem = shared
-                    .raw
-                    .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)?
-            };
-        }
+
+        let relay_semaphores = super::RelaySemaphores::new(&shared)?;
+
         let queue = super::Queue {
             raw: raw_queue,
             swapchain_fn,
             device: Arc::clone(&shared),
             family_index,
-            relay_semaphores,
-            relay_index: AtomicIsize::new(-1),
+            relay_semaphores: Mutex::new(relay_semaphores),
         };

         let mem_allocator = {
diff --git a/wgpu-hal/src/vulkan/device.rs b/wgpu-hal/src/vulkan/device.rs
index 1ea627897f..867b7efb23 100644
--- a/wgpu-hal/src/vulkan/device.rs
+++ b/wgpu-hal/src/vulkan/device.rs
@@ -612,17 +612,16 @@ impl super::Device {
         let images =
             unsafe { functor.get_swapchain_images(raw) }.map_err(crate::DeviceError::from)?;

-        // NOTE: It's important that we define at least images.len() + 1 wait
+        // NOTE: It's important that we define at least images.len() wait
         // semaphores, since we prospectively need to provide the call to
         // acquire the next image with an unsignaled semaphore.
-        let surface_semaphores = (0..images.len() + 1)
-            .map(|_| unsafe {
-                self.shared
-                    .raw
-                    .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)
+        let surface_semaphores = (0..=images.len())
+            .map(|_| {
+                super::SwapchainImageSemaphores::new(&self.shared)
+                    .map(Mutex::new)
+                    .map(Arc::new)
             })
-            .collect::<Result<Vec<_>, _>>()
-            .map_err(crate::DeviceError::from)?;
+            .collect::<Result<Vec<_>, _>>()?;

         Ok(super::Swapchain {
             raw,
@@ -633,7 +632,7 @@ impl super::Device {
             config: config.clone(),
             view_formats: wgt_view_formats,
             surface_semaphores,
-            next_surface_index: 0,
+            next_semaphore_index: 0,
         })
     }

@@ -836,9 +835,12 @@ impl crate::Device for super::Device {
     unsafe fn exit(self, queue: super::Queue) {
         unsafe { self.mem_allocator.into_inner().cleanup(&*self.shared) };
         unsafe { self.desc_allocator.into_inner().cleanup(&*self.shared) };
-        for &sem in queue.relay_semaphores.iter() {
-            unsafe { self.shared.raw.destroy_semaphore(sem, None) };
-        }
+        unsafe {
+            queue
+                .relay_semaphores
+                .into_inner()
+                .destroy(&self.shared.raw)
+        };
         unsafe { self.shared.free_resources() };
     }

@@ -2055,54 +2057,7 @@ impl crate::Device for super::Device {
         timeout_ms: u32,
     ) -> Result<bool, crate::DeviceError> {
         let timeout_ns = timeout_ms as u64 * super::MILLIS_TO_NANOS;
-        match *fence {
-            super::Fence::TimelineSemaphore(raw) => {
-                let semaphores = [raw];
-                let values = [wait_value];
-                let vk_info = vk::SemaphoreWaitInfo::default()
-                    .semaphores(&semaphores)
-                    .values(&values);
-                let result = match self.shared.extension_fns.timeline_semaphore {
-                    Some(super::ExtensionFn::Extension(ref ext)) => unsafe {
-                        ext.wait_semaphores(&vk_info, timeout_ns)
-                    },
-                    Some(super::ExtensionFn::Promoted) => unsafe {
-                        self.shared.raw.wait_semaphores(&vk_info, timeout_ns)
-                    },
-                    None => unreachable!(),
-                };
-                match result {
-                    Ok(()) => Ok(true),
-                    Err(vk::Result::TIMEOUT) => Ok(false),
-                    Err(other) => Err(other.into()),
-                }
-            }
-            super::Fence::FencePool {
-                last_completed,
-                ref active,
-                free: _,
-            } => {
-                if wait_value <= last_completed {
-                    Ok(true)
-                } else {
-                    match active.iter().find(|&&(value, _)| value >= wait_value) {
-                        Some(&(_, raw)) => {
-                            match unsafe {
-                                self.shared.raw.wait_for_fences(&[raw], true, timeout_ns)
-                            } {
-                                Ok(()) => Ok(true),
-                                Err(vk::Result::TIMEOUT) => Ok(false),
-                                Err(other) => Err(other.into()),
-                            }
-                        }
-                        None => {
-                            log::error!("No signals reached value {}", wait_value);
-                            Err(crate::DeviceError::Lost)
-                        }
-                    }
-                }
-            }
-        }
+        self.shared.wait_for_fence(fence, wait_value, timeout_ns)
     }

     unsafe fn start_capture(&self) -> bool {
@@ -2364,6 +2319,71 @@ impl crate::Device for super::Device {
     }
 }

+impl super::DeviceShared {
+    pub(super) fn new_binary_semaphore(&self) -> Result<vk::Semaphore, crate::DeviceError> {
+        unsafe {
+            self.raw
+                .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)
+                .map_err(crate::DeviceError::from)
+        }
+    }
+
+    pub(super) fn wait_for_fence(
+        &self,
+        fence: &super::Fence,
+        wait_value: crate::FenceValue,
+        timeout_ns: u64,
+    ) -> Result<bool, crate::DeviceError> {
+        profiling::scope!("Device::wait");
+        match *fence {
+            super::Fence::TimelineSemaphore(raw) => {
+                let semaphores = [raw];
+                let values = [wait_value];
+                let vk_info = vk::SemaphoreWaitInfo::default()
+                    .semaphores(&semaphores)
+                    .values(&values);
+                let result = match self.extension_fns.timeline_semaphore {
+                    Some(super::ExtensionFn::Extension(ref ext)) => unsafe {
+                        ext.wait_semaphores(&vk_info, timeout_ns)
+                    },
+                    Some(super::ExtensionFn::Promoted) => unsafe {
+                        self.raw.wait_semaphores(&vk_info, timeout_ns)
+                    },
+                    None => unreachable!(),
+                };
+                match result {
+                    Ok(()) => Ok(true),
+                    Err(vk::Result::TIMEOUT) => Ok(false),
+                    Err(other) => Err(other.into()),
+                }
+            }
+            super::Fence::FencePool {
+                last_completed,
+                ref active,
+                free: _,
+            } => {
+                if wait_value <= last_completed {
+                    Ok(true)
+                } else {
+                    match active.iter().find(|&&(value, _)| value >= wait_value) {
+                        Some(&(_, raw)) => {
+                            match unsafe { self.raw.wait_for_fences(&[raw], true, timeout_ns) } {
+                                Ok(()) => Ok(true),
+                                Err(vk::Result::TIMEOUT) => Ok(false),
+                                Err(other) => Err(other.into()),
+                            }
+                        }
+                        None => {
+                            log::error!("No signals reached value {}", wait_value);
+                            Err(crate::DeviceError::Lost)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 impl From<gpu_alloc::AllocationError> for crate::DeviceError {
     fn from(error: gpu_alloc::AllocationError) -> Self {
         use gpu_alloc::AllocationError as Ae;
diff --git a/wgpu-hal/src/vulkan/instance.rs b/wgpu-hal/src/vulkan/instance.rs
index 6f471f8905..18acaeabb9 100644
--- a/wgpu-hal/src/vulkan/instance.rs
+++ b/wgpu-hal/src/vulkan/instance.rs
@@ -164,10 +164,14 @@ impl super::Swapchain {
             let _ = unsafe { device.device_wait_idle() };
         };

+        // We cannot take this by value, as the function returns `self`.
         for semaphore in self.surface_semaphores.drain(..) {
-            unsafe {
-                device.destroy_semaphore(semaphore, None);
-            }
+            let arc_removed = Arc::into_inner(semaphore).expect(
+                "Trying to destroy a SurfaceSemaphores that is still in use by a SurfaceTexture",
+            );
+            let mutex_removed = arc_removed.into_inner();
+
+            unsafe { mutex_removed.destroy(device) };
         }

         self
@@ -966,9 +970,10 @@ impl crate::Surface for super::Surface {
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let mut swapchain = self.swapchain.write();
-        let sc = swapchain.as_mut().unwrap();
+        let swapchain = swapchain.as_mut().unwrap();

         let mut timeout_ns = match timeout {
             Some(duration) => duration.as_nanos() as u64,
@@ -988,12 +993,40 @@ impl crate::Surface for super::Surface {
             timeout_ns = u64::MAX;
         }

-        let wait_semaphore = sc.surface_semaphores[sc.next_surface_index];
+        let swapchain_semaphores_arc = swapchain.get_surface_semaphores();
+        // Nothing should be using this, so we don't block, but panic if we fail to lock.
+        let locked_swapchain_semaphores = swapchain_semaphores_arc
+            .try_lock()
+            .expect("Failed to lock a SwapchainSemaphores.");
+
+        // Wait for all commands writing to the previously acquired image to
+        // complete.
+        //
+        // Almost all the steps in the usual acquire-draw-present flow are
+        // asynchronous: they get something started on the presentation engine
+        // or the GPU, but on the CPU, control returns immediately. Without some
+        // sort of intervention, the CPU could crank out frames much faster than
+        // the presentation engine can display them.
+        //
+        // This is the intervention: if any submissions drew on this image, and
+        // thus waited for `locked_swapchain_semaphores.acquire`, wait for all
+        // of them to finish, thus ensuring that it's okay to pass `acquire` to
+        // `vkAcquireNextImageKHR` again.
+        swapchain.device.wait_for_fence(
+            fence,
+            locked_swapchain_semaphores.previously_used_submission_index,
+            timeout_ns,
+        )?;

         // will block if no image is available
         let (index, suboptimal) = match unsafe {
-            sc.functor
-                .acquire_next_image(sc.raw, timeout_ns, wait_semaphore, vk::Fence::null())
+            profiling::scope!("vkAcquireNextImageKHR");
+            swapchain.functor.acquire_next_image(
+                swapchain.raw,
+                timeout_ns,
+                locked_swapchain_semaphores.acquire,
+                vk::Fence::null(),
+            )
         } {
             // We treat `VK_SUBOPTIMAL_KHR` as `VK_SUCCESS` on Android.
             // See the comment in `Queue::present`.
@@ -1013,16 +1046,18 @@ impl crate::Surface for super::Surface {
             }
         };

-        sc.next_surface_index += 1;
-        sc.next_surface_index %= sc.surface_semaphores.len();
+        drop(locked_swapchain_semaphores);
+        // We only advance the surface semaphores if we successfully acquired an image, otherwise
+        // we should try to re-acquire using the same semaphores.
+        swapchain.advance_surface_semaphores();

         // special case for Intel Vulkan returning bizarre values (ugh)
-        if sc.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 {
+        if swapchain.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 {
             return Err(crate::SurfaceError::Outdated);
         }

         // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-framebuffer-03209
-        let raw_flags = if sc
+        let raw_flags = if swapchain
             .raw_flags
             .contains(vk::SwapchainCreateFlagsKHR::MUTABLE_FORMAT)
         {
@@ -1034,20 +1069,20 @@ impl crate::Surface for super::Surface {
         let texture = super::SurfaceTexture {
             index,
             texture: super::Texture {
-                raw: sc.images[index as usize],
+                raw: swapchain.images[index as usize],
                 drop_guard: None,
                 block: None,
-                usage: sc.config.usage,
-                format: sc.config.format,
+                usage: swapchain.config.usage,
+                format: swapchain.config.format,
                 raw_flags,
                 copy_size: crate::CopyExtent {
-                    width: sc.config.extent.width,
-                    height: sc.config.extent.height,
+                    width: swapchain.config.extent.width,
+                    height: swapchain.config.extent.height,
                     depth: 1,
                 },
-                view_formats: sc.view_formats.clone(),
+                view_formats: swapchain.view_formats.clone(),
             },
-            wait_semaphore,
+            surface_semaphores: swapchain_semaphores_arc,
         };
         Ok(Some(crate::AcquiredSurfaceTexture {
             texture,
diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs
index 1716ee9206..40e7a2cb42 100644
--- a/wgpu-hal/src/vulkan/mod.rs
+++ b/wgpu-hal/src/vulkan/mod.rs
@@ -33,13 +33,11 @@ mod instance;

 use std::{
     borrow::Borrow,
+    collections::HashSet,
     ffi::{CStr, CString},
-    fmt,
+    fmt, mem,
     num::NonZeroU32,
-    sync::{
-        atomic::{AtomicIsize, Ordering},
-        Arc,
-    },
+    sync::Arc,
 };

 use arrayvec::ArrayVec;
@@ -147,6 +145,173 @@ pub struct Instance {
     shared: Arc<InstanceShared>,
 }

+/// The semaphores needed to use one image in a swapchain.
+#[derive(Debug)]
+struct SwapchainImageSemaphores {
+    /// A semaphore that is signaled when this image is safe for us to modify.
+    ///
+    /// When [`vkAcquireNextImageKHR`] returns the index of the next swapchain
+    /// image that we should use, that image may actually still be in use by the
+    /// presentation engine, and is not yet safe to modify. However, that
+    /// function does accept a semaphore that it will signal when the image is
+    /// indeed safe to begin messing with.
+    ///
+    /// This semaphore is:
+    ///
+    /// - waited for by the first queue submission to operate on this image
+    ///   since it was acquired, and
+    ///
+    /// - signaled by [`vkAcquireNextImageKHR`] when the acquired image is ready
+    ///   for us to use.
+    ///
+    /// [`vkAcquireNextImageKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkAcquireNextImageKHR
+    acquire: vk::Semaphore,
+
+    /// True if the next command submission operating on this image should wait
+    /// for [`acquire`].
+    ///
+    /// We must wait for `acquire` before drawing to this swapchain image, but
+    /// because `wgpu-hal` queue submissions are always strongly ordered, only
+    /// the first submission that works with a swapchain image actually needs to
+    /// wait. We set this flag when this image is acquired, and clear it the
+    /// first time it's passed to [`Queue::submit`] as a surface texture.
+    ///
+    /// [`acquire`]: SwapchainImageSemaphores::acquire
+    /// [`Queue::submit`]: crate::Queue::submit
+    should_wait_for_acquire: bool,
+
+    /// A pool of semaphores for ordering presentation after drawing.
+    ///
+    /// The first [`present_index`] semaphores in this vector are:
+    ///
+    /// - all waited on by the call to [`vkQueuePresentKHR`] that presents this
+    ///   image, and
+    ///
+    /// - each signaled by some [`vkQueueSubmit`] queue submission that draws to
+    ///   this image, when the submission finishes execution.
+    ///
+    /// This vector accumulates one semaphore per submission that writes to this
+    /// image. This is awkward, but hard to avoid: [`vkQueuePresentKHR`]
+    /// requires a semaphore to order it with respect to drawing commands, and
+    /// we can't attach new completion semaphores to a command submission after
+    /// it's been submitted. This means that, at submission time, we must create
+    /// the semaphore we might need if the caller's next action is to enqueue a
+    /// presentation of this image.
+    ///
+    /// An alternative strategy would be for presentation to enqueue an empty
+    /// submit, ordered relative to other submits in the usual way, and
+    /// signaling a single presentation semaphore. But we suspect that submits
+    /// are usually expensive enough, and semaphores usually cheap enough, that
+    /// performance-sensitive users will avoid making many submits, so that the
+    /// cost of accumulated semaphores will usually be less than the cost of an
+    /// additional submit.
+    ///
+    /// Only the first [`present_index`] semaphores in the vector are actually
+    /// going to be signalled by submitted commands, and need to be waited for
+    /// by the next present call. Any semaphores beyond that index were created
+    /// for prior presents and are simply being retained for recycling.
+    ///
+    /// [`present_index`]: SwapchainImageSemaphores::present_index
+    /// [`vkQueuePresentKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueuePresentKHR
+    /// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+    present: Vec<vk::Semaphore>,
+
+    /// The number of semaphores in [`present`] to be signalled for this submission.
+    ///
+    /// [`present`]: SwapchainImageSemaphores::present
+    present_index: usize,
+
+    /// The fence value of the last command submission that wrote to this image.
+    ///
+    /// The next time we try to acquire this image, we'll block until
+    /// this submission finishes, proving that [`acquire`] is ready to
+    /// pass to `vkAcquireNextImageKHR` again.
+    ///
+    /// [`acquire`]: SwapchainImageSemaphores::acquire
+    previously_used_submission_index: crate::FenceValue,
+}
+
+impl SwapchainImageSemaphores {
+    fn new(device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        Ok(Self {
+            acquire: device.new_binary_semaphore()?,
+            should_wait_for_acquire: true,
+            present: Vec::new(),
+            present_index: 0,
+            previously_used_submission_index: 0,
+        })
+    }
+
+    fn set_used_fence_value(&mut self, value: crate::FenceValue) {
+        self.previously_used_submission_index = value;
+    }
+
+    /// Return the semaphore that commands drawing to this image should wait for, if any.
+    ///
+    /// This only returns `Some` once per acquisition; see
+    /// [`SwapchainImageSemaphores::should_wait_for_acquire`] for details.
+    fn get_acquire_wait_semaphore(&mut self) -> Option<vk::Semaphore> {
+        if self.should_wait_for_acquire {
+            self.should_wait_for_acquire = false;
+            Some(self.acquire)
+        } else {
+            None
+        }
+    }
+
+    /// Return a semaphore that a submission that writes to this image should
+    /// signal when it's done.
+    ///
+    /// See [`SwapchainImageSemaphores::present`] for details.
+    fn get_submit_signal_semaphore(
+        &mut self,
+        device: &DeviceShared,
+    ) -> Result<vk::Semaphore, crate::DeviceError> {
+        // Try to recycle a semaphore we created for a previous presentation.
+        let sem = match self.present.get(self.present_index) {
+            Some(sem) => *sem,
+            None => {
+                let sem = device.new_binary_semaphore()?;
+                self.present.push(sem);
+                sem
+            }
+        };
+
+        self.present_index += 1;
+
+        Ok(sem)
+    }
+
+    /// Return the semaphores that a presentation of this image should wait on.
+    ///
+    /// Return a slice of semaphores that the call to [`vkQueueSubmit`] that
+    /// ends this image's acquisition should wait for. See
+    /// [`SwapchainImageSemaphores::present`] for details.
+    ///
+    /// Reset `self` to be ready for the next acquisition cycle.
+    ///
+    /// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+    fn get_present_wait_semaphores(&mut self) -> &[vk::Semaphore] {
+        let old_index = self.present_index;
+
+        // Since this marks the end of this acquire/draw/present cycle, take the
+        // opportunity to reset `self` in preparation for the next acquisition.
+        self.present_index = 0;
+        self.should_wait_for_acquire = true;
+
+        &self.present[0..old_index]
+    }
+
+    unsafe fn destroy(&self, device: &ash::Device) {
+        unsafe {
+            device.destroy_semaphore(self.acquire, None);
+            for sem in &self.present {
+                device.destroy_semaphore(*sem, None);
+            }
+        }
+    }
+}
+
 struct Swapchain {
     raw: vk::SwapchainKHR,
     raw_flags: vk::SwapchainCreateFlagsKHR,
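
To make the life cycle above concrete, here is an editorial sketch, not part of the patch, of the order in which one image's semaphore bundle is exercised during a frame. `sem`, `device`, and `submission_index` are stand-in names, and this would only compile inside wgpu-hal's `vulkan` module, since the types involved are private.

```rust
// Walkthrough of a freshly created (or just-reset) SwapchainImageSemaphores
// across one acquire/draw/present cycle.
fn semaphore_lifecycle_walkthrough(
    sem: &mut SwapchainImageSemaphores,
    device: &DeviceShared,
    submission_index: crate::FenceValue,
) -> Result<(), crate::DeviceError> {
    // 1. acquire_texture passes `sem.acquire` to vkAcquireNextImageKHR, which
    //    signals it once the presentation engine is done with the image.

    // 2. The first submission that draws to the image waits on `acquire` ...
    assert!(sem.get_acquire_wait_semaphore().is_some());
    //    ... records the fence value it will signal ...
    sem.set_used_fence_value(submission_index);
    //    ... and allocates (or recycles) a semaphore for presentation to wait on.
    let _signal_a = sem.get_submit_signal_semaphore(device)?;

    // 3. A second submission touching the same image does not wait on
    //    `acquire` again; queue submissions are already strongly ordered.
    assert!(sem.get_acquire_wait_semaphore().is_none());
    let _signal_b = sem.get_submit_signal_semaphore(device)?;

    // 4. Presentation waits on both signal semaphores, and this call resets
    //    the bundle for the next acquire/draw/present cycle.
    assert_eq!(sem.get_present_wait_semaphores().len(), 2);
    Ok(())
}
```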
@@ -157,9 +322,25 @@ struct Swapchain {
     view_formats: Vec<wgt::TextureFormat>,
     /// One wait semaphore per swapchain image. This will be associated with the
     /// surface texture, and later collected during submission.
-    surface_semaphores: Vec<vk::Semaphore>,
-    /// Current semaphore index to use when acquiring a surface.
-    next_surface_index: usize,
+    ///
+    /// We need this to be `Arc<Mutex<..>>` because we need to be able to pass this
+    /// data into the surface texture, so submit/present can use it.
+    surface_semaphores: Vec<Arc<Mutex<SwapchainImageSemaphores>>>,
+    /// The index of the next semaphore to use. Ideally we would use the same
+    /// index as the image index, but we need to specify the semaphore as an argument
+    /// to the acquire_next_image function which is what tells us which image to use.
+    next_semaphore_index: usize,
+}
+
+impl Swapchain {
+    fn advance_surface_semaphores(&mut self) {
+        let semaphore_count = self.surface_semaphores.len();
+        self.next_semaphore_index = (self.next_semaphore_index + 1) % semaphore_count;
+    }
+
+    fn get_surface_semaphores(&self) -> Arc<Mutex<SwapchainImageSemaphores>> {
+        self.surface_semaphores[self.next_semaphore_index].clone()
+    }
 }

 pub struct Surface {
@@ -173,7 +354,7 @@ pub struct Surface {
 pub struct SurfaceTexture {
     index: u32,
     texture: Texture,
-    wait_semaphore: vk::Semaphore,
+    surface_semaphores: Arc<Mutex<SwapchainImageSemaphores>>,
 }

 impl Borrow<Texture> for SurfaceTexture {
@@ -359,18 +540,87 @@ pub struct Device {
     render_doc: crate::auxil::renderdoc::RenderDoc,
 }

+/// Semaphores for forcing queue submissions to run in order.
+///
+/// The [`wgpu_hal::Queue`] trait promises that if two calls to [`submit`] are
+/// ordered, then the first submission will finish on the GPU before the second
+/// submission begins. To get this behavior on Vulkan we need to pass semaphores
+/// to [`vkQueueSubmit`] for the commands to wait on before beginning execution,
+/// and to signal when their execution is done.
+///
+/// Normally this can be done with a single semaphore, waited on and then
+/// signalled for each submission. At any given time there's exactly one
+/// submission that would signal the semaphore, and exactly one waiting on it,
+/// as Vulkan requires.
+///
+/// However, as of Oct 2021, bug [#5508] in the Mesa ANV drivers caused them to
+/// hang if we use a single semaphore. The workaround is to alternate between
+/// two semaphores. The bug has been fixed in Mesa, but we should probably keep
+/// the workaround until, say, Oct 2026.
+///
+/// [`wgpu_hal::Queue`]: crate::Queue
+/// [`submit`]: crate::Queue::submit
+/// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+/// [#5508]: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508
+#[derive(Clone)]
+struct RelaySemaphores {
+    /// The semaphore the next submission should wait on before beginning
+    /// execution on the GPU. This is `None` for the first submission, which
+    /// should not wait on anything at all.
+    wait: Option<vk::Semaphore>,
+
+    /// The semaphore the next submission should signal when it has finished
+    /// execution on the GPU.
+    signal: vk::Semaphore,
+}
+
+impl RelaySemaphores {
+    fn new(device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        Ok(Self {
+            wait: None,
+            signal: device.new_binary_semaphore()?,
+        })
+    }
+
+    /// Advances the semaphores, returning the semaphores that should be used for a submission.
+    fn advance(&mut self, device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        let old = self.clone();
+
+        // Build the state for the next submission.
+        match self.wait {
+            None => {
+                // The `old` values describe the first submission to this queue.
+                // The second submission should wait on `old.signal`, and then
+                // signal a new semaphore which we'll create now.
+                self.wait = Some(old.signal);
+                self.signal = device.new_binary_semaphore()?;
+            }
+            Some(ref mut wait) => {
+                // What this submission signals, the next should wait.
+                mem::swap(wait, &mut self.signal);
+            }
+        };
+
+        Ok(old)
+    }
+
+    /// Destroys the semaphores.
+    unsafe fn destroy(&self, device: &ash::Device) {
+        unsafe {
+            if let Some(wait) = self.wait {
+                device.destroy_semaphore(wait, None);
+            }
+            device.destroy_semaphore(self.signal, None);
+        }
+    }
+}
+
 pub struct Queue {
     raw: vk::Queue,
     swapchain_fn: khr::swapchain::Device,
     device: Arc<DeviceShared>,
     family_index: u32,
-    /// We use a redundant chain of semaphores to pass on the signal
-    /// from submissions to the last present, since it's required by the
-    /// specification.
-    /// It would be correct to use a single semaphore there, but
-    /// [Intel hangs in `anv_queue_finish`](https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508).
-    relay_semaphores: [vk::Semaphore; 2],
-    relay_index: AtomicIsize,
+    relay_semaphores: Mutex<RelaySemaphores>,
 }

 #[derive(Debug)]
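
As an editorial illustration (again, not patch code), three successive submissions chain through `advance` like this; `relay` and `device` are stand-ins, and the snippet would only compile inside the `vulkan` module:

```rust
// Trace of RelaySemaphores::advance across three submissions, starting from
// a freshly created RelaySemaphores.
fn relay_walkthrough(
    relay: &mut RelaySemaphores,
    device: &DeviceShared,
) -> Result<(), crate::DeviceError> {
    let s1 = relay.advance(device)?; // first submit: waits on nothing, signals A
    let s2 = relay.advance(device)?; // second submit: waits on A, signals B
    let s3 = relay.advance(device)?; // third submit: waits on B, signals A again

    assert_eq!(s1.wait, None);
    assert_eq!(s2.wait, Some(s1.signal)); // what 1 signals, 2 waits on
    assert_eq!(s3.wait, Some(s2.signal)); // what 2 signals, 3 waits on
    assert_eq!(s3.signal, s1.signal); // only two semaphores ever alternate

    // Each binary semaphore thus always has exactly one pending signal and
    // one pending wait, which is what Vulkan requires.
    Ok(())
}
```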
+ }, + }; + active.push((signal_value, fence_raw)); } } @@ -771,7 +1052,7 @@ impl crate::Queue for Queue { let mut vk_timeline_info; - if !signal_values.is_empty() { + if self.device.private_caps.timeline_semaphores { vk_timeline_info = vk::TimelineSemaphoreSubmitInfo::default().signal_semaphore_values(&signal_values); vk_info = vk_info.push_next(&mut vk_timeline_info); @@ -793,19 +1074,14 @@ impl crate::Queue for Queue { ) -> Result<(), crate::SurfaceError> { let mut swapchain = surface.swapchain.write(); let ssc = swapchain.as_mut().unwrap(); + let mut swapchain_semaphores = texture.surface_semaphores.lock(); let swapchains = [ssc.raw]; let image_indices = [texture.index]; - let mut vk_info = vk::PresentInfoKHR::default() + let vk_info = vk::PresentInfoKHR::default() .swapchains(&swapchains) - .image_indices(&image_indices); - - let old_index = self.relay_index.swap(-1, Ordering::Relaxed); - if old_index >= 0 { - vk_info = vk_info.wait_semaphores( - &self.relay_semaphores[old_index as usize..old_index as usize + 1], - ); - } + .image_indices(&image_indices) + .wait_semaphores(swapchain_semaphores.get_present_wait_semaphores()); let suboptimal = { profiling::scope!("vkQueuePresentKHR"); From e7a528b62b20f0036721237715a8f7f74c11f401 Mon Sep 17 00:00:00 2001 From: Kevin Reid Date: Fri, 31 May 2024 18:25:42 -0700 Subject: [PATCH 9/9] Document WebGPU spec rule that an `Adapter` should be used only once. (#5764) --- wgpu/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs index e94ae27fe8..618946b1a1 100644 --- a/wgpu/src/lib.rs +++ b/wgpu/src/lib.rs @@ -2558,6 +2558,11 @@ impl Adapter { /// /// Returns the [`Device`] together with a [`Queue`] that executes command buffers. /// + /// [Per the WebGPU specification], an [`Adapter`] may only be used once to create a device. + /// If another device is wanted, call [`Instance::request_adapter()`] again to get a fresh + /// [`Adapter`]. + /// However, `wgpu` does not currently enforce this restriction. + /// /// # Arguments /// /// - `desc` - Description of the features and limits requested from the given device. @@ -2566,10 +2571,13 @@ impl Adapter { /// /// # Panics /// + /// - `request_device()` was already called on this `Adapter`. /// - Features specified by `desc` are not supported by this adapter. /// - Unsafe features were requested but not enabled when requesting the adapter. /// - Limits requested exceed the values provided by the adapter. /// - Adapter does not support all features wgpu requires to safely operate. + /// + /// [Per the WebGPU specification]: https://www.w3.org/TR/webgpu/#dom-gpuadapter-requestdevice pub fn request_device( &self, desc: &DeviceDescriptor<'_>,