From de809c8f96ba18084873355a917fbdfc5426ea90 Mon Sep 17 00:00:00 2001 From: Vladas Zakrevskis <146100@gmail.com> Date: Wed, 29 May 2024 19:00:32 +0100 Subject: [PATCH 1/9] Fix missing family check flag (#5754) Co-authored-by: Jim Blandy Co-authored-by: Xiaopeng Li Co-authored-by: Connor Fitzgerald Co-authored-by: Samson <16504129+sagudev@users.noreply.github.com> Co-authored-by: Valaphee The Meerkat <32491319+valaphee@users.noreply.github.com> Co-authored-by: Andreas Reich --- CHANGELOG.md | 4 ++++ wgpu-hal/src/metal/adapter.rs | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a13590d0b..ff3eb46b99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -96,6 +96,10 @@ By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410) - Ensure render pipelines have at least 1 target. By @ErichDonGubler in [#5715](https://github.com/gfx-rs/wgpu/pull/5715) +#### Metal + +- Fix unrecognized selector crash on iOS 12. By @vladasz in [#5744](https://github.com/gfx-rs/wgpu/pull/5744). + #### Vulkan - Fix enablement of subgroup ops extension on Vulkan devices that don't support Vulkan 1.3. By @cwfitzgerald in [#5624](https://github.com/gfx-rs/wgpu/pull/5624). diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs index 2f84be8859..0ffe37f5e7 100644 --- a/wgpu-hal/src/metal/adapter.rs +++ b/wgpu-hal/src/metal/adapter.rs @@ -736,7 +736,9 @@ impl super::PrivateCapabilities { 4 }, // Per https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf - max_color_attachment_bytes_per_sample: if device.supports_family(MTLGPUFamily::Apple4) { + max_color_attachment_bytes_per_sample: if family_check + && device.supports_family(MTLGPUFamily::Apple4) + { 64 } else { 32 From 23307e1dc355df3686547c48e9d1523105faa735 Mon Sep 17 00:00:00 2001 From: Valaphee The Meerkat <32491319+valaphee@users.noreply.github.com> Date: Wed, 29 May 2024 20:01:32 +0200 Subject: [PATCH 2/9] gles: Return the version as driver_info (#5753) --- CHANGELOG.md | 9 +++++---- wgpu-hal/src/gles/adapter.rs | 24 ++---------------------- 2 files changed, 7 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff3eb46b99..9ed0ded2e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -92,7 +92,7 @@ By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410) ### Bug Fixes -### General +#### General - Ensure render pipelines have at least 1 target. 
By @ErichDonGubler in [#5715](https://github.com/gfx-rs/wgpu/pull/5715) @@ -106,9 +106,10 @@ By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410) #### GLES / OpenGL -- Fix regression on OpenGL (EGL) where non-sRGB still used sRGB [#5642](https://github.com/gfx-rs/wgpu/pull/5642) -- Fix `ClearColorF`, `ClearColorU` and `ClearColorI` commands being issued before `SetDrawColorBuffers` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) -- Replace `glClear` with `glClearBufferF` because `glDrawBuffers` requires that the ith buffer must be `COLOR_ATTACHMENTi` or `NONE` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) +- Fix regression on OpenGL (EGL) where non-sRGB still used sRGB [#5642](https://github.com/gfx-rs/wgpu/pull/5642) +- Fix `ClearColorF`, `ClearColorU` and `ClearColorI` commands being issued before `SetDrawColorBuffers` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) +- Replace `glClear` with `glClearBufferF` because `glDrawBuffers` requires that the ith buffer must be `COLOR_ATTACHMENTi` or `NONE` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) +- Return the unmodified version in driver_info. By @Valaphee in [#5753](https://github.com/gfx-rs/wgpu/pull/5753) ## v0.20.0 (2024-04-28) diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs index 03c026aa23..926b5afbcb 100644 --- a/wgpu-hal/src/gles/adapter.rs +++ b/wgpu-hal/src/gles/adapter.rs @@ -179,33 +179,13 @@ impl super::Adapter { 0 }; - let driver; - let driver_info; - if version.starts_with("WebGL ") || version.starts_with("OpenGL ") { - let es_sig = " ES"; - match version.find(es_sig) { - Some(pos) => { - driver = version[..pos + es_sig.len()].to_owned(); - driver_info = version[pos + es_sig.len() + 1..].to_owned(); - } - None => { - let pos = version.find(' ').unwrap(); - driver = version[..pos].to_owned(); - driver_info = version[pos + 1..].to_owned(); - } - } - } else { - driver = "OpenGL".to_owned(); - driver_info = version; - } - wgt::AdapterInfo { name: renderer_orig, vendor: vendor_id, device: 0, device_type: inferred_device_type, - driver, - driver_info, + driver: "".to_owned(), + driver_info: version, backend: wgt::Backend::Gl, } } From 071fb14e159749241b810ada3ee2e620f15d915e Mon Sep 17 00:00:00 2001 From: Douglas Dwyer Date: Wed, 29 May 2024 15:33:04 -0400 Subject: [PATCH 3/9] Add support for pipeline-overridable constants in web backend (#5688) * Add support for pipeline-overridable constants in WebGPU * Add utility function for setting constants map * Panic on failure to set constants map --------- Co-authored-by: Andreas Reich --- CHANGELOG.md | 4 ++++ wgpu/src/backend/webgpu.rs | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ed0ded2e4..392ccc9b44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -111,6 +111,10 @@ By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410) - Replace `glClear` with `glClearBufferF` because `glDrawBuffers` requires that the ith buffer must be `COLOR_ATTACHMENTi` or `NONE` [#5666](https://github.com/gfx-rs/wgpu/pull/5666) - Return the unmodified version in driver_info. 
By @Valaphee in [#5753](https://github.com/gfx-rs/wgpu/pull/5753)
+#### WebGPU
+
+- Added support for pipeline-overridable constants to the WebGPU backend by @DouglasDwyer in [#5688](https://github.com/gfx-rs/wgpu/pull/5688)
+
 ## v0.20.0 (2024-04-28)
 
 ### Major Changes
 
diff --git a/wgpu/src/backend/webgpu.rs b/wgpu/src/backend/webgpu.rs
index 9d316e76fb..948c707b78 100644
--- a/wgpu/src/backend/webgpu.rs
+++ b/wgpu/src/backend/webgpu.rs
@@ -7,6 +7,7 @@ use js_sys::Promise;
 use std::{
     any::Any,
     cell::RefCell,
+    collections::HashMap,
     fmt,
     future::Future,
     marker::PhantomData,
@@ -1876,6 +1877,10 @@ impl crate::context::Context for ContextWebGpu {
         let module: &<ContextWebGpu as crate::context::Context>::ShaderModuleData =
             downcast_ref(desc.vertex.module.data.as_ref());
         let mut mapped_vertex_state = webgpu_sys::GpuVertexState::new(&module.0.module);
+        insert_constants_map(
+            &mapped_vertex_state,
+            desc.vertex.compilation_options.constants,
+        );
         mapped_vertex_state.entry_point(desc.vertex.entry_point);
 
         let buffers = desc
@@ -1952,6 +1957,7 @@ impl crate::context::Context for ContextWebGpu {
                 downcast_ref(frag.module.data.as_ref());
             let mut mapped_fragment_desc =
                 webgpu_sys::GpuFragmentState::new(&module.0.module, &targets);
+            insert_constants_map(&mapped_fragment_desc, frag.compilation_options.constants);
             mapped_fragment_desc.entry_point(frag.entry_point);
             mapped_desc.fragment(&mapped_fragment_desc);
         }
@@ -1978,6 +1984,7 @@ impl crate::context::Context for ContextWebGpu {
             downcast_ref(desc.module.data.as_ref());
         let mut mapped_compute_stage =
             webgpu_sys::GpuProgrammableStage::new(&shader_module.0.module);
+        insert_constants_map(&mapped_compute_stage, desc.compilation_options.constants);
         mapped_compute_stage.entry_point(desc.entry_point);
         let auto_layout = wasm_bindgen::JsValue::from(webgpu_sys::GpuAutoLayoutMode::Auto);
         let mut mapped_desc = webgpu_sys::GpuComputePipelineDescriptor::new(
@@ -1994,6 +2001,7 @@ impl crate::context::Context for ContextWebGpu {
         if let Some(label) = desc.label {
             mapped_desc.label(label);
         }
+
         create_identified(device_data.0.create_compute_pipeline(&mapped_desc))
     }
 
@@ -3824,3 +3832,29 @@ impl Drop for BufferMappedRange {
         }
     }
 }
+
+/// Adds the constants map to the given pipeline descriptor if the map is nonempty.
+/// Panics if the map cannot be set.
+///
+/// This function is necessary because the constants array is not currently
+/// exposed by `wasm-bindgen`. See the following issues for details:
+/// - [gfx-rs/wgpu#5688](https://github.com/gfx-rs/wgpu/pull/5688)
+/// - [rustwasm/wasm-bindgen#3587](https://github.com/rustwasm/wasm-bindgen/issues/3587)
+fn insert_constants_map(target: &JsValue, map: &HashMap<String, f64>) {
+    if !map.is_empty() {
+        js_sys::Reflect::set(target, &"constants".into(), &hashmap_to_jsvalue(map))
+            .expect("Setting the values in a Javascript pipeline descriptor should never fail");
+    }
+}
+
+/// Converts a hashmap to a Javascript object.
+fn hashmap_to_jsvalue(map: &HashMap<String, f64>) -> JsValue {
+    let obj = js_sys::Object::new();
+
+    for (k, v) in map.iter() {
+        js_sys::Reflect::set(&obj, &k.into(), &(*v).into())
+            .expect("Setting the values in a Javascript map should never fail");
+    }
+
+    JsValue::from(obj)
+}

From 588950110af8aca278516ec15d33ef6b7b66588c Mon Sep 17 00:00:00 2001
From: Andreas Reich
Date: Thu, 30 May 2024 00:43:24 +0200
Subject: [PATCH 4/9] Remove lifetime dependency of `ComputePass` on its
 parent command encoder (#5620)

* lift encoder->computepass lifetime constraint and add now-failing test
* compute passes now take an `Arc` to their parent command encoder, removing the compile-time dependency on it
* command encoder now goes into a locked state while a compute pass is open
* changelog entry
* share most of the code between get_encoder and lock_encoder
---
 CHANGELOG.md                                  |   8 +-
 deno_webgpu/command_encoder.rs                |   5 +-
 ...ownership.rs => compute_pass_ownership.rs} |  49 +++-
 tests/tests/encoder.rs                        | 230 +++++++++++++++++-
 tests/tests/root.rs                           |   2 +-
 wgpu-core/src/command/clear.rs                |  10 +-
 wgpu-core/src/command/compute.rs              |  87 +++++--
 wgpu-core/src/command/mod.rs                  | 106 +++++++-
 wgpu-core/src/command/render.rs               |   2 +-
 wgpu-core/src/registry.rs                     |  12 +-
 wgpu/src/backend/wgpu_core.rs                 |  20 +-
 wgpu/src/lib.rs                               |  47 ++--
 12 files changed, 490 insertions(+), 88 deletions(-)
 rename tests/tests/{compute_pass_resource_ownership.rs => compute_pass_ownership.rs} (77%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 392ccc9b44..23370791cd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -47,7 +47,13 @@ TODO(wumpf): This is still work in progress. Should write a bit more about it. A
 `wgpu::ComputePass` recording methods (e.g. `wgpu::ComputePass::set_pipeline`) no longer impose a lifetime constraint on passed-in resources.
-By @wumpf in [#5569](https://github.com/gfx-rs/wgpu/pull/5569), [#5575](https://github.com/gfx-rs/wgpu/pull/5575).
+Furthermore, `wgpu::ComputePass` no longer has a lifetime dependency on its parent `wgpu::CommandEncoder`.
+⚠️ As long as a `wgpu::ComputePass` is pending for a given `wgpu::CommandEncoder`, creating another compute or render pass on that encoder is an error and invalidates the `wgpu::CommandEncoder`.
+Previously, this was statically enforced by a lifetime constraint.
+TODO(wumpf): There was some discussion on whether to make this lifetime constraint opt-in or opt-out (entirely on the `wgpu` side, with no changes to `wgpu-core`).
+Lifting this lifetime dependency is very useful for library authors, but opens up an easy way for incorrect use.
+
+By @wumpf in [#5569](https://github.com/gfx-rs/wgpu/pull/5569), [#5575](https://github.com/gfx-rs/wgpu/pull/5575), [#5620](https://github.com/gfx-rs/wgpu/pull/5620).
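A minimal sketch of the resulting usage (an illustration based on the description above, not code from this PR; assumes an existing `device` and `pipeline`):

```rust
let mut encoder =
    device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default());
cpass.set_pipeline(&pipeline);
// While `cpass` is alive, the encoder is locked: recording on `encoder` or
// calling `encoder.finish()` is a validation error that invalidates it.
drop(cpass); // ends the pass and unlocks the encoder
let _command_buffer = encoder.finish();
```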
#### Querying shader compilation errors diff --git a/deno_webgpu/command_encoder.rs b/deno_webgpu/command_encoder.rs index b82fba92ea..552b084171 100644 --- a/deno_webgpu/command_encoder.rs +++ b/deno_webgpu/command_encoder.rs @@ -261,15 +261,14 @@ pub fn op_webgpu_command_encoder_begin_compute_pass( timestamp_writes: timestamp_writes.as_ref(), }; - let compute_pass = gfx_select!(command_encoder => instance.command_encoder_create_compute_pass_dyn(*command_encoder, &descriptor)); - + let (compute_pass, error) = gfx_select!(command_encoder => instance.command_encoder_create_compute_pass_dyn(*command_encoder, &descriptor)); let rid = state .resource_table .add(super::compute_pass::WebGpuComputePass(RefCell::new( compute_pass, ))); - Ok(WebGpuResult::rid(rid)) + Ok(WebGpuResult::rid_err(rid, error)) } #[op2] diff --git a/tests/tests/compute_pass_resource_ownership.rs b/tests/tests/compute_pass_ownership.rs similarity index 77% rename from tests/tests/compute_pass_resource_ownership.rs rename to tests/tests/compute_pass_ownership.rs index 4d48c2ad9e..9988accd62 100644 --- a/tests/tests/compute_pass_resource_ownership.rs +++ b/tests/tests/compute_pass_ownership.rs @@ -1,9 +1,6 @@ //! Tests that compute passes take ownership of resources that are associated with. //! I.e. once a resource is passed in to a compute pass, it can be dropped. //! -//! TODO: Test doesn't check on timestamp writes & pipeline statistics queries yet. -//! (Not important as long as they are lifetime constrained to the command encoder, -//! but once we lift this constraint, we should add tests for this as well!) //! TODO: Also should test resource ownership for: //! * write_timestamp //! * begin_pipeline_statistics_query @@ -11,7 +8,7 @@ use std::num::NonZeroU64; use wgpu::util::DeviceExt as _; -use wgpu_test::{gpu_test, GpuTestConfiguration, TestParameters, TestingContext}; +use wgpu_test::{gpu_test, valid, GpuTestConfiguration, TestParameters, TestingContext}; const SHADER_SRC: &str = " @group(0) @binding(0) @@ -75,6 +72,50 @@ async fn compute_pass_resource_ownership(ctx: TestingContext) { assert_eq!(floats, [2.0, 4.0, 6.0, 8.0]); } +#[gpu_test] +static COMPUTE_PASS_KEEP_ENCODER_ALIVE: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters(TestParameters::default().test_features_limits()) + .run_async(compute_pass_keep_encoder_alive); + +async fn compute_pass_keep_encoder_alive(ctx: TestingContext) { + let ResourceSetup { + gpu_buffer: _, + cpu_buffer: _, + buffer_size: _, + indirect_buffer, + bind_group, + pipeline, + } = resource_setup(&ctx); + + let mut encoder = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("encoder"), + }); + + let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("compute_pass"), + timestamp_writes: None, + }); + + // Now drop the encoder - it is kept alive by the compute pass. + drop(encoder); + ctx.async_poll(wgpu::Maintain::wait()) + .await + .panic_on_timeout(); + + // Record some draw commands. + cpass.set_pipeline(&pipeline); + cpass.set_bind_group(0, &bind_group, &[]); + cpass.dispatch_workgroups_indirect(&indirect_buffer, 0); + + // Dropping the pass will still execute the pass, even though there's no way to submit it. + // Ideally, this would log an error, but the encoder is not dropped until the compute pass is dropped, + // making this a valid operation. + // (If instead the encoder was explicitly destroyed or finished, this would be an error.) 
+ valid(&ctx.device, || drop(cpass)); +} + // Setup ------------------------------------------------------------ struct ResourceSetup { diff --git a/tests/tests/encoder.rs b/tests/tests/encoder.rs index 83f575c4c8..efdde7a539 100644 --- a/tests/tests/encoder.rs +++ b/tests/tests/encoder.rs @@ -1,4 +1,8 @@ -use wgpu_test::{fail, gpu_test, FailureCase, GpuTestConfiguration, TestParameters}; +use wgpu::util::DeviceExt; +use wgpu::CommandEncoder; +use wgpu_test::{ + fail, gpu_test, FailureCase, GpuTestConfiguration, TestParameters, TestingContext, +}; #[gpu_test] static DROP_ENCODER: GpuTestConfiguration = GpuTestConfiguration::new().run_sync(|ctx| { @@ -72,3 +76,227 @@ static DROP_ENCODER_AFTER_ERROR: GpuTestConfiguration = GpuTestConfiguration::ne // The encoder is still open! drop(encoder); }); + +// TODO: This should also apply to render passes once the lifetime bound is lifted. +#[gpu_test] +static ENCODER_OPERATIONS_FAIL_WHILE_COMPUTE_PASS_ALIVE: GpuTestConfiguration = + GpuTestConfiguration::new() + .parameters(TestParameters::default().features( + wgpu::Features::CLEAR_TEXTURE + | wgpu::Features::TIMESTAMP_QUERY + | wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS, + )) + .run_sync(encoder_operations_fail_while_compute_pass_alive); + +fn encoder_operations_fail_while_compute_pass_alive(ctx: TestingContext) { + let buffer_source = ctx + .device + .create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: None, + contents: &[0u8; 4], + usage: wgpu::BufferUsages::COPY_SRC, + }); + let buffer_dest = ctx + .device + .create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: None, + contents: &[0u8; 4], + usage: wgpu::BufferUsages::COPY_DST, + }); + + let texture_desc = wgpu::TextureDescriptor { + label: None, + size: wgpu::Extent3d { + width: 1, + height: 1, + depth_or_array_layers: 1, + }, + mip_level_count: 1, + sample_count: 1, + dimension: wgpu::TextureDimension::D2, + format: wgpu::TextureFormat::Rgba8Unorm, + usage: wgpu::TextureUsages::COPY_DST, + view_formats: &[], + }; + let texture_dst = ctx.device.create_texture(&texture_desc); + let texture_src = ctx.device.create_texture(&wgpu::TextureDescriptor { + usage: wgpu::TextureUsages::COPY_SRC, + ..texture_desc + }); + let query_set = ctx.device.create_query_set(&wgpu::QuerySetDescriptor { + count: 1, + ty: wgpu::QueryType::Timestamp, + label: None, + }); + + #[allow(clippy::type_complexity)] + let recording_ops: Vec<(_, Box)> = vec![ + ( + "begin_compute_pass", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default()); + }), + ), + ( + "begin_render_pass", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.begin_render_pass(&wgpu::RenderPassDescriptor::default()); + }), + ), + ( + "copy_buffer_to_buffer", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.copy_buffer_to_buffer(&buffer_source, 0, &buffer_dest, 0, 4); + }), + ), + ( + "copy_buffer_to_texture", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.copy_buffer_to_texture( + wgpu::ImageCopyBuffer { + buffer: &buffer_source, + layout: wgpu::ImageDataLayout { + offset: 0, + bytes_per_row: Some(4), + rows_per_image: None, + }, + }, + texture_dst.as_image_copy(), + texture_dst.size(), + ); + }), + ), + ( + "copy_texture_to_buffer", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.copy_texture_to_buffer( + wgpu::ImageCopyTexture { + texture: &texture_src, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::ImageCopyBuffer { + 
buffer: &buffer_dest, + layout: wgpu::ImageDataLayout { + offset: 0, + bytes_per_row: Some(4), + rows_per_image: None, + }, + }, + texture_dst.size(), + ); + }), + ), + ( + "copy_texture_to_texture", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.copy_texture_to_texture( + wgpu::ImageCopyTexture { + texture: &texture_src, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::ImageCopyTexture { + texture: &texture_dst, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + texture_dst.size(), + ); + }), + ), + ( + "clear_texture", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.clear_texture(&texture_dst, &wgpu::ImageSubresourceRange::default()); + }), + ), + ( + "clear_buffer", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.clear_buffer(&buffer_dest, 0, None); + }), + ), + ( + "insert_debug_marker", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.insert_debug_marker("marker"); + }), + ), + ( + "push_debug_group", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.push_debug_group("marker"); + }), + ), + ( + "pop_debug_group", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.pop_debug_group(); + }), + ), + ( + "resolve_query_set", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.resolve_query_set(&query_set, 0..1, &buffer_dest, 0); + }), + ), + ( + "write_timestamp", + Box::new(|encoder: &mut wgpu::CommandEncoder| { + encoder.write_timestamp(&query_set, 0); + }), + ), + ]; + + for (op_name, op) in recording_ops.iter() { + let mut encoder = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor::default()); + + let pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default()); + + ctx.device.push_error_scope(wgpu::ErrorFilter::Validation); + + log::info!("Testing operation {} on a locked command encoder", op_name); + fail( + &ctx.device, + || op(&mut encoder), + Some("Command encoder is locked"), + ); + + // Drop the pass - this also fails now since the encoder is invalid: + fail( + &ctx.device, + || drop(pass), + Some("Command encoder is invalid"), + ); + // Also, it's not possible to create a new pass on the encoder: + fail( + &ctx.device, + || encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default()), + Some("Command encoder is invalid"), + ); + } + + // Test encoder finishing separately since it consumes the encoder and doesn't fit above pattern. 
+ { + let mut encoder = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor::default()); + let pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default()); + fail( + &ctx.device, + || encoder.finish(), + Some("Command encoder is locked"), + ); + fail( + &ctx.device, + || drop(pass), + Some("Command encoder is invalid"), + ); + } +} diff --git a/tests/tests/root.rs b/tests/tests/root.rs index 29f894ede9..1cb5b56c7c 100644 --- a/tests/tests/root.rs +++ b/tests/tests/root.rs @@ -11,7 +11,7 @@ mod buffer; mod buffer_copy; mod buffer_usages; mod clear_texture; -mod compute_pass_resource_ownership; +mod compute_pass_ownership; mod create_surface_error; mod device; mod encoder; diff --git a/wgpu-core/src/command/clear.rs b/wgpu-core/src/command/clear.rs index faff177928..9ef0f24d47 100644 --- a/wgpu-core/src/command/clear.rs +++ b/wgpu-core/src/command/clear.rs @@ -26,8 +26,6 @@ use wgt::{math::align_to, BufferAddress, BufferUsages, ImageSubresourceRange, Te pub enum ClearError { #[error("To use clear_texture the CLEAR_TEXTURE feature needs to be enabled")] MissingClearTextureFeature, - #[error("Command encoder {0:?} is invalid")] - InvalidCommandEncoder(CommandEncoderId), #[error("Device {0:?} is invalid")] InvalidDevice(DeviceId), #[error("Buffer {0:?} is invalid or destroyed")] @@ -74,6 +72,8 @@ whereas subesource range specified start {subresource_base_array_layer} and coun }, #[error(transparent)] Device(#[from] DeviceError), + #[error(transparent)] + CommandEncoderError(#[from] super::CommandEncoderError), } impl Global { @@ -89,8 +89,7 @@ impl Global { let hub = A::hub(self); - let cmd_buf = CommandBuffer::get_encoder(hub, command_encoder_id) - .map_err(|_| ClearError::InvalidCommandEncoder(command_encoder_id))?; + let cmd_buf = CommandBuffer::get_encoder(hub, command_encoder_id)?; let mut cmd_buf_data = cmd_buf.data.lock(); let cmd_buf_data = cmd_buf_data.as_mut().unwrap(); @@ -183,8 +182,7 @@ impl Global { let hub = A::hub(self); - let cmd_buf = CommandBuffer::get_encoder(hub, command_encoder_id) - .map_err(|_| ClearError::InvalidCommandEncoder(command_encoder_id))?; + let cmd_buf = CommandBuffer::get_encoder(hub, command_encoder_id)?; let mut cmd_buf_data = cmd_buf.data.lock(); let cmd_buf_data = cmd_buf_data.as_mut().unwrap(); diff --git a/wgpu-core/src/command/compute.rs b/wgpu-core/src/command/compute.rs index 08609d9e51..5f463e179d 100644 --- a/wgpu-core/src/command/compute.rs +++ b/wgpu-core/src/command/compute.rs @@ -13,7 +13,7 @@ use crate::{ global::Global, hal_api::HalApi, hal_label, - id::{self, DeviceId}, + id::{self}, init_tracker::MemoryInitKind, resource::{self, Resource}, snatch::SnatchGuard, @@ -34,14 +34,20 @@ use wgt::{BufferAddress, DynamicOffset}; use std::sync::Arc; use std::{fmt, mem, str}; +use super::DynComputePass; + pub struct ComputePass { /// All pass data & records is stored here. /// - /// If this is `None`, the pass has been ended and can no longer be used. + /// If this is `None`, the pass is in the 'ended' state and can no longer be used. /// Any attempt to record more commands will result in a validation error. base: Option>>, - parent_id: id::CommandEncoderId, + /// Parent command buffer that this pass records commands into. + /// + /// If it is none, this pass is invalid and any operation on it will return an error. + parent: Option>>, + timestamp_writes: Option, // Resource binding dedupe state. 
@@ -50,10 +56,11 @@ pub struct ComputePass { } impl ComputePass { - fn new(parent_id: id::CommandEncoderId, desc: &ComputePassDescriptor) -> Self { + /// If the parent command buffer is invalid, the returned pass will be invalid. + fn new(parent: Option>>, desc: &ComputePassDescriptor) -> Self { Self { - base: Some(BasePass::>::new(&desc.label)), - parent_id, + base: Some(BasePass::new(&desc.label)), + parent, timestamp_writes: desc.timestamp_writes.cloned(), current_bind_groups: BindGroupStateChange::new(), @@ -62,8 +69,8 @@ impl ComputePass { } #[inline] - pub fn parent_id(&self) -> id::CommandEncoderId { - self.parent_id + pub fn parent_id(&self) -> Option { + self.parent.as_ref().map(|cmd_buf| cmd_buf.as_info().id()) } #[inline] @@ -84,7 +91,7 @@ impl ComputePass { impl fmt::Debug for ComputePass { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "ComputePass {{ encoder_id: {:?} }}", self.parent_id) + write!(f, "ComputePass {{ parent: {:?} }}", self.parent_id()) } } @@ -129,10 +136,12 @@ pub enum ComputePassErrorInner { Device(#[from] DeviceError), #[error(transparent)] Encoder(#[from] CommandEncoderError), + #[error("Parent encoder is invalid")] + InvalidParentEncoder, #[error("Bind group at index {0:?} is invalid")] InvalidBindGroup(u32), #[error("Device {0:?} is invalid")] - InvalidDevice(DeviceId), + InvalidDevice(id::DeviceId), #[error("Bind group index {index} is greater than the device's requested `max_bind_group` limit {max}")] BindGroupIndexOutOfRange { index: u32, max: u32 }, #[error("Compute pipeline {0:?} is invalid")] @@ -292,31 +301,55 @@ impl<'a, A: HalApi> State<'a, A> { // Running the compute pass. impl Global { + /// Creates a compute pass. + /// + /// If creation fails, an invalid pass is returned. + /// Any operation on an invalid pass will return an error. + /// + /// If successful, puts the encoder into the [`CommandEncoderStatus::Locked`] state. pub fn command_encoder_create_compute_pass( &self, - parent_id: id::CommandEncoderId, + encoder_id: id::CommandEncoderId, desc: &ComputePassDescriptor, - ) -> ComputePass { - ComputePass::new(parent_id, desc) + ) -> (ComputePass, Option) { + let hub = A::hub(self); + + match CommandBuffer::lock_encoder(hub, encoder_id) { + Ok(cmd_buf) => (ComputePass::new(Some(cmd_buf), desc), None), + Err(err) => (ComputePass::new(None, desc), Some(err)), + } } + /// Creates a type erased compute pass. + /// + /// If creation fails, an invalid pass is returned. + /// Any operation on an invalid pass will return an error. 
pub fn command_encoder_create_compute_pass_dyn( &self, - parent_id: id::CommandEncoderId, + encoder_id: id::CommandEncoderId, desc: &ComputePassDescriptor, - ) -> Box { - Box::new(ComputePass::::new(parent_id, desc)) + ) -> (Box, Option) { + let (pass, err) = self.command_encoder_create_compute_pass::(encoder_id, desc); + (Box::new(pass), err) } pub fn compute_pass_end( &self, pass: &mut ComputePass, ) -> Result<(), ComputePassError> { - let base = pass.base.take().ok_or(ComputePassError { - scope: PassErrorScope::Pass(pass.parent_id), - inner: ComputePassErrorInner::PassEnded, - })?; - self.compute_pass_end_impl(pass.parent_id, base, pass.timestamp_writes.as_ref()) + let scope = PassErrorScope::Pass(pass.parent_id()); + let Some(parent) = pass.parent.as_ref() else { + return Err(ComputePassErrorInner::InvalidParentEncoder).map_pass_err(scope); + }; + + parent.unlock_encoder().map_pass_err(scope)?; + + let base = pass + .base + .take() + .ok_or(ComputePassErrorInner::PassEnded) + .map_pass_err(scope)?; + self.compute_pass_end_impl(parent, base, pass.timestamp_writes.as_ref()) } #[doc(hidden)] @@ -326,10 +359,14 @@ impl Global { base: BasePass, timestamp_writes: Option<&ComputePassTimestampWrites>, ) -> Result<(), ComputePassError> { + let hub = A::hub(self); + + let cmd_buf = CommandBuffer::get_encoder(hub, encoder_id) + .map_pass_err(PassErrorScope::PassEncoder(encoder_id))?; let commands = ComputeCommand::resolve_compute_command_ids(A::hub(self), &base.commands)?; self.compute_pass_end_impl::( - encoder_id, + &cmd_buf, BasePass { label: base.label, commands, @@ -343,17 +380,15 @@ impl Global { fn compute_pass_end_impl( &self, - encoder_id: id::CommandEncoderId, + cmd_buf: &CommandBuffer, base: BasePass>, timestamp_writes: Option<&ComputePassTimestampWrites>, ) -> Result<(), ComputePassError> { profiling::scope!("CommandEncoder::run_compute_pass"); - let pass_scope = PassErrorScope::Pass(encoder_id); + let pass_scope = PassErrorScope::Pass(Some(cmd_buf.as_info().id())); let hub = A::hub(self); - let cmd_buf: Arc> = - CommandBuffer::get_encoder(hub, encoder_id).map_pass_err(pass_scope)?; let device = &cmd_buf.device; if !device.is_valid() { return Err(ComputePassErrorInner::InvalidDevice( diff --git a/wgpu-core/src/command/mod.rs b/wgpu-core/src/command/mod.rs index bfb9276057..20a6bdfae1 100644 --- a/wgpu-core/src/command/mod.rs +++ b/wgpu-core/src/command/mod.rs @@ -25,7 +25,6 @@ use self::memory_init::CommandBufferTextureMemoryActions; use crate::device::{Device, DeviceError}; use crate::error::{ErrorFormatter, PrettyError}; use crate::hub::Hub; -use crate::id::CommandBufferId; use crate::lock::{rank, Mutex}; use crate::snatch::SnatchGuard; @@ -51,10 +50,23 @@ pub(crate) enum CommandEncoderStatus { /// [`compute_pass_end`] require the encoder to be in this /// state. /// + /// This corresponds to WebGPU's "open" state. + /// See + /// /// [`command_encoder_clear_buffer`]: Global::command_encoder_clear_buffer /// [`compute_pass_end`]: Global::compute_pass_end Recording, + /// Locked by a render or compute pass. + /// + /// This state is entered when a render/compute pass is created, + /// and exited when the pass is ended. + /// + /// As long as the command encoder is locked, any command building operation on it will fail + /// and put the encoder into the [`CommandEncoderStatus::Error`] state. + /// See + Locked, + /// Command recording is complete, and the buffer is ready for submission. 
/// /// [`Global::command_encoder_finish`] transitions a @@ -410,6 +422,38 @@ impl CommandBuffer { } impl CommandBuffer { + fn get_encoder_impl( + hub: &Hub, + id: id::CommandEncoderId, + lock_on_acquire: bool, + ) -> Result, CommandEncoderError> { + let storage = hub.command_buffers.read(); + match storage.get(id.into_command_buffer_id()) { + Ok(cmd_buf) => { + let mut cmd_buf_data = cmd_buf.data.lock(); + let cmd_buf_data = cmd_buf_data.as_mut().unwrap(); + match cmd_buf_data.status { + CommandEncoderStatus::Recording => { + if lock_on_acquire { + cmd_buf_data.status = CommandEncoderStatus::Locked; + } + Ok(cmd_buf.clone()) + } + CommandEncoderStatus::Locked => { + // Any operation on a locked encoder is required to put it into the invalid/error state. + // See https://www.w3.org/TR/webgpu/#encoder-state-locked + cmd_buf_data.encoder.discard(); + cmd_buf_data.status = CommandEncoderStatus::Error; + Err(CommandEncoderError::Locked) + } + CommandEncoderStatus::Finished => Err(CommandEncoderError::NotRecording), + CommandEncoderStatus::Error => Err(CommandEncoderError::Invalid), + } + } + Err(_) => Err(CommandEncoderError::Invalid), + } + } + /// Return the [`CommandBuffer`] for `id`, for recording new commands. /// /// In `wgpu_core`, the [`CommandBuffer`] type serves both as encoder and @@ -420,14 +464,37 @@ impl CommandBuffer { hub: &Hub, id: id::CommandEncoderId, ) -> Result, CommandEncoderError> { - let storage = hub.command_buffers.read(); - match storage.get(id.into_command_buffer_id()) { - Ok(cmd_buf) => match cmd_buf.data.lock().as_ref().unwrap().status { - CommandEncoderStatus::Recording => Ok(cmd_buf.clone()), - CommandEncoderStatus::Finished => Err(CommandEncoderError::NotRecording), - CommandEncoderStatus::Error => Err(CommandEncoderError::Invalid), - }, - Err(_) => Err(CommandEncoderError::Invalid), + let lock_on_acquire = false; + Self::get_encoder_impl(hub, id, lock_on_acquire) + } + + /// Return the [`CommandBuffer`] for `id` and if successful puts it into the [`CommandEncoderStatus::Locked`] state. + /// + /// See [`CommandBuffer::get_encoder`]. + /// Call [`CommandBuffer::unlock_encoder`] to put the [`CommandBuffer`] back into the [`CommandEncoderStatus::Recording`] state. + fn lock_encoder( + hub: &Hub, + id: id::CommandEncoderId, + ) -> Result, CommandEncoderError> { + let lock_on_acquire = true; + Self::get_encoder_impl(hub, id, lock_on_acquire) + } + + /// Unlocks the [`CommandBuffer`] for `id` and puts it back into the [`CommandEncoderStatus::Recording`] state. + /// + /// This function is the counterpart to [`CommandBuffer::lock_encoder`]. + /// It is only valid to call this function if the encoder is in the [`CommandEncoderStatus::Locked`] state. + fn unlock_encoder(&self) -> Result<(), CommandEncoderError> { + let mut data_lock = self.data.lock(); + let status = &mut data_lock.as_mut().unwrap().status; + match *status { + CommandEncoderStatus::Recording => Err(CommandEncoderError::Invalid), + CommandEncoderStatus::Locked => { + *status = CommandEncoderStatus::Recording; + Ok(()) + } + CommandEncoderStatus::Finished => Err(CommandEncoderError::Invalid), + CommandEncoderStatus::Error => Err(CommandEncoderError::Invalid), } } @@ -564,6 +631,8 @@ pub enum CommandEncoderError { NotRecording, #[error(transparent)] Device(#[from] DeviceError), + #[error("Command encoder is locked by a previously created render/compute pass. 
Before recording any new commands, the pass must be ended.")] + Locked, } impl Global { @@ -571,7 +640,7 @@ impl Global { &self, encoder_id: id::CommandEncoderId, _desc: &wgt::CommandBufferDescriptor { entry_point: final_entry_point_name.as_ref(), constants: desc.stage.constants.as_ref(), zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory, + vertex_pulling_transform: false, }, cache: cache.as_ref().and_then(|it| it.raw.as_ref()), }; @@ -3165,6 +3166,7 @@ impl Device { entry_point: &vertex_entry_point_name, constants: stage_desc.constants.as_ref(), zero_initialize_workgroup_memory: stage_desc.zero_initialize_workgroup_memory, + vertex_pulling_transform: stage_desc.vertex_pulling_transform, } }; @@ -3228,6 +3230,7 @@ impl Device { zero_initialize_workgroup_memory: fragment_state .stage .zero_initialize_workgroup_memory, + vertex_pulling_transform: false, }) } None => None, diff --git a/wgpu-core/src/pipeline.rs b/wgpu-core/src/pipeline.rs index ee8f8668c3..f3e7dbacb2 100644 --- a/wgpu-core/src/pipeline.rs +++ b/wgpu-core/src/pipeline.rs @@ -166,6 +166,8 @@ pub struct ProgrammableStageDescriptor<'a> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, + /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. + pub vertex_pulling_transform: bool, } /// Number of implicit bind groups derived at pipeline creation. diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs index ee59fa2590..560aa6f8c6 100644 --- a/wgpu-hal/examples/halmark/main.rs +++ b/wgpu-hal/examples/halmark/main.rs @@ -254,6 +254,7 @@ impl Example { entry_point: "vs_main", constants: &constants, zero_initialize_workgroup_memory: true, + vertex_pulling_transform: false, }, vertex_buffers: &[], fragment_stage: Some(hal::ProgrammableStage { @@ -261,6 +262,7 @@ impl Example { entry_point: "fs_main", constants: &constants, zero_initialize_workgroup_memory: true, + vertex_pulling_transform: false, }), primitive: wgt::PrimitiveState { topology: wgt::PrimitiveTopology::TriangleStrip, diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs index 8f404dc4d2..90f0e6fc50 100644 --- a/wgpu-hal/examples/ray-traced-triangle/main.rs +++ b/wgpu-hal/examples/ray-traced-triangle/main.rs @@ -373,6 +373,7 @@ impl Example { entry_point: "main", constants: &Default::default(), zero_initialize_workgroup_memory: true, + vertex_pulling_transform: false, }, cache: None, }) diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs index 35b9ea0d0a..da3834bcb0 100644 --- a/wgpu-hal/src/lib.rs +++ b/wgpu-hal/src/lib.rs @@ -1714,6 +1714,8 @@ pub struct ProgrammableStage<'a, A: Api> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, + /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. 
+ pub vertex_pulling_transform: bool, } // Rust gets confused about the impl requirements for `A` @@ -1724,6 +1726,7 @@ impl Clone for ProgrammableStage<'_, A> { entry_point: self.entry_point, constants: self.constants, zero_initialize_workgroup_memory: self.zero_initialize_workgroup_memory, + vertex_pulling_transform: self.vertex_pulling_transform, } } } diff --git a/wgpu-hal/src/metal/command.rs b/wgpu-hal/src/metal/command.rs index 341712c323..fb9c7e9c0e 100644 --- a/wgpu-hal/src/metal/command.rs +++ b/wgpu-hal/src/metal/command.rs @@ -16,6 +16,7 @@ impl Default for super::CommandState { raw_wg_size: metal::MTLSize::new(0, 0, 0), stage_infos: Default::default(), storage_buffer_length_map: Default::default(), + vertex_buffer_size_map: Default::default(), work_group_memory_sizes: Vec::new(), push_constants: Vec::new(), pending_timer_queries: Vec::new(), @@ -137,6 +138,7 @@ impl super::CommandEncoder { impl super::CommandState { fn reset(&mut self) { self.storage_buffer_length_map.clear(); + self.vertex_buffer_size_map.clear(); self.stage_infos.vs.clear(); self.stage_infos.fs.clear(); self.stage_infos.cs.clear(); @@ -160,6 +162,15 @@ impl super::CommandState { .unwrap_or_default() })); + // Extend with the sizes of the mapped vertex buffers, in the order + // they were added to the map. + result_sizes.extend(stage_info.vertex_buffer_mappings.iter().map(|vbm| { + self.vertex_buffer_size_map + .get(&(vbm.id as u64)) + .map(|size| u32::try_from(size.get()).unwrap_or(u32::MAX)) + .unwrap_or_default() + })); + if !result_sizes.is_empty() { Some((slot as _, result_sizes)) } else { @@ -927,6 +938,27 @@ impl crate::CommandEncoder for super::CommandEncoder { let buffer_index = self.shared.private_caps.max_vertex_buffers as u64 - 1 - index as u64; let encoder = self.state.render.as_ref().unwrap(); encoder.set_vertex_buffer(buffer_index, Some(&binding.buffer.raw), binding.offset); + + let buffer_size = binding.resolve_size(); + if buffer_size > 0 { + self.state.vertex_buffer_size_map.insert( + buffer_index, + std::num::NonZeroU64::new(buffer_size).unwrap(), + ); + } else { + self.state.vertex_buffer_size_map.remove(&buffer_index); + } + + if let Some((index, sizes)) = self + .state + .make_sizes_buffer_update(naga::ShaderStage::Vertex, &mut self.temp.binding_sizes) + { + encoder.set_vertex_bytes( + index as _, + (sizes.len() * WORD_SIZE) as u64, + sizes.as_ptr() as _, + ); + } } unsafe fn set_viewport(&mut self, rect: &crate::Rect, depth_range: Range) { diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index 81ab5dbdb6..77ea8a0d86 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -59,10 +59,48 @@ fn create_depth_stencil_desc(state: &wgt::DepthStencilState) -> metal::DepthSten desc } +const fn convert_vertex_format_to_naga(format: wgt::VertexFormat) -> naga::back::msl::VertexFormat { + match format { + wgt::VertexFormat::Uint8x2 => naga::back::msl::VertexFormat::Uint8x2, + wgt::VertexFormat::Uint8x4 => naga::back::msl::VertexFormat::Uint8x4, + wgt::VertexFormat::Sint8x2 => naga::back::msl::VertexFormat::Sint8x2, + wgt::VertexFormat::Sint8x4 => naga::back::msl::VertexFormat::Sint8x4, + wgt::VertexFormat::Unorm8x2 => naga::back::msl::VertexFormat::Unorm8x2, + wgt::VertexFormat::Unorm8x4 => naga::back::msl::VertexFormat::Unorm8x4, + wgt::VertexFormat::Snorm8x2 => naga::back::msl::VertexFormat::Snorm8x2, + wgt::VertexFormat::Snorm8x4 => naga::back::msl::VertexFormat::Snorm8x4, + wgt::VertexFormat::Uint16x2 => naga::back::msl::VertexFormat::Uint16x2, + 
wgt::VertexFormat::Uint16x4 => naga::back::msl::VertexFormat::Uint16x4, + wgt::VertexFormat::Sint16x2 => naga::back::msl::VertexFormat::Sint16x2, + wgt::VertexFormat::Sint16x4 => naga::back::msl::VertexFormat::Sint16x4, + wgt::VertexFormat::Unorm16x2 => naga::back::msl::VertexFormat::Unorm16x2, + wgt::VertexFormat::Unorm16x4 => naga::back::msl::VertexFormat::Unorm16x4, + wgt::VertexFormat::Snorm16x2 => naga::back::msl::VertexFormat::Snorm16x2, + wgt::VertexFormat::Snorm16x4 => naga::back::msl::VertexFormat::Snorm16x4, + wgt::VertexFormat::Float16x2 => naga::back::msl::VertexFormat::Float16x2, + wgt::VertexFormat::Float16x4 => naga::back::msl::VertexFormat::Float16x4, + wgt::VertexFormat::Float32 => naga::back::msl::VertexFormat::Float32, + wgt::VertexFormat::Float32x2 => naga::back::msl::VertexFormat::Float32x2, + wgt::VertexFormat::Float32x3 => naga::back::msl::VertexFormat::Float32x3, + wgt::VertexFormat::Float32x4 => naga::back::msl::VertexFormat::Float32x4, + wgt::VertexFormat::Uint32 => naga::back::msl::VertexFormat::Uint32, + wgt::VertexFormat::Uint32x2 => naga::back::msl::VertexFormat::Uint32x2, + wgt::VertexFormat::Uint32x3 => naga::back::msl::VertexFormat::Uint32x3, + wgt::VertexFormat::Uint32x4 => naga::back::msl::VertexFormat::Uint32x4, + wgt::VertexFormat::Sint32 => naga::back::msl::VertexFormat::Sint32, + wgt::VertexFormat::Sint32x2 => naga::back::msl::VertexFormat::Sint32x2, + wgt::VertexFormat::Sint32x3 => naga::back::msl::VertexFormat::Sint32x3, + wgt::VertexFormat::Sint32x4 => naga::back::msl::VertexFormat::Sint32x4, + wgt::VertexFormat::Unorm10_10_10_2 => naga::back::msl::VertexFormat::Unorm10_10_10_2, + _ => unimplemented!(), + } +} + impl super::Device { fn load_shader( &self, stage: &crate::ProgrammableStage, + vertex_buffer_mappings: &[naga::back::msl::VertexBufferMapping], layout: &super::PipelineLayout, primitive_class: metal::MTLPrimitiveTopologyClass, naga_stage: naga::ShaderStage, @@ -120,6 +158,8 @@ impl super::Device { metal::MTLPrimitiveTopologyClass::Point => true, _ => false, }, + vertex_pulling_transform: stage.vertex_pulling_transform, + vertex_buffer_mappings: vertex_buffer_mappings.to_vec(), }; let (source, info) = @@ -548,7 +588,7 @@ impl crate::Device for super::Device { pc_buffer: Option, pc_limit: u32, sizes_buffer: Option, - sizes_count: u8, + need_sizes_buffer: bool, resources: naga::back::msl::BindingMap, } @@ -558,7 +598,7 @@ impl crate::Device for super::Device { pc_buffer: None, pc_limit: 0, sizes_buffer: None, - sizes_count: 0, + need_sizes_buffer: false, resources: Default::default(), }); let mut bind_group_infos = arrayvec::ArrayVec::new(); @@ -603,7 +643,7 @@ impl crate::Device for super::Device { { for info in stage_data.iter_mut() { if entry.visibility.contains(map_naga_stage(info.stage)) { - info.sizes_count += 1; + info.need_sizes_buffer = true; } } } @@ -661,11 +701,13 @@ impl crate::Device for super::Device { // Finally, make sure we fit the limits for info in stage_data.iter_mut() { - // handle the sizes buffer assignment and shader overrides - if info.sizes_count != 0 { + if info.need_sizes_buffer || info.stage == naga::ShaderStage::Vertex { + // Set aside space for the sizes_buffer, which is required + // for variable-length buffers, or to support vertex pulling. 
info.sizes_buffer = Some(info.counters.buffers); info.counters.buffers += 1; } + if info.counters.buffers > self.shared.private_caps.max_buffers_per_stage || info.counters.textures > self.shared.private_caps.max_textures_per_stage || info.counters.samplers > self.shared.private_caps.max_samplers_per_stage @@ -832,8 +874,38 @@ impl crate::Device for super::Device { // Vertex shader let (vs_lib, vs_info) = { + let mut vertex_buffer_mappings = Vec::::new(); + for (i, vbl) in desc.vertex_buffers.iter().enumerate() { + let mut attributes = Vec::::new(); + for attribute in vbl.attributes.iter() { + attributes.push(naga::back::msl::AttributeMapping { + shader_location: attribute.shader_location, + offset: attribute.offset as u32, + format: convert_vertex_format_to_naga(attribute.format), + }); + } + + vertex_buffer_mappings.push(naga::back::msl::VertexBufferMapping { + id: self.shared.private_caps.max_vertex_buffers - 1 - i as u32, + stride: if vbl.array_stride > 0 { + vbl.array_stride.try_into().unwrap() + } else { + vbl.attributes + .iter() + .map(|attribute| attribute.offset + attribute.format.size()) + .max() + .unwrap_or(0) + .try_into() + .unwrap() + }, + indexed_by_vertex: (vbl.step_mode == wgt::VertexStepMode::Vertex {}), + attributes, + }); + } + let vs = self.load_shader( &desc.vertex_stage, + &vertex_buffer_mappings, desc.layout, primitive_class, naga::ShaderStage::Vertex, @@ -851,6 +923,7 @@ impl crate::Device for super::Device { push_constants: desc.layout.push_constants_infos.vs, sizes_slot: desc.layout.per_stage_map.vs.sizes_buffer, sized_bindings: vs.sized_bindings, + vertex_buffer_mappings, }; (vs.library, info) @@ -861,6 +934,7 @@ impl crate::Device for super::Device { Some(ref stage) => { let fs = self.load_shader( stage, + &[], desc.layout, primitive_class, naga::ShaderStage::Fragment, @@ -878,6 +952,7 @@ impl crate::Device for super::Device { push_constants: desc.layout.push_constants_infos.fs, sizes_slot: desc.layout.per_stage_map.fs.sizes_buffer, sized_bindings: fs.sized_bindings, + vertex_buffer_mappings: vec![], }; (Some(fs.library), Some(info)) @@ -1053,6 +1128,7 @@ impl crate::Device for super::Device { let cs = self.load_shader( &desc.stage, + &[], desc.layout, metal::MTLPrimitiveTopologyClass::Unspecified, naga::ShaderStage::Compute, @@ -1070,6 +1146,7 @@ impl crate::Device for super::Device { push_constants: desc.layout.push_constants_infos.cs, sizes_slot: desc.layout.per_stage_map.cs.sizes_buffer, sized_bindings: cs.sized_bindings, + vertex_buffer_mappings: vec![], }; if let Some(name) = desc.label { diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs index a5ea63b035..ce8e015924 100644 --- a/wgpu-hal/src/metal/mod.rs +++ b/wgpu-hal/src/metal/mod.rs @@ -466,6 +466,15 @@ impl Buffer { } } +impl crate::BufferBinding<'_, Api> { + fn resolve_size(&self) -> wgt::BufferAddress { + match self.size { + Some(size) => size.get(), + None => self.buffer.size - self.offset, + } + } +} + #[derive(Debug)] pub struct Texture { raw: metal::Texture, @@ -690,6 +699,9 @@ struct PipelineStageInfo { /// /// See `device::CompiledShader::sized_bindings` for more details. sized_bindings: Vec, + + /// Info on all bound vertex buffers. 
+ vertex_buffer_mappings: Vec, } impl PipelineStageInfo { @@ -697,6 +709,7 @@ impl PipelineStageInfo { self.push_constants = None; self.sizes_slot = None; self.sized_bindings.clear(); + self.vertex_buffer_mappings.clear(); } fn assign_from(&mut self, other: &Self) { @@ -704,6 +717,9 @@ impl PipelineStageInfo { self.sizes_slot = other.sizes_slot; self.sized_bindings.clear(); self.sized_bindings.extend_from_slice(&other.sized_bindings); + self.vertex_buffer_mappings.clear(); + self.vertex_buffer_mappings + .extend_from_slice(&other.vertex_buffer_mappings); } } @@ -821,6 +837,8 @@ struct CommandState { /// [`ResourceBinding`]: naga::ResourceBinding storage_buffer_length_map: rustc_hash::FxHashMap, + vertex_buffer_size_map: rustc_hash::FxHashMap, + work_group_memory_sizes: Vec, push_constants: Vec, diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs index 5ed055f2be..d5210900bb 100644 --- a/wgpu/src/backend/wgpu_core.rs +++ b/wgpu/src/backend/wgpu_core.rs @@ -1189,6 +1189,10 @@ impl crate::Context for ContextWgpuCore { .vertex .compilation_options .zero_initialize_workgroup_memory, + vertex_pulling_transform: desc + .vertex + .compilation_options + .vertex_pulling_transform, }, buffers: Borrowed(&vertex_buffers), }, @@ -1203,6 +1207,7 @@ impl crate::Context for ContextWgpuCore { zero_initialize_workgroup_memory: frag .compilation_options .zero_initialize_workgroup_memory, + vertex_pulling_transform: false, }, targets: Borrowed(frag.targets), }), @@ -1256,6 +1261,7 @@ impl crate::Context for ContextWgpuCore { zero_initialize_workgroup_memory: desc .compilation_options .zero_initialize_workgroup_memory, + vertex_pulling_transform: false, }, cache: desc.cache.map(|c| c.id.into()), }; diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs index 00130a99c2..e94ae27fe8 100644 --- a/wgpu/src/lib.rs +++ b/wgpu/src/lib.rs @@ -1987,6 +1987,8 @@ pub struct PipelineCompilationOptions<'a> { /// This is required by the WebGPU spec, but may have overhead which can be avoided /// for cross-platform applications pub zero_initialize_workgroup_memory: bool, + /// Should the pipeline attempt to transform vertex shaders to use vertex pulling. + pub vertex_pulling_transform: bool, } impl<'a> Default for PipelineCompilationOptions<'a> { @@ -2000,6 +2002,7 @@ impl<'a> Default for PipelineCompilationOptions<'a> { Self { constants, zero_initialize_workgroup_memory: true, + vertex_pulling_transform: false, } } } From c7458638d14921c7562e4197ddeefa17be413587 Mon Sep 17 00:00:00 2001 From: Connor Fitzgerald Date: Thu, 30 May 2024 16:53:34 -0400 Subject: [PATCH 8/9] [hal/vk] Rework Submission and Surface Synchronization (#5681) Fix two major synchronization issues in `wgpu_val::vulkan`: - Properly order queue command buffer submissions. Due to Mesa bugs, two semaphores are required even though the Vulkan spec says that only one should be necessary. - Properly manage surface texture acquisition and presentation: - Acquiring a surface texture can return while the presentation engine is still displaying the texture. Applications must wait for a semaphore to be signaled before using the acquired texture. - Presenting a surface texture requires a semaphore to ensure that drawing is complete before presentation occurs. 
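A minimal sketch of the per-frame flow this implies for hal users (based on the
halmark example changes below; `ctx` bundles the command encoder, fence, and
fence value as in that example):

    // The fence is handed to acquire so the backend can make sure the
    // presentation engine is done with the texture before it is reused.
    let surface_tex = unsafe {
        surface
            .acquire_texture(None, &ctx.fence)
            .unwrap()
            .unwrap()
            .texture
    };
    // ... record commands and end the encoder ...
    let cmd_buf = unsafe { ctx.encoder.end_encoding().unwrap() };
    unsafe {
        // submit() now always signals a fence; listing the surface texture
        // lets the backend insert the acquire/present semaphores.
        queue
            .submit(&[&cmd_buf], &[&surface_tex], (&mut ctx.fence, ctx.fence_value))
            .unwrap();
        queue.present(&surface, surface_tex).unwrap();
    }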
Co-authored-by: Jim Blandy --- wgpu-core/src/device/queue.rs | 2 +- wgpu-core/src/present.rs | 15 +- wgpu-hal/examples/halmark/main.rs | 73 ++-- wgpu-hal/examples/raw-gles.rs | 3 +- wgpu-hal/examples/ray-traced-triangle/main.rs | 73 ++-- wgpu-hal/src/dx12/mod.rs | 11 +- wgpu-hal/src/empty.rs | 3 +- wgpu-hal/src/gles/egl.rs | 1 + wgpu-hal/src/gles/queue.rs | 12 +- wgpu-hal/src/gles/web.rs | 1 + wgpu-hal/src/gles/wgl.rs | 1 + wgpu-hal/src/lib.rs | 113 ++++- wgpu-hal/src/metal/mod.rs | 55 ++- wgpu-hal/src/metal/surface.rs | 1 + wgpu-hal/src/vulkan/adapter.rs | 20 +- wgpu-hal/src/vulkan/device.rs | 140 +++--- wgpu-hal/src/vulkan/instance.rs | 71 ++- wgpu-hal/src/vulkan/mod.rs | 408 +++++++++++++++--- 18 files changed, 699 insertions(+), 304 deletions(-) diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 168b36843b..8eb46f0aa9 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -1499,7 +1499,7 @@ impl Global { .raw .as_ref() .unwrap() - .submit(&refs, &submit_surface_textures, Some((fence, submit_index))) + .submit(&refs, &submit_surface_textures, (fence, submit_index)) .map_err(DeviceError::from)?; } diff --git a/wgpu-core/src/present.rs b/wgpu-core/src/present.rs index 053f7fdb24..7f5939feb0 100644 --- a/wgpu-core/src/present.rs +++ b/wgpu-core/src/present.rs @@ -154,17 +154,20 @@ impl Global { parent_id: surface_id, }); } - #[cfg(not(feature = "trace"))] - let _ = device; + + let fence_guard = device.fence.read(); + let fence = fence_guard.as_ref().unwrap(); let suf = A::surface_as_hal(surface.as_ref()); let (texture_id, status) = match unsafe { - suf.unwrap() - .acquire_texture(Some(std::time::Duration::from_millis( - FRAME_TIMEOUT_MS as u64, - ))) + suf.unwrap().acquire_texture( + Some(std::time::Duration::from_millis(FRAME_TIMEOUT_MS as u64)), + fence, + ) } { Ok(Some(ast)) => { + drop(fence_guard); + let texture_desc = wgt::TextureDescriptor { label: (), size: wgt::Extent3d { diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs index 560aa6f8c6..81474f233d 100644 --- a/wgpu-hal/examples/halmark/main.rs +++ b/wgpu-hal/examples/halmark/main.rs @@ -22,7 +22,6 @@ const MAX_BUNNIES: usize = 1 << 20; const BUNNY_SIZE: f32 = 0.15 * 256.0; const GRAVITY: f32 = -9.8 * 100.0; const MAX_VELOCITY: f32 = 750.0; -const COMMAND_BUFFER_PER_CONTEXT: usize = 100; const DESIRED_MAX_LATENCY: u32 = 2; #[repr(C)] @@ -498,7 +497,7 @@ impl Example { let mut fence = device.create_fence().unwrap(); let init_cmd = cmd_encoder.end_encoding().unwrap(); queue - .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value))) + .submit(&[&init_cmd], &[], (&mut fence, init_fence_value)) .unwrap(); device.wait(&fence, init_fence_value, !0).unwrap(); device.destroy_buffer(staging_buffer); @@ -550,7 +549,7 @@ impl Example { { let ctx = &mut self.contexts[self.context_index]; self.queue - .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value))) + .submit(&[], &[], (&mut ctx.fence, ctx.fence_value)) .unwrap(); } @@ -650,7 +649,13 @@ impl Example { let ctx = &mut self.contexts[self.context_index]; - let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture }; + let surface_tex = unsafe { + self.surface + .acquire_texture(None, &ctx.fence) + .unwrap() + .unwrap() + .texture + }; let target_barrier0 = hal::TextureBarrier { texture: surface_tex.borrow(), @@ -718,7 +723,6 @@ impl Example { } ctx.frames_recorded += 1; - let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT; let target_barrier1 = 
hal::TextureBarrier { texture: surface_tex.borrow(), @@ -732,45 +736,42 @@ impl Example { unsafe { let cmd_buf = ctx.encoder.end_encoding().unwrap(); - let fence_param = if do_fence { - Some((&mut ctx.fence, ctx.fence_value)) - } else { - None - }; self.queue - .submit(&[&cmd_buf], &[&surface_tex], fence_param) + .submit( + &[&cmd_buf], + &[&surface_tex], + (&mut ctx.fence, ctx.fence_value), + ) .unwrap(); self.queue.present(&self.surface, surface_tex).unwrap(); ctx.used_cmd_bufs.push(cmd_buf); ctx.used_views.push(surface_tex_view); }; - if do_fence { - log::debug!("Context switch from {}", self.context_index); - let old_fence_value = ctx.fence_value; - if self.contexts.len() == 1 { - let hal_desc = hal::CommandEncoderDescriptor { - label: None, - queue: &self.queue, - }; - self.contexts.push(unsafe { - ExecutionContext { - encoder: self.device.create_command_encoder(&hal_desc).unwrap(), - fence: self.device.create_fence().unwrap(), - fence_value: 0, - used_views: Vec::new(), - used_cmd_bufs: Vec::new(), - frames_recorded: 0, - } - }); - } - self.context_index = (self.context_index + 1) % self.contexts.len(); - let next = &mut self.contexts[self.context_index]; - unsafe { - next.wait_and_clear(&self.device); - } - next.fence_value = old_fence_value + 1; + log::debug!("Context switch from {}", self.context_index); + let old_fence_value = ctx.fence_value; + if self.contexts.len() == 1 { + let hal_desc = hal::CommandEncoderDescriptor { + label: None, + queue: &self.queue, + }; + self.contexts.push(unsafe { + ExecutionContext { + encoder: self.device.create_command_encoder(&hal_desc).unwrap(), + fence: self.device.create_fence().unwrap(), + fence_value: 0, + used_views: Vec::new(), + used_cmd_bufs: Vec::new(), + frames_recorded: 0, + } + }); + } + self.context_index = (self.context_index + 1) % self.contexts.len(); + let next = &mut self.contexts[self.context_index]; + unsafe { + next.wait_and_clear(&self.device); } + next.fence_value = old_fence_value + 1; } } diff --git a/wgpu-hal/examples/raw-gles.rs b/wgpu-hal/examples/raw-gles.rs index 342100e1cb..675a518694 100644 --- a/wgpu-hal/examples/raw-gles.rs +++ b/wgpu-hal/examples/raw-gles.rs @@ -156,6 +156,7 @@ fn fill_screen(exposed: &hal::ExposedAdapter, width: u32, height }) .unwrap() }; + let mut fence = unsafe { od.device.create_fence().unwrap() }; let rp_desc = hal::RenderPassDescriptor { label: None, extent: wgt::Extent3d { @@ -183,6 +184,6 @@ fn fill_screen(exposed: &hal::ExposedAdapter, width: u32, height encoder.begin_render_pass(&rp_desc); encoder.end_render_pass(); let cmd_buf = encoder.end_encoding().unwrap(); - od.queue.submit(&[&cmd_buf], &[], None).unwrap(); + od.queue.submit(&[&cmd_buf], &[], (&mut fence, 0)).unwrap(); } } diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs index 90f0e6fc50..cf0e146ec9 100644 --- a/wgpu-hal/examples/ray-traced-triangle/main.rs +++ b/wgpu-hal/examples/ray-traced-triangle/main.rs @@ -13,7 +13,6 @@ use std::{ }; use winit::window::WindowButtons; -const COMMAND_BUFFER_PER_CONTEXT: usize = 100; const DESIRED_MAX_LATENCY: u32 = 2; /// [D3D12_RAYTRACING_INSTANCE_DESC](https://microsoft.github.io/DirectX-Specs/d3d/Raytracing.html#d3d12_raytracing_instance_desc) @@ -759,7 +758,7 @@ impl Example { let mut fence = device.create_fence().unwrap(); let init_cmd = cmd_encoder.end_encoding().unwrap(); queue - .submit(&[&init_cmd], &[], Some((&mut fence, init_fence_value))) + .submit(&[&init_cmd], &[], (&mut fence, init_fence_value)) .unwrap(); 
         device.wait(&fence, init_fence_value, !0).unwrap();
         cmd_encoder.reset_all(iter::once(init_cmd));
@@ -808,7 +807,13 @@ impl Example {
     fn render(&mut self) {
         let ctx = &mut self.contexts[self.context_index];

-        let surface_tex = unsafe { self.surface.acquire_texture(None).unwrap().unwrap().texture };
+        let surface_tex = unsafe {
+            self.surface
+                .acquire_texture(None, &ctx.fence)
+                .unwrap()
+                .unwrap()
+                .texture
+        };

         let target_barrier0 = hal::TextureBarrier {
             texture: surface_tex.borrow(),
@@ -909,7 +914,6 @@ impl Example {
         }

         ctx.frames_recorded += 1;
-        let do_fence = ctx.frames_recorded > COMMAND_BUFFER_PER_CONTEXT;

         let target_barrier1 = hal::TextureBarrier {
             texture: surface_tex.borrow(),
@@ -959,45 +963,42 @@ impl Example {

         unsafe {
             let cmd_buf = ctx.encoder.end_encoding().unwrap();
-            let fence_param = if do_fence {
-                Some((&mut ctx.fence, ctx.fence_value))
-            } else {
-                None
-            };
             self.queue
-                .submit(&[&cmd_buf], &[&surface_tex], fence_param)
+                .submit(
+                    &[&cmd_buf],
+                    &[&surface_tex],
+                    (&mut ctx.fence, ctx.fence_value),
+                )
                 .unwrap();
             self.queue.present(&self.surface, surface_tex).unwrap();
             ctx.used_cmd_bufs.push(cmd_buf);
             ctx.used_views.push(surface_tex_view);
         };

-        if do_fence {
-            log::info!("Context switch from {}", self.context_index);
-            let old_fence_value = ctx.fence_value;
-            if self.contexts.len() == 1 {
-                let hal_desc = hal::CommandEncoderDescriptor {
-                    label: None,
-                    queue: &self.queue,
-                };
-                self.contexts.push(unsafe {
-                    ExecutionContext {
-                        encoder: self.device.create_command_encoder(&hal_desc).unwrap(),
-                        fence: self.device.create_fence().unwrap(),
-                        fence_value: 0,
-                        used_views: Vec::new(),
-                        used_cmd_bufs: Vec::new(),
-                        frames_recorded: 0,
-                    }
-                });
-            }
-            self.context_index = (self.context_index + 1) % self.contexts.len();
-            let next = &mut self.contexts[self.context_index];
-            unsafe {
-                next.wait_and_clear(&self.device);
-            }
-            next.fence_value = old_fence_value + 1;
+        log::info!("Context switch from {}", self.context_index);
+        let old_fence_value = ctx.fence_value;
+        if self.contexts.len() == 1 {
+            let hal_desc = hal::CommandEncoderDescriptor {
+                label: None,
+                queue: &self.queue,
+            };
+            self.contexts.push(unsafe {
+                ExecutionContext {
+                    encoder: self.device.create_command_encoder(&hal_desc).unwrap(),
+                    fence: self.device.create_fence().unwrap(),
+                    fence_value: 0,
+                    used_views: Vec::new(),
+                    used_cmd_bufs: Vec::new(),
+                    frames_recorded: 0,
+                }
+            });
         }
+        self.context_index = (self.context_index + 1) % self.contexts.len();
+        let next = &mut self.contexts[self.context_index];
+        unsafe {
+            next.wait_and_clear(&self.device);
+        }
+        next.fence_value = old_fence_value + 1;
     }

     fn exit(mut self) {
@@ -1005,7 +1006,7 @@ impl Example {
         {
             let ctx = &mut self.contexts[self.context_index];
             self.queue
-                .submit(&[], &[], Some((&mut ctx.fence, ctx.fence_value)))
+                .submit(&[], &[], (&mut ctx.fence, ctx.fence_value))
                 .unwrap();
         }

diff --git a/wgpu-hal/src/dx12/mod.rs b/wgpu-hal/src/dx12/mod.rs
index 99800e87c9..9d5f62f915 100644
--- a/wgpu-hal/src/dx12/mod.rs
+++ b/wgpu-hal/src/dx12/mod.rs
@@ -857,6 +857,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        _fence: &Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<Api>>, crate::SurfaceError> {
         let mut swapchain = self.swap_chain.write();
         let sc = swapchain.as_mut().unwrap();
@@ -895,7 +896,7 @@ impl crate::Queue for Queue {
         &self,
         command_buffers: &[&CommandBuffer],
         _surface_textures: &[&Texture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
     ) -> Result<(),
 crate::DeviceError> {
         let mut temp_lists = self.temp_lists.lock();
         temp_lists.clear();
@@ -908,11 +909,9 @@ impl crate::Queue for Queue {
             self.raw.execute_command_lists(&temp_lists);
         }

-        if let Some((fence, value)) = signal_fence {
-            self.raw
-                .signal(&fence.raw, value)
-                .into_device_result("Signal fence")?;
-        }
+        self.raw
+            .signal(&signal_fence.raw, signal_value)
+            .into_device_result("Signal fence")?;

         // Note the lack of synchronization here between the main Direct queue
         // and the dedicated presentation queue. This is automatically handled

diff --git a/wgpu-hal/src/empty.rs b/wgpu-hal/src/empty.rs
index f1986f7705..8cba9d063f 100644
--- a/wgpu-hal/src/empty.rs
+++ b/wgpu-hal/src/empty.rs
@@ -75,6 +75,7 @@ impl crate::Surface for Context {
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        fence: &Resource,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<Api>>, crate::SurfaceError> {
         Ok(None)
     }
@@ -114,7 +115,7 @@ impl crate::Queue for Context {
         &self,
         command_buffers: &[&Resource],
         surface_textures: &[&Resource],
-        signal_fence: Option<(&mut Resource, crate::FenceValue)>,
+        signal_fence: (&mut Resource, crate::FenceValue),
     ) -> DeviceResult<()> {
         Ok(())
     }

diff --git a/wgpu-hal/src/gles/egl.rs b/wgpu-hal/src/gles/egl.rs
index 5ddf9b48b5..07cd8e835d 100644
--- a/wgpu-hal/src/gles/egl.rs
+++ b/wgpu-hal/src/gles/egl.rs
@@ -1432,6 +1432,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let swapchain = self.swapchain.read();
         let sc = swapchain.as_ref().unwrap();

diff --git a/wgpu-hal/src/gles/queue.rs b/wgpu-hal/src/gles/queue.rs
index f6b55a449a..95eff36d57 100644
--- a/wgpu-hal/src/gles/queue.rs
+++ b/wgpu-hal/src/gles/queue.rs
@@ -1740,7 +1740,7 @@ impl crate::Queue for super::Queue {
         &self,
         command_buffers: &[&super::CommandBuffer],
         _surface_textures: &[&super::Texture],
-        signal_fence: Option<(&mut super::Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut super::Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         let shared = Arc::clone(&self.shared);
         let gl = &shared.context.lock();
@@ -1774,12 +1774,10 @@ impl crate::Queue for super::Queue {
             }
         }

-        if let Some((fence, value)) = signal_fence {
-            fence.maintain(gl);
-            let sync = unsafe { gl.fence_sync(glow::SYNC_GPU_COMMANDS_COMPLETE, 0) }
-                .map_err(|_| crate::DeviceError::OutOfMemory)?;
-            fence.pending.push((value, sync));
-        }
+        signal_fence.maintain(gl);
+        let sync = unsafe { gl.fence_sync(glow::SYNC_GPU_COMMANDS_COMPLETE, 0) }
+            .map_err(|_| crate::DeviceError::OutOfMemory)?;
+        signal_fence.pending.push((signal_value, sync));

         Ok(())
     }

diff --git a/wgpu-hal/src/gles/web.rs b/wgpu-hal/src/gles/web.rs
index ab2ccef8b6..081f7da5d1 100644
--- a/wgpu-hal/src/gles/web.rs
+++ b/wgpu-hal/src/gles/web.rs
@@ -427,6 +427,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let swapchain = self.swapchain.read();
         let sc = swapchain.as_ref().unwrap();

diff --git a/wgpu-hal/src/gles/wgl.rs b/wgpu-hal/src/gles/wgl.rs
index aae70478b4..1111d98f83 100644
--- a/wgpu-hal/src/gles/wgl.rs
+++ b/wgpu-hal/src/gles/wgl.rs
@@ -798,6 +798,7 @@ impl crate::Surface for Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>,
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let swapchain = self.swapchain.read();
         let sc = swapchain.as_ref().unwrap();
diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs
index da3834bcb0..e81fad403f 100644
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@@ -459,44 +459,101 @@ pub trait Instance: Sized + WasmNotSendSync {
 pub trait Surface: WasmNotSendSync {
     type A: Api;

-    /// Configures the surface to use the given device.
+    /// Configure `self` to use `device`.
     ///
     /// # Safety
     ///
-    /// - All gpu work that uses the surface must have been completed.
+    /// - All GPU work using `self` must have been completed.
     /// - All [`AcquiredSurfaceTexture`]s must have been destroyed.
     /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed.
-    /// - All surfaces created using other devices must have been unconfigured before this call.
+    /// - The surface `self` must not currently be configured to use any other [`Device`].
     unsafe fn configure(
         &self,
         device: &<Self::A as Api>::Device,
         config: &SurfaceConfiguration,
     ) -> Result<(), SurfaceError>;

-    /// Unconfigures the surface on the given device.
+    /// Unconfigure `self` on `device`.
     ///
     /// # Safety
    ///
-    /// - All gpu work that uses the surface must have been completed.
+    /// - All GPU work that uses `surface` must have been completed.
     /// - All [`AcquiredSurfaceTexture`]s must have been destroyed.
     /// - All [`Api::TextureView`]s derived from the [`AcquiredSurfaceTexture`]s must have been destroyed.
-    /// - The surface must have been configured on the given device.
+    /// - The surface `self` must have been configured on `device`.
     unsafe fn unconfigure(&self, device: &<Self::A as Api>::Device);

-    /// Returns the next texture to be presented by the swapchain for drawing
+    /// Return the next texture to be presented by `self`, for the caller to draw on.
     ///
-    /// A `timeout` of `None` means to wait indefinitely, with no timeout.
+    /// On success, return an [`AcquiredSurfaceTexture`] representing the
+    /// texture into which the caller should draw the image to be displayed on
+    /// `self`.
+    ///
+    /// If `timeout` elapses before `self` has a texture ready to be acquired,
+    /// return `Ok(None)`. If `timeout` is `None`, wait indefinitely, with no
+    /// timeout.
+    ///
+    /// # Using an [`AcquiredSurfaceTexture`]
+    ///
+    /// On success, this function returns an [`AcquiredSurfaceTexture`] whose
+    /// [`texture`] field is a [`SurfaceTexture`] from which the caller can
+    /// [`borrow`] a [`Texture`] to draw on. The [`AcquiredSurfaceTexture`] also
+    /// carries some metadata about that [`SurfaceTexture`].
+    ///
+    /// All calls to [`Queue::submit`] that draw on that [`Texture`] must also
+    /// include the [`SurfaceTexture`] in the `surface_textures` argument.
+    ///
+    /// When you are done drawing on the texture, you can display it on `self`
+    /// by passing the [`SurfaceTexture`] and `self` to [`Queue::present`].
+    ///
+    /// If you do not wish to display the texture, you must pass the
+    /// [`SurfaceTexture`] to [`self.discard_texture`], so that it can be reused
+    /// by future acquisitions.
     ///
     /// # Portability
     ///
-    /// Some backends can't support a timeout when acquiring a texture and
-    /// the timeout will be ignored.
+    /// Some backends can't support a timeout when acquiring a texture. On these
+    /// backends, `timeout` is ignored.
     ///
-    /// Returns `None` on timing out.
+    /// # Safety
+    ///
+    /// - The surface `self` must currently be configured on some [`Device`].
+    ///
+    /// - The `fence` argument must be the same [`Fence`] passed to all calls to
+    ///   [`Queue::submit`] that used [`Texture`]s acquired from this surface.
+    ///
+    /// - You may only have one texture acquired from `self` at a time. When
+    ///   `acquire_texture` returns `Ok(Some(ast))`, you must pass the returned
+    ///   [`SurfaceTexture`] `ast.texture` to either [`Queue::present`] or
+    ///   [`Surface::discard_texture`] before calling `acquire_texture` again.
+    ///
+    /// [`texture`]: AcquiredSurfaceTexture::texture
+    /// [`SurfaceTexture`]: Api::SurfaceTexture
+    /// [`borrow`]: std::borrow::Borrow::borrow
+    /// [`Texture`]: Api::Texture
+    /// [`Fence`]: Api::Fence
+    /// [`self.discard_texture`]: Surface::discard_texture
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        fence: &<Self::A as Api>::Fence,
     ) -> Result<Option<AcquiredSurfaceTexture<Self::A>>, SurfaceError>;
+
+    /// Relinquish an acquired texture without presenting it.
+    ///
+    /// After this call, the texture underlying [`SurfaceTexture`] may be
+    /// returned by subsequent calls to [`self.acquire_texture`].
+    ///
+    /// # Safety
+    ///
+    /// - The surface `self` must currently be configured on some [`Device`].
+    ///
+    /// - `texture` must be a [`SurfaceTexture`] returned by a call to
+    ///   [`self.acquire_texture`] that has not yet been passed to
+    ///   [`Queue::present`].
+    ///
+    /// [`SurfaceTexture`]: Api::SurfaceTexture
+    /// [`self.acquire_texture`]: Surface::acquire_texture
     unsafe fn discard_texture(&self, texture: <Self::A as Api>::SurfaceTexture);
 }

@@ -762,19 +819,23 @@ pub trait Queue: WasmNotSendSync {
     /// Submit `command_buffers` for execution on GPU.
     ///
-    /// If `signal_fence` is `Some(fence, value)`, update `fence` to `value`
-    /// when the operation is complete. See [`Fence`] for details.
+    /// Update `fence` to `value` when the operation is complete. See
+    /// [`Fence`] for details.
+    ///
+    /// A `wgpu_hal` queue is "single threaded": all command buffers are
+    /// executed in the order they're submitted, with each buffer able to see
+    /// previous buffers' results. Specifically:
     ///
-    /// If two calls to `submit` on a single `Queue` occur in a particular order
-    /// (that is, they happen on the same thread, or on two threads that have
-    /// synchronized to establish an ordering), then the first submission's
-    /// commands all complete execution before any of the second submission's
-    /// commands begin. All results produced by one submission are visible to
-    /// the next.
+    /// - If two calls to `submit` on a single `Queue` occur in a particular
+    ///   order (that is, they happen on the same thread, or on two threads that
+    ///   have synchronized to establish an ordering), then the first
+    ///   submission's commands all complete execution before any of the second
+    ///   submission's commands begin. All results produced by one submission
+    ///   are visible to the next.
     ///
-    /// Within a submission, command buffers execute in the order in which they
-    /// appear in `command_buffers`. All results produced by one buffer are
-    /// visible to the next.
+    /// - Within a submission, command buffers execute in the order in which they
+    ///   appear in `command_buffers`. All results produced by one buffer are
+    ///   visible to the next.
     ///
     /// If two calls to `submit` on a single `Queue` from different threads are
     /// not synchronized to occur in a particular order, they must pass distinct
@@ -803,10 +864,16 @@ pub trait Queue: WasmNotSendSync {
     /// - Every [`SurfaceTexture`][st] that any command in `command_buffers`
     ///   writes to must appear in the `surface_textures` argument.
     ///
+    /// - No [`SurfaceTexture`][st] may appear in the `surface_textures`
+    ///   argument more than once.
+    ///
     /// - Each [`SurfaceTexture`][st] in `surface_textures` must be configured
     ///   for use with the [`Device`][d] associated with this [`Queue`],
     ///   typically by calling [`Surface::configure`].
     ///
+    /// - All calls to this function that include a given [`SurfaceTexture`][st]
+    ///   in `surface_textures` must use the same [`Fence`].
+    ///
     /// [`Fence`]: Api::Fence
     /// [cb]: Api::CommandBuffer
     /// [ce]: Api::CommandEncoder
@@ -819,7 +886,7 @@ pub trait Queue: WasmNotSendSync {
         &self,
         command_buffers: &[&<Self::A as Api>::CommandBuffer],
         surface_textures: &[&<Self::A as Api>::SurfaceTexture],
-        signal_fence: Option<(&mut <Self::A as Api>::Fence, FenceValue)>,
+        signal_fence: (&mut <Self::A as Api>::Fence, FenceValue),
     ) -> Result<(), DeviceError>;
     unsafe fn present(
         &self,
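
The new `Surface::acquire_texture` / `Queue::submit` contract above is easiest to see end to end. The following is an editorial sketch of one frame under the new signatures, not part of the patch itself: `draw_frame` and its parameters are hypothetical names, and error plumbing is simplified.

```rust
use wgpu_hal as hal;
use hal::{Api, Queue as _, Surface as _};

/// Hypothetical single-frame driver for any `hal::Api` backend.
unsafe fn draw_frame<A: Api>(
    surface: &A::Surface,
    queue: &A::Queue,
    fence: &mut A::Fence,
    fence_value: hal::FenceValue,
    cmd_buf: &A::CommandBuffer,
) -> Result<(), hal::SurfaceError> {
    // Acquire the next swapchain texture, passing the same fence that
    // `submit` below will signal. Backends may block here until earlier
    // submissions that drew to this image have finished.
    let ast = surface
        .acquire_texture(None, fence)?
        .expect("timeout is None, so acquisition should not time out");

    // Any submission that draws to the acquired texture must list it in
    // `surface_textures`, and must signal the fence that was passed to
    // `acquire_texture`.
    queue
        .submit(&[cmd_buf], &[&ast.texture], (fence, fence_value))
        .map_err(hal::SurfaceError::Device)?;

    // Hand the texture to the presentation engine. If we decided not to
    // show this frame, we would have to call
    // `surface.discard_texture(ast.texture)` instead of presenting.
    queue.present(surface, ast.texture)
}
```

The key invariant is that the fence passed to `acquire_texture` is the same fence every submission touching that surface signals; that is what lets a backend block inside `acquire_texture` until the image is truly reusable.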
diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs
index ce8e015924..1867d7de44 100644
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@@ -377,38 +377,37 @@ impl crate::Queue for Queue {
         &self,
         command_buffers: &[&CommandBuffer],
         _surface_textures: &[&SurfaceTexture],
-        signal_fence: Option<(&mut Fence, crate::FenceValue)>,
+        (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         objc::rc::autoreleasepool(|| {
-            let extra_command_buffer = match signal_fence {
-                Some((fence, value)) => {
-                    let completed_value = Arc::clone(&fence.completed_value);
-                    let block = block::ConcreteBlock::new(move |_cmd_buf| {
-                        completed_value.store(value, atomic::Ordering::Release);
-                    })
-                    .copy();
-
-                    let raw = match command_buffers.last() {
-                        Some(&cmd_buf) => cmd_buf.raw.to_owned(),
-                        None => {
-                            let queue = self.raw.lock();
-                            queue
-                                .new_command_buffer_with_unretained_references()
-                                .to_owned()
-                        }
-                    };
-                    raw.set_label("(wgpu internal) Signal");
-                    raw.add_completed_handler(&block);
-
-                    fence.maintain();
-                    fence.pending_command_buffers.push((value, raw.to_owned()));
-                    // only return an extra one if it's extra
-                    match command_buffers.last() {
-                        Some(_) => None,
-                        None => Some(raw),
+            let extra_command_buffer = {
+                let completed_value = Arc::clone(&signal_fence.completed_value);
+                let block = block::ConcreteBlock::new(move |_cmd_buf| {
+                    completed_value.store(signal_value, atomic::Ordering::Release);
+                })
+                .copy();
+
+                let raw = match command_buffers.last() {
+                    Some(&cmd_buf) => cmd_buf.raw.to_owned(),
+                    None => {
+                        let queue = self.raw.lock();
+                        queue
+                            .new_command_buffer_with_unretained_references()
+                            .to_owned()
                     }
+                };
+                raw.set_label("(wgpu internal) Signal");
+                raw.add_completed_handler(&block);
+
+                signal_fence.maintain();
+                signal_fence
+                    .pending_command_buffers
+                    .push((signal_value, raw.to_owned()));
+                // only return an extra one if it's extra
+                match command_buffers.last() {
+                    Some(_) => None,
+                    None => Some(raw),
                 }
-                None => None,
             };

             for cmd_buffer in command_buffers {

diff --git a/wgpu-hal/src/metal/surface.rs b/wgpu-hal/src/metal/surface.rs
index e1eb6d5b23..1a11056609 100644
--- a/wgpu-hal/src/metal/surface.rs
+++ b/wgpu-hal/src/metal/surface.rs
@@ -242,6 +242,7 @@ impl crate::Surface for super::Surface {
     unsafe fn acquire_texture(
         &self,
         _timeout_ms: Option<std::time::Duration>, //TODO
+        _fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let render_layer = self.render_layer.lock();
         let (drawable, texture) = match autoreleasepool(|| {

diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index 6df999084f..fe2a6f9707 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -3,11 +3,7 @@ use super::conv;
 use ash::{amd, ext, khr, vk};
 use parking_lot::Mutex;

-use std::{
-    collections::BTreeMap,
-    ffi::CStr,
-    sync::{atomic::AtomicIsize, Arc},
-};
+use std::{collections::BTreeMap, ffi::CStr, sync::Arc};

 fn depth_stencil_required_flags() -> vk::FormatFeatureFlags {
     vk::FormatFeatureFlags::SAMPLED_IMAGE | vk::FormatFeatureFlags::DEPTH_STENCIL_ATTACHMENT
@@ -1783,21 +1779,15 @@ impl super::Adapter {
             render_passes: Mutex::new(Default::default()),
             framebuffers: Mutex::new(Default::default()),
         });
-        let mut relay_semaphores = [vk::Semaphore::null(); 2];
-        for sem in relay_semaphores.iter_mut() {
-            unsafe {
-                *sem = shared
-                    .raw
-                    .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)?
-            };
-        }
+
+        let relay_semaphores = super::RelaySemaphores::new(&shared)?;
+
         let queue = super::Queue {
             raw: raw_queue,
             swapchain_fn,
             device: Arc::clone(&shared),
             family_index,
-            relay_semaphores,
-            relay_index: AtomicIsize::new(-1),
+            relay_semaphores: Mutex::new(relay_semaphores),
         };

         let mem_allocator = {
diff --git a/wgpu-hal/src/vulkan/device.rs b/wgpu-hal/src/vulkan/device.rs
index 1ea627897f..867b7efb23 100644
--- a/wgpu-hal/src/vulkan/device.rs
+++ b/wgpu-hal/src/vulkan/device.rs
@@ -612,17 +612,16 @@ impl super::Device {
         let images =
             unsafe { functor.get_swapchain_images(raw) }.map_err(crate::DeviceError::from)?;

-        // NOTE: It's important that we define at least images.len() + 1 wait
+        // NOTE: It's important that we define at least images.len() wait
         // semaphores, since we prospectively need to provide the call to
         // acquire the next image with an unsignaled semaphore.
-        let surface_semaphores = (0..images.len() + 1)
-            .map(|_| unsafe {
-                self.shared
-                    .raw
-                    .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)
+        let surface_semaphores = (0..=images.len())
+            .map(|_| {
+                super::SwapchainImageSemaphores::new(&self.shared)
+                    .map(Mutex::new)
+                    .map(Arc::new)
             })
-            .collect::<Result<Vec<_>, _>>()
-            .map_err(crate::DeviceError::from)?;
+            .collect::<Result<Vec<_>, _>>()?;

         Ok(super::Swapchain {
             raw,
@@ -633,7 +632,7 @@ impl super::Device {
             config: config.clone(),
             view_formats: wgt_view_formats,
             surface_semaphores,
-            next_surface_index: 0,
+            next_semaphore_index: 0,
         })
     }

@@ -836,9 +835,12 @@ impl crate::Device for super::Device {
     unsafe fn exit(self, queue: super::Queue) {
         unsafe { self.mem_allocator.into_inner().cleanup(&*self.shared) };
         unsafe { self.desc_allocator.into_inner().cleanup(&*self.shared) };
-        for &sem in queue.relay_semaphores.iter() {
-            unsafe { self.shared.raw.destroy_semaphore(sem, None) };
-        }
+        unsafe {
+            queue
+                .relay_semaphores
+                .into_inner()
+                .destroy(&self.shared.raw)
+        };
         unsafe { self.shared.free_resources() };
     }

@@ -2055,54 +2057,7 @@ impl crate::Device for super::Device {
         timeout_ms: u32,
     ) -> Result<bool, crate::DeviceError> {
         let timeout_ns = timeout_ms as u64 * super::MILLIS_TO_NANOS;
-        match *fence {
-            super::Fence::TimelineSemaphore(raw) => {
-                let semaphores = [raw];
-                let values = [wait_value];
-                let vk_info = vk::SemaphoreWaitInfo::default()
-                    .semaphores(&semaphores)
-                    .values(&values);
-                let result = match self.shared.extension_fns.timeline_semaphore {
-                    Some(super::ExtensionFn::Extension(ref ext)) => unsafe {
-                        ext.wait_semaphores(&vk_info, timeout_ns)
-                    },
-                    Some(super::ExtensionFn::Promoted) => unsafe {
-                        self.shared.raw.wait_semaphores(&vk_info, timeout_ns)
-                    },
-                    None => unreachable!(),
-                };
-                match result {
-                    Ok(()) => Ok(true),
-                    Err(vk::Result::TIMEOUT) => Ok(false),
-                    Err(other) => Err(other.into()),
-                }
-            }
-            super::Fence::FencePool {
-                last_completed,
-                ref active,
-                free: _,
-            } => {
-                if wait_value <= last_completed {
-                    Ok(true)
-                } else {
-                    match active.iter().find(|&&(value, _)| value >= wait_value) {
-                        Some(&(_, raw)) => {
-                            match unsafe {
-                                self.shared.raw.wait_for_fences(&[raw], true, timeout_ns)
-                            } {
-                                Ok(()) => Ok(true),
-                                Err(vk::Result::TIMEOUT) => Ok(false),
-                                Err(other) => Err(other.into()),
-                            }
-                        }
-                        None => {
-                            log::error!("No signals reached value {}", wait_value);
-                            Err(crate::DeviceError::Lost)
-                        }
-                    }
-                }
-            }
-        }
+        self.shared.wait_for_fence(fence, wait_value, timeout_ns)
     }

     unsafe fn start_capture(&self) -> bool {
@@ -2364,6 +2319,71 @@ impl crate::Device for super::Device {
     }
 }

+impl super::DeviceShared {
+    pub(super) fn new_binary_semaphore(&self) -> Result<vk::Semaphore, crate::DeviceError> {
+        unsafe {
+            self.raw
+                .create_semaphore(&vk::SemaphoreCreateInfo::default(), None)
+                .map_err(crate::DeviceError::from)
+        }
+    }
+
+    pub(super) fn wait_for_fence(
+        &self,
+        fence: &super::Fence,
+        wait_value: crate::FenceValue,
+        timeout_ns: u64,
+    ) -> Result<bool, crate::DeviceError> {
+        profiling::scope!("Device::wait");
+        match *fence {
+            super::Fence::TimelineSemaphore(raw) => {
+                let semaphores = [raw];
+                let values = [wait_value];
+                let vk_info = vk::SemaphoreWaitInfo::default()
+                    .semaphores(&semaphores)
+                    .values(&values);
+                let result = match self.extension_fns.timeline_semaphore {
+                    Some(super::ExtensionFn::Extension(ref ext)) => unsafe {
+                        ext.wait_semaphores(&vk_info, timeout_ns)
+                    },
+                    Some(super::ExtensionFn::Promoted) => unsafe {
+                        self.raw.wait_semaphores(&vk_info, timeout_ns)
+                    },
+                    None => unreachable!(),
+                };
+                match result {
+                    Ok(()) => Ok(true),
+                    Err(vk::Result::TIMEOUT) => Ok(false),
+                    Err(other) => Err(other.into()),
+                }
+            }
+            super::Fence::FencePool {
+                last_completed,
+                ref active,
+                free: _,
+            } => {
+                if wait_value <= last_completed {
+                    Ok(true)
+                } else {
+                    match active.iter().find(|&&(value, _)| value >= wait_value) {
+                        Some(&(_, raw)) => {
+                            match unsafe { self.raw.wait_for_fences(&[raw], true, timeout_ns) } {
+                                Ok(()) => Ok(true),
+                                Err(vk::Result::TIMEOUT) => Ok(false),
+                                Err(other) => Err(other.into()),
+                            }
+                        }
+                        None => {
+                            log::error!("No signals reached value {}", wait_value);
+                            Err(crate::DeviceError::Lost)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 impl From<gpu_alloc::AllocationError> for crate::DeviceError {
     fn from(error: gpu_alloc::AllocationError) -> Self {
         use gpu_alloc::AllocationError as Ae;
diff --git a/wgpu-hal/src/vulkan/instance.rs b/wgpu-hal/src/vulkan/instance.rs
index 6f471f8905..18acaeabb9 100644
--- a/wgpu-hal/src/vulkan/instance.rs
+++ b/wgpu-hal/src/vulkan/instance.rs
@@ -164,10 +164,14 @@ impl super::Swapchain {
             let _ = unsafe { device.device_wait_idle() };
         };

+        // We cannot take this by value, as the function returns `self`.
         for semaphore in self.surface_semaphores.drain(..) {
-            unsafe {
-                device.destroy_semaphore(semaphore, None);
-            }
+            let arc_removed = Arc::into_inner(semaphore).expect(
+                "Trying to destroy a SurfaceSemaphores that is still in use by a SurfaceTexture",
+            );
+            let mutex_removed = arc_removed.into_inner();
+
+            unsafe { mutex_removed.destroy(device) };
         }

         self
@@ -966,9 +970,10 @@ impl crate::Surface for super::Surface {
     unsafe fn acquire_texture(
         &self,
         timeout: Option<std::time::Duration>,
+        fence: &super::Fence,
     ) -> Result<Option<crate::AcquiredSurfaceTexture<super::Api>>, crate::SurfaceError> {
         let mut swapchain = self.swapchain.write();
-        let sc = swapchain.as_mut().unwrap();
+        let swapchain = swapchain.as_mut().unwrap();

         let mut timeout_ns = match timeout {
             Some(duration) => duration.as_nanos() as u64,
@@ -988,12 +993,40 @@ impl crate::Surface for super::Surface {
             timeout_ns = u64::MAX;
         }

-        let wait_semaphore = sc.surface_semaphores[sc.next_surface_index];
+        let swapchain_semaphores_arc = swapchain.get_surface_semaphores();
+        // Nothing should be using this, so we don't block, but panic if we fail to lock.
+        let locked_swapchain_semaphores = swapchain_semaphores_arc
+            .try_lock()
+            .expect("Failed to lock a SwapchainSemaphores.");
+
+        // Wait for all commands writing to the previously acquired image to
+        // complete.
+        //
+        // Almost all the steps in the usual acquire-draw-present flow are
+        // asynchronous: they get something started on the presentation engine
+        // or the GPU, but on the CPU, control returns immediately. Without some
+        // sort of intervention, the CPU could crank out frames much faster than
+        // the presentation engine can display them.
+        //
+        // This is the intervention: if any submissions drew on this image, and
+        // thus waited for `locked_swapchain_semaphores.acquire`, wait for all
+        // of them to finish, thus ensuring that it's okay to pass `acquire` to
+        // `vkAcquireNextImageKHR` again.
+        swapchain.device.wait_for_fence(
+            fence,
+            locked_swapchain_semaphores.previously_used_submission_index,
+            timeout_ns,
+        )?;

         // will block if no image is available
         let (index, suboptimal) = match unsafe {
-            sc.functor
-                .acquire_next_image(sc.raw, timeout_ns, wait_semaphore, vk::Fence::null())
+            profiling::scope!("vkAcquireNextImageKHR");
+            swapchain.functor.acquire_next_image(
+                swapchain.raw,
+                timeout_ns,
+                locked_swapchain_semaphores.acquire,
+                vk::Fence::null(),
+            )
         } {
             // We treat `VK_SUBOPTIMAL_KHR` as `VK_SUCCESS` on Android.
             // See the comment in `Queue::present`.
@@ -1013,16 +1046,18 @@ impl crate::Surface for super::Surface {
             }
         };

-        sc.next_surface_index += 1;
-        sc.next_surface_index %= sc.surface_semaphores.len();
+        drop(locked_swapchain_semaphores);
+        // We only advance the surface semaphores if we successfully acquired an image, otherwise
+        // we should try to re-acquire using the same semaphores.
+        swapchain.advance_surface_semaphores();

         // special case for Intel Vulkan returning bizarre values (ugh)
-        if sc.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 {
+        if swapchain.device.vendor_id == crate::auxil::db::intel::VENDOR && index > 0x100 {
             return Err(crate::SurfaceError::Outdated);
         }

         // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-framebuffer-03209
-        let raw_flags = if sc
+        let raw_flags = if swapchain
             .raw_flags
             .contains(vk::SwapchainCreateFlagsKHR::MUTABLE_FORMAT)
         {
@@ -1034,20 +1069,20 @@ impl crate::Surface for super::Surface {
         let texture = super::SurfaceTexture {
             index,
             texture: super::Texture {
-                raw: sc.images[index as usize],
+                raw: swapchain.images[index as usize],
                 drop_guard: None,
                 block: None,
-                usage: sc.config.usage,
-                format: sc.config.format,
+                usage: swapchain.config.usage,
+                format: swapchain.config.format,
                 raw_flags,
                 copy_size: crate::CopyExtent {
-                    width: sc.config.extent.width,
-                    height: sc.config.extent.height,
+                    width: swapchain.config.extent.width,
+                    height: swapchain.config.extent.height,
                     depth: 1,
                 },
-                view_formats: sc.view_formats.clone(),
+                view_formats: swapchain.view_formats.clone(),
             },
-            wait_semaphore,
+            surface_semaphores: swapchain_semaphores_arc,
         };
         Ok(Some(crate::AcquiredSurfaceTexture {
             texture,
diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs
index 1716ee9206..40e7a2cb42 100644
--- a/wgpu-hal/src/vulkan/mod.rs
+++ b/wgpu-hal/src/vulkan/mod.rs
@@ -33,13 +33,11 @@ mod instance;

 use std::{
     borrow::Borrow,
+    collections::HashSet,
     ffi::{CStr, CString},
-    fmt,
+    fmt, mem,
     num::NonZeroU32,
-    sync::{
-        atomic::{AtomicIsize, Ordering},
-        Arc,
-    },
+    sync::Arc,
 };

 use arrayvec::ArrayVec;
@@ -147,6 +145,173 @@ pub struct Instance {
     shared: Arc<InstanceShared>,
 }

+/// The semaphores needed to use one image in a swapchain.
+#[derive(Debug)]
+struct SwapchainImageSemaphores {
+    /// A semaphore that is signaled when this image is safe for us to modify.
+    ///
+    /// When [`vkAcquireNextImageKHR`] returns the index of the next swapchain
+    /// image that we should use, that image may actually still be in use by the
+    /// presentation engine, and is not yet safe to modify. However, that
+    /// function does accept a semaphore that it will signal when the image is
+    /// indeed safe to begin messing with.
+    ///
+    /// This semaphore is:
+    ///
+    /// - waited for by the first queue submission to operate on this image
+    ///   since it was acquired, and
+    ///
+    /// - signaled by [`vkAcquireNextImageKHR`] when the acquired image is ready
+    ///   for us to use.
+    ///
+    /// [`vkAcquireNextImageKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkAcquireNextImageKHR
+    acquire: vk::Semaphore,
+
+    /// True if the next command submission operating on this image should wait
+    /// for [`acquire`].
+    ///
+    /// We must wait for `acquire` before drawing to this swapchain image, but
+    /// because `wgpu-hal` queue submissions are always strongly ordered, only
+    /// the first submission that works with a swapchain image actually needs to
+    /// wait. We set this flag when this image is acquired, and clear it the
+    /// first time it's passed to [`Queue::submit`] as a surface texture.
+    ///
+    /// [`acquire`]: SwapchainImageSemaphores::acquire
+    /// [`Queue::submit`]: crate::Queue::submit
+    should_wait_for_acquire: bool,
+
+    /// A pool of semaphores for ordering presentation after drawing.
+    ///
+    /// The first [`present_index`] semaphores in this vector are:
+    ///
+    /// - all waited on by the call to [`vkQueuePresentKHR`] that presents this
+    ///   image, and
+    ///
+    /// - each signaled by some [`vkQueueSubmit`] queue submission that draws to
+    ///   this image, when the submission finishes execution.
+    ///
+    /// This vector accumulates one semaphore per submission that writes to this
+    /// image. This is awkward, but hard to avoid: [`vkQueuePresentKHR`]
+    /// requires a semaphore to order it with respect to drawing commands, and
+    /// we can't attach new completion semaphores to a command submission after
+    /// it's been submitted. This means that, at submission time, we must create
+    /// the semaphore we might need if the caller's next action is to enqueue a
+    /// presentation of this image.
+    ///
+    /// An alternative strategy would be for presentation to enqueue an empty
+    /// submit, ordered relative to other submits in the usual way, and
+    /// signaling a single presentation semaphore. But we suspect that submits
+    /// are usually expensive enough, and semaphores usually cheap enough, that
+    /// performance-sensitive users will avoid making many submits, so that the
+    /// cost of accumulated semaphores will usually be less than the cost of an
+    /// additional submit.
+    ///
+    /// Only the first [`present_index`] semaphores in the vector are actually
+    /// going to be signalled by submitted commands, and need to be waited for
+    /// by the next present call. Any semaphores beyond that index were created
+    /// for prior presents and are simply being retained for recycling.
+    ///
+    /// [`present_index`]: SwapchainImageSemaphores::present_index
+    /// [`vkQueuePresentKHR`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueuePresentKHR
+    /// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+    present: Vec<vk::Semaphore>,
+
+    /// The number of semaphores in [`present`] to be signalled for this submission.
+    ///
+    /// [`present`]: SwapchainImageSemaphores::present
+    present_index: usize,
+
+    /// The fence value of the last command submission that wrote to this image.
+    ///
+    /// The next time we try to acquire this image, we'll block until
+    /// this submission finishes, proving that [`acquire`] is ready to
+    /// pass to `vkAcquireNextImageKHR` again.
+    ///
+    /// [`acquire`]: SwapchainImageSemaphores::acquire
+    previously_used_submission_index: crate::FenceValue,
+}
+
+impl SwapchainImageSemaphores {
+    fn new(device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        Ok(Self {
+            acquire: device.new_binary_semaphore()?,
+            should_wait_for_acquire: true,
+            present: Vec::new(),
+            present_index: 0,
+            previously_used_submission_index: 0,
+        })
+    }
+
+    fn set_used_fence_value(&mut self, value: crate::FenceValue) {
+        self.previously_used_submission_index = value;
+    }
+
+    /// Return the semaphore that commands drawing to this image should wait for, if any.
+    ///
+    /// This only returns `Some` once per acquisition; see
+    /// [`SwapchainImageSemaphores::should_wait_for_acquire`] for details.
+    fn get_acquire_wait_semaphore(&mut self) -> Option<vk::Semaphore> {
+        if self.should_wait_for_acquire {
+            self.should_wait_for_acquire = false;
+            Some(self.acquire)
+        } else {
+            None
+        }
+    }
+
+    /// Return a semaphore that a submission that writes to this image should
+    /// signal when it's done.
+    ///
+    /// See [`SwapchainImageSemaphores::present`] for details.
+    fn get_submit_signal_semaphore(
+        &mut self,
+        device: &DeviceShared,
+    ) -> Result<vk::Semaphore, crate::DeviceError> {
+        // Try to recycle a semaphore we created for a previous presentation.
+        let sem = match self.present.get(self.present_index) {
+            Some(sem) => *sem,
+            None => {
+                let sem = device.new_binary_semaphore()?;
+                self.present.push(sem);
+                sem
+            }
+        };
+
+        self.present_index += 1;
+
+        Ok(sem)
+    }
+
+    /// Return the semaphores that a presentation of this image should wait on.
+    ///
+    /// Return a slice of semaphores that the call to [`vkQueueSubmit`] that
+    /// ends this image's acquisition should wait for. See
+    /// [`SwapchainImageSemaphores::present`] for details.
+    ///
+    /// Reset `self` to be ready for the next acquisition cycle.
+    ///
+    /// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+    fn get_present_wait_semaphores(&mut self) -> &[vk::Semaphore] {
+        let old_index = self.present_index;
+
+        // Since this marks the end of this acquire/draw/present cycle, take the
+        // opportunity to reset `self` in preparation for the next acquisition.
+        self.present_index = 0;
+        self.should_wait_for_acquire = true;
+
+        &self.present[0..old_index]
+    }
+
+    unsafe fn destroy(&self, device: &ash::Device) {
+        unsafe {
+            device.destroy_semaphore(self.acquire, None);
+            for sem in &self.present {
+                device.destroy_semaphore(*sem, None);
+            }
+        }
+    }
+}
+
 struct Swapchain {
     raw: vk::SwapchainKHR,
     raw_flags: vk::SwapchainCreateFlagsKHR,
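
To make the life cycle above concrete, here is an editorial sketch, not part of the patch, of the order in which one image's semaphore bundle is exercised during a frame. `sem`, `device`, and `submission_index` are stand-in names, and this would only compile inside wgpu-hal's `vulkan` module, since the types involved are private.

```rust
// Walkthrough of a freshly created (or just-reset) SwapchainImageSemaphores
// across one acquire/draw/present cycle.
fn semaphore_lifecycle_walkthrough(
    sem: &mut SwapchainImageSemaphores,
    device: &DeviceShared,
    submission_index: crate::FenceValue,
) -> Result<(), crate::DeviceError> {
    // 1. acquire_texture passes `sem.acquire` to vkAcquireNextImageKHR, which
    //    signals it once the presentation engine is done with the image.

    // 2. The first submission that draws to the image waits on `acquire` ...
    assert!(sem.get_acquire_wait_semaphore().is_some());
    //    ... records the fence value it will signal ...
    sem.set_used_fence_value(submission_index);
    //    ... and allocates (or recycles) a semaphore for presentation to wait on.
    let _signal_a = sem.get_submit_signal_semaphore(device)?;

    // 3. A second submission touching the same image does not wait on
    //    `acquire` again; queue submissions are already strongly ordered.
    assert!(sem.get_acquire_wait_semaphore().is_none());
    let _signal_b = sem.get_submit_signal_semaphore(device)?;

    // 4. Presentation waits on both signal semaphores, and this call resets
    //    the bundle for the next acquire/draw/present cycle.
    assert_eq!(sem.get_present_wait_semaphores().len(), 2);
    Ok(())
}
```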
@@ -157,9 +322,25 @@ struct Swapchain {
     view_formats: Vec<wgt::TextureFormat>,
     /// One wait semaphore per swapchain image. This will be associated with the
     /// surface texture, and later collected during submission.
-    surface_semaphores: Vec<vk::Semaphore>,
-    /// Current semaphore index to use when acquiring a surface.
-    next_surface_index: usize,
+    ///
+    /// We need this to be `Arc<Mutex<..>>` because we need to be able to pass this
+    /// data into the surface texture, so submit/present can use it.
+    surface_semaphores: Vec<Arc<Mutex<SwapchainImageSemaphores>>>,
+    /// The index of the next semaphore to use. Ideally we would use the same
+    /// index as the image index, but we need to specify the semaphore as an argument
+    /// to the acquire_next_image function which is what tells us which image to use.
+    next_semaphore_index: usize,
+}
+
+impl Swapchain {
+    fn advance_surface_semaphores(&mut self) {
+        let semaphore_count = self.surface_semaphores.len();
+        self.next_semaphore_index = (self.next_semaphore_index + 1) % semaphore_count;
+    }
+
+    fn get_surface_semaphores(&self) -> Arc<Mutex<SwapchainImageSemaphores>> {
+        self.surface_semaphores[self.next_semaphore_index].clone()
+    }
 }

 pub struct Surface {
@@ -173,7 +354,7 @@ pub struct Surface {
 pub struct SurfaceTexture {
     index: u32,
     texture: Texture,
-    wait_semaphore: vk::Semaphore,
+    surface_semaphores: Arc<Mutex<SwapchainImageSemaphores>>,
 }

 impl Borrow<Texture> for SurfaceTexture {
@@ -359,18 +540,87 @@ pub struct Device {
     render_doc: crate::auxil::renderdoc::RenderDoc,
 }

+/// Semaphores for forcing queue submissions to run in order.
+///
+/// The [`wgpu_hal::Queue`] trait promises that if two calls to [`submit`] are
+/// ordered, then the first submission will finish on the GPU before the second
+/// submission begins. To get this behavior on Vulkan we need to pass semaphores
+/// to [`vkQueueSubmit`] for the commands to wait on before beginning execution,
+/// and to signal when their execution is done.
+///
+/// Normally this can be done with a single semaphore, waited on and then
+/// signalled for each submission. At any given time there's exactly one
+/// submission that would signal the semaphore, and exactly one waiting on it,
+/// as Vulkan requires.
+///
+/// However, as of Oct 2021, bug [#5508] in the Mesa ANV drivers caused them to
+/// hang if we use a single semaphore. The workaround is to alternate between
+/// two semaphores. The bug has been fixed in Mesa, but we should probably keep
+/// the workaround until, say, Oct 2026.
+///
+/// [`wgpu_hal::Queue`]: crate::Queue
+/// [`submit`]: crate::Queue::submit
+/// [`vkQueueSubmit`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#vkQueueSubmit
+/// [#5508]: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508
+#[derive(Clone)]
+struct RelaySemaphores {
+    /// The semaphore the next submission should wait on before beginning
+    /// execution on the GPU. This is `None` for the first submission, which
+    /// should not wait on anything at all.
+    wait: Option<vk::Semaphore>,
+
+    /// The semaphore the next submission should signal when it has finished
+    /// execution on the GPU.
+    signal: vk::Semaphore,
+}
+
+impl RelaySemaphores {
+    fn new(device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        Ok(Self {
+            wait: None,
+            signal: device.new_binary_semaphore()?,
+        })
+    }
+
+    /// Advances the semaphores, returning the semaphores that should be used for a submission.
+    fn advance(&mut self, device: &DeviceShared) -> Result<Self, crate::DeviceError> {
+        let old = self.clone();
+
+        // Build the state for the next submission.
+        match self.wait {
+            None => {
+                // The `old` values describe the first submission to this queue.
+                // The second submission should wait on `old.signal`, and then
+                // signal a new semaphore which we'll create now.
+                self.wait = Some(old.signal);
+                self.signal = device.new_binary_semaphore()?;
+            }
+            Some(ref mut wait) => {
+                // What this submission signals, the next should wait.
+                mem::swap(wait, &mut self.signal);
+            }
+        };
+
+        Ok(old)
+    }
+
+    /// Destroys the semaphores.
+    unsafe fn destroy(&self, device: &ash::Device) {
+        unsafe {
+            if let Some(wait) = self.wait {
+                device.destroy_semaphore(wait, None);
+            }
+            device.destroy_semaphore(self.signal, None);
+        }
+    }
+}
+
 pub struct Queue {
     raw: vk::Queue,
     swapchain_fn: khr::swapchain::Device,
     device: Arc<DeviceShared>,
     family_index: u32,
-    /// We use a redundant chain of semaphores to pass on the signal
-    /// from submissions to the last present, since it's required by the
-    /// specification.
-    /// It would be correct to use a single semaphore there, but
-    /// [Intel hangs in `anv_queue_finish`](https://gitlab.freedesktop.org/mesa/mesa/-/issues/5508).
-    relay_semaphores: [vk::Semaphore; 2],
-    relay_index: AtomicIsize,
+    relay_semaphores: Mutex<RelaySemaphores>,
 }

 #[derive(Debug)]
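
As an editorial illustration (again, not patch code), three successive submissions chain through `advance` like this; `relay` and `device` are stand-ins, and the snippet would only compile inside the `vulkan` module:

```rust
// Trace of RelaySemaphores::advance across three submissions, starting from
// a freshly created RelaySemaphores.
fn relay_walkthrough(
    relay: &mut RelaySemaphores,
    device: &DeviceShared,
) -> Result<(), crate::DeviceError> {
    let s1 = relay.advance(device)?; // first submit: waits on nothing, signals A
    let s2 = relay.advance(device)?; // second submit: waits on A, signals B
    let s3 = relay.advance(device)?; // third submit: waits on B, signals A again

    assert_eq!(s1.wait, None);
    assert_eq!(s2.wait, Some(s1.signal)); // what 1 signals, 2 waits on
    assert_eq!(s3.wait, Some(s2.signal)); // what 2 signals, 3 waits on
    assert_eq!(s3.signal, s1.signal); // only two semaphores ever alternate

    // Each binary semaphore thus always has exactly one pending signal and
    // one pending wait, which is what Vulkan requires.
    Ok(())
}
```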
+ }, + }; + active.push((signal_value, fence_raw)); } } @@ -771,7 +1052,7 @@ impl crate::Queue for Queue { let mut vk_timeline_info; - if !signal_values.is_empty() { + if self.device.private_caps.timeline_semaphores { vk_timeline_info = vk::TimelineSemaphoreSubmitInfo::default().signal_semaphore_values(&signal_values); vk_info = vk_info.push_next(&mut vk_timeline_info); @@ -793,19 +1074,14 @@ impl crate::Queue for Queue { ) -> Result<(), crate::SurfaceError> { let mut swapchain = surface.swapchain.write(); let ssc = swapchain.as_mut().unwrap(); + let mut swapchain_semaphores = texture.surface_semaphores.lock(); let swapchains = [ssc.raw]; let image_indices = [texture.index]; - let mut vk_info = vk::PresentInfoKHR::default() + let vk_info = vk::PresentInfoKHR::default() .swapchains(&swapchains) - .image_indices(&image_indices); - - let old_index = self.relay_index.swap(-1, Ordering::Relaxed); - if old_index >= 0 { - vk_info = vk_info.wait_semaphores( - &self.relay_semaphores[old_index as usize..old_index as usize + 1], - ); - } + .image_indices(&image_indices) + .wait_semaphores(swapchain_semaphores.get_present_wait_semaphores()); let suboptimal = { profiling::scope!("vkQueuePresentKHR"); From e7a528b62b20f0036721237715a8f7f74c11f401 Mon Sep 17 00:00:00 2001 From: Kevin Reid Date: Fri, 31 May 2024 18:25:42 -0700 Subject: [PATCH 9/9] Document WebGPU spec rule that an `Adapter` should be used only once. (#5764) --- wgpu/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs index e94ae27fe8..618946b1a1 100644 --- a/wgpu/src/lib.rs +++ b/wgpu/src/lib.rs @@ -2558,6 +2558,11 @@ impl Adapter { /// /// Returns the [`Device`] together with a [`Queue`] that executes command buffers. /// + /// [Per the WebGPU specification], an [`Adapter`] may only be used once to create a device. + /// If another device is wanted, call [`Instance::request_adapter()`] again to get a fresh + /// [`Adapter`]. + /// However, `wgpu` does not currently enforce this restriction. + /// /// # Arguments /// /// - `desc` - Description of the features and limits requested from the given device. @@ -2566,10 +2571,13 @@ impl Adapter { /// /// # Panics /// + /// - `request_device()` was already called on this `Adapter`. /// - Features specified by `desc` are not supported by this adapter. /// - Unsafe features were requested but not enabled when requesting the adapter. /// - Limits requested exceed the values provided by the adapter. /// - Adapter does not support all features wgpu requires to safely operate. + /// + /// [Per the WebGPU specification]: https://www.w3.org/TR/webgpu/#dom-gpuadapter-requestdevice pub fn request_device( &self, desc: &DeviceDescriptor<'_>,