From 286a14898af94ebb02dfac1145357de0b87499d5 Mon Sep 17 00:00:00 2001
From: Brad Werth <werth@efn.org>
Date: Tue, 4 Jun 2024 10:11:03 -0700
Subject: [PATCH] Remove vertex_pulling_transfrom from
 PipelineCompilationOptions.

This option was only evaluated for Metal backends, and now it's required
there so the option is going away. It is still configurable for tests
via the PipelineOptions struct, deserialized from .ron files.

This also fixes some type problems with the unpack functions in
writer.rs. Metal << operator extends operand to int-sized, which then
has to be cast back down to the real size before as_type bit conversion.
The math for the snorm values is corrected, in some cases using the
metal unpack_snorm2x16_to_float function because we can't directly
cast a bit-shifted ushort value to half.
---
 deno_webgpu/pipeline.rs                       |   3 -
 naga/CHANGELOG.md                             |   1 +
 naga/src/back/msl/mod.rs                      |   4 +-
 naga/src/back/msl/writer.rs                   |  68 +--
 tests/tests/root.rs                           |   1 +
 tests/tests/vertex_formats/draw.vert.wgsl     | 316 ++++++++++++++
 tests/tests/vertex_formats/mod.rs             | 386 ++++++++++++++++++
 tests/tests/vertex_indices/mod.rs             |  53 +--
 wgpu-core/src/device/global.rs                |   3 -
 wgpu-core/src/device/resource.rs              |   3 -
 wgpu-core/src/pipeline.rs                     |   4 -
 wgpu-hal/examples/halmark/main.rs             |   2 -
 wgpu-hal/examples/ray-traced-triangle/main.rs |   1 -
 wgpu-hal/src/lib.rs                           |   3 -
 wgpu-hal/src/metal/device.rs                  |   2 +-
 wgpu/src/backend/wgpu_core.rs                 |   6 -
 wgpu/src/lib.rs                               |   3 -
 17 files changed, 754 insertions(+), 105 deletions(-)
 create mode 100644 tests/tests/vertex_formats/draw.vert.wgsl
 create mode 100644 tests/tests/vertex_formats/mod.rs

diff --git a/deno_webgpu/pipeline.rs b/deno_webgpu/pipeline.rs
index f9257051191..86d530332f7 100644
--- a/deno_webgpu/pipeline.rs
+++ b/deno_webgpu/pipeline.rs
@@ -112,7 +112,6 @@ pub fn op_webgpu_create_compute_pipeline(
             entry_point: compute.entry_point.map(Cow::from),
             constants: Cow::Owned(compute.constants.unwrap_or_default()),
             zero_initialize_workgroup_memory: true,
-            vertex_pulling_transform: false,
         },
         cache: None,
     };
@@ -348,7 +347,6 @@ pub fn op_webgpu_create_render_pipeline(
                 constants: Cow::Owned(fragment.constants.unwrap_or_default()),
                 // Required to be true for WebGPU
                 zero_initialize_workgroup_memory: true,
-                vertex_pulling_transform: false,
             },
             targets: Cow::Owned(fragment.targets),
         })
@@ -374,7 +372,6 @@ pub fn op_webgpu_create_render_pipeline(
                 constants: Cow::Owned(args.vertex.constants.unwrap_or_default()),
                 // Required to be true for WebGPU
                 zero_initialize_workgroup_memory: true,
-                vertex_pulling_transform: false,
             },
             buffers: Cow::Owned(vertex_buffers),
         },
diff --git a/naga/CHANGELOG.md b/naga/CHANGELOG.md
index 2a00f01f86d..49cde4e2123 100644
--- a/naga/CHANGELOG.md
+++ b/naga/CHANGELOG.md
@@ -81,6 +81,7 @@ For changelogs after v0.14, see [the wgpu changelog](../CHANGELOG.md).
 - Make varyings' struct members unique. ([#2521](https://github.com/gfx-rs/naga/pull/2521)) **@evahop**
 - Add experimental vertex pulling transform flag. ([#5254](https://github.com/gfx-rs/wgpu/pull/5254)) **@bradwerth**
 - Fixup some generated MSL for vertex buffer unpack functions. ([#5829](https://github.com/gfx-rs/wgpu/pull/5829)) **@bradwerth**
+- Make vertex pulling transform on by default. ([#5773](https://github.com/gfx-rs/wgpu/pull/5773)) **@bradwerth**
 
 #### GLSL-OUT
 
diff --git a/naga/src/back/msl/mod.rs b/naga/src/back/msl/mod.rs
index 3b33ee7a716..626475debcd 100644
--- a/naga/src/back/msl/mod.rs
+++ b/naga/src/back/msl/mod.rs
@@ -354,7 +354,9 @@ pub struct PipelineOptions {
     /// to receive the vertex buffers, lengths, and vertex id as args,
     /// and bounds-check the vertex id and use the index into the
     /// vertex buffers to access attributes, rather than using Metal's
-    /// [[stage-in]] assembled attribute data.
+    /// [[stage-in]] assembled attribute data. This is true by default,
+    /// but remains configurable for use by tests via deserialization
+    /// of this struct. There is no user-facing way to set this value.
     pub vertex_pulling_transform: bool,
 
     /// vertex_buffer_mappings are used during shader translation to
diff --git a/naga/src/back/msl/writer.rs b/naga/src/back/msl/writer.rs
index 8b868970078..7ec22009bd4 100644
--- a/naga/src/back/msl/writer.rs
+++ b/naga/src/back/msl/writer.rs
@@ -3953,8 +3953,8 @@ impl<W: Write> Writer<W> {
                 )?;
                 writeln!(
                     self.out,
-                    "{}return metal::float2((float(b0) - 128.0f) / 255.0f, \
-                                            (float(b1) - 128.0f) / 255.0f);",
+                    "{}return metal::float2(metal::max(-1.0f, as_type<char>(b0) / 127.0f), \
+                                            metal::max(-1.0f, as_type<char>(b1) / 127.0f));",
                     back::INDENT
                 )?;
                 writeln!(self.out, "}}")?;
@@ -3971,10 +3971,10 @@ impl<W: Write> Writer<W> {
                 )?;
                 writeln!(
                     self.out,
-                    "{}return metal::float4((float(b0) - 128.0f) / 255.0f, \
-                                            (float(b1) - 128.0f) / 255.0f, \
-                                            (float(b2) - 128.0f) / 255.0f, \
-                                            (float(b3) - 128.0f) / 255.0f);",
+                    "{}return metal::float4(metal::max(-1.0f, as_type<char>(b0) / 127.0f), \
+                                            metal::max(-1.0f, as_type<char>(b1) / 127.0f), \
+                                            metal::max(-1.0f, as_type<char>(b2) / 127.0f), \
+                                            metal::max(-1.0f, as_type<char>(b3) / 127.0f));",
                     back::INDENT
                 )?;
                 writeln!(self.out, "}}")?;
@@ -4033,8 +4033,8 @@ impl<W: Write> Writer<W> {
                 )?;
                 writeln!(
                     self.out,
-                    "{}return metal::int2(as_type<metal::short>(b1 << 8 | b0), \
-                                          as_type<metal::short>(b3 << 8 | b2));",
+                    "{}return metal::int2(as_type<short>(metal::ushort(b1 << 8 | b0)), \
+                                          as_type<short>(metal::ushort(b3 << 8 | b2)));",
                     back::INDENT
                 )?;
                 writeln!(self.out, "}}")?;
@@ -4055,10 +4055,10 @@ impl<W: Write> Writer<W> {
                 )?;
                 writeln!(
                     self.out,
-                    "{}return metal::int4(as_type<metal::short>(b1 << 8 | b0), \
-                                          as_type<metal::short>(b3 << 8 | b2), \
-                                          as_type<metal::short>(b5 << 8 | b4), \
-                                          as_type<metal::short>(b7 << 8 | b6));",
+                    "{}return metal::int4(as_type<short>(metal::ushort(b1 << 8 | b0)), \
+                                          as_type<short>(metal::ushort(b3 << 8 | b2)), \
+                                          as_type<short>(metal::ushort(b5 << 8 | b4)), \
+                                          as_type<short>(metal::ushort(b7 << 8 | b6)));",
                     back::INDENT
                 )?;
                 writeln!(self.out, "}}")?;
@@ -4117,8 +4117,7 @@ impl<W: Write> Writer<W> {
                 )?;
                 writeln!(
                     self.out,
-                    "{}return metal::float2((float(b1 << 8 | b0) - 32767.0f) / 65535.0f, \
-                                            (float(b3 << 8 | b2) - 32767.0f) / 65535.0f);",
+                    "{}return metal::unpack_snorm2x16_to_float(b1 << 24 | b0 << 16 | b3 << 8 | b2);",
                     back::INDENT
                 )?;
                 writeln!(self.out, "}}")?;
@@ -4139,10 +4138,8 @@ impl<W: Write> Writer<W> {
                 )?;
                 writeln!(
                     self.out,
-                    "{}return metal::float4((float(b1 << 8 | b0) - 32767.0f) / 65535.0f, \
-                                            (float(b3 << 8 | b2) - 32767.0f) / 65535.0f, \
-                                            (float(b5 << 8 | b4) - 32767.0f) / 65535.0f, \
-                                            (float(b7 << 8 | b6) - 32767.0f) / 65535.0f);",
+                    "{}return metal::float4(metal::unpack_snorm2x16_to_float(b1 << 24 | b0 << 16 | b3 << 8 | b2), \
+                                            metal::unpack_snorm2x16_to_float(b5 << 24 | b4 << 16 | b7 << 8 | b6));",
                     back::INDENT
                 )?;
                 writeln!(self.out, "}}")?;
@@ -4159,8 +4156,8 @@ impl<W: Write> Writer<W> {
                 )?;
                 writeln!(
                     self.out,
-                    "{}return metal::float2(as_type<metal::half>(b1 << 8 | b0), \
-                                            as_type<metal::half>(b3 << 8 | b2));",
+                    "{}return metal::float2(as_type<half>(metal::ushort(b1 << 8 | b0)), \
+                                            as_type<half>(metal::ushort(b3 << 8 | b2)));",
                     back::INDENT
                 )?;
                 writeln!(self.out, "}}")?;
@@ -4170,7 +4167,7 @@ impl<W: Write> Writer<W> {
                 let name = self.namer.call("unpackFloat16x4");
                 writeln!(
                     self.out,
-                    "metal::int4 {name}(metal::ushort b0, \
+                    "metal::float4 {name}(metal::ushort b0, \
                                         metal::ushort b1, \
                                         metal::ushort b2, \
                                         metal::ushort b3, \
@@ -4181,10 +4178,10 @@ impl<W: Write> Writer<W> {
                 )?;
                 writeln!(
                     self.out,
-                    "{}return metal::int4(as_type<metal::half>(b1 << 8 | b0), \
-                                          as_type<metal::half>(b3 << 8 | b2), \
-                                          as_type<metal::half>(b5 << 8 | b4), \
-                                          as_type<metal::half>(b7 << 8 | b6));",
+                    "{}return metal::float4(as_type<half>(metal::ushort(b1 << 8 | b0)), \
+                                          as_type<half>(metal::ushort(b3 << 8 | b2)), \
+                                          as_type<half>(metal::ushort(b5 << 8 | b4)), \
+                                          as_type<half>(metal::ushort(b7 << 8 | b6)));",
                     back::INDENT
                 )?;
                 writeln!(self.out, "}}")?;
@@ -4390,10 +4387,10 @@ impl<W: Write> Writer<W> {
                 let name = self.namer.call("unpackSint32");
                 writeln!(
                     self.out,
-                    "metal::int {name}(uint b0, \
-                                       uint b1, \
-                                       uint b2, \
-                                       uint b3) {{"
+                    "int {name}(uint b0, \
+                                uint b1, \
+                                uint b2, \
+                                uint b3) {{"
                 )?;
                 writeln!(
                     self.out,
@@ -4495,7 +4492,18 @@ impl<W: Write> Writer<W> {
                 )?;
                 writeln!(
                     self.out,
-                    "{}return unpack_unorm10a2_to_float(b3 << 24 | b2 << 16 | b1 << 8 | b0);",
+                    // The following is correct for RGBA packing, but our format seems to
+                    // match ABGR, which can be fed into the Metal builtin function
+                    // unpack_unorm10a2_to_float.
+                    /*
+                    "{}uint v = (b3 << 24 | b2 << 16 | b1 << 8 | b0); \
+                       uint r = (v & 0xFFC00000) >> 22; \
+                       uint g = (v & 0x003FF000) >> 12; \
+                       uint b = (v & 0x00000FFC) >> 2; \
+                       uint a = (v & 0x00000003); \
+                       return metal::float4(float(r) / 1023.0f, float(g) / 1023.0f, float(b) / 1023.0f, float(a) / 3.0f);",
+                    */
+                    "{}return metal::unpack_unorm10a2_to_float(b3 << 24 | b2 << 16 | b1 << 8 | b0);",
                     back::INDENT
                 )?;
                 writeln!(self.out, "}}")?;
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
index 6ceb3818df8..384cfcf78fc 100644
--- a/tests/tests/root.rs
+++ b/tests/tests/root.rs
@@ -42,6 +42,7 @@ mod subgroup_operations;
 mod texture_bounds;
 mod texture_view_creation;
 mod transfer;
+mod vertex_formats;
 mod vertex_indices;
 mod write_texture;
 mod zero_init_texture_after_discard;
diff --git a/tests/tests/vertex_formats/draw.vert.wgsl b/tests/tests/vertex_formats/draw.vert.wgsl
new file mode 100644
index 00000000000..bf6a08aac6c
--- /dev/null
+++ b/tests/tests/vertex_formats/draw.vert.wgsl
@@ -0,0 +1,316 @@
+@group(0) @binding(0)
+var<storage, read_write> checksums: array<f32>;
+
+const index_uint = 0u;
+const index_sint = 1u;
+const index_unorm = 2u;
+const index_snorm = 3u;
+const index_float16 = 4u;
+const index_float32 = 5u;
+
+fn init_checksums() {
+  checksums[index_uint] = 0.0;
+  checksums[index_sint] = 0.0;
+  checksums[index_unorm] = 0.0;
+  checksums[index_snorm] = 0.0;
+  checksums[index_float16] = 0.0;
+  checksums[index_float32] = 0.0;
+}
+
+// Break down the 31 vertex formats specified at
+// https://gpuweb.github.io/gpuweb/#vertex-formats into blocks
+// of 8, to keep under the limits of max locations. Each
+// AttributeBlockX structure will get a corresponding
+// vertex_block_X function to process its attributes into
+// values written to the checksums buffer.
+
+struct AttributeBlock0 {
+  // 4-byte-aligned unorm formats
+  @location(0) unorm8x4: vec4<f32>,
+  @location(1) unorm16x2: vec2<f32>,
+  @location(2) unorm16x4: vec4<f32>,
+
+  // 4-byte-aligned snorm formats
+  @location(3) snorm8x4: vec4<f32>,
+  @location(4) snorm16x2: vec2<f32>,
+  @location(5) snorm16x4: vec4<f32>,
+
+  // 2-byte-aligned formats
+  @location(6) unorm8x2: vec2<f32>,
+  @location(7) snorm8x2: vec2<f32>,
+}
+
+@vertex
+fn vertex_block_0(v_in: AttributeBlock0) -> @builtin(position) vec4<f32>
+{
+  init_checksums();
+
+  // Accumulate all unorm into one checksum value.
+  var all_unorm: f32 = 0.0;
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x2.x);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x2.y);
+
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.x);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.y);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.z);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm8x4.w);
+
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x2.x);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x2.y);
+
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.x);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.y);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.z);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm16x4.w);
+
+  checksums[index_unorm] = f32(all_unorm);
+
+  // Accumulate all snorm into one checksum value.
+  var all_snorm: f32 = 0.0;
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x2.x);
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x2.y);
+
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.x);
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.y);
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.z);
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm8x4.w);
+
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x2.x);
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x2.y);
+
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.x);
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.y);
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.z);
+  all_snorm = accumulate_snorm(all_snorm, v_in.snorm16x4.w);
+
+  checksums[index_snorm] = f32(all_snorm);
+
+  return vec4(0.0);
+}
+
+struct AttributeBlock1 {
+  // 4-byte-aligned uint formats
+  @location(0) uint8x4: vec4<u32>,
+  @location(1) uint16x2: vec2<u32>,
+  @location(2) uint16x4: vec4<u32>,
+
+  // 4-byte-aligned sint formats
+  @location(3) sint8x4: vec4<i32>,
+  @location(4) sint16x2: vec2<i32>,
+  @location(5) sint16x4: vec4<i32>,
+
+  // 2-byte-aligned formats
+  @location(6) uint8x2: vec2<u32>,
+  @location(7) sint8x2: vec2<i32>,
+}
+
+@vertex
+fn vertex_block_1(v_in: AttributeBlock1) -> @builtin(position) vec4<f32>
+{
+  init_checksums();
+
+  // Accumulate all uint into one checksum value.
+  var all_uint: u32 = 0;
+  all_uint = accumulate_uint(all_uint, v_in.uint8x2.x);
+  all_uint = accumulate_uint(all_uint, v_in.uint8x2.y);
+
+  all_uint = accumulate_uint(all_uint, v_in.uint8x4.x);
+  all_uint = accumulate_uint(all_uint, v_in.uint8x4.y);
+  all_uint = accumulate_uint(all_uint, v_in.uint8x4.z);
+  all_uint = accumulate_uint(all_uint, v_in.uint8x4.w);
+
+  all_uint = accumulate_uint(all_uint, v_in.uint16x2.x);
+  all_uint = accumulate_uint(all_uint, v_in.uint16x2.y);
+
+  all_uint = accumulate_uint(all_uint, v_in.uint16x4.x);
+  all_uint = accumulate_uint(all_uint, v_in.uint16x4.y);
+  all_uint = accumulate_uint(all_uint, v_in.uint16x4.z);
+  all_uint = accumulate_uint(all_uint, v_in.uint16x4.w);
+
+  checksums[index_uint] = f32(all_uint);
+
+  // Accumulate all sint into one checksum value.
+  var all_sint: i32 = 0;
+  all_sint = accumulate_sint(all_sint, v_in.sint8x2.x);
+  all_sint = accumulate_sint(all_sint, v_in.sint8x2.y);
+
+  all_sint = accumulate_sint(all_sint, v_in.sint8x4.x);
+  all_sint = accumulate_sint(all_sint, v_in.sint8x4.y);
+  all_sint = accumulate_sint(all_sint, v_in.sint8x4.z);
+  all_sint = accumulate_sint(all_sint, v_in.sint8x4.w);
+
+  all_sint = accumulate_sint(all_sint, v_in.sint16x2.x);
+  all_sint = accumulate_sint(all_sint, v_in.sint16x2.y);
+
+  all_sint = accumulate_sint(all_sint, v_in.sint16x4.x);
+  all_sint = accumulate_sint(all_sint, v_in.sint16x4.y);
+  all_sint = accumulate_sint(all_sint, v_in.sint16x4.z);
+  all_sint = accumulate_sint(all_sint, v_in.sint16x4.w);
+
+  checksums[index_sint] = f32(all_sint);
+
+  return vec4(0.0);
+}
+
+struct AttributeBlock2 {
+  @location(0) uint32: u32,
+  @location(1) uint32x2: vec2<u32>,
+  @location(2) uint32x3: vec3<u32>,
+  @location(3) uint32x4: vec4<u32>,
+}
+
+@vertex
+fn vertex_block_2(v_in: AttributeBlock2) -> @builtin(position) vec4<f32>
+{
+  init_checksums();
+
+  // Accumulate all uint into one checksum value.
+  var all_uint: u32 = 0;
+  all_uint = accumulate_uint(all_uint, v_in.uint32);
+
+  all_uint = accumulate_uint(all_uint, v_in.uint32x2.x);
+  all_uint = accumulate_uint(all_uint, v_in.uint32x2.y);
+
+  all_uint = accumulate_uint(all_uint, v_in.uint32x3.x);
+  all_uint = accumulate_uint(all_uint, v_in.uint32x3.y);
+  all_uint = accumulate_uint(all_uint, v_in.uint32x3.z);
+
+  all_uint = accumulate_uint(all_uint, v_in.uint32x4.x);
+  all_uint = accumulate_uint(all_uint, v_in.uint32x4.y);
+  all_uint = accumulate_uint(all_uint, v_in.uint32x4.z);
+  all_uint = accumulate_uint(all_uint, v_in.uint32x4.w);
+
+  checksums[index_uint] = f32(all_uint);
+
+  return vec4(0.0);
+}
+
+struct AttributeBlock3 {
+  @location(0) sint32: i32,
+  @location(1) sint32x2: vec2<i32>,
+  @location(2) sint32x3: vec3<i32>,
+  @location(3) sint32x4: vec4<i32>,
+}
+
+@vertex
+fn vertex_block_3(v_in: AttributeBlock3) -> @builtin(position) vec4<f32>
+{
+  init_checksums();
+
+  // Accumulate all sint into one checksum value.
+  var all_sint: i32 = 0;
+  all_sint = accumulate_sint(all_sint, v_in.sint32);
+
+  all_sint = accumulate_sint(all_sint, v_in.sint32x2.x);
+  all_sint = accumulate_sint(all_sint, v_in.sint32x2.y);
+
+  all_sint = accumulate_sint(all_sint, v_in.sint32x3.x);
+  all_sint = accumulate_sint(all_sint, v_in.sint32x3.y);
+  all_sint = accumulate_sint(all_sint, v_in.sint32x3.z);
+
+  all_sint = accumulate_sint(all_sint, v_in.sint32x4.x);
+  all_sint = accumulate_sint(all_sint, v_in.sint32x4.y);
+  all_sint = accumulate_sint(all_sint, v_in.sint32x4.z);
+  all_sint = accumulate_sint(all_sint, v_in.sint32x4.w);
+
+  checksums[index_sint] = f32(all_sint);
+
+  return vec4(0.0);
+}
+
+struct AttributeBlock4{
+  @location(0) float32: f32,
+  @location(1) float32x2: vec2<f32>,
+  @location(2) float32x3: vec3<f32>,
+  @location(3) float32x4: vec4<f32>,
+  @location(4) float16x2: vec2<f32>,
+  @location(5) float16x4: vec4<f32>,
+}
+
+@vertex
+fn vertex_block_4(v_in: AttributeBlock4) -> @builtin(position) vec4<f32>
+{
+  init_checksums();
+
+  // Accumulate all float32 into one checksum value.
+  var all_float32: f32 = 0.0;
+  all_float32 = accumulate_float32(all_float32, v_in.float32);
+
+  all_float32 = accumulate_float32(all_float32, v_in.float32x2.x);
+  all_float32 = accumulate_float32(all_float32, v_in.float32x2.y);
+
+  all_float32 = accumulate_float32(all_float32, v_in.float32x3.x);
+  all_float32 = accumulate_float32(all_float32, v_in.float32x3.y);
+  all_float32 = accumulate_float32(all_float32, v_in.float32x3.z);
+
+  all_float32 = accumulate_float32(all_float32, v_in.float32x4.x);
+  all_float32 = accumulate_float32(all_float32, v_in.float32x4.y);
+  all_float32 = accumulate_float32(all_float32, v_in.float32x4.z);
+  all_float32 = accumulate_float32(all_float32, v_in.float32x4.w);
+
+  checksums[index_float32] = f32(all_float32);
+
+  // Accumulate all float16 into one checksum value.
+  var all_float16: f32 = 0.0;
+  all_float16 = accumulate_float16(all_float16, v_in.float16x2.x);
+  all_float16 = accumulate_float16(all_float16, v_in.float16x2.y);
+
+  all_float16 = accumulate_float16(all_float16, v_in.float16x4.x);
+  all_float16 = accumulate_float16(all_float16, v_in.float16x4.y);
+  all_float16 = accumulate_float16(all_float16, v_in.float16x4.z);
+  all_float16 = accumulate_float16(all_float16, v_in.float16x4.w);
+
+  checksums[index_float16] = f32(all_float16);
+
+  return vec4(0.0);
+}
+
+struct AttributeBlock5{
+  @location(0) unorm10_10_10_2: vec4<f32>,
+}
+
+@vertex
+fn vertex_block_5(v_in: AttributeBlock5) -> @builtin(position) vec4<f32>
+{
+  init_checksums();
+
+  // Accumulate all unorm into one checksum value.
+  var all_unorm: f32 = 0.0;
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.x);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.y);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.z);
+  all_unorm = accumulate_unorm(all_unorm, v_in.unorm10_10_10_2.w);
+
+  checksums[index_unorm] = f32(all_unorm);
+
+  return vec4(0.0);
+}
+
+fn accumulate_uint(accum: u32, val: u32) -> u32 {
+  return accum + val;
+}
+
+fn accumulate_sint(accum: i32, val: i32) -> i32 {
+  return accum + val;
+}
+
+fn accumulate_unorm(accum: f32, val: f32) -> f32 {
+  return accum + val;
+}
+
+fn accumulate_snorm(accum: f32, val: f32) -> f32 {
+  return accum + val;
+}
+
+fn accumulate_float16(accum: f32, val: f32) -> f32 {
+  return accum + val;
+}
+
+fn accumulate_float32(accum: f32, val: f32) -> f32 {
+  return accum + val;
+}
+
+@fragment
+fn fragment_main() -> @location(0) vec4<f32> {
+    return vec4<f32>(0.0);
+}
diff --git a/tests/tests/vertex_formats/mod.rs b/tests/tests/vertex_formats/mod.rs
new file mode 100644
index 00000000000..8bd78d7c7b3
--- /dev/null
+++ b/tests/tests/vertex_formats/mod.rs
@@ -0,0 +1,386 @@
+//! Tests that vertex formats pass through to vertex shaders accurately.
+
+use std::num::NonZeroU64;
+
+use wgpu::util::{BufferInitDescriptor, DeviceExt};
+
+use wgpu_test::{gpu_test, FailureCase, GpuTestConfiguration, TestParameters, TestingContext};
+
+#[derive(Debug, Copy, Clone)]
+enum TestCase {
+    UnormsAndSnorms,
+    UintsAndSintsSmall,
+    UintsBig,
+    SintsBig,
+    Floats,
+    Unorm1010102,
+}
+
+struct Test<'a> {
+    case: TestCase,
+    entry_point: &'a str,
+    attributes: &'a [wgt::VertexAttribute],
+    input: &'a [u8],
+    checksums: &'a [f32],
+}
+
+async fn vertex_formats_all(ctx: TestingContext) {
+    let attributes_block_0 = &wgpu::vertex_attr_array![
+        0 => Unorm8x4,
+        1 => Unorm16x2,
+        2 => Unorm16x4,
+        3 => Snorm8x4,
+        4 => Snorm16x2,
+        5 => Snorm16x4,
+        6 => Unorm8x2,
+        7 => Snorm8x2,
+    ];
+
+    let attributes_block_1 = &wgpu::vertex_attr_array![
+        0 => Uint8x4,
+        1 => Uint16x2,
+        2 => Uint16x4,
+        3 => Sint8x4,
+        4 => Sint16x2,
+        5 => Sint16x4,
+        6 => Uint8x2,
+        7 => Sint8x2,
+    ];
+
+    let attributes_block_2 = &wgpu::vertex_attr_array![
+        0 => Uint32,
+        1 => Uint32x2,
+        2 => Uint32x3,
+        3 => Uint32x4,
+    ];
+
+    let attributes_block_3 = &wgpu::vertex_attr_array![
+        0 => Sint32,
+        1 => Sint32x2,
+        2 => Sint32x3,
+        3 => Sint32x4,
+    ];
+
+    let attributes_block_4 = &wgpu::vertex_attr_array![
+        0 => Float32,
+        1 => Float32x2,
+        2 => Float32x3,
+        3 => Float32x4,
+        4 => Float16x2,
+        5 => Float16x4,
+    ];
+
+    let tests = vec![
+        Test {
+            case: TestCase::UnormsAndSnorms,
+            entry_point: "vertex_block_0",
+            attributes: attributes_block_0,
+            input: &[
+                128u8, 128u8, 128u8, 128u8, // Unorm8x4 (0.5, 0.5, 0.5, 0.5)
+                0u8, 128u8, 0u8, 128u8, // Unorm16x2 (0.5, 0.5)
+                0u8, 64u8, 0u8, 64u8, 0u8, 64u8, 0u8,
+                64u8, // Unorm16x4 (0.25, 0.25, 0.25, 0.25)
+                127u8, 127u8, 127u8, 127u8, // Snorm8x4 (1, 1, 1, 1)
+                0u8, 128u8, 0u8, 128u8, // Snorm16x2 (-1, -1)
+                0u8, 127u8, 0u8, 127u8, 0u8, 127u8, 0u8, 127u8, // Snorm16x4 (1, 1, 1, 1)
+                255u8, 255u8, // Unorm8x2 (1, 1)
+                128u8, 128u8, // Snorm8x2 (-1, -1)
+            ],
+            checksums: &[0.0, 0.0, 6.0, 4.0, 0.0, 0.0],
+        },
+        Test {
+            case: TestCase::UintsAndSintsSmall,
+            entry_point: "vertex_block_1",
+            attributes: attributes_block_1,
+            input: &[
+                4u8, 8u8, 16u8, 32u8, // Uint8x4 (4, 8, 16, 32)
+                64u8, 0u8, 128u8, 0u8, // Uint16x2 (64, 128)
+                0u8, 1u8, 0u8, 2u8, 0u8, 4u8, 0u8, 8u8, // Uint16x4 (256, 512, 1024, 2048)
+                127u8, 127u8, 2u8, 0u8, // Sint8x4 (127, 127, 2, 0)
+                255u8, 255u8, 1u8, 0u8, // Sint16x2 (-1, 1)
+                128u8, 255u8, 128u8, 255u8, 0u8, 1u8, 240u8,
+                255u8, // Sint16x4 (-128, -128, 256, -16)
+                1u8, 2u8, // Uint8x2 (1, 2)
+                128u8, 128u8, // Sint8x2 (-128, -128)
+            ],
+            checksums: &[4095.0, -16.0, 0.0, 0.0, 0.0, 0.0],
+        },
+        Test {
+            case: TestCase::UintsBig,
+            entry_point: "vertex_block_2",
+            attributes: attributes_block_2,
+            input: &[
+                1u8, 0u8, 0u8, 0u8, // Uint32x2 (1)
+                2u8, 0u8, 0u8, 0u8, 4u8, 0u8, 0u8, 0u8, // Uint32x2 (2, 4)
+                8u8, 0u8, 0u8, 0u8, 16u8, 0u8, 0u8, 0u8, 32u8, 0u8, 0u8,
+                0u8, // Uint32x3 (8, 16, 32)
+                64u8, 0u8, 0u8, 0u8, 128u8, 0u8, 0u8, 0u8, 0u8, 1u8, 0u8, 0u8, 0u8, 2u8, 0u8,
+                0u8, // Uint32x4 (64, 128, 256, 512)
+            ],
+            checksums: &[1023.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        },
+        Test {
+            case: TestCase::SintsBig,
+            entry_point: "vertex_block_3",
+            attributes: attributes_block_3,
+            input: &[
+                128u8, 255u8, 255u8, 255u8, // Sint32 (-128)
+                120u8, 0u8, 0u8, 0u8, 8u8, 0u8, 0u8, 0u8, // Sint32x2 (120, 8)
+                252u8, 255u8, 255u8, 255u8, 2u8, 0u8, 0u8, 0u8, 2u8, 0u8, 0u8,
+                0u8, // Sint32x3 (-4, 2, 2)
+                24u8, 252u8, 255u8, 255u8, 88u8, 2u8, 0u8, 0u8, 44u8, 1u8, 0u8, 0u8, 99u8, 0u8,
+                0u8, 0u8, // Sint32x4 (-1000, 600, 300, 99)
+            ],
+            checksums: &[0.0, -1.0, 0.0, 0.0, 0.0, 0.0],
+        },
+        Test {
+            case: TestCase::Floats,
+            entry_point: "vertex_block_4",
+            attributes: attributes_block_4,
+            input: &[
+                0u8, 0u8, 0u8, 63u8, // Float32 (0.5)
+                0u8, 0u8, 0u8, 191u8, 0u8, 0u8, 128u8, 64u8, // Float32x2 (-0.5, 4.0)
+                0u8, 0u8, 0u8, 192u8, 0u8, 0u8, 204u8, 194u8, 0u8, 0u8, 200u8,
+                66u8, // Float32x3 (-2.0, -102.0, 100.0)
+                0u8, 0u8, 92u8, 66u8, 0u8, 0u8, 72u8, 194u8, 0u8, 0u8, 32u8, 65u8, 0u8, 0u8, 128u8,
+                63u8, // Float32x4 (55.0, -50.0, 10.0, 1.0)
+                0u8, 60u8, 72u8, 53u8, // Float16x2 (1.0, 0.33)
+                72u8, 57u8, 0u8, 192u8, 0u8, 188u8, 0u8, 184u8, // Float16x4 (0.66, -2.0, -1.0, -0.5)
+            ],
+            checksums: &[0.0, 0.0, 0.0, 0.0, -1.5, 16.0],
+        },
+    ];
+
+    vertex_formats_common(ctx, &tests).await;
+}
+
+async fn vertex_formats_10_10_10_2(ctx: TestingContext) {
+    let attributes_block_5 = &wgpu::vertex_attr_array![
+        0 => Unorm10_10_10_2,
+    ];
+
+    let tests = vec![Test {
+        case: TestCase::Unorm1010102,
+        entry_point: "vertex_block_5",
+        attributes: attributes_block_5,
+        input: &[
+            // We are aiming for rgba of (0.5, 0.5, 0.5, 0.66)
+            // Packing   AA BB BBBB BBBB GGGG GGGG GG RR RRRR RRRR
+            // Binary    10 10 0000 0000 1000 0000 00 10 0000 0000
+            // Hex               A0        08         02        00
+            // Decimal          160         8          2         0
+            // unorm   0.66          0.5          0.5          0.5 = 2.16
+            0u8, 2u8, 8u8, 160u8, // Unorm10_10_10_2
+        ],
+        checksums: &[0.0, 0.0, 2.16, 0.0, 0.0, 0.0],
+    }];
+
+    vertex_formats_common(ctx, &tests).await;
+}
+
+async fn vertex_formats_common(ctx: TestingContext, tests: &[Test<'_>]) {
+    let shader = ctx
+        .device
+        .create_shader_module(wgpu::include_wgsl!("draw.vert.wgsl"));
+
+    let bgl = ctx
+        .device
+        .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: None,
+            entries: &[wgpu::BindGroupLayoutEntry {
+                binding: 0,
+                ty: wgpu::BindingType::Buffer {
+                    ty: wgpu::BufferBindingType::Storage { read_only: false },
+                    has_dynamic_offset: false,
+                    min_binding_size: NonZeroU64::new(4),
+                },
+                visibility: wgpu::ShaderStages::VERTEX,
+                count: None,
+            }],
+        });
+
+    let ppl = ctx
+        .device
+        .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: None,
+            bind_group_layouts: &[&bgl],
+            push_constant_ranges: &[],
+        });
+
+    let dummy = ctx
+        .device
+        .create_texture_with_data(
+            &ctx.queue,
+            &wgpu::TextureDescriptor {
+                label: Some("dummy"),
+                size: wgpu::Extent3d {
+                    width: 1,
+                    height: 1,
+                    depth_or_array_layers: 1,
+                },
+                mip_level_count: 1,
+                sample_count: 1,
+                dimension: wgpu::TextureDimension::D2,
+                format: wgpu::TextureFormat::Rgba8Unorm,
+                usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_DST,
+                view_formats: &[],
+            },
+            wgpu::util::TextureDataOrder::LayerMajor,
+            &[0, 0, 0, 1],
+        )
+        .create_view(&wgpu::TextureViewDescriptor::default());
+
+    let mut failed = false;
+    for test in tests {
+        let buffer_input = ctx.device.create_buffer_init(&BufferInitDescriptor {
+            label: None,
+            contents: bytemuck::cast_slice(test.input),
+            usage: wgpu::BufferUsages::VERTEX,
+        });
+
+        let pipeline_desc = wgpu::RenderPipelineDescriptor {
+            label: None,
+            layout: Some(&ppl),
+            vertex: wgpu::VertexState {
+                buffers: &[wgpu::VertexBufferLayout {
+                    array_stride: 0, // Calculate, please!
+                    step_mode: wgpu::VertexStepMode::Vertex,
+                    attributes: test.attributes,
+                }],
+                module: &shader,
+                entry_point: test.entry_point,
+                compilation_options: Default::default(),
+            },
+            primitive: wgpu::PrimitiveState::default(),
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            fragment: Some(wgpu::FragmentState {
+                module: &shader,
+                entry_point: "fragment_main",
+                compilation_options: Default::default(),
+                targets: &[Some(wgpu::ColorTargetState {
+                    format: wgpu::TextureFormat::Rgba8Unorm,
+                    blend: None,
+                    write_mask: wgpu::ColorWrites::ALL,
+                })],
+            }),
+            multiview: None,
+            cache: None,
+        };
+
+        let pipeline = ctx.device.create_render_pipeline(&pipeline_desc);
+
+        let expected = test.checksums;
+        let buffer_size = (std::mem::size_of_val(&expected[0]) * expected.len()) as u64;
+        let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+            label: None,
+            size: buffer_size,
+            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+            mapped_at_creation: false,
+        });
+
+        let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+            label: None,
+            size: buffer_size,
+            usage: wgpu::BufferUsages::COPY_SRC | wgpu::BufferUsages::STORAGE,
+            mapped_at_creation: false,
+        });
+
+        let bg = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: None,
+            layout: &bgl,
+            entries: &[wgpu::BindGroupEntry {
+                binding: 0,
+                resource: gpu_buffer.as_entire_binding(),
+            }],
+        });
+
+        let mut encoder1 = ctx
+            .device
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
+
+        let mut rpass = encoder1.begin_render_pass(&wgpu::RenderPassDescriptor {
+            label: None,
+            color_attachments: &[Some(wgpu::RenderPassColorAttachment {
+                ops: wgpu::Operations::default(),
+                resolve_target: None,
+                view: &dummy,
+            })],
+            depth_stencil_attachment: None,
+            timestamp_writes: None,
+            occlusion_query_set: None,
+        });
+
+        rpass.set_vertex_buffer(0, buffer_input.slice(..));
+        rpass.set_pipeline(&pipeline);
+        rpass.set_bind_group(0, &bg, &[]);
+
+        // Draw three vertices and no instance, which is enough to generate the
+        // checksums.
+        rpass.draw(0..3, 0..1);
+
+        drop(rpass);
+
+        let mut encoder2 = ctx
+            .device
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
+
+        encoder2.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, buffer_size);
+
+        // See https://github.com/gfx-rs/wgpu/issues/4732 for why this is split between two submissions
+        // with a hard wait in between.
+        ctx.queue.submit([encoder1.finish()]);
+        ctx.async_poll(wgpu::Maintain::wait())
+            .await
+            .panic_on_timeout();
+        ctx.queue.submit([encoder2.finish()]);
+        let slice = cpu_buffer.slice(..);
+        slice.map_async(wgpu::MapMode::Read, |_| ());
+        ctx.async_poll(wgpu::Maintain::wait())
+            .await
+            .panic_on_timeout();
+        let data: Vec<f32> = bytemuck::cast_slice(&slice.get_mapped_range()).to_vec();
+
+        let case_name = format!("Case {:?}", test.case);
+
+        // Calculate the difference between data and expected. Since the data is
+        // a bunch of float checksums, we allow a fairly large epsilon, which helps
+        // with the accumulation of float rounding errors.
+        const EPSILON: f32 = 0.05;
+
+        let mut deltas = data.iter().zip(expected.iter()).map(|(d, e)| (d - e).abs());
+        if deltas.any(|x| x > EPSILON) {
+            eprintln!(
+                "Failed: Got: {:?} Expected: {:?} - {case_name}",
+                data, expected,
+            );
+            failed = true;
+            continue;
+        }
+
+        eprintln!("Passed: {case_name}");
+    }
+
+    assert!(!failed);
+}
+
+#[gpu_test]
+static VERTEX_FORMATS_ALL: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .test_features_limits()
+            .features(wgpu::Features::VERTEX_WRITABLE_STORAGE),
+    )
+    .run_async(vertex_formats_all);
+
+// Some backends can handle Unorm-10-10-2, but GL backends seem to throw this error:
+// Validation Error: GL_INVALID_ENUM in glVertexAttribFormat(type = GL_UNSIGNED_INT_10_10_10_2)
+#[gpu_test]
+static VERTEX_FORMATS_10_10_10_2: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(
+        TestParameters::default()
+            .expect_fail(FailureCase::backend(wgpu::Backends::GL))
+            .test_features_limits()
+            .features(wgpu::Features::VERTEX_WRITABLE_STORAGE),
+    )
+    .run_async(vertex_formats_10_10_10_2);
diff --git a/tests/tests/vertex_indices/mod.rs b/tests/tests/vertex_indices/mod.rs
index 59048ef31c1..dcc2ca82f54 100644
--- a/tests/tests/vertex_indices/mod.rs
+++ b/tests/tests/vertex_indices/mod.rs
@@ -166,7 +166,6 @@ struct Test {
     id_source: IdSource,
     draw_call_kind: DrawCallKind,
     encoder_kind: EncoderKind,
-    vertex_pulling_transform: bool,
 }
 
 impl Test {
@@ -280,15 +279,6 @@ async fn vertex_index_common(ctx: TestingContext) {
         cache: None,
     };
     let builtin_pipeline = ctx.device.create_render_pipeline(&pipeline_desc);
-    pipeline_desc
-        .vertex
-        .compilation_options
-        .vertex_pulling_transform = true;
-    let builtin_pipeline_vpt = ctx.device.create_render_pipeline(&pipeline_desc);
-    pipeline_desc
-        .vertex
-        .compilation_options
-        .vertex_pulling_transform = false;
 
     pipeline_desc.vertex.entry_point = "vs_main_buffers";
     pipeline_desc.vertex.buffers = &[
@@ -304,15 +294,6 @@ async fn vertex_index_common(ctx: TestingContext) {
         },
     ];
     let buffer_pipeline = ctx.device.create_render_pipeline(&pipeline_desc);
-    pipeline_desc
-        .vertex
-        .compilation_options
-        .vertex_pulling_transform = true;
-    let buffer_pipeline_vpt = ctx.device.create_render_pipeline(&pipeline_desc);
-    pipeline_desc
-        .vertex
-        .compilation_options
-        .vertex_pulling_transform = false;
 
     let dummy = ctx
         .device
@@ -341,18 +322,12 @@ async fn vertex_index_common(ctx: TestingContext) {
         .cartesian_product(IdSource::iter())
         .cartesian_product(DrawCallKind::iter())
         .cartesian_product(EncoderKind::iter())
-        .cartesian_product([false, true])
-        .map(
-            |((((case, id_source), draw_call_kind), encoder_kind), vertex_pulling_transform)| {
-                Test {
-                    case,
-                    id_source,
-                    draw_call_kind,
-                    encoder_kind,
-                    vertex_pulling_transform,
-                }
-            },
-        )
+        .map(|(((case, id_source), draw_call_kind), encoder_kind)| Test {
+            case,
+            id_source,
+            draw_call_kind,
+            encoder_kind,
+        })
         .collect::<Vec<_>>();
 
     let features = ctx.adapter.features();
@@ -360,20 +335,8 @@ async fn vertex_index_common(ctx: TestingContext) {
     let mut failed = false;
     for test in tests {
         let pipeline = match test.id_source {
-            IdSource::Buffers => {
-                if test.vertex_pulling_transform {
-                    &buffer_pipeline_vpt
-                } else {
-                    &buffer_pipeline
-                }
-            }
-            IdSource::Builtins => {
-                if test.vertex_pulling_transform {
-                    &builtin_pipeline_vpt
-                } else {
-                    &builtin_pipeline
-                }
-            }
+            IdSource::Buffers => &buffer_pipeline,
+            IdSource::Builtins => &builtin_pipeline,
         };
 
         let expected = test.expectation(&ctx);
diff --git a/wgpu-core/src/device/global.rs b/wgpu-core/src/device/global.rs
index 5ebd7c7de78..413a6edb453 100644
--- a/wgpu-core/src/device/global.rs
+++ b/wgpu-core/src/device/global.rs
@@ -1484,7 +1484,6 @@ impl Global {
                         .vertex
                         .stage
                         .zero_initialize_workgroup_memory,
-                    vertex_pulling_transform: desc.vertex.stage.vertex_pulling_transform,
                 };
                 ResolvedVertexState {
                     stage,
@@ -1511,7 +1510,6 @@ impl Global {
                         .vertex
                         .stage
                         .zero_initialize_workgroup_memory,
-                    vertex_pulling_transform: state.stage.vertex_pulling_transform,
                 };
                 Some(ResolvedFragmentState {
                     stage,
@@ -1720,7 +1718,6 @@ impl Global {
                 entry_point: desc.stage.entry_point.clone(),
                 constants: desc.stage.constants.clone(),
                 zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory,
-                vertex_pulling_transform: desc.stage.vertex_pulling_transform,
             };
 
             let desc = ResolvedComputePipelineDescriptor {
diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs
index ee943d7fdc7..9cf8b21c388 100644
--- a/wgpu-core/src/device/resource.rs
+++ b/wgpu-core/src/device/resource.rs
@@ -2694,7 +2694,6 @@ impl<A: HalApi> Device<A> {
                 entry_point: final_entry_point_name.as_ref(),
                 constants: desc.stage.constants.as_ref(),
                 zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory,
-                vertex_pulling_transform: false,
             },
             cache: cache.as_ref().and_then(|it| it.raw.as_ref()),
         };
@@ -3114,7 +3113,6 @@ impl<A: HalApi> Device<A> {
                 entry_point: &vertex_entry_point_name,
                 constants: stage_desc.constants.as_ref(),
                 zero_initialize_workgroup_memory: stage_desc.zero_initialize_workgroup_memory,
-                vertex_pulling_transform: stage_desc.vertex_pulling_transform,
             }
         };
 
@@ -3171,7 +3169,6 @@ impl<A: HalApi> Device<A> {
                     zero_initialize_workgroup_memory: fragment_state
                         .stage
                         .zero_initialize_workgroup_memory,
-                    vertex_pulling_transform: false,
                 })
             }
             None => None,
diff --git a/wgpu-core/src/pipeline.rs b/wgpu-core/src/pipeline.rs
index 6366279eff1..2ab49f83d0c 100644
--- a/wgpu-core/src/pipeline.rs
+++ b/wgpu-core/src/pipeline.rs
@@ -147,8 +147,6 @@ pub struct ProgrammableStageDescriptor<'a> {
     /// This is required by the WebGPU spec, but may have overhead which can be avoided
     /// for cross-platform applications
     pub zero_initialize_workgroup_memory: bool,
-    /// Should the pipeline attempt to transform vertex shaders to use vertex pulling.
-    pub vertex_pulling_transform: bool,
 }
 
 /// Describes a programmable pipeline stage.
@@ -176,8 +174,6 @@ pub struct ResolvedProgrammableStageDescriptor<'a, A: HalApi> {
     /// This is required by the WebGPU spec, but may have overhead which can be avoided
     /// for cross-platform applications
     pub zero_initialize_workgroup_memory: bool,
-    /// Should the pipeline attempt to transform vertex shaders to use vertex pulling.
-    pub vertex_pulling_transform: bool,
 }
 
 /// Number of implicit bind groups derived at pipeline creation.
diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs
index daed0c1d35f..30ff45ff5bd 100644
--- a/wgpu-hal/examples/halmark/main.rs
+++ b/wgpu-hal/examples/halmark/main.rs
@@ -257,7 +257,6 @@ impl<A: hal::Api> Example<A> {
                 entry_point: "vs_main",
                 constants: &constants,
                 zero_initialize_workgroup_memory: true,
-                vertex_pulling_transform: false,
             },
             vertex_buffers: &[],
             fragment_stage: Some(hal::ProgrammableStage {
@@ -265,7 +264,6 @@ impl<A: hal::Api> Example<A> {
                 entry_point: "fs_main",
                 constants: &constants,
                 zero_initialize_workgroup_memory: true,
-                vertex_pulling_transform: false,
             }),
             primitive: wgt::PrimitiveState {
                 topology: wgt::PrimitiveTopology::TriangleStrip,
diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs
index 1cde9fa2516..7cd6547f2c4 100644
--- a/wgpu-hal/examples/ray-traced-triangle/main.rs
+++ b/wgpu-hal/examples/ray-traced-triangle/main.rs
@@ -379,7 +379,6 @@ impl<A: hal::Api> Example<A> {
                     entry_point: "main",
                     constants: &Default::default(),
                     zero_initialize_workgroup_memory: true,
-                    vertex_pulling_transform: false,
                 },
                 cache: None,
             })
diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs
index b28a005a7ab..9df479b13c1 100644
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@@ -1866,8 +1866,6 @@ pub struct ProgrammableStage<'a, A: Api> {
     /// This is required by the WebGPU spec, but may have overhead which can be avoided
     /// for cross-platform applications
     pub zero_initialize_workgroup_memory: bool,
-    /// Should the pipeline attempt to transform vertex shaders to use vertex pulling.
-    pub vertex_pulling_transform: bool,
 }
 
 // Rust gets confused about the impl requirements for `A`
@@ -1878,7 +1876,6 @@ impl<A: Api> Clone for ProgrammableStage<'_, A> {
             entry_point: self.entry_point,
             constants: self.constants,
             zero_initialize_workgroup_memory: self.zero_initialize_workgroup_memory,
-            vertex_pulling_transform: self.vertex_pulling_transform,
         }
     }
 }
diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs
index efafc98e1b3..d9525999d8b 100644
--- a/wgpu-hal/src/metal/device.rs
+++ b/wgpu-hal/src/metal/device.rs
@@ -158,7 +158,7 @@ impl super::Device {
                 metal::MTLPrimitiveTopologyClass::Point => true,
                 _ => false,
             },
-            vertex_pulling_transform: stage.vertex_pulling_transform,
+            vertex_pulling_transform: true,
             vertex_buffer_mappings: vertex_buffer_mappings.to_vec(),
         };
 
diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs
index 91629d638c4..fae25422829 100644
--- a/wgpu/src/backend/wgpu_core.rs
+++ b/wgpu/src/backend/wgpu_core.rs
@@ -1180,10 +1180,6 @@ impl crate::Context for ContextWgpuCore {
                         .vertex
                         .compilation_options
                         .zero_initialize_workgroup_memory,
-                    vertex_pulling_transform: desc
-                        .vertex
-                        .compilation_options
-                        .vertex_pulling_transform,
                 },
                 buffers: Borrowed(&vertex_buffers),
             },
@@ -1198,7 +1194,6 @@ impl crate::Context for ContextWgpuCore {
                     zero_initialize_workgroup_memory: frag
                         .compilation_options
                         .zero_initialize_workgroup_memory,
-                    vertex_pulling_transform: false,
                 },
                 targets: Borrowed(frag.targets),
             }),
@@ -1244,7 +1239,6 @@ impl crate::Context for ContextWgpuCore {
                 zero_initialize_workgroup_memory: desc
                     .compilation_options
                     .zero_initialize_workgroup_memory,
-                vertex_pulling_transform: false,
             },
             cache: desc.cache.map(|c| c.id.into()),
         };
diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs
index d895b696cf4..190e99f7073 100644
--- a/wgpu/src/lib.rs
+++ b/wgpu/src/lib.rs
@@ -2059,8 +2059,6 @@ pub struct PipelineCompilationOptions<'a> {
     /// This is required by the WebGPU spec, but may have overhead which can be avoided
     /// for cross-platform applications
     pub zero_initialize_workgroup_memory: bool,
-    /// Should the pipeline attempt to transform vertex shaders to use vertex pulling.
-    pub vertex_pulling_transform: bool,
 }
 
 impl<'a> Default for PipelineCompilationOptions<'a> {
@@ -2074,7 +2072,6 @@ impl<'a> Default for PipelineCompilationOptions<'a> {
         Self {
             constants,
             zero_initialize_workgroup_memory: true,
-            vertex_pulling_transform: false,
         }
     }
 }