Fix the example regressions from packed growable buffers. (#14375)

The "uberbuffers" PR #14257 caused some examples to fail intermittently for different reasons:

1. `morph_targets` could fail because vertex displacements for morph targets are keyed off the vertex index. With buffer packing, the vertex index can vary based on the position in the buffer, which caused the morph targets to be potentially incorrect. The solution is to include the first vertex index with the `MeshUniform` (and `MeshInputUniform` if GPU preprocessing is in use), so that the shader can calculate the true vertex index before performing the morph operation. This results in wasted space in `MeshUniform`, which is unfortunate, but we'll soon be filling in the padding with the ID of the material when bindless textures land, so this had to happen sooner or later anyhow.

   Including the vertex index in the `MeshInputUniform` caused an ordering problem. The `MeshInputUniform` was created during the extraction phase, before the allocations occurred, so the extraction logic didn't know where the mesh vertex data was going to end up. The solution is to move the `MeshInputUniform` creation (the `collect_meshes_for_gpu_building` system) to after the allocations phase. This should be better for parallelism anyhow, because it allows the extraction phase to finish more quickly. It's also something we'll have to do for bindless in any event.

2. The `lines` and `fog_volumes` examples could fail because their custom drawing nodes weren't updated to supply the vertex and index offsets in their `draw_indexed` and `draw` calls. This commit fixes this oversight.

Fixes #14366.
bevyengine · Jul 22, 2024 · d235d41 · d235d41
1 parent d30391b
commit d235d41
Show file tree

Hide file tree

Showing 7 changed files with 124 additions and 38 deletions.
diff --git a/crates/bevy_pbr/src/meshlet/gpu_scene.rs b/crates/bevy_pbr/src/meshlet/gpu_scene.rs
@@ -138,7 +138,7 @@ pub fn extract_meshlet_meshes(
         gpu_scene
             .instance_uniforms
             .get_mut()
-            .push(MeshUniform::new(&transforms, None));
+            .push(MeshUniform::new(&transforms, 0, None));
     }
 }
 

diff --git a/crates/bevy_pbr/src/prepass/prepass.wgsl b/crates/bevy_pbr/src/prepass/prepass.wgsl
@@ -1,5 +1,6 @@
 #import bevy_pbr::{
     prepass_bindings,
+    mesh_bindings::mesh,
     mesh_functions,
     prepass_io::{Vertex, VertexOutput, FragmentOutput},
     skinning,
@@ -15,18 +16,21 @@
 #ifdef MORPH_TARGETS
 fn morph_vertex(vertex_in: Vertex) -> Vertex {
     var vertex = vertex_in;
+    let first_vertex = mesh[vertex.instance_index].first_vertex_index;
+    let vertex_index = vertex.index - first_vertex;
+
     let weight_count = morph::layer_count();
     for (var i: u32 = 0u; i < weight_count; i ++) {
         let weight = morph::weight_at(i);
         if weight == 0.0 {
             continue;
         }
-        vertex.position += weight * morph::morph(vertex.index, morph::position_offset, i);
+        vertex.position += weight * morph::morph(vertex_index, morph::position_offset, i);
 #ifdef VERTEX_NORMALS
-        vertex.normal += weight * morph::morph(vertex.index, morph::normal_offset, i);
+        vertex.normal += weight * morph::morph(vertex_index, morph::normal_offset, i);
 #endif
 #ifdef VERTEX_TANGENTS
-        vertex.tangent += vec4(weight * morph::morph(vertex.index, morph::tangent_offset, i), 0.0);
+        vertex.tangent += vec4(weight * morph::morph(vertex_index, morph::tangent_offset, i), 0.0);
 #endif
     }
     return vertex;

diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
@@ -186,8 +186,8 @@ impl Plugin for MeshRenderPlugin {
 
             if use_gpu_instance_buffer_builder {
                 render_app
-                    .init_resource::<gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>(
-                    )
+                    .init_resource::<gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>()
+                    .init_resource::<RenderMeshInstanceGpuQueues>()
                     .add_systems(
                         ExtractSchedule,
                         extract_meshes_for_gpu_building.in_set(ExtractMeshesSet),
@@ -200,6 +200,9 @@ impl Plugin for MeshRenderPlugin {
                             gpu_preprocessing::delete_old_work_item_buffers::<MeshPipeline>
                                 .in_set(RenderSet::ManageViews)
                                 .after(prepare_view_targets),
+                            collect_meshes_for_gpu_building
+                                .in_set(RenderSet::PrepareAssets)
+                                .after(allocator::allocate_and_free_meshes),
                         ),
                     );
             } else {
@@ -275,6 +278,18 @@ pub struct MeshUniform {
     //
     // (MSB: most significant bit; LSB: least significant bit.)
     pub lightmap_uv_rect: UVec2,
+    /// The index of this mesh's first vertex in the vertex buffer.
+    ///
+    /// Multiple meshes can be packed into a single vertex buffer (see
+    /// [`MeshAllocator`]). This value stores the offset of the first vertex in
+    /// this mesh in that buffer.
+    pub first_vertex_index: u32,
+    /// Padding.
+    pub pad_a: u32,
+    /// Padding.
+    pub pad_b: u32,
+    /// Padding.
+    pub pad_c: u32,
 }
 
 /// Information that has to be transferred from CPU to GPU in order to produce
@@ -305,6 +320,18 @@ pub struct MeshInputUniform {
     ///
     /// This is used for TAA. If not present, this will be `u32::MAX`.
     pub previous_input_index: u32,
+    /// The index of this mesh's first vertex in the vertex buffer.
+    ///
+    /// Multiple meshes can be packed into a single vertex buffer (see
+    /// [`MeshAllocator`]). This value stores the offset of the first vertex in
+    /// this mesh in that buffer.
+    pub first_vertex_index: u32,
+    /// Padding.
+    pub pad_a: u32,
+    /// Padding.
+    pub pad_b: u32,
+    /// Padding.
+    pub pad_c: u32,
 }
 
 /// Information about each mesh instance needed to cull it on GPU.
@@ -331,7 +358,11 @@ pub struct MeshCullingData {
 pub struct MeshCullingDataBuffer(RawBufferVec<MeshCullingData>);
 
 impl MeshUniform {
-    pub fn new(mesh_transforms: &MeshTransforms, maybe_lightmap_uv_rect: Option<Rect>) -> Self {
+    pub fn new(
+        mesh_transforms: &MeshTransforms,
+        first_vertex_index: u32,
+        maybe_lightmap_uv_rect: Option<Rect>,
+    ) -> Self {
         let (local_from_world_transpose_a, local_from_world_transpose_b) =
             mesh_transforms.world_from_local.inverse_transpose_3x3();
         Self {
@@ -341,6 +372,10 @@ impl MeshUniform {
             local_from_world_transpose_a,
             local_from_world_transpose_b,
             flags: mesh_transforms.flags,
+            first_vertex_index,
+            pad_a: 0,
+            pad_b: 0,
+            pad_c: 0,
         }
     }
 }
@@ -515,6 +550,14 @@ pub enum RenderMeshInstanceGpuQueue {
     GpuCulling(Vec<(Entity, RenderMeshInstanceGpuBuilder, MeshCullingData)>),
 }
 
+/// The per-thread queues containing mesh instances, populated during the
+/// extract phase.
+///
+/// These are filled in [`extract_meshes_for_gpu_building`] and consumed in
+/// [`collect_meshes_for_gpu_building`].
+#[derive(Resource, Default, Deref, DerefMut)]
+pub struct RenderMeshInstanceGpuQueues(Parallel<RenderMeshInstanceGpuQueue>);
+
 impl RenderMeshInstanceShared {
     fn from_components(
         previous_transform: Option<&PreviousGlobalTransform>,
@@ -719,7 +762,14 @@ impl RenderMeshInstanceGpuBuilder {
         entity: Entity,
         render_mesh_instances: &mut EntityHashMap<RenderMeshInstanceGpu>,
         current_input_buffer: &mut RawBufferVec<MeshInputUniform>,
+        mesh_allocator: &MeshAllocator,
     ) -> usize {
+        let first_vertex_index = match mesh_allocator.mesh_vertex_slice(&self.shared.mesh_asset_id)
+        {
+            Some(mesh_vertex_slice) => mesh_vertex_slice.range.start,
+            None => 0,
+        };
+
         // Push the mesh input uniform.
         let current_uniform_index = current_input_buffer.push(MeshInputUniform {
             world_from_local: self.world_from_local.to_transpose(),
@@ -729,6 +779,10 @@ impl RenderMeshInstanceGpuBuilder {
                 Some(previous_input_index) => previous_input_index.into(),
                 None => u32::MAX,
             },
+            first_vertex_index,
+            pad_a: 0,
+            pad_b: 0,
+            pad_c: 0,
         });
 
         // Record the [`RenderMeshInstance`].
@@ -900,11 +954,7 @@ pub fn extract_meshes_for_cpu_building(
 pub fn extract_meshes_for_gpu_building(
     mut render_mesh_instances: ResMut<RenderMeshInstances>,
     render_visibility_ranges: Res<RenderVisibilityRanges>,
-    mut batched_instance_buffers: ResMut<
-        gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>,
-    >,
-    mut mesh_culling_data_buffer: ResMut<MeshCullingDataBuffer>,
-    mut render_mesh_instance_queues: Local<Parallel<RenderMeshInstanceGpuQueue>>,
+    mut render_mesh_instance_queues: ResMut<RenderMeshInstanceGpuQueues>,
     meshes_query: Extract<
         Query<(
             Entity,
@@ -1004,13 +1054,6 @@ pub fn extract_meshes_for_gpu_building(
             queue.push(entity, gpu_mesh_instance_builder, gpu_mesh_culling_data);
         },
     );
-
-    collect_meshes_for_gpu_building(
-        render_mesh_instances,
-        &mut batched_instance_buffers,
-        &mut mesh_culling_data_buffer,
-        &mut render_mesh_instance_queues,
-    );
 }
 
 /// A system that sets the [`RenderMeshInstanceFlags`] for each mesh based on
@@ -1044,22 +1087,28 @@ fn set_mesh_motion_vector_flags(
 
 /// Creates the [`RenderMeshInstanceGpu`]s and [`MeshInputUniform`]s when GPU
 /// mesh uniforms are built.
-fn collect_meshes_for_gpu_building(
-    render_mesh_instances: &mut RenderMeshInstancesGpu,
-    batched_instance_buffers: &mut gpu_preprocessing::BatchedInstanceBuffers<
-        MeshUniform,
-        MeshInputUniform,
+pub fn collect_meshes_for_gpu_building(
+    render_mesh_instances: ResMut<RenderMeshInstances>,
+    batched_instance_buffers: ResMut<
+        gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>,
     >,
-    mesh_culling_data_buffer: &mut MeshCullingDataBuffer,
-    render_mesh_instance_queues: &mut Parallel<RenderMeshInstanceGpuQueue>,
+    mut mesh_culling_data_buffer: ResMut<MeshCullingDataBuffer>,
+    mut render_mesh_instance_queues: ResMut<RenderMeshInstanceGpuQueues>,
+    mesh_allocator: Res<MeshAllocator>,
 ) {
+    let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) =
+        render_mesh_instances.into_inner()
+    else {
+        return;
+    };
+
     // Collect render mesh instances. Build up the uniform buffer.
 
     let gpu_preprocessing::BatchedInstanceBuffers {
         ref mut current_input_buffer,
         ref mut previous_input_buffer,
         ..
-    } = batched_instance_buffers;
+    } = batched_instance_buffers.into_inner();
 
     // Swap buffers.
     mem::swap(current_input_buffer, previous_input_buffer);
@@ -1076,19 +1125,22 @@ fn collect_meshes_for_gpu_building(
                 for (entity, mesh_instance_builder) in queue.drain(..) {
                     mesh_instance_builder.add_to(
                         entity,
-                        render_mesh_instances,
+                        &mut *render_mesh_instances,
                         current_input_buffer,
+                        &mesh_allocator,
                     );
                 }
             }
             RenderMeshInstanceGpuQueue::GpuCulling(ref mut queue) => {
                 for (entity, mesh_instance_builder, mesh_culling_builder) in queue.drain(..) {
                     let instance_data_index = mesh_instance_builder.add_to(
                         entity,
-                        render_mesh_instances,
+                        &mut *render_mesh_instances,
                         current_input_buffer,
+                        &mesh_allocator,
                     );
-                    let culling_data_index = mesh_culling_builder.add_to(mesh_culling_data_buffer);
+                    let culling_data_index =
+                        mesh_culling_builder.add_to(&mut mesh_culling_data_buffer);
                     debug_assert_eq!(instance_data_index, culling_data_index);
                 }
             }
@@ -1220,7 +1272,7 @@ impl GetBatchData for MeshPipeline {
     type BufferData = MeshUniform;
 
     fn get_batch_data(
-        (mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<(Self::BufferData, Option<Self::CompareData>)> {
         let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@@ -1231,11 +1283,17 @@ impl GetBatchData for MeshPipeline {
             return None;
         };
         let mesh_instance = mesh_instances.get(&entity)?;
+        let first_vertex_index =
+            match mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id) {
+                Some(mesh_vertex_slice) => mesh_vertex_slice.range.start,
+                None => 0,
+            };
         let maybe_lightmap = lightmaps.render_lightmaps.get(&entity);
 
         Some((
             MeshUniform::new(
                 &mesh_instance.transforms,
+                first_vertex_index,
                 maybe_lightmap.map(|lightmap| lightmap.uv_rect),
             ),
             mesh_instance.should_batch().then_some((
@@ -1277,7 +1335,7 @@ impl GetFullBatchData for MeshPipeline {
     }
 
     fn get_binned_batch_data(
-        (mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<Self::BufferData> {
         let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@@ -1287,10 +1345,16 @@ impl GetFullBatchData for MeshPipeline {
             return None;
         };
         let mesh_instance = mesh_instances.get(&entity)?;
+        let first_vertex_index =
+            match mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id) {
+                Some(mesh_vertex_slice) => mesh_vertex_slice.range.start,
+                None => 0,
+            };
         let maybe_lightmap = lightmaps.render_lightmaps.get(&entity);
 
         Some(MeshUniform::new(
             &mesh_instance.transforms,
+            first_vertex_index,
             maybe_lightmap.map(|lightmap| lightmap.uv_rect),
         ))
     }
@@ -2354,7 +2418,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
             }
             RenderMeshBufferInfo::NonIndexed => match indirect_parameters {
                 None => {
-                    pass.draw(0..gpu_mesh.vertex_count, batch_range.clone());
+                    pass.draw(vertex_buffer_slice.range, batch_range.clone());
                 }
                 Some((indirect_parameters_offset, indirect_parameters_buffer)) => {
                     pass.draw_indirect(indirect_parameters_buffer, indirect_parameters_offset);

diff --git a/crates/bevy_pbr/src/render/mesh.wgsl b/crates/bevy_pbr/src/render/mesh.wgsl
@@ -1,4 +1,5 @@
 #import bevy_pbr::{
+    mesh_bindings::mesh,
     mesh_functions,
     skinning,
     morph::morph,
@@ -9,18 +10,21 @@
 #ifdef MORPH_TARGETS
 fn morph_vertex(vertex_in: Vertex) -> Vertex {
     var vertex = vertex_in;
+    let first_vertex = mesh[vertex.instance_index].first_vertex_index;
+    let vertex_index = vertex.index - first_vertex;
+
     let weight_count = bevy_pbr::morph::layer_count();
     for (var i: u32 = 0u; i < weight_count; i ++) {
         let weight = bevy_pbr::morph::weight_at(i);
         if weight == 0.0 {
             continue;
         }
-        vertex.position += weight * morph(vertex.index, bevy_pbr::morph::position_offset, i);
+        vertex.position += weight * morph(vertex_index, bevy_pbr::morph::position_offset, i);
 #ifdef VERTEX_NORMALS
-        vertex.normal += weight * morph(vertex.index, bevy_pbr::morph::normal_offset, i);
+        vertex.normal += weight * morph(vertex_index, bevy_pbr::morph::normal_offset, i);
 #endif
 #ifdef VERTEX_TANGENTS
-        vertex.tangent += vec4(weight * morph(vertex.index, bevy_pbr::morph::tangent_offset, i), 0.0);
+        vertex.tangent += vec4(weight * morph(vertex_index, bevy_pbr::morph::tangent_offset, i), 0.0);
 #endif
     }
     return vertex;

diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
@@ -22,6 +22,10 @@ struct MeshInput {
     // The index of this mesh's `MeshInput` in the `previous_input` array, if
     // applicable. If not present, this is `u32::MAX`.
     previous_input_index: u32,
+    first_vertex_index: u32,
+    pad_a: u32,
+    pad_b: u32,
+    pad_c: u32,
 }
 
 // Information about each mesh instance needed to cull it on GPU.
@@ -186,4 +190,5 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
     output[mesh_output_index].local_from_world_transpose_b = local_from_world_transpose_b;
     output[mesh_output_index].flags = current_input[input_index].flags;
     output[mesh_output_index].lightmap_uv_rect = current_input[input_index].lightmap_uv_rect;
+    output[mesh_output_index].first_vertex_index = current_input[input_index].first_vertex_index;
 }
diff --git a/crates/bevy_pbr/src/render/mesh_types.wgsl b/crates/bevy_pbr/src/render/mesh_types.wgsl
@@ -15,6 +15,11 @@ struct Mesh {
     // 'flags' is a bit field indicating various options. u32 is 32 bits so we have up to 32 options.
     flags: u32,
     lightmap_uv_rect: vec2<u32>,
+    // The index of the mesh's first vertex in the vertex buffer.
+    first_vertex_index: u32,
+    pad_a: u32,
+    pad_b: u32,
+    pad_c: u32,
 };
 
 #ifdef SKINNED

diff --git a/crates/bevy_pbr/src/volumetric_fog/render.rs b/crates/bevy_pbr/src/volumetric_fog/render.rs
@@ -472,10 +472,14 @@ impl ViewNode for VolumetricFogNode {
 
                     render_pass
                         .set_index_buffer(*index_buffer_slice.buffer.slice(..), *index_format);
-                    render_pass.draw_indexed(0..*count, 0, 0..1);
+                    render_pass.draw_indexed(
+                        index_buffer_slice.range.start..(index_buffer_slice.range.start + count),
+                        vertex_buffer_slice.range.start as i32,
+                        0..1,
+                    );
                 }
                 RenderMeshBufferInfo::NonIndexed => {
-                    render_pass.draw(0..render_mesh.vertex_count, 0..1);
+                    render_pass.draw(vertex_buffer_slice.range, 0..1);
                 }
             }
         }