From 7710f7966d4947ff8e3da34dbf393b9e6e01ec16 Mon Sep 17 00:00:00 2001 From: Connor Fitzgerald Date: Wed, 13 Sep 2023 16:58:05 -0400 Subject: [PATCH] Workaround NV bug --- tests/tests/regression/issue_4122.rs | 123 +++++++++++++++++++++++++++ tests/tests/root.rs | 1 + wgpu-hal/src/vulkan/adapter.rs | 4 + wgpu-hal/src/vulkan/command.rs | 47 ++++++++-- wgpu-hal/src/vulkan/mod.rs | 6 ++ 5 files changed, 172 insertions(+), 9 deletions(-) create mode 100644 tests/tests/regression/issue_4122.rs diff --git a/tests/tests/regression/issue_4122.rs b/tests/tests/regression/issue_4122.rs new file mode 100644 index 00000000000..a661c80466e --- /dev/null +++ b/tests/tests/regression/issue_4122.rs @@ -0,0 +1,123 @@ +use std::{num::NonZeroU64, ops::Range}; + +use wasm_bindgen_test::wasm_bindgen_test; +use wgpu_test::{initialize_test, TestParameters, TestingContext}; + +fn fill_test(ctx: &TestingContext, range: Range, size: u64) -> bool { + let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("gpu_buffer"), + size, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: false, + }); + + let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("cpu_buffer"), + size, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + }); + + // Initialize the whole buffer with values. + let buffer_contents = vec![0xFF_u8; size as usize]; + ctx.queue.write_buffer(&gpu_buffer, 0, &buffer_contents); + + let mut encoder = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("encoder"), + }); + + encoder.clear_buffer( + &gpu_buffer, + range.start, + NonZeroU64::new(range.end - range.start), + ); + encoder.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, size); + + ctx.queue.submit(Some(encoder.finish())); + cpu_buffer.slice(..).map_async(wgpu::MapMode::Read, |_| ()); + ctx.device.poll(wgpu::Maintain::Wait); + + let buffer_slice = cpu_buffer.slice(..); + let buffer_data = buffer_slice.get_mapped_range(); + + let first_clear_byte = buffer_data + .iter() + .enumerate() + .find_map(|(index, byte)| (*byte == 0x00).then_some(index)) + .expect("No clear happened at all"); + + let first_dirty_byte = buffer_data + .iter() + .enumerate() + .skip(first_clear_byte) + .find_map(|(index, byte)| (*byte != 0x00).then_some(index)) + .unwrap_or(size as usize); + + let second_clear_byte = buffer_data + .iter() + .enumerate() + .skip(first_dirty_byte) + .find_map(|(index, byte)| (*byte == 0x00).then_some(index)); + + if second_clear_byte.is_some() { + eprintln!("Found multiple cleared ranges instead of a single clear range of {}..{} on a buffer of size {}.", range.start, range.end, size); + return false; + } + + let cleared_range = first_clear_byte as u64..first_dirty_byte as u64; + + if cleared_range != range { + eprintln!( + "Cleared range is {}..{}, but the clear range is {}..{} on a buffer of size {}.", + cleared_range.start, cleared_range.end, range.start, range.end, size + ); + return false; + } + + eprintln!( + "Cleared range is {}..{} on a buffer of size {}.", + cleared_range.start, cleared_range.end, size + ); + + return true; +} + +#[wasm_bindgen_test] +#[test] +fn clear_buffer_bug() { + initialize_test(TestParameters::default(), |ctx| { + let mut succeeded = true; + for power in 4..16 { + let size = 1 << power; + for start_offset in (0..=68).step_by(4) { + for size_offset in (0..=68).step_by(4) { + let range = start_offset..size + size_offset + start_offset; + let result = fill_test(&ctx, range.clone(), 1 << 16); + + let copy_length = range.end - range.start; + + // This is the precondiiton for the nvidia bug. This is for reference only, + // as the bug is worked around. + let _succeeds = 'b: { + if copy_length >= 4096 { + if start_offset % 16 != 0 { + if copy_length == 4096 { + break 'b true; + } + if copy_length % 16 == 0 { + break 'b false; + } + } + } + true + }; + + succeeded &= result; + } + } + } + assert!(succeeded); + }); +} diff --git a/tests/tests/root.rs b/tests/tests/root.rs index 25df8eda903..85901ae4919 100644 --- a/tests/tests/root.rs +++ b/tests/tests/root.rs @@ -3,6 +3,7 @@ use wasm_bindgen_test::wasm_bindgen_test_configure; mod regression { mod issue_3457; mod issue_4024; + mod issue_4122; } mod bind_group_layout_dedup; diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs index 4a7ccf9535c..bcbab850845 100644 --- a/wgpu-hal/src/vulkan/adapter.rs +++ b/wgpu-hal/src/vulkan/adapter.rs @@ -984,6 +984,10 @@ impl super::Instance { super::Workarounds::EMPTY_RESOLVE_ATTACHMENT_LISTS, phd_capabilities.properties.vendor_id == db::qualcomm::VENDOR, ); + workarounds.set( + super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16, + phd_capabilities.properties.vendor_id == db::nvidia::VENDOR, + ); }; if phd_capabilities.effective_api_version == vk::API_VERSION_1_0 diff --git a/wgpu-hal/src/vulkan/command.rs b/wgpu-hal/src/vulkan/command.rs index c2e7afe3f1e..391b754d335 100644 --- a/wgpu-hal/src/vulkan/command.rs +++ b/wgpu-hal/src/vulkan/command.rs @@ -212,15 +212,44 @@ impl crate::CommandEncoder for super::CommandEncoder { } unsafe fn clear_buffer(&mut self, buffer: &super::Buffer, range: crate::MemoryRange) { - unsafe { - self.device.raw.cmd_fill_buffer( - self.active, - buffer.raw, - range.start, - range.end - range.start, - 0, - ) - }; + let range_size = range.end - range.start; + if self.device.workarounds.contains( + super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16, + ) && range_size >= 4096 + && range.start % 16 != 0 + { + let rounded_start = wgt::math::align_to(range.start, 16); + let prefix_size = rounded_start - range.start; + + unsafe { + self.device.raw.cmd_fill_buffer( + self.active, + buffer.raw, + range.start, + prefix_size, + 0, + ) + }; + + // This will never be zero, as rounding can only add up to 12 bytes, and the total size is 4096. + let suffix_size = range.end - rounded_start; + + unsafe { + self.device.raw.cmd_fill_buffer( + self.active, + buffer.raw, + rounded_start, + suffix_size, + 0, + ) + }; + } else { + unsafe { + self.device + .raw + .cmd_fill_buffer(self.active, buffer.raw, range.start, range_size, 0) + }; + } } unsafe fn copy_buffer_to_buffer( diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs index c2165e1dd83..6a162aae9e2 100644 --- a/wgpu-hal/src/vulkan/mod.rs +++ b/wgpu-hal/src/vulkan/mod.rs @@ -207,6 +207,12 @@ bitflags::bitflags!( /// Qualcomm OOMs when there are zero color attachments but a non-null pointer /// to a subpass resolve attachment array. This nulls out that pointer in that case. const EMPTY_RESOLVE_ATTACHMENT_LISTS = 0x2; + /// See issue_4122.rs for details on this nvidia issue. + /// + /// All fill buffers with offsets not aligned to 16 bytes and size more than 4096 bytes + /// will not fill the correct range. Workaround this by splitting the fill into multiple + /// calls, one with small size and unaligned offset, and one with aligned offset and large size. + const FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16 = 0x4; } );