Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Workaround NV bug #4132

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions tests/tests/regression/issue_4122.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
use std::{num::NonZeroU64, ops::Range};

use wasm_bindgen_test::wasm_bindgen_test;
use wgpu_test::{initialize_test, TestParameters, TestingContext};

fn fill_test(ctx: &TestingContext, range: Range<u64>, size: u64) -> bool {
let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
label: Some("gpu_buffer"),
size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
});

let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
label: Some("cpu_buffer"),
size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});

// Initialize the whole buffer with values.
let buffer_contents = vec![0xFF_u8; size as usize];
ctx.queue.write_buffer(&gpu_buffer, 0, &buffer_contents);

let mut encoder = ctx
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("encoder"),
});

encoder.clear_buffer(
&gpu_buffer,
range.start,
NonZeroU64::new(range.end - range.start),
);
encoder.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, size);

ctx.queue.submit(Some(encoder.finish()));
cpu_buffer.slice(..).map_async(wgpu::MapMode::Read, |_| ());
ctx.device.poll(wgpu::Maintain::Wait);

let buffer_slice = cpu_buffer.slice(..);
let buffer_data = buffer_slice.get_mapped_range();

let first_clear_byte = buffer_data
.iter()
.enumerate()
.find_map(|(index, byte)| (*byte == 0x00).then_some(index))
.expect("No clear happened at all");

let first_dirty_byte = buffer_data
.iter()
.enumerate()
.skip(first_clear_byte)
.find_map(|(index, byte)| (*byte != 0x00).then_some(index))
.unwrap_or(size as usize);

let second_clear_byte = buffer_data
.iter()
.enumerate()
.skip(first_dirty_byte)
.find_map(|(index, byte)| (*byte == 0x00).then_some(index));

if second_clear_byte.is_some() {
eprintln!("Found multiple cleared ranges instead of a single clear range of {}..{} on a buffer of size {}.", range.start, range.end, size);
return false;
}

let cleared_range = first_clear_byte as u64..first_dirty_byte as u64;

if cleared_range != range {
eprintln!(
"Cleared range is {}..{}, but the clear range is {}..{} on a buffer of size {}.",
cleared_range.start, cleared_range.end, range.start, range.end, size
);
return false;
}

eprintln!(
"Cleared range is {}..{} on a buffer of size {}.",
cleared_range.start, cleared_range.end, size
);

true
}

/// Nvidia has a bug in vkCmdFillBuffer where the clear range is not properly respected under
/// certain conditions. See https://github.com/gfx-rs/wgpu/issues/4122 for more information.
///
/// This test will fail on nvidia if the bug is not properly worked around.
#[wasm_bindgen_test]
#[test]
fn clear_buffer_bug() {
initialize_test(TestParameters::default(), |ctx| {
// This hits most of the cases in nvidia's clear buffer bug
let mut succeeded = true;
for power in 4..14 {
let size = 1 << power;
for start_offset in (0..=36).step_by(4) {
for size_offset in (0..=36).step_by(4) {
let range = start_offset..size + size_offset + start_offset;
let result = fill_test(&ctx, range, 1 << 16);

succeeded &= result;
}
}
}
assert!(succeeded);
});
}
1 change: 1 addition & 0 deletions tests/tests/root.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use wasm_bindgen_test::wasm_bindgen_test_configure;
mod regression {
mod issue_3457;
mod issue_4024;
mod issue_4122;
}

mod bind_group_layout_dedup;
Expand Down
4 changes: 4 additions & 0 deletions wgpu-hal/src/vulkan/adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,10 @@ impl super::Instance {
super::Workarounds::EMPTY_RESOLVE_ATTACHMENT_LISTS,
phd_capabilities.properties.vendor_id == db::qualcomm::VENDOR,
);
workarounds.set(
super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16,
phd_capabilities.properties.vendor_id == db::nvidia::VENDOR,
);
};

if phd_capabilities.effective_api_version == vk::API_VERSION_1_0
Expand Down
47 changes: 38 additions & 9 deletions wgpu-hal/src/vulkan/command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,15 +212,44 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
}

unsafe fn clear_buffer(&mut self, buffer: &super::Buffer, range: crate::MemoryRange) {
unsafe {
self.device.raw.cmd_fill_buffer(
self.active,
buffer.raw,
range.start,
range.end - range.start,
0,
)
};
let range_size = range.end - range.start;
if self.device.workarounds.contains(
super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16,
) && range_size >= 4096
&& range.start % 16 != 0
{
let rounded_start = wgt::math::align_to(range.start, 16);
let prefix_size = rounded_start - range.start;

unsafe {
self.device.raw.cmd_fill_buffer(
self.active,
buffer.raw,
range.start,
prefix_size,
0,
)
};

// This will never be zero, as rounding can only add up to 12 bytes, and the total size is 4096.
let suffix_size = range.end - rounded_start;

unsafe {
self.device.raw.cmd_fill_buffer(
self.active,
buffer.raw,
rounded_start,
suffix_size,
0,
)
};
} else {
unsafe {
self.device
.raw
.cmd_fill_buffer(self.active, buffer.raw, range.start, range_size, 0)
};
}
}

unsafe fn copy_buffer_to_buffer<T>(
Expand Down
22 changes: 22 additions & 0 deletions wgpu-hal/src/vulkan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,28 @@ bitflags::bitflags!(
/// Qualcomm OOMs when there are zero color attachments but a non-null pointer
/// to a subpass resolve attachment array. This nulls out that pointer in that case.
const EMPTY_RESOLVE_ATTACHMENT_LISTS = 0x2;
/// If the following code returns false, then nvidia will end up filling the wrong range.
///
/// ```skip
/// fn nvidia_succeeds() -> bool {
/// # let (copy_length, start_offset) = (0, 0);
/// if copy_length >= 4096 {
/// if start_offset % 16 != 0 {
/// if copy_length == 4096 {
/// return true;
/// }
/// if copy_length % 16 == 0 {
/// return false;
/// }
/// }
/// }
/// true
/// }
/// ```
///
/// As such, we need to make sure all calls to vkCmdFillBuffer are aligned to 16 bytes
/// if they cover a range of 4096 bytes or more.
const FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16 = 0x4;
}
);

Expand Down