diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7de79a43dd..b057f55d6a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -738,6 +738,8 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp
     src/video_core/renderer_vulkan/vk_resource_pool.h
     src/video_core/renderer_vulkan/vk_scheduler.cpp
     src/video_core/renderer_vulkan/vk_scheduler.h
+    src/video_core/renderer_vulkan/vk_shader_hle.cpp
+    src/video_core/renderer_vulkan/vk_shader_hle.h
     src/video_core/renderer_vulkan/vk_shader_util.cpp
     src/video_core/renderer_vulkan/vk_shader_util.h
     src/video_core/renderer_vulkan/vk_swapchain.cpp
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 1abdb230bb..e9fc064938 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -360,7 +360,8 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
     return {&buffer, buffer.Offset(device_addr)};
 }
 
-std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size) {
+std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
+    // Check if any buffer contains the full requested range.
     const u64 page = gpu_addr >> CACHING_PAGEBITS;
     const BufferId buffer_id = page_table[page];
     if (buffer_id) {
@@ -370,6 +371,13 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size)
             return {&buffer, buffer.Offset(gpu_addr)};
         }
     }
+    // If no buffer contains the full requested range but some buffer within was GPU-modified,
+    // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications.
+    // This is only done if the request prefers to use GPU memory, otherwise we can skip it.
+    if (prefer_gpu && memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
+        return ObtainBuffer(gpu_addr, size, false, false);
+    }
+    // In all other cases, just do a CPU copy to the staging buffer.
     const u32 offset = staging_buffer.Copy(gpu_addr, size, 16);
     return {&staging_buffer, offset};
 }
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 3dab95db75..e62913413a 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -96,7 +96,8 @@ class BufferCache {
                                                        BufferId buffer_id = {});
 
     /// Attempts to obtain a buffer without modifying the cache contents.
-    [[nodiscard]] std::pair<Buffer*, u32> ObtainViewBuffer(VAddr gpu_addr, u32 size);
+    [[nodiscard]] std::pair<Buffer*, u32> ObtainViewBuffer(VAddr gpu_addr, u32 size,
+                                                           bool prefer_gpu);
 
     /// Return true when a region is registered on the cache
     [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 0471fdb0a5..33358b8503 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -8,6 +8,7 @@
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_rasterizer.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_shader_hle.h"
 #include "video_core/texture_cache/image_view.h"
 #include "video_core/texture_cache/texture_cache.h"
 #include "vk_rasterizer.h"
@@ -318,6 +319,11 @@ void Rasterizer::DispatchDirect() {
         return;
     }
 
+    const auto& cs = pipeline->GetStage(Shader::Stage::Compute);
+    if (ExecuteShaderHLE(cs, liverpool->regs, *this)) {
+        return;
+    }
+
     if (!BindResources(pipeline)) {
         return;
     }
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 1936276a21..9214372ee5 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -28,6 +28,14 @@ class Rasterizer {
                         AmdGpu::Liverpool* liverpool);
     ~Rasterizer();
 
+    [[nodiscard]] Scheduler& GetScheduler() noexcept {
+        return scheduler;
+    }
+
+    [[nodiscard]] VideoCore::BufferCache& GetBufferCache() noexcept {
+        return buffer_cache;
+    }
+
     [[nodiscard]] VideoCore::TextureCache& GetTextureCache() noexcept {
         return texture_cache;
     }
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 1140bfbc21..45a9228c9d 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -10,6 +10,10 @@
 #include "video_core/renderer_vulkan/vk_master_semaphore.h"
 #include "video_core/renderer_vulkan/vk_resource_pool.h"
 
+namespace tracy {
+class VkCtxScope;
+}
+
 namespace Vulkan {
 
 class Instance;
diff --git a/src/video_core/renderer_vulkan/vk_shader_hle.cpp b/src/video_core/renderer_vulkan/vk_shader_hle.cpp
new file mode 100644
index 0000000000..df9d40f079
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_shader_hle.cpp
@@ -0,0 +1,139 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/info.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_shader_hle.h"
+
+#include "vk_rasterizer.h"
+
+namespace Vulkan {
+
+static constexpr u64 COPY_SHADER_HASH = 0xfefebf9f;
+
+bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
+                          Rasterizer& rasterizer) {
+    auto& scheduler = rasterizer.GetScheduler();
+    auto& buffer_cache = rasterizer.GetBufferCache();
+
+    // Copy shader defines three formatted buffers as inputs: control, source, and destination.
+    const auto ctl_buf_sharp = info.texture_buffers[0].GetSharp(info);
+    const auto src_buf_sharp = info.texture_buffers[1].GetSharp(info);
+    const auto dst_buf_sharp = info.texture_buffers[2].GetSharp(info);
+    const auto buf_stride = src_buf_sharp.GetStride();
+    ASSERT(buf_stride == dst_buf_sharp.GetStride());
+
+    struct CopyShaderControl {
+        u32 dst_idx;
+        u32 src_idx;
+        u32 end;
+    };
+    static_assert(sizeof(CopyShaderControl) == 12);
+    ASSERT(ctl_buf_sharp.GetStride() == sizeof(CopyShaderControl));
+    const auto ctl_buf = reinterpret_cast<const CopyShaderControl*>(ctl_buf_sharp.base_address);
+
+    static std::vector<vk::BufferCopy> copies;
+    copies.clear();
+    copies.reserve(regs.cs_program.dim_x);
+
+    for (u32 i = 0; i < regs.cs_program.dim_x; i++) {
+        const auto& [dst_idx, src_idx, end] = ctl_buf[i];
+        const u32 local_dst_offset = dst_idx * buf_stride;
+        const u32 local_src_offset = src_idx * buf_stride;
+        const u32 local_size = (end + 1) * buf_stride;
+        copies.emplace_back(local_src_offset, local_dst_offset, local_size);
+    }
+
+    scheduler.EndRendering();
+
+    static constexpr vk::MemoryBarrier READ_BARRIER{
+        .srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
+        .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
+    };
+    static constexpr vk::MemoryBarrier WRITE_BARRIER{
+        .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+        .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
+    };
+    scheduler.CommandBuffer().pipelineBarrier(
+        vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eTransfer,
+        vk::DependencyFlagBits::eByRegion, READ_BARRIER, {}, {});
+
+    static constexpr vk::DeviceSize MaxDistanceForMerge = 64_MB;
+    u32 batch_start = 0;
+    u32 batch_end = 1;
+
+    while (batch_end < copies.size()) {
+        // Place first copy into the current batch
+        const auto& copy = copies[batch_start];
+        auto src_offset_min = copy.srcOffset;
+        auto src_offset_max = copy.srcOffset + copy.size;
+        auto dst_offset_min = copy.dstOffset;
+        auto dst_offset_max = copy.dstOffset + copy.size;
+
+        for (int i = batch_start + 1; i < copies.size(); i++) {
+            // Compute new src and dst bounds if we were to batch this copy
+            const auto [src_offset, dst_offset, size] = copies[i];
+            auto new_src_offset_min = std::min(src_offset_min, src_offset);
+            auto new_src_offset_max = std::max(src_offset_max, src_offset + size);
+            if (new_src_offset_max - new_src_offset_min > MaxDistanceForMerge) {
+                continue;
+            }
+
+            auto new_dst_offset_min = std::min(dst_offset_min, dst_offset);
+            auto new_dst_offset_max = std::max(dst_offset_max, dst_offset + size);
+            if (new_dst_offset_max - new_dst_offset_min > MaxDistanceForMerge) {
+                continue;
+            }
+
+            // We can batch this copy
+            src_offset_min = new_src_offset_min;
+            src_offset_max = new_src_offset_max;
+            dst_offset_min = new_dst_offset_min;
+            dst_offset_max = new_dst_offset_max;
+            if (i != batch_end) {
+                std::swap(copies[i], copies[batch_end]);
+            }
+            ++batch_end;
+        }
+
+        // Obtain buffers for the total source and destination ranges.
+        const auto [src_buf, src_buf_offset] =
+            buffer_cache.ObtainBuffer(src_buf_sharp.base_address + src_offset_min,
+                                      src_offset_max - src_offset_min, false, false);
+        const auto [dst_buf, dst_buf_offset] =
+            buffer_cache.ObtainBuffer(dst_buf_sharp.base_address + dst_offset_min,
+                                      dst_offset_max - dst_offset_min, true, false);
+
+        // Apply found buffer base.
+        const auto vk_copies = std::span{copies}.subspan(batch_start, batch_end - batch_start);
+        for (auto& copy : vk_copies) {
+            copy.srcOffset = copy.srcOffset - src_offset_min + src_buf_offset;
+            copy.dstOffset = copy.dstOffset - dst_offset_min + dst_buf_offset;
+        }
+
+        // Execute buffer copies.
+        LOG_TRACE(Render_Vulkan, "HLE buffer copy: src_size = {}, dst_size = {}",
+                  src_offset_max - src_offset_min, dst_offset_max - dst_offset_min);
+        scheduler.CommandBuffer().copyBuffer(src_buf->Handle(), dst_buf->Handle(), vk_copies);
+        batch_start = batch_end;
+        ++batch_end;
+    }
+
+    scheduler.CommandBuffer().pipelineBarrier(
+        vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands,
+        vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
+
+    return true;
+}
+
+bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
+                      Rasterizer& rasterizer) {
+    switch (info.pgm_hash) {
+    case COPY_SHADER_HASH:
+        return ExecuteCopyShaderHLE(info, regs, rasterizer);
+    default:
+        return false;
+    }
+}
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_shader_hle.h b/src/video_core/renderer_vulkan/vk_shader_hle.h
new file mode 100644
index 0000000000..fda9b1735f
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_shader_hle.h
@@ -0,0 +1,20 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "video_core/amdgpu/liverpool.h"
+
+namespace Shader {
+struct Info;
+}
+
+namespace Vulkan {
+
+class Rasterizer;
+
+/// Attempts to execute a shader using HLE if possible.
+bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
+                      Rasterizer& rasterizer);
+
+} // namespace Vulkan
diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp
index 1670648b36..0e5bbc1f3a 100644
--- a/src/video_core/texture_cache/texture_cache.cpp
+++ b/src/video_core/texture_cache/texture_cache.cpp
@@ -466,6 +466,9 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
     const auto& num_mips = image.info.resources.levels;
     ASSERT(num_mips == image.info.mips_layout.size());
 
+    const bool is_gpu_modified = True(image.flags & ImageFlagBits::GpuModified);
+    const bool is_gpu_dirty = True(image.flags & ImageFlagBits::GpuDirty);
+
     boost::container::small_vector<vk::BufferImageCopy, 14> image_copy{};
     for (u32 m = 0; m < num_mips; m++) {
         const u32 width = std::max(image.info.size.width >> m, 1u);
@@ -475,8 +478,6 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
         const auto& mip = image.info.mips_layout[m];
 
         // Protect GPU modified resources from accidental CPU reuploads.
-        const bool is_gpu_modified = True(image.flags & ImageFlagBits::GpuModified);
-        const bool is_gpu_dirty = True(image.flags & ImageFlagBits::GpuDirty);
         if (is_gpu_modified && !is_gpu_dirty) {
             const u8* addr = std::bit_cast<const u8*>(image.info.guest_address);
             const u64 hash = XXH3_64bits(addr + mip.offset, mip.size);
@@ -515,7 +516,8 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
 
     const VAddr image_addr = image.info.guest_address;
     const size_t image_size = image.info.guest_size_bytes;
-    const auto [vk_buffer, buf_offset] = buffer_cache.ObtainViewBuffer(image_addr, image_size);
+    const auto [vk_buffer, buf_offset] =
+        buffer_cache.ObtainViewBuffer(image_addr, image_size, is_gpu_dirty);
     // The obtained buffer may be written by a shader so we need to emit a barrier to prevent RAW
     // hazard
     if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,
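Reviewer note: the core of ExecuteCopyShaderHLE is the greedy merge loop that grows each batch as long as both the merged source span and the merged destination span stay within MaxDistanceForMerge. The snippet below restates that heuristic in isolation as a reading aid; it is not code from this change. BufferCopy, GrowBatch, and CopyInBatches are illustrative names introduced here, the loop structure is simplified relative to the PR, and the actual buffer-cache and Vulkan calls are only indicated by comments.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Illustrative stand-in for vk::BufferCopy.
struct BufferCopy {
    uint64_t srcOffset;
    uint64_t dstOffset;
    uint64_t size;
};

// Greedily grows the batch that starts at batch_start: any later copy whose inclusion keeps both
// the merged source span and the merged destination span within max_distance is swapped next to
// the batch. Returns one past the last index belonging to the batch.
std::size_t GrowBatch(std::vector<BufferCopy>& copies, std::size_t batch_start,
                      uint64_t max_distance) {
    const BufferCopy& first = copies[batch_start];
    uint64_t src_min = first.srcOffset;
    uint64_t src_max = first.srcOffset + first.size;
    uint64_t dst_min = first.dstOffset;
    uint64_t dst_max = first.dstOffset + first.size;

    std::size_t batch_end = batch_start + 1;
    for (std::size_t i = batch_start + 1; i < copies.size(); i++) {
        const BufferCopy& copy = copies[i];
        // Compute the bounds the batch would have if this copy were merged in.
        const uint64_t new_src_min = std::min(src_min, copy.srcOffset);
        const uint64_t new_src_max = std::max(src_max, copy.srcOffset + copy.size);
        if (new_src_max - new_src_min > max_distance) {
            continue;
        }
        const uint64_t new_dst_min = std::min(dst_min, copy.dstOffset);
        const uint64_t new_dst_max = std::max(dst_max, copy.dstOffset + copy.size);
        if (new_dst_max - new_dst_min > max_distance) {
            continue;
        }
        // The copy fits: widen the batch bounds and move it adjacent to the batch.
        src_min = new_src_min;
        src_max = new_src_max;
        dst_min = new_dst_min;
        dst_max = new_dst_max;
        if (i != batch_end) {
            std::swap(copies[i], copies[batch_end]);
        }
        ++batch_end;
    }
    return batch_end;
}

// Hypothetical driver loop, mirroring how ExecuteCopyShaderHLE consumes batches: each batch yields
// one merged source range, one merged destination range, and one multi-region copy command.
void CopyInBatches(std::vector<BufferCopy>& copies, uint64_t max_distance) {
    std::size_t batch_start = 0;
    while (batch_start < copies.size()) {
        const std::size_t batch_end = GrowBatch(copies, batch_start, max_distance);
        // In the PR this is where ObtainBuffer() is called for the merged source and destination
        // bounds, each region's srcOffset/dstOffset is rebased onto the obtained buffers, and
        // copyBuffer() is recorded with copies[batch_start..batch_end).
        batch_start = batch_end;
    }
}
```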
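On the design itself: capping both spans at MaxDistanceForMerge (64_MB) bounds the size of the buffers ObtainBuffer has to materialize per batch, while still letting neighbouring control-buffer entries collapse into a single copyBuffer call with many vk::BufferCopy regions. The swap-into-place step also partitions copies by batch, so each subspan handed to copyBuffer is contiguous, and the READ_BARRIER/WRITE_BARRIER pair around the loop orders the transfer against surrounding compute and graphics work in place of the original dispatch.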