From cc1bb86af57f385c6532e41020b5e3a11200d654 Mon Sep 17 00:00:00 2001 From: Rys Sommefeldt Date: Wed, 26 Aug 2020 17:15:08 +0100 Subject: [PATCH] FidelityFX SSSR release v1.1 --- .gitlab-ci.yml | 18 +- docs/FFX_SSSR_API.pdf | Bin 131 -> 0 bytes docs/FFX_SSSR_GUI.pdf | Bin 131 -> 131 bytes docs/FFX_SSSR_Technology.pdf | Bin 132 -> 132 bytes ffx-sssr/CMakeLists.txt | 30 +- ffx-sssr/README.md | 77 +- ffx-sssr/inc/ffx_sssr.h | 18 +- ffx-sssr/inc/ffx_sssr_d3d12.h | 2 +- ffx-sssr/inc/ffx_sssr_vk.h | 61 + ffx-sssr/shaders/classify_tiles.hlsl | 30 +- ffx-sssr/shaders/common.hlsl | 2 +- ffx-sssr/shaders/intersect.hlsl | 35 +- ffx-sssr/shaders/prepare_indirect_args.hlsl | 11 +- ffx-sssr/shaders/resolve_eaw.hlsl | 18 +- ffx-sssr/shaders/resolve_eaw_stride.hlsl | 100 - ffx-sssr/shaders/resolve_spatial.hlsl | 23 +- ffx-sssr/shaders/resolve_temporal.hlsl | 36 +- ffx-sssr/src/context.cpp | 78 +- ffx-sssr/src/context.h | 20 +- ffx-sssr/src/context.inl | 49 +- ffx-sssr/src/d3d12/context_d3d12.cpp | 443 +++- ffx-sssr/src/d3d12/context_d3d12.h | 63 +- ffx-sssr/src/d3d12/context_d3d12.inl | 132 +- ffx-sssr/src/d3d12/reflection_view_d3d12.cpp | 593 +----- ffx-sssr/src/d3d12/reflection_view_d3d12.h | 85 +- ffx-sssr/src/d3d12/reflection_view_d3d12.inl | 106 - ffx-sssr/src/d3d12/shader_compiler_d3d12.cpp | 24 + ffx-sssr/src/reflection_error.cpp | 4 +- ffx-sssr/src/reflection_error.h | 4 +- ffx-sssr/src/vk/buffer_vk.cpp | 245 +++ ffx-sssr/src/vk/buffer_vk.h | 68 + ffx-sssr/src/vk/context_vk.cpp | 726 +++++++ ffx-sssr/src/vk/context_vk.h | 170 ++ ffx-sssr/src/vk/context_vk.inl | 206 ++ ffx-sssr/src/vk/image_vk.cpp | 208 ++ ffx-sssr/src/vk/image_vk.h | 66 + ffx-sssr/src/vk/reflection_view_vk.cpp | 1094 ++++++++++ ffx-sssr/src/vk/reflection_view_vk.h | 164 ++ ffx-sssr/src/vk/sampler_vk.cpp | 74 + ffx-sssr/src/vk/sampler_vk.h | 55 + ffx-sssr/src/vk/shader_compiler_vk.cpp | 220 ++ ffx-sssr/src/vk/shader_compiler_vk.h | 86 + ffx-sssr/src/vk/shader_compiler_vk.inl | 82 + ffx-sssr/src/vk/upload_buffer_vk.cpp | 100 + ffx-sssr/src/vk/upload_buffer_vk.h | 88 + ffx-sssr/src/vk/upload_buffer_vk.inl | 104 + sample/CMakeLists.txt | 14 + sample/README.md | 5 +- sample/build/GenerateSolutions.bat | 5 + sample/libs/cauldron | 2 +- sample/libs/dxc/CMakeLists.txt | 5 + sample/src/Common/config.json | 28 + sample/src/DX12/Sources/SampleRenderer.cpp | 58 +- sample/src/DX12/Sources/SampleRenderer.h | 11 +- sample/src/DX12/Sources/SssrSample.cpp | 213 +- sample/src/DX12/Sources/SssrSample.h | 14 +- sample/src/VK/CMakeLists.txt | 64 + sample/src/VK/Shaders/ApplyReflections.hlsl | 98 + sample/src/VK/Shaders/DepthDownsample.hlsl | 99 + sample/src/VK/Shaders/ffx_a.h | 1907 +++++++++++++++++ sample/src/VK/Shaders/ffx_spd.h | 1164 +++++++++++ sample/src/VK/Sources/SampleRenderer.cpp | 1934 ++++++++++++++++++ sample/src/VK/Sources/SampleRenderer.h | 256 +++ sample/src/VK/Sources/SssrSample.cpp | 591 ++++++ sample/src/VK/Sources/SssrSample.h | 81 + sample/src/VK/Sources/stdafx.cpp | 10 + sample/src/VK/Sources/stdafx.h | 66 + sample/src/VK/dpiawarescaling.manifest | 8 + 68 files changed, 11245 insertions(+), 1206 deletions(-) delete mode 100644 docs/FFX_SSSR_API.pdf create mode 100644 ffx-sssr/inc/ffx_sssr_vk.h delete mode 100644 ffx-sssr/shaders/resolve_eaw_stride.hlsl delete mode 100644 ffx-sssr/src/d3d12/reflection_view_d3d12.inl create mode 100644 ffx-sssr/src/vk/buffer_vk.cpp create mode 100644 ffx-sssr/src/vk/buffer_vk.h create mode 100644 ffx-sssr/src/vk/context_vk.cpp create mode 100644 ffx-sssr/src/vk/context_vk.h create mode 100644 
ffx-sssr/src/vk/context_vk.inl create mode 100644 ffx-sssr/src/vk/image_vk.cpp create mode 100644 ffx-sssr/src/vk/image_vk.h create mode 100644 ffx-sssr/src/vk/reflection_view_vk.cpp create mode 100644 ffx-sssr/src/vk/reflection_view_vk.h create mode 100644 ffx-sssr/src/vk/sampler_vk.cpp create mode 100644 ffx-sssr/src/vk/sampler_vk.h create mode 100644 ffx-sssr/src/vk/shader_compiler_vk.cpp create mode 100644 ffx-sssr/src/vk/shader_compiler_vk.h create mode 100644 ffx-sssr/src/vk/shader_compiler_vk.inl create mode 100644 ffx-sssr/src/vk/upload_buffer_vk.cpp create mode 100644 ffx-sssr/src/vk/upload_buffer_vk.h create mode 100644 ffx-sssr/src/vk/upload_buffer_vk.inl create mode 100644 sample/libs/dxc/CMakeLists.txt create mode 100644 sample/src/VK/CMakeLists.txt create mode 100644 sample/src/VK/Shaders/ApplyReflections.hlsl create mode 100644 sample/src/VK/Shaders/DepthDownsample.hlsl create mode 100644 sample/src/VK/Shaders/ffx_a.h create mode 100644 sample/src/VK/Shaders/ffx_spd.h create mode 100644 sample/src/VK/Sources/SampleRenderer.cpp create mode 100644 sample/src/VK/Sources/SampleRenderer.h create mode 100644 sample/src/VK/Sources/SssrSample.cpp create mode 100644 sample/src/VK/Sources/SssrSample.h create mode 100644 sample/src/VK/Sources/stdafx.cpp create mode 100644 sample/src/VK/Sources/stdafx.h create mode 100644 sample/src/VK/dpiawarescaling.manifest diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e675369..dd04311 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,6 +18,18 @@ build_dx12: paths: - sample/bin/ +build_vk: + tags: + - windows + - amd64 + stage: build + script: + - 'cmake -S sample -B sample/build/VK -G "Visual Studio 15 2017" -A x64 -DGFX_API=VK' + - 'cmake --build sample/build/VK --config Release' + artifacts: + paths: + - sample/bin/ + package_sample: tags: - windows @@ -25,10 +37,13 @@ package_sample: stage: deploy dependencies: - build_dx12 + - build_vk script: - echo "Packaging build" - echo cd .\sample\bin\ > %SampleName%_DX12.bat - echo start %SampleName%_DX12.exe >> %SampleName%_DX12.bat + - echo cd .\sample\bin\ > %SampleName%_VK.bat + - echo start %SampleName%_VK.exe >> %SampleName%_VK.bat artifacts: name: "%SampleName%-%CI_COMMIT_TAG%-%CI_COMMIT_REF_NAME%-%CI_COMMIT_SHORT_SHA%" paths: @@ -38,4 +53,5 @@ package_sample: - sample/bin/ - sample/media/ - docs/ - - "%SampleName%_DX12.bat" \ No newline at end of file + - "%SampleName%_DX12.bat" + - "%SampleName%_VK.bat" \ No newline at end of file diff --git a/docs/FFX_SSSR_API.pdf b/docs/FFX_SSSR_API.pdf deleted file mode 100644 index c56f1e54207c1346682cd3497f31aaa449620576..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 131 zcmWN_!41P83;@7CQ?Nh-7%(B*1Y-(PTcSep==9C&q`UY#TK~v8=P@>=o^3u}Wh}S# zObhk58V8fQjOZ=PQFrjTrJby{+35gib4d)&bAnl7J5K|prM5`cJ@I$c^9$jJHM$38x}Ol7jMc-S;=-4Vfv7td8i eh!aO0l$edhGCV;?Ku?QCmQbGi%@ZrCYWV^0(ic|% delta 83 zcmV~$yA6Oa3I-ZEO4Vpx`7UxY=ZK9t0N2NjbCz dP@9BjH0!O;8tJ$~6l9WmLd$c%xpxsBUVgwU7OMaN diff --git a/docs/FFX_SSSR_Technology.pdf b/docs/FFX_SSSR_Technology.pdf index 61e864da59c11c23345313c7879a3f43ff995091..aed2649fbd8e163e9a2bf26bb5cd32e546a7ac87 100644 GIT binary patch delta 85 zcmWN`u@S%^2mrvdb&8CDfCQGv4uX1Toi4QlWaQ-A-)W~;F6HPLC|;-v2B4D-;@Opj fv0x4G@EK-crmcZm4f&Tlpu)`edd6|G6c)-K-(?nY delta 85 zcmV~$u@QhU2nEoy%@mFR<`cpa?f{0|S=(7>07v${wXN-Ir%htaI#ktZ+6Bq1+#?1@ e7(+*I0da#nPsAZgi4IG+I$L?}H+LmiZBl*!L>CMI diff --git a/ffx-sssr/CMakeLists.txt b/ffx-sssr/CMakeLists.txt index 986bbab..93d9090 100644 --- a/ffx-sssr/CMakeLists.txt +++ 
b/ffx-sssr/CMakeLists.txt @@ -4,8 +4,14 @@ project(stochastic-screen-space-reflections) find_package(PythonInterp 3.6 REQUIRED) -option(FFX_SSSR_NO_D3D12 "Stochastic Screen Space Reflections - Skip D3D12 backend" OFF) -option(FFX_SSSR_NO_VK "Stochastic Screen Space Reflections - Skip VK backend" OFF) +# ensure that only one option is enabled +if(FFX_SSSR_VK AND FFX_SSSR_D3D12) + message(FATAL_ERROR "FFX_SSSR_VK and FFX_SSSR_D3D12 are enabled. Please make sure to enable only one at a time.") +endif() + +if(FFX_SSSR_VK) + find_package(Vulkan REQUIRED) +endif() set_property(GLOBAL PROPERTY USE_FOLDERS ON) @@ -25,8 +31,7 @@ file(GLOB FFX_SSSR_SOURCE_FILES file(GLOB FFX_SSSR_SHADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/shaders/*.hlsl) -if(FFX_SSSR_NO_D3D12) -else() +if(FFX_SSSR_D3D12) file(GLOB FFX_SSSR_HEADER_FILES_D3D12 ${CMAKE_CURRENT_SOURCE_DIR}/inc/ffx_sssr_d3d12.h ${CMAKE_CURRENT_SOURCE_DIR}/src/d3d12/*.h) @@ -36,8 +41,7 @@ else() ${CMAKE_CURRENT_SOURCE_DIR}/src/d3d12/*.cpp) endif() -if(FFX_SSSR_NO_VK) -else() +if(FFX_SSSR_VK) file(GLOB FFX_SSSR_HEADER_FILES_VK ${CMAKE_CURRENT_SOURCE_DIR}/inc/ffx_sssr_vk.h ${CMAKE_CURRENT_SOURCE_DIR}/src/vk/*.h) @@ -54,7 +58,6 @@ foreach(shaderfile classify_tiles intersect prepare_indirect_args resolve_eaw - resolve_eaw_stride resolve_spatial resolve_temporal) @@ -87,12 +90,12 @@ target_include_directories(FFX_SSSR PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/external target_include_directories(FFX_SSSR PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/externals/dxc) target_include_directories(FFX_SSSR PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/externals/samplerCPP) -if(FFX_SSSR_NO_D3D12) - target_compile_definitions(FFX_SSSR PRIVATE FFX_SSSR_NO_D3D12) +if(FFX_SSSR_D3D12) + target_compile_definitions(FFX_SSSR PRIVATE FFX_SSSR_D3D12) endif() -if(FFX_SSSR_NO_VK) - target_compile_definitions(FFX_SSSR PRIVATE FFX_SSSR_NO_VK) +if(FFX_SSSR_VK) + target_compile_definitions(FFX_SSSR PRIVATE FFX_SSSR_VK) endif() target_sources(FFX_SSSR PRIVATE @@ -124,3 +127,8 @@ if(MSVC) VS_TOOL_OVERRIDE "None") endif() + + +if(FFX_SSSR_VK) + target_link_libraries (FFX_SSSR Vulkan::Vulkan) +endif() diff --git a/ffx-sssr/README.md b/ffx-sssr/README.md index 319fb3d..3e9bf7e 100644 --- a/ffx-sssr/README.md +++ b/ffx-sssr/README.md @@ -3,7 +3,17 @@ The **FidelityFX SSSR** library provides the means to render stochastic screen space reflections for the use in real-time applications. A full sample running the library can be found on the [FidelityFX SSSR GitHub page](https://github.com/GPUOpen-Effects/FidelityFX-SSSR.git). -The library supports D3D12 with SM 6.0 or higher. +The library supports D3D12 and Vulkan. + +## Prerequisits + +The library relies on [dxcompiler.dll](https://github.com/microsoft/DirectXShaderCompiler) to generate DXIL/SPIRV from HLSL at runtime. +Use the version built for SPIRV from the [DirectXShaderCompiler GitHub repository](https://github.com/microsoft/DirectXShaderCompiler) or the one that comes with the [Vulkan SDK 1.2.141.2 (or later)](https://www.lunarg.com/vulkan-sdk/) if you are planning to use the Vulkan version of **FidelityFX SSSR**. + +## Device Creation + +Vulkan version only: The library relies on [VK_EXT_subgroup_size_control](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VK_EXT_subgroup_size_control.html) for optimal performance on RDNA. Make sure the extension is enabled at device creation by adding `VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME` to `ppEnabledExtensionNames` if it is available. 
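+A minimal sketch of this device creation setup, covering both the extension name and the feature enable described in the next sentence (the `subgroupSizeControlSupported` flag and the remaining create info fields are placeholders the application already manages):
+
+```C++
+std::vector<const char*> enabledExtensions; // plus whatever else the application enables
+
+VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroupSizeControlFeatures = {};
+subgroupSizeControlFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
+subgroupSizeControlFeatures.subgroupSizeControl = VK_TRUE;
+
+VkDeviceCreateInfo deviceCreateInfo = {};
+deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+if (subgroupSizeControlSupported) // e.g. checked via vkEnumerateDeviceExtensionProperties
+{
+    enabledExtensions.push_back(VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME);
+    deviceCreateInfo.pNext = &subgroupSizeControlFeatures;
+}
+deviceCreateInfo.enabledExtensionCount = static_cast<uint32_t>(enabledExtensions.size());
+deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
+// ... fill in queues, other features etc. and call vkCreateDevice as usual
+```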
+Also enable `subgroupSizeControl` in `VkPhysicalDeviceSubgroupSizeControlFeaturesEXT` and chain it into the `pNext` chain of `VkDeviceCreateInfo` if the extension name is available. It is fine to run **FidelityFX SSSR** if the extension is not supported. ## Context - Initialization and Shutdown @@ -14,13 +24,25 @@ First the header files must be included. This is `ffx_sssr.h` for Graphics API i #include "ffx_sssr_d3d12.h" ``` -Then a context must be created. This usually is done only once per device. +or `ffx_sssr_vk.h` for Vulkan specific definitions: + +```C++ +#include "ffx_sssr.h" +#include "ffx_sssr_vk.h" +``` + +Then a context must be created. This usually is done only once per device. Depending on the preferred API, populate either `FfxSssrD3D12CreateContextInfo` or `FfxSssrVkCreateContextInfo`. ```C++ FfxSssrD3D12CreateContextInfo d3d12ContextInfo = {}; d3d12ContextInfo.pDevice = myDevice; d3d12ContextInfo.pUploadCommandList = myCommandList; +FfxSssrVkCreateContextInfo vkContextInfo = {}; +vkContextInfo.device = myDeviceHandle; +vkContextInfo.physicalDevice = myPhysicalDeviceHandle; +vkContextInfo.uploadCommandBuffer = myUploadCommandBufferHandle; + FfxSssrLoggingCallbacks loggingCallbacks = {}; loggingCallbacks.pUserData = myUserData; loggingCallbacks.pfnLogging = myLoggingFunction; @@ -32,10 +54,11 @@ contextInfo.frameCountBeforeMemoryReuse = myMaxFrameCountInFlight; contextInfo.uploadBufferSize = 8 * 1024 * 1024; contextInfo.pLoggingCallbacks = &loggingCallbacks; contextInfo.pD3D12CreateContextInfo = &d3d12ContextInfo; +contextInfo.pVkCreateContextInfo = &vkContextInfo; ``` The library requires certain input textures from the application to create a reflection view. -Thus, the context requires user specified unpack functions (HLSL) to access the individual attributes. It is recommended to keep these snippets as small as possible to guarantee good performance. +Thus, the context requires user specified unpack functions (HLSL SM 6.0) to access the individual attributes. It is recommended to keep these snippets as small as possible to achieve good performance. The function headers have to match in order for the shaders to compile. The `FFX_SSSR_*_TEXTURE_FORMAT` macros hold the definitions provided in the `p*TextureFormat` members of `FfxSssrCreateContextInfo`. The snippets provided below shall serve as a starting point: ```C++ @@ -61,7 +84,7 @@ if (status != FFX_SSSR_STATUS_OK) { } ``` -Finally, submit the command list provided to the `pUploadCommandList` member of `FfxSssrCreateContextInfoD3D12` to the queue of your choice to upload the internal resources to the GPU. +Finally, submit the command list provided to the `pUploadCommandList` member of `FfxSssrCreateContextInfoD3D12` to the queue of your choice to upload the internal resources to the GPU. The same is required on Vulkan for the `uploadCommandBuffer` member of `FfxSssrVkCreateContextInfo`. Once there is no need to render reflections anymore the context should be destroyed to free internal resources: @@ -76,7 +99,7 @@ if (status != FFX_SSSR_STATUS_OK) { Reflection views represent the abstraction for the first bounce of indirect light from reflective surfaces as seen from a given camera. -`FfxSssrReflectionView` resources can be created as such: +`FfxSssrReflectionView` resources can be created as such. 
Depending on the API fill either `FfxSssrD3D12CreateReflectionViewInfo` or `FfxSssrVkCreateReflectionViewInfo`: ```C++ FfxSssrD3D12CreateReflectionViewInfo d3d12ReflectionViewInfo = {}; @@ -92,11 +115,26 @@ d3d12ReflectionViewInfo.sceneSRV; d3d12ReflectionViewInfo.environmentMapSRV; d3d12ReflectionViewInfo.pEnvironmentMapSamplerDesc; +FfxSssrVkCreateReflectionViewInfo vkReflectionViewInfo = {}; +vkReflectionViewInfo.depthBufferHierarchySRV; +vkReflectionViewInfo.motionBufferSRV; +vkReflectionViewInfo.normalBufferSRV; +vkReflectionViewInfo.roughnessBufferSRV; +vkReflectionViewInfo.normalHistoryBufferSRV; +vkReflectionViewInfo.roughnessHistoryBufferSRV; +vkReflectionViewInfo.reflectionViewUAV; +vkReflectionViewInfo.sceneFormat; +vkReflectionViewInfo.sceneSRV; +vkReflectionViewInfo.environmentMapSRV; +vkReflectionViewInfo.environmentMapSampler; +vkReflectionViewInfo.uploadCommandBuffer; + FfxSssrCreateReflectionViewInfo reflectionViewInfo = {}; reflectionViewInfo.flags = FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS | FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_PING_PONG_NORMAL_BUFFERS | FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_PING_PONG_ROUGHNESS_BUFFERS; reflectionViewInfo.outputWidth = width; reflectionViewInfo.outputHeight = height; reflectionViewInfo.pD3D12CreateReflectionViewInfo = &d3d12ReflectionViewInfo; +reflectionViewInfo.pVkCreateReflectionViewInfo = &vkReflectionViewInfo; FfxSssrReflectionView myReflectionView; FfxSssrStatus status = ffxSssrCreateReflectionView(myContext, &reflectionViewInfo, &myReflectionView); @@ -105,7 +143,7 @@ if (status != FFX_SSSR_STATUS_OK) { } ``` -All SRVs and UAVs must be allocated from a CPU accessible descriptor heap as they are copied into the descriptor tables of the library. `FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS` can be used if the application intends to query for timings later. The `FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_PING_PONG_*` flags should be set if the normal or roughness surfaces are written in an alternating fashion. Don't set the flags if the surfaces are copied each frame. +On D3D12 all SRVs and UAVs must be allocated from a CPU accessible descriptor heap as they are copied into the descriptor tables of the library. `FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS` can be used if the application intends to query for timings later. The `FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_PING_PONG_*` flags should be set if the normal or roughness surfaces are written in an alternating fashion. Don't set the flags if the surfaces are copied each frame. The reflection view depends on the screen size. It is recommended to destroy the reflection view on resize and create a new one: @@ -127,12 +165,15 @@ if (status != FFX_SSSR_STATUS_OK) { ## Reflection View - Resolve -Calling `ffxSssrEncodeResolveReflectionView` dispatches the actual shaders that perform the hierarchical tracing through the depth buffer and optionally also dispatches the denoising passes if the `FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_DENOISE` flag is set: +Calling `ffxSssrEncodeResolveReflectionView` dispatches the actual shaders that perform the hierarchical tracing through the depth buffer and optionally also dispatches the denoising passes if the `FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_DENOISE` flag is set. 
Depending on the API populate either `FfxSssrD3D12CommandEncodeInfo` or `FfxSssrVkCommandEncodeInfo`: ```C++ FfxSssrD3D12CommandEncodeInfo d3d12EncodeInfo = {}; d3d12EncodeInfo.pCommandList = myCommandList; +FfxSssrVkCommandEncodeInfo vkEncodeInfo = {}; +vkEncodeInfo.commandBuffer = myCommandBufferHandle; + FfxSssrResolveReflectionViewInfo resolveInfo = {}; resolveInfo.flags = FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_DENOISE | FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_ENABLE_VARIANCE_GUIDED_TRACING; resolveInfo.temporalStabilityScale = 0.99f; @@ -141,9 +182,9 @@ resolveInfo.mostDetailedDepthHierarchyMipLevel = 1; resolveInfo.depthBufferThickness = 0.015f; resolveInfo.minTraversalOccupancy = 4; resolveInfo.samplesPerQuad = FFX_SSSR_RAY_SAMPLES_PER_QUAD_1; -resolveInfo.eawPassCount = FFX_SSSR_EAW_PASS_COUNT_1; resolveInfo.roughnessThreshold = 0.2f; resolveInfo.pD3D12CommandEncodeInfo = &d3d12EncodeInfo; +resolveInfo.pVkCommandEncodeInfo = &vkEncodeInfo; FfxSssrStatus status = ffxSssrEncodeResolveReflectionView(myContext, myReflectionView, &resolveInfo); if (status != FFX_SSSR_STATUS_OK) { // Error handling @@ -156,8 +197,7 @@ if (status != FFX_SSSR_STATUS_OK) { * `resolveInfo.mostDetailedDepthHierarchyMipLevel` limits the most detailed mipmap for depth buffer lookups when tracing non-mirror reflection rays. * `resolveInfo.depthBufferThickness` configures the accepted hit distance behind the depth buffer in view space. * `resolveInfo.minTraversalOccupancy` limits the number of threads in the depth traversal loop. If less than that number of threads remain present they exit the intersection loop early even if they did not find a depth buffer intersection yet. This only affects non-mirror reflection rays. -* `resolveInfo.samplesPerQuad` serves as a starting point how many rays are spawned in glossy regions. The only supported values are `FFX_SSSR_RAY_SAMPLES_PER_QUAD_1`, `FFX_SSSR_RAY_SAMPLES_PER_QUAD_2` and `FFX_SSSR_RAY_SAMPLES_PER_QUAD_4`. The use of `FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_ENABLE_VARIANCE_GUIDED_TRACING` dynamically bumps this up to `4` to enforce convergence on a per pixel basis. -* `resolveInfo.eawPassCount` configures the number of Edge-aware á-trous wavelet passes. The only supported values are `FFX_SSSR_EAW_PASS_COUNT_1` and `FFX_SSSR_EAW_PASS_COUNT_3`. +* `resolveInfo.samplesPerQuad` serves as a starting point how many rays are spawned in glossy regions. The only supported values are `FFX_SSSR_RAY_SAMPLES_PER_QUAD_1`, `FFX_SSSR_RAY_SAMPLES_PER_QUAD_2` and `FFX_SSSR_RAY_SAMPLES_PER_QUAD_4`. The use of `FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_ENABLE_VARIANCE_GUIDED_TRACING` dynamically bumps this up to a maximum of `4` to enforce convergence on a per pixel basis. * `resolveInfo.roughnessThreshold` determines the roughness value below which reflection rays are spawned. Any roughness values higher are considered not reflective and the reflection view will contain `(0, 0, 0, 0)`. When resolving a reflection view, the following operations take place: @@ -167,15 +207,7 @@ When resolving a reflection view, the following operations take place: - The resulting radiance information is denoised using spatio-temporal filtering. - The shading values are written out to the output buffer supplied at creation time. 
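+The application itself has to make the writes to the output buffer visible before consuming it (see the note that follows). On Vulkan this could be a sketch along these lines, where `myOutputImage`, `myCommandBuffer` and the destination stage are hypothetical and depend on how the application consumes the reflections:
+
+```C++
+VkImageMemoryBarrier barrier = {};
+barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;  // storage image writes by the resolve passes
+barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;   // e.g. read when applying the reflections
+barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL;
+barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
+barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+barrier.image = myOutputImage; // image backing the reflectionViewUAV
+barrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 };
+
+vkCmdPipelineBarrier(myCommandBuffer,
+    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+    0, 0, nullptr, 0, nullptr, 1, &barrier);
+```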
-Note that the application is responsible for issuing the UAV barrier to synchronize the writes to the output buffer: - -``` -D3D12_RESOURCE_BARRIER resourceBarrier = {}; -resourceBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV; -resourceBarrier.UAV.pResource = myOutputBuffer; - -myCommandList->ResourceBarrier(1, &resourceBarrier); -``` +Note that the application is responsible for issuing the required barrier to synchronize the writes to the output buffer. ## Reflection View - Performance Profiling @@ -219,12 +251,7 @@ if (status != FFX_SSSR_STATUS_OK) { } ``` -The retrieved times are expressed in numbers of GPU ticks and can be converted to seconds by querying the timestamp frequency of the queue used to execute the encoded command list: - -```C++ -uint64_t gpuTicksPerSecond; -myCommandQueue->GetTimestampFrequency(&gpuTicksPerSecond); -``` +The retrieved times are expressed in GPU ticks and can be converted using the timestamp frequency of the queue used to execute the encoded command list on D3D12 (`GetTimestampFrequency`). On Vulkan the `timestampPeriod` member of `VkPhysicalDeviceLimits` can be used to convert the times from GPU ticks to nanoseconds. ## Frame management diff --git a/ffx-sssr/inc/ffx_sssr.h b/ffx-sssr/inc/ffx_sssr.h index 5449c0a..1d3d572 100644 --- a/ffx-sssr/inc/ffx_sssr.h +++ b/ffx-sssr/inc/ffx_sssr.h @@ -25,7 +25,7 @@ THE SOFTWARE. #define FFX_SSSR_MAKE_VERSION(a,b,c) (((a) << 22) | ((b) << 12) | (c)) -#define FFX_SSSR_API_VERSION FFX_SSSR_MAKE_VERSION(1, 0, 0) +#define FFX_SSSR_API_VERSION FFX_SSSR_MAKE_VERSION(1, 1, 0) #define FFX_SSSR_STATIC_LIBRARY @@ -60,6 +60,9 @@ FFX_SSSR_DEFINE_HANDLE(FfxSssrReflectionView) typedef struct FfxSssrD3D12CreateContextInfo FfxSssrD3D12CreateContextInfo; typedef struct FfxSssrD3D12CreateReflectionViewInfo FfxSssrD3D12CreateReflectionViewInfo; typedef struct FfxSssrD3D12CommandEncodeInfo FfxSssrD3D12CommandEncodeInfo; +typedef struct FfxSssrVkCreateContextInfo FfxSssrVkCreateContextInfo; +typedef struct FfxSssrVkCreateReflectionViewInfo FfxSssrVkCreateReflectionViewInfo; +typedef struct FfxSssrVkCommandEncodeInfo FfxSssrVkCommandEncodeInfo; /** The return codes for the API functions. @@ -85,15 +88,6 @@ enum FfxSssrRaySamplesPerQuad FFX_SSSR_RAY_SAMPLES_PER_QUAD_4 }; -/** - The number of passes for Edge-aware �-trous wavelet filtering. -*/ -enum FfxSssrEawPassCount -{ - FFX_SSSR_EAW_PASS_COUNT_1, - FFX_SSSR_EAW_PASS_COUNT_3 -}; - /** The available flags for creating a reflection view. */ @@ -154,6 +148,7 @@ typedef struct FfxSssrCreateContextInfo union { const FfxSssrD3D12CreateContextInfo* pD3D12CreateContextInfo; + const FfxSssrVkCreateContextInfo* pVkCreateContextInfo; }; } FfxSssrCreateContextInfo; @@ -168,6 +163,7 @@ typedef struct FfxSssrCreateReflectionViewInfo union { const FfxSssrD3D12CreateReflectionViewInfo* pD3D12CreateReflectionViewInfo; + const FfxSssrVkCreateReflectionViewInfo* pVkCreateReflectionViewInfo; }; } FfxSssrCreateReflectionViewInfo; @@ -183,11 +179,11 @@ typedef struct FfxSssrResolveReflectionViewInfo uint32_t minTraversalOccupancy; ///< Minimum number of threads per wave to keep the intersection kernel running. float depthBufferThickness; ///< Unit in view space. Any intersections further behind the depth buffer are rejected as invalid hits. FfxSssrRaySamplesPerQuad samplesPerQuad; ///< Number of samples per 4 pixels in denoised regions. Mirror reflections are not affected by this. - FfxSssrEawPassCount eawPassCount; ///< Number of EAW passes. 
float roughnessThreshold; ///< Shoot reflection rays for roughness values that are lower than this threshold. union { const FfxSssrD3D12CommandEncodeInfo* pD3D12CommandEncodeInfo; ///< A pointer to the Direct3D12 command encoding parameters. + const FfxSssrVkCommandEncodeInfo* pVkCommandEncodeInfo; ///< A pointer to the Vulkan command encoding parameters. }; } FfxSssrResolveReflectionViewInfo; diff --git a/ffx-sssr/inc/ffx_sssr_d3d12.h b/ffx-sssr/inc/ffx_sssr_d3d12.h index 7ac8366..a3a938b 100644 --- a/ffx-sssr/inc/ffx_sssr_d3d12.h +++ b/ffx-sssr/inc/ffx_sssr_d3d12.h @@ -47,7 +47,7 @@ typedef struct FfxSssrD3D12CreateReflectionViewInfo D3D12_CPU_DESCRIPTOR_HANDLE normalHistoryBufferSRV; ///< Last frames normalBufferSRV. The descriptor handle must be allocated on a heap allowing CPU reads. D3D12_CPU_DESCRIPTOR_HANDLE roughnessHistoryBufferSRV; ///< Last frames roughnessHistoryBufferSRV. The descriptor handle must be allocated on a heap allowing CPU reads. D3D12_CPU_DESCRIPTOR_HANDLE environmentMapSRV; ///< Environment cube map serving as a fallback for ray misses. The descriptor handle must be allocated on a heap allowing CPU reads. - const D3D12_STATIC_SAMPLER_DESC * pEnvironmentMapSamplerDesc; ///< Description for the environment map sampler. + const D3D12_SAMPLER_DESC * pEnvironmentMapSamplerDesc; ///< Description for the environment map sampler. D3D12_CPU_DESCRIPTOR_HANDLE reflectionViewUAV; ///< The fully resolved reflection view. Make sure to synchronize for UAV writes. The descriptor handle must be allocated on a heap allowing CPU reads. } FfxSssrD3D12CreateReflectionViewInfo; diff --git a/ffx-sssr/inc/ffx_sssr_vk.h b/ffx-sssr/inc/ffx_sssr_vk.h new file mode 100644 index 0000000..e0ba2ed --- /dev/null +++ b/ffx-sssr/inc/ffx_sssr_vk.h @@ -0,0 +1,61 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#pragma once + +#include + +/** + The parameters for creating a Vulkan context. +*/ +typedef struct FfxSssrVkCreateContextInfo +{ + VkDevice device; + VkPhysicalDevice physicalDevice; + VkCommandBuffer uploadCommandBuffer; ///< Vulkan command buffer to upload static resources. The application has to begin the command buffer and has to handle synchronization to make sure the uploads are done. +} FfxSssrVkCreateContextInfo; + +/** + The parameters for creating a Vulkan reflection view. 
+*/ +typedef struct FfxSssrVkCreateReflectionViewInfo +{ + VkFormat sceneFormat; ///< The format of the sceneSRV to allow creating matching internal resources. + VkImageView sceneSRV; ///< The rendered scene without reflections. + VkImageView depthBufferHierarchySRV; ///< Full downsampled depth buffer. Each lower detail mip containing the minimum values of the higher detailed mip. + VkImageView motionBufferSRV; ///< The per pixel motion vectors. + VkImageView normalBufferSRV; ///< The surface normals in world space. Each channel mapped to [0, 1]. + VkImageView roughnessBufferSRV; ///< Perceptual roughness squared per pixel. + VkImageView normalHistoryBufferSRV; ///< Last frames normalBufferSRV. + VkImageView roughnessHistoryBufferSRV; ///< Last frames roughnessHistoryBufferSRV. + VkSampler environmentMapSampler; ///< Environment map sampler used when looking up the fallback for ray misses. + VkImageView environmentMapSRV; ///< Environment map serving as a fallback for ray misses. + VkImageView reflectionViewUAV; ///< The fully resolved reflection view. Make sure to synchronize for UAV writes. + VkCommandBuffer uploadCommandBuffer; ///< Vulkan command buffer to upload static resources. The application has to begin the command buffer and has to handle synchronization to make sure the uploads are done. +} FfxSssrVkCreateReflectionViewInfo; + +/** + \brief The parameters for encoding Vulkan device commands. +*/ +typedef struct FfxSssrVkCommandEncodeInfo +{ + VkCommandBuffer commandBuffer; ///< The Vulkan command buffer to be used for command encoding. +} FfxSssrVkCommandEncodeInfo; diff --git a/ffx-sssr/shaders/classify_tiles.hlsl b/ffx-sssr/shaders/classify_tiles.hlsl index 24239ed..7202541 100644 --- a/ffx-sssr/shaders/classify_tiles.hlsl +++ b/ffx-sssr/shaders/classify_tiles.hlsl @@ -23,17 +23,19 @@ THE SOFTWARE. #ifndef FFX_SSSR_CLASSIFY_TILES #define FFX_SSSR_CLASSIFY_TILES -Texture2D g_roughness : register(t0); - -RWBuffer g_tile_list : register(u0); -RWBuffer g_ray_list : register(u1); -globallycoherent RWBuffer g_tile_counter : register(u2); -globallycoherent RWBuffer g_ray_counter : register(u3); -RWTexture2D g_temporally_denoised_reflections : register(u4); -RWTexture2D g_temporally_denoised_reflections_history : register(u5); -RWTexture2D g_ray_lengths : register(u6); -RWTexture2D g_temporal_variance : register(u7); -RWTexture2D g_denoised_reflections : register(u8); +// In: +[[vk::binding(0, 1)]] Texture2D g_roughness : register(t0); + +// Out: +[[vk::binding(1, 1)]] RWBuffer g_tile_list : register(u0); +[[vk::binding(2, 1)]] RWBuffer g_ray_list : register(u1); +[[vk::binding(3, 1)]] globallycoherent RWBuffer g_tile_counter : register(u2); +[[vk::binding(4, 1)]] globallycoherent RWBuffer g_ray_counter : register(u3); +[[vk::binding(5, 1)]] RWTexture2D g_temporally_denoised_reflections : register(u4); +[[vk::binding(6, 1)]] RWTexture2D g_temporally_denoised_reflections_history : register(u5); +[[vk::binding(7, 1)]] RWTexture2D g_ray_lengths : register(u6); +[[vk::binding(8, 1)]] RWTexture2D g_temporal_variance : register(u7); +[[vk::binding(9, 1)]] RWTexture2D g_denoised_reflections : register(u8); groupshared uint g_ray_count; groupshared uint g_ray_base_index; @@ -51,9 +53,9 @@ void main(uint2 did : SV_DispatchThreadID, uint group_index : SV_GroupIndex) // Disable offscreen pixels bool needs_ray = !(did.x >= screen_size.x || did.y >= screen_size.y); - + // Dont shoot a ray on very rough surfaces. 
- float roughness = FfxSssrUnpackRoughness(g_roughness.Load(int3(did, 0))); + float roughness = FfxSssrUnpackRoughness(g_roughness.Load(int3(did, 0))); needs_ray = needs_ray && IsGlossy(roughness); // Also we dont need to run the denoiser on mirror reflections. @@ -107,7 +109,7 @@ void main(uint2 did : SV_DispatchThreadID, uint group_index : SV_GroupIndex) uint tile_index; uint ray_base_index = 0; - InterlockedAdd(g_tile_counter[0], denoise_count, tile_index); + InterlockedAdd(g_tile_counter[0], denoise_count, tile_index); InterlockedAdd(g_ray_counter[0], ray_count, ray_base_index); int cleaned_index = must_denoise ? tile_index : -1; diff --git a/ffx-sssr/shaders/common.hlsl b/ffx-sssr/shaders/common.hlsl index 07ecd77..c882630 100644 --- a/ffx-sssr/shaders/common.hlsl +++ b/ffx-sssr/shaders/common.hlsl @@ -48,7 +48,7 @@ FFX_SSSR_DEPTH_UNPACK_FUNCTION FFX_SSSR_SCENE_RADIANCE_UNPACK_FUNCTION // Common constants -cbuffer Constants : register(b0) +[[vk::binding(0, 0)]] cbuffer Constants : register(b0) { float4x4 g_inv_view_proj; float4x4 g_proj; diff --git a/ffx-sssr/shaders/intersect.hlsl b/ffx-sssr/shaders/intersect.hlsl index 19d2d82..92a1a5a 100644 --- a/ffx-sssr/shaders/intersect.hlsl +++ b/ffx-sssr/shaders/intersect.hlsl @@ -23,22 +23,25 @@ THE SOFTWARE. #ifndef FFX_SSSR_INTERSECT #define FFX_SSSR_INTERSECT -Texture2D g_lit_scene : register(t0); // Scene rendered with lighting and shadows -Texture2D g_depth_buffer_hierarchy : register(t1); -Texture2D g_normal : register(t2); -Texture2D g_roughness : register(t3); -TextureCube g_environment_map : register(t4); -Buffer g_sobol_buffer : register(t5); -Buffer g_ranking_tile_buffer : register(t6); -Buffer g_scrambling_tile_buffer : register(t7); - -SamplerState g_linear_sampler : register(s0); -SamplerState g_environment_map_sampler : register(s1); - -RWTexture2D g_intersection_result : register(u0); // Reflection colors at the end of the intersect pass. -RWTexture2D g_ray_lengths : register(u1); -RWTexture2D g_denoised_reflections : register(u2); // Mirror reflections don't need to be denoised, the intersection pass can just write them to the final target. -RWBuffer g_ray_list : register(u3); +// In: +[[vk::binding(0, 1)]] Texture2D g_lit_scene : register(t0); // scene rendered with lighting and shadows +[[vk::binding(1, 1)]] Texture2D g_depth_buffer_hierarchy : register(t1); +[[vk::binding(2, 1)]] Texture2D g_normal : register(t2); +[[vk::binding(3, 1)]] Texture2D g_roughness : register(t3); +[[vk::binding(4, 1)]] TextureCube g_environment_map : register(t4); +[[vk::binding(5, 1)]] Buffer g_sobol_buffer : register(t5); +[[vk::binding(6, 1)]] Buffer g_ranking_tile_buffer : register(t6); +[[vk::binding(7, 1)]] Buffer g_scrambling_tile_buffer : register(t7); +[[vk::binding(8, 1)]] Buffer g_ray_list : register(t8); + +// Samplers: +[[vk::binding(9, 1)]] SamplerState g_linear_sampler : register(s0); +[[vk::binding(10, 1)]] SamplerState g_environment_map_sampler : register(s1); + +// Out: +[[vk::binding(11, 1)]] RWTexture2D g_intersection_result : register(u0); // reflection colors at the end of the intersect pass. +[[vk::binding(12, 1)]] RWTexture2D g_ray_lengths : register(u1); +[[vk::binding(13, 1)]] RWTexture2D g_denoised_reflections : register(u2); // Mirror reflections don't need to be denoised, the intersection pass can just write them to the final target. // Blue Noise Sampler by Eric Heitz. Returns a value in the range [0, 1]. 
float SampleRandomNumber(in uint pixel_i, in uint pixel_j, in uint sample_index, in uint sample_dimension) diff --git a/ffx-sssr/shaders/prepare_indirect_args.hlsl b/ffx-sssr/shaders/prepare_indirect_args.hlsl index f1f1b2a..e6afa0e 100644 --- a/ffx-sssr/shaders/prepare_indirect_args.hlsl +++ b/ffx-sssr/shaders/prepare_indirect_args.hlsl @@ -23,10 +23,13 @@ THE SOFTWARE. #ifndef FFX_SSSR_INDIRECT_ARGS #define FFX_SSSR_INDIRECT_ARGS -RWBuffer g_tile_counter : register(u0); -RWBuffer g_ray_counter : register(u1); -RWBuffer g_intersect_args : register(u2); -RWBuffer g_denoiser_args : register(u3); +// In/Out: +[[vk::binding(0, 1)]] RWBuffer g_tile_counter : register(u0); +[[vk::binding(1, 1)]] RWBuffer g_ray_counter : register(u1); + +// Out: +[[vk::binding(2, 1)]] RWBuffer g_intersect_args : register(u2); +[[vk::binding(3, 1)]] RWBuffer g_denoiser_args : register(u3); [numthreads(1, 1, 1)] void main() diff --git a/ffx-sssr/shaders/resolve_eaw.hlsl b/ffx-sssr/shaders/resolve_eaw.hlsl index 4f45d84..20a4e44 100644 --- a/ffx-sssr/shaders/resolve_eaw.hlsl +++ b/ffx-sssr/shaders/resolve_eaw.hlsl @@ -23,15 +23,15 @@ THE SOFTWARE. #ifndef FFX_SSSR_EAW_RESOLVE #define FFX_SSSR_EAW_RESOLVE -Texture2D g_normal : register(t0); -Texture2D g_roughness : register(t1); -Texture2D g_depth_buffer : register(t2); - -SamplerState g_linear_sampler : register(s0); - -RWTexture2D g_temporally_denoised_reflections : register(u0); -RWTexture2D g_denoised_reflections : register(u1); // Will hold the reflection colors at the end of the resolve pass. -RWBuffer g_tile_list : register(u2); +// In: +[[vk::binding(0, 1)]] Texture2D g_normal : register(t0); +[[vk::binding(1, 1)]] Texture2D g_roughness : register(t1); +[[vk::binding(2, 1)]] Texture2D g_depth_buffer : register(t2); +[[vk::binding(3, 1)]] Buffer g_tile_list : register(t3); + +// Out: +[[vk::binding(4, 1)]] RWTexture2D g_temporally_denoised_reflections : register(u0); +[[vk::binding(5, 1)]] RWTexture2D g_denoised_reflections : register(u1); // will hold the reflection colors at the end of the resolve pass. groupshared uint g_shared_0[12][12]; groupshared uint g_shared_1[12][12]; diff --git a/ffx-sssr/shaders/resolve_eaw_stride.hlsl b/ffx-sssr/shaders/resolve_eaw_stride.hlsl deleted file mode 100644 index 75fa1b5..0000000 --- a/ffx-sssr/shaders/resolve_eaw_stride.hlsl +++ /dev/null @@ -1,100 +0,0 @@ -/********************************************************************** -Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-********************************************************************/ - -#ifndef FFX_SSSR_EAW_RESOLVE -#define FFX_SSSR_EAW_RESOLVE - -Texture2D g_normal : register(t0); -Texture2D g_roughness : register(t1); -Texture2D g_depth_buffer : register(t2); - -SamplerState g_linear_sampler : register(s0); - -RWTexture2D g_temporally_denoised_reflections : register(u0); -RWTexture2D g_denoised_reflections : register(u1); // Will hold the reflection colors at the end of the resolve pass. -RWBuffer g_tile_list : register(u2); - -min16float3 LoadRadiance(int2 idx) -{ - return g_temporally_denoised_reflections.Load(int3(idx, 0)).xyz; -} - -min16float LoadRoughnessValue(int2 idx) -{ - return FfxSssrUnpackRoughness(g_roughness.Load(int3(idx, 0))); -} - -min16float GetRoughnessRadiusWeight(min16float roughness_p, min16float roughness_q, min16float dist) -{ - return 1.0 - smoothstep(10 * roughness_p, 500 * roughness_p, dist); -} - -min16float4 ResolveScreenspaceReflections(int2 did, min16float center_roughness) -{ - const min16float roughness_sigma_min = 0.001; - const min16float roughness_sigma_max = 0.01; - - min16float3 sum = 0.0; - min16float total_weight = 0.0; - - const int radius = 2; - for (int dy = -radius; dy <= radius; ++dy) - { - for (int dx = -radius; dx <= radius; ++dx) - { - int2 texel_coords = did + FFX_SSSR_EAW_STRIDE * int2(dx, dy); - - min16float3 radiance = LoadRadiance(texel_coords); - min16float roughness = LoadRoughnessValue(texel_coords); - - min16float weight = GetEdgeStoppingRoughnessWeightFP16(center_roughness, roughness, roughness_sigma_min, roughness_sigma_max) - * GetRoughnessRadiusWeight(center_roughness, roughness, length(texel_coords - did)); - sum += weight * radiance; - total_weight += weight; - } - } - - sum /= max(total_weight, 0.0001); - return min16float4(sum, 1); -} - -void Resolve(int2 did) -{ - min16float3 center_radiance = LoadRadiance(did); - min16float center_roughness = LoadRoughnessValue(did); - if (!IsGlossy(center_roughness) || IsMirrorReflection(center_roughness)) - { - return; - } - g_denoised_reflections[did.xy] = ResolveScreenspaceReflections(did, center_roughness); -} - -[numthreads(8, 8, 1)] -void main(uint2 group_thread_id : SV_GroupThreadID, uint group_id : SV_GroupID) -{ - uint packed_base_coords = g_tile_list[group_id]; - uint2 base_coords = Unpack(packed_base_coords); - uint2 coords = base_coords + group_thread_id; - Resolve((int2)coords); -} - -#endif // FFX_SSSR_EAW_RESOLVE \ No newline at end of file diff --git a/ffx-sssr/shaders/resolve_spatial.hlsl b/ffx-sssr/shaders/resolve_spatial.hlsl index 604bdd7..d40f823 100644 --- a/ffx-sssr/shaders/resolve_spatial.hlsl +++ b/ffx-sssr/shaders/resolve_spatial.hlsl @@ -23,18 +23,17 @@ THE SOFTWARE. #ifndef FFX_SSSR_SPATIAL_RESOLVE #define FFX_SSSR_SPATIAL_RESOLVE -Texture2D g_depth_buffer : register(t0); -Texture2D g_normal : register(t1); -Texture2D g_roughness : register(t2); - -SamplerState g_linear_sampler : register(s0); - -RWTexture2D g_spatially_denoised_reflections : register(u0); -RWTexture2D g_ray_lengths : register(u1); -RWTexture2D g_intersection_result : register(u2); // Reflection colors at the end of the intersect pass. 
-RWTexture2D g_has_ray : register(u3); -RWBuffer g_tile_list : register(u4); - +// In: +[[vk::binding(0, 1)]] Texture2D g_depth_buffer : register(t0); +[[vk::binding(1, 1)]] Texture2D g_normal : register(t1); +[[vk::binding(2, 1)]] Texture2D g_roughness : register(t2); +[[vk::binding(3, 1)]] Texture2D g_intersection_result : register(t3); // reflection colors at the end of the intersect pass. +[[vk::binding(4, 1)]] Texture2D g_has_ray : register(t4); +[[vk::binding(5, 1)]] Buffer g_tile_list : register(t5); + +// Out: +[[vk::binding(6, 1)]] RWTexture2D g_spatially_denoised_reflections : register(u0); +[[vk::binding(7, 1)]] RWTexture2D g_ray_lengths : register(u1); // Only really need 16x16 but 17x17 avoids bank conflicts. groupshared uint g_shared_0[17][17]; diff --git a/ffx-sssr/shaders/resolve_temporal.hlsl b/ffx-sssr/shaders/resolve_temporal.hlsl index 3e25b21..9ea6137 100644 --- a/ffx-sssr/shaders/resolve_temporal.hlsl +++ b/ffx-sssr/shaders/resolve_temporal.hlsl @@ -23,21 +23,21 @@ THE SOFTWARE. #ifndef FFX_SSSR_TEMPORAL_RESOLVE #define FFX_SSSR_TEMPORAL_RESOLVE -Texture2D g_normal : register(t0); -Texture2D g_roughness : register(t1); -Texture2D g_normal_history : register(t2); -Texture2D g_roughness_history : register(t3); -Texture2D g_depth_buffer : register(t4); -Texture2D g_motion_vectors : register(t5); - -SamplerState g_linear_sampler : register(s0); - -RWTexture2D g_temporally_denoised_reflections : register(u0); -RWTexture2D g_spatially_denoised_reflections : register(u1); -RWTexture2D g_temporal_variance : register(u2); -RWTexture2D g_temporally_denoised_reflections_history : register(u3); // Reflection colors at the end of the temporal resolve pass of the previous frame. -RWTexture2D g_ray_lengths : register(u4); -RWBuffer g_tile_list : register(u5); +// In: +[[vk::binding(0, 1)]] Texture2D g_normal : register(t0); +[[vk::binding(1, 1)]] Texture2D g_roughness : register(t1); +[[vk::binding(2, 1)]] Texture2D g_normal_history : register(t2); +[[vk::binding(3, 1)]] Texture2D g_roughness_history : register(t3); +[[vk::binding(4, 1)]] Texture2D g_depth_buffer : register(t4); +[[vk::binding(5, 1)]] Texture2D g_motion_vectors : register(t5); +[[vk::binding(6, 1)]] Texture2D g_temporally_denoised_reflections_history : register(t6); // reflection colors at the end of the temporal resolve pass of the previous frame. 
+[[vk::binding(7, 1)]] Texture2D g_ray_lengths : register(t7); +[[vk::binding(8, 1)]] Buffer g_tile_list : register(t8); + +// Out: +[[vk::binding(9, 1)]] RWTexture2D g_temporally_denoised_reflections : register(u0); +[[vk::binding(10, 1)]] RWTexture2D g_spatially_denoised_reflections : register(u1); // Technically still an input, but we have to keep it as UAV +[[vk::binding(11, 1)]] RWTexture2D g_temporal_variance : register(u2); // From "Temporal Reprojection Anti-Aliasing" // https://github.com/playdeadgames/temporal @@ -115,7 +115,7 @@ float3 EstimateStdDeviation(int2 did, RWTexture2D tex) return sqrt(max(color_std, 0.0)); } -float3 SampleRadiance(int2 texel_coords, RWTexture2D tex) +float3 SampleRadiance(int2 texel_coords, Texture2D tex) { return tex.Load(int3(texel_coords, 0)).xyz; } @@ -183,8 +183,8 @@ float4 ResolveScreenspaceReflections(int2 did, float2 uv, uint2 image_size, floa { float3 normal = LoadNormal(did, g_normal); float3 radiance = g_spatially_denoised_reflections.Load(did).xyz; - float3 radiance_history = g_temporally_denoised_reflections_history.Load(did).xyz; - float ray_length = g_ray_lengths.Load(did); + float3 radiance_history = g_temporally_denoised_reflections_history.Load(int3(did, 0)).xyz; + float ray_length = g_ray_lengths.Load(int3(did, 0)); // And clip it to the local neighborhood float2 motion_vector = FfxSssrUnpackMotionVectors(g_motion_vectors.Load(int3(did, 0))); diff --git a/ffx-sssr/src/context.cpp b/ffx-sssr/src/context.cpp index b6ead9e..d0f2c53 100644 --- a/ffx-sssr/src/context.cpp +++ b/ffx-sssr/src/context.cpp @@ -21,10 +21,15 @@ THE SOFTWARE. ********************************************************************/ #include "context.h" -#ifndef FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 #include "ffx_sssr_d3d12.h" #include "d3d12/context_d3d12.h" -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + +#ifdef FFX_SSSR_VK + #include "ffx_sssr_vk.h" + #include "vk/context_vk.h" +#endif // FFX_SSSR_VK namespace ffx_sssr { @@ -44,7 +49,7 @@ namespace ffx_sssr , reflection_view_projection_matrices_(create_context_info.maxReflectionViewCount) { // Create platform-specific context(s) -#ifndef FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 if (create_context_info.pD3D12CreateContextInfo) { if (!create_context_info.pD3D12CreateContextInfo->pDevice) @@ -54,7 +59,19 @@ namespace ffx_sssr context_d3d12_ = std::make_unique(*this, create_context_info); } -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + +#ifdef FFX_SSSR_VK + if (create_context_info.pVkCreateContextInfo) + { + if (!create_context_info.pVkCreateContextInfo->device) + { + throw reflection_error(*this, FFX_SSSR_STATUS_INVALID_VALUE, "device must not be VK_NULL_HANDLE, cannot create Vulkan context"); + } + + context_vk_ = std::make_unique(*this, create_context_info); + } +#endif // FFX_SSSR_VK } /** @@ -85,10 +102,15 @@ namespace ffx_sssr reflection_view_view_matrices_.Erase(ID(object_id)); reflection_view_projection_matrices_.Erase(ID(object_id)); -#ifndef FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 if (context_d3d12_) context_d3d12_->reflection_views_.Erase(ID(object_id)); -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + +#ifdef FFX_SSSR_VK + if (context_vk_) + context_vk_->reflection_views_.Erase(ID(object_id)); +#endif // FFX_SSSR_VK reflection_view_id_dispenser_.FreeId(object_id); } @@ -139,10 +161,15 @@ namespace ffx_sssr */ void Context::CreateReflectionView(std::uint64_t reflection_view_id, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info) { -#ifndef FFX_SSSR_NO_D3D12 
+#ifdef FFX_SSSR_D3D12 if (context_d3d12_ && create_reflection_view_info.pD3D12CreateReflectionViewInfo) context_d3d12_->CreateReflectionView(reflection_view_id, create_reflection_view_info); -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + +#ifdef FFX_SSSR_VK + if (context_vk_ && create_reflection_view_info.pVkCreateReflectionViewInfo) + context_vk_->CreateReflectionView(reflection_view_id, create_reflection_view_info); +#endif // FFX_SSSR_VK } /** @@ -156,9 +183,13 @@ namespace ffx_sssr FFX_SSSR_ASSERT(reflection_view_view_matrices_.At(ID(reflection_view_id))); // not created properly? FFX_SSSR_ASSERT(reflection_view_projection_matrices_.At(ID(reflection_view_id))); -#ifndef FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 context_d3d12_->ResolveReflectionView(reflection_view_id, resolve_reflection_view_info); -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + +#ifdef FFX_SSSR_VK + context_vk_->ResolveReflectionView(reflection_view_id, resolve_reflection_view_info); +#endif // FFX_SSSR_VK } /** @@ -171,10 +202,15 @@ namespace ffx_sssr { FFX_SSSR_ASSERT(IsOfType(reflection_view_id) && IsObjectValid(reflection_view_id)); -#ifndef FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 if (context_d3d12_) context_d3d12_->GetReflectionViewTileClassificationElapsedTime(reflection_view_id, elapsed_time); -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + +#ifdef FFX_SSSR_VK + if (context_vk_) + context_vk_->GetReflectionViewTileClassificationElapsedTime(reflection_view_id, elapsed_time); +#endif // FFX_SSSR_VK } /** @@ -187,10 +223,15 @@ namespace ffx_sssr { FFX_SSSR_ASSERT(IsOfType(reflection_view_id) && IsObjectValid(reflection_view_id)); -#ifndef FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 if (context_d3d12_) context_d3d12_->GetReflectionViewIntersectionElapsedTime(reflection_view_id, elapsed_time); -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + +#ifdef FFX_SSSR_VK + if (context_vk_) + context_vk_->GetReflectionViewIntersectionElapsedTime(reflection_view_id, elapsed_time); +#endif // FFX_SSSR_VK } /** @@ -203,9 +244,14 @@ namespace ffx_sssr { FFX_SSSR_ASSERT(IsOfType(reflection_view_id) && IsObjectValid(reflection_view_id)); -#ifndef FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 if (context_d3d12_) context_d3d12_->GetReflectionViewDenoisingElapsedTime(reflection_view_id, elapsed_time); -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + +#ifdef FFX_SSSR_VK + if (context_vk_) + context_vk_->GetReflectionViewDenoisingElapsedTime(reflection_view_id, elapsed_time); +#endif // FFX_SSSR_VK } } diff --git a/ffx-sssr/src/context.h b/ffx-sssr/src/context.h index d4c5ab8..d5b7454 100644 --- a/ffx-sssr/src/context.h +++ b/ffx-sssr/src/context.h @@ -33,7 +33,8 @@ THE SOFTWARE. namespace ffx_sssr { class ContextD3D12; - + class ContextVK; + /** The Context class encapsulates the data for a single execution context. 
@@ -70,6 +71,9 @@ namespace ffx_sssr inline ContextD3D12* GetContextD3D12(); inline ContextD3D12 const* GetContextD3D12() const; + inline ContextVK* GetContextVK(); + inline ContextVK const* GetContextVK() const; + void CreateReflectionView(std::uint64_t reflection_view_id, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info); void ResolveReflectionView(std::uint64_t reflection_view_id, FfxSssrResolveReflectionViewInfo const& resolve_reflection_view_info); @@ -77,8 +81,8 @@ namespace ffx_sssr inline void SetAPICall(char const* api_call); inline static char const* GetErrorName(FfxSssrStatus error); - inline void Error(FfxSssrStatus error, char const* format, ...); - inline void Error(FfxSssrStatus error, char const* format, va_list args); + inline void Error(FfxSssrStatus error, char const* format, ...) const; + inline void Error(FfxSssrStatus error, char const* format, va_list args) const; inline void AdvanceToNextFrame(); void GetReflectionViewTileClassificationElapsedTime(std::uint64_t reflection_view_id, std::uint64_t& elapsed_time) const; @@ -92,6 +96,7 @@ namespace ffx_sssr protected: friend class ContextD3D12; + friend class ContextVK; static inline ResourceType GetResourceType(std::uint64_t object_id); static inline void SetResourceType(std::uint64_t& object_id, ResourceType resource_type); @@ -109,10 +114,15 @@ namespace ffx_sssr // The API call that is currently being executed. char const* api_call_; -#ifndef FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 // The Direct3D12 context object. std::unique_ptr context_d3d12_; -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + +#ifdef FFX_SSSR_VK + // The Direct3D12 context object. + std::unique_ptr context_vk_; +#endif // FFX_SSSR_VK // The list of reflection view identifiers. IdDispenser reflection_view_id_dispenser_; diff --git a/ffx-sssr/src/context.inl b/ffx-sssr/src/context.inl index 6d75dc7..5150f6f 100644 --- a/ffx-sssr/src/context.inl +++ b/ffx-sssr/src/context.inl @@ -1,3 +1,4 @@ +#include "context.h" /********************************************************************** Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. @@ -148,11 +149,11 @@ namespace ffx_sssr */ ContextD3D12* Context::GetContextD3D12() { -#ifdef FFX_SSSR_NO_D3D12 - return nullptr; -#else // FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 return context_d3d12_.get(); -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + + return nullptr; } /** @@ -162,11 +163,39 @@ namespace ffx_sssr */ ContextD3D12 const* Context::GetContextD3D12() const { -#ifdef FFX_SSSR_NO_D3D12 - return nullptr; -#else // FFX_SSSR_NO_D3D12 +#ifdef FFX_SSSR_D3D12 return context_d3d12_.get(); -#endif // FFX_SSSR_NO_D3D12 +#endif // FFX_SSSR_D3D12 + + return nullptr; + } + + /** + Gets the Vulkan context. + + \return The Vulkan context. + */ + inline ContextVK * Context::GetContextVK() + { +#ifdef FFX_SSSR_VK + return context_vk_.get(); +#endif // FFX_SSSR_VK + + return nullptr; + } + + /** + Gets the Vulkan context. + + \return The Vulkan context. + */ + inline ContextVK const * Context::GetContextVK() const + { +#ifdef FFX_SSSR_VK + return context_vk_.get(); +#endif // FFX_SSSR_VK + + return nullptr; } /** @@ -225,7 +254,7 @@ namespace ffx_sssr \param format The format for the error message. \param ... The content of the error message. */ - void Context::Error(FfxSssrStatus error, char const* format, ...) + void Context::Error(FfxSssrStatus error, char const* format, ...) 
const { va_list args; va_start(args, format); @@ -240,7 +269,7 @@ namespace ffx_sssr \param format The format for the error message. \param args The content of the error message. */ - void Context::Error(FfxSssrStatus error, char const* format, va_list args) + void Context::Error(FfxSssrStatus error, char const* format, va_list args) const { char buffer[2048], message[2048]; diff --git a/ffx-sssr/src/d3d12/context_d3d12.cpp b/ffx-sssr/src/d3d12/context_d3d12.cpp index 144dbaf..1c550f1 100644 --- a/ffx-sssr/src/d3d12/context_d3d12.cpp +++ b/ffx-sssr/src/d3d12/context_d3d12.cpp @@ -34,7 +34,6 @@ THE SOFTWARE. #include "shader_intersect.h" #include "shader_prepare_indirect_args.h" #include "shader_resolve_eaw.h" -#include "shader_resolve_eaw_stride.h" #include "shader_resolve_spatial.h" #include "shader_resolve_temporal.h" @@ -69,6 +68,82 @@ namespace { _1::sobol_256spp_256d, _1::rankingTile, _1::scramblingTile }, { _2::sobol_256spp_256d, _2::rankingTile, _2::scramblingTile }, }; + + /** + Initializes the descriptor range. + + \param range_type The type of the descriptor range. + \param num_descriptors The number of descriptors in the range. + \param base_shader_register The base descriptor for the range in shader code. + \return The resulting descriptor range. + */ + inline D3D12_DESCRIPTOR_RANGE InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE range_type, std::uint32_t num_descriptors, std::uint32_t base_shader_register) + { + D3D12_DESCRIPTOR_RANGE descriptor_range = {}; + descriptor_range.RangeType = range_type; + descriptor_range.NumDescriptors = num_descriptors; + descriptor_range.BaseShaderRegister = base_shader_register; + descriptor_range.OffsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND; + return descriptor_range; + } + + /** + Initializes the root parameter as descriptor table. + + \param num_descriptor_ranges The number of descriptor ranges for this parameter. + \param descriptor_ranges The array of descriptor ranges for this parameter. + \return The resulting root parameter. + */ + inline D3D12_ROOT_PARAMETER InitAsDescriptorTable(std::uint32_t num_descriptor_ranges, D3D12_DESCRIPTOR_RANGE const* descriptor_ranges) + { + D3D12_ROOT_PARAMETER root_parameter = {}; + root_parameter.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + root_parameter.DescriptorTable.NumDescriptorRanges = num_descriptor_ranges; + root_parameter.DescriptorTable.pDescriptorRanges = descriptor_ranges; + root_parameter.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; // CS + return root_parameter; + } + + /** + Initializes the root parameter as constant buffer view. + + \param shader_register The slot of this constant buffer view. + \return The resulting root parameter. + */ + inline D3D12_ROOT_PARAMETER InitAsConstantBufferView(std::uint32_t shader_register) + { + D3D12_ROOT_PARAMETER root_parameter = {}; + root_parameter.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; + root_parameter.Descriptor.RegisterSpace = 0; + root_parameter.Descriptor.ShaderRegister = shader_register; + root_parameter.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; // CS + return root_parameter; + } + + /** + Initializes a linear sampler for a static sampler description. + + \param shader_register The slot of this sampler. + \return The resulting sampler description. 
+ */ + inline D3D12_STATIC_SAMPLER_DESC InitLinearSampler(std::uint32_t shader_register) + { + D3D12_STATIC_SAMPLER_DESC samplerDesc = {}; + samplerDesc.Filter = D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT; + samplerDesc.AddressU = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + samplerDesc.AddressV = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + samplerDesc.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + samplerDesc.ComparisonFunc = D3D12_COMPARISON_FUNC_ALWAYS; + samplerDesc.BorderColor = D3D12_STATIC_BORDER_COLOR_TRANSPARENT_BLACK; + samplerDesc.MinLOD = 0.0f; + samplerDesc.MaxLOD = D3D12_FLOAT32_MAX; + samplerDesc.MipLODBias = 0; + samplerDesc.MaxAnisotropy = 1; + samplerDesc.ShaderRegister = shader_register; + samplerDesc.RegisterSpace = 0; + samplerDesc.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; // Compute + return samplerDesc; + } } namespace ffx_sssr @@ -85,75 +160,37 @@ namespace ffx_sssr , shader_compiler_(context) , samplers_were_populated_(false) , upload_buffer_(*this, create_context_info.uploadBufferSize) + , tile_classification_pass_() + , indirect_args_pass_() + , intersection_pass_() + , spatial_denoising_pass_() + , temporal_denoising_pass_() + , eaw_denoising_pass_() + , indirect_dispatch_command_signature_(nullptr) , reflection_views_(create_context_info.maxReflectionViewCount) { FFX_SSSR_ASSERT(device_ != nullptr); + CompileShaders(create_context_info); + CreateRootSignatures(); + CreatePipelineStates(); - struct - { - char const* shader_name_ = nullptr; - char const* content_ = nullptr; - char const* profile_ = nullptr; - DxcDefine additional_define_ = {}; - } - const shader_source[] = - { - { "prepare_indirect_args", prepare_indirect_args, "cs_6_0"}, - { "classify_tiles", classify_tiles, "cs_6_0"}, - { "intersect", intersect, "cs_6_0"}, - { "resolve_spatial", resolve_spatial, "cs_6_0"}, - { "resolve_temporal", resolve_temporal, "cs_6_0"}, - { "resolve_eaw", resolve_eaw, "cs_6_0"}, - { "resolve_eaw_stride", resolve_eaw_stride, "cs_6_0", {L"FFX_SSSR_EAW_STRIDE", L"2"}}, - { "resolve_eaw_stride", resolve_eaw_stride, "cs_6_0", {L"FFX_SSSR_EAW_STRIDE", L"4"}}, - }; - - auto const common_include = std::string(common); - - DxcDefine defines[11]; - defines[0].Name = L"FFX_SSSR_ROUGHNESS_TEXTURE_FORMAT"; - defines[0].Value = create_context_info.pRoughnessTextureFormat; - defines[1].Name = L"FFX_SSSR_ROUGHNESS_UNPACK_FUNCTION"; - defines[1].Value = create_context_info.pUnpackRoughnessSnippet; - defines[2].Name = L"FFX_SSSR_NORMALS_TEXTURE_FORMAT"; - defines[2].Value = create_context_info.pNormalsTextureFormat; - defines[3].Name = L"FFX_SSSR_NORMALS_UNPACK_FUNCTION"; - defines[3].Value = create_context_info.pUnpackNormalsSnippet; - defines[4].Name = L"FFX_SSSR_MOTION_VECTOR_TEXTURE_FORMAT"; - defines[4].Value = create_context_info.pMotionVectorFormat; - defines[5].Name = L"FFX_SSSR_MOTION_VECTOR_UNPACK_FUNCTION"; - defines[5].Value = create_context_info.pUnpackMotionVectorsSnippet; - defines[6].Name = L"FFX_SSSR_DEPTH_TEXTURE_FORMAT"; - defines[6].Value = create_context_info.pDepthTextureFormat; - defines[7].Name = L"FFX_SSSR_DEPTH_UNPACK_FUNCTION"; - defines[7].Value = create_context_info.pUnpackDepthSnippet; - defines[8].Name = L"FFX_SSSR_SCENE_TEXTURE_FORMAT"; - defines[8].Value = create_context_info.pSceneTextureFormat; - defines[9].Name = L"FFX_SSSR_SCENE_RADIANCE_UNPACK_FUNCTION"; - defines[9].Value = create_context_info.pUnpackSceneRadianceSnippet; - - static_assert(FFX_SSSR_ARRAY_SIZE(shader_source) == kShader_Count, "'kShader_Count' filenames must be provided for building the various 
shaders"); - std::stringstream shader_content; - for (auto i = 0u; i < kShader_Count; ++i) + // Create command signature for indirect arguments { - auto const shader = static_cast(i); - defines[10] = shader_source[i].additional_define_; - ShaderKey const shader_key(shader, 0ull); - if (shaders_.find(shader_key) == shaders_.end()) + D3D12_INDIRECT_ARGUMENT_DESC dispatch = {}; + dispatch.Type = D3D12_INDIRECT_ARGUMENT_TYPE_DISPATCH; + + D3D12_COMMAND_SIGNATURE_DESC desc = {}; + desc.ByteStride = sizeof(D3D12_DISPATCH_ARGUMENTS); + desc.NodeMask = 0; + desc.NumArgumentDescs = 1; + desc.pArgumentDescs = &dispatch; + + HRESULT hr; + hr = device_->CreateCommandSignature(&desc, nullptr, IID_PPV_ARGS(&indirect_dispatch_command_signature_)); + if (!SUCCEEDED(hr)) { - // Append common includes - shader_content.str(std::string()); - shader_content.clear(); - shader_content << common << std::endl << shader_source[i].content_; - shaders_[shader_key] = shader_compiler_.CompileShaderString( - shader_content.str().c_str(), - static_cast(shader_content.str().size()), - shader_source[i].shader_name_, - shader_source[i].profile_, - nullptr, 0, - defines, FFX_SSSR_ARRAY_SIZE(defines)); + throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create command signature for indirect dispatch."); } - FFX_SSSR_ASSERT(shaders_[shader_key]); // should never happen as compile throws in case of failure } // Create our blue noise samplers @@ -276,6 +313,9 @@ namespace ffx_sssr */ ContextD3D12::~ContextD3D12() { + if (indirect_dispatch_command_signature_) + indirect_dispatch_command_signature_->Release(); + indirect_dispatch_command_signature_ = nullptr; } /** @@ -401,6 +441,283 @@ namespace ffx_sssr reflection_views_[ID(reflection_view_id)].Resolve(context_, reflection_view, resolve_reflection_view_info); } + void ContextD3D12::CompileShaders(FfxSssrCreateContextInfo const& create_context_info) + { + struct + { + char const* shader_name_ = nullptr; + char const* content_ = nullptr; + char const* profile_ = nullptr; + } + const shader_source[] = + { + { "prepare_indirect_args", prepare_indirect_args, "cs_6_0"}, + { "classify_tiles", classify_tiles, "cs_6_0"}, + { "intersect", intersect, "cs_6_0"}, + { "resolve_spatial", resolve_spatial, "cs_6_0"}, + { "resolve_temporal", resolve_temporal, "cs_6_0"}, + { "resolve_eaw", resolve_eaw, "cs_6_0"}, + }; + + auto const common_include = std::string(common); + + DxcDefine defines[10]; + defines[0].Name = L"FFX_SSSR_ROUGHNESS_TEXTURE_FORMAT"; + defines[0].Value = create_context_info.pRoughnessTextureFormat; + defines[1].Name = L"FFX_SSSR_ROUGHNESS_UNPACK_FUNCTION"; + defines[1].Value = create_context_info.pUnpackRoughnessSnippet; + defines[2].Name = L"FFX_SSSR_NORMALS_TEXTURE_FORMAT"; + defines[2].Value = create_context_info.pNormalsTextureFormat; + defines[3].Name = L"FFX_SSSR_NORMALS_UNPACK_FUNCTION"; + defines[3].Value = create_context_info.pUnpackNormalsSnippet; + defines[4].Name = L"FFX_SSSR_MOTION_VECTOR_TEXTURE_FORMAT"; + defines[4].Value = create_context_info.pMotionVectorFormat; + defines[5].Name = L"FFX_SSSR_MOTION_VECTOR_UNPACK_FUNCTION"; + defines[5].Value = create_context_info.pUnpackMotionVectorsSnippet; + defines[6].Name = L"FFX_SSSR_DEPTH_TEXTURE_FORMAT"; + defines[6].Value = create_context_info.pDepthTextureFormat; + defines[7].Name = L"FFX_SSSR_DEPTH_UNPACK_FUNCTION"; + defines[7].Value = create_context_info.pUnpackDepthSnippet; + defines[8].Name = L"FFX_SSSR_SCENE_TEXTURE_FORMAT"; + defines[8].Value = create_context_info.pSceneTextureFormat; + 
defines[9].Name = L"FFX_SSSR_SCENE_RADIANCE_UNPACK_FUNCTION"; + defines[9].Value = create_context_info.pUnpackSceneRadianceSnippet; + + static_assert(FFX_SSSR_ARRAY_SIZE(shader_source) == kShader_Count, "'kShader_Count' filenames must be provided for building the various shaders"); + std::stringstream shader_content; + for (auto i = 0u; i < kShader_Count; ++i) + { + // Append common includes + shader_content.str(std::string()); + shader_content.clear(); + shader_content << common << std::endl << shader_source[i].content_; + + shaders_[i] = shader_compiler_.CompileShaderString( + shader_content.str().c_str(), + static_cast(shader_content.str().size()), + shader_source[i].shader_name_, + shader_source[i].profile_, + nullptr, 0, + defines, FFX_SSSR_ARRAY_SIZE(defines)); + } + } + + void ContextD3D12::CreateRootSignatures() + { + auto CreateRootSignature = [this]( + ShaderPass& pass + , const LPCWSTR name + , std::uint32_t num_descriptor_ranges + , D3D12_DESCRIPTOR_RANGE const* descriptor_ranges + ) { + + D3D12_DESCRIPTOR_RANGE environment_map_sampler_range = {}; + environment_map_sampler_range.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER; + environment_map_sampler_range.NumDescriptors = 1; + environment_map_sampler_range.BaseShaderRegister = 1; + environment_map_sampler_range.OffsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND; + + D3D12_ROOT_PARAMETER root[] = { + InitAsDescriptorTable(num_descriptor_ranges, descriptor_ranges), + InitAsConstantBufferView(0), + InitAsDescriptorTable(1, &environment_map_sampler_range), // g_environment_map_sampler + }; + + D3D12_STATIC_SAMPLER_DESC sampler_descs[] = { InitLinearSampler(0) }; // g_linear_sampler + + D3D12_ROOT_SIGNATURE_DESC rs_desc = {}; + rs_desc.NumParameters = FFX_SSSR_ARRAY_SIZE(root); + rs_desc.pParameters = root; + rs_desc.NumStaticSamplers = FFX_SSSR_ARRAY_SIZE(sampler_descs); + rs_desc.pStaticSamplers = sampler_descs; + + HRESULT hr; + ID3DBlob* rs, * rsError; + hr = D3D12SerializeRootSignature(&rs_desc, D3D_ROOT_SIGNATURE_VERSION_1, &rs, &rsError); + if (FAILED(hr)) + { + if (rsError) + { + std::string const error_message(static_cast(rsError->GetBufferPointer())); + rsError->Release(); + throw reflection_error(GetContext(), FFX_SSSR_STATUS_INTERNAL_ERROR, "Unable to serialize root signature:\r\n> %s", error_message.c_str()); + } + else + { + throw reflection_error(GetContext(), FFX_SSSR_STATUS_INTERNAL_ERROR, "Unable to serialize root signature"); + } + } + + hr = GetDevice()->CreateRootSignature(0, rs->GetBufferPointer(), rs->GetBufferSize(), IID_PPV_ARGS(&pass.root_signature_)); + rs->Release(); + if (FAILED(hr)) + { + throw reflection_error(GetContext(), FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create root signature."); + } + + pass.root_signature_->SetName(name); + pass.descriptor_count_ = num_descriptor_ranges; + }; + + // Assemble the shader pass for tile classification + { + D3D12_DESCRIPTOR_RANGE ranges[] = { + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_roughness + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_tile_list + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_ray_list + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_tile_counter + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 3), // g_ray_counter + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 4), // g_temporally_denoised_reflections + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 5), // g_temporally_denoised_reflections_history + 
InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 6), // g_ray_lengths + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 7), // g_temporal_variance + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 8), // g_denoised_reflections + }; + CreateRootSignature(tile_classification_pass_, L"SSSR Tile Classification Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); + } + + // Assemble the shader pass that prepares the indirect arguments + { + D3D12_DESCRIPTOR_RANGE ranges[] = { + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_tile_counter + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_ray_counter + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_intersect_args + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 3), // g_denoiser_args + }; + CreateRootSignature(indirect_args_pass_, L"SSSR Indirect Arguments Pass Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); + } + + // Assemble the shader pass for intersecting reflection rays with the depth buffer + { + D3D12_DESCRIPTOR_RANGE ranges[] = { + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_lit_scene + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 1), // g_depth_buffer_hierarchy + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 2), // g_normal + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 3), // g_roughness + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 4), // g_environment_map + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 5), // g_sobol_buffer + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 6), // g_ranking_tile_buffer + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 7), // g_scrambling_tile_buffer + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 8), // g_ray_list + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_intersection_result + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_ray_lengths + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_denoised_reflections + + }; + CreateRootSignature(intersection_pass_, L"SSSR Depth Buffer Intersection Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); + } + + // Assemble the shader pass for spatial resolve + { + D3D12_DESCRIPTOR_RANGE ranges[] = { + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_depth_buffer + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 1), // g_normal + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 2), // g_roughness + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 3), // g_intersection_result + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 4), // g_has_ray + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 5), // g_tile_list + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_spatially_denoised_reflections + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_ray_lengths + }; + CreateRootSignature(spatial_denoising_pass_, L"SSSR Spatial Resolve Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); + } + + // Assemble the shader pass for temporal resolve + { + D3D12_DESCRIPTOR_RANGE ranges[] = { + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_normal + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 1), // g_roughness + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 2), // g_normal_history + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 3), // g_roughness_history + 
InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 4), // g_depth_buffer + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 5), // g_motion_vectors + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 6), // g_temporally_denoised_reflections_history + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 7), // g_ray_lengths + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 8), // g_tile_list + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_temporally_denoised_reflections + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_spatially_denoised_reflections + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_temporal_variance + }; + CreateRootSignature(temporal_denoising_pass_, L"SSSR Temporal Resolve Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); + } + + // Assemble the shader pass for EAW resolve + { + D3D12_DESCRIPTOR_RANGE ranges[] = { + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_normal + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 1), // g_roughness + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 2), // g_depth_buffer + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 3), // g_tile_list + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_temporally_denoised_reflections + InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_denoised_reflections + }; + CreateRootSignature(eaw_denoising_pass_, L"SSSR EAW Resolve Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); + } + } + + void ContextD3D12::CreatePipelineStates() + { + auto Compile = [this](ShaderPass& pass, ContextD3D12::Shader shader, const LPCWSTR name) { + FFX_SSSR_ASSERT(pass.root_signature_ != nullptr); + + // Create the pipeline state object + D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_state_desc = {}; + pipeline_state_desc.pRootSignature = pass.root_signature_; + pipeline_state_desc.CS = GetShader(shader); + + HRESULT hr = GetDevice()->CreateComputePipelineState(&pipeline_state_desc, + IID_PPV_ARGS(&pass.pipeline_state_)); + if (!SUCCEEDED(hr)) + { + throw reflection_error(GetContext(), FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create compute pipeline state"); + } + + pass.pipeline_state_->SetName(name); + }; + + Compile(tile_classification_pass_, ContextD3D12::kShader_TileClassification, L"SSSR Tile Classification Pipeline"); + Compile(indirect_args_pass_, ContextD3D12::kShader_IndirectArguments, L"SSSR Indirect Arguments Pipeline"); + Compile(intersection_pass_, ContextD3D12::kShader_Intersection, L"SSSR Intersect Pipeline"); + Compile(spatial_denoising_pass_, ContextD3D12::kShader_SpatialResolve, L"SSSR Spatial Resolve Pipeline"); + Compile(temporal_denoising_pass_, ContextD3D12::kShader_TemporalResolve, L"SSSR Temporal Resolve Pipeline"); + Compile(eaw_denoising_pass_, ContextD3D12::kShader_EAWResolve, L"SSSR EAW Resolve Pipeline"); + } + + const ContextD3D12::ShaderPass& ContextD3D12::GetTileClassificationPass() const + { + return tile_classification_pass_; + } + + const ContextD3D12::ShaderPass& ContextD3D12::GetIndirectArgsPass() const + { + return indirect_args_pass_; + } + + const ContextD3D12::ShaderPass& ContextD3D12::GetIntersectionPass() const + { + return intersection_pass_; + } + + const ContextD3D12::ShaderPass& ContextD3D12::GetSpatialDenoisingPass() const + { + return spatial_denoising_pass_; + } + + const ContextD3D12::ShaderPass& ContextD3D12::GetTemporalDenoisingPass() const + { + return temporal_denoising_pass_; + } + + const 
ContextD3D12::ShaderPass& ContextD3D12::GetEawDenoisingPass() const + { + return eaw_denoising_pass_; + } + + ID3D12CommandSignature* ContextD3D12::GetIndirectDispatchCommandSignature() + { + return indirect_dispatch_command_signature_; + } + /** Allocate a buffer resource to use as a shader resource view. diff --git a/ffx-sssr/src/d3d12/context_d3d12.h b/ffx-sssr/src/d3d12/context_d3d12.h index 3d5d4e0..49fc0e7 100644 --- a/ffx-sssr/src/d3d12/context_d3d12.h +++ b/ffx-sssr/src/d3d12/context_d3d12.h @@ -21,7 +21,7 @@ THE SOFTWARE. ********************************************************************/ #pragma once -#include +#include #include #include @@ -54,8 +54,6 @@ namespace ffx_sssr kShader_SpatialResolve, kShader_TemporalResolve, kShader_EAWResolve, - kShader_EAWResolve_Stride_2, - kShader_EAWResolve_Stride_4, kShader_Count }; @@ -87,23 +85,44 @@ namespace ffx_sssr friend class ReflectionViewD3D12; /** - The ShaderKey class allows to look up a specific shader for a given set of switches. + The ShaderPass class holds the data for an individual shader pass. */ - class ShaderKey + class ShaderPass { + FFX_SSSR_NON_COPYABLE(ShaderPass); + public: - inline ShaderKey(); - inline ShaderKey(Shader shader); - inline ShaderKey(Shader shader, std::uint64_t switches); + inline ShaderPass(); + inline ~ShaderPass(); + + inline operator bool() const; - inline bool operator <(ShaderKey const& other) const; + inline ShaderPass(ShaderPass&& other) noexcept; + inline ShaderPass& operator =(ShaderPass&& other) noexcept; - // The shader to be used. - Shader shader_; - // The set of switches to be used. - std::uint64_t switches_; + inline void SafeRelease(); + + // The pipeline state object. + ID3D12PipelineState* pipeline_state_; + // The root signature to be used. + ID3D12RootSignature* root_signature_; + // The number of descriptors in the root signature. + std::uint32_t descriptor_count_; }; + void CompileShaders(FfxSssrCreateContextInfo const& create_context_info); + void CreateRootSignatures(); + void CreatePipelineStates(); + + const ShaderPass& GetTileClassificationPass() const; + const ShaderPass& GetIndirectArgsPass() const; + const ShaderPass& GetIntersectionPass() const; + const ShaderPass& GetSpatialDenoisingPass() const; + const ShaderPass& GetTemporalDenoisingPass() const; + const ShaderPass& GetEawDenoisingPass() const; + + ID3D12CommandSignature* GetIndirectDispatchCommandSignature(); + bool AllocateSRVBuffer(std::size_t buffer_size, ID3D12Resource** resource, D3D12_RESOURCE_STATES initial_resource_state, wchar_t const* resource_name = nullptr) const; bool AllocateUAVBuffer(std::size_t buffer_size, ID3D12Resource** resource, D3D12_RESOURCE_STATES initial_resource_state, wchar_t const* resource_name = nullptr) const; bool AllocateReadbackBuffer(std::size_t buffer_size, ID3D12Resource** resource, D3D12_RESOURCE_STATES initial_resource_state, wchar_t const* resource_name = nullptr) const; @@ -113,7 +132,7 @@ namespace ffx_sssr // The device to be used. ID3D12Device* device_; // The compiled reflections shaders. - std::map shaders_; + std::array shaders_; // The compiler to be used for building the Direct3D12 shaders. ShaderCompilerD3D12 shader_compiler_; // The Blue Noise sampler optimized for 1 sample per pixel. @@ -126,6 +145,22 @@ namespace ffx_sssr UploadBufferD3D12 upload_buffer_; // The array of reflection views to be resolved. SparseArray reflection_views_; + + // The shader pass that classifies tiles. 
+ ShaderPass tile_classification_pass_; + // The shader pass that prepares the indirect arguments. + ShaderPass indirect_args_pass_; + // The shader pass intersecting reflection rays with the depth buffer. + ShaderPass intersection_pass_; + // The shader pass that does spatial denoising. + ShaderPass spatial_denoising_pass_; + // The shader pass that does temporal denoising. + ShaderPass temporal_denoising_pass_; + // The shader pass that does the second spatial denoising. + ShaderPass eaw_denoising_pass_; + + // The command signature for the indirect dispatches. + ID3D12CommandSignature* indirect_dispatch_command_signature_; }; } diff --git a/ffx-sssr/src/d3d12/context_d3d12.inl b/ffx-sssr/src/d3d12/context_d3d12.inl index 948a4e4..a88bb65 100644 --- a/ffx-sssr/src/d3d12/context_d3d12.inl +++ b/ffx-sssr/src/d3d12/context_d3d12.inl @@ -24,50 +24,6 @@ THE SOFTWARE. namespace ffx_sssr { - /** - The constructor for the ShaderKey class. - */ - ContextD3D12::ShaderKey::ShaderKey() - : shader_(kShader_Count) - , switches_(0ull) - { - } - - /** - The constructor for the ShaderKey class. - - \param shader The shader to be used. - */ - ContextD3D12::ShaderKey::ShaderKey(Shader shader) - : shader_(shader) - , switches_(0ull) - { - } - - /** - The constructor for the ShaderKey class. - - \param shader The shader to be used. - \param switches The set of switches to be used. - */ - ContextD3D12::ShaderKey::ShaderKey(Shader shader, std::uint64_t switches) - : shader_(shader) - , switches_(switches) - { - } - - /** - Compares the two shader keys. - - \param other The shader key to be comparing with. - \return true if the shader key is less than the other one. - */ - bool ContextD3D12::ShaderKey::operator <(ShaderKey const& other) const - { - if (shader_ != other.shader_) - return (shader_ < other.shader_); - return switches_ < other.switches_; - } /** Gets the context. @@ -113,16 +69,12 @@ namespace ffx_sssr Gets the shader. \param shader The shader to be retrieved. - \param switches The set of switches to be used. \return The requested shader. */ ShaderD3D12 const& ContextD3D12::GetShader(Shader shader) const { FFX_SSSR_ASSERT(shader < kShader_Count); - ShaderKey const shader_key(shader, 0ull); - auto const it = shaders_.find(shader_key); - FFX_SSSR_ASSERT(it != shaders_.end()); - return (*it).second; + return shaders_[shader]; } /** @@ -188,4 +140,86 @@ namespace ffx_sssr throw reflection_error(context, FFX_SSSR_STATUS_INVALID_VALUE, "No command list was supplied, cannot encode device commands"); return command_list; } + + + /** + The constructor for the ShaderPass class. + */ + ContextD3D12::ShaderPass::ShaderPass() + : pipeline_state_(nullptr) + , root_signature_(nullptr) + , descriptor_count_(0) + { + } + + /** + The constructor for the ShaderPass class. + + \param other The shader pass to be moved. + */ + ContextD3D12::ShaderPass::ShaderPass(ShaderPass&& other) noexcept + : pipeline_state_(other.pipeline_state_) + , root_signature_(other.root_signature_) + , descriptor_count_(other.descriptor_count_) + { + other.pipeline_state_ = nullptr; + other.root_signature_ = nullptr; + other.descriptor_count_ = 0; + } + + /** + The destructor for the ShaderPass class. + */ + ContextD3D12::ShaderPass::~ShaderPass() + { + SafeRelease(); + } + + /** + Assigns the shader pass. + + \param other The shader pass to be moved. + \return The assigned shader pass. 
+ */ + ContextD3D12::ShaderPass& ContextD3D12::ShaderPass::operator =(ShaderPass&& other) noexcept + { + if (this != &other) + { + pipeline_state_ = other.pipeline_state_; + root_signature_ = other.root_signature_; + descriptor_count_ = other.descriptor_count_; + + other.pipeline_state_ = nullptr; + other.root_signature_ = nullptr; + descriptor_count_ = 0; + } + + return *this; + } + + /** + Releases the shader pass. + */ + inline void ContextD3D12::ShaderPass::SafeRelease() + { + if (pipeline_state_) + pipeline_state_->Release(); + pipeline_state_ = nullptr; + + if (root_signature_) + root_signature_->Release(); + root_signature_ = nullptr; + + descriptor_count_ = 0; + } + + /** + Checks whether the shader pass is valid. + + \return true if the shader pass is valid, false otherwise. + */ + ContextD3D12::ShaderPass::operator bool() const + { + return (pipeline_state_ && root_signature_); + } } diff --git a/ffx-sssr/src/d3d12/reflection_view_d3d12.cpp b/ffx-sssr/src/d3d12/reflection_view_d3d12.cpp index 91f587d..221c98a 100644 --- a/ffx-sssr/src/d3d12/reflection_view_d3d12.cpp +++ b/ffx-sssr/src/d3d12/reflection_view_d3d12.cpp @@ -31,86 +31,6 @@ THE SOFTWARE. #include "ffx_sssr_d3d12.h" #include "descriptor_heap_d3d12.h" -namespace -{ - /** - Initializes the descriptor range. - - \param range_type The type of the descriptor range. - \param num_descriptors The number of descriptors in the range. - \param base_shader_register The base descriptor for the range in shader code. - \return The resulting descriptor range. - */ - inline D3D12_DESCRIPTOR_RANGE InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE range_type, std::uint32_t num_descriptors, std::uint32_t base_shader_register) - { - D3D12_DESCRIPTOR_RANGE descriptor_range = {}; - descriptor_range.RangeType = range_type; - descriptor_range.NumDescriptors = num_descriptors; - descriptor_range.BaseShaderRegister = base_shader_register; - descriptor_range.OffsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND; - return descriptor_range; - } - - /** - Initializes the root parameter as descriptor table. - - \param num_descriptor_ranges The number of descriptor ranges for this parameter. - \param descriptor_ranges The array of descriptor ranges for this parameter. - \return The resulting root parameter. - */ - inline D3D12_ROOT_PARAMETER InitAsDescriptorTable(std::uint32_t num_descriptor_ranges, D3D12_DESCRIPTOR_RANGE const* descriptor_ranges) - { - D3D12_ROOT_PARAMETER root_parameter = {}; - root_parameter.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; - root_parameter.DescriptorTable.NumDescriptorRanges = num_descriptor_ranges; - root_parameter.DescriptorTable.pDescriptorRanges = descriptor_ranges; - root_parameter.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; // CS - return root_parameter; - } - - /** - Initializes the root parameter as constant buffer view. - - \param shader_register The slot of this constant buffer view. - \return The resulting root parameter. - */ - inline D3D12_ROOT_PARAMETER InitAsConstantBufferView(std::uint32_t shader_register) - { - D3D12_ROOT_PARAMETER root_parameter = {}; - root_parameter.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; - root_parameter.Descriptor.RegisterSpace = 0; - root_parameter.Descriptor.ShaderRegister = shader_register; - root_parameter.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; // CS - return root_parameter; - } - - - /** - Initializes a linear sampler for a static sampler description. - - \param shader_register The slot of this sampler. 
- \return The resulting sampler description. - */ - inline D3D12_STATIC_SAMPLER_DESC InitLinearSampler(std::uint32_t shader_register) - { - D3D12_STATIC_SAMPLER_DESC samplerDesc = {}; - samplerDesc.Filter = D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT; - samplerDesc.AddressU = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; - samplerDesc.AddressV = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; - samplerDesc.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; - samplerDesc.ComparisonFunc = D3D12_COMPARISON_FUNC_ALWAYS; - samplerDesc.BorderColor = D3D12_STATIC_BORDER_COLOR_TRANSPARENT_BLACK; - samplerDesc.MinLOD = 0.0f; - samplerDesc.MaxLOD = D3D12_FLOAT32_MAX; - samplerDesc.MipLODBias = 0; - samplerDesc.MaxAnisotropy = 1; - samplerDesc.ShaderRegister = shader_register; - samplerDesc.RegisterSpace = 0; - samplerDesc.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; // Compute - return samplerDesc; - } -} - namespace ffx_sssr { /** @@ -121,23 +41,7 @@ namespace ffx_sssr , height_(0) , flags_(0) , descriptor_heap_cbv_srv_uav_(nullptr) - , tile_classification_pass_() - , descriptor_count_tile_classification_(0) - , indirect_args_pass_() - , descriptor_count_indirect_args_(0) - , intersection_pass_() - , descriptor_count_intersection_(0) - , spatial_denoising_pass_() - , descriptor_count_spatial_(0) - , temporal_denoising_pass_() - , descriptor_count_temporal_(0) - , eaw_denoising_pass_() - , eaw_stride_2_denoising_pass_() - , eaw_stride_4_denoising_pass_() - , descriptor_count_eaw_(0) - , descriptor_count_eaw_stride_2_(0) - , descriptor_count_eaw_stride_4_(0) - , indirect_dispatch_command_signature_(nullptr) + , descriptor_heap_samplers_(nullptr) , resource_heap_(nullptr) , tile_list_(nullptr) , tile_counter_(nullptr) @@ -148,9 +52,6 @@ namespace ffx_sssr , temporal_denoiser_result_() , ray_lengths_(nullptr) , temporal_variance_(nullptr) - , sobol_buffer_() - , ranking_tile_buffer_() - , scrambling_tile_buffer_() , tile_classification_elapsed_time_(0) , intersection_elapsed_time_(0) , denoising_elapsed_time_(0) @@ -165,8 +66,7 @@ namespace ffx_sssr , spatial_denoising_descriptor_table_() , temporal_denoising_descriptor_table_() , eaw_denoising_descriptor_table_() - , eaw_stride_2_denoising_descriptor_table_() - , eaw_stride_4_denoising_descriptor_table_() + , sampler_descriptor_table_() , prev_view_projection_() { } @@ -180,15 +80,8 @@ namespace ffx_sssr : width_(other.width_) , height_(other.height_) , flags_(other.flags_) - , indirect_args_pass_(std::move(other.indirect_args_pass_)) - , tile_classification_pass_(std::move(other.tile_classification_pass_)) - , intersection_pass_(std::move(other.intersection_pass_)) - , spatial_denoising_pass_(std::move(other.spatial_denoising_pass_)) - , temporal_denoising_pass_(std::move(other.temporal_denoising_pass_)) - , eaw_denoising_pass_(std::move(other.eaw_denoising_pass_)) - , eaw_stride_2_denoising_pass_(std::move(other.eaw_stride_2_denoising_pass_)) - , eaw_stride_4_denoising_pass_(std::move(other.eaw_stride_4_denoising_pass_)) , descriptor_heap_cbv_srv_uav_(other.descriptor_heap_cbv_srv_uav_) + , descriptor_heap_samplers_(other.descriptor_heap_samplers_) , tile_classification_elapsed_time_(other.tile_classification_elapsed_time_) , intersection_elapsed_time_(other.intersection_elapsed_time_) , denoising_elapsed_time_(other.denoising_elapsed_time_) @@ -196,18 +89,6 @@ namespace ffx_sssr , timestamp_query_buffer_(other.timestamp_query_buffer_) , timestamp_queries_(std::move(other.timestamp_queries_)) , timestamp_queries_index_(other.timestamp_queries_index_) - , 
sobol_buffer_(other.sobol_buffer_) - , ranking_tile_buffer_(other.ranking_tile_buffer_) - , scrambling_tile_buffer_(other.scrambling_tile_buffer_) - , descriptor_count_tile_classification_(other.descriptor_count_tile_classification_) - , descriptor_count_indirect_args_(other.descriptor_count_indirect_args_) - , descriptor_count_intersection_(other.descriptor_count_intersection_) - , descriptor_count_spatial_(other.descriptor_count_spatial_) - , descriptor_count_temporal_(other.descriptor_count_temporal_) - , descriptor_count_eaw_(other.descriptor_count_eaw_) - , descriptor_count_eaw_stride_2_(other.descriptor_count_eaw_stride_2_) - , descriptor_count_eaw_stride_4_(other.descriptor_count_eaw_stride_4_) - , indirect_dispatch_command_signature_(other.indirect_dispatch_command_signature_) , resource_heap_(other.resource_heap_) , tile_list_(other.tile_list_) , tile_counter_(other.tile_counter_) @@ -223,6 +104,7 @@ namespace ffx_sssr other.timestamp_query_heap_ = nullptr; other.timestamp_query_buffer_ = nullptr; other.descriptor_heap_cbv_srv_uav_ = nullptr; + other.descriptor_heap_samplers_ = nullptr; for (int i = 0; i < 2; ++i) { @@ -233,29 +115,10 @@ namespace ffx_sssr spatial_denoising_descriptor_table_[i] = other.spatial_denoising_descriptor_table_[i]; temporal_denoising_descriptor_table_[i] = other.temporal_denoising_descriptor_table_[i]; eaw_denoising_descriptor_table_[i] = other.eaw_denoising_descriptor_table_[i]; - eaw_stride_2_denoising_descriptor_table_[i] = other.eaw_stride_2_denoising_descriptor_table_[i]; - eaw_stride_4_denoising_descriptor_table_[i] = other.eaw_stride_4_denoising_descriptor_table_[i]; - other.temporal_denoiser_result_[i] = nullptr; } + sampler_descriptor_table_ = other.sampler_descriptor_table_; - other.tile_classification_pass_.root_signature_ = nullptr; - other.tile_classification_pass_.pipeline_state_ = nullptr; - other.indirect_args_pass_.root_signature_ = nullptr; - other.indirect_args_pass_.pipeline_state_ = nullptr; - other.intersection_pass_.root_signature_ = nullptr; - other.intersection_pass_.pipeline_state_ = nullptr; - other.spatial_denoising_pass_.root_signature_ = nullptr; - other.spatial_denoising_pass_.pipeline_state_ = nullptr; - other.temporal_denoising_pass_.root_signature_ = nullptr; - other.temporal_denoising_pass_.pipeline_state_ = nullptr; - other.eaw_denoising_pass_.root_signature_ = nullptr; - other.eaw_denoising_pass_.pipeline_state_ = nullptr; - other.eaw_stride_2_denoising_pass_.root_signature_ = nullptr; - other.eaw_stride_2_denoising_pass_.pipeline_state_ = nullptr; - other.eaw_stride_4_denoising_pass_.root_signature_ = nullptr; - other.eaw_stride_4_denoising_pass_.pipeline_state_ = nullptr; - other.indirect_dispatch_command_signature_ = nullptr; other.resource_heap_ = nullptr; other.tile_list_ = nullptr; other.tile_counter_ = nullptr; @@ -290,24 +153,9 @@ namespace ffx_sssr width_ = other.width_; height_ = other.height_; flags_ = other.flags_; - indirect_args_pass_.root_signature_ = other.indirect_args_pass_.root_signature_; - indirect_args_pass_.pipeline_state_ = other.indirect_args_pass_.pipeline_state_; - tile_classification_pass_.root_signature_ = other.tile_classification_pass_.root_signature_; - tile_classification_pass_.pipeline_state_ = other.tile_classification_pass_.pipeline_state_; - intersection_pass_.root_signature_ = other.intersection_pass_.root_signature_; - intersection_pass_.pipeline_state_ = other.intersection_pass_.pipeline_state_; - spatial_denoising_pass_.root_signature_ = 
other.spatial_denoising_pass_.root_signature_; - spatial_denoising_pass_.pipeline_state_ = other.spatial_denoising_pass_.pipeline_state_; - temporal_denoising_pass_.root_signature_ = other.temporal_denoising_pass_.root_signature_; - temporal_denoising_pass_.pipeline_state_ = other.temporal_denoising_pass_.pipeline_state_; - eaw_denoising_pass_.root_signature_ = other.eaw_denoising_pass_.root_signature_; - eaw_denoising_pass_.pipeline_state_ = other.eaw_denoising_pass_.pipeline_state_; - eaw_stride_2_denoising_pass_.root_signature_ = other.eaw_stride_2_denoising_pass_.root_signature_; - eaw_stride_2_denoising_pass_.pipeline_state_ = other.eaw_stride_2_denoising_pass_.pipeline_state_; - eaw_stride_4_denoising_pass_.root_signature_ = other.eaw_stride_4_denoising_pass_.root_signature_; - eaw_stride_4_denoising_pass_.pipeline_state_ = other.eaw_stride_4_denoising_pass_.pipeline_state_; descriptor_heap_cbv_srv_uav_ = other.descriptor_heap_cbv_srv_uav_; + descriptor_heap_samplers_ = other.descriptor_heap_samplers_; tile_classification_elapsed_time_ = other.tile_classification_elapsed_time_; intersection_elapsed_time_ = other.intersection_elapsed_time_; denoising_elapsed_time_ = other.denoising_elapsed_time_; @@ -315,18 +163,6 @@ namespace ffx_sssr timestamp_query_buffer_ = other.timestamp_query_buffer_; timestamp_queries_ = other.timestamp_queries_;; timestamp_queries_index_ = other.timestamp_queries_index_; - sobol_buffer_ = other.sobol_buffer_; - ranking_tile_buffer_ = other.ranking_tile_buffer_; - scrambling_tile_buffer_ = other.scrambling_tile_buffer_; - descriptor_count_tile_classification_ = other.descriptor_count_tile_classification_; - descriptor_count_indirect_args_ = other.descriptor_count_indirect_args_; - descriptor_count_intersection_ = other.descriptor_count_intersection_; - descriptor_count_spatial_ = other.descriptor_count_spatial_; - descriptor_count_temporal_ = other.descriptor_count_temporal_; - descriptor_count_eaw_ = other.descriptor_count_eaw_; - descriptor_count_eaw_stride_2_ = other.descriptor_count_eaw_stride_2_; - descriptor_count_eaw_stride_4_ = other.descriptor_count_eaw_stride_4_; - indirect_dispatch_command_signature_ = other.indirect_dispatch_command_signature_; resource_heap_ = other.resource_heap_; tile_list_ = other.tile_list_; tile_counter_ = other.tile_counter_; @@ -342,6 +178,7 @@ namespace ffx_sssr other.timestamp_query_heap_ = nullptr; other.timestamp_query_buffer_ = nullptr; other.descriptor_heap_cbv_srv_uav_ = nullptr; + other.descriptor_heap_samplers_ = nullptr; for (int i = 0; i < 2; ++i) { @@ -352,29 +189,11 @@ namespace ffx_sssr spatial_denoising_descriptor_table_[i] = other.spatial_denoising_descriptor_table_[i]; temporal_denoising_descriptor_table_[i] = other.temporal_denoising_descriptor_table_[i]; eaw_denoising_descriptor_table_[i] = other.eaw_denoising_descriptor_table_[i]; - eaw_stride_2_denoising_descriptor_table_[i] = other.eaw_stride_2_denoising_descriptor_table_[i]; - eaw_stride_4_denoising_descriptor_table_[i] = other.eaw_stride_4_denoising_descriptor_table_[i]; other.temporal_denoiser_result_[i] = nullptr; } + sampler_descriptor_table_ = other.sampler_descriptor_table_; - other.tile_classification_pass_.root_signature_ = nullptr; - other.tile_classification_pass_.pipeline_state_ = nullptr; - other.indirect_args_pass_.root_signature_ = nullptr; - other.indirect_args_pass_.pipeline_state_ = nullptr; - other.intersection_pass_.root_signature_ = nullptr; - other.intersection_pass_.pipeline_state_ = nullptr; - 
other.spatial_denoising_pass_.root_signature_ = nullptr; - other.spatial_denoising_pass_.pipeline_state_ = nullptr; - other.temporal_denoising_pass_.root_signature_ = nullptr; - other.temporal_denoising_pass_.pipeline_state_ = nullptr; - other.eaw_denoising_pass_.root_signature_ = nullptr; - other.eaw_denoising_pass_.pipeline_state_ = nullptr; - other.eaw_stride_2_denoising_pass_.root_signature_ = nullptr; - other.eaw_stride_2_denoising_pass_.pipeline_state_ = nullptr; - other.eaw_stride_4_denoising_pass_.root_signature_ = nullptr; - other.eaw_stride_4_denoising_pass_.pipeline_state_ = nullptr; - other.indirect_dispatch_command_signature_ = nullptr; other.resource_heap_ = nullptr; other.tile_list_ = nullptr; other.tile_counter_ = nullptr; @@ -419,15 +238,13 @@ namespace ffx_sssr scene_format_ = create_reflection_view_info.pD3D12CreateReflectionViewInfo->sceneFormat; // Create reflection view resources - CreateRootSignature(context, create_reflection_view_info); - CreatePipelineState(context); CreateDescriptorHeaps(context); // Create tile classification-related buffers { ID3D12Device * device = context.GetContextD3D12()->GetDevice(); - uint32_t num_tiles = RoundedDivide(width_ * height_, 64u); + uint32_t num_tiles = RoundedDivide(width_, 8u) * RoundedDivide(height_, 8u); uint32_t num_pixels = width_ * height_; uint32_t tile_list_element_count = num_tiles; @@ -516,26 +333,7 @@ namespace ffx_sssr intersection_pass_indirect_args_->SetName(L"SSSR Intersect Indirect Args"); denoiser_pass_indirect_args_->SetName(L"SSSR Denoiser Indirect Args"); } - // Create command signature for indirect arguments - { - D3D12_INDIRECT_ARGUMENT_DESC dispatch = {}; - dispatch.Type = D3D12_INDIRECT_ARGUMENT_TYPE_DISPATCH; - - ID3D12Device * device = context.GetContextD3D12()->GetDevice(); - D3D12_COMMAND_SIGNATURE_DESC desc = {}; - desc.ByteStride = sizeof(D3D12_DISPATCH_ARGUMENTS); - desc.NodeMask = 0; - desc.NumArgumentDescs = 1; - desc.pArgumentDescs = &dispatch; - - HRESULT hr; - hr = device->CreateCommandSignature(&desc, nullptr, IID_PPV_ARGS(&indirect_dispatch_command_signature_)); - if (!SUCCEEDED(hr)) - { - throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create command signature for indirect dispatch."); - } - } - + // Create denoising-related resources { auto CreateCommittedResource = [this, &context]( @@ -589,35 +387,33 @@ namespace ffx_sssr temporal_variance_->SetName(L"SSSR Temporal Variance"); } + ContextD3D12* d3d12_context = context.GetContextD3D12(); + // Setup the descriptor tables { + descriptor_heap_samplers_->AllocateStaticDescriptor(sampler_descriptor_table_, 1); + // Suballocate descriptor heap for descriptor tables for (int i = 0; i < 2; ++i) { DescriptorD3D12 table; - descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, descriptor_count_tile_classification_); + descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, d3d12_context->GetTileClassificationPass().descriptor_count_); tile_classification_descriptor_table_[i] = table; - descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, descriptor_count_indirect_args_); + descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, d3d12_context->GetIndirectArgsPass().descriptor_count_); indirect_args_descriptor_table_[i] = table; - descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, descriptor_count_intersection_); + descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, d3d12_context->GetIntersectionPass().descriptor_count_); intersection_descriptor_table_[i] = table; - 
descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, descriptor_count_spatial_); + descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, d3d12_context->GetSpatialDenoisingPass().descriptor_count_); spatial_denoising_descriptor_table_[i] = table; - descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, descriptor_count_temporal_); + descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, d3d12_context->GetTemporalDenoisingPass().descriptor_count_); temporal_denoising_descriptor_table_[i] = table; - descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, descriptor_count_eaw_); + descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, d3d12_context->GetEawDenoisingPass().descriptor_count_); eaw_denoising_descriptor_table_[i] = table; - - descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, descriptor_count_eaw_stride_2_); - eaw_stride_2_denoising_descriptor_table_[i] = table; - - descriptor_heap_cbv_srv_uav_->AllocateStaticDescriptor(table, descriptor_count_eaw_stride_4_); - eaw_stride_4_denoising_descriptor_table_[i] = table; } ID3D12Device * device = context.GetContextD3D12()->GetDevice(); @@ -632,6 +428,7 @@ namespace ffx_sssr D3D12_CPU_DESCRIPTOR_HANDLE roughness_history_buffer_srv = create_reflection_view_info.pD3D12CreateReflectionViewInfo->roughnessHistoryBufferSRV; D3D12_CPU_DESCRIPTOR_HANDLE environment_map_srv = create_reflection_view_info.pD3D12CreateReflectionViewInfo->environmentMapSRV; D3D12_CPU_DESCRIPTOR_HANDLE output_buffer_uav = create_reflection_view_info.pD3D12CreateReflectionViewInfo->reflectionViewUAV; + const D3D12_SAMPLER_DESC* environment_map_sampler_desc = create_reflection_view_info.pD3D12CreateReflectionViewInfo->pEnvironmentMapSamplerDesc; D3D12_CPU_DESCRIPTOR_HANDLE normal_buffers[] = { normal_buffer_srv, normal_history_buffer_srv }; D3D12_CPU_DESCRIPTOR_HANDLE roughness_buffers[] = { roughness_buffer_srv, roughness_history_buffer_srv }; @@ -688,9 +485,10 @@ namespace ffx_sssr }; // Place the descriptors + device->CreateSampler(environment_map_sampler_desc, sampler_descriptor_table_.GetCPUDescriptor(0)); // g_environment_map_sampler for (int i = 0; i < 2; ++i) { - uint32_t num_tiles = RoundedDivide(width_ * height_, 64u); + uint32_t num_tiles = RoundedDivide(width_, 8u) * RoundedDivide(height_, 8u); uint32_t num_pixels = width_ * height_; // Tile Classifier pass @@ -746,10 +544,11 @@ namespace ffx_sssr shader_resource_view_desc.Buffer.NumElements = static_cast(sampler.scrambling_tile_buffer_->GetDesc().Width / sizeof(std::int32_t)); shader_resource_view_desc.Buffer.StructureByteStride = static_cast(sizeof(std::int32_t)); device->CreateShaderResourceView(sampler.scrambling_tile_buffer_, &shader_resource_view_desc, table.GetCPUDescriptor(offset++)); // g_scrambling_tile_buffer + + device->CreateShaderResourceView(ray_list_, &SRV_Buffer(num_pixels), table.GetCPUDescriptor(offset++)); // g_ray_list device->CreateUnorderedAccessView(temporal_denoiser_result_[i], nullptr, &UAV_Tex2D(scene_format_), table.GetCPUDescriptor(offset++)); // g_intersection_result device->CreateUnorderedAccessView(ray_lengths_, nullptr, &UAV_Tex2D(DXGI_FORMAT_R16_FLOAT), table.GetCPUDescriptor(offset++)); // g_ray_lengths device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), output_buffer_uav, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_denoised_reflections - device->CreateUnorderedAccessView(ray_list_, nullptr, &UAV_Buffer(num_pixels), table.GetCPUDescriptor(offset++)); // g_ray_list } // Spatial denoising pass @@ -759,11 
+558,11 @@ namespace ffx_sssr device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), depth_hierarchy_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_depth_buffer device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), ping_pong_normal ? normal_buffers[i] : normal_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_normal device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), ping_pong_roughness ? roughness_buffers[i] : roughness_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_roughness + device->CreateShaderResourceView(temporal_denoiser_result_[i], &SRV_Tex2D(scene_format_), table.GetCPUDescriptor(offset++)); // g_intersection_result + device->CreateShaderResourceView(temporal_variance_, &SRV_Tex2D(DXGI_FORMAT_R8_UNORM), table.GetCPUDescriptor(offset++)); // g_has_ray + device->CreateShaderResourceView(tile_list_, &SRV_Buffer(num_tiles), table.GetCPUDescriptor(offset++)); // g_tile_list device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), output_buffer_uav, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_spatially_denoised_reflections device->CreateUnorderedAccessView(ray_lengths_, nullptr, &UAV_Tex2D(DXGI_FORMAT_R16_FLOAT), table.GetCPUDescriptor(offset++)); // g_ray_lengths - device->CreateUnorderedAccessView(temporal_denoiser_result_[i], nullptr, &UAV_Tex2D(scene_format_), table.GetCPUDescriptor(offset++)); // g_intersection_result - device->CreateUnorderedAccessView(temporal_variance_, nullptr, &UAV_Tex2D(DXGI_FORMAT_R8_UNORM), table.GetCPUDescriptor(offset++)); // g_has_ray - device->CreateUnorderedAccessView(tile_list_, nullptr, &UAV_Buffer(num_tiles), table.GetCPUDescriptor(offset++)); // g_tile_list } // Temporal denoising pass @@ -776,12 +575,12 @@ namespace ffx_sssr device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), ping_pong_roughness ? 
roughness_buffers[1 - i] : roughness_history_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_roughness_history device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), depth_hierarchy_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_depth_buffer device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), motion_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_motion_vectors + device->CreateShaderResourceView(temporal_denoiser_result_[1 - i], &SRV_Tex2D(scene_format_), table.GetCPUDescriptor(offset++)); // g_temporally_denoised_reflections_history + device->CreateShaderResourceView(ray_lengths_, &SRV_Tex2D(DXGI_FORMAT_R16_FLOAT), table.GetCPUDescriptor(offset++)); // g_ray_lengths + device->CreateShaderResourceView(tile_list_, &SRV_Buffer(num_tiles), table.GetCPUDescriptor(offset++)); // g_tile_list device->CreateUnorderedAccessView(temporal_denoiser_result_[i], nullptr, &UAV_Tex2D(scene_format_), table.GetCPUDescriptor(offset++)); // g_temporally_denoised_reflections device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), output_buffer_uav, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_spatially_denoised_reflections device->CreateUnorderedAccessView(temporal_variance_, nullptr, &UAV_Tex2D(DXGI_FORMAT_R8_UNORM), table.GetCPUDescriptor(offset++)); // g_temporal_variance - device->CreateUnorderedAccessView(temporal_denoiser_result_[1 - i], nullptr, &UAV_Tex2D(scene_format_), table.GetCPUDescriptor(offset++)); // g_temporally_denoised_reflections_history - device->CreateUnorderedAccessView(ray_lengths_, nullptr, &UAV_Tex2D(DXGI_FORMAT_R16_FLOAT), table.GetCPUDescriptor(offset++)); // g_ray_lengths - device->CreateUnorderedAccessView(tile_list_, nullptr, &UAV_Buffer(num_tiles), table.GetCPUDescriptor(offset++)); // g_tile_list } // EAW denoising pass @@ -791,33 +590,9 @@ namespace ffx_sssr device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), ping_pong_normal ? normal_buffers[i] : normal_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_normal device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), ping_pong_roughness ? roughness_buffers[i] : roughness_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_roughness device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), depth_hierarchy_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_depth_buffer + device->CreateShaderResourceView(tile_list_, &SRV_Buffer(num_tiles), table.GetCPUDescriptor(offset++)); // g_tile_list device->CreateUnorderedAccessView(temporal_denoiser_result_[i], nullptr, &UAV_Tex2D(scene_format_), table.GetCPUDescriptor(offset++)); // g_temporally_denoised_reflections device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), output_buffer_uav, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_denoised_reflections - device->CreateUnorderedAccessView(tile_list_, nullptr, &UAV_Buffer(num_tiles), table.GetCPUDescriptor(offset++)); // g_tile_list - } - - // EAW Stride 2 denoising pass (the same as the EAW pass, but input and output buffers flipped) - { - DescriptorD3D12 table = eaw_stride_2_denoising_descriptor_table_[i]; - uint32_t offset = 0; - device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), ping_pong_normal ? normal_buffers[i] : normal_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_normal - device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), ping_pong_roughness ? 
roughness_buffers[i] : roughness_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_roughness - device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), depth_hierarchy_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_depth_buffer - device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), output_buffer_uav, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_denoised_reflections - device->CreateUnorderedAccessView(temporal_denoiser_result_[i], nullptr, &UAV_Tex2D(scene_format_), table.GetCPUDescriptor(offset++)); // g_temporally_denoised_reflections - device->CreateUnorderedAccessView(tile_list_, nullptr, &UAV_Buffer(num_tiles), table.GetCPUDescriptor(offset++)); // g_tile_list - } - - // EAW Stride 4 denoising pass (the very same as the EAW pass) - { - DescriptorD3D12 table = eaw_stride_4_denoising_descriptor_table_[i]; - uint32_t offset = 0; - device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), ping_pong_normal ? normal_buffers[i] : normal_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_normal - device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), ping_pong_roughness ? roughness_buffers[i] : roughness_buffer_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_roughness - device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), depth_hierarchy_srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_depth_buffer - device->CreateUnorderedAccessView(temporal_denoiser_result_[i], nullptr, &UAV_Tex2D(scene_format_), table.GetCPUDescriptor(offset++)); // g_temporally_denoised_reflections - device->CopyDescriptorsSimple(1, table.GetCPUDescriptor(offset++), output_buffer_uav, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // g_denoised_reflections - device->CreateUnorderedAccessView(tile_list_, nullptr, &UAV_Buffer(num_tiles), table.GetCPUDescriptor(offset++)); // g_tile_list } } } @@ -859,25 +634,18 @@ namespace ffx_sssr */ void ReflectionViewD3D12::Destroy() { - intersection_pass_.SafeRelease(); - spatial_denoising_pass_.SafeRelease(); - temporal_denoising_pass_.SafeRelease(); - tile_classification_pass_.SafeRelease(); - indirect_args_pass_.SafeRelease(); - eaw_denoising_pass_.SafeRelease(); - eaw_stride_2_denoising_pass_.SafeRelease(); - eaw_stride_4_denoising_pass_.SafeRelease(); - if (descriptor_heap_cbv_srv_uav_) delete descriptor_heap_cbv_srv_uav_; descriptor_heap_cbv_srv_uav_ = nullptr; + if (descriptor_heap_samplers_) + delete descriptor_heap_samplers_; + descriptor_heap_samplers_ = nullptr; #define FFX_SSSR_SAFE_RELEASE(x)\ if(x) { x->Release(); }\ x = nullptr; - FFX_SSSR_SAFE_RELEASE(indirect_dispatch_command_signature_); FFX_SSSR_SAFE_RELEASE(timestamp_query_heap_); FFX_SSSR_SAFE_RELEASE(timestamp_query_buffer_); FFX_SSSR_SAFE_RELEASE(temporal_denoiser_result_[0]); @@ -897,208 +665,6 @@ namespace ffx_sssr timestamp_queries_.resize(0u); } - /** - Creates the reflection view root signature. - - \param context The context to be used. 
- */ - void ReflectionViewD3D12::CreateRootSignature(Context& context, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info) - { - auto CreateRootSignature = [&context, &create_reflection_view_info]( - ShaderPass& pass - , const LPCWSTR name - , std::uint32_t num_descriptor_ranges - , D3D12_DESCRIPTOR_RANGE const* descriptor_ranges - ) { - - D3D12_ROOT_PARAMETER root[] = { - InitAsDescriptorTable(num_descriptor_ranges, descriptor_ranges), - InitAsConstantBufferView(0) - }; - - D3D12_STATIC_SAMPLER_DESC environment_sampler = *create_reflection_view_info.pD3D12CreateReflectionViewInfo->pEnvironmentMapSamplerDesc; - environment_sampler.RegisterSpace = 0; - environment_sampler.ShaderRegister = 1; - environment_sampler.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - - D3D12_STATIC_SAMPLER_DESC sampler_descs[] = { InitLinearSampler(0), environment_sampler }; // g_linear_sampler - - D3D12_ROOT_SIGNATURE_DESC rs_desc = {}; - rs_desc.NumParameters = FFX_SSSR_ARRAY_SIZE(root); - rs_desc.pParameters = root; - rs_desc.NumStaticSamplers = FFX_SSSR_ARRAY_SIZE(sampler_descs); - rs_desc.pStaticSamplers = sampler_descs; - - HRESULT hr; - ID3DBlob* rs, *rsError; - hr = D3D12SerializeRootSignature(&rs_desc, D3D_ROOT_SIGNATURE_VERSION_1, &rs, &rsError); - if (FAILED(hr)) - { - if (rsError) - { - std::string const error_message(static_cast(rsError->GetBufferPointer())); - rsError->Release(); - throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Unable to serialize root signature:\r\n> %s", error_message.c_str()); - } - else - { - throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Unable to serialize root signature"); - } - } - - hr = context.GetContextD3D12()->GetDevice()->CreateRootSignature(0, rs->GetBufferPointer(), rs->GetBufferSize(), IID_PPV_ARGS(&pass.root_signature_)); - rs->Release(); - if (FAILED(hr)) - { - throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create root signature."); - } - - pass.root_signature_->SetName(name); - }; - - // Assemble the shader pass for tile classification - { - D3D12_DESCRIPTOR_RANGE ranges[] = { - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_roughness - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_tile_list - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_ray_list - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_tile_counter - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 3), // g_ray_counter - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 4), // g_temporally_denoised_reflections - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 5), // g_temporally_denoised_reflections_history - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 6), // g_ray_lengths - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 7), // g_temporal_variance - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 8), // g_denoised_reflections - }; - CreateRootSignature(tile_classification_pass_, L"SSSR Tile Classification Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); - descriptor_count_tile_classification_ = FFX_SSSR_ARRAY_SIZE(ranges); - } - - // Assemble the shader pass that prepares the indirect arguments - { - D3D12_DESCRIPTOR_RANGE ranges[] = { - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_tile_counter - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_ray_counter - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_intersect_args - 
InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 3), // g_denoiser_args - }; - CreateRootSignature(indirect_args_pass_, L"SSSR Indirect Arguments Pass Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); - descriptor_count_indirect_args_ = FFX_SSSR_ARRAY_SIZE(ranges); - } - - // Assemble the shader pass for intersecting reflection rays with the depth buffer - { - D3D12_DESCRIPTOR_RANGE ranges[] = { - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_lit_scene - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 1), // g_depth_buffer_hierarchy - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 2), // g_normal - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 3), // g_roughness - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 4), // g_environment_map - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 5), // g_sobol_buffer - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 6), // g_ranking_tile_buffer - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 7), // g_scrambling_tile_buffer - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_intersection_result - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_ray_lengths - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_denoised_reflections - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 3), // g_ray_list - }; - CreateRootSignature(intersection_pass_, L"SSSR Depth Buffer Intersection Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); - descriptor_count_intersection_ = FFX_SSSR_ARRAY_SIZE(ranges); - } - - // Assemble the shader pass for spatial resolve - { - D3D12_DESCRIPTOR_RANGE ranges[] = { - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_depth_buffer - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 1), // g_normal - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 2), // g_roughness - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_spatially_denoised_reflections - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_ray_lengths - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_intersection_result - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 3), // g_has_ray - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 4), // g_tile_list - }; - CreateRootSignature(spatial_denoising_pass_, L"SSSR Spatial Resolve Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); - descriptor_count_spatial_ = FFX_SSSR_ARRAY_SIZE(ranges); - } - - // Assemble the shader pass for temporal resolve - { - D3D12_DESCRIPTOR_RANGE ranges[] = { - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_normal - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 1), // g_roughness - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 2), // g_normal_history - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 3), // g_roughness_history - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 4), // g_depth_buffer - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 5), // g_motion_vectors - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_temporally_denoised_reflections - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_spatially_denoised_reflections - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_temporal_variance - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 3), // g_temporally_denoised_reflections_history - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 
1, 4), // g_ray_lengths - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 5), // g_tile_list - }; - CreateRootSignature(temporal_denoising_pass_, L"SSSR Temporal Resolve Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); - descriptor_count_temporal_ = FFX_SSSR_ARRAY_SIZE(ranges); - } - - // Assemble the shader pass for EAW resolve - { - D3D12_DESCRIPTOR_RANGE ranges[] = { - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0), // g_normal - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 1), // g_roughness - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 2), // g_depth_buffer - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0), // g_temporally_denoised_reflections - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1), // g_denoised_reflections - InitDescriptorRange(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 2), // g_tile_list - }; - CreateRootSignature(eaw_denoising_pass_, L"SSSR EAW Resolve Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); - descriptor_count_eaw_ = FFX_SSSR_ARRAY_SIZE(ranges); - - CreateRootSignature(eaw_stride_2_denoising_pass_, L"SSSR EAW Stride 2 Resolve Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); - descriptor_count_eaw_stride_2_ = FFX_SSSR_ARRAY_SIZE(ranges); - - CreateRootSignature(eaw_stride_4_denoising_pass_, L"SSSR EAW Stride 4 Resolve Root Signature", FFX_SSSR_ARRAY_SIZE(ranges), ranges); - descriptor_count_eaw_stride_4_ = FFX_SSSR_ARRAY_SIZE(ranges); - - } - } - - /** - Creates the reflection view pipeline state. - - \param context The context to be used. - */ - void ReflectionViewD3D12::CreatePipelineState(Context& context) - { - auto Compile = [&context](ShaderPass& pass, ContextD3D12::Shader shader, const LPCWSTR name) { - FFX_SSSR_ASSERT(pass.root_signature_ != nullptr); - - // Create the pipeline state object - D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_state_desc = {}; - pipeline_state_desc.pRootSignature = pass.root_signature_; - pipeline_state_desc.CS = context.GetContextD3D12()->GetShader(shader); - - HRESULT hr = context.GetContextD3D12()->GetDevice()->CreateComputePipelineState(&pipeline_state_desc, - IID_PPV_ARGS(&pass.pipeline_state_)); - if (!SUCCEEDED(hr)) - { - throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create compute pipeline state"); - } - - pass.pipeline_state_->SetName(name); - }; - - Compile(tile_classification_pass_, ContextD3D12::kShader_TileClassification, L"SSSR Tile Classification Pipeline"); - Compile(indirect_args_pass_, ContextD3D12::kShader_IndirectArguments, L"SSSR Indirect Arguments Pipeline"); - Compile(intersection_pass_, ContextD3D12::kShader_Intersection, L"SSSR Intersect Pipeline"); - Compile(spatial_denoising_pass_, ContextD3D12::kShader_SpatialResolve, L"SSSR Spatial Resolve Pipeline"); - Compile(temporal_denoising_pass_, ContextD3D12::kShader_TemporalResolve, L"SSSR Temporal Resolve Pipeline"); - Compile(eaw_denoising_pass_, ContextD3D12::kShader_EAWResolve, L"SSSR EAW Resolve Pipeline"); - Compile(eaw_stride_2_denoising_pass_, ContextD3D12::kShader_EAWResolve_Stride_2, L"SSSR EAW Stride 2 Resolve Pipeline"); - Compile(eaw_stride_4_denoising_pass_, ContextD3D12::kShader_EAWResolve_Stride_4, L"SSSR EAW Stride 4 Resolve Pipeline"); - } - /** Creates the descriptor heaps. 
@@ -1106,12 +672,25 @@ namespace ffx_sssr */ void ReflectionViewD3D12::CreateDescriptorHeaps(Context& context) { + ContextD3D12* d3d12_context = context.GetContextD3D12(); + FFX_SSSR_ASSERT(!descriptor_heap_cbv_srv_uav_); + FFX_SSSR_ASSERT(!descriptor_heap_samplers_); + descriptor_heap_cbv_srv_uav_ = new DescriptorHeapD3D12(context); FFX_SSSR_ASSERT(descriptor_heap_cbv_srv_uav_ != nullptr); - - std::uint32_t descriptor_count = descriptor_count_tile_classification_ + descriptor_count_indirect_args_ + descriptor_count_intersection_ + descriptor_count_spatial_ + descriptor_count_temporal_ + descriptor_count_eaw_ + descriptor_count_eaw_stride_2_ + descriptor_count_eaw_stride_4_; + std::uint32_t descriptor_count + = d3d12_context->GetTileClassificationPass().descriptor_count_ + + d3d12_context->GetIndirectArgsPass().descriptor_count_ + + d3d12_context->GetIntersectionPass().descriptor_count_ + + d3d12_context->GetSpatialDenoisingPass().descriptor_count_ + + d3d12_context->GetTemporalDenoisingPass().descriptor_count_ + + d3d12_context->GetEawDenoisingPass().descriptor_count_; descriptor_heap_cbv_srv_uav_->Create(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, 2 * descriptor_count, 0u); + + descriptor_heap_samplers_ = new DescriptorHeapD3D12(context); + FFX_SSSR_ASSERT(descriptor_heap_samplers_ != nullptr); + descriptor_heap_samplers_->Create(D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER, 1, 0u); // g_environment_map_sampler } /** @@ -1149,17 +728,8 @@ namespace ffx_sssr // Get hold of the command list for recording FFX_SSSR_ASSERT(resolve_reflection_view_info.pD3D12CommandEncodeInfo); auto const command_list = ContextD3D12::GetCommandList(context, resolve_reflection_view_info.pD3D12CommandEncodeInfo->pCommandList); - FFX_SSSR_ASSERT(descriptor_heap_cbv_srv_uav_ && command_list); - FFX_SSSR_ASSERT(tile_classification_pass_); - FFX_SSSR_ASSERT(indirect_args_pass_); - FFX_SSSR_ASSERT(intersection_pass_); - FFX_SSSR_ASSERT(spatial_denoising_pass_); - FFX_SSSR_ASSERT(temporal_denoising_pass_); - FFX_SSSR_ASSERT(eaw_denoising_pass_); - FFX_SSSR_ASSERT(eaw_stride_2_denoising_pass_); - FFX_SSSR_ASSERT(eaw_stride_4_denoising_pass_); + FFX_SSSR_ASSERT(descriptor_heap_cbv_srv_uav_ && descriptor_heap_samplers_ && command_list); FFX_SSSR_ASSERT(resolve_reflection_view_info.samplesPerQuad == FFX_SSSR_RAY_SAMPLES_PER_QUAD_1 || resolve_reflection_view_info.samplesPerQuad == FFX_SSSR_RAY_SAMPLES_PER_QUAD_2 || resolve_reflection_view_info.samplesPerQuad == FFX_SSSR_RAY_SAMPLES_PER_QUAD_4); - FFX_SSSR_ASSERT(resolve_reflection_view_info.eawPassCount == FFX_SSSR_EAW_PASS_COUNT_1 || resolve_reflection_view_info.eawPassCount == FFX_SSSR_EAW_PASS_COUNT_3); // Query timestamp value prior to resolving the reflection view if ((flags_ & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS) != 0) @@ -1268,8 +838,8 @@ namespace ffx_sssr prev_view_projection_ = view_projection; std::uint32_t current_frame = context.GetFrameIndex() & 1u; - ID3D12DescriptorHeap *descriptorHeaps[] = { descriptor_heap_cbv_srv_uav_->GetDescriptorHeap() }; - command_list->SetDescriptorHeaps(1, descriptorHeaps); + ID3D12DescriptorHeap *descriptor_heaps[] = { descriptor_heap_cbv_srv_uav_->GetDescriptorHeap(), descriptor_heap_samplers_->GetDescriptorHeap() }; + command_list->SetDescriptorHeaps(FFX_SSSR_ARRAY_SIZE(descriptor_heaps), descriptor_heaps); ID3D12Resource * cb_resource = upload_buffer.GetResource(); size_t offset = upload_buffer.GetOffset(pass_data); @@ -1294,12 +864,14 @@ namespace ffx_sssr return barrier; }; + ContextD3D12* d3d12_context = 
context.GetContextD3D12(); + // Tile Classification pass { - command_list->SetComputeRootSignature(tile_classification_pass_.root_signature_); + command_list->SetComputeRootSignature(d3d12_context->GetTileClassificationPass().root_signature_); command_list->SetComputeRootDescriptorTable(0, tile_classification_descriptor_table_[current_frame].GetGPUDescriptor()); command_list->SetComputeRootConstantBufferView(1, cb_resource->GetGPUVirtualAddress() + offset); - command_list->SetPipelineState(tile_classification_pass_.pipeline_state_); + command_list->SetPipelineState(d3d12_context->GetTileClassificationPass().pipeline_state_); uint32_t dim_x = RoundedDivide(width_, 8u); uint32_t dim_y = RoundedDivide(height_, 8u); command_list->Dispatch(dim_x, dim_y, 1); @@ -1316,10 +888,10 @@ namespace ffx_sssr // Indirect Arguments pass { - command_list->SetComputeRootSignature(indirect_args_pass_.root_signature_); + command_list->SetComputeRootSignature(d3d12_context->GetIndirectArgsPass().root_signature_); command_list->SetComputeRootDescriptorTable(0, indirect_args_descriptor_table_[current_frame].GetGPUDescriptor()); command_list->SetComputeRootConstantBufferView(1, cb_resource->GetGPUVirtualAddress() + offset); - command_list->SetPipelineState(indirect_args_pass_.pipeline_state_); + command_list->SetPipelineState(d3d12_context->GetIndirectArgsPass().pipeline_state_); command_list->Dispatch(1, 1, 1); } @@ -1348,11 +920,12 @@ namespace ffx_sssr // Intersection pass { - command_list->SetComputeRootSignature(intersection_pass_.root_signature_); + command_list->SetComputeRootSignature(d3d12_context->GetIntersectionPass().root_signature_); command_list->SetComputeRootDescriptorTable(0, intersection_descriptor_table_[current_frame].GetGPUDescriptor()); command_list->SetComputeRootConstantBufferView(1, cb_resource->GetGPUVirtualAddress() + offset); - command_list->SetPipelineState(intersection_pass_.pipeline_state_); - command_list->ExecuteIndirect(indirect_dispatch_command_signature_, 1, intersection_pass_indirect_args_, 0, nullptr, 0); + command_list->SetComputeRootDescriptorTable(2, sampler_descriptor_table_.GetGPUDescriptor()); + command_list->SetPipelineState(d3d12_context->GetIntersectionPass().pipeline_state_); + command_list->ExecuteIndirect(d3d12_context->GetIndirectDispatchCommandSignature(), 1, intersection_pass_indirect_args_, 0, nullptr, 0); } // Query the amount of time spent in the intersection pass @@ -1376,11 +949,11 @@ namespace ffx_sssr // Spatial denoiser passes { - command_list->SetComputeRootSignature(spatial_denoising_pass_.root_signature_); + command_list->SetComputeRootSignature(d3d12_context->GetSpatialDenoisingPass().root_signature_); command_list->SetComputeRootDescriptorTable(0, spatial_denoising_descriptor_table_[current_frame].GetGPUDescriptor()); command_list->SetComputeRootConstantBufferView(1, cb_resource->GetGPUVirtualAddress() + offset); - command_list->SetPipelineState(spatial_denoising_pass_.pipeline_state_); - command_list->ExecuteIndirect(indirect_dispatch_command_signature_, 1, denoiser_pass_indirect_args_, 0, nullptr, 0); + command_list->SetPipelineState(d3d12_context->GetSpatialDenoisingPass().pipeline_state_); + command_list->ExecuteIndirect(d3d12_context->GetIndirectDispatchCommandSignature(), 1, denoiser_pass_indirect_args_, 0, nullptr, 0); } // Ensure that the spatial denoising pass finished. We don't have the resource for the final result available, thus we have to wait for any UAV access to finish. 
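
Every GPU-driven pass recorded above follows the same shape: bind the pass root signature, descriptor table and constant buffer, set its pipeline state, then launch it via ExecuteIndirect so the thread group counts are read from the argument buffer that the indirect-arguments pass wrote on the GPU, and fence the UAV writes before the next pass consumes them. A minimal sketch of that pattern (illustrative only, not part of this patch; the function and parameter names are hypothetical, and it assumes a command signature created from a single D3D12_INDIRECT_ARGUMENT_TYPE_DISPATCH argument and an argument buffer holding one D3D12_DISPATCH_ARGUMENTS struct):

#include <d3d12.h>

void RecordIndirectComputePass(
    ID3D12GraphicsCommandList*  command_list,
    ID3D12RootSignature*        root_signature,
    ID3D12PipelineState*        pipeline_state,
    D3D12_GPU_DESCRIPTOR_HANDLE descriptor_table,
    D3D12_GPU_VIRTUAL_ADDRESS   constants,
    ID3D12CommandSignature*     dispatch_signature,
    ID3D12Resource*             indirect_args) // holds {ThreadGroupCountX, Y, Z} written by a prior pass
{
    command_list->SetComputeRootSignature(root_signature);
    command_list->SetComputeRootDescriptorTable(0, descriptor_table);
    command_list->SetComputeRootConstantBufferView(1, constants);
    command_list->SetPipelineState(pipeline_state);

    // Thread group counts come from the GPU-written argument buffer rather than the CPU,
    // so the pass only runs over the tiles/rays that the classification pass emitted.
    command_list->ExecuteIndirect(dispatch_signature, 1, indirect_args, 0, nullptr, 0);

    // A null UAV barrier waits for all outstanding UAV accesses; this is the blunt option
    // used when the exact resource written by the pass is not known at this point.
    D3D12_RESOURCE_BARRIER barrier = {};
    barrier.Type          = D3D12_RESOURCE_BARRIER_TYPE_UAV;
    barrier.UAV.pResource = nullptr;
    command_list->ResourceBarrier(1, &barrier);
}
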
@@ -1388,11 +961,11 @@ namespace ffx_sssr // Temporal denoiser passes { - command_list->SetComputeRootSignature(temporal_denoising_pass_.root_signature_); + command_list->SetComputeRootSignature(d3d12_context->GetTemporalDenoisingPass().root_signature_); command_list->SetComputeRootDescriptorTable(0, temporal_denoising_descriptor_table_[current_frame].GetGPUDescriptor()); command_list->SetComputeRootConstantBufferView(1, cb_resource->GetGPUVirtualAddress() + offset); - command_list->SetPipelineState(temporal_denoising_pass_.pipeline_state_); - command_list->ExecuteIndirect(indirect_dispatch_command_signature_, 1, denoiser_pass_indirect_args_, 0, nullptr, 0); + command_list->SetPipelineState(d3d12_context->GetTemporalDenoisingPass().pipeline_state_); + command_list->ExecuteIndirect(d3d12_context->GetIndirectDispatchCommandSignature(), 1, denoiser_pass_indirect_args_, 0, nullptr, 0); } // Ensure that the temporal denoising pass finished @@ -1400,33 +973,11 @@ namespace ffx_sssr // EAW denoiser passes { - command_list->SetComputeRootSignature(eaw_denoising_pass_.root_signature_); + command_list->SetComputeRootSignature(d3d12_context->GetEawDenoisingPass().root_signature_); command_list->SetComputeRootDescriptorTable(0, eaw_denoising_descriptor_table_[current_frame].GetGPUDescriptor()); command_list->SetComputeRootConstantBufferView(1, cb_resource->GetGPUVirtualAddress() + offset); - command_list->SetPipelineState(eaw_denoising_pass_.pipeline_state_); - command_list->ExecuteIndirect(indirect_dispatch_command_signature_, 1, denoiser_pass_indirect_args_, 0, nullptr, 0); - } - - if (resolve_reflection_view_info.eawPassCount == FFX_SSSR_EAW_PASS_COUNT_3) - { - // Ensure that the prior EAW pass has finished - command_list->ResourceBarrier(1, &UAVBarrier(nullptr)); - - // EAW Stride 2 denoiser pass - command_list->SetComputeRootSignature(eaw_stride_2_denoising_pass_.root_signature_); - command_list->SetComputeRootDescriptorTable(0, eaw_stride_2_denoising_descriptor_table_[current_frame].GetGPUDescriptor()); - command_list->SetComputeRootConstantBufferView(1, cb_resource->GetGPUVirtualAddress() + offset); - command_list->SetPipelineState(eaw_stride_2_denoising_pass_.pipeline_state_); - command_list->ExecuteIndirect(indirect_dispatch_command_signature_, 1, denoiser_pass_indirect_args_, 0, nullptr, 0); - - command_list->ResourceBarrier(1, &UAVBarrier(temporal_denoiser_result_[current_frame])); - - // EAW Stride 4 denoiser pass - command_list->SetComputeRootSignature(eaw_stride_4_denoising_pass_.root_signature_); - command_list->SetComputeRootDescriptorTable(0, eaw_stride_4_denoising_descriptor_table_[current_frame].GetGPUDescriptor()); - command_list->SetComputeRootConstantBufferView(1, cb_resource->GetGPUVirtualAddress() + offset); - command_list->SetPipelineState(eaw_stride_4_denoising_pass_.pipeline_state_); - command_list->ExecuteIndirect(indirect_dispatch_command_signature_, 1, denoiser_pass_indirect_args_, 0, nullptr, 0); + command_list->SetPipelineState(d3d12_context->GetEawDenoisingPass().pipeline_state_); + command_list->ExecuteIndirect(d3d12_context->GetIndirectDispatchCommandSignature(), 1, denoiser_pass_indirect_args_, 0, nullptr, 0); } // Query the amount of time spent in the denoiser passes diff --git a/ffx-sssr/src/d3d12/reflection_view_d3d12.h b/ffx-sssr/src/d3d12/reflection_view_d3d12.h index a982617..423be51 100644 --- a/ffx-sssr/src/d3d12/reflection_view_d3d12.h +++ b/ffx-sssr/src/d3d12/reflection_view_d3d12.h @@ -62,30 +62,6 @@ namespace ffx_sssr */ using TimestampQueries = 
std::vector; - /** - The ShaderPass class holds the data for an individual shader pass. - */ - class ShaderPass - { - FFX_SSSR_NON_COPYABLE(ShaderPass); - - public: - inline ShaderPass(); - inline ~ShaderPass(); - - inline operator bool() const; - - inline ShaderPass(ShaderPass&& other) noexcept; - inline ShaderPass& operator =(ShaderPass&& other) noexcept; - - inline void SafeRelease(); - - // The pipeline state object. - ID3D12PipelineState* pipeline_state_; - // The root signature to be used. - ID3D12RootSignature* root_signature_; - }; - ReflectionViewD3D12(); ~ReflectionViewD3D12(); @@ -95,8 +71,6 @@ namespace ffx_sssr void Create(Context& context, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info); void Destroy(); - void CreateRootSignature(Context& context, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info); - void CreatePipelineState(Context& context); void CreateDescriptorHeaps(Context& context); std::uint32_t GetTimestampQueryIndex() const; @@ -113,48 +87,8 @@ namespace ffx_sssr // The descriptor heap for CBVs, SRVs, and UAVs. DescriptorHeapD3D12* descriptor_heap_cbv_srv_uav_; - // The shader pass that classifies tiles. - ShaderPass tile_classification_pass_; - // The number of descriptors used in the root signature. - std::uint32_t descriptor_count_tile_classification_; - - // The shader pass that prepares the indirect arguments. - ShaderPass indirect_args_pass_; - // The number of descriptors used in the root signature. - std::uint32_t descriptor_count_indirect_args_; - - // The shader pass intersecting reflection rays with the depth buffer. - ShaderPass intersection_pass_; - // The number of descriptors used in the root signature. - std::uint32_t descriptor_count_intersection_; - - // The shader pass that does spatial denoising. - ShaderPass spatial_denoising_pass_; - // The number of descriptors used in the root signature. - std::uint32_t descriptor_count_spatial_; - - // The shader pass that does temporal denoising. - ShaderPass temporal_denoising_pass_; - // The number of descriptors used in the root signature. - std::uint32_t descriptor_count_temporal_; - - // The shader pass that does the second spatial denoising. - ShaderPass eaw_denoising_pass_; - // The number of descriptors used in the root signature. - std::uint32_t descriptor_count_eaw_; - - // The shader pass that does the second spatial denoising with stride 2. - ShaderPass eaw_stride_2_denoising_pass_; - // The number of descriptors used in the root signature. - std::uint32_t descriptor_count_eaw_stride_2_; - - // The shader pass that does the second spatial denoising with stride 4. - ShaderPass eaw_stride_4_denoising_pass_; - // The number of descriptors used in the root signature. - std::uint32_t descriptor_count_eaw_stride_4_; - - // The command signature for the indirect dispatches. - ID3D12CommandSignature * indirect_dispatch_command_signature_; + // The descriptor heap for samplers. + DescriptorHeapD3D12* descriptor_heap_samplers_; // Single heap containing all resources. ID3D12Heap * resource_heap_; @@ -176,13 +110,6 @@ namespace ffx_sssr // Holds the temporal variance of the last two frames. ID3D12Resource * temporal_variance_; - // The Sobol sequence buffer. - D3D12_GPU_DESCRIPTOR_HANDLE sobol_buffer_; - // The ranking tile buffer for sampling. - D3D12_GPU_DESCRIPTOR_HANDLE ranking_tile_buffer_; - // The scrambling tile buffer for sampling. - D3D12_GPU_DESCRIPTOR_HANDLE scrambling_tile_buffer_; - // The number of GPU ticks spent in the tile classification pass. 
std::uint64_t tile_classification_elapsed_time_; // The number of GPU ticks spent in depth buffer intersection. @@ -217,14 +144,10 @@ namespace ffx_sssr DescriptorD3D12 temporal_denoising_descriptor_table_[2]; // Descriptor tables of the eaw denoising pass. DescriptorD3D12 eaw_denoising_descriptor_table_[2]; - // Descriptor tables of the eaw denoising pass with stride 2. - DescriptorD3D12 eaw_stride_2_denoising_descriptor_table_[2]; - // Descriptor tables of the eaw denoising pass with stride 4. - DescriptorD3D12 eaw_stride_4_denoising_descriptor_table_[2]; + // Descriptor tables for the environment map sampler. + DescriptorD3D12 sampler_descriptor_table_; // The view projection matrix of the last frame. matrix4 prev_view_projection_; }; } - -#include "reflection_view_d3d12.inl" \ No newline at end of file diff --git a/ffx-sssr/src/d3d12/reflection_view_d3d12.inl b/ffx-sssr/src/d3d12/reflection_view_d3d12.inl deleted file mode 100644 index 92b3847..0000000 --- a/ffx-sssr/src/d3d12/reflection_view_d3d12.inl +++ /dev/null @@ -1,106 +0,0 @@ -#include "reflection_view_d3d12.h" -/********************************************************************** -Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -********************************************************************/ -#pragma once - -namespace ffx_sssr -{ - /** - The constructor for the ShaderPass class. - */ - ReflectionViewD3D12::ShaderPass::ShaderPass() - : pipeline_state_(nullptr) - , root_signature_(nullptr) - { - } - - /** - The constructor for the ShaderPass class. - - \param other The shader pass to be moved. - */ - ReflectionViewD3D12::ShaderPass::ShaderPass(ShaderPass&& other) noexcept - : pipeline_state_(other.pipeline_state_) - , root_signature_(other.root_signature_) - { - other.pipeline_state_ = nullptr; - other.root_signature_ = nullptr; - } - - /** - The destructor for the ShaderPass class. - */ - ReflectionViewD3D12::ShaderPass::~ShaderPass() - { - if (pipeline_state_) - pipeline_state_->Release(); - pipeline_state_ = nullptr; - - if (root_signature_) - root_signature_->Release(); - root_signature_ = nullptr; - } - - /** - Assigns the shader pass. - - \param other The shader pass to be moved. - \return The assigned shader pass. 
- */ - ReflectionViewD3D12::ShaderPass& ReflectionViewD3D12::ShaderPass::operator =(ShaderPass&& other) noexcept - { - if (this != &other) - { - pipeline_state_ = other.pipeline_state_; - root_signature_ = other.root_signature_; - - other.pipeline_state_ = nullptr; - other.root_signature_ = nullptr; - } - - return *this; - } - - /** - Releases the shader pass. - */ - inline void ReflectionViewD3D12::ShaderPass::SafeRelease() - { - if (pipeline_state_) - pipeline_state_->Release(); - pipeline_state_ = nullptr; - - if (root_signature_) - root_signature_->Release(); - root_signature_ = nullptr; - } - - /** - Checks whether the shader pass is valid. - - \return true if the shader pass is valid, false otherwise. - */ - ReflectionViewD3D12::ShaderPass::operator bool() const - { - return (pipeline_state_ && root_signature_); - } -} diff --git a/ffx-sssr/src/d3d12/shader_compiler_d3d12.cpp b/ffx-sssr/src/d3d12/shader_compiler_d3d12.cpp index e3fb705..cd01097 100644 --- a/ffx-sssr/src/d3d12/shader_compiler_d3d12.cpp +++ b/ffx-sssr/src/d3d12/shader_compiler_d3d12.cpp @@ -25,6 +25,10 @@ THE SOFTWARE. #include #include +#if FFX_SSSR_DUMP_SHADERS +#include +#endif // FFX_SSSR_DUMP_SHADERS + #include "reflection_error.h" #include "utils.h" @@ -196,6 +200,19 @@ namespace ffx_sssr FFX_SSSR_ASSERT(dxc_program != nullptr); dxc_result->Release(); +#if FFX_SSSR_DUMP_SHADERS + IDxcBlobEncoding* disasm; + HRESULT hr = dxc_compiler_->Disassemble(dxc_program, &disasm); + if (SUCCEEDED(hr)) + { + std::wstring path = shader_name + std::wstring(L".dxil.disasm"); + std::ofstream filestream(path.c_str()); + filestream.write((const char*)disasm->GetBufferPointer(), disasm->GetBufferSize()); + filestream.close(); + disasm->Release(); + } +#endif // FFX_SSSR_DUMP_SHADERS + // Retrieve the shader bytecode shader.BytecodeLength = dxc_program->GetBufferSize(); auto const shader_bytecode = malloc(shader.BytecodeLength); @@ -204,6 +221,13 @@ namespace ffx_sssr shader.pShaderBytecode = shader_bytecode; dxc_program->Release(); +#if FFX_SSSR_DUMP_SHADERS + std::wstring path = shader_name + std::wstring(L".dxil"); + std::ofstream filestream(path.c_str(), std::ios::binary | std::ios::out); + filestream.write((const char*)shader.pShaderBytecode, shader.BytecodeLength); + filestream.close(); +#endif // FFX_SSSR_DUMP_SHADERS + return shader; } } diff --git a/ffx-sssr/src/reflection_error.cpp b/ffx-sssr/src/reflection_error.cpp index 0665610..f59cc6a 100644 --- a/ffx-sssr/src/reflection_error.cpp +++ b/ffx-sssr/src/reflection_error.cpp @@ -49,7 +49,7 @@ namespace ffx_sssr \param context The context to be used. \param error The error code for this exception. */ - reflection_error::reflection_error(Context& context, FfxSssrStatus error) + reflection_error::reflection_error(const Context& context, FfxSssrStatus error) : error_(error) { (void)&context; @@ -63,7 +63,7 @@ namespace ffx_sssr \param format The format for the error message. \param ... The content of the error message. */ - reflection_error::reflection_error(Context& context, FfxSssrStatus error, char const* format, ...) + reflection_error::reflection_error(const Context& context, FfxSssrStatus error, char const* format, ...) 
: error_(error) { va_list args; diff --git a/ffx-sssr/src/reflection_error.h b/ffx-sssr/src/reflection_error.h index c2d17fc..b49195a 100644 --- a/ffx-sssr/src/reflection_error.h +++ b/ffx-sssr/src/reflection_error.h @@ -37,8 +37,8 @@ namespace ffx_sssr public: reflection_error(); reflection_error(FfxSssrStatus error); - reflection_error(Context& context, FfxSssrStatus error); - reflection_error(Context& context, FfxSssrStatus error, char const* format, ...); + reflection_error(const Context& context, FfxSssrStatus error); + reflection_error(const Context& context, FfxSssrStatus error, char const* format, ...); // The error code for this exception. FfxSssrStatus error_; diff --git a/ffx-sssr/src/vk/buffer_vk.cpp b/ffx-sssr/src/vk/buffer_vk.cpp new file mode 100644 index 0000000..f68e45d --- /dev/null +++ b/ffx-sssr/src/vk/buffer_vk.cpp @@ -0,0 +1,245 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#include "buffer_vk.h" +#include "memory.h" + +namespace ffx_sssr +{ + /** + The constructor for the BufferVK class. + */ + BufferVK::BufferVK() + : buffer_(VK_NULL_HANDLE) + , device_(VK_NULL_HANDLE) + , memory_(VK_NULL_HANDLE) + , buffer_view_(VK_NULL_HANDLE) + , mappable_(false) + , mapped_(false) + { + } + + /** + The destructor for the BufferVK class. + */ + BufferVK::~BufferVK() + { + if (mapped_) + { + Unmap(); + } + + if (buffer_) + { + vkDestroyBuffer(device_, buffer_, nullptr); + buffer_ = VK_NULL_HANDLE; + } + + if (memory_) + { + vkFreeMemory(device_, memory_, nullptr); + memory_ = VK_NULL_HANDLE; + } + + if (buffer_view_) + { + vkDestroyBufferView(device_, buffer_view_, nullptr); + buffer_view_ = VK_NULL_HANDLE; + } + + device_ = VK_NULL_HANDLE; + } + + /** + The constructor for the BufferVK class. + + \param device The VkDevice that creates the buffer view. + \param physical_device The VkPhysicalDevice to determine the right memory heap. + \param create_info The CreateInfo struct. 
+    */
+    BufferVK::BufferVK(VkDevice device, VkPhysicalDevice physical_device, const CreateInfo& create_info)
+        : device_(device)
+        , buffer_(VK_NULL_HANDLE)
+        , memory_(VK_NULL_HANDLE)
+        , buffer_view_(VK_NULL_HANDLE)
+        , mappable_(false)
+        , mapped_(false)
+    {
+        VkBufferCreateInfo buffer_create_info = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
+        buffer_create_info.pNext = nullptr;
+        buffer_create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+        buffer_create_info.size = create_info.size_in_bytes_;
+        buffer_create_info.usage = create_info.buffer_usage_;
+        if (VK_SUCCESS != vkCreateBuffer(device_, &buffer_create_info, nullptr, &buffer_))
+        {
+            throw reflection_error(FFX_SSSR_STATUS_INTERNAL_ERROR);
+        }
+
+        VkMemoryRequirements memory_requirements = {};
+        vkGetBufferMemoryRequirements(device_, buffer_, &memory_requirements);
+
+        VkPhysicalDeviceMemoryProperties memory_properties = {};
+        vkGetPhysicalDeviceMemoryProperties(physical_device, &memory_properties);
+
+        // find the right memory type for this buffer
+        int memory_type_index = -1;
+        for (uint32_t i = 0; i < memory_properties.memoryTypeCount; ++i)
+        {
+            const VkMemoryType& memory_type = memory_properties.memoryTypes[i];
+            // require all requested property flags to be present, not just an overlap
+            bool has_required_properties = (memory_type.propertyFlags & create_info.memory_property_flags) == create_info.memory_property_flags;
+            bool is_required_memory_type = memory_requirements.memoryTypeBits & (1 << i);
+            if (has_required_properties && is_required_memory_type)
+            {
+                memory_type_index = i;
+                break;
+            }
+        }
+
+        // abort if we couldn't find the right memory type
+        if (memory_type_index == -1)
+        {
+            throw reflection_error(FFX_SSSR_STATUS_INTERNAL_ERROR);
+        }
+
+        if (create_info.memory_property_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
+        {
+            mappable_ = true;
+            mapped_ = false;
+        }
+
+        VkMemoryAllocateInfo memory_allocate_info = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
+        memory_allocate_info.pNext = nullptr;
+        memory_allocate_info.allocationSize = memory_requirements.size;
+        memory_allocate_info.memoryTypeIndex = memory_type_index;
+        if (VK_SUCCESS != vkAllocateMemory(device_, &memory_allocate_info, nullptr, &memory_))
+        {
+            throw reflection_error(FFX_SSSR_STATUS_OUT_OF_MEMORY);
+        }
+
+        if (VK_SUCCESS != vkBindBufferMemory(device_, buffer_, memory_, 0))
+        {
+            throw reflection_error(FFX_SSSR_STATUS_INTERNAL_ERROR);
+        }
+
+        if (create_info.format_ == VK_FORMAT_UNDEFINED)
+        {
+            buffer_view_ = VK_NULL_HANDLE;
+            return; // Skip buffer view creation.
+        }
+
+        VkBufferViewCreateInfo buffer_view_create_info = { VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO };
+        buffer_view_create_info.pNext = nullptr;
+        buffer_view_create_info.flags = 0;
+        buffer_view_create_info.buffer = buffer_;
+        buffer_view_create_info.format = create_info.format_;
+        buffer_view_create_info.offset = 0;
+        buffer_view_create_info.range = VK_WHOLE_SIZE;
+        if (VK_SUCCESS != vkCreateBufferView(device_, &buffer_view_create_info, nullptr, &buffer_view_))
+        {
+            throw reflection_error(FFX_SSSR_STATUS_INTERNAL_ERROR);
+        }
+
+
+        FFX_SSSR_ASSERT(create_info.name_); // require all library objects to be named.
+ PFN_vkSetDebugUtilsObjectNameEXT vkSetDebugUtilsObjectName = (PFN_vkSetDebugUtilsObjectNameEXT)vkGetDeviceProcAddr(device, "vkSetDebugUtilsObjectNameEXT"); + if (vkSetDebugUtilsObjectName) + { + VkDebugUtilsObjectNameInfoEXT object_name_info = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT }; + object_name_info.pNext = nullptr; + object_name_info.objectType = VK_OBJECT_TYPE_BUFFER; + object_name_info.objectHandle = reinterpret_cast(buffer_); + object_name_info.pObjectName = create_info.name_; + + VkResult result = vkSetDebugUtilsObjectName(device, &object_name_info); + FFX_SSSR_ASSERT(result == VK_SUCCESS); + } + } + + /** + The constructor for the BufferVK class. + + \param other The buffer to be moved. + */ + BufferVK::BufferVK(BufferVK && other) noexcept + : buffer_(other.buffer_) + , memory_(other.memory_) + , device_(other.device_) + , buffer_view_(other.buffer_view_) + , mappable_(other.mappable_) + , mapped_(other.mapped_) + { + other.buffer_ = VK_NULL_HANDLE; + other.memory_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + other.buffer_view_ = VK_NULL_HANDLE; + other.mappable_ = false; + other.mapped_ = false; + } + + /** + Assigns the buffer. + + \param other The buffer to be moved. + \return The assigned buffer. + */ + BufferVK & BufferVK::operator=(BufferVK && other) noexcept + { + if (this != &other) + { + buffer_ = other.buffer_; + memory_ = other.memory_; + device_ = other.device_; + buffer_view_ = other.buffer_view_; + mappable_ = other.mappable_; + mapped_ = other.mapped_; + + other.buffer_ = VK_NULL_HANDLE; + other.memory_ = VK_NULL_HANDLE; + other.device_ = VK_NULL_HANDLE; + other.buffer_view_ = VK_NULL_HANDLE; + other.mappable_ = false; + other.mapped_ = false; + } + + return *this; + } + + void BufferVK::Map(void** data) + { + FFX_SSSR_ASSERT(mappable_); + FFX_SSSR_ASSERT(!mapped_); + + if (VK_SUCCESS != vkMapMemory(device_, memory_, 0, VK_WHOLE_SIZE, 0, data)) + { + throw reflection_error(FFX_SSSR_STATUS_INTERNAL_ERROR); + } + mapped_ = true; + } + + void BufferVK::Unmap() + { + FFX_SSSR_ASSERT(mappable_); + FFX_SSSR_ASSERT(mapped_); + + vkUnmapMemory(device_, memory_); + mapped_ = false; + } +} diff --git a/ffx-sssr/src/vk/buffer_vk.h b/ffx-sssr/src/vk/buffer_vk.h new file mode 100644 index 0000000..08bc580 --- /dev/null +++ b/ffx-sssr/src/vk/buffer_vk.h @@ -0,0 +1,68 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+********************************************************************/ +#pragma once + +#include + +#include "macros.h" +#include "ffx_sssr.h" + +namespace ffx_sssr +{ + /** + The BufferVK class is a helper class to create and destroy buffers on Vulkan. + */ + class BufferVK + { + FFX_SSSR_NON_COPYABLE(BufferVK); + + public: + + class CreateInfo + { + public: + VkDeviceSize size_in_bytes_; + VkMemoryPropertyFlags memory_property_flags; + VkBufferUsageFlags buffer_usage_; + VkFormat format_; + const char* name_; + }; + + BufferVK(); + ~BufferVK(); + + BufferVK(VkDevice device, VkPhysicalDevice physical_device, const CreateInfo& create_info); + + BufferVK(BufferVK&& other) noexcept; + BufferVK& BufferVK::operator =(BufferVK&& other) noexcept; + + void Map(void** data); + void Unmap(); + + VkDevice device_; + VkBuffer buffer_; + VkBufferView buffer_view_; + VkDeviceMemory memory_; // We're creating a low number of allocations for this library, so we just allocate a dedicated memory object per buffer. Normally you'd want to do sub-allocations of a larger allocation. + bool mappable_; + bool mapped_; + }; +} diff --git a/ffx-sssr/src/vk/context_vk.cpp b/ffx-sssr/src/vk/context_vk.cpp new file mode 100644 index 0000000..577dd94 --- /dev/null +++ b/ffx-sssr/src/vk/context_vk.cpp @@ -0,0 +1,726 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#include "context_vk.h" + +#include +#include + +#if FFX_SSSR_DUMP_SHADERS +#include +#endif // FFX_SSSR_DUMP_SHADERS + +#include "utils.h" +#include "context.h" +#include "reflection_view.h" +#include "ffx_sssr_vk.h" + +#include "shader_common.h" +#include "shader_classify_tiles.h" +#include "shader_intersect.h" +#include "shader_prepare_indirect_args.h" +#include "shader_resolve_eaw.h" +#include "shader_resolve_spatial.h" +#include "shader_resolve_temporal.h" + +namespace +{ + auto constexpr D3D12_VENDOR_ID_AMD = 0x1002u; + auto constexpr D3D12_VENDOR_ID_INTEL = 0x8086u; + auto constexpr D3D12_VENDOR_ID_NVIDIA = 0x10DEu; + + + namespace _1 + { + #include "samplerBlueNoiseErrorDistribution_128x128_OptimizedFor_2d2d2d2d_1spp.cpp" + } + + namespace _2 + { + #include "samplerBlueNoiseErrorDistribution_128x128_OptimizedFor_2d2d2d2d_2spp.cpp" + } + + /** + The available blue noise samplers for various sampling modes. 
+ */ + struct + { + std::int32_t const (&sobol_buffer_)[256 * 256]; + std::int32_t const (&ranking_tile_buffer_)[128 * 128 * 8]; + std::int32_t const (&scrambling_tile_buffer_)[128 * 128 * 8]; + } + const g_sampler_states[] = + { + { _1::sobol_256spp_256d, _1::rankingTile, _1::scramblingTile }, + { _2::sobol_256spp_256d, _2::rankingTile, _2::scramblingTile }, + }; +} + +namespace ffx_sssr +{ + /** + The constructor for the ContextVK class. + + \param context The execution context. + \param create_context_info The context creation information. + */ + ContextVK::ContextVK(Context& context, FfxSssrCreateContextInfo const& create_context_info) : + context_(context) + , device_(create_context_info.pVkCreateContextInfo->device) + , physical_device_(create_context_info.pVkCreateContextInfo->physicalDevice) + , upload_buffer_(*this, create_context_info.uploadBufferSize) + , shader_compiler_(context) + , samplers_were_populated_(false) + , is_subgroup_size_control_extension_available_(false) + , tile_classification_pass_() + , indirect_args_pass_() + , intersection_pass_() + , spatial_denoising_pass_() + , temporal_denoising_pass_() + , eaw_denoising_pass_() + , reflection_views_(create_context_info.maxReflectionViewCount) + { + if (!device_) + { + throw reflection_error(context, FFX_SSSR_STATUS_INVALID_VALUE, "No device was supplied."); + } + + // Query if the implementation supports VK_EXT_subgroup_size_control + // This is the case if VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME is present. + // Rely on the application to enable the extension if it's available. + uint32_t extension_count; + if (VK_SUCCESS != vkEnumerateDeviceExtensionProperties(physical_device_, nullptr, &extension_count, NULL)) + { + throw reflection_error(context_, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to enumerate device extension properties."); + } + std::vector device_extension_properties(extension_count); + if (VK_SUCCESS != vkEnumerateDeviceExtensionProperties(physical_device_, nullptr, &extension_count, device_extension_properties.data())) + { + throw reflection_error(context_, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to query device extension properties."); + } + + is_subgroup_size_control_extension_available_ = std::find_if(device_extension_properties.begin(), device_extension_properties.end(), + [](const VkExtensionProperties& extensionProps) -> bool { return strcmp(extensionProps.extensionName, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME) == 0; }) + != device_extension_properties.end(); + + upload_buffer_.Initialize(); + CompileShaders(create_context_info); + CreatePipelines(); + + // Create our blue noise samplers + BlueNoiseSamplerVK* blue_noise_samplers[] = { &blue_noise_sampler_1spp_, &blue_noise_sampler_2spp_ }; + static_assert(FFX_SSSR_ARRAY_SIZE(blue_noise_samplers) == FFX_SSSR_ARRAY_SIZE(g_sampler_states), "Sampler arrays don't match."); + for (auto i = 0u; i < FFX_SSSR_ARRAY_SIZE(g_sampler_states); ++i) + { + auto const& sampler_state = g_sampler_states[i]; + BlueNoiseSamplerVK* sampler = blue_noise_samplers[i]; + + BufferVK::CreateInfo create_info = {}; + create_info.memory_property_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + create_info.buffer_usage_ = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; + create_info.format_ = VK_FORMAT_R32_UINT; + + create_info.size_in_bytes_ = sizeof(sampler_state.sobol_buffer_); + create_info.name_ = "SSSR Sobol Buffer"; + sampler->sobol_buffer_ = BufferVK(device_, physical_device_, create_info); + + create_info.size_in_bytes_ = 
sizeof(sampler_state.ranking_tile_buffer_);
+            create_info.name_ = "SSSR Ranking Tile Buffer";
+            sampler->ranking_tile_buffer_ = BufferVK(device_, physical_device_, create_info);
+
+            create_info.size_in_bytes_ = sizeof(sampler_state.scrambling_tile_buffer_);
+            create_info.name_ = "SSSR Scrambling Tile Buffer";
+            sampler->scrambling_tile_buffer_ = BufferVK(device_, physical_device_, create_info);
+        }
+
+        VkCommandBuffer command_buffer = create_context_info.pVkCreateContextInfo->uploadCommandBuffer;
+        if (!samplers_were_populated_)
+        {
+            std::int32_t* upload_buffer;
+
+            // Upload the relevant data to the various samplers
+            for (auto i = 0u; i < FFX_SSSR_ARRAY_SIZE(g_sampler_states); ++i)
+            {
+                auto const& sampler_state = g_sampler_states[i];
+                BlueNoiseSamplerVK* sampler = blue_noise_samplers[i];
+
+                if (!upload_buffer_.AllocateBuffer(sizeof(sampler_state.sobol_buffer_), upload_buffer))
+                {
+                    throw reflection_error(context_, FFX_SSSR_STATUS_OUT_OF_MEMORY, "Failed to allocate %llukiB of upload memory, consider increasing uploadBufferSize", RoundedDivide(sizeof(sampler_state.sobol_buffer_), 1024ull));
+                }
+                memcpy(upload_buffer, sampler_state.sobol_buffer_, sizeof(sampler_state.sobol_buffer_));
+
+                VkBufferCopy region = {};
+                region.srcOffset = static_cast<VkDeviceSize>(upload_buffer_.GetOffset(upload_buffer));
+                region.dstOffset = 0;
+                region.size = sizeof(sampler_state.sobol_buffer_);
+                vkCmdCopyBuffer(command_buffer, upload_buffer_.GetResource(), sampler->sobol_buffer_.buffer_, 1, &region);
+
+                if (!upload_buffer_.AllocateBuffer(sizeof(sampler_state.ranking_tile_buffer_), upload_buffer))
+                {
+                    throw reflection_error(context_, FFX_SSSR_STATUS_OUT_OF_MEMORY, "Failed to allocate %llukiB of upload memory, consider increasing uploadBufferSize", RoundedDivide(sizeof(sampler_state.ranking_tile_buffer_), 1024ull));
+                }
+                memcpy(upload_buffer, sampler_state.ranking_tile_buffer_, sizeof(sampler_state.ranking_tile_buffer_));
+
+                region.srcOffset = static_cast<VkDeviceSize>(upload_buffer_.GetOffset(upload_buffer));
+                region.dstOffset = 0;
+                region.size = sizeof(sampler_state.ranking_tile_buffer_);
+                vkCmdCopyBuffer(command_buffer, upload_buffer_.GetResource(), sampler->ranking_tile_buffer_.buffer_, 1, &region);
+
+                if (!upload_buffer_.AllocateBuffer(sizeof(sampler_state.scrambling_tile_buffer_), upload_buffer))
+                {
+                    throw reflection_error(context_, FFX_SSSR_STATUS_OUT_OF_MEMORY, "Failed to allocate %llukiB of upload memory, consider increasing uploadBufferSize", RoundedDivide(sizeof(sampler_state.scrambling_tile_buffer_), 1024ull));
+                }
+                memcpy(upload_buffer, sampler_state.scrambling_tile_buffer_, sizeof(sampler_state.scrambling_tile_buffer_));
+
+                region.srcOffset = static_cast<VkDeviceSize>(upload_buffer_.GetOffset(upload_buffer));
+                region.dstOffset = 0;
+                region.size = sizeof(sampler_state.scrambling_tile_buffer_);
+                vkCmdCopyBuffer(command_buffer, upload_buffer_.GetResource(), sampler->scrambling_tile_buffer_.buffer_, 1, &region);
+            }
+
+            // Flag that the samplers are now ready to use
+            samplers_were_populated_ = true;
+        }
+    }
+
+    /**
+        The destructor for the ContextVK class.
+    */
+    ContextVK::~ContextVK()
+    {
+        if (uniform_buffer_descriptor_set_layout_)
+        {
+            vkDestroyDescriptorSetLayout(device_, uniform_buffer_descriptor_set_layout_, nullptr);
+        }
+    }
+
+    /**
+        Gets the number of GPU ticks spent in the tile classification pass.
+
+        \param reflection_view_id The identifier for the reflection view object.
+        \param elapsed_time The number of GPU ticks spent in the tile classification pass.
+ */ + void ContextVK::GetReflectionViewTileClassificationElapsedTime(std::uint64_t reflection_view_id, std::uint64_t& elapsed_time) const + { + FFX_SSSR_ASSERT(reflection_views_.At(ID(reflection_view_id))); // not created properly? + FFX_SSSR_ASSERT(context_.IsOfType(reflection_view_id) && context_.IsObjectValid(reflection_view_id)); + + auto const& reflection_view = reflection_views_[ID(reflection_view_id)]; + + if (!((reflection_view.flags_ & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS) != 0)) + { + throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_OPERATION, "Cannot query the tile classification elapsed time of a reflection view that was not created with the FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS flag"); + } + + elapsed_time = reflection_view.tile_classification_elapsed_time_; + } + + /** + Gets the number of GPU ticks spent intersecting the depth buffer. + + \param reflection_view_id The identifier for the reflection view object. + \param elapsed_time The number of GPU ticks spent intersecting the depth buffer. + */ + void ContextVK::GetReflectionViewIntersectionElapsedTime(std::uint64_t reflection_view_id, std::uint64_t& elapsed_time) const + { + FFX_SSSR_ASSERT(reflection_views_.At(ID(reflection_view_id))); // not created properly? + FFX_SSSR_ASSERT(context_.IsOfType(reflection_view_id) && context_.IsObjectValid(reflection_view_id)); + + auto const& reflection_view = reflection_views_[ID(reflection_view_id)]; + + if (!((reflection_view.flags_ & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS) != 0)) + { + throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_OPERATION, "Cannot query the intersection elapsed time of a reflection view that was not created with the FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS flag"); + } + + elapsed_time = reflection_view.intersection_elapsed_time_; + } + + /** + Gets the number of GPU ticks spent denoising the Vulkan reflection view. + + \param reflection_view_id The identifier for the reflection view object. + \param elapsed_time The number of GPU ticks spent denoising. + */ + void ContextVK::GetReflectionViewDenoisingElapsedTime(std::uint64_t reflection_view_id, std::uint64_t& elapsed_time) const + { + FFX_SSSR_ASSERT(reflection_views_.At(ID(reflection_view_id))); // not created properly? + FFX_SSSR_ASSERT(context_.IsOfType(reflection_view_id) && context_.IsObjectValid(reflection_view_id)); + + auto const& reflection_view = reflection_views_[ID(reflection_view_id)]; + + if (!((reflection_view.flags_ & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS) != 0)) + { + throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_OPERATION, "Cannot query the denoising elapsed time of a reflection view that was not created with the FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS flag"); + } + + elapsed_time = reflection_view.denoising_elapsed_time_; + } + + /** + Creates the Vulkan reflection view. + + \param reflection_view_id The identifier of the reflection view object. + \param create_reflection_view_info The reflection view creation information. 
+    */
+    void ContextVK::CreateReflectionView(std::uint64_t reflection_view_id, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info)
+    {
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo);
+        FFX_SSSR_ASSERT(context_.IsOfType(reflection_view_id) && context_.IsObjectValid(reflection_view_id));
+
+        // Check user arguments
+        if (!create_reflection_view_info.outputWidth || !create_reflection_view_info.outputHeight)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The outputWidth and outputHeight parameters are required when creating a reflection view");
+        if (!create_reflection_view_info.pVkCreateReflectionViewInfo->depthBufferHierarchySRV)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The depthBufferHierarchySRV parameter is required when creating a reflection view");
+        if (!create_reflection_view_info.pVkCreateReflectionViewInfo->motionBufferSRV)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The motionBufferSRV parameter is required when creating a reflection view");
+        if (!create_reflection_view_info.pVkCreateReflectionViewInfo->normalBufferSRV)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The normalBufferSRV parameter is required when creating a reflection view");
+        if (!create_reflection_view_info.pVkCreateReflectionViewInfo->roughnessBufferSRV)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The roughnessBufferSRV parameter is required when creating a reflection view");
+        if (!create_reflection_view_info.pVkCreateReflectionViewInfo->normalHistoryBufferSRV)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The normalHistoryBufferSRV parameter is required when creating a reflection view");
+        if (!create_reflection_view_info.pVkCreateReflectionViewInfo->roughnessHistoryBufferSRV)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The roughnessHistoryBufferSRV parameter is required when creating a reflection view");
+        if (!create_reflection_view_info.pVkCreateReflectionViewInfo->reflectionViewUAV)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The reflectionViewUAV parameter is required when creating a reflection view");
+        if (!create_reflection_view_info.pVkCreateReflectionViewInfo->environmentMapSampler)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The environmentMapSampler parameter is required when creating a reflection view");
+        if (create_reflection_view_info.pVkCreateReflectionViewInfo->sceneFormat == VK_FORMAT_UNDEFINED)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The sceneFormat parameter is required when creating a reflection view");
+        if (!create_reflection_view_info.pVkCreateReflectionViewInfo->uploadCommandBuffer)
+            throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_VALUE, "The uploadCommandBuffer parameter is required when creating a reflection view");
+
+        // Create the reflection view
+        auto& reflection_view = reflection_views_.Insert(ID(reflection_view_id));
+        reflection_view.Create(context_, create_reflection_view_info);
+    }
+
+    /**
+        Resolves the Vulkan reflection view.
+
+        \param reflection_view_id The identifier of the reflection view object.
+        \param resolve_reflection_view_info The reflection view resolve information.
+ */ + void ContextVK::ResolveReflectionView(std::uint64_t reflection_view_id, FfxSssrResolveReflectionViewInfo const& resolve_reflection_view_info) + { + FFX_SSSR_ASSERT(reflection_views_.At(ID(reflection_view_id))); // not created properly? + FFX_SSSR_ASSERT(context_.IsOfType(reflection_view_id) && context_.IsObjectValid(reflection_view_id)); + FFX_SSSR_ASSERT(context_.reflection_view_view_matrices_.At(ID(reflection_view_id))); + FFX_SSSR_ASSERT(context_.reflection_view_projection_matrices_.At(ID(reflection_view_id))); + + ReflectionView reflection_view; + reflection_view.view_matrix_ = context_.reflection_view_view_matrices_[ID(reflection_view_id)]; + reflection_view.projection_matrix_ = context_.reflection_view_projection_matrices_[ID(reflection_view_id)]; + + reflection_views_[ID(reflection_view_id)].Resolve(context_, reflection_view, resolve_reflection_view_info); + } + + + void ContextVK::CompileShaders(FfxSssrCreateContextInfo const& create_context_info) + { + struct + { + char const* shader_name_ = nullptr; + char const* content_ = nullptr; + char const* profile_ = nullptr; + } + const shader_source[] = + { + { "prepare_indirect_args", prepare_indirect_args, "cs_6_0"}, + { "classify_tiles", classify_tiles, "cs_6_0"}, + { "intersect", intersect, "cs_6_0"}, + { "resolve_spatial", resolve_spatial, "cs_6_0"}, + { "resolve_temporal", resolve_temporal, "cs_6_0"}, + { "resolve_eaw", resolve_eaw, "cs_6_0"}, + }; + + auto const common_include = std::string(common); + + DxcDefine defines[10]; + defines[0].Name = L"FFX_SSSR_ROUGHNESS_TEXTURE_FORMAT"; + defines[0].Value = create_context_info.pRoughnessTextureFormat; + defines[1].Name = L"FFX_SSSR_ROUGHNESS_UNPACK_FUNCTION"; + defines[1].Value = create_context_info.pUnpackRoughnessSnippet; + defines[2].Name = L"FFX_SSSR_NORMALS_TEXTURE_FORMAT"; + defines[2].Value = create_context_info.pNormalsTextureFormat; + defines[3].Name = L"FFX_SSSR_NORMALS_UNPACK_FUNCTION"; + defines[3].Value = create_context_info.pUnpackNormalsSnippet; + defines[4].Name = L"FFX_SSSR_MOTION_VECTOR_TEXTURE_FORMAT"; + defines[4].Value = create_context_info.pMotionVectorFormat; + defines[5].Name = L"FFX_SSSR_MOTION_VECTOR_UNPACK_FUNCTION"; + defines[5].Value = create_context_info.pUnpackMotionVectorsSnippet; + defines[6].Name = L"FFX_SSSR_DEPTH_TEXTURE_FORMAT"; + defines[6].Value = create_context_info.pDepthTextureFormat; + defines[7].Name = L"FFX_SSSR_DEPTH_UNPACK_FUNCTION"; + defines[7].Value = create_context_info.pUnpackDepthSnippet; + defines[8].Name = L"FFX_SSSR_SCENE_TEXTURE_FORMAT"; + defines[8].Value = create_context_info.pSceneTextureFormat; + defines[9].Name = L"FFX_SSSR_SCENE_RADIANCE_UNPACK_FUNCTION"; + defines[9].Value = create_context_info.pUnpackSceneRadianceSnippet; + + static_assert(FFX_SSSR_ARRAY_SIZE(shader_source) == kShader_Count, "'kShader_Count' filenames must be provided for building the various shaders"); + std::stringstream shader_content; + LPCWSTR dxc_arguments[] = { L"-spirv", L"-fspv-target-env=vulkan1.1" }; + for (auto i = 0u; i < kShader_Count; ++i) + { + // Append common includes + shader_content.str(std::string()); + shader_content.clear(); + shader_content << common << std::endl << shader_source[i].content_; + + shaders_[i] = shader_compiler_.CompileShaderString( + shader_content.str().c_str(), + static_cast(shader_content.str().size()), + shader_source[i].shader_name_, + shader_source[i].profile_, + dxc_arguments, FFX_SSSR_ARRAY_SIZE(dxc_arguments), + defines, FFX_SSSR_ARRAY_SIZE(defines)); + } + } + + /** + Creates the reflection view 
pipeline state. + + \param context The Vulkan context to be used. + */ + void ContextVK::CreatePipelines() + { + VkDescriptorSetLayoutBinding layout_binding = {}; + layout_binding.binding = 0; + layout_binding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + layout_binding.descriptorCount = 1; + layout_binding.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + layout_binding.pImmutableSamplers = nullptr; + + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; + descriptor_set_layout_create_info.pNext = nullptr; + descriptor_set_layout_create_info.flags = 0; + descriptor_set_layout_create_info.bindingCount = 1; + descriptor_set_layout_create_info.pBindings = &layout_binding; + if (VK_SUCCESS != vkCreateDescriptorSetLayout(device_, &descriptor_set_layout_create_info, nullptr, &uniform_buffer_descriptor_set_layout_)) + { + throw reflection_error(GetContext(), FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create descriptor set layout for uniform buffer"); + } + + auto Setup = [this](ShaderPass& pass, ContextVK::Shader shader, const VkDescriptorSetLayoutBinding* bindings, uint32_t bindings_count, VkPipelineShaderStageCreateFlags flags = 0) { + + pass.device_ = device_; + pass.bindings_count_ = bindings_count; + + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; + descriptor_set_layout_create_info.pNext = nullptr; + descriptor_set_layout_create_info.flags = 0; + descriptor_set_layout_create_info.bindingCount = bindings_count; + descriptor_set_layout_create_info.pBindings = bindings; + if (VK_SUCCESS != vkCreateDescriptorSetLayout(device_, &descriptor_set_layout_create_info, nullptr, &pass.descriptor_set_layout_)) + { + throw reflection_error(GetContext(), FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create descriptor set layout"); + } + + VkDescriptorSetLayout layouts[2]; + layouts[0] = uniform_buffer_descriptor_set_layout_; + layouts[1] = pass.descriptor_set_layout_; + + VkPipelineLayoutCreateInfo layout_create_info = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; + layout_create_info.pNext = nullptr; + layout_create_info.flags = 0; + layout_create_info.setLayoutCount = FFX_SSSR_ARRAY_SIZE(layouts); + layout_create_info.pSetLayouts = layouts; + layout_create_info.pushConstantRangeCount = 0; + layout_create_info.pPushConstantRanges = nullptr; + if (VK_SUCCESS != vkCreatePipelineLayout(device_, &layout_create_info, nullptr, &pass.pipeline_layout_)) + { + throw reflection_error(GetContext(), FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create pipeline layout"); + } + + const ShaderVK& shader_vk = GetShader(shader); + + VkShaderModuleCreateInfo shader_create_info = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO }; + shader_create_info.pNext = nullptr; + shader_create_info.flags = 0; + shader_create_info.codeSize = shader_vk.BytecodeLength; + shader_create_info.pCode = static_cast(shader_vk.pShaderBytecode); + + VkShaderModule shader_module = VK_NULL_HANDLE; + if (VK_SUCCESS != vkCreateShaderModule(device_, &shader_create_info, nullptr, &shader_module)) + { + throw reflection_error(GetContext(), FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create shader module"); + } + + VkPipelineShaderStageCreateInfo stage_create_info = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; + stage_create_info.pNext = nullptr; + stage_create_info.flags = flags; + stage_create_info.stage = VK_SHADER_STAGE_COMPUTE_BIT; + stage_create_info.module = shader_module; + 
stage_create_info.pName = "main"; + stage_create_info.pSpecializationInfo = nullptr; + + VkComputePipelineCreateInfo create_info = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; + create_info.pNext = nullptr; + create_info.basePipelineHandle = VK_NULL_HANDLE; + create_info.basePipelineIndex = 0; + create_info.flags = 0; +#if FFX_SSSR_DUMP_SHADERS + create_info.flags |= VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; +#endif // FFX_SSSR_DUMP_SHADERS + create_info.layout = pass.pipeline_layout_; + create_info.stage = stage_create_info; + if (VK_SUCCESS != vkCreateComputePipelines(device_, VK_NULL_HANDLE, 1, &create_info, nullptr, &pass.pipeline_)) + { + throw reflection_error(GetContext(), FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create compute pipeline state"); + } + + vkDestroyShaderModule(device_, shader_module, nullptr); + }; + + auto Bind = [](uint32_t binding, VkDescriptorType type) + { + VkDescriptorSetLayoutBinding layout_binding = {}; + layout_binding.binding = binding; + layout_binding.descriptorType = type; + layout_binding.descriptorCount = 1; + layout_binding.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + layout_binding.pImmutableSamplers = nullptr; + return layout_binding; + }; + + // Assemble the shader pass for tile classification + { + uint32_t binding = 0; + VkDescriptorSetLayoutBinding layout_bindings[] = { + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_roughness + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER), // g_tile_list + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER), // g_ray_list + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER), // g_tile_counter + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER), // g_ray_counter + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_temporally_denoised_reflections + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_temporally_denoised_reflections_history + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_ray_lengths + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_temporal_variance + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_denoised_reflections + }; + Setup(tile_classification_pass_, ContextVK::kShader_TileClassification, layout_bindings, FFX_SSSR_ARRAY_SIZE(layout_bindings)); + } + + // Assemble the shader pass that prepares the indirect arguments + { + uint32_t binding = 0; + VkDescriptorSetLayoutBinding layout_bindings[] = { + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER), // g_tile_counter + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER), // g_ray_counter + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER), // g_intersect_args + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER), // g_denoiser_args + }; + Setup(indirect_args_pass_, ContextVK::kShader_IndirectArguments, layout_bindings, FFX_SSSR_ARRAY_SIZE(layout_bindings)); + } + + // Assemble the shader pass for intersecting reflection rays with the depth buffer + { + uint32_t binding = 0; + VkDescriptorSetLayoutBinding layout_bindings[] = { + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_lit_scene + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_depth_buffer_hierarchy + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_normal + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_roughness + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_environment_map + Bind(binding++, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER), // g_sobol_buffer + Bind(binding++, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER), // 
g_ranking_tile_buffer + Bind(binding++, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER), // g_scrambling_tile_buffer + Bind(binding++, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER), // g_ray_list + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLER), // g_linear_sampler + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLER), // g_environment_map_sampler + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_intersection_result + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_ray_lengths + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_denoised_reflections + }; + Setup(intersection_pass_, ContextVK::kShader_Intersection, layout_bindings, FFX_SSSR_ARRAY_SIZE(layout_bindings)); + } + + // Assemble the shader pass for spatial resolve + { + uint32_t binding = 0; + VkDescriptorSetLayoutBinding layout_bindings[] = { + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_depth_buffer + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_normal + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_roughness + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_intersection_result + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_has_ray + Bind(binding++, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER), // g_tile_list + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_spatially_denoised_reflections + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_ray_lengths + }; + Setup(spatial_denoising_pass_, ContextVK::kShader_SpatialResolve, layout_bindings, FFX_SSSR_ARRAY_SIZE(layout_bindings), + is_subgroup_size_control_extension_available_ ? VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT : 0); + } + + // Assemble the shader pass for temporal resolve + { + uint32_t binding = 0; + VkDescriptorSetLayoutBinding layout_bindings[] = { + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_normal + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_roughness + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_normal_history + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_roughness_history + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_depth_buffer + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_motion_vectors + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_temporally_denoised_reflections_history + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_ray_lengths + Bind(binding++, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER), // g_tile_list + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_temporally_denoised_reflections + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_spatially_denoised_reflections + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_temporal_variance + }; + Setup(temporal_denoising_pass_, ContextVK::kShader_TemporalResolve, layout_bindings, FFX_SSSR_ARRAY_SIZE(layout_bindings)); + } + + // Assemble the shader pass for EAW resolve + { + uint32_t binding = 0; + VkDescriptorSetLayoutBinding layout_bindings[] = { + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_normal + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_roughness + Bind(binding++, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE), // g_depth_buffer + Bind(binding++, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER), // g_tile_list + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_temporally_denoised_reflections + Bind(binding++, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), // g_denoised_reflections + }; + Setup(eaw_denoising_pass_, ContextVK::kShader_EAWResolve, layout_bindings, FFX_SSSR_ARRAY_SIZE(layout_bindings)); + } + +#if 
FFX_SSSR_DUMP_SHADERS + tile_classification_pass_.DumpInternalRepresentations("classify_tiles.dump.spirv.amdil.isa"); + indirect_args_pass_.DumpInternalRepresentations("prepare_indirect_args.dump.spirv.amdil.isa"); + intersection_pass_.DumpInternalRepresentations("intersect.dump.spirv.amdil.isa"); + spatial_denoising_pass_.DumpInternalRepresentations("resolve_spatial.dump.spirv.amdil.isa"); + temporal_denoising_pass_.DumpInternalRepresentations("resolve_temporal.dump.spirv.amdil.isa"); + eaw_denoising_pass_.DumpInternalRepresentations("resolve_eaw.dump.spirv.amdil.isa"); +#endif // FFX_SSSR_DUMP_SHADERS + } + + const ContextVK::ShaderPass& ContextVK::GetTileClassificationPass() const + { + return tile_classification_pass_; + } + + const ContextVK::ShaderPass& ContextVK::GetIndirectArgsPass() const + { + return indirect_args_pass_; + } + + const ContextVK::ShaderPass& ContextVK::GetIntersectionPass() const + { + return intersection_pass_; + } + + const ContextVK::ShaderPass& ContextVK::GetSpatialDenoisingPass() const + { + return spatial_denoising_pass_; + } + + const ContextVK::ShaderPass& ContextVK::GetTemporalDenoisingPass() const + { + return temporal_denoising_pass_; + } + + const ContextVK::ShaderPass& ContextVK::GetEawDenoisingPass() const + { + return eaw_denoising_pass_; + } + + VkDescriptorSetLayout ContextVK::GetUniformBufferDescriptorSetLayout() const + { + return uniform_buffer_descriptor_set_layout_; + } + + void ffx_sssr::ContextVK::ShaderPass::DumpInternalRepresentations(const char* path) + { +#if FFX_SSSR_DUMP_SHADERS + VkResult res = VK_SUCCESS; + + std::ofstream filestream(path); + + PFN_vkGetPipelineExecutablePropertiesKHR vkGetPipelineExecutablePropertiesKHR = (PFN_vkGetPipelineExecutablePropertiesKHR)vkGetDeviceProcAddr(device_, "vkGetPipelineExecutablePropertiesKHR"); + PFN_vkGetPipelineExecutableInternalRepresentationsKHR vkGetPipelineExecutableInternalRepresentationsKHR = (PFN_vkGetPipelineExecutableInternalRepresentationsKHR)vkGetDeviceProcAddr(device_, "vkGetPipelineExecutableInternalRepresentationsKHR"); + if (!vkGetPipelineExecutablePropertiesKHR || !vkGetPipelineExecutableInternalRepresentationsKHR) + { + FFX_SSSR_ASSERT(false); // Could not retrieve pipeline executable function pointers - is VK_KHR_pipeline_executable_properties enabled? 
+                return;
+            }
+
+            VkPipelineInfoKHR pipeline_info = {
+                VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR, NULL, pipeline_,
+            };
+
+            uint32_t executables_count = 0;
+            res = vkGetPipelineExecutablePropertiesKHR(device_, &pipeline_info, &executables_count, NULL);
+            FFX_SSSR_ASSERT(res == VK_SUCCESS);
+            std::vector<VkPipelineExecutablePropertiesKHR> executables(executables_count);
+            for (uint32_t i = 0; i < executables_count; ++i)
+            {
+                executables[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_PROPERTIES_KHR;
+            }
+            res = vkGetPipelineExecutablePropertiesKHR(device_, &pipeline_info, &executables_count, executables.data());
+            FFX_SSSR_ASSERT(res == VK_SUCCESS);
+            for (uint32_t j = 0; j < executables_count; j++)
+            {
+                const VkPipelineExecutablePropertiesKHR& exec = executables[j];
+
+                VkPipelineExecutableInfoKHR pipeline_exec_info = { VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR };
+                pipeline_exec_info.pNext = nullptr;
+                pipeline_exec_info.pipeline = pipeline_;
+                pipeline_exec_info.executableIndex = j;
+
+                // Internal representations
+                uint32_t internal_representation_count = 0;
+                res = vkGetPipelineExecutableInternalRepresentationsKHR(device_, &pipeline_exec_info, &internal_representation_count, NULL);
+                FFX_SSSR_ASSERT(res == VK_SUCCESS);
+                std::vector<VkPipelineExecutableInternalRepresentationKHR> internal_representations(internal_representation_count);
+                for (uint32_t i = 0; i < internal_representation_count; i++)
+                {
+                    internal_representations[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR;
+                }
+                res = vkGetPipelineExecutableInternalRepresentationsKHR(device_, &pipeline_exec_info, &internal_representation_count, internal_representations.data());
+                FFX_SSSR_ASSERT(res == VK_SUCCESS);
+
+                // For each VkPipelineExecutableInternalRepresentationKHR we now know the data size --> allocate space for pData and call vkGetPipelineExecutableInternalRepresentationsKHR again.
+                std::vector<std::unique_ptr<char[]>> data_pointers(internal_representation_count);
+                for (uint32_t i = 0; i < internal_representation_count; i++)
+                {
+                    data_pointers[i] = std::make_unique<char[]>(internal_representations[i].dataSize);
+                    internal_representations[i].pData = data_pointers[i].get();
+                }
+                res = vkGetPipelineExecutableInternalRepresentationsKHR(device_, &pipeline_exec_info, &internal_representation_count, internal_representations.data());
+                FFX_SSSR_ASSERT(res == VK_SUCCESS);
+
+                for (uint32_t i = 0; i < internal_representation_count; i++)
+                {
+                    filestream.write(data_pointers[i].get(), internal_representations[i].dataSize);
+                }
+            }
+
+            filestream.close();
+#endif // FFX_SSSR_DUMP_SHADERS
+    }
+
+}
diff --git a/ffx-sssr/src/vk/context_vk.h b/ffx-sssr/src/vk/context_vk.h
new file mode 100644
index 0000000..804d135
--- /dev/null
+++ b/ffx-sssr/src/vk/context_vk.h
@@ -0,0 +1,170 @@
+/**********************************************************************
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+********************************************************************/
+#pragma once
+
+#include <array>
+
+#include <vulkan/vulkan.h>
+
+#define FFX_SSSR_DUMP_SHADERS 0
+
+#include "sampler_vk.h"
+#include "reflection_view_vk.h"
+#include "upload_buffer_vk.h"
+#include "shader_compiler_vk.h"
+
+namespace ffx_sssr
+{
+    class Context;
+    class ReflectionViewVK;
+
+    /**
+        The ContextVK class encapsulates the data for a single Vulkan stochastic screen space reflections execution context.
+    */
+    class ContextVK
+    {
+        FFX_SSSR_NON_COPYABLE(ContextVK);
+
+    public:
+        /**
+            The available shaders.
+        */
+        enum Shader
+        {
+            kShader_IndirectArguments,
+            kShader_TileClassification,
+            kShader_Intersection,
+            kShader_SpatialResolve,
+            kShader_TemporalResolve,
+            kShader_EAWResolve,
+
+            kShader_Count
+        };
+
+        ContextVK(Context& context, FfxSssrCreateContextInfo const& create_context_info);
+        ~ContextVK();
+
+        inline Context& GetContext();
+        inline Context const& GetContext() const;
+
+        inline VkDevice GetDevice() const;
+        inline VkPhysicalDevice GetPhysicalDevice() const;
+        inline UploadBufferVK& GetUploadBuffer();
+
+        inline ShaderVK const& GetShader(Shader shader) const;
+        inline BlueNoiseSamplerVK const& GetSampler1SPP() const;
+        inline BlueNoiseSamplerVK const& GetSampler2SPP() const;
+
+        void GetReflectionViewTileClassificationElapsedTime(std::uint64_t reflection_view_id, std::uint64_t& elapsed_time) const;
+        void GetReflectionViewIntersectionElapsedTime(std::uint64_t reflection_view_id, std::uint64_t& elapsed_time) const;
+        void GetReflectionViewDenoisingElapsedTime(std::uint64_t reflection_view_id, std::uint64_t& elapsed_time) const;
+
+        void CreateReflectionView(std::uint64_t reflection_view_id, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info);
+        void ResolveReflectionView(std::uint64_t reflection_view_id, FfxSssrResolveReflectionViewInfo const& resolve_reflection_view_info);
+
+    protected:
+        friend class Context;
+        friend class ReflectionViewVK;
+
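+        // Every pass uses the same two-set binding model (see CreatePipelines):
+        //   set 0 - the per-frame uniform buffer, shared by all passes,
+        //   set 1 - the pass-specific resources (sampled images, storage images, texel buffers, samplers).
+        // A dispatch therefore binds two descriptor sets; roughly (sketch only, not library code):
+        //     VkDescriptorSet sets[] = { uniform_buffer_set, pass_resource_set };
+        //     vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
+        //                             pass.pipeline_layout_, 0, 2, sets, 0, nullptr);
+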
+        /**
+            The ShaderPass class holds the data for an individual shader pass.
+        */
+        class ShaderPass
+        {
+            FFX_SSSR_NON_COPYABLE(ShaderPass);
+
+        public:
+            inline ShaderPass();
+            inline ~ShaderPass();
+
+            inline operator bool() const;
+
+            inline ShaderPass(ShaderPass&& other) noexcept;
+            inline ShaderPass& operator =(ShaderPass&& other) noexcept;
+
+            void DumpInternalRepresentations(const char* path);
+
+            // The device that created the pass.
+            VkDevice device_;
+            // The pipeline state object.
+            VkPipeline pipeline_;
+            // The pipeline layout.
+            VkPipelineLayout pipeline_layout_;
+            // The descriptor set layout.
+            VkDescriptorSetLayout descriptor_set_layout_;
+            // The number of resource bindings of this pass.
+            uint32_t bindings_count_;
+
+        };
+
+        void CompileShaders(FfxSssrCreateContextInfo const& create_context_info);
+        void CreatePipelines();
+
+        const ShaderPass& GetTileClassificationPass() const;
+        const ShaderPass& GetIndirectArgsPass() const;
+        const ShaderPass& GetIntersectionPass() const;
+        const ShaderPass& GetSpatialDenoisingPass() const;
+        const ShaderPass& GetTemporalDenoisingPass() const;
+        const ShaderPass& GetEawDenoisingPass() const;
+        VkDescriptorSetLayout GetUniformBufferDescriptorSetLayout() const;
+
+        // The execution context.
+        Context& context_;
+        // The device to be used.
+        VkDevice device_;
+        // The physical device to be used.
+        VkPhysicalDevice physical_device_;
+        // If the VK_EXT_subgroup_size_control extension is available.
+        bool is_subgroup_size_control_extension_available_;
+        // The compiled reflections shaders.
+        std::array<ShaderVK, kShader_Count> shaders_;
+        // The compiler to be used for building the Vulkan shaders.
+        ShaderCompilerVK shader_compiler_;
+        // The Blue Noise sampler optimized for 1 sample per pixel.
+        BlueNoiseSamplerVK blue_noise_sampler_1spp_;
+        // The Blue Noise sampler optimized for 2 samples per pixel.
+        BlueNoiseSamplerVK blue_noise_sampler_2spp_;
+        // The flag for whether the samplers were populated.
+        bool samplers_were_populated_;
+        // The buffer to be used for uploading memory from the CPU to the GPU.
+        UploadBufferVK upload_buffer_;
+        // The array of reflection views to be resolved.
+        SparseArray<ReflectionViewVK> reflection_views_;
+
+        // Same descriptor set layout for all passes.
+        VkDescriptorSetLayout uniform_buffer_descriptor_set_layout_;
+        // The shader pass that classifies tiles.
+        ShaderPass tile_classification_pass_;
+        // The shader pass that prepares the indirect arguments.
+        ShaderPass indirect_args_pass_;
+        // The shader pass intersecting reflection rays with the depth buffer.
+        ShaderPass intersection_pass_;
+        // The shader pass that does spatial denoising.
+        ShaderPass spatial_denoising_pass_;
+        // The shader pass that does temporal denoising.
+        ShaderPass temporal_denoising_pass_;
+        // The shader pass that does the second spatial denoising.
+        ShaderPass eaw_denoising_pass_;
+    };
+}
+
+#include "context_vk.inl"
diff --git a/ffx-sssr/src/vk/context_vk.inl b/ffx-sssr/src/vk/context_vk.inl
new file mode 100644
index 0000000..fb2a83a
--- /dev/null
+++ b/ffx-sssr/src/vk/context_vk.inl
@@ -0,0 +1,206 @@
+#include "context_vk.h"
+/**********************************************************************
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#pragma once + +namespace ffx_sssr +{ + /** + Gets the context. + + \return The context. + */ + Context& ContextVK::GetContext() + { + return context_; + } + + /** + Gets the Vulkan device. + + \return The Vulkan device. + */ + VkDevice ContextVK::GetDevice() const + { + return device_; + } + + + /** + Gets the Vulkan physical device. + + \return The Vulkan physical device. + */ + inline VkPhysicalDevice ContextVK::GetPhysicalDevice() const + { + return physical_device_; + } + + /** + Gets the context. + + \return The context. + */ + Context const& ContextVK::GetContext() const + { + return context_; + } + + /** + Gets hold of the upload buffer. + + \return The upload buffer. + */ + UploadBufferVK& ContextVK::GetUploadBuffer() + { + return upload_buffer_; + } + + /** + Gets the shader. + + \param shader The shader to be retrieved. + \param switches The set of switches to be used. + \return The requested shader. + */ + ShaderVK const& ContextVK::GetShader(Shader shader) const + { + FFX_SSSR_ASSERT(shader < kShader_Count); + return shaders_[shader]; + } + + /** + Gets a blue noise sampler with 1 sample per pixel. + + \return The requested sampler. + */ + inline BlueNoiseSamplerVK const & ContextVK::GetSampler1SPP() const + { + return blue_noise_sampler_1spp_; + } + + /** + Gets a blue noise sampler with 2 samples per pixel. + + \return The requested sampler. + */ + inline BlueNoiseSamplerVK const & ContextVK::GetSampler2SPP() const + { + return blue_noise_sampler_2spp_; + } + + /** + The constructor for the ShaderPass class. + */ + ContextVK::ShaderPass::ShaderPass() + : device_(VK_NULL_HANDLE) + , pipeline_(VK_NULL_HANDLE) + , pipeline_layout_(VK_NULL_HANDLE) + , descriptor_set_layout_(VK_NULL_HANDLE) + , bindings_count_(0) + { + } + + /** + The constructor for the ShaderPass class. + + \param other The shader pass to be moved. + */ + ContextVK::ShaderPass::ShaderPass(ShaderPass&& other) noexcept + : device_(other.device_) + , pipeline_(other.pipeline_) + , pipeline_layout_(other.pipeline_layout_) + , descriptor_set_layout_(other.descriptor_set_layout_) + , bindings_count_(other.bindings_count_) + { + other.device_ = VK_NULL_HANDLE; + other.pipeline_ = VK_NULL_HANDLE; + other.pipeline_layout_ = VK_NULL_HANDLE; + other.descriptor_set_layout_ = VK_NULL_HANDLE; + other.bindings_count_ = 0; + } + + /** + The destructor for the ShaderPass class. + */ + ContextVK::ShaderPass::~ShaderPass() + { + FFX_SSSR_ASSERT(device_); + + if (pipeline_) + { + vkDestroyPipeline(device_, pipeline_, nullptr); + } + + if (pipeline_layout_) + { + vkDestroyPipelineLayout(device_, pipeline_layout_, nullptr); + } + + if (descriptor_set_layout_) + { + vkDestroyDescriptorSetLayout(device_, descriptor_set_layout_, nullptr); + } + + device_ = VK_NULL_HANDLE; + pipeline_ = VK_NULL_HANDLE; + pipeline_layout_ = VK_NULL_HANDLE; + descriptor_set_layout_ = VK_NULL_HANDLE; + bindings_count_ = 0; + } + + /** + Assigns the shader pass. + + \param other The shader pass to be moved. + \return The assigned shader pass. 
+ */ + ContextVK::ShaderPass& ContextVK::ShaderPass::operator =(ShaderPass&& other) noexcept + { + if (this != &other) + { + device_ = other.device_; + pipeline_ = other.pipeline_; + pipeline_layout_ = other.pipeline_layout_; + descriptor_set_layout_ = other.descriptor_set_layout_; + bindings_count_ = other.bindings_count_; + + other.device_ = VK_NULL_HANDLE; + other.pipeline_ = VK_NULL_HANDLE; + other.pipeline_layout_ = VK_NULL_HANDLE; + other.descriptor_set_layout_ = VK_NULL_HANDLE; + other.bindings_count_ = 0; + } + + return *this; + } + + /** + Checks whether the shader pass is valid. + + \return true if the shader pass is valid, false otherwise. + */ + ContextVK::ShaderPass::operator bool() const + { + return (device_ && pipeline_ && pipeline_layout_ && descriptor_set_layout_); + } +} diff --git a/ffx-sssr/src/vk/image_vk.cpp b/ffx-sssr/src/vk/image_vk.cpp new file mode 100644 index 0000000..3401495 --- /dev/null +++ b/ffx-sssr/src/vk/image_vk.cpp @@ -0,0 +1,208 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#include "image_vk.h" +#include "memory.h" + +namespace ffx_sssr +{ + /** + The constructor for the ImageVK class. + */ + ImageVK::ImageVK() + : image_(VK_NULL_HANDLE) + , device_(VK_NULL_HANDLE) + , memory_(VK_NULL_HANDLE) + , image_view_(VK_NULL_HANDLE) + { + } + + /** + The destructor for the ImageVK class. + */ + ImageVK::~ImageVK() + { + if (image_) + { + vkDestroyImage(device_, image_, nullptr); + image_ = VK_NULL_HANDLE; + } + + if (memory_) + { + vkFreeMemory(device_, memory_, nullptr); + memory_ = VK_NULL_HANDLE; + } + + if (image_view_) + { + vkDestroyImageView(device_, image_view_, nullptr); + image_view_ = VK_NULL_HANDLE; + } + + device_ = VK_NULL_HANDLE; + } + + /** + The constructor for the ImageVK class. + + \param device The VkDevice that creates the image view. + \param physical_device The VkPhysicalDevice to determine the right memory heap. + \param create_info The create info. 
+ */ + ImageVK::ImageVK(VkDevice device, VkPhysicalDevice physical_device, const CreateInfo & create_info) + : device_(device) + { + VkImageCreateInfo image_create_info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO }; + image_create_info.pNext = nullptr; + image_create_info.flags = 0; + image_create_info.imageType = VK_IMAGE_TYPE_2D; + image_create_info.format = create_info.format_; + image_create_info.extent = { create_info.width_, create_info.height_, 1 }; + image_create_info.mipLevels = create_info.mip_levels_; + image_create_info.arrayLayers = 1; + image_create_info.samples = VK_SAMPLE_COUNT_1_BIT; + image_create_info.tiling = VK_IMAGE_TILING_OPTIMAL; + image_create_info.usage = create_info.image_usage_; + image_create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + image_create_info.queueFamilyIndexCount = 0; + image_create_info.pQueueFamilyIndices = nullptr; + image_create_info.initialLayout = create_info.initial_layout_; + if (VK_SUCCESS != vkCreateImage(device, &image_create_info, nullptr, &image_)) + { + throw reflection_error(FFX_SSSR_STATUS_INTERNAL_ERROR); + } + + VkMemoryRequirements memory_requirements = {}; + vkGetImageMemoryRequirements(device, image_, &memory_requirements); + + VkPhysicalDeviceMemoryProperties memory_properties = {}; + vkGetPhysicalDeviceMemoryProperties(physical_device, &memory_properties); + + // find the right memory type for this image + int memory_type_index = -1; + for (uint32_t i = 0; i < memory_properties.memoryTypeCount; ++i) + { + const VkMemoryType& memory_type = memory_properties.memoryTypes[i]; + bool has_required_properties = memory_type.propertyFlags & create_info.memory_property_flags; + bool is_required_memory_type = memory_requirements.memoryTypeBits & (1 << i); + if (has_required_properties && is_required_memory_type) + { + memory_type_index = i; + break; + } + } + + // abort if we couldn't find the right memory type + if (memory_type_index == -1) + { + throw reflection_error(FFX_SSSR_STATUS_INTERNAL_ERROR); + } + + VkMemoryAllocateInfo memory_allocate_info = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO }; + memory_allocate_info.pNext = nullptr; + memory_allocate_info.allocationSize = memory_requirements.size; + memory_allocate_info.memoryTypeIndex = memory_type_index; + if (VK_SUCCESS != vkAllocateMemory(device, &memory_allocate_info, nullptr, &memory_)) + { + throw reflection_error(FFX_SSSR_STATUS_OUT_OF_MEMORY); + } + + if (VK_SUCCESS != vkBindImageMemory(device_, image_, memory_, 0)) + { + throw reflection_error(FFX_SSSR_STATUS_INTERNAL_ERROR); + } + + VkImageSubresourceRange subresource_range = {}; + subresource_range.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + subresource_range.baseMipLevel = 0; + subresource_range.levelCount = create_info.mip_levels_; + subresource_range.baseArrayLayer = 0; + subresource_range.layerCount = 1; + + VkImageViewCreateInfo image_view_create_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO }; + image_view_create_info.pNext = VK_NULL_HANDLE; + image_view_create_info.flags = 0; + image_view_create_info.image = image_; + image_view_create_info.viewType = VK_IMAGE_VIEW_TYPE_2D; + image_view_create_info.format = create_info.format_; + image_view_create_info.subresourceRange = subresource_range; + if (VK_SUCCESS != vkCreateImageView(device_, &image_view_create_info, nullptr, &image_view_)) + { + throw reflection_error(FFX_SSSR_STATUS_INTERNAL_ERROR); + } + + FFX_SSSR_ASSERT(create_info.name_); // require all library objects to be named. 
+        PFN_vkSetDebugUtilsObjectNameEXT vkSetDebugUtilsObjectName = (PFN_vkSetDebugUtilsObjectNameEXT)vkGetDeviceProcAddr(device, "vkSetDebugUtilsObjectNameEXT");
+        if (vkSetDebugUtilsObjectName)
+        {
+            VkDebugUtilsObjectNameInfoEXT object_name_info = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT };
+            object_name_info.pNext = nullptr;
+            object_name_info.objectType = VK_OBJECT_TYPE_IMAGE;
+            object_name_info.objectHandle = reinterpret_cast<uint64_t>(image_);
+            object_name_info.pObjectName = create_info.name_;
+
+            VkResult result = vkSetDebugUtilsObjectName(device, &object_name_info);
+            FFX_SSSR_ASSERT(result == VK_SUCCESS);
+        }
+    }
+
+    /**
+        The constructor for the ImageVK class.
+
+        \param other The image to be moved.
+    */
+    ImageVK::ImageVK(ImageVK && other) noexcept
+        : image_(other.image_)
+        , device_(other.device_)
+        , image_view_(other.image_view_)
+        , memory_(other.memory_)
+    {
+        other.image_ = VK_NULL_HANDLE;
+        other.device_ = VK_NULL_HANDLE;
+        other.image_view_ = VK_NULL_HANDLE;
+        other.memory_ = VK_NULL_HANDLE;
+    }
+
+    /**
+        Assigns the image.
+
+        \param other The image to be moved.
+        \return The assigned image.
+    */
+    ImageVK & ImageVK::operator=(ImageVK && other) noexcept
+    {
+        if (this != &other)
+        {
+            image_ = other.image_;
+            device_ = other.device_;
+            image_view_ = other.image_view_;
+            memory_ = other.memory_;
+
+            other.image_ = VK_NULL_HANDLE;
+            other.device_ = VK_NULL_HANDLE;
+            other.image_view_ = VK_NULL_HANDLE;
+            other.memory_ = VK_NULL_HANDLE;
+        }
+
+        return *this;
+    }
+}
diff --git a/ffx-sssr/src/vk/image_vk.h b/ffx-sssr/src/vk/image_vk.h
new file mode 100644
index 0000000..0cf0d34
--- /dev/null
+++ b/ffx-sssr/src/vk/image_vk.h
@@ -0,0 +1,66 @@
+/**********************************************************************
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+********************************************************************/
+#pragma once
+
+#include <vulkan/vulkan.h>
+
+#include "macros.h"
+#include "ffx_sssr.h"
+
+namespace ffx_sssr
+{
+    /**
+        The ImageVK class is a helper class to create and destroy image resources on Vulkan.
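+
+        A minimal usage sketch (the values below are illustrative and not taken from the library):
+        \code
+            ImageVK::CreateInfo create_info = {};
+            create_info.width_                = 1920;
+            create_info.height_               = 1080;
+            create_info.format_               = VK_FORMAT_R16G16B16A16_SFLOAT;
+            create_info.mip_levels_           = 1;
+            create_info.initial_layout_       = VK_IMAGE_LAYOUT_UNDEFINED;
+            create_info.memory_property_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+            create_info.image_usage_          = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
+            create_info.name_                 = "Example Image"; // a debug name is required
+            ImageVK image(device, physical_device, create_info);
+        \endcode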
+    */
+    class ImageVK
+    {
+        FFX_SSSR_NON_COPYABLE(ImageVK);
+
+    public:
+
+        class CreateInfo
+        {
+        public:
+            uint32_t width_;
+            uint32_t height_;
+            VkFormat format_;
+            uint32_t mip_levels_;
+            VkImageLayout initial_layout_;
+            VkMemoryPropertyFlags memory_property_flags;
+            VkImageUsageFlags image_usage_;
+            const char* name_;
+        };
+
+        ImageVK();
+        ~ImageVK();
+
+        ImageVK(VkDevice device, VkPhysicalDevice physical_device, const CreateInfo& create_info);
+
+        ImageVK(ImageVK&& other) noexcept;
+        ImageVK& operator =(ImageVK&& other) noexcept;
+
+        VkDevice device_;
+        VkImage image_;
+        VkImageView image_view_;
+        VkDeviceMemory memory_; // We're creating a low number of allocations for this library, so we just allocate a dedicated memory object per buffer. Normally you'd want to do sub-allocations of a larger allocation.
+    };
+}
diff --git a/ffx-sssr/src/vk/reflection_view_vk.cpp b/ffx-sssr/src/vk/reflection_view_vk.cpp
new file mode 100644
index 0000000..c4d06f7
--- /dev/null
+++ b/ffx-sssr/src/vk/reflection_view_vk.cpp
@@ -0,0 +1,1094 @@
+/**********************************************************************
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+********************************************************************/
+#include "reflection_view_vk.h"
+
+#include 
+#include 
+
+#include "context.h"
+#include "reflection_error.h"
+#include "reflection_view.h"
+#include "context_vk.h"
+#include "ffx_sssr_vk.h"
+
+namespace ffx_sssr
+{
+    /**
+        The constructor for the ReflectionViewVK class.
+    */
+    ReflectionViewVK::ReflectionViewVK()
+        : width_(0)
+        , height_(0)
+        , flags_(0)
+        , descriptor_pool_(0)
+        , tile_list_()
+        , tile_counter_()
+        , ray_list_()
+        , ray_counter_()
+        , intersection_pass_indirect_args_()
+        , denoiser_pass_indirect_args_()
+        , temporal_denoiser_result_()
+        , ray_lengths_()
+        , temporal_variance_()
+        , tile_classification_elapsed_time_(0)
+        , intersection_elapsed_time_(0)
+        , denoising_elapsed_time_(0)
+        , timestamp_query_pool_(0)
+        , timestamp_queries_()
+        , timestamp_queries_index_(0)
+        , scene_format_(VK_FORMAT_UNDEFINED)
+        , tile_classification_descriptor_set_()
+        , indirect_args_descriptor_set_()
+        , intersection_descriptor_set_()
+        , spatial_denoising_descriptor_set_()
+        , temporal_denoising_descriptor_set_()
+        , eaw_denoising_descriptor_set_()
+        , prev_view_projection_()
+        , uniform_buffer_descriptor_set_()
+    {
+    }
+
+    /**
+        The constructor for the ReflectionViewVK class.
+ + \param other The reflection view to be moved. + */ + ReflectionViewVK::ReflectionViewVK(ReflectionViewVK&& other) noexcept + : width_(other.width_) + , height_(other.height_) + , flags_(other.flags_) + , descriptor_pool_(other.descriptor_pool_) + , tile_classification_elapsed_time_(other.tile_classification_elapsed_time_) + , intersection_elapsed_time_(other.intersection_elapsed_time_) + , denoising_elapsed_time_(other.denoising_elapsed_time_) + , timestamp_query_pool_(other.timestamp_query_pool_) + , timestamp_queries_(std::move(other.timestamp_queries_)) + , timestamp_queries_index_(other.timestamp_queries_index_) + , tile_list_(std::move(other.tile_list_)) + , tile_counter_(std::move(other.tile_counter_)) + , ray_list_(std::move(other.ray_list_)) + , ray_counter_(std::move(other.ray_counter_)) + , intersection_pass_indirect_args_(std::move(other.intersection_pass_indirect_args_)) + , denoiser_pass_indirect_args_(std::move(other.denoiser_pass_indirect_args_)) + , ray_lengths_(std::move(other.ray_lengths_)) + , temporal_variance_(std::move(other.temporal_variance_)) + , scene_format_(other.scene_format_) + , prev_view_projection_(other.prev_view_projection_) + { + + for (int i = 0; i < 2; ++i) + { + temporal_denoiser_result_[i] = std::move(other.temporal_denoiser_result_[i]); + + tile_classification_descriptor_set_[i] = other.tile_classification_descriptor_set_[i]; + indirect_args_descriptor_set_[i] = other.indirect_args_descriptor_set_[i]; + intersection_descriptor_set_[i] = other.intersection_descriptor_set_[i]; + spatial_denoising_descriptor_set_[i] = other.spatial_denoising_descriptor_set_[i]; + temporal_denoising_descriptor_set_[i] = other.temporal_denoising_descriptor_set_[i]; + eaw_denoising_descriptor_set_[i] = other.eaw_denoising_descriptor_set_[i]; + + other.tile_classification_descriptor_set_[i] = VK_NULL_HANDLE; + other.indirect_args_descriptor_set_[i] = VK_NULL_HANDLE; + other.intersection_descriptor_set_[i] = VK_NULL_HANDLE; + other.spatial_denoising_descriptor_set_[i] = VK_NULL_HANDLE; + other.temporal_denoising_descriptor_set_[i] = VK_NULL_HANDLE; + other.eaw_denoising_descriptor_set_[i] = VK_NULL_HANDLE; + } + + for (int i = 0; i < FFX_SSSR_ARRAY_SIZE(uniform_buffer_descriptor_set_); ++i) + { + uniform_buffer_descriptor_set_[i] = other.uniform_buffer_descriptor_set_[i]; + other.uniform_buffer_descriptor_set_[i] = VK_NULL_HANDLE; + } + + other.descriptor_pool_ = VK_NULL_HANDLE; + other.timestamp_query_pool_ = VK_NULL_HANDLE; + } + + /** + The destructor for the ReflectionViewVK class. + */ + ReflectionViewVK::~ReflectionViewVK() + { + if (linear_sampler_) + { + vkDestroySampler(device_, linear_sampler_, nullptr); + } + + if (descriptor_pool_) + { + vkResetDescriptorPool(device_, descriptor_pool_, 0); + vkDestroyDescriptorPool(device_, descriptor_pool_, nullptr); + } + + if (timestamp_query_pool_) + { + vkDestroyQueryPool(device_, timestamp_query_pool_, nullptr); + } + } + + /** + Assigns the reflection view. + + \param other The reflection view to be moved. + \return The assigned reflection view. 
+    */
+    ReflectionViewVK& ReflectionViewVK::operator =(ReflectionViewVK&& other) noexcept
+    {
+        if (this != &other)
+        {
+            width_ = other.width_;
+            height_ = other.height_;
+            flags_ = other.flags_;
+            scene_format_ = other.scene_format_;
+            prev_view_projection_ = other.prev_view_projection_;
+            descriptor_pool_ = other.descriptor_pool_;
+            device_ = other.device_;
+            physical_device_ = other.physical_device_;
+
+            timestamp_queries_ = other.timestamp_queries_;
+            timestamp_queries_index_ = other.timestamp_queries_index_;
+            tile_classification_elapsed_time_ = other.tile_classification_elapsed_time_;
+            intersection_elapsed_time_ = other.intersection_elapsed_time_;
+            denoising_elapsed_time_ = other.denoising_elapsed_time_;
+            timestamp_query_pool_ = other.timestamp_query_pool_;
+
+            tile_list_ = std::move(other.tile_list_);
+            tile_counter_ = std::move(other.tile_counter_);
+            ray_list_ = std::move(other.ray_list_);
+            ray_counter_ = std::move(other.ray_counter_);
+            intersection_pass_indirect_args_ = std::move(other.intersection_pass_indirect_args_);
+            denoiser_pass_indirect_args_ = std::move(other.denoiser_pass_indirect_args_);
+            ray_lengths_ = std::move(other.ray_lengths_);
+            temporal_variance_ = std::move(other.temporal_variance_);
+
+            other.descriptor_pool_ = VK_NULL_HANDLE;
+            other.timestamp_query_pool_ = VK_NULL_HANDLE; // clear the moved-from handle so its destructor does not destroy the query pool this view now owns
+
+            for (int i = 0; i < 2; ++i)
+            {
+                temporal_denoiser_result_[i] = std::move(other.temporal_denoiser_result_[i]);
+
+                tile_classification_descriptor_set_[i] = other.tile_classification_descriptor_set_[i];
+                indirect_args_descriptor_set_[i] = other.indirect_args_descriptor_set_[i];
+                intersection_descriptor_set_[i] = other.intersection_descriptor_set_[i];
+                spatial_denoising_descriptor_set_[i] = other.spatial_denoising_descriptor_set_[i];
+                temporal_denoising_descriptor_set_[i] = other.temporal_denoising_descriptor_set_[i];
+                eaw_denoising_descriptor_set_[i] = other.eaw_denoising_descriptor_set_[i];
+
+                other.tile_classification_descriptor_set_[i] = VK_NULL_HANDLE;
+                other.indirect_args_descriptor_set_[i] = VK_NULL_HANDLE;
+                other.intersection_descriptor_set_[i] = VK_NULL_HANDLE;
+                other.spatial_denoising_descriptor_set_[i] = VK_NULL_HANDLE;
+                other.temporal_denoising_descriptor_set_[i] = VK_NULL_HANDLE;
+                other.eaw_denoising_descriptor_set_[i] = VK_NULL_HANDLE;
+            }
+
+            for (int i = 0; i < FFX_SSSR_ARRAY_SIZE(uniform_buffer_descriptor_set_); ++i)
+            {
+                uniform_buffer_descriptor_set_[i] = other.uniform_buffer_descriptor_set_[i];
+                other.uniform_buffer_descriptor_set_[i] = VK_NULL_HANDLE;
+            }
+        }
+
+        return *this;
+    }
+
+    /**
+        Creates the reflection view.
+
+        \param context The context to be used.
+        \param create_reflection_view_info The reflection view creation information.
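+
+        Note: the initial resource transitions and clears below are recorded into
+        create_reflection_view_info.pVkCreateReflectionViewInfo->uploadCommandBuffer;
+        the calling application is expected to submit that command buffer before the
+        first resolve (an integration assumption, not something enforced by the library).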
+    */
+    void ReflectionViewVK::Create(Context& context, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info)
+    {
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo != nullptr);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->sceneFormat != VK_FORMAT_UNDEFINED);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->depthBufferHierarchySRV);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->motionBufferSRV);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->normalBufferSRV);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->roughnessBufferSRV);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->normalHistoryBufferSRV);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->roughnessHistoryBufferSRV);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->environmentMapSRV);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->environmentMapSampler);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->reflectionViewUAV);
+        FFX_SSSR_ASSERT(create_reflection_view_info.pVkCreateReflectionViewInfo->uploadCommandBuffer);
+        FFX_SSSR_ASSERT(create_reflection_view_info.outputWidth && create_reflection_view_info.outputHeight);
+
+        // Populate the reflection view properties
+        device_ = context.GetContextVK()->GetDevice();
+        physical_device_ = context.GetContextVK()->GetPhysicalDevice();
+        width_ = create_reflection_view_info.outputWidth;
+        height_ = create_reflection_view_info.outputHeight;
+        flags_ = create_reflection_view_info.flags;
+        scene_format_ = create_reflection_view_info.pVkCreateReflectionViewInfo->sceneFormat;
+
+        // Create pool for timestamp queries
+        VkQueryPoolCreateInfo query_pool_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
+        query_pool_create_info.pNext = nullptr;
+        query_pool_create_info.flags = 0;
+        query_pool_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
+        query_pool_create_info.queryCount = kTimestampQuery_Count * context.GetFrameCountBeforeReuse();
+        query_pool_create_info.pipelineStatistics = 0;
+        if (VK_SUCCESS != vkCreateQueryPool(device_, &query_pool_create_info, NULL, &timestamp_query_pool_))
+        {
+            throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create timestamp query pool");
+        }
+
+        timestamp_queries_.resize(context.GetFrameCountBeforeReuse());
+        for (auto& timestamp_queries : timestamp_queries_)
+        {
+            timestamp_queries.reserve(kTimestampQuery_Count);
+        }
+
+        // Create reflection view resources
+        CreateDescriptorPool(context);
+        SetupInternalResources(context, create_reflection_view_info);
+        AllocateDescriptorSets(context);
+        InitializeResourceDescriptorSets(context, create_reflection_view_info);
+    }
+
+    /**
+        Returns an upper limit of required descriptors.
+
+        \return The conservative count of total descriptors.
+ */ + uint32_t ReflectionViewVK::GetConservativeResourceDescriptorCount(const Context& context) const + { + const ContextVK* vk_context = context.GetContextVK(); + uint32_t resource_descriptor_count = vk_context->GetTileClassificationPass().bindings_count_ + + vk_context->GetIndirectArgsPass().bindings_count_ + + vk_context->GetIntersectionPass().bindings_count_ + + vk_context->GetSpatialDenoisingPass().bindings_count_ + + vk_context->GetTemporalDenoisingPass().bindings_count_ + + vk_context->GetEawDenoisingPass().bindings_count_; + resource_descriptor_count *= 2; // double buffering descriptors + return resource_descriptor_count; + } + + /** + Creates the descriptor pool. + + \param context The context to be used. + */ + void ReflectionViewVK::CreateDescriptorPool(const Context& context) + { + FFX_SSSR_ASSERT(!descriptor_pool_); + uint32_t resource_descriptor_count = GetConservativeResourceDescriptorCount(context); + + uint32_t frame_count = context.GetFrameCountBeforeReuse(); + uint32_t uniform_buffer_descriptor_count = frame_count; + + // Low descriptor counts overall, so we just allocate the max count per type. + VkDescriptorPoolSize pool_sizes[5]; + pool_sizes[0].descriptorCount = resource_descriptor_count; + pool_sizes[0].type = VK_DESCRIPTOR_TYPE_SAMPLER; + pool_sizes[1].descriptorCount = resource_descriptor_count; + pool_sizes[1].type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + pool_sizes[2].descriptorCount = resource_descriptor_count; + pool_sizes[2].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + pool_sizes[3].descriptorCount = resource_descriptor_count; + pool_sizes[3].type = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER; + pool_sizes[4].descriptorCount = uniform_buffer_descriptor_count; + pool_sizes[4].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + + uint32_t uniform_buffer_set_count = frame_count; + uint32_t resources_set_count = 2 * 8; // 8 passes double buffered + + VkDescriptorPoolCreateInfo create_info = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO }; + create_info.pNext = nullptr; + create_info.flags = 0; + create_info.maxSets = uniform_buffer_set_count + resources_set_count; + create_info.poolSizeCount = FFX_SSSR_ARRAY_SIZE(pool_sizes); + create_info.pPoolSizes = pool_sizes; + + if (VK_SUCCESS != vkCreateDescriptorPool(device_, &create_info, nullptr, &descriptor_pool_)) + { + throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create descriptor pool."); + } + } + + /** + Creates all internal resources and handles initial resource transitions. + + \param context The context to be used. + \param reflection_view The reflection view to be resolved. 
+ + */ + void ReflectionViewVK::SetupInternalResources(Context & context, FfxSssrCreateReflectionViewInfo const & create_reflection_view_info) + { + VkSamplerCreateInfo sampler_info = { VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO }; + sampler_info.pNext = nullptr; + sampler_info.flags = 0; + sampler_info.magFilter = VK_FILTER_LINEAR; + sampler_info.minFilter = VK_FILTER_LINEAR; + sampler_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; + sampler_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + sampler_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + sampler_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + sampler_info.mipLodBias = 0; + sampler_info.anisotropyEnable = false; + sampler_info.maxAnisotropy = 0; + sampler_info.compareEnable = false; + sampler_info.compareOp = VK_COMPARE_OP_NEVER; + sampler_info.minLod = 0; + sampler_info.maxLod = 16; + sampler_info.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; + sampler_info.unnormalizedCoordinates = false; + if (VK_SUCCESS != vkCreateSampler(device_, &sampler_info, nullptr, &linear_sampler_)) + { + throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to create linear sampler"); + } + + // Create tile classification-related buffers + { + uint32_t num_tiles = RoundedDivide(width_, 8u) * RoundedDivide(height_, 8u); + uint32_t num_pixels = width_ * height_; + + uint32_t tile_list_element_count = num_tiles; + uint32_t tile_counter_element_count = 1; + uint32_t ray_list_element_count = num_pixels; + uint32_t ray_counter_element_count = 1; + uint32_t intersection_pass_indirect_args_element_count = 3; + uint32_t denoiser_pass_indirect_args_element_count = 3; + + BufferVK::CreateInfo create_info = {}; + create_info.memory_property_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + create_info.format_ = VK_FORMAT_R32_UINT; + create_info.buffer_usage_ = VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; + + create_info.size_in_bytes_ = tile_list_element_count * sizeof(uint32_t); + create_info.name_ = "SSSR Tile List"; + tile_list_ = BufferVK(device_, physical_device_, create_info); + + create_info.size_in_bytes_ = ray_list_element_count * sizeof(uint32_t); + create_info.name_ = "SSSR Ray List"; + ray_list_ = BufferVK(device_, physical_device_, create_info); + + create_info.buffer_usage_ = VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + + create_info.size_in_bytes_ = tile_counter_element_count * sizeof(uint32_t); + create_info.name_ = "SSSR Tile Counter"; + tile_counter_ = BufferVK(device_, physical_device_, create_info); + + create_info.size_in_bytes_ = ray_counter_element_count * sizeof(uint32_t); + create_info.name_ = "SSSR Ray Counter"; + ray_counter_ = BufferVK(device_, physical_device_, create_info); + + create_info.buffer_usage_ = VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; + + create_info.size_in_bytes_ = intersection_pass_indirect_args_element_count * sizeof(uint32_t); + create_info.name_ = "SSSR Intersect Indirect Args"; + intersection_pass_indirect_args_ = BufferVK(device_, physical_device_, create_info); + + create_info.size_in_bytes_ = denoiser_pass_indirect_args_element_count * sizeof(uint32_t); + create_info.name_ = "SSSR Denoiser Indirect Args"; + denoiser_pass_indirect_args_ = BufferVK(device_, physical_device_, create_info); + } + + // Create denoising-related resources + { + ImageVK::CreateInfo create_info = {}; + create_info.width_ = 
width_; + create_info.height_ = height_; + create_info.mip_levels_ = 1; + create_info.initial_layout_ = VK_IMAGE_LAYOUT_UNDEFINED; + create_info.memory_property_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + create_info.image_usage_ = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + create_info.format_ = scene_format_; + create_info.name_ = "SSSR Temporal Denoised Result 0"; + temporal_denoiser_result_[0] = ImageVK(device_, physical_device_, create_info); + + create_info.format_ = scene_format_; + create_info.name_ = "SSSR Temporal Denoised Result 1"; + temporal_denoiser_result_[1] = ImageVK(device_, physical_device_, create_info); + + create_info.format_ = VK_FORMAT_R16_SFLOAT; + create_info.name_ = "SSSR Ray Lengths"; + ray_lengths_ = ImageVK(device_, physical_device_, create_info); + + create_info.format_ = VK_FORMAT_R8_UNORM; + create_info.name_ = "SSSR Temporal Variance"; + temporal_variance_ = ImageVK(device_, physical_device_, create_info); + } + + VkCommandBuffer command_buffer = create_reflection_view_info.pVkCreateReflectionViewInfo->uploadCommandBuffer; + + VkImageMemoryBarrier image_barriers[] = { + Transition(temporal_denoiser_result_[0].image_, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL), + Transition(temporal_denoiser_result_[1].image_, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL), + Transition(ray_lengths_.image_, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL), + Transition(temporal_variance_.image_, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL) + }; + TransitionBarriers(command_buffer, image_barriers, FFX_SSSR_ARRAY_SIZE(image_barriers)); + + // Initial clear of counters. Successive clears are handled by the indirect arguments pass. + vkCmdFillBuffer(command_buffer, ray_counter_.buffer_, 0, VK_WHOLE_SIZE, 0); + vkCmdFillBuffer(command_buffer, tile_counter_.buffer_, 0, VK_WHOLE_SIZE, 0); + + VkClearColorValue clear_calue = {}; + clear_calue.float32[0] = 0; + clear_calue.float32[1] = 0; + clear_calue.float32[2] = 0; + clear_calue.float32[3] = 0; + + VkImageSubresourceRange subresource_range = {}; + subresource_range.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + subresource_range.baseArrayLayer = 0; + subresource_range.baseMipLevel = 0; + subresource_range.layerCount = 1; + subresource_range.levelCount = 1; + + // Initial resource clears + vkCmdClearColorImage(command_buffer, temporal_denoiser_result_[0].image_, VK_IMAGE_LAYOUT_GENERAL, &clear_calue, 1, &subresource_range); + vkCmdClearColorImage(command_buffer, temporal_denoiser_result_[1].image_, VK_IMAGE_LAYOUT_GENERAL, &clear_calue, 1, &subresource_range); + vkCmdClearColorImage(command_buffer, ray_lengths_.image_, VK_IMAGE_LAYOUT_GENERAL, &clear_calue, 1, &subresource_range); + vkCmdClearColorImage(command_buffer, temporal_variance_.image_, VK_IMAGE_LAYOUT_GENERAL, &clear_calue, 1, &subresource_range); + } + + /** + Allocate all required descriptor sets from the descriptor pool. + This includes double buffering of the resource descriptor sets and + multi-buffering of the descriptor set containing the uniform buffer descriptor. + + \param context The context to be used. 
+ */ + void ReflectionViewVK::AllocateDescriptorSets(Context& context) + { + ContextVK* vk_context = context.GetContextVK(); + for (int i = 0; i < 2; ++i) + { + tile_classification_descriptor_set_[i] = AllocateDescriptorSet(context, vk_context->GetTileClassificationPass().descriptor_set_layout_); + indirect_args_descriptor_set_[i] = AllocateDescriptorSet(context, vk_context->GetIndirectArgsPass().descriptor_set_layout_); + intersection_descriptor_set_[i] = AllocateDescriptorSet(context, vk_context->GetIntersectionPass().descriptor_set_layout_); + spatial_denoising_descriptor_set_[i] = AllocateDescriptorSet(context, vk_context->GetSpatialDenoisingPass().descriptor_set_layout_); + temporal_denoising_descriptor_set_[i] = AllocateDescriptorSet(context, vk_context->GetTemporalDenoisingPass().descriptor_set_layout_); + eaw_denoising_descriptor_set_[i] = AllocateDescriptorSet(context, vk_context->GetEawDenoisingPass().descriptor_set_layout_); + } + + uint32_t frame_count = context.GetFrameCountBeforeReuse(); + for (uint32_t i = 0; i < frame_count; ++i) + { + uniform_buffer_descriptor_set_[i] = AllocateDescriptorSet(context, vk_context->GetUniformBufferDescriptorSetLayout()); + } + } + + /** + Allocate a single descriptor set from the descriptor pool. + + \param context The context to be used. + \param layout The layout of the descriptor set. + \return The allocated set. + */ + VkDescriptorSet ReflectionViewVK::AllocateDescriptorSet(Context& context, VkDescriptorSetLayout layout) + { + VkDescriptorSetAllocateInfo alloc_info = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO }; + alloc_info.descriptorPool = descriptor_pool_; + alloc_info.pNext = nullptr; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = &layout; + + VkDescriptorSet set; + if (VK_SUCCESS != vkAllocateDescriptorSets(device_, &alloc_info, &set)) + { + throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to allocate descriptor set"); + } + return set; + } + + /** + Initializes the resource descriptor sets of each pass. + The uniform buffer on the other hand is updated each frame and thus not handled here. + + \param context The context to be used. + \param reflection_view The reflection view to be resolved. 
+ + */ + void ReflectionViewVK::InitializeResourceDescriptorSets(Context & context, FfxSssrCreateReflectionViewInfo const & create_reflection_view_info) + { + VkImageView scene_srv = create_reflection_view_info.pVkCreateReflectionViewInfo->sceneSRV; + VkImageView depth_hierarchy_srv = create_reflection_view_info.pVkCreateReflectionViewInfo->depthBufferHierarchySRV; + VkImageView motion_buffer_srv = create_reflection_view_info.pVkCreateReflectionViewInfo->motionBufferSRV; + VkImageView normal_buffer_srv = create_reflection_view_info.pVkCreateReflectionViewInfo->normalBufferSRV; + VkImageView roughness_buffer_srv = create_reflection_view_info.pVkCreateReflectionViewInfo->roughnessBufferSRV; + VkImageView normal_history_buffer_srv = create_reflection_view_info.pVkCreateReflectionViewInfo->normalHistoryBufferSRV; + VkImageView roughness_history_buffer_srv = create_reflection_view_info.pVkCreateReflectionViewInfo->roughnessHistoryBufferSRV; + VkSampler environment_map_sampler = create_reflection_view_info.pVkCreateReflectionViewInfo->environmentMapSampler; + VkImageView environment_map_srv = create_reflection_view_info.pVkCreateReflectionViewInfo->environmentMapSRV; + VkImageView output_buffer_uav = create_reflection_view_info.pVkCreateReflectionViewInfo->reflectionViewUAV; + + VkImageView normal_buffers[] = { normal_buffer_srv, normal_history_buffer_srv }; + VkImageView roughness_buffers[] = { roughness_buffer_srv, roughness_history_buffer_srv }; + + bool ping_pong_normal = (create_reflection_view_info.flags & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_PING_PONG_NORMAL_BUFFERS) != 0; + bool ping_pong_roughness = (create_reflection_view_info.flags & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_PING_PONG_ROUGHNESS_BUFFERS) != 0; + + uint32_t descriptor_count = GetConservativeResourceDescriptorCount(context); + std::vector image_infos; + std::vector write_desc_sets; + image_infos.reserve(descriptor_count); + write_desc_sets.reserve(descriptor_count); + uint32_t binding = 0; + VkDescriptorSet target_set = VK_NULL_HANDLE; + +#define FFX_SSSR_DEBUG_DESCRIPTOR_SETUP 0 + + auto BindSampler = [this, &target_set, &binding, &write_desc_sets, &image_infos](VkSampler sampler) { + VkDescriptorImageInfo image_info = {}; + image_info.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_info.imageView = VK_NULL_HANDLE; + image_info.sampler = sampler; + image_infos.push_back(image_info); + + VkWriteDescriptorSet write_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + write_set.pNext = nullptr; + write_set.dstSet = target_set; + write_set.dstBinding = binding++; + write_set.dstArrayElement = 0; + write_set.descriptorCount = 1; + write_set.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; + write_set.pImageInfo = &image_infos.back(); + write_set.pBufferInfo = nullptr; + write_set.pTexelBufferView = nullptr; + write_desc_sets.push_back(write_set); + +#if FFX_SSSR_DEBUG_DESCRIPTOR_SETUP + vkUpdateDescriptorSets(device_, 1, &write_set, 0, nullptr); +#endif + }; + + auto BindImage = [this, &target_set, &binding, &write_desc_sets, &image_infos](VkDescriptorType type, VkImageView view, VkImageLayout layout) { + VkDescriptorImageInfo image_info = {}; + image_info.imageLayout = layout; + image_info.imageView = view; + image_info.sampler = VK_NULL_HANDLE; + image_infos.push_back(image_info); + + VkWriteDescriptorSet write_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + write_set.pNext = nullptr; + write_set.dstSet = target_set; + write_set.dstBinding = binding++; + write_set.dstArrayElement = 0; + write_set.descriptorCount = 1; + 
write_set.descriptorType = type; + write_set.pImageInfo = &image_infos.back(); + write_set.pBufferInfo = nullptr; + write_set.pTexelBufferView = nullptr; + write_desc_sets.push_back(write_set); + +#if FFX_SSSR_DEBUG_DESCRIPTOR_SETUP + vkUpdateDescriptorSets(device_, 1, &write_set, 0, nullptr); +#endif + }; + + auto BindBuffer = [this, &target_set, &binding, &write_desc_sets](VkDescriptorType type, const VkBufferView& buffer) { + VkWriteDescriptorSet write_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + write_set.pNext = nullptr; + write_set.dstSet = target_set; + write_set.dstBinding = binding++; + write_set.dstArrayElement = 0; + write_set.descriptorCount = 1; + write_set.descriptorType = type; + write_set.pImageInfo = nullptr; + write_set.pBufferInfo = nullptr; + write_set.pTexelBufferView = &buffer; + write_desc_sets.push_back(write_set); + +#if FFX_SSSR_DEBUG_DESCRIPTOR_SETUP + vkUpdateDescriptorSets(device_, 1, &write_set, 0, nullptr); +#endif + }; + + // Place the descriptors + for (int i = 0; i < 2; ++i) + { + // Tile Classifier pass + { + target_set = tile_classification_descriptor_set_[i]; + binding = 0; + + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_roughness ? roughness_buffers[i] : roughness_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_roughness + BindBuffer(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, tile_list_.buffer_view_); // g_tile_list + BindBuffer(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, ray_list_.buffer_view_); // g_ray_list + BindBuffer(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, tile_counter_.buffer_view_); // g_tile_counter + BindBuffer(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, ray_counter_.buffer_view_); // g_ray_counter + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, temporal_denoiser_result_[i].image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_temporally_denoised_reflections + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, temporal_denoiser_result_[1 - i].image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_temporally_denoised_reflections_history + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ray_lengths_.image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_ray_lengths + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, temporal_variance_.image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_temporal_variance + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, output_buffer_uav, VK_IMAGE_LAYOUT_GENERAL); // g_denoised_reflections + } + + // Indirect args pass + { + target_set = indirect_args_descriptor_set_[i]; + binding = 0; + + BindBuffer(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, tile_counter_.buffer_view_); // g_tile_counter + BindBuffer(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, ray_counter_.buffer_view_); // g_ray_counter + BindBuffer(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, intersection_pass_indirect_args_.buffer_view_); // g_intersect_args + BindBuffer(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, denoiser_pass_indirect_args_.buffer_view_); // g_denoiser_args + } + + // Intersection pass + { + target_set = intersection_descriptor_set_[i]; + binding = 0; + + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, scene_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_lit_scene + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, depth_hierarchy_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_depth_buffer_hierarchy + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_normal ? normal_buffers[i] : normal_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_normal + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_roughness ? 
roughness_buffers[i] : roughness_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_roughness + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, environment_map_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_environment_map + + auto const& sampler = context.GetContextVK()->GetSampler2SPP(); + BindBuffer(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, sampler.sobol_buffer_.buffer_view_); // g_sobol_buffer + BindBuffer(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, sampler.ranking_tile_buffer_.buffer_view_); // g_ranking_tile_buffer + BindBuffer(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, sampler.scrambling_tile_buffer_.buffer_view_); // g_scrambling_tile_buffer + BindBuffer(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, ray_list_.buffer_view_); // g_ray_list + + BindSampler(linear_sampler_); // g_linear_sampler + BindSampler(environment_map_sampler); // g_environment_map_sampler + + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, temporal_denoiser_result_[i].image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_intersection_result + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ray_lengths_.image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_ray_lengths + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, output_buffer_uav, VK_IMAGE_LAYOUT_GENERAL); // g_denoised_reflections + } + + // Spatial denoising pass + { + target_set = spatial_denoising_descriptor_set_[i]; + binding = 0; + + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, depth_hierarchy_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_depth_buffer + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_normal ? normal_buffers[i] : normal_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_normal + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_roughness ? roughness_buffers[i] : roughness_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_roughness + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, temporal_denoiser_result_[i].image_view_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_intersection_result + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, temporal_variance_.image_view_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_has_ray + BindBuffer(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, tile_list_.buffer_view_); // g_tile_list + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, output_buffer_uav, VK_IMAGE_LAYOUT_GENERAL); // g_spatially_denoised_reflections + BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ray_lengths_.image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_ray_lengths + } + + // Temporal denoising pass + { + target_set = temporal_denoising_descriptor_set_[i]; + binding = 0; + + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_normal ? normal_buffers[i] : normal_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_normal + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_roughness ? roughness_buffers[i] : roughness_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_roughness + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_normal ? normal_buffers[1 - i] : normal_history_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_normal_history + BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_roughness ? 
roughness_buffers[1 - i] : roughness_history_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_roughness_history
+                BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, depth_hierarchy_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_depth_buffer
+                BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, motion_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_motion_vectors
+                BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, temporal_denoiser_result_[1 - i].image_view_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_temporally_denoised_reflections_history
+                BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ray_lengths_.image_view_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_ray_lengths
+                BindBuffer(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, tile_list_.buffer_view_); // g_tile_list
+                BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, temporal_denoiser_result_[i].image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_temporally_denoised_reflections
+                BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, output_buffer_uav, VK_IMAGE_LAYOUT_GENERAL); // g_spatially_denoised_reflections
+                BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, temporal_variance_.image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_temporal_variance
+            }
+
+            // EAW denoising pass
+            {
+                target_set = eaw_denoising_descriptor_set_[i];
+                binding = 0;
+
+                BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_normal ? normal_buffers[i] : normal_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_normal
+                BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, ping_pong_roughness ? roughness_buffers[i] : roughness_buffer_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_roughness
+                BindImage(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, depth_hierarchy_srv, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // g_depth_buffer
+                BindBuffer(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, tile_list_.buffer_view_); // g_tile_list
+                BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, temporal_denoiser_result_[i].image_view_, VK_IMAGE_LAYOUT_GENERAL); // g_temporally_denoised_reflections
+                BindImage(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, output_buffer_uav, VK_IMAGE_LAYOUT_GENERAL); // g_denoised_reflections
+            }
+        }
+        vkUpdateDescriptorSets(device_, static_cast<uint32_t>(write_desc_sets.size()), write_desc_sets.data(), 0, nullptr);
+    }
+
+    /**
+        Gets the index of the current timestamp query.
+
+        \return The index of the current timestamp query.
+    */
+    std::uint32_t ReflectionViewVK::GetTimestampQueryIndex() const
+    {
+        return timestamp_queries_index_ * kTimestampQuery_Count + static_cast<std::uint32_t>(timestamp_queries_[timestamp_queries_index_].size());
+    }
+
+    float Clamp(float value, float min, float max)
+    {
+        if (value < min)
+        {
+            return min;
+        }
+        else if (value > max)
+        {
+            return max;
+        }
+        return value;
+    }
+
+    /**
+        Resolves the Vulkan reflection view.
+
+        \param context The context to be used.
+        \param reflection_view The reflection view to be resolved.
+        \param resolve_reflection_view_info The reflection view resolve information.
+ */ + void ReflectionViewVK::Resolve(Context& context, ReflectionView const& reflection_view, FfxSssrResolveReflectionViewInfo const& resolve_reflection_view_info) + { + auto const command_buffer = resolve_reflection_view_info.pVkCommandEncodeInfo->commandBuffer; + if (!command_buffer) + { + throw reflection_error(context, FFX_SSSR_STATUS_INVALID_VALUE, "No command buffer was supplied, cannot encode device commands"); + } + + FFX_SSSR_ASSERT(resolve_reflection_view_info.pVkCommandEncodeInfo); + FFX_SSSR_ASSERT(resolve_reflection_view_info.samplesPerQuad == FFX_SSSR_RAY_SAMPLES_PER_QUAD_1 || resolve_reflection_view_info.samplesPerQuad == FFX_SSSR_RAY_SAMPLES_PER_QUAD_2 || resolve_reflection_view_info.samplesPerQuad == FFX_SSSR_RAY_SAMPLES_PER_QUAD_4); + + // Query timestamp value prior to resolving the reflection view + if ((flags_ & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS) != 0) + { + auto& timestamp_queries = timestamp_queries_[timestamp_queries_index_]; + + auto const start_index = timestamp_queries_index_ * kTimestampQuery_Count; + + if (!timestamp_queries.empty()) + { + // Reset performance counters + tile_classification_elapsed_time_ = 0ull; + denoising_elapsed_time_ = 0ull; + intersection_elapsed_time_ = 0ull; + + uint32_t timestamp_count = static_cast(timestamp_queries.size()); + + uint64_t data[kTimestampQuery_Count * 8]; // maximum of 8 frames in flight allowed + VkResult result = vkGetQueryPoolResults(device_, + timestamp_query_pool_, + start_index, + timestamp_count, + timestamp_count * sizeof(uint64_t), + data, + sizeof(uint64_t), + VK_QUERY_RESULT_WITH_AVAILABILITY_BIT); + + if (result == VK_SUCCESS) + { + for (auto i = 0u, j = 1u; j < timestamp_count; ++i, ++j) + { + auto const elapsed_time = (data[j] - data[i]); + + switch (timestamp_queries[j]) + { + case kTimestampQuery_TileClassification: + tile_classification_elapsed_time_ = elapsed_time; + break; + case kTimestampQuery_Intersection: + intersection_elapsed_time_ = elapsed_time; + break; + case kTimestampQuery_Denoising: + denoising_elapsed_time_ = elapsed_time; + break; + default: + // unrecognized timestamp query + break; + } + } + } + else if (result != VK_NOT_READY) + { + throw reflection_error(context, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to query timestamp query results"); + } + } + + timestamp_queries.clear(); + + vkCmdResetQueryPool(command_buffer, timestamp_query_pool_, start_index, kTimestampQuery_Count); + + vkCmdWriteTimestamp(command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, timestamp_query_pool_, GetTimestampQueryIndex()); + timestamp_queries.push_back(kTimestampQuery_Init); + } + + // Encode the relevant pass data + struct PassData + { + matrix4 inv_view_projection_; + matrix4 projection_; + matrix4 inv_projection_; + matrix4 view_; + matrix4 inv_view_; + matrix4 prev_view_projection_; + std::uint32_t frame_index_; + std::uint32_t max_traversal_intersections_; + std::uint32_t min_traversal_occupancy_; + std::uint32_t most_detailed_mip_; + float temporal_stability_factor_; + float depth_buffer_thickness_; + std::uint32_t samples_per_quad_; + std::uint32_t temporal_variance_guided_tracing_enabled_; + float roughness_threshold_; + std::uint32_t skip_denoiser_; + }; + auto& upload_buffer = context.GetContextVK()->GetUploadBuffer(); + PassData* pass_data; + if (!upload_buffer.AllocateBuffer(sizeof(PassData), pass_data)) + { + throw reflection_error(context, FFX_SSSR_STATUS_OUT_OF_MEMORY, "Failed to allocate %u bytes of upload memory, consider increasing uploadBufferSize", 
sizeof(PassData)); + } + + // Fill constant buffer + matrix4 view_projection = reflection_view.projection_matrix_ * reflection_view.view_matrix_; + pass_data->inv_view_projection_ = matrix4::inverse(view_projection); + pass_data->projection_ = reflection_view.projection_matrix_; + pass_data->inv_projection_ = matrix4::inverse(reflection_view.projection_matrix_); + pass_data->view_ = reflection_view.view_matrix_; + pass_data->inv_view_ = matrix4::inverse(reflection_view.view_matrix_); + pass_data->prev_view_projection_ = prev_view_projection_; + pass_data->frame_index_ = context.GetFrameIndex(); + + float temporal_stability_scale = Clamp(resolve_reflection_view_info.temporalStabilityScale, 0, 1); + pass_data->max_traversal_intersections_ = resolve_reflection_view_info.maxTraversalIterations; + pass_data->min_traversal_occupancy_ = resolve_reflection_view_info.minTraversalOccupancy; + pass_data->most_detailed_mip_ = resolve_reflection_view_info.mostDetailedDepthHierarchyMipLevel; + pass_data->temporal_stability_factor_ = temporal_stability_scale * temporal_stability_scale; + pass_data->depth_buffer_thickness_ = resolve_reflection_view_info.depthBufferThickness; + pass_data->samples_per_quad_ = resolve_reflection_view_info.samplesPerQuad == FFX_SSSR_RAY_SAMPLES_PER_QUAD_4 ? 4 : (resolve_reflection_view_info.samplesPerQuad == FFX_SSSR_RAY_SAMPLES_PER_QUAD_2 ? 2 : 1); + pass_data->temporal_variance_guided_tracing_enabled_ = resolve_reflection_view_info.flags & FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_ENABLE_VARIANCE_GUIDED_TRACING ? 1 : 0; + pass_data->roughness_threshold_ = resolve_reflection_view_info.roughnessThreshold; + pass_data->skip_denoiser_ = resolve_reflection_view_info.flags & FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_DENOISE ? 0 : 1; + prev_view_projection_ = view_projection; + + uint32_t uniform_buffer_index = context.GetFrameIndex() % context.GetFrameCountBeforeReuse(); + VkDescriptorSet uniform_buffer_descriptor_set = uniform_buffer_descriptor_set_[uniform_buffer_index]; + + // Update descriptor to sliding window in upload buffer that contains the updated pass data + { + VkDescriptorBufferInfo buffer_info = {}; + buffer_info.buffer = upload_buffer.GetResource(); + buffer_info.offset = upload_buffer.GetOffset(pass_data); + buffer_info.range = sizeof(PassData); + + VkWriteDescriptorSet write_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + write_set.pNext = nullptr; + write_set.dstSet = uniform_buffer_descriptor_set; + write_set.dstBinding = 0; + write_set.dstArrayElement = 0; + write_set.descriptorCount = 1; + write_set.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + write_set.pImageInfo = nullptr; + write_set.pBufferInfo = &buffer_info; + write_set.pTexelBufferView = nullptr; + vkUpdateDescriptorSets(device_, 1, &write_set, 0, nullptr); + } + + std::uint32_t resource_descriptor_set_index = context.GetFrameIndex() & 1u; + + ContextVK* vk_context = context.GetContextVK(); + + // Tile Classification pass + { + VkDescriptorSet sets[] = { uniform_buffer_descriptor_set, tile_classification_descriptor_set_[resource_descriptor_set_index] }; + vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetTileClassificationPass().pipeline_); + vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetTileClassificationPass().pipeline_layout_, 0, FFX_SSSR_ARRAY_SIZE(sets), sets, 0, nullptr); + uint32_t dim_x = RoundedDivide(width_, 8u); + uint32_t dim_y = RoundedDivide(height_, 8u); + vkCmdDispatch(command_buffer, dim_x, dim_y, 1); + } + + 
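+        // Worked example for the dispatch above (assuming the tile classifier runs in 8x8 thread
+        // groups, which is what the RoundedDivide by 8u implies): a 1920x1080 reflection view
+        // issues a 240x135x1 dispatch here.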
// Ensure that the tile classification pass finished
+        ComputeBarrier(command_buffer);
+
+        // Indirect Arguments pass
+        {
+            VkDescriptorSet sets[] = { uniform_buffer_descriptor_set, indirect_args_descriptor_set_[resource_descriptor_set_index] };
+            vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetIndirectArgsPass().pipeline_);
+            vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetIndirectArgsPass().pipeline_layout_, 0, FFX_SSSR_ARRAY_SIZE(sets), sets, 0, nullptr);
+            vkCmdDispatch(command_buffer, 1, 1, 1);
+        }
+
+        // Query the amount of time spent in the tile classification pass
+        if ((flags_ & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS) != 0)
+        {
+            auto& timestamp_queries = timestamp_queries_[timestamp_queries_index_];
+
+            FFX_SSSR_ASSERT(timestamp_queries.size() == 1ull && timestamp_queries[0] == kTimestampQuery_Init);
+
+            vkCmdWriteTimestamp(command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, timestamp_query_pool_, GetTimestampQueryIndex());
+            timestamp_queries.push_back(kTimestampQuery_TileClassification);
+        }
+
+        // Ensure that the arguments are written
+        IndirectArgumentsBarrier(command_buffer);
+
+        // Intersection pass
+        {
+            VkDescriptorSet sets[] = { uniform_buffer_descriptor_set, intersection_descriptor_set_[resource_descriptor_set_index] };
+            vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetIntersectionPass().pipeline_);
+            vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetIntersectionPass().pipeline_layout_, 0, FFX_SSSR_ARRAY_SIZE(sets), sets, 0, nullptr);
+            vkCmdDispatchIndirect(command_buffer, intersection_pass_indirect_args_.buffer_, 0);
+        }
+
+        // Query the amount of time spent in the intersection pass
+        if ((flags_ & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS) != 0)
+        {
+            auto& timestamp_queries = timestamp_queries_[timestamp_queries_index_];
+
+            FFX_SSSR_ASSERT(timestamp_queries.size() == 2ull && timestamp_queries[1] == kTimestampQuery_TileClassification);
+
+            vkCmdWriteTimestamp(command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, timestamp_query_pool_, GetTimestampQueryIndex());
+            timestamp_queries.push_back(kTimestampQuery_Intersection);
+        }
+
+        if (resolve_reflection_view_info.flags & FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_DENOISE)
+        {
+            // Ensure that the intersection pass finished
+            VkImageMemoryBarrier intersection_finished_barriers[] = {
+                Transition(temporal_denoiser_result_[resource_descriptor_set_index].image_, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL),
+                Transition(temporal_variance_.image_, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
+            };
+            TransitionBarriers(command_buffer, intersection_finished_barriers, FFX_SSSR_ARRAY_SIZE(intersection_finished_barriers));
+
+            // Spatial denoiser passes
+            {
+                VkDescriptorSet sets[] = { uniform_buffer_descriptor_set, spatial_denoising_descriptor_set_[resource_descriptor_set_index] };
+                vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetSpatialDenoisingPass().pipeline_);
+                vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetSpatialDenoisingPass().pipeline_layout_, 0, FFX_SSSR_ARRAY_SIZE(sets), sets, 0, nullptr);
+                vkCmdDispatchIndirect(command_buffer, denoiser_pass_indirect_args_.buffer_, 0);
+            }
+
+            // Ensure that the spatial denoising pass finished.
We don't have the resource for the final result available, thus we have to wait for any UAV access to finish. + VkImageMemoryBarrier spatial_denoiser_finished_barriers[] = { + Transition(temporal_denoiser_result_[resource_descriptor_set_index].image_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_GENERAL), + Transition(temporal_denoiser_result_[1 - resource_descriptor_set_index].image_, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL), + Transition(temporal_variance_.image_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_GENERAL), + Transition(ray_lengths_.image_, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) + }; + TransitionBarriers(command_buffer, spatial_denoiser_finished_barriers, FFX_SSSR_ARRAY_SIZE(spatial_denoiser_finished_barriers)); + + // Temporal denoiser passes + { + VkDescriptorSet sets[] = { uniform_buffer_descriptor_set, temporal_denoising_descriptor_set_[resource_descriptor_set_index] }; + vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetTemporalDenoisingPass().pipeline_); + vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetTemporalDenoisingPass().pipeline_layout_, 0, FFX_SSSR_ARRAY_SIZE(sets), sets, 0, nullptr); + vkCmdDispatchIndirect(command_buffer, denoiser_pass_indirect_args_.buffer_, 0); + } + + // Ensure that the temporal denoising pass finished + VkImageMemoryBarrier temporal_denoiser_finished_barriers[] = { + Transition(ray_lengths_.image_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_GENERAL), + Transition(temporal_denoiser_result_[1 - resource_descriptor_set_index].image_, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_GENERAL), + }; + TransitionBarriers(command_buffer, temporal_denoiser_finished_barriers, FFX_SSSR_ARRAY_SIZE(temporal_denoiser_finished_barriers)); + + // EAW denoiser passes + { + VkDescriptorSet sets[] = { uniform_buffer_descriptor_set, eaw_denoising_descriptor_set_[resource_descriptor_set_index] }; + vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetEawDenoisingPass().pipeline_); + vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, vk_context->GetEawDenoisingPass().pipeline_layout_, 0, FFX_SSSR_ARRAY_SIZE(sets), sets, 0, nullptr); + vkCmdDispatchIndirect(command_buffer, denoiser_pass_indirect_args_.buffer_, 0); + } + + // Query the amount of time spent in the denoiser passes + if ((flags_ & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS) != 0) + { + auto& timestamp_queries = timestamp_queries_[timestamp_queries_index_]; + + FFX_SSSR_ASSERT(timestamp_queries.size() == 3ull && timestamp_queries[2] == kTimestampQuery_Intersection); + + vkCmdWriteTimestamp(command_buffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, timestamp_query_pool_, GetTimestampQueryIndex()); + timestamp_queries.push_back(kTimestampQuery_Denoising); + } + } + + // Move timestamp queries to next frame + if ((flags_ & FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS) != 0) + { + timestamp_queries_index_ = (timestamp_queries_index_ + 1u) % context.GetFrameCountBeforeReuse(); + } + } + + VkImageMemoryBarrier ReflectionViewVK::Transition(VkImage image, VkImageLayout before, VkImageLayout after) const + { + VkImageMemoryBarrier barrier = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER }; + barrier.pNext = nullptr; + barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.oldLayout = before; + 
barrier.newLayout = after; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; + + VkImageSubresourceRange subresourceRange = {}; + subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + subresourceRange.baseArrayLayer = 0; + subresourceRange.layerCount = 1; + subresourceRange.baseMipLevel = 0; + subresourceRange.levelCount = 1; + + barrier.subresourceRange = subresourceRange; + return barrier; + } + + void ReflectionViewVK::TransitionBarriers(VkCommandBuffer command_buffer, const VkImageMemoryBarrier * image_barriers, uint32_t image_barriers_count) const + { + vkCmdPipelineBarrier(command_buffer, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, + 0, nullptr, + 0, nullptr, + image_barriers_count, image_barriers); + } + + void ReflectionViewVK::ComputeBarrier(VkCommandBuffer command_buffer) const + { + VkMemoryBarrier barrier = { VK_STRUCTURE_TYPE_MEMORY_BARRIER }; + barrier.pNext = nullptr; + barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + vkCmdPipelineBarrier(command_buffer, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, + 1, &barrier, + 0, nullptr, + 0, nullptr); + } + + void ReflectionViewVK::IndirectArgumentsBarrier(VkCommandBuffer command_buffer) const + { + VkMemoryBarrier barrier = { VK_STRUCTURE_TYPE_MEMORY_BARRIER }; + barrier.pNext = nullptr; + barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT; + vkCmdPipelineBarrier(command_buffer, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, + 0, + 1, &barrier, + 0, nullptr, + 0, nullptr); + } +} diff --git a/ffx-sssr/src/vk/reflection_view_vk.h b/ffx-sssr/src/vk/reflection_view_vk.h new file mode 100644 index 0000000..19563f5 --- /dev/null +++ b/ffx-sssr/src/vk/reflection_view_vk.h @@ -0,0 +1,164 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+********************************************************************/
+#pragma once
+
+#include
+#include
+#include
+
+#include "macros.h"
+#include "matrix4.h"
+#include "ffx_sssr.h"
+#include "buffer_vk.h"
+#include "image_vk.h"
+
+namespace ffx_sssr
+{
+    class Context;
+    class ReflectionView;
+
+    /**
+        The ReflectionViewVK class encapsulates the data required for resolving an individual reflection view.
+    */
+    class ReflectionViewVK
+    {
+        FFX_SSSR_NON_COPYABLE(ReflectionViewVK);
+
+    public:
+
+        /**
+            The available timestamp queries.
+        */
+        enum TimestampQuery
+        {
+            kTimestampQuery_Init,
+            kTimestampQuery_TileClassification,
+            kTimestampQuery_Intersection,
+            kTimestampQuery_Denoising,
+
+            kTimestampQuery_Count
+        };
+
+        /**
+            The type definition for an array of timestamp queries.
+        */
+        using TimestampQueries = std::vector<TimestampQuery>;
+
+        ReflectionViewVK();
+        ~ReflectionViewVK();
+
+        ReflectionViewVK(ReflectionViewVK&& other) noexcept;
+        ReflectionViewVK& operator =(ReflectionViewVK&& other) noexcept;
+
+        void Create(Context& context, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info);
+
+        uint32_t GetConservativeResourceDescriptorCount(const Context& context) const;
+        void CreateDescriptorPool(const Context& context);
+        void SetupInternalResources(Context& context, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info);
+
+        void AllocateDescriptorSets(Context& context);
+        VkDescriptorSet AllocateDescriptorSet(Context& context, VkDescriptorSetLayout layout);
+        void InitializeResourceDescriptorSets(Context& context, FfxSssrCreateReflectionViewInfo const& create_reflection_view_info);
+
+        std::uint32_t GetTimestampQueryIndex() const;
+
+        void Resolve(Context& context, ReflectionView const& reflection_view, FfxSssrResolveReflectionViewInfo const& resolve_reflection_view_info);
+
+        // The device that created the reflection view. Lifetime handled by the context.
+        VkDevice device_;
+        // The physical device that created the reflection view. Lifetime handled by the context.
+        VkPhysicalDevice physical_device_;
+        // The width of the reflection view (in texels).
+        std::uint32_t width_;
+        // The height of the reflection view (in texels).
+        std::uint32_t height_;
+        // The reflection view creation flags.
+        FfxSssrCreateReflectionViewFlags flags_;
+
+        // The descriptor pool for all resource views.
+        VkDescriptorPool descriptor_pool_;
+
+        // Linear sampler.
+        VkSampler linear_sampler_;
+        // Containing all tiles that need at least one ray.
+        BufferVK tile_list_;
+        BufferVK tile_counter_;
+        // Containing all rays that need to be traced.
+        BufferVK ray_list_;
+        BufferVK ray_counter_;
+        // Indirect arguments for intersection pass.
+        BufferVK intersection_pass_indirect_args_;
+        // Indirect arguments for denoiser pass.
+        BufferVK denoiser_pass_indirect_args_;
+        // Intermediate result of the temporal denoising pass - double buffered to keep history and aliases the intersection result.
+        ImageVK temporal_denoiser_result_[2];
+        // Holds the length of each reflection ray - used for temporal reprojection.
+        ImageVK ray_lengths_;
+        // Holds the temporal variance of the last two frames.
+        ImageVK temporal_variance_;
+
+        // The query pool containing the recorded timestamps.
+        VkQueryPool timestamp_query_pool_;
+        // The number of GPU ticks spent in the tile classification pass.
+        std::uint64_t tile_classification_elapsed_time_;
+        // The number of GPU ticks spent in depth buffer intersection.
+        std::uint64_t intersection_elapsed_time_;
+        // The number of GPU ticks spent denoising.
+        std::uint64_t denoising_elapsed_time_;
+        // The array of timestamps that were queried.
+        std::vector<TimestampQueries> timestamp_queries_;
+        // The index of the active set of timestamp queries.
+        std::uint32_t timestamp_queries_index_;
+
+        // Format of the resolved scene.
+        VkFormat scene_format_;
+
+        // The descriptor tables. One per shader pass per frame.
+        // Even with more than 2 frames in flight we only swap between the last two
+        // as we keep only one frame of history.
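+        // Worked example of the indexing done in Resolve (frame counts here are illustrative):
+        // the per-pass resource sets are picked with GetFrameIndex() & 1u, so they alternate
+        // 0,1,0,1,..., while the uniform buffer set uses GetFrameIndex() % GetFrameCountBeforeReuse(),
+        // e.g. 0,1,2,0,1,2,... when three frames are kept in flight.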
+
+        // Descriptor set for uniform buffers. Be conservative in the number of frames in flight.
+        VkDescriptorSet uniform_buffer_descriptor_set_[8];
+        // Descriptor sets of the tile classification pass.
+        VkDescriptorSet tile_classification_descriptor_set_[2];
+        // Descriptor sets of the indirect arguments pass.
+        VkDescriptorSet indirect_args_descriptor_set_[2];
+        // Descriptor sets of the depth buffer intersection pass.
+        VkDescriptorSet intersection_descriptor_set_[2];
+        // Descriptor sets of the spatial denoising pass.
+        VkDescriptorSet spatial_denoising_descriptor_set_[2];
+        // Descriptor sets of the temporal denoising pass.
+        VkDescriptorSet temporal_denoising_descriptor_set_[2];
+        // Descriptor sets of the EAW denoising pass.
+        VkDescriptorSet eaw_denoising_descriptor_set_[2];
+
+        // The view projection matrix of the last frame.
+        matrix4 prev_view_projection_;
+
+    private:
+        VkImageMemoryBarrier Transition(VkImage image, VkImageLayout before, VkImageLayout after) const;
+        void TransitionBarriers(VkCommandBuffer command_buffer, const VkImageMemoryBarrier* image_barriers, uint32_t image_barriers_count) const;
+        void ComputeBarrier(VkCommandBuffer command_buffer) const;
+        void IndirectArgumentsBarrier(VkCommandBuffer command_buffer) const;
+    };
+}
+
diff --git a/ffx-sssr/src/vk/sampler_vk.cpp b/ffx-sssr/src/vk/sampler_vk.cpp
new file mode 100644
index 0000000..ecf5521
--- /dev/null
+++ b/ffx-sssr/src/vk/sampler_vk.cpp
@@ -0,0 +1,74 @@
+/**********************************************************************
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+********************************************************************/
+#include "sampler_vk.h"
+
+#include
+
+namespace ffx_sssr
+{
+    /**
+        The constructor for the BlueNoiseSamplerVK class.
+    */
+    BlueNoiseSamplerVK::BlueNoiseSamplerVK()
+        : sobol_buffer_()
+        , ranking_tile_buffer_()
+        , scrambling_tile_buffer_()
+    {
+    }
+
+    /**
+        The constructor for the BlueNoiseSamplerVK class.
+
+        \param other The sampler to be moved.
+ */ + BlueNoiseSamplerVK::BlueNoiseSamplerVK(BlueNoiseSamplerVK&& other) noexcept + : sobol_buffer_(std::move(other.sobol_buffer_)) + , ranking_tile_buffer_(std::move(other.ranking_tile_buffer_)) + , scrambling_tile_buffer_(std::move(other.scrambling_tile_buffer_)) + { + } + + /** + The destructor for the SamplerD3D12 class. + */ + BlueNoiseSamplerVK::~BlueNoiseSamplerVK() + { + } + + /** + Assigns the sampler. + + \param other The sampler to be moved. + \return The assigned sampler. + */ + BlueNoiseSamplerVK& BlueNoiseSamplerVK::operator =(BlueNoiseSamplerVK&& other) noexcept + { + if (this != &other) + { + sobol_buffer_ = std::move(other.sobol_buffer_); + ranking_tile_buffer_ = std::move(other.ranking_tile_buffer_); + scrambling_tile_buffer_ = std::move(other.scrambling_tile_buffer_); + } + + return *this; + } +} diff --git a/ffx-sssr/src/vk/sampler_vk.h b/ffx-sssr/src/vk/sampler_vk.h new file mode 100644 index 0000000..07d9c11 --- /dev/null +++ b/ffx-sssr/src/vk/sampler_vk.h @@ -0,0 +1,55 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#pragma once + +#include + +#include "macros.h" +#include "ffx_sssr.h" +#include "buffer_vk.h" + +namespace ffx_sssr +{ + /** + The BlueNoiseSamplerVK class represents a blue-noise sampler to be used for random number generation. + + \note Original implementation can be found here: https://eheitzresearch.wordpress.com/762-2/ + */ + class BlueNoiseSamplerVK + { + FFX_SSSR_NON_COPYABLE(BlueNoiseSamplerVK); + + public: + BlueNoiseSamplerVK(); + ~BlueNoiseSamplerVK(); + + BlueNoiseSamplerVK(BlueNoiseSamplerVK&& other) noexcept; + BlueNoiseSamplerVK& BlueNoiseSamplerVK::operator =(BlueNoiseSamplerVK&& other) noexcept; + + // The Sobol sequence buffer. + BufferVK sobol_buffer_; + // The ranking tile buffer for sampling. + BufferVK ranking_tile_buffer_; + // The scrambling tile buffer for sampling. + BufferVK scrambling_tile_buffer_; + }; +} diff --git a/ffx-sssr/src/vk/shader_compiler_vk.cpp b/ffx-sssr/src/vk/shader_compiler_vk.cpp new file mode 100644 index 0000000..74d2e32 --- /dev/null +++ b/ffx-sssr/src/vk/shader_compiler_vk.cpp @@ -0,0 +1,220 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#include "shader_compiler_vk.h" + +#include +#include +#include + +#if FFX_SSSR_DUMP_SHADERS +#include +#endif // FFX_SSSR_DUMP_SHADERS + +#include "reflection_error.h" +#include "utils.h" + +namespace ffx_sssr +{ + /** + The constructor for the ShaderCompilerVK class. + + \param context The context to be used. + */ + ShaderCompilerVK::ShaderCompilerVK(Context& context) + : context_(context) + , dxc_include_handler_(nullptr) + , dxc_compiler_(nullptr) + , dxc_library_(nullptr) + { + } + + /** + The destructor for the ShaderCompilerVK class. + */ + ShaderCompilerVK::~ShaderCompilerVK() + { + if (dxc_compiler_) + dxc_compiler_->Release(); + if (dxc_library_) + dxc_library_->Release(); + if (dxc_include_handler_) + dxc_include_handler_->Release(); + + dxc_dll_support_.Cleanup(); + } + + /** + Compiles the shader file. + + \param filename The location of the shader file. + \param profile The targeted shader model. + \param defines The list of defines to be used. + \param define_count The number of defines. + \return The compiled shader. 
+ */ + ShaderVK ShaderCompilerVK::CompileShaderFile(char const* filename, char const* profile, LPCWSTR* arguments, std::uint32_t argument_count, DxcDefine* defines, std::uint32_t define_count) + { + HRESULT result; + FFX_SSSR_ASSERT(filename && profile); + + if (!LoadShaderCompiler()) + { + return ShaderVK(); + } + + // Compile the shader code from source + IDxcBlobEncoding* dxc_source; + auto const shader_filename = StringToWString(filename); + result = dxc_library_->CreateBlobFromFile(shader_filename.c_str(), nullptr, &dxc_source); + if (FAILED(result)) + throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_OPERATION, "Could not create shader blob from %s", filename); + + ShaderVK shader = CompileShaderBlob(dxc_source, shader_filename.c_str(), profile, arguments, argument_count, defines, define_count); + + dxc_source->Release(); + + return shader; + } + + ShaderVK ShaderCompilerVK::CompileShaderString(char const * string, std::uint32_t string_size, char const* shader_name, char const * profile, LPCWSTR * arguments, std::uint32_t argument_count, DxcDefine * defines, std::uint32_t define_count) + { + HRESULT result; + FFX_SSSR_ASSERT(string && profile); + + if (!LoadShaderCompiler()) + { + return ShaderVK(); + } + + IDxcBlobEncoding* dxc_source; + result = dxc_library_->CreateBlobWithEncodingFromPinned((LPBYTE)string, string_size, 0, &dxc_source); + if (FAILED(result)) + throw reflection_error(context_, FFX_SSSR_STATUS_INVALID_OPERATION, "Could not create blob with encoding from pinned for %s", shader_name); + + auto const wc_shader_name = StringToWString(shader_name); + + ShaderVK shader = CompileShaderBlob(dxc_source, wc_shader_name.c_str(), profile, arguments, argument_count, defines, define_count); + + dxc_source->Release(); + + return shader; + } + + bool ShaderCompilerVK::LoadShaderCompiler() + { + // Load shader compiler + if (!dxc_dll_support_.IsEnabled()) + { + HRESULT result = dxc_dll_support_.Initialize(); + if (FAILED(result)) + throw reflection_error(context_, FFX_SSSR_STATUS_INTERNAL_ERROR, "Unable to initialize dxcompiler.dll support"); + + result = dxc_dll_support_.CreateInstance(CLSID_DxcCompiler, &dxc_compiler_); + if (FAILED(result)) + throw reflection_error(context_, FFX_SSSR_STATUS_INTERNAL_ERROR, "Unable to create DXC compiler instance"); + + result = dxc_dll_support_.CreateInstance(CLSID_DxcLibrary, &dxc_library_); + if (FAILED(result)) + throw reflection_error(context_, FFX_SSSR_STATUS_INTERNAL_ERROR, "Unable to create DXC library instance"); + + result = dxc_library_->CreateIncludeHandler(&dxc_include_handler_); + if (FAILED(result)) + throw reflection_error(context_, FFX_SSSR_STATUS_INTERNAL_ERROR, "Unable to create DXC include handler"); + } + else if (!dxc_compiler_ || !dxc_library_) + { + return false; // failed to create DXC instances + } + + return true; + } + + ShaderVK ShaderCompilerVK::CompileShaderBlob(IDxcBlob * dxc_source, wchar_t const * shader_name, char const * profile, LPCWSTR * arguments, std::uint32_t argument_count, DxcDefine * defines, std::uint32_t define_count) + { + HRESULT result; + + std::vector resolved_defines; + resolved_defines.reserve(define_count); + + for (uint32_t i = 0; i < define_count; ++i) + { + if (defines[i].Name != nullptr) + { + resolved_defines.push_back(defines[i]); + if (resolved_defines.back().Value == nullptr) + { + resolved_defines.back().Value = L"1"; + } + } + } + + ShaderVK shader; + IDxcOperationResult* dxc_result; + auto const target_profile = StringToWString(profile); + result = 
dxc_compiler_->Compile(dxc_source, + shader_name, + L"main", + target_profile.c_str(), + arguments, + argument_count, + resolved_defines.data(), + static_cast(resolved_defines.size()), + dxc_include_handler_, + &dxc_result); + + // Check for compilation errors + if (FAILED(result)) + throw reflection_error(context_, FFX_SSSR_STATUS_INTERNAL_ERROR, "Failed to compile D3D12 shader source code"); + if (FAILED(dxc_result->GetStatus(&result)) || FAILED(result)) + { + IDxcBlobEncoding* dxc_error; + dxc_result->GetErrorBuffer(&dxc_error); + std::string const error(static_cast(dxc_error->GetBufferPointer())); + dxc_result->Release(); + dxc_error->Release(); + throw reflection_error(context_, FFX_SSSR_STATUS_INTERNAL_ERROR, "Unable to compile shader file:\r\n> %s", error.c_str()); + } + + // Get hold of the program blob + IDxcBlob* dxc_program = nullptr; + dxc_result->GetResult(&dxc_program); + FFX_SSSR_ASSERT(dxc_program != nullptr); + dxc_result->Release(); + + // Retrieve the shader bytecode + shader.BytecodeLength = dxc_program->GetBufferSize(); + auto const shader_bytecode = malloc(shader.BytecodeLength); + FFX_SSSR_ASSERT(shader_bytecode != nullptr); // out of memory + memcpy(shader_bytecode, dxc_program->GetBufferPointer(), shader.BytecodeLength); + shader.pShaderBytecode = shader_bytecode; + dxc_program->Release(); + +#if FFX_SSSR_DUMP_SHADERS + std::wstring path = shader_name + std::wstring(L".spirv"); + std::ofstream filestream(path.c_str(), std::ios::binary | std::ios::out); + filestream.write((const char*)shader.pShaderBytecode, shader.BytecodeLength); + filestream.close(); +#endif // FFX_SSSR_DUMP_SHADERS + + return shader; + } +} diff --git a/ffx-sssr/src/vk/shader_compiler_vk.h b/ffx-sssr/src/vk/shader_compiler_vk.h new file mode 100644 index 0000000..9eb85fb --- /dev/null +++ b/ffx-sssr/src/vk/shader_compiler_vk.h @@ -0,0 +1,86 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#pragma once + +#include + +#include +#include + +#include "macros.h" + +namespace ffx_sssr +{ + class Context; + + /** + The ShaderVK class is a simple helper for freeing the shader bytecode upon destruction. 
+ */ + class ShaderVK + { + FFX_SSSR_NON_COPYABLE(ShaderVK); + + public: + inline ShaderVK(); + inline ~ShaderVK(); + + inline operator bool() const; + + inline ShaderVK(ShaderVK&& other) noexcept; + inline ShaderVK& operator =(ShaderVK&& other) noexcept; + + const void* pShaderBytecode; + SIZE_T BytecodeLength; + }; + + /** + The ShaderCompilerVK class is a utility for compiling Vulkan shader code. + */ + class ShaderCompilerVK + { + FFX_SSSR_NON_COPYABLE(ShaderCompilerVK); + + public: + ShaderCompilerVK(Context& context); + ~ShaderCompilerVK(); + + ShaderVK CompileShaderFile(char const* filename, char const* profile, LPCWSTR* arguments = nullptr, std::uint32_t argument_count = 0, DxcDefine* defines = nullptr, std::uint32_t define_count = 0u); + ShaderVK CompileShaderString(char const* string, std::uint32_t string_size, char const* shader_name, char const* profile, LPCWSTR* arguments = nullptr, std::uint32_t argument_count = 0, DxcDefine* defines = nullptr, std::uint32_t define_count = 0u); + + protected: + bool LoadShaderCompiler(); + ShaderVK CompileShaderBlob(IDxcBlob* dxc_source, wchar_t const* shader_name, char const* profile, LPCWSTR* arguments = nullptr, std::uint32_t argument_count = 0, DxcDefine* defines = nullptr, std::uint32_t define_count = 0u); + + // The context to be used. + Context& context_; + // A helper for loading the dxcompiler library. + dxc::DxcDllSupport dxc_dll_support_; + // The Vulkan include handler. + IDxcIncludeHandler* dxc_include_handler_; + // The Vulkan shader compiler. + IDxcCompiler2* dxc_compiler_; + // The Vulkan shader library. + IDxcLibrary* dxc_library_; + }; +} + +#include "shader_compiler_vk.inl" diff --git a/ffx-sssr/src/vk/shader_compiler_vk.inl b/ffx-sssr/src/vk/shader_compiler_vk.inl new file mode 100644 index 0000000..02f9e8d --- /dev/null +++ b/ffx-sssr/src/vk/shader_compiler_vk.inl @@ -0,0 +1,82 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#pragma once + +namespace ffx_sssr +{ + /** + The constructor for the ShaderVK class. + */ + ShaderVK::ShaderVK() + { + memset(this, 0, sizeof(*this)); + } + + /** + The destructor for the ShaderVK class. + */ + ShaderVK::~ShaderVK() + { + free(const_cast(pShaderBytecode)); + } + + /** + The constructor for the ShaderVK class. + + \param other The shader to be moved. 
+ */ + ShaderVK::ShaderVK(ShaderVK&& other) noexcept + { + pShaderBytecode = other.pShaderBytecode; + BytecodeLength = other.BytecodeLength; + + other.pShaderBytecode = nullptr; + } + + /** + Assigns the shader. + + \param other The shader to be moved. + \return The assigned shader. + */ + ShaderVK& ShaderVK::operator =(ShaderVK&& other) noexcept + { + if (this != &other) + { + pShaderBytecode = other.pShaderBytecode; + BytecodeLength = other.BytecodeLength; + + other.pShaderBytecode = nullptr; + } + return *this; + } + + /** + Checks whether the shader is valid. + + \return true if the shader is valid, false otherwise. + */ + ShaderVK::operator bool() const + { + return pShaderBytecode != nullptr; + } +} diff --git a/ffx-sssr/src/vk/upload_buffer_vk.cpp b/ffx-sssr/src/vk/upload_buffer_vk.cpp new file mode 100644 index 0000000..a0c398e --- /dev/null +++ b/ffx-sssr/src/vk/upload_buffer_vk.cpp @@ -0,0 +1,100 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ +#include "upload_buffer_vk.h" + +#include "utils.h" +#include "context.h" +#include "context_vk.h" + +namespace ffx_sssr +{ + /** + The constructor for the UploadBufferVK class. + + \param context The Vulkan context to be used. + \param buffer_size The size of the upload buffer (in bytes). + */ + UploadBufferVK::UploadBufferVK(ContextVK& context, std::size_t buffer_size) + : data_(nullptr) + , context_(context) + , buffer_() + , buffer_size_(buffer_size) + , blocks_(buffer_size) + { + FFX_SSSR_ASSERT(context.GetDevice()); + FFX_SSSR_ASSERT(context.GetPhysicalDevice()); + FFX_SSSR_ASSERT(buffer_size_ > 0); + } + + /** + The destructor for the UploadBufferVK class. + */ + UploadBufferVK::~UploadBufferVK() + { + if (buffer_.mapped_) + { + buffer_.Unmap(); + } + } + + /** + Allocates a buffer. + + \param size The size of the buffer (in bytes). + \param data The pointer to the pinned memory. + \return true if the buffer was allocated successfully, false otherwise. 
+    */
+    bool UploadBufferVK::AllocateBufferInternal(std::size_t size, void*& data)
+    {
+        std::size_t start;
+
+        auto const memory_block = blocks_.AcquireBlock(start, size, 256u);
+
+        if (!memory_block)
+        {
+            return false;
+        }
+
+        data = static_cast<std::uint8_t*>(data_) + start;
+
+        memory_block->block_index_ = context_.GetContext().GetFrameIndex();
+        memory_block->frame_index_ = &context_.GetContext().GetFrameIndex();
+        memory_block->frame_count_before_reuse_ = context_.GetContext().GetFrameCountBeforeReuse();
+
+        return true;
+    }
+
+    /**
+        Initialize and map the upload buffer. Has to be deferred as we can't access the allocator in the constructor yet.
+    */
+    void UploadBufferVK::Initialize()
+    {
+        BufferVK::CreateInfo create_info = {};
+        create_info.memory_property_flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; // TODO: VMA_MEMORY_USAGE_CPU_TO_GPU
+        create_info.buffer_usage_ = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+        create_info.format_ = VK_FORMAT_UNDEFINED;
+        create_info.size_in_bytes_ = buffer_size_;
+
+        buffer_ = BufferVK(context_.GetDevice(), context_.GetPhysicalDevice(), create_info);
+        buffer_.Map(&data_);
+    }
+}
diff --git a/ffx-sssr/src/vk/upload_buffer_vk.h b/ffx-sssr/src/vk/upload_buffer_vk.h
new file mode 100644
index 0000000..9b1df3f
--- /dev/null
+++ b/ffx-sssr/src/vk/upload_buffer_vk.h
@@ -0,0 +1,88 @@
+/**********************************************************************
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+********************************************************************/
+#pragma once
+
+#include
+
+#include "memory.h"
+#include "buffer_vk.h"
+
+namespace ffx_sssr
+{
+    class Context;
+    class ContextVK;
+
+    /**
+        The UploadBufferVK class allows transferring memory from the CPU to the GPU.
+    */
+    class UploadBufferVK
+    {
+        FFX_SSSR_NON_COPYABLE(UploadBufferVK);
+
+    public:
+        UploadBufferVK(ContextVK& context, std::size_t buffer_size);
+        ~UploadBufferVK();
+
+        void Initialize();
+
+        inline std::size_t GetSize() const;
+        inline VkBuffer GetResource() const;
+        inline std::size_t GetOffset(void *data) const;
+
+        template<typename TYPE>
+        bool AllocateBuffer(std::size_t size, TYPE*& data);
+    protected:
+
+        bool AllocateBufferInternal(std::size_t size, void*& data);
+
+        /**
+            The Block class represents an individual synchronizable block used for memory upload.
+        */
+        class Block
+        {
+        public:
+            inline Block();
+
+            inline bool CanBeReused() const;
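+            // Worked example of the reuse rule implemented in upload_buffer_vk.inl (the numbers
+            // are illustrative): with frame_count_before_reuse_ == 3, a block stamped with
+            // block_index_ == 10 can be handed out again once *frame_index_ reaches 13 (13 - 10 >= 3).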
+
+            // The index of the currently calculated frame.
+            std::uint32_t* frame_index_;
+            // The frame at which this block was created.
+            std::uint32_t block_index_;
+            // The number of elapsed frames before re-use.
+            std::uint32_t frame_count_before_reuse_;
+        };
+
+        // The pointer to the mapped data.
+        void* data_;
+        // The context to be used.
+        ContextVK& context_;
+        // The resource to the upload buffer.
+        BufferVK buffer_;
+        // The maximum size of the buffer in bytes.
+        std::size_t buffer_size_;
+        // The available blocks for memory upload.
+        RingBuffer<Block> blocks_;
+    };
+}
+
+#include "upload_buffer_vk.inl"
diff --git a/ffx-sssr/src/vk/upload_buffer_vk.inl b/ffx-sssr/src/vk/upload_buffer_vk.inl
new file mode 100644
index 0000000..1be590b
--- /dev/null
+++ b/ffx-sssr/src/vk/upload_buffer_vk.inl
@@ -0,0 +1,104 @@
+/**********************************************************************
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+********************************************************************/
+#pragma once
+
+namespace ffx_sssr
+{
+    /**
+        The constructor for the Block class.
+    */
+    UploadBufferVK::Block::Block()
+        : frame_index_(nullptr)
+        , block_index_(0u)
+        , frame_count_before_reuse_(0u)
+    {
+    }
+
+    /**
+        Checks whether the memory block can now be re-used.
+
+        \return true if the memory block can be re-used, false otherwise.
+    */
+    bool UploadBufferVK::Block::CanBeReused() const
+    {
+        FFX_SSSR_ASSERT(frame_index_ && *frame_index_ >= block_index_);
+
+        return (*frame_index_ - block_index_ >= frame_count_before_reuse_);
+    }
+
+    /**
+        Gets the size of the upload buffer.
+
+        \return The size of the upload buffer (in bytes).
+    */
+    std::size_t UploadBufferVK::GetSize() const
+    {
+        return buffer_size_;
+    }
+
+    /**
+        Gets the resource for the upload buffer.
+
+        \return The resource for the upload buffer.
+    */
+    VkBuffer UploadBufferVK::GetResource() const
+    {
+        return buffer_.buffer_;
+    }
+
+    /**
+        Gets the offset for the allocated range of memory.
+
+        \param data The allocated range of memory.
+        \return The offset within the upload buffer (in bytes).
+    */
+    std::size_t UploadBufferVK::GetOffset(void* data) const
+    {
+        if (!data)
+            return 0ull;
+        auto const offset = static_cast<std::uint8_t*>(data) - static_cast<std::uint8_t*>(data_);
+        FFX_SSSR_ASSERT(data >= data_ && static_cast<std::size_t>(offset) < buffer_size_); // buffer overflow!
+        return static_cast<std::size_t>(offset);
+    }
+
+    /**
+        Allocates a buffer.
+ + \param size The size of the buffer (in bytes). + \param data The pointer to the pinned memory. + \return true if the buffer was allocated successfully, false otherwise. + */ + template + bool UploadBufferVK::AllocateBuffer(std::size_t size, TYPE*& data) + { + void* data_internal; + + if (!AllocateBufferInternal(Align(size, 256ull), data_internal)) + { + return false; + } + + data = static_cast(data_internal); + + return true; + } +} diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt index 2b673aa..3c225ee 100644 --- a/sample/CMakeLists.txt +++ b/sample/CMakeLists.txt @@ -3,6 +3,13 @@ set(CMAKE_GENERATOR_PLATFORM x64) project (SssrSample_${GFX_API}) +# set options for FidelityFX SSSR +if(GFX_API STREQUAL DX12) + set(FFX_SSSR_D3D12 ON) +elseif(GFX_API STREQUAL VK) + set(FFX_SSSR_VK ON) +endif() + # ouput exe to bin directory SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_HOME_DIRECTORY}/bin) foreach( OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES} ) @@ -18,9 +25,16 @@ set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT $ if(GFX_API STREQUAL DX12) add_subdirectory(src/DX12) + elseif(GFX_API STREQUAL VK) find_package(Vulkan REQUIRED) add_subdirectory(src/VK) + + message(STATUS ${VULKAN_LIBRARIES}) + + # the Vulkan sample requires glslc.exe for GLSL shader compilation. copy it over from the Vulkan SDK + file(COPY $ENV{VULKAN_SDK}/Bin/glslc.exe DESTINATION ${CMAKE_HOME_DIRECTORY}/bin) + else() message(STATUS "----------------------------------------------------------------------------------------") message(STATUS "") diff --git a/sample/README.md b/sample/README.md index c2d5dbe..4453eee 100644 --- a/sample/README.md +++ b/sample/README.md @@ -13,11 +13,12 @@ To build this sample, the following tools are required: - [CMake 3.4](https://cmake.org/download/) - [Visual Studio 2017](https://visualstudio.microsoft.com/downloads/) - [Windows 10 SDK 10.0.17763.0](https://developer.microsoft.com/en-us/windows/downloads/windows-10-sdk) +- [Vulkan SDK 1.2.141.2](https://www.lunarg.com/vulkan-sdk/) - [Python 3.6](https://www.python.org/downloads/release/python-360/) +- [Git LFS](https://git-lfs.github.com/) Then follow these steps: -0) This repository makes use of https://git-lfs.github.com/ to store large files. 1) Clone the repository with its submodules: ``` > git clone https://github.com/GPUOpen-Effects/FidelityFX-SSSR.git --recurse-submodules @@ -29,5 +30,5 @@ Then follow these steps: > GenerateSolutions.bat ``` -3) Open the solution in the DX12 directory, compile and run. +3) Open the solution in the DX12/VK directory, compile and run. diff --git a/sample/build/GenerateSolutions.bat b/sample/build/GenerateSolutions.bat index 4a5d14b..2205458 100644 --- a/sample/build/GenerateSolutions.bat +++ b/sample/build/GenerateSolutions.bat @@ -1,4 +1,9 @@ mkdir DX12 cd DX12 cmake ..\.. -DGFX_API=DX12 %* +cd .. + +mkdir VK +cd VK +cmake ..\.. -DGFX_API=VK %* cd .. 
\ No newline at end of file diff --git a/sample/libs/cauldron b/sample/libs/cauldron index fd91cd7..050b274 160000 --- a/sample/libs/cauldron +++ b/sample/libs/cauldron @@ -1 +1 @@ -Subproject commit fd91cd744d014505daef1780dceee49fd62ce953 +Subproject commit 050b274df95777d688686d017a6926a515a58b30 diff --git a/sample/libs/dxc/CMakeLists.txt b/sample/libs/dxc/CMakeLists.txt new file mode 100644 index 0000000..36641b8 --- /dev/null +++ b/sample/libs/dxc/CMakeLists.txt @@ -0,0 +1,5 @@ +project (DXC) + +add_library(${PROJECT_NAME} SHARED IMPORTED GLOBAL) + +set_property(TARGET ${PROJECT_NAME} PROPERTY IMPORTED_IMPLIB dxcompiler.lib) \ No newline at end of file diff --git a/sample/src/Common/config.json b/sample/src/Common/config.json index b2ceb12..e252835 100644 --- a/sample/src/Common/config.json +++ b/sample/src/Common/config.json @@ -1,4 +1,8 @@ { + "width": 1920, + "height": 1080, + "fullScreen": false, + "benchmark": false, "scenes": [ { "name": "Chess", @@ -14,6 +18,30 @@ "pitch": 0.15, "distance": 1.0, "lookAt": [ 0, 0, 0 ] + }, + "BenchmarkSettings": { + "timeStep": 1, + "timeStart": 0, + "timeEnd": 10000, + "exitWhenTimeEnds": true, + "resultsFilename": "Chess.csv", + "warmUpFrames": 200, + "sequence": { + "keyFrames": [ + { + "time": 0, + "from": [ 0.921573281, 0.149438143, 0.358288825 ], + "to": [ 0, 0, 0 ], + "screenShotName": "Chess_Camera1.jpg" + }, + { + "time": 5000, + "from": [ 0.681404650, 0.0632404834, 0.171917275 ], + "to": [ 0, 0, 0 ], + "screenShotName": "Chess_Camera2.jpg" + } + ] + } } } ] diff --git a/sample/src/DX12/Sources/SampleRenderer.cpp b/sample/src/DX12/Sources/SampleRenderer.cpp index 9c14027..67dd981 100644 --- a/sample/src/DX12/Sources/SampleRenderer.cpp +++ b/sample/src/DX12/Sources/SampleRenderer.cpp @@ -30,7 +30,10 @@ THE SOFTWARE. 
void FfxSssrLoggingFunction(const char* pMessage, void* pUserData) { - Trace(pMessage); + char buffer[4096]; + snprintf(buffer, sizeof(buffer), "%s\n", pMessage); + MessageBox(NULL, buffer, "RtShadows Error", MB_OK | MB_ICONERROR); + exit(-1); } //-------------------------------------------------------------------------------------- @@ -59,11 +62,11 @@ void SampleRenderer::OnCreate(Device* pDevice, SwapChain *pSwapChain) m_CommandListRing.OnCreate(pDevice, backBufferCount, commandListsPerBackBuffer, pDevice->GetGraphicsQueue()->GetDesc()); // Create a 'dynamic' constant buffer - const uint32_t constantBuffersMemSize = 20 * 1024 * 1024; + const uint32_t constantBuffersMemSize = 200 * 1024 * 1024; m_ConstantBufferRing.OnCreate(pDevice, backBufferCount, constantBuffersMemSize, &m_ResourceViewHeaps); // Create a 'static' pool for vertices, indices and constant buffers - const uint32_t staticGeometryMemSize = 128 * 1024 * 1024; + const uint32_t staticGeometryMemSize = 5 * 128 * 1024 * 1024; m_VidMemBufferPool.OnCreate(pDevice, staticGeometryMemSize, USE_VID_MEM, "StaticGeom"); // initialize the GPU time stamps module @@ -326,7 +329,21 @@ void SampleRenderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, m_SssrOutputBuffer.CreateUAV(0, &m_SssrOutputBufferUAV); m_SssrOutputBuffer.CreateUAV(0, &m_SssrOutputBufferUAVGPU); - m_SkyDome.SetDescriptorSpec(0, &m_SssrEnvironmentMapSRV, 0, &m_SssrEnvironmentMapSamplerDesc); + D3D12_STATIC_SAMPLER_DESC environmentSamplerDesc = {}; + m_SkyDome.SetDescriptorSpec(0, &m_SssrEnvironmentMapSRV, 0, &environmentSamplerDesc); + m_SssrEnvironmentMapSamplerDesc.AddressU = environmentSamplerDesc.AddressU; + m_SssrEnvironmentMapSamplerDesc.AddressV = environmentSamplerDesc.AddressV; + m_SssrEnvironmentMapSamplerDesc.AddressW = environmentSamplerDesc.AddressW; + m_SssrEnvironmentMapSamplerDesc.BorderColor[0] = 0; + m_SssrEnvironmentMapSamplerDesc.BorderColor[1] = 0; + m_SssrEnvironmentMapSamplerDesc.BorderColor[2] = 0; + m_SssrEnvironmentMapSamplerDesc.BorderColor[3] = 0; + m_SssrEnvironmentMapSamplerDesc.ComparisonFunc = environmentSamplerDesc.ComparisonFunc; + m_SssrEnvironmentMapSamplerDesc.Filter = environmentSamplerDesc.Filter; + m_SssrEnvironmentMapSamplerDesc.MaxAnisotropy = environmentSamplerDesc.MaxAnisotropy; + m_SssrEnvironmentMapSamplerDesc.MaxLOD = environmentSamplerDesc.MaxLOD; + m_SssrEnvironmentMapSamplerDesc.MinLOD = environmentSamplerDesc.MinLOD; + m_SssrEnvironmentMapSamplerDesc.MipLODBias = environmentSamplerDesc.MipLODBias; FfxSssrD3D12CreateReflectionViewInfo d3d12ReflectionViewInfo = {}; d3d12ReflectionViewInfo.depthBufferHierarchySRV = m_SssrDepthBufferHierarchySRV.GetCPU(); @@ -411,6 +428,8 @@ int SampleRenderer::LoadScene(GLTFCommon *pGLTFCommon, int stage) ImGui::EndPopup(); } + AsyncPool* pAsyncPool = &m_AsyncPool; + // Loading stages // if (stage == 0) @@ -429,7 +448,7 @@ int SampleRenderer::LoadScene(GLTFCommon *pGLTFCommon, int stage) // here we are loading onto the GPU all the textures and the inverse matrices // this data will be used to create the PBR and Depth passes - m_pGLTFTexturesAndBuffers->LoadTextures(); + m_pGLTFTexturesAndBuffers->LoadTextures(pAsyncPool); } else if (stage == 7) { @@ -444,7 +463,8 @@ int SampleRenderer::LoadScene(GLTFCommon *pGLTFCommon, int stage) &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool, - m_pGLTFTexturesAndBuffers + m_pGLTFTexturesAndBuffers, + pAsyncPool ); } @@ -460,7 +480,8 @@ int SampleRenderer::LoadScene(GLTFCommon *pGLTFCommon, int stage) &m_VidMemBufferPool, 
m_pGLTFTexturesAndBuffers, m_MotionVectors.GetFormat(), - m_NormalBuffer.GetFormat() + m_NormalBuffer.GetFormat(), + pAsyncPool ); } } @@ -479,10 +500,13 @@ int SampleRenderer::LoadScene(GLTFCommon *pGLTFCommon, int stage) m_pGLTFTexturesAndBuffers, &m_AmbientLight, false, + false, DXGI_FORMAT_R16G16B16A16_FLOAT, m_SpecularRoughness.GetFormat(), DXGI_FORMAT_UNKNOWN, - 1 + DXGI_FORMAT_UNKNOWN, + 1, + pAsyncPool ); } else if (stage == 10) @@ -811,7 +835,6 @@ void SampleRenderer::RenderScreenSpaceReflections(ID3D12GraphicsCommandList* pCm resolveInfo.depthBufferThickness = pState->depthBufferThickness; resolveInfo.minTraversalOccupancy = pState->minTraversalOccupancy; resolveInfo.samplesPerQuad = pState->samplesPerQuad == 4 ? FFX_SSSR_RAY_SAMPLES_PER_QUAD_4 : (pState->samplesPerQuad == 2 ? FFX_SSSR_RAY_SAMPLES_PER_QUAD_2 : FFX_SSSR_RAY_SAMPLES_PER_QUAD_1); - resolveInfo.eawPassCount = pState->eawPassCount == 3 ? FFX_SSSR_EAW_PASS_COUNT_3 : FFX_SSSR_EAW_PASS_COUNT_1; resolveInfo.roughnessThreshold = pState->roughnessThreshold; status = ffxSssrEncodeResolveReflectionView(m_SssrContext, m_SssrReflectionView, &resolveInfo); @@ -1160,6 +1183,11 @@ void SampleRenderer::OnRender(State *pState, SwapChain *pSwapChain) // Render HUD RenderHUD(pCmdLst2, pSwapChain); + if (pState->screenshotName != NULL) + { + m_SaveTexture.CopyRenderTargetIntoStagingTexture(m_pDevice->GetDevice(), pCmdLst2, pSwapChain->GetCurrentBackBufferResource(), D3D12_RESOURCE_STATE_RENDER_TARGET); + } + // Transition swapchain into present mode Barriers(pCmdLst2, { CD3DX12_RESOURCE_BARRIER::Transition(pSwapChain->GetCurrentBackBufferResource(), D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_PRESENT) @@ -1174,6 +1202,12 @@ void SampleRenderer::OnRender(State *pState, SwapChain *pSwapChain) ID3D12CommandList* CmdListList2[] = { pCmdLst2 }; m_pDevice->GetGraphicsQueue()->ExecuteCommandLists(1, CmdListList2); + + if (pState->screenshotName != NULL) + { + m_SaveTexture.SaveStagingTextureAsJpeg(m_pDevice->GetDevice(), m_pDevice->GetGraphicsQueue(), pState->screenshotName->c_str()); + pState->screenshotName = NULL; + } // Update previous camera matrices pState->camera.UpdatePreviousMatrices(); @@ -1238,8 +1272,8 @@ void SampleRenderer::CreateApplyReflectionsPipeline() D3D12_SHADER_BYTECODE vsShaderByteCode = {}; D3D12_SHADER_BYTECODE psShaderByteCode = {}; DefineList defines; - CompileShaderFromFile("ApplyReflections.hlsl", &defines, "vs_main", "vs_5_1", 0, &vsShaderByteCode); - CompileShaderFromFile("ApplyReflections.hlsl", &defines, "ps_main", "ps_5_1", 0, &psShaderByteCode); + CompileShaderFromFile("ApplyReflections.hlsl", &defines, "vs_main", "-T vs_6_0", &vsShaderByteCode); + CompileShaderFromFile("ApplyReflections.hlsl", &defines, "ps_main", "-T ps_6_0", &psShaderByteCode); D3D12_GRAPHICS_PIPELINE_STATE_DESC desc = {}; desc.VS = vsShaderByteCode; @@ -1335,7 +1369,7 @@ void SampleRenderer::CreateDepthDownsamplePipeline() D3D12_SHADER_BYTECODE shaderByteCode = {}; DefineList defines; - CompileShaderFromFile("DepthDownsample.hlsl", &defines, "main", "cs_6_0", 0, &shaderByteCode); + CompileShaderFromFile("DepthDownsample.hlsl", &defines, "main", "-T cs_6_0", &shaderByteCode); D3D12_COMPUTE_PIPELINE_STATE_DESC desc = {}; desc.pRootSignature = m_DownsampleRootSignature; diff --git a/sample/src/DX12/Sources/SampleRenderer.h b/sample/src/DX12/Sources/SampleRenderer.h index 158c385..44468dd 100644 --- a/sample/src/DX12/Sources/SampleRenderer.h +++ b/sample/src/DX12/Sources/SampleRenderer.h @@ -24,6 +24,8 @@ THE SOFTWARE. 
#include +#include "base/SaveTexture.h" + // We are queuing (backBufferCount + 0.5) frames, so we need to triple buffer the resources that get modified each frame static const int backBufferCount = 3; @@ -65,7 +67,6 @@ class SampleRenderer float depthBufferThickness; int minTraversalOccupancy; int samplesPerQuad; - int eawPassCount; bool bEnableVarianceGuidedTracing; float roughnessThreshold; @@ -74,6 +75,8 @@ class SampleRenderer float denoisingTime; bool showReflectionTarget; + bool isBenchmarking; + const std::string* screenshotName; }; void OnCreate(Device* pDevice, SwapChain *pSwapChain); @@ -203,7 +206,7 @@ class SampleRenderer CBV_SRV_UAV m_SssrOutputBufferUAV; CBV_SRV_UAV m_SssrOutputBufferUAVGPU; CBV_SRV_UAV m_SssrEnvironmentMapSRV; - D3D12_STATIC_SAMPLER_DESC m_SssrEnvironmentMapSamplerDesc; + D3D12_SAMPLER_DESC m_SssrEnvironmentMapSamplerDesc; Texture m_SssrOutputBuffer; RTV m_ApplyPipelineRTV; @@ -225,4 +228,8 @@ class SampleRenderer UINT64 m_GpuTicksPerSecond; + SaveTexture m_SaveTexture; + + // For multithreaded texture loading + AsyncPool m_AsyncPool; }; diff --git a/sample/src/DX12/Sources/SssrSample.cpp b/sample/src/DX12/Sources/SssrSample.cpp index 066d509..f9c9c35 100644 --- a/sample/src/DX12/Sources/SssrSample.cpp +++ b/sample/src/DX12/Sources/SssrSample.cpp @@ -23,8 +23,16 @@ THE SOFTWARE. #include "stdafx.h" #include "SssrSample.h" +#include "base/ShaderCompilerCache.h" + +#ifdef _DEBUG +const bool CPU_BASED_VALIDATION_ENABLED = true; +const bool GPU_BASED_VALIDATION_ENABLED = false; +#else +const bool CPU_BASED_VALIDATION_ENABLED = false; +const bool GPU_BASED_VALIDATION_ENABLED = false; +#endif // _DEBUG -const bool VALIDATION_ENABLED = false; SssrSample::SssrSample(LPCSTR name) : FrameworkWindows(name) { @@ -45,10 +53,9 @@ SssrSample::SssrSample(LPCSTR name) : FrameworkWindows(name) //-------------------------------------------------------------------------------------- void SssrSample::OnCreate(HWND hWnd) { - if (!LoadConfiguration()) - { - exit(0); - } + // get the list of scenes + for (const auto& scene : m_JsonConfigFile["scenes"]) + m_SceneNames.push_back(scene["name"]); DWORD dwAttrib = GetFileAttributes("..\\media\\"); if ((dwAttrib == INVALID_FILE_ATTRIBUTES) || ((dwAttrib & FILE_ATTRIBUTE_DIRECTORY)) == 0) @@ -59,10 +66,11 @@ void SssrSample::OnCreate(HWND hWnd) // Create Device // - m_Device.OnCreate("SssrSample", "Cauldron", VALIDATION_ENABLED, hWnd); + m_Device.OnCreate("SssrSample", "Cauldron", CPU_BASED_VALIDATION_ENABLED, GPU_BASED_VALIDATION_ENABLED, hWnd); m_Device.CreatePipelineCache(); - //init the shader compiler + // Init the shader compiler + InitDirectXCompiler(); CreateShaderCache(); // Create Swapchain @@ -106,12 +114,12 @@ void SssrSample::OnCreate(HWND hWnd) m_State.depthBufferThickness = 0.015f; m_State.minTraversalOccupancy = 4; m_State.samplesPerQuad = 1; - m_State.eawPassCount = 1; m_State.bEnableVarianceGuidedTracing = true; m_State.bShowIntersectionResults = false; m_State.roughnessThreshold = 0.2f; m_State.showReflectionTarget = false; m_State.bDrawScreenSpaceReflections = true; + m_State.screenshotName = NULL; } //-------------------------------------------------------------------------------------- @@ -174,23 +182,6 @@ void SssrSample::SetFullScreen(bool fullscreen) m_Swapchain.SetFullScreen(fullscreen); } -bool SssrSample::LoadConfiguration() -{ - std::ifstream f("config.json"); - if (!f) - { - MessageBox(NULL, "Config file not found!\n", "Cauldron Panic!", MB_ICONERROR); - return false; - } - f >> m_JsonConfigFile; - - // get 
the list of scenes - for (const auto & scene : m_JsonConfigFile["scenes"]) - m_SceneNames.push_back(scene["name"]); - - return true; -} - void SssrSample::BuildUI() { ImGuiStyle& style = ImGui::GetStyle(); @@ -216,55 +207,7 @@ void SssrSample::BuildUI() auto getterLambda = [](void* data, int idx, const char** out_str)->bool { *out_str = ((std::vector *)data)->at(idx).c_str(); return true; }; if (ImGui::Combo("model", &selectedScene, getterLambda, &m_SceneNames, (int)m_SceneNames.size()) || (m_pGltfLoader == NULL)) { - json scene = m_JsonConfigFile["scenes"][selectedScene]; - if (m_pGltfLoader != NULL) - { - //free resources, unload the current scene, and load new scene... - m_Device.GPUFlush(); - - m_Node->UnloadScene(); - m_Node->OnDestroyWindowSizeDependentResources(); - m_Node->OnDestroy(); - m_pGltfLoader->Unload(); - m_Node->OnCreate(&m_Device, &m_Swapchain); - m_Node->OnCreateWindowSizeDependentResources(&m_Swapchain, m_Width, m_Height); - } - - delete(m_pGltfLoader); - m_pGltfLoader = new GLTFCommon(); - - if (m_pGltfLoader->Load(scene["directory"], scene["filename"]) == false) - { - MessageBox(NULL, "The selected model couldn't be found, please check the documentation", "Cauldron Panic!", MB_ICONERROR); - exit(0); - } - - // Load the UI settings, and also some defaults cameras and lights, in case the GLTF has none - { -#define LOAD(j, key, val) val = j.value(key, val) - - // global settings - LOAD(scene, "toneMapper", m_State.toneMapper); - LOAD(scene, "skyDomeType", m_State.skyDomeType); - LOAD(scene, "exposure", m_State.exposure); - LOAD(scene, "iblFactor", m_State.iblFactor); - LOAD(scene, "emmisiveFactor", m_State.emmisiveFactor); - LOAD(scene, "skyDomeType", m_State.skyDomeType); - - // default light - m_State.lightIntensity = scene.value("intensity", 1.0f); - - // default camera (in case the gltf has none) - json camera = scene["camera"]; - LOAD(camera, "yaw", m_Yaw); - LOAD(camera, "pitch", m_Pitch); - LOAD(camera, "distance", m_Distance); - XMVECTOR lookAt = GetVector(GetElementJsonArray(camera, "lookAt", { 0.0, 0.0, 0.0 })); - m_State.camera.LookAt(m_Yaw, m_Pitch, m_Distance, lookAt); - - // indicate the mainloop we started loading a GLTF and it needs to load the rest (textures and geometry) - m_bLoadingScene = true; - } + LoadScene(selectedScene); // bail out as we need to reload everything ImGui::End(); @@ -314,10 +257,6 @@ void SssrSample::BuildUI() ImGui::RadioButton("2", &m_State.samplesPerQuad, 2); ImGui::SameLine(); ImGui::RadioButton("4", &m_State.samplesPerQuad, 4); - ImGui::Text("EAW Pass Count"); ImGui::SameLine(); - ImGui::RadioButton("EAW 1", &m_State.eawPassCount, 1); ImGui::SameLine(); - ImGui::RadioButton("EAW 3", &m_State.eawPassCount, 3); - ImGui::Value("Tile Classification Elapsed Time", 1000 * m_State.tileClassificationTime, "%.1f us"); ImGui::Value("Intersection Elapsed Time", 1000 * m_State.intersectionTime, "%.1f us"); ImGui::Value("Denoising Elapsed Time", 1000 * m_State.denoisingTime, "%.1f us"); @@ -325,24 +264,19 @@ void SssrSample::BuildUI() if (ImGui::CollapsingHeader("Profiler")) { - std::vector timeStamps = m_Node->GetTimingValues(); + const std::vector& timeStamps = m_Node->GetTimingValues(); if (timeStamps.size() > 0) { - for (uint32_t i = 1; i < timeStamps.size(); i++) + for (uint32_t i = 0; i < timeStamps.size(); i++) { - float DeltaTime = ((float)(timeStamps[i].m_microseconds - timeStamps[i - 1].m_microseconds)); - ImGui::Text("%-17s: %7.1f us", timeStamps[i].m_label.c_str(), DeltaTime); + ImGui::Text("%-22s: %7.1f", 
timeStamps[i].m_label.c_str(), timeStamps[i].m_microseconds); } //scrolling data and average computing static float values[128]; - values[127] = (float)(timeStamps.back().m_microseconds - timeStamps.front().m_microseconds); - float average = values[0]; - for (uint32_t i = 0; i < 128 - 1; i++) { values[i] = values[i + 1]; average += values[i]; } - average /= 128; - - ImGui::Text("%-17s: %7.1f us", "Total GPU time", average); - ImGui::PlotLines("", values, 128, 0, "", 0.0f, 30000.0f, ImVec2(0, 80)); + values[127] = timeStamps.back().m_microseconds; + for (uint32_t i = 0; i < 128 - 1; i++) { values[i] = values[i + 1]; } + ImGui::PlotLines("", values, 128, 0, "GPU frame time (us)", 0.0f, 30000.0f, ImVec2(0, 80)); } } @@ -407,6 +341,65 @@ void SssrSample::HandleInput() } } +void SssrSample::LoadScene(int sceneIndex) +{ + json scene = m_JsonConfigFile["scenes"][sceneIndex]; + if (m_pGltfLoader != NULL) + { + //free resources, unload the current scene, and load new scene... + m_Device.GPUFlush(); + + m_Node->UnloadScene(); + m_Node->OnDestroyWindowSizeDependentResources(); + m_Node->OnDestroy(); + m_pGltfLoader->Unload(); + m_Node->OnCreate(&m_Device, &m_Swapchain); + m_Node->OnCreateWindowSizeDependentResources(&m_Swapchain, m_Width, m_Height); + } + + delete(m_pGltfLoader); + m_pGltfLoader = new GLTFCommon(); + + if (m_pGltfLoader->Load(scene["directory"], scene["filename"]) == false) + { + MessageBox(NULL, "The selected model couldn't be found, please check the documentation", "Cauldron Panic!", MB_ICONERROR); + exit(0); + } + + // Load the UI settings, and also some defaults cameras and lights, in case the GLTF has none + { +#define LOAD(j, key, val) val = j.value(key, val) + + // global settings + LOAD(scene, "toneMapper", m_State.toneMapper); + LOAD(scene, "skyDomeType", m_State.skyDomeType); + LOAD(scene, "exposure", m_State.exposure); + LOAD(scene, "iblFactor", m_State.iblFactor); + LOAD(scene, "emmisiveFactor", m_State.emmisiveFactor); + LOAD(scene, "skyDomeType", m_State.skyDomeType); + + // default light + m_State.lightIntensity = scene.value("intensity", 1.0f); + + // default camera (in case the gltf has none) + json camera = scene["camera"]; + LOAD(camera, "yaw", m_Yaw); + LOAD(camera, "pitch", m_Pitch); + LOAD(camera, "distance", m_Distance); + XMVECTOR lookAt = GetVector(GetElementJsonArray(camera, "lookAt", { 0.0, 0.0, 0.0 })); + m_State.camera.LookAt(m_Yaw, m_Pitch, m_Distance, lookAt); + + // set benchmarking state if enabled + if (m_State.isBenchmarking) + { + BenchmarkConfig(scene["BenchmarkSettings"], -1, m_pGltfLoader); + } + + // indicate the mainloop we started loading a GLTF and it needs to load the rest (textures and geometry) + m_bLoadingScene = true; + } +} + //-------------------------------------------------------------------------------------- // // OnResize @@ -448,6 +441,39 @@ void SssrSample::OnResize(uint32_t width, uint32_t height) m_State.camera.SetFov(XM_PI / 4, m_Width, m_Height, 0.1f, 1000.0f); } +void SssrSample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight, bool* pbFullScreen) +{ + // First load configuration + std::ifstream f("config.json"); + if (!f) + { + MessageBox(NULL, "Config file not found!\n", "Cauldron Panic!", MB_ICONERROR); + exit(-1); + } + f >> m_JsonConfigFile; + + // Parse command line and override the config file + try + { + if (strlen(lpCmdLine) > 0) + { + auto j3 = json::parse(lpCmdLine); + m_JsonConfigFile.merge_patch(j3); + } + } + catch (json::parse_error) + { + Trace("Error parsing commandline\n"); + 
exit(0); + } + + // Set values + *pWidth = m_JsonConfigFile.value("width", 1920); + *pHeight = m_JsonConfigFile.value("height", 1080); + *pbFullScreen = m_JsonConfigFile.value("fullScreen", false); + m_State.isBenchmarking = m_JsonConfigFile.value("benchmark", false); +} + //-------------------------------------------------------------------------------------- // // OnRender, updates the state from the UI, animates, transforms and renders the scene @@ -478,6 +504,11 @@ void SssrSample::OnRender() m_bLoadingScene = false; } } + else if (m_pGltfLoader && m_State.isBenchmarking) + { + const std::vector& timeStamps = m_Node->GetTimingValues(); + m_Time = BenchmarkLoop(timeStamps, &m_State.camera, &m_State.screenshotName); + } else { if (m_bShowUI) @@ -533,6 +564,6 @@ int WINAPI WinMain(HINSTANCE hInstance, uint32_t Width = 1920; // 1536; uint32_t Height = 1080; // 841; - // create new DX sample - return RunFramework(hInstance, lpCmdLine, nCmdShow, Width, Height, new SssrSample(Name)); + // create new sample + return RunFramework(hInstance, lpCmdLine, nCmdShow, new SssrSample(Name)); } \ No newline at end of file diff --git a/sample/src/DX12/Sources/SssrSample.h b/sample/src/DX12/Sources/SssrSample.h index 7438d13..80e62ec 100644 --- a/sample/src/DX12/Sources/SssrSample.h +++ b/sample/src/DX12/Sources/SssrSample.h @@ -43,17 +43,19 @@ class SssrSample : public FrameworkWindows { public: SssrSample(LPCSTR name); - void OnCreate(HWND hWnd); - void OnDestroy(); - void OnRender(); - bool OnEvent(MSG msg); - void OnResize(uint32_t Width, uint32_t Height); + void OnCreate(HWND hWnd) override; + void OnDestroy() override; + void OnRender() override; + bool OnEvent(MSG msg) override; + void OnResize(uint32_t Width, uint32_t Height) override; + virtual void OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight, bool* pbFullScreen) override; + void SetFullScreen(bool fullscreen); private: - bool LoadConfiguration(); void BuildUI(); void HandleInput(); + void LoadScene(int sceneIndex); Device m_Device; SwapChain m_Swapchain; diff --git a/sample/src/VK/CMakeLists.txt b/sample/src/VK/CMakeLists.txt new file mode 100644 index 0000000..01542d8 --- /dev/null +++ b/sample/src/VK/CMakeLists.txt @@ -0,0 +1,64 @@ +project (SssrSample_VK) + +add_compile_options(/MP) + +set(Sources_src + Sources/SssrSample.cpp + Sources/SssrSample.h + Sources/SampleRenderer.cpp + Sources/SampleRenderer.h + Sources/stdafx.cpp + Sources/stdafx.h) + +set(Shaders_src + ${CMAKE_CURRENT_SOURCE_DIR}/Shaders/ApplyReflections.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/Shaders/DepthDownsample.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/Shaders/ffx_a.h + ${CMAKE_CURRENT_SOURCE_DIR}/Shaders/ffx_spd.h) + +set(Common_src + ${CMAKE_CURRENT_SOURCE_DIR}/../Common/config.json +) + +source_group("Sources" FILES ${Sources_src}) +source_group("Shaders" FILES ${Shaders_src}) +source_group("Common" FILES ${Common_src}) + +set_source_files_properties(${Shaders_src} PROPERTIES VS_TOOL_OVERRIDE "Text") +set_source_files_properties(${Common_src} PROPERTIES VS_TOOL_OVERRIDE "Text") + +function(copyCommand list dest) + foreach(fullFileName ${list}) + get_filename_component(file ${fullFileName} NAME) + message("Generating custom command for ${fullFileName}") + add_custom_command( + OUTPUT ${dest}/${file} + PRE_BUILD + COMMAND cmake -E make_directory ${dest} + COMMAND cmake -E copy ${fullFileName} ${dest} + MAIN_DEPENDENCY ${fullFileName} + COMMENT "Updating ${file} into ${dest}" + ) + endforeach() +endfunction() + +copyCommand("${Shaders_src}" 
${CMAKE_HOME_DIRECTORY}/bin/ShaderLibVK) +copyCommand("${Common_src}" ${CMAKE_HOME_DIRECTORY}/bin) + +add_executable(${PROJECT_NAME} WIN32 ${Sources_src} ${Shaders_src} ${Common_src}) +target_link_libraries (${PROJECT_NAME} LINK_PUBLIC Cauldron_VK FFX_SSSR ImGUI) + +set_target_properties(${PROJECT_NAME} PROPERTIES VS_DEBUGGER_WORKING_DIRECTORY "${CMAKE_HOME_DIRECTORY}/bin") + +IF (MSVC) + IF (CMAKE_MAJOR_VERSION LESS 3) + MESSAGE(WARNING "CMake version 3.0 or newer is required use build variable TARGET_FILE") + ELSE() + ADD_CUSTOM_COMMAND( + TARGET ${PROJECT_NAME} + POST_BUILD + COMMAND "mt.exe" -manifest \"${CMAKE_CURRENT_SOURCE_DIR}\\dpiawarescaling.manifest\" -inputresource:\"$\"\;\#1 -outputresource:\"$\"\;\#1 + COMMENT "Adding display aware manifest..." + ) + ENDIF() +ENDIF(MSVC) \ No newline at end of file diff --git a/sample/src/VK/Shaders/ApplyReflections.hlsl b/sample/src/VK/Shaders/ApplyReflections.hlsl new file mode 100644 index 0000000..1cb369d --- /dev/null +++ b/sample/src/VK/Shaders/ApplyReflections.hlsl @@ -0,0 +1,98 @@ +/********************************************************************** +Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +********************************************************************/ + +#ifndef SSR_APPLY +#define SSR_APPLY + +[[vk::binding(0)]] Texture2D reflectionTarget : register(t0); +[[vk::binding(1)]] Texture2D normalsTexture : register(t1); +[[vk::binding(2)]] Texture2D specularRoughnessTexture : register(t2); +[[vk::binding(3)]] Texture2D brdfTexture : register(t3); + +[[vk::binding(4)]] SamplerState linearSampler : register(s0); + +[[vk::binding(5)]] cbuffer Constants : register(b0) +{ + float4 viewDirection; + uint showReflectionTarget; + uint drawReflections; +}; + +struct VertexInput +{ + uint vertexId : SV_VertexID; +}; + +struct VertexOut +{ + float4 position : SV_Position; + float2 texcoord : TEXCOORD0; +}; + +VertexOut vs_main(VertexInput input) +{ + VertexOut output; + output.texcoord = float2((input.vertexId << 1) & 2, input.vertexId & 2); + output.position = float4(output.texcoord.xy * 2.0 - 1.0, 0.0, 1.0); + return output; +} + +// Important bits from the PBR shader +float3 getIBLContribution(float perceptualRoughness, float3 specularColor, float3 specularLight, float3 n, float3 v) +{ + float NdotV = clamp(dot(n, v), 0.0, 1.0); + float2 brdfSamplePoint = clamp(float2(NdotV, perceptualRoughness), float2(0.0, 0.0), float2(1.0, 1.0)); + // retrieve a scale and bias to F0. 
See [1], Figure 3
+    float2 brdf = brdfTexture.Sample(linearSampler, brdfSamplePoint).rg;
+
+    float3 specular = specularLight * (specularColor * brdf.x + brdf.y);
+    return specular;
+}
+
+float4 ps_main(VertexOut input) : SV_Target0
+{
+    input.texcoord.y = 1 - input.texcoord.y;
+    float3 radiance = reflectionTarget.Sample(linearSampler, input.texcoord).xyz;
+    float4 specularRoughness = specularRoughnessTexture.Sample(linearSampler, input.texcoord);
+    float3 specularColor = specularRoughness.xyz;
+    float perceptualRoughness = sqrt(specularRoughness.w); // specularRoughness.w contains alphaRoughness
+    float3 normal = 2 * normalsTexture.Sample(linearSampler, input.texcoord).xyz - 1;
+    float3 view = viewDirection.xyz;
+
+    if (showReflectionTarget == 1)
+    {
+        // Show just the reflection view
+        return float4(radiance, 0);
+    }
+    else if (drawReflections == 1)
+    {
+        radiance = getIBLContribution(perceptualRoughness, specularColor, radiance, normal, view);
+        return float4(radiance, 1); // Show the reflections applied to the scene
+    }
+    else
+    {
+        // Show just the scene
+        return float4(0, 0, 0, 1);
+    }
+}
+
+#endif // SSR_APPLY
\ No newline at end of file
diff --git a/sample/src/VK/Shaders/DepthDownsample.hlsl b/sample/src/VK/Shaders/DepthDownsample.hlsl
new file mode 100644
index 0000000..85da089
--- /dev/null
+++ b/sample/src/VK/Shaders/DepthDownsample.hlsl
@@ -0,0 +1,99 @@
+/**********************************************************************
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+********************************************************************/
+
+#ifndef SSR_DEPTH_DOWNSAMPLE
+#define SSR_DEPTH_DOWNSAMPLE
+
+[[vk::binding(0)]] Texture2D g_depth_buffer : register(t0);
+[[vk::binding(1)]] RWTexture2D<float> g_downsampled_depth_buffer[13] : register(u0); // 12 is the maximum amount of supported mips by the downsampling lib (4096x4096). We copy the depth buffer over for simplicity.
+[[vk::binding(2)]] RWBuffer<uint> g_global_atomic : register(u13); // Single atomic counter that stores the number of remaining threadgroups to process.
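// The declarations above feed AMD's single-pass downsampler (SPD): ffx_spd.h only provides the
// traversal logic and calls back into the Spd* functions defined below (SpdLoadSourceImage,
// SpdStore, SpdReduce4 and the atomic-counter helpers), so the reduction policy lives entirely
// in this file. Here SpdReduce4 takes the min() of each 2x2 footprint, so every mip stores the
// minimum depth of the texels it covers, and slot 0 of the UAV array holds an untouched copy of
// the full-resolution depth buffer (hence the +1 offsets in SpdStore and the 5 -> 6 in SpdLoad).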
+ +#define A_GPU +#define A_HLSL +#include "ffx_a.h" + +groupshared float g_group_shared_depth_values[16][16]; +groupshared uint g_group_shared_counter; + +#define DS_FALLBACK + +// Define fetch and store functions +AF4 SpdLoadSourceImage(ASU2 index) { return g_depth_buffer[index].xxxx; } +AF4 SpdLoad(ASU2 index) { return g_downsampled_depth_buffer[6][index].xxxx; } // 5 -> 6 as we store a copy of the depth buffer at index 0 +void SpdStore(ASU2 pix, AF4 outValue, AU1 index) { g_downsampled_depth_buffer[index + 1][pix] = outValue.x; } // + 1 as we store a copy of the depth buffer at index 0 +void SpdIncreaseAtomicCounter() { InterlockedAdd(g_global_atomic[0], 1, g_group_shared_counter); } +AU1 SpdGetAtomicCounter() { return g_group_shared_counter; } +AF4 SpdLoadIntermediate(AU1 x, AU1 y) { + float f = g_group_shared_depth_values[x][y]; + return f.xxxx; +} +void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value) { g_group_shared_depth_values[x][y] = value.x; } +AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3) { return min(min(v0, v1), min(v2,v3)); } + +#include "ffx_spd.h" + +uint GetThreadgroupCount(uint2 image_size) +{ + // Each threadgroup works on 64x64 texels + return ((image_size.x + 63) / 64) * ((image_size.y + 63) / 64); +} + +// Returns mips count of a texture with specified size +float GetMipsCount(float2 texture_size) +{ + float max_dim = max(texture_size.x, texture_size.y); + return 1.0 + floor(log2(max_dim)); +} + +[numthreads(32, 8, 1)] +void main(uint3 did : SV_DispatchThreadID, uint3 gid : SV_GroupID, uint gi : SV_GroupIndex) +{ + float2 depth_image_size = 0; + g_depth_buffer.GetDimensions(depth_image_size.x, depth_image_size.y); + + // Copy most detailed level into the hierarchy and transform it. + uint2 u_depth_image_size = uint2(depth_image_size); + for (int i = 0; i < 2; ++i) + { + for (int j = 0; j < 8; ++j) + { + uint2 idx = uint2(2 * did.x + i, 8 * did.y + j); + if (idx.x < u_depth_image_size.x && idx.y < u_depth_image_size.y) + { + g_downsampled_depth_buffer[0][idx] = g_depth_buffer[idx]; + } + } + } + + float2 image_size = 0; + g_downsampled_depth_buffer[0].GetDimensions(image_size.x, image_size.y); + float mips_count = GetMipsCount(image_size); + uint threadgroup_count = GetThreadgroupCount(image_size); + + SpdDownsample( + AU2(gid.xy), + AU1(gi), + AU1(mips_count), + AU1(threadgroup_count)); +} + +#endif // SSR_DEPTH_DOWNSAMPLE \ No newline at end of file diff --git a/sample/src/VK/Shaders/ffx_a.h b/sample/src/VK/Shaders/ffx_a.h new file mode 100644 index 0000000..b92546e --- /dev/null +++ b/sample/src/VK/Shaders/ffx_a.h @@ -0,0 +1,1907 @@ +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20190530 +// +//============================================================================================================================== +// LICENSE +// ======= +// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) <2014> +// ------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). +// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. +//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<#components>" for signed when required. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +// - Add subgroup ops. +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20190531 - Fixed changed to llabs() because long is int on Windows. +// 20190530 - Updated for new CPU/GPU portability. +// 20190528 - Fix AU1_AH2_x() on HLSL (had incorrectly swapped x and y), fixed asuint() cases. 
+// 20190527 - Added min3/max3 for low precision for HLSL. +// 20190526 - Updated with half approximations, added ARsq*(), and ASat*() for CPU. +// 20190519 - Added more approximations. +// 20190514 - Added long conversions. +// 20190513 - Added the real BFI moved the other one to ABfiM(). +// 20190507 - Added extra remap useful for 2D reductions. +// 20190507 - Started adding wave ops, add parabolic sin/cos. +// 20190505 - Added ASigned*() and friends, setup more auto-typecast, GLSL extensions, etc. +// 20190504 - Added min3/max3 for 32-bit integers. +// 20190503 - Added type reinterpretation for half. +// 20190416 - Added min3/max3 for half. +// 20190405 - Misc bug fixing. +// 20190404 - Cleaned up color conversion code. Switched "splat" to shorter naming "type_". Misc bug fixing. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COMMON +//============================================================================================================================== +#define A_2PI 6.28318530718 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// CPU +// +// +//============================================================================================================================== +// Requires standard C types: stdint.h +// Requires a collection of standard math intrinsics. +// - Requires VS2013 when not using GCC to get exp2() and log2(). +// - https://blogs.msdn.microsoft.com/vcblog/2013/07/19/c99-library-support-in-visual-studio-2013/ +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a minimum subset of functionality compared to the GPU parts. +//============================================================================================================================== +#ifdef A_CPU + // Supporting user defined overrides. 
+ #ifndef A_RESTRICT + #define A_RESTRICT __restrict + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifndef A_STATIC + #define A_STATIC static + #endif +//------------------------------------------------------------------------------------------------------------------------------ + // Same types across CPU and GPU. + // Predicate uses 32-bit integer (C friendly bool). + typedef uint32_t AP1; + typedef float AF1; + typedef double AD1; + typedef uint8_t AB1; + typedef uint16_t AW1; + typedef uint32_t AU1; + typedef uint64_t AL1; + typedef int8_t ASB1; + typedef int16_t ASW1; + typedef int32_t ASU1; + typedef int64_t ASL1; +//------------------------------------------------------------------------------------------------------------------------------ + #define AD1_(a) ((AD1)(a)) + #define AF1_(a) ((AF1)(a)) + #define AL1_(a) ((AL1)(a)) + #define AU1_(a) ((AU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1_(a) ((ASL1)(a)) + #define ASU1_(a) ((ASU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} +//------------------------------------------------------------------------------------------------------------------------------ + #define A_TRUE 1 + #define A_FALSE 0 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// CPU/GPU PORTING +// +//------------------------------------------------------------------------------------------------------------------------------ +// Hackary to get CPU and GPU to share all setup code, without duplicate code paths. +// Unfortunately this is the level of "ugly" that is required since the languages are very different. +// This uses a lower-case prefix for special vector constructs. +// - In C restrict pointers are used. +// - In the shading language, in/inout/out arguments are used. +// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). 
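// To make the porting scheme above concrete, here is a small illustrative CPU-side snippet
// (editorial example, not part of the original header) written against the wrappers defined
// below; with A_CPU defined it expands to plain C arrays and static functions:
//
//   varAF2(uv) = initAF2(0.25f, 0.75f);   // expands to: AF1 uv[2] = {0.25f, 0.75f};
//   varAF2(scaled);                       // expands to: AF1 scaled[2];
//   opAMulOneF2(scaled, uv, 2.0f);        // scaled[i] = uv[i] * 2.0f, and returns 'scaled'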
+//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD1 *A_RESTRICT + #define retAD3 AD1 *A_RESTRICT + #define retAD4 AD1 *A_RESTRICT + #define retAF2 AF1 *A_RESTRICT + #define retAF3 AF1 *A_RESTRICT + #define retAF4 AF1 *A_RESTRICT + #define retAL2 AL1 *A_RESTRICT + #define retAL3 AL1 *A_RESTRICT + #define retAL4 AL1 *A_RESTRICT + #define retAU2 AU1 *A_RESTRICT + #define retAU3 AU1 *A_RESTRICT + #define retAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 AD1 *A_RESTRICT + #define inAD3 AD1 *A_RESTRICT + #define inAD4 AD1 *A_RESTRICT + #define inAF2 AF1 *A_RESTRICT + #define inAF3 AF1 *A_RESTRICT + #define inAF4 AF1 *A_RESTRICT + #define inAL2 AL1 *A_RESTRICT + #define inAL3 AL1 *A_RESTRICT + #define inAL4 AL1 *A_RESTRICT + #define inAU2 AU1 *A_RESTRICT + #define inAU3 AU1 *A_RESTRICT + #define inAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 AD1 *A_RESTRICT + #define inoutAD3 AD1 *A_RESTRICT + #define inoutAD4 AD1 *A_RESTRICT + #define inoutAF2 AF1 *A_RESTRICT + #define inoutAF3 AF1 *A_RESTRICT + #define inoutAF4 AF1 *A_RESTRICT + #define inoutAL2 AL1 *A_RESTRICT + #define inoutAL3 AL1 *A_RESTRICT + #define inoutAL4 AL1 *A_RESTRICT + #define inoutAU2 AU1 *A_RESTRICT + #define inoutAU3 AU1 *A_RESTRICT + #define inoutAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 AD1 *A_RESTRICT + #define outAD3 AD1 *A_RESTRICT + #define outAD4 AD1 *A_RESTRICT + #define outAF2 AF1 *A_RESTRICT + #define outAF3 AF1 *A_RESTRICT + #define outAF4 AF1 *A_RESTRICT + #define outAL2 AL1 *A_RESTRICT + #define outAL3 AL1 *A_RESTRICT + #define outAL4 AL1 *A_RESTRICT + #define outAU2 AU1 *A_RESTRICT + #define outAU3 AU1 *A_RESTRICT + #define outAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD1 x[2] + #define varAD3(x) AD1 x[3] + #define varAD4(x) AD1 x[4] + #define varAF2(x) AF1 x[2] + #define varAF3(x) AF1 x[3] + #define varAF4(x) AF1 x[4] + #define varAL2(x) AL1 x[2] + #define varAL3(x) AL1 x[3] + #define varAL4(x) AL1 x[4] + #define varAU2(x) AU1 x[2] + #define varAU3(x) AU1 x[3] + #define varAU4(x) AU1 x[4] +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) {x,y} + #define initAD3(x,y,z) {x,y,z} + #define initAD4(x,y,z,w) {x,y,z,w} + #define initAF2(x,y) 
{x,y} + #define initAF3(x,y,z) {x,y,z} + #define initAF4(x,y,z,w) {x,y,z,w} + #define initAL2(x,y) {x,y} + #define initAL3(x,y,z) {x,y,z} + #define initAL4(x,y,z,w) {x,y,z,w} + #define initAU2(x,y) {x,y} + #define initAU3(x,y,z) {x,y,z} + #define initAU4(x,y,z,w) {x,y,z,w} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. +//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_labs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs(ASL1_(a)));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} + 
A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);}
+  A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
+ #else
+  A_STATIC AD1 ALog2D1(AD1 a){return log2(a);}
+  A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;}
+ A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;}
+ A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;}
+ A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // These follow the convention that A integer types don't have signage, until they are operated on.
+ A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
+ A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;}
+ A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;}
+ A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;}
+ A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;}
+ A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;}
+ A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));}
+ A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
+  A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
+ #else
+  A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
+  A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
+  A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
+ #else
+  A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
+  A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);}
+ A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));}
+ A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));}
+ A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));}
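// Quick sanity check on the dependent ops above (illustrative note): APowF1 relies on the
// identity a^b = 2^(b*log2(a)), e.g. APowF1(8.0f, 2.0f) = AExp2F1(2.0f * ALog2F1(8.0f))
// = AExp2F1(6.0f) = 64.0f, and ARsqF1(a) composes the reciprocal and square-root helpers
// to give 1/sqrt(a).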
+//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} + A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} + A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} + A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} + A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} + A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 
a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} + A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} + A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} + A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} + A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} + A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} + A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} + A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} + A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 
a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return 
d;} + A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} + A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;} + A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;} + A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;} + A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF FLOAT PACKING +//============================================================================================================================== + // Convert float to half (in lower 16-bits of output). + // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + // Supports denormals. 
+ // Conversion rules are to make computations possibly "safer" on the GPU, + // -INF & -NaN -> -65504 + // +INF & +NaN -> +65504 + A_STATIC AU1 AU1_AH1_AF1(AF1 f){ + static AW1 base[512]={ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, + 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, + 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, + 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, + 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 
0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; + static AB1 shift[512]={ + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; + union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} +//------------------------------------------------------------------------------------------------------------------------------ + // Used to output packed constant. 
+ A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GLSL +// +// +//============================================================================================================================== +#if defined(A_GLSL) && defined(A_GPU) + #ifndef A_SKIP_EXT + #ifdef A_HALF + #extension GL_EXT_shader_16bit_storage:require + #extension GL_EXT_shader_explicit_arithmetic_types:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_LONG + #extension GL_ARB_gpu_shader_int64:require + // TODO: Fixme to more portable extension!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ #extension GL_NV_shader_atomic_int64:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_WAVE + #extension GL_KHR_shader_subgroup_arithmetic:require + #extension GL_KHR_shader_subgroup_ballot:require + #extension GL_KHR_shader_subgroup_quad:require + #extension GL_KHR_shader_subgroup_shuffle:require + #endif + #endif +//============================================================================================================================== + #define AP1 bool + #define AP2 bvec2 + #define AP3 bvec3 + #define AP4 bvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 vec2 + #define AF3 vec3 + #define AF4 vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uvec2 + #define AU3 uvec3 + #define AU4 uvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 ivec2 + #define ASU3 ivec3 + #define ASU4 ivec4 +//============================================================================================================================== + #define AF1_AU1(x) uintBitsToFloat(AU1(x)) + #define AF2_AU2(x) uintBitsToFloat(AU2(x)) + #define AF3_AU3(x) uintBitsToFloat(AU3(x)) + #define AF4_AU4(x) uintBitsToFloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) floatBitsToUint(AF1(x)) + #define AU2_AF2(x) floatBitsToUint(AF2(x)) + #define AU3_AF3(x) floatBitsToUint(AF3(x)) + #define AU4_AF4(x) floatBitsToUint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2_AF2 packHalf2x16 + #define AU1_AW2Unorm_AF2 packUnorm2x16 + #define AU1_AB4Unorm_AF4 packUnorm4x8 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF2_AH2_AU1 unpackHalf2x16 + #define AF2_AW2Unorm_AU1 unpackUnorm2x16 + #define AF4_AB4Unorm_AU1 unpackUnorm4x8 +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} 
+//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #define AB1 uint8_t + #define AB2 u8vec2 + #define AB3 u8vec3 + #define AB4 u8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASB1 int8_t + #define ASB2 i8vec2 + #define ASB3 i8vec3 + #define ASB4 i8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + AB1 AB1_x(AB1 a){return AB1(a);} + AB2 AB2_x(AB1 a){return AB2(a,a);} + AB3 AB3_x(AB1 a){return AB3(a,a,a);} + AB4 AB4_x(AB1 a){return AB4(a,a,a,a);} + #define AB1_(a) AB1_x(AB1(a)) + #define AB2_(a) AB2_x(AB1(a)) + #define AB3_(a) AB3_x(AB1(a)) + #define AB4_(a) AB4_x(AB1(a)) + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #define AH1 float16_t + #define AH2 f16vec2 + #define AH3 f16vec3 + #define AH4 f16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 u16vec2 + #define AW3 u16vec3 + #define AW4 u16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 i16vec2 + #define ASW3 i16vec3 + #define ASW4 i16vec4 +//============================================================================================================================== + #define AH2_AU1(x) unpackFloat2x16(AU1(x)) + AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) unpackUint2x16(AU1(x)) + #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x))) 
+//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2(x) packFloat2x16(AH2(x)) + AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) packUint2x16(AW2(x)) + #define AU2_AW4(x) unpack32(packUint4x16(AW4(x))) +//============================================================================================================================== + #define AW1_AH1(x) halfBitsToUint16(AH1(x)) + #define AW2_AH2(x) halfBitsToUint16(AH2(x)) + #define AW3_AH3(x) halfBitsToUint16(AH3(x)) + #define AW4_AH4(x) halfBitsToUint16(AH4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) + #define AH2_AW2(x) uint16BitsToHalf(AW2(x)) + #define AH3_AW3(x) uint16BitsToHalf(AW3(x)) + #define AH4_AW4(x) uint16BitsToHalf(AW4(x)) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFractH1(AH1 x){return fract(x);} + AH2 AFractH2(AH2 x){return fract(x);} + AH3 AFractH3(AH3 x){return fract(x);} + AH4 AFractH4(AH4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of max3. 
+ AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of min3. + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} + AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;} + AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;} + AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} + AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);} + AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);} + AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} + AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));} + AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));} + AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 dvec2 + #define AD3 dvec3 + #define AD4 dvec4 
+//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 x){return fract(x);} + AD2 AFractD2(AD2 x){return fract(x);} + AD3 AFractD3(AD3 x){return fract(x);} + AD4 AFractD4(AD4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} + AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;} + AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;} + AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} + AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);} + AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);} + AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} + AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));} + AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));} + AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #define AL1 uint64_t + #define AL2 u64vec2 + #define AL3 u64vec3 + #define AL4 u64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1 int64_t + #define ASL2 i64vec2 + #define ASL3 i64vec3 + #define ASL4 i64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AL1_AU2(x) packUint2x32(AU2(x)) + #define AU2_AL1(x) unpackUint2x32(AL1(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AL1_x(AL1 a){return AL1(a);} + AL2 AL2_x(AL1 a){return AL2(a,a);} + AL3 AL3_x(AL1 a){return AL3(a,a,a);} + AL4 AL4_x(AL1 a){return AL4(a,a,a,a);} + #define AL1_(a) AL1_x(AL1(a)) + 
#define AL2_(a) AL2_x(AL1(a)) + #define AL3_(a) AL3_x(AL1(a)) + #define AL4_(a) AL4_x(AL1(a)) +//============================================================================================================================== + AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} + AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));} + AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));} + AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));} + AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));} + AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));} + AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));} + AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));} + AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));} + AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// WAVE OPERATIONS +//============================================================================================================================== + #ifdef A_WAVE + AF1 AWaveAdd(AF1 v){return subgroupAdd(v);} + AF2 AWaveAdd(AF2 v){return subgroupAdd(v);} + AF3 AWaveAdd(AF3 v){return subgroupAdd(v);} + AF4 AWaveAdd(AF4 v){return subgroupAdd(v);} + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// HLSL +// +// +//============================================================================================================================== +#if defined(A_HLSL) && defined(A_GPU) + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 
+//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 float2 + #define AF3 float3 + #define AF4 float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uint2 + #define AU3 uint3 + #define AU4 uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 int2 + #define ASU3 int3 + #define ASU4 int4 +//============================================================================================================================== + #define AF1_AU1(x) asfloat(AU1(x)) + #define AF2_AU2(x) asfloat(AU2(x)) + #define AF3_AU3(x) asfloat(AU3(x)) + #define AF4_AU4(x) asfloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) asuint(AF1(x)) + #define AU2_AF2(x) asuint(AF2(x)) + #define AU3_AF3(x) asuint(AF3(x)) + #define AU4_AF4(x) asuint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} + #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) + #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} + #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1<>off)&mask;} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
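A minimal HLSL usage sketch for the bitfield helpers above, assuming ABfiM keeps the conventional behaviour described in the earlier comment (mask=(1<<bits)-1); PackCoord16x2 and UnpackCoord16x2 are illustrative names, not functions used by the SSSR passes.

AU1 PackCoord16x2(AU1 x,AU1 y)
{
    // ABfiM(src,ins,bits) keeps the low 16 bits of 'x' and the remaining bits of 'y<<16'.
    return ABfiM(y<<16,x,16);
}
AU2 UnpackCoord16x2(AU1 p)
{
    // ABfe(src,off,bits) extracts a bitfield, recovering x from the low half and y from the high half.
    return AU2(ABfe(p,0,16),ABfe(p,16,16));
}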
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #define AH1 min16float + #define AH2 min16float2 + #define AH3 min16float3 + #define AH4 min16float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 min16uint + #define AW2 min16uint2 + #define AW3 min16uint3 + #define AW4 min16uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 min16int + #define ASW2 min16int2 + #define ASW3 min16int3 + #define ASW4 min16int4 +//============================================================================================================================== + // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). + // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ + AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} + AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} + AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} + AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} + #define AH2_AU1(x) AH2_AU1_x(AU1(x)) + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) AW2_AU1_x(AU1(x)) + #define AW4_AU2(x) AW4_AU2_x(AU2(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} + AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} + AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} + AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} + #define AU1_AH2(x) AU1_AH2_x(AH2(x)) + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) AU1_AW2_x(AW2(x)) + #define AU2_AW4(x) AU2_AW4_x(AW4(x)) +//============================================================================================================================== + // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ #define AW1_AH1(x) AW1(asuint(AF1(x))) + #define AW2_AH2(x) AW2(asuint(AF2(x))) + #define AW3_AH3(x) AW3(asuint(AF3(x))) + #define AW4_AH4(x) AW4(asuint(AF4(x))) +//------------------------------------------------------------------------------------------------------------------------------ + // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #define AH1_AW1(x) AH1(asfloat(AU1(x))) + #define AH2_AW2(x) AH2(asfloat(AU2(x))) + #define AH3_AW3(x) AH3(asfloat(AU3(x))) + #define AH4_AW4(x) AH4(asfloat(AU4(x))) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + // V_FRACT_F16 (note DX frac() is different). 
+ AH1 AFractH1(AH1 x){return x-floor(x);} + AH2 AFractH2(AH2 x){return x-floor(x);} + AH3 AFractH3(AH3 x){return x-floor(x);} + AH4 AFractH4(AH4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return rcp(x);} + AH2 ARcpH2(AH2 x){return rcp(x);} + AH3 ARcpH3(AH3 x){return rcp(x);} + AH4 ARcpH4(AH4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return rsqrt(x);} + AH2 ARsqH2(AH2 x){return rsqrt(x);} + AH3 ARsqH3(AH3 x){return rsqrt(x);} + AH4 ARsqH4(AH4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return saturate(x);} + AH2 ASatH2(AH2 x){return saturate(x);} + AH3 ASatH3(AH3 x){return saturate(x);} + AH4 ASatH4(AH4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
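A minimal HLSL sketch of the manual FP16 unpack pattern referenced above (assuming A_HALF is defined); g_packedPair, LoadHalfPair and StoreHalfPair are illustrative names rather than resources from the sample.

Buffer<AU1> g_packedPair; // raw 32-bit words, each holding two packed halves

AH2 LoadHalfPair(AU1 i)
{
    // Keep the buffer typed as plain uint and unpack in ALU, rather than using packed types in the buffer itself.
    return AH2_AU1(g_packedPair[i]);
}
AU1 StoreHalfPair(AH2 v)
{
    // Repack two halves into one 32-bit word before writing it back out.
    return AU1_AH2(v);
}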
+//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 double2 + #define AD3 double3 + #define AD4 double4 +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 a){return a-floor(a);} + AD2 AFractD2(AD2 a){return a-floor(a);} + AD3 AFractD3(AD3 a){return a-floor(a);} + AD4 AFractD4(AD4 a){return a-floor(a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return rcp(x);} + AD2 ARcpD2(AD2 x){return rcp(x);} + AD3 ARcpD3(AD3 x){return rcp(x);} + AD4 ARcpD4(AD4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return rsqrt(x);} + AD2 ARsqD2(AD2 x){return rsqrt(x);} + AD3 ARsqD3(AD3 x){return rsqrt(x);} + AD4 ARsqD4(AD4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return saturate(x);} + AD2 ASatD2(AD2 x){return saturate(x);} + AD3 ASatD3(AD3 x){return saturate(x);} + AD4 ASatD4(AD4 x){return saturate(x);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU COMMON +// +// +//============================================================================================================================== +#ifdef A_GPU + // Negative and positive infinity. + #define A_INFP_F AF1_AU1(0x7f800000u) + #define A_INFN_F AF1_AU1(0xff800000u) +//------------------------------------------------------------------------------------------------------------------------------ + // Copy sign from 's' to positive 'd'. + AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));} + AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));} + AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));} + AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Single operation to return (useful to create a mask to use in lerp for branch free logic), + // m=NaN := 0 + // m>=0 := 0 + // m<0 := 1 + // Uses the following useful floating point logic, + // saturate(+a*(-INF)==-INF) := 0 + // saturate( 0*(-INF)== NaN) := 0 + // saturate(-a*(-INF)==+INF) := 1 + AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));} + AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));} + AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));} + AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));} +//============================================================================================================================== + #ifdef A_HALF + #define A_INFP_H AH1_AW1(0x7c00u) + #define A_INFN_H AH1_AW1(0xfc00u) +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));} + AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));} + AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));} + AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));} + AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));} + AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));} + AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));} + #endif +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
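A minimal HLSL sketch of how the sign helpers above can drive branch-free selection; SelectNegative and RestoreSign are illustrative names.

AF1 SelectNegative(AF1 m,AF1 valueIfNegative,AF1 valueOtherwise)
{
    // ASignedF1(m) is 1.0 for m<0 and 0.0 for m>=0 or NaN, so lerp() acts as a branch-free select.
    return lerp(valueOtherwise,valueIfNegative,ASignedF1(m));
}
AF1 RestoreSign(AF1 magnitude,AF1 signSource)
{
    // ACpySgnF1 copies the sign bit of 'signSource' onto the positive 'magnitude'.
    return ACpySgnF1(magnitude,signSource);
}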
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These support only positive inputs. +// Did not see value yet in specialization for range. +// Using quick testing, ended up mostly getting the same "best" approximation for various ranges. +// With hardware that can co-execute transcendentals, the value in approximations could be less than expected. +// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total. +// And co-execution would require a compiler interleaving a lot of independent work for packed usage. +//------------------------------------------------------------------------------------------------------------------------------ +// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total). +// Same with sqrt(), as this could be x*rsq() (7 ops). +//------------------------------------------------------------------------------------------------------------------------------ +// IDEAS +// ===== +// - Polaris hardware has 16-bit support, but non-double rate. +// Could be possible still get part double rate for some of this logic, +// by clearing out the lower half's sign when necessary and using 32-bit ops... +//============================================================================================================================== + #ifdef A_HALF + // Minimize squared error across full positive range, 2 ops. + // The 0x1de2 based approximation maps {0 to 1} input maps to < 1 output. + AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));} + AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));} +//------------------------------------------------------------------------------------------------------------------------------ + // Lower precision estimation, 1 op. + // Minimize squared error across {smallest normal to 16384.0}. + AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));} + AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));} +//------------------------------------------------------------------------------------------------------------------------------ + // Medium precision estimation, one Newton Raphson iteration, 3 ops. + AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));} + AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // Minimize squared error across {smallest normal to 16384.0}, 2 ops. 
+ AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));}
+ AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// FLOAT APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
+// - Idea dates back to SGI, then to Quake 3, etc.
+// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
+// - sqrt(x)=rsqrt(x)*x
+// - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
+// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
+//------------------------------------------------------------------------------------------------------------------------------
+// The constants below come from a less exhaustive search for the optimum.
+// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
+// So these match up well with the half approximations.
+//==============================================================================================================================
+ AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));}
+ AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));}
+ AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));}
+ AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// PARABOLIC SIN & COS
+//------------------------------------------------------------------------------------------------------------------------------
+// Approximate answers to transcendental questions.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+// - Verify packed math ABS is correctly doing an AND.
+//==============================================================================================================================
+ // Valid input range is {-1 to 1} representing {0 to 2 pi}.
+ // Output range is {-1/4 to 1/4} representing {-1 to 1}.
+ AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD.
+ AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + // For a packed {sin,cos} pair, + // - Native takes 16 clocks and 4 issue slots (no packed transcendentals). + // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed). + AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA + AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COLOR CONVERSIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These are all linear to/from some other space (where 'linear' has been shortened out of the function name). +// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'. +// These are branch free implementations. +// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion. +//------------------------------------------------------------------------------------------------------------------------------ +// TRANSFER FUNCTIONS +// ================== +// 709 ..... Rec709 used for some HDTVs +// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native +// Pq ...... PQ native for HDR10 +// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type +// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations) +//------------------------------------------------------------------------------------------------------------------------------ +// FOR PQ +// ====== +// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2. +// All constants are only specified to FP32 precision. +// External PQ source reference, +// - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl +//------------------------------------------------------------------------------------------------------------------------------ +// PACKED VERSIONS +// =============== +// These are the A*H2() functions. +// There is no PQ functions as FP16 seemed to not have enough precision for the conversion. +// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors. +// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least). +//------------------------------------------------------------------------------------------------------------------------------ +// NOTES +// ===== +// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case. 
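+//------------------------------------------------------------------------------------------------------------------------------
+// Sketch only: AToSrgbF1() used for a manual linear->sRGB store to a UNORM UAV without hardware sRGB
+// store conversion, where 'dst', 'p' and 'c' are hypothetical names bound by the integrating shader:
+//   dst[p]=AF4(AToSrgbF1(c.r),AToSrgbF1(c.g),AToSrgbF1(c.b),c.a);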
+//============================================================================================================================== + AF1 ATo709F1(AF1 c){return max(min(c*AF1_(4.5),AF1_(0.018)),AF1_(1.099)*pow(c,AF1_(0.45))-AF1_(0.099));} +//------------------------------------------------------------------------------------------------------------------------------ + // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma(). + AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,rcpX);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302)); + return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToSrgbF1(AF1 c){return max(min(c*AF1_(12.92),AF1_(0.0031308)),AF1_(1.055)*pow(c,AF1_(0.41666))-AF1_(0.055));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToTwoF1(AF1 c){return sqrt(c);} +//============================================================================================================================== + AF1 AFrom709F1(AF1 c){return max(min(c*AF1_(1.0/4.5),AF1_(0.081)), + pow((c+AF1_(0.099))*(AF1_(1.0)/(AF1_(1.099))),AF1_(1.0/0.45)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833)); + return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromSrgbF1(AF1 c){return max(min(c*AF1_(1.0/12.92),AF1_(0.04045)), + pow((c+AF1_(0.055))*(AF1_(1.0)/AF1_(1.055)),AF1_(2.4)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromTwoF1(AF1 c){return c*c;} +//============================================================================================================================== + #ifdef A_HALF + AH2 ATo709H2(AH2 c){return max(min(c*AH2_(4.5),AH2_(0.018)),AH2_(1.099)*pow(c,AH2_(0.45))-AH2_(0.099));} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 AToSrgbH2(AH2 c){return max(min(c*AH2_(12.92),AH2_(0.0031308)),AH2_(1.055)*pow(c,AH2_(0.41666))-AH2_(0.055));} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 AToTwoH2(AH2 c){return sqrt(c);} + #endif +//============================================================================================================================== + #ifdef A_HALF + AH2 AFrom709H2(AH2 c){return max(min(c*AH2_(1.0/4.5),AH2_(0.081)), + pow((c+AH2_(0.099))*(AH2_(1.0)/(AH2_(1.099))),AH2_(1.0/0.45)));} 
+//------------------------------------------------------------------------------------------------------------------------------ + AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 AFromSrgbH2(AH2 c){return max(min(c*AH2_(1.0/12.92),AH2_(0.04045)), + pow((c+AH2_(0.055))*(AH2_(1.0)/AH2_(1.055)),AH2_(2.4)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 AFromTwoH2(AH2 c){return c*c;} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CS REMAP +//============================================================================================================================== + // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear. + // 543210 + // ====== + // ..xxx. + // yy...y + AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} +//============================================================================================================================== + // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions. + // 543210 + // ====== + // .xx..x + // y..yy. + // Details, + // LANE TO 8x8 MAPPING + // =================== + // 00 01 08 09 10 11 18 19 + // 02 03 0a 0b 12 13 1a 1b + // 04 05 0c 0d 14 15 1c 1d + // 06 07 0e 0f 16 17 1e 1f + // 20 21 28 29 30 31 38 39 + // 22 23 2a 2b 32 33 3a 3b + // 24 25 2c 2d 34 35 3c 3d + // 26 27 2e 2f 36 37 3e 3f + AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// REFERENCE +// +//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF 
+//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/128 +// 1024 : 8 +// 2047 : a little less than 16 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. 
+//============================================================================================================================== +#ifdef A_GPU + #define A_TRUE true + #define A_FALSE false + #define A_STATIC +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD2 + #define retAD3 AD3 + #define retAD4 AD4 + #define retAF2 AF2 + #define retAF3 AF3 + #define retAF4 AF4 + #define retAL2 AL2 + #define retAL3 AL3 + #define retAL4 AL4 + #define retAU2 AU2 + #define retAU3 AU3 + #define retAU4 AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 in AD2 + #define inAD3 in AD3 + #define inAD4 in AD4 + #define inAF2 in AF2 + #define inAF3 in AF3 + #define inAF4 in AF4 + #define inAL2 in AL2 + #define inAL3 in AL3 + #define inAL4 in AL4 + #define inAU2 in AU2 + #define inAU3 in AU3 + #define inAU4 in AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 inout AD2 + #define inoutAD3 inout AD3 + #define inoutAD4 inout AD4 + #define inoutAF2 inout AF2 + #define inoutAF3 inout AF3 + #define inoutAF4 inout AF4 + #define inoutAL2 inout AL2 + #define inoutAL3 inout AL3 + #define inoutAL4 inout AL4 + #define inoutAU2 inout AU2 + #define inoutAU3 inout AU3 + #define inoutAU4 inout AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 out AD2 + #define outAD3 out AD3 + #define outAD4 out AD4 + #define outAF2 out AF2 + #define outAF3 out AF3 + #define outAF4 out AF4 + #define outAL2 out AL2 + #define outAL3 out AL3 + #define outAL4 out AL4 + #define outAU2 out AU2 + #define outAU3 out AU3 + #define outAU4 out AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD2 x + #define varAD3(x) AD3 x + #define varAD4(x) AD4 x + #define varAF2(x) AF2 x + #define varAF3(x) AF3 x + #define varAF4(x) AF4 x + #define varAL2(x) AL2 x + #define varAL3(x) AL3 x + #define varAL4(x) AL4 x + #define varAU2(x) AU2 x + #define varAU3(x) AU3 x + #define varAU4(x) AU4 x +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) AD2(x,y) + #define initAD3(x,y,z) AD3(x,y,z) + #define initAD4(x,y,z,w) AD4(x,y,z,w) + #define initAF2(x,y) AF2(x,y) + #define initAF3(x,y,z) AF3(x,y,z) + #define initAF4(x,y,z,w) AF4(x,y,z,w) + #define initAL2(x,y) AL2(x,y) + #define initAL3(x,y,z) AL3(x,y,z) + #define initAL4(x,y,z,w) AL4(x,y,z,w) + #define initAU2(x,y) AU2(x,y) + #define initAU3(x,y,z) AU3(x,y,z) + #define initAU4(x,y,z,w) AU4(x,y,z,w) 
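+//------------------------------------------------------------------------------------------------------------------------------
+// Sketch only: with the macros above the same function text compiles as GLSL or HLSL; this hypothetical
+// helper expands on this path to plain HLSL ('in'/'out' qualifiers, float constructors):
+//   AF4 APackExampleF4(inAF2 a,outAF2 b){b=a*AF2_(2.0);varAF4(r)=initAF4(a.x,a.y,b.x,b.y);return r;}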
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// SCALAR RETURN OPS
+//==============================================================================================================================
+ #define AAbsD1(a) abs(AD1(a))
+ #define AAbsF1(a) abs(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ACosD1(a) cos(AD1(a))
+ #define ACosF1(a) cos(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ADotD2(a,b) dot(AD2(a),AD2(b))
+ #define ADotD3(a,b) dot(AD3(a),AD3(b))
+ #define ADotD4(a,b) dot(AD4(a),AD4(b))
+ #define ADotF2(a,b) dot(AF2(a),AF2(b))
+ #define ADotF3(a,b) dot(AF3(a),AF3(b))
+ #define ADotF4(a,b) dot(AF4(a),AF4(b))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AExp2D1(a) exp2(AD1(a))
+ #define AExp2F1(a) exp2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AFloorD1(a) floor(AD1(a))
+ #define AFloorF1(a) floor(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ALog2D1(a) log2(AD1(a))
+ #define ALog2F1(a) log2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMaxD1(a,b) max(a,b)
+ #define AMaxF1(a,b) max(a,b)
+ #define AMaxL1(a,b) max(a,b)
+ #define AMaxU1(a,b) max(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMinD1(a,b) min(a,b)
+ #define AMinF1(a,b) min(a,b)
+ #define AMinL1(a,b) min(a,b)
+ #define AMinU1(a,b) min(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASinD1(a) sin(AD1(a))
+ #define ASinF1(a) sin(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASqrtD1(a) sqrt(AD1(a))
+ #define ASqrtF1(a) sqrt(AF1(a))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ #define APowD1(a,b) pow(AD1(a),AF1(b))
+ #define APowF1(a,b)
pow(AF1(a),AF1(b)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + #ifdef A_DUBL + AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} + AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} + AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} + AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} + AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} + AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} + AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} + AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} + AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} + AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} + AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} + AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;} + AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} + AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} + AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} + AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} + AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} + AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} + AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} + AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} + AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} + AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} + AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} + #endif +//============================================================================================================================== + AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} + AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} + AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;} + AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} + AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} + AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} + AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} + AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} + AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} + AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} + AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} + AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} + AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} + AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} + AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} + AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} + AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} + AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} + AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} + AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} + AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} + AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} + AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} +#endif diff --git a/sample/src/VK/Shaders/ffx_spd.h b/sample/src/VK/Shaders/ffx_spd.h new file mode 100644 index 0000000..68c9ef4 --- /dev/null +++ b/sample/src/VK/Shaders/ffx_spd.h @@ -0,0 +1,1164 @@ +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// [FFX SPD] Single Pass Downsampler 1.0 +// +//============================================================================================================================== +// LICENSE +// ======= +// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved. +// ------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------------------------------------------------------ +// INTEGRATION SUMMARY FOR CPU +// =========================== +// // you need to provide as constants: +// // number of mip levels to be computed (maximum is 12) +// // number of total thread groups: ((widthInPixels+63)>>6) * ((heightInPixels+63)>>6) +// ... 
+// // Dispatch the shader such that each thread group works on a 64x64 sub-tile of the source image +// vkCmdDispatch(cmdBuf,(widthInPixels+63)>>6,(heightInPixels+63)>>6,1); + +//------------------------------------------------------------------------------------------------------------------------------ +// INTEGRATION SUMMARY FOR GPU +// =========================== + +// [SAMPLER] - if you want to use a sampler with linear filtering for loading the source image +// follow additionally the instructions marked with [SAMPLER] +// add following define: +// #SPD_LINEAR_SAMPLER +// this is recommended, as using one sample() with linear filter to reduce 2x2 is faster +// than 4x load() plus manual averaging + +// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT. +// // Note: If you use UNORM/SRGB format, you need to convert to linear space +// // when using UAV load() and store() +// // conversion to linear (load function): x*x +// // conversion from linear (store function): sqrt() + +// // source image +// GLSL: layout(set=0,binding=0,rgba16f)uniform image2D imgSrc; +// [SAMPLER]: layout(set=0,binding=0)uniform texture2D imgSrc; +// HLSL: [[vk::binding(0)]] Texture2D imgSrc :register(u0); + +// // destination -> 12 is the maximum number of mips supported by DS +// GLSL: layout(set=0,binding=1,rgba16f) uniform coherent image2D imgDst[12]; +// HLSL: [[vk::binding(1)]] globallycoherent RWTexture2D imgDst[12] :register(u1); + +// // global atomic counter - MUST be initialized to 0 +// // GLSL: +// layout(std430, set=0, binding=2) buffer globalAtomicBuffer +// { +// uint counter; +// } globalAtomic; +// // HLSL: +// struct globalAtomicBuffer +// { +// uint counter; +// }; +// [[vk::binding(2)]] RWStructuredBuffer globalAtomic; + +// // [SAMPLER] add sampler +// GLSL: layout(set=0, binding=3) uniform sampler srcSampler; +// HLSL: [[vk::binding(3)]] SamplerState srcSampler :register(s0); + +// // constants - either push constant or constant buffer +// // or calculate within shader +// // [SAMPLER] when using sampler add inverse source image size +// // GLSL: +// layout(push_constant) uniform pushConstants { +// uint mips; // needed to opt out earlier if mips are < 12 +// uint numWorkGroups; // number of total thread groups, so numWorkGroupsX * numWorkGroupsY * numWorkGroupsZ +// } spdConstants; +// // HLSL: +// [[vk::push_constant]] +// cbuffer spdConstants { +// uint mips; +// uint numWorkGroups; +// }; + +// ... +// // Setup pre-portability-header defines (sets up GLSL/HLSL path, etc) +// #define A_GPU 1 +// #define A_GLSL 1 // or // #define A_HLSL 1 + +// // if you want to use PACKED version +// // recommended if bpc <= 16bit +// #define A_HALF + +// ... +// // Include the portability header (or copy it in without an include). +// #include "ffx_a.h" +// ... 
+ +// // Define LDS variables +// shared AF4 spd_intermediate[16][16]; // HLSL: groupshared +// shared AU1 spd_counter; // HLSL: groupshared +// // PACKED version +// shared AH4 spd_intermediate[16][16]; // HLSL: groupshared +// // Note: You can also use +// shared AF1 spd_intermediateR[16][16]; +// shared AF1 spd_intermediateG[16][16]; +// shared AF1 spd_intermediateB[16][16]; +// shared AF1 spd_intermediateA[16][16]; +// // or for Packed version: +// shared AH2 spd_intermediateRG[16][16]; +// shared AH2 spd_intermediateBA[16][16]; +// // This is potentially faster +// // Adapt your load and store functions accordingly + +// // if subgroup operations are not supported / can't use SM6.0 +// #define SPD_NO_WAVE_OPERATIONS + +// // Define the fetch function(s) and the reduction function +// // if non-power-of-2 textures, add border controls to the load and store functions +// // to make sure the borders of the mip level look as you want it +// // if you don't add border controls you'll read zeros past the border +// // if you load with a sampler, this is obv. handled by your sampler :) +// // this is also the place where you need to do color space transformation if needed +// // E.g. if your texture format is SRGB/UNORM and you use the UAV load and store functions +// // no automatic to/from linear conversions are happening +// // there is to/from linear conversions when using a sampler and render target approach +// // conversion to linear (load function): x*x +// // conversion from linear (store function): sqrt() + +// // Load from source image +// GLSL: AF4 SpdLoadSourceImage(ASU2 p){return imageLoad(imgSrc, p);} +// HLSL: AF4 SpdLoadSourceImage(ASU2 tex){return imgSrc[tex];} +// [SAMPLER] don't forget to add the define #SPD_LINEAR_SAMPLER :) +// GLSL: +// AF4 SpdLoadSourceImage(ASU2 p){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return texture(sampler2D(imgSrc, srcSampler), textureCoord); +// } +// HLSL: +// AF4 SpdLoadSourceImage(ASU2 p){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return imgSrc.SampleLevel(srcSampler, textureCoord, 0); +// } + +// // SpdLoad() takes a 32-bit signed integer 2D coordinate and loads color. 
+// // Loads the 5th mip level, each value is computed by a different thread group +// // last thread group will access all its elements and compute the subsequent mips +// GLSL: AF4 SpdLoad(ASU2 p){return imageLoad(imgDst[5],p);} +// HLSL: AF4 SpdLoad(ASU2 tex){return imgDst[5][tex];} + +// Define the store function +// GLSL: void SpdStore(ASU2 p, AF4 value, AU1 mip){imageStore(imgDst[mip], p, value);} +// HLSL: void SpdStore(ASU2 pix, AF4 value, AU1 index){imgDst[index][pix] = value;} + +// // Define the atomic counter increase function +// // GLSL: +// void SpdIncreaseAtomicCounter(){spd_counter = atomicAdd(globalAtomic.counter, 1);} +// AU1 SpdGetAtomicCounter() {return spd_counter;} +// // HLSL: +// void SpdIncreaseAtomicCounter(){InterlockedAdd(globalAtomic[0].counter, 1, spd_counter);} +// AU1 SpdGetAtomicCounter(){return spd_counter;} + +// // Define the LDS load and store functions +// // GLSL: +// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spd_intermediate[x][y];} +// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spd_intermediate[x][y] = value;} +// // HLSL: +// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spd_intermediate[x][y];} +// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spd_intermediate[x][y] = value;} + +// // Define your reduction function: takes as input the four 2x2 values and returns 1 output value +// Example below: computes the average value +// AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return (v0+v1+v2+v3)*0.25;} + +// // PACKED VERSION +// Load from source image +// GLSL: AH4 SpdLoadSourceImageH(ASU2 p){return AH4(imageLoad(imgSrc, p));} +// HLSL: AH4 SpdLoadSourceImageH(ASU2 tex){return AH4(imgSrc[tex]);} +// [SAMPLER] +// GLSL: +// AH4 SpdLoadSourceImageH(ASU2 p){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return AH4(texture(sampler2D(imgSrc, srcSampler), textureCoord)); +// } +// HLSL: +// AH4 SpdLoadSourceImageH(ASU2 p){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return AH4(imgSrc.SampleLevel(srcSampler, textureCoord, 0)); +// } + +// // SpdLoadH() takes a 32-bit signed integer 2D coordinate and loads color. 
+// // Loads the 5th mip level, each value is computed by a different thread group +// // last thread group will access all its elements and compute the subsequent mips +// GLSL: AH4 SpdLoadH(ASU2 p){return AH4(imageLoad(imgDst[5],p));} +// HLSL: AH4 SpdLoadH(ASU2 tex){return AH4(imgDst[5][tex]);} + +// Define the store function +// GLSL: void SpdStoreH(ASU2 p, AH4 value, AU1 mip){imageStore(imgDst[mip], p, AF4(value));} +// HLSL: void SpdStoreH(ASU2 pix, AH4 value, AU1 index){imgDst[index][pix] = AF4(value);} + +// // Define the atomic counter increase function +// // GLSL: +// void SpdIncreaseAtomicCounter(){spd_counter = atomicAdd(globalAtomic.counter, 1);} +// AU1 SpdGetAtomicCounter() {return spd_counter;} +// // HLSL: +// void SpdIncreaseAtomicCounter(){InterlockedAdd(globalAtomic[0].counter, 1, spd_counter);} +// AU1 SpdGetAtomicCounter(){return spd_counter;} + +// // Define the lds load and store functions +// // GLSL: +// AH4 SpdLoadIntermediateH(AU1 x, AU1 y){return spd_intermediate[x][y];} +// void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value){spd_intermediate[x][y] = value;} +// // HLSL: +// AH4 SpdLoadIntermediate(AU1 x, AU1 y){return spd_intermediate[x][y];} +// void SpdStoreIntermediate(AU1 x, AU1 y, AH4 value){spd_intermediate[x][y] = value;} + +// // Define your reduction function: takes as input the four 2x2 values and returns 1 output value +// Example below: computes the average value +// AH4 SpdReduce4H(AH4 v0, AH4 v1, AH4 v2, AH4 v3){return (v0+v1+v2+v3)*AH1(0.25);} + +// // + +// // If you only use PACKED version +// #define SPD_PACKED_ONLY + +// // Include this SPD (single pass downsampler) header file (or copy it in without an include). +// #include "ffx_spd.h" +// ... + +// // Example in shader integration +// // GLSL: +// layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; +// void main(){ +// // Call the downsampling function +// SpdDownsample(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex), +// AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups)); +// +// // PACKED: +// SpdDownsampleH(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex), +// AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups)); +// ... +// // HLSL: +// [numthreads(256,1,1)] +// void main(uint3 WorkGroupId : SV_GroupID, uint LocalThreadIndex : SV_GroupIndex) { +// SpdDownsample(AU2(WorkGroupId.xy), AU1(LocalThreadIndex), +// AU1(mips), AU1(numWorkGroups)); +// +// // PACKED: +// SpdDownsampleH(AU2(WorkGroupId.xy), AU1(LocalThreadIndex), +// AU1(mips), AU1(numWorkGroups)); +// ... 
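
+// // Illustrative consolidated sketch: a minimal non-packed HLSL integration assembled from the steps
+// // above, without the optional [SAMPLER] path. Binding slots and resource names here are assumptions,
+// // not requirements; adapt them to your own descriptor layout.
+// #define A_GPU 1
+// #define A_HLSL 1
+// #include "ffx_a.h"
+//
+// [[vk::binding(0)]] Texture2D<float4> imgSrc;
+// [[vk::binding(1)]] globallycoherent RWTexture2D<float4> imgDst[12];
+// struct globalAtomicBuffer { uint counter; };
+// [[vk::binding(2)]] RWStructuredBuffer<globalAtomicBuffer> globalAtomic;
+// [[vk::push_constant]] cbuffer spdConstants { uint mips; uint numWorkGroups; };
+//
+// groupshared AF4 spd_intermediate[16][16];
+// groupshared AU1 spd_counter;
+//
+// AF4 SpdLoadSourceImage(ASU2 tex){return imgSrc[tex];}
+// AF4 SpdLoad(ASU2 tex){return imgDst[5][tex];}
+// void SpdStore(ASU2 pix, AF4 value, AU1 mip){imgDst[mip][pix] = value;}
+// void SpdIncreaseAtomicCounter(){InterlockedAdd(globalAtomic[0].counter, 1, spd_counter);}
+// AU1 SpdGetAtomicCounter(){return spd_counter;}
+// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spd_intermediate[x][y];}
+// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spd_intermediate[x][y] = value;}
+// AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return (v0+v1+v2+v3)*0.25;}
+//
+// #include "ffx_spd.h"
+//
+// [numthreads(256,1,1)]
+// void main(uint3 WorkGroupId : SV_GroupID, uint LocalThreadIndex : SV_GroupIndex)
+// {
+//     SpdDownsample(AU2(WorkGroupId.xy), AU1(LocalThreadIndex), AU1(mips), AU1(numWorkGroups));
+// }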
+ +// +//------------------------------------------------------------------------------------------------------------------------------ + + + +//============================================================================================================================== +// NON-PACKED VERSION +//============================================================================================================================== + +#ifdef SPD_PACKED_ONLY + // Avoid compiler error +AF4 SpdLoadSourceImage(ASU2 p) { return AF4(0.0, 0.0, 0.0, 0.0); } +AF4 SpdLoad(ASU2 p) { return AF4(0.0, 0.0, 0.0, 0.0); } +void SpdStore(ASU2 p, AF4 value, AU1 mip) {} +AF4 SpdLoadIntermediate(AU1 x, AU1 y) { return AF4(0.0, 0.0, 0.0, 0.0); } +void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value) {} +AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3) { return AF4(0.0, 0.0, 0.0, 0.0); } +#endif + +//_____________________________________________________________/\_______________________________________________________________ +#if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) +#extension GL_KHR_shader_subgroup_quad : require +#endif + +void SpdWorkgroupShuffleBarrier() { +#ifdef A_GLSL + barrier(); +#endif +#ifdef A_HLSL + GroupMemoryBarrierWithGroupSync(); +#endif +} + +// Only last active workgroup should proceed +bool SpdExitWorkgroup(AU1 numWorkGroups, AU1 localInvocationIndex) +{ + // global atomic counter + if (localInvocationIndex == 0) + { + SpdIncreaseAtomicCounter(); + } + SpdWorkgroupShuffleBarrier(); + return (SpdGetAtomicCounter() != (numWorkGroups - 1)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// User defined: AF4 DSReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3); + +AF4 SpdReduceQuad(AF4 v) +{ +#if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + AF4 v0 = v; + AF4 v1 = subgroupQuadSwapHorizontal(v); + AF4 v2 = subgroupQuadSwapVertical(v); + AF4 v3 = subgroupQuadSwapDiagonal(v); + return SpdReduce4(v0, v1, v2, v3); +#elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + // requires SM6.0 + AU1 quad = WaveGetLaneIndex() & (~0x3); + AF4 v0 = v; + AF4 v1 = WaveReadLaneAt(v, quad | 1); + AF4 v2 = WaveReadLaneAt(v, quad | 2); + AF4 v3 = WaveReadLaneAt(v, quad | 3); + return SpdReduce4(v0, v1, v2, v3); + /* + // if SM6.0 is not available, you can use the AMD shader intrinsics + // works for DX11 + AF4 v0 = v; + AF4 v1; + v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + AF4 v2; + v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + AF4 v3; + v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, 
AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + return SpdReduce4(v0, v1, v2, v3); + */ +#endif + return AF4_x(0.0); +} + +AF4 SpdReduceIntermediate(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AF4 v0 = SpdLoadIntermediate(i0.x, i0.y); + AF4 v1 = SpdLoadIntermediate(i1.x, i1.y); + AF4 v2 = SpdLoadIntermediate(i2.x, i2.y); + AF4 v3 = SpdLoadIntermediate(i3.x, i3.y); + return SpdReduce4(v0, v1, v2, v3); +} + +AF4 SpdReduceLoad4(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AF4 v0 = SpdLoad(ASU2(i0)); + AF4 v1 = SpdLoad(ASU2(i1)); + AF4 v2 = SpdLoad(ASU2(i2)); + AF4 v3 = SpdLoad(ASU2(i3)); + return SpdReduce4(v0, v1, v2, v3); +} + +AF4 SpdReduceLoad4(AU2 base) +{ + return SpdReduceLoad4( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1))); +} + +AF4 SpdReduceLoadSourceImage4(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AF4 v0 = SpdLoadSourceImage(ASU2(i0)); + AF4 v1 = SpdLoadSourceImage(ASU2(i1)); + AF4 v2 = SpdLoadSourceImage(ASU2(i2)); + AF4 v3 = SpdLoadSourceImage(ASU2(i3)); + return SpdReduce4(v0, v1, v2, v3); +} + +AF4 SpdReduceLoadSourceImage4(AU2 base) +{ +#ifdef SPD_LINEAR_SAMPLER + return SpdLoadSourceImage(ASU2(base)); +#else + return SpdReduceLoadSourceImage4( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1))); +#endif +} + +void SpdDownsampleMips_0_1_Intrinsics(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ + AF4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImage4(tex); + SpdStore(pix, v[0], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImage4(tex); + SpdStore(pix, v[1], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImage4(tex); + SpdStore(pix, v[2], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage4(tex); + SpdStore(pix, v[3], 0); + + if (mip <= 1) + return; + + v[0] = SpdReduceQuad(v[0]); + v[1] = SpdReduceQuad(v[1]); + v[2] = SpdReduceQuad(v[2]); + v[3] = SpdReduceQuad(v[3]); + + if ((localInvocationIndex % 4) == 0) + { + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x / 2, y / 2), v[0], 1); + SpdStoreIntermediate( + x / 2, y / 2, v[0]); + + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x / 2 + 8, y / 2), v[1], 1); + SpdStoreIntermediate( + x / 2 + 8, y / 2, v[1]); + + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x / 2, y / 2 + 8), v[2], 1); + SpdStoreIntermediate( + x / 2, y / 2 + 8, v[2]); + + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x / 2 + 8, y / 2 + 8), v[3], 1); + SpdStoreIntermediate( + x / 2 + 8, y / 2 + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1_LDS(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ + AF4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImage4(tex); + SpdStore(pix, v[0], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImage4(tex); + SpdStore(pix, v[1], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = 
ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImage4(tex); + SpdStore(pix, v[2], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage4(tex); + SpdStore(pix, v[3], 0); + + if (mip <= 1) + return; + + for (int i = 0; i < 4; i++) + { + SpdStoreIntermediate(x, y, v[i]); + SpdWorkgroupShuffleBarrier(); + if (localInvocationIndex < 64) + { + v[i] = SpdReduceIntermediate( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStore(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1); + } + SpdWorkgroupShuffleBarrier(); + } + + if (localInvocationIndex < 64) + { + SpdStoreIntermediate(x + 0, y + 0, v[0]); + SpdStoreIntermediate(x + 8, y + 0, v[1]); + SpdStoreIntermediate(x + 0, y + 8, v[2]); + SpdStoreIntermediate(x + 8, y + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip); +#else + SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip); +#endif +} + + +void SpdDownsampleMip_2(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 64) + { + AF4 v = SpdReduceIntermediate( + AU2(x * 2 + 0 + 0, y * 2 + 0), + AU2(x * 2 + 0 + 1, y * 2 + 0), + AU2(x * 2 + 0 + 0, y * 2 + 1), + AU2(x * 2 + 0 + 1, y * 2 + 1) + ); + SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip); + // store to LDS, try to reduce bank conflicts + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // ... + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + SpdStoreIntermediate(x * 2 + y % 2, y * 2, v); + } +#else + AF4 v = SpdLoadIntermediate(x, y); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x / 2, y / 2), v, mip); + SpdStoreIntermediate(x + (y / 2) % 2, y, v); + } +#endif +} + +void SpdDownsampleMip_3(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 16) + { + // x 0 x 0 + // 0 0 0 0 + // 0 x 0 x + // 0 0 0 0 + AF4 v = SpdReduceIntermediate( + AU2(x * 4 + 0 + 0, y * 4 + 0), + AU2(x * 4 + 2 + 0, y * 4 + 0), + AU2(x * 4 + 0 + 1, y * 4 + 2), + AU2(x * 4 + 2 + 1, y * 4 + 2) + ); + SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip); + // store to LDS + // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 + // ... + // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 + // ... + // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x + // ... 
+ SpdStoreIntermediate(x * 4 + y, y * 4, v); + } +#else + if (localInvocationIndex < 64) + { + AF4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x / 2, y / 2), v, mip); + SpdStoreIntermediate(x * 2 + y / 2, y * 2, v); + } + } +#endif +} + +void SpdDownsampleMip_4(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 4) + { + // x 0 0 0 x 0 0 0 + // ... + // 0 x 0 0 0 x 0 0 + AF4 v = SpdReduceIntermediate( + AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), + AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4) + ); + SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip); + // store to LDS + // x x x x 0 ... + // 0 ... + SpdStoreIntermediate(x + y * 2, 0, v); + } +#else + if (localInvocationIndex < 16) + { + AF4 v = SpdLoadIntermediate(x * 4 + y, y * 4); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x / 2, y / 2), v, mip); + SpdStoreIntermediate(x / 2 + y, 0, v); + } + } +#endif +} + +void SpdDownsampleMip_5(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 1) + { + // x x x x 0 ... + // 0 ... + AF4 v = SpdReduceIntermediate( + AU2(0, 0), + AU2(1, 0), + AU2(2, 0), + AU2(3, 0) + ); + SpdStore(ASU2(workGroupID.xy), v, mip); + } +#else + if (localInvocationIndex < 4) + { + AF4 v = SpdLoadIntermediate(localInvocationIndex, 0); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy), v, mip); + } + } +#endif +} + +void SpdDownsampleMips_6_7(AU1 x, AU1 y, AU1 mips) +{ + ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0); + ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0); + AF4 v0 = SpdReduceLoad4(tex); + SpdStore(pix, v0, 6); + + tex = ASU2(x * 4 + 2, y * 4 + 0); + pix = ASU2(x * 2 + 1, y * 2 + 0); + AF4 v1 = SpdReduceLoad4(tex); + SpdStore(pix, v1, 6); + + tex = ASU2(x * 4 + 0, y * 4 + 2); + pix = ASU2(x * 2 + 0, y * 2 + 1); + AF4 v2 = SpdReduceLoad4(tex); + SpdStore(pix, v2, 6); + + tex = ASU2(x * 4 + 2, y * 4 + 2); + pix = ASU2(x * 2 + 1, y * 2 + 1); + AF4 v3 = SpdReduceLoad4(tex); + SpdStore(pix, v3, 6); + + if (mips <= 7) return; + // no barrier needed, working on values only from the same thread + + AF4 v = SpdReduce4(v0, v1, v2, v3); + SpdStore(ASU2(x, y), v, 7); + SpdStoreIntermediate(x, y, v); +} + +void SpdDownsampleNextFour(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips) +{ + if (mips <= baseMip) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip); + + if (mips <= baseMip + 1) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1); + + if (mips <= baseMip + 2) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2); + + if (mips <= baseMip + 3) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_5(x, y, workGroupID, localInvocationIndex, baseMip + 3); +} + +void SpdDownsample( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups +) { + AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64); + AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); + AU1 y = 
sub_xy.y + 8 * ((localInvocationIndex >> 7)); + SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips); + + SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips); + + if (mips <= 6) return; + + if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex)) return; + + // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels. + SpdDownsampleMips_6_7(x, y, mips); + + SpdDownsampleNextFour(x, y, AU2(0, 0), localInvocationIndex, 8, mips); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +//============================================================================================================================== +// PACKED VERSION +//============================================================================================================================== + +#ifdef A_HALF // A_HALF + +#ifdef A_GLSL +#extension GL_EXT_shader_subgroup_extended_types_float16 : require +#endif + +AH4 SpdReduceQuadH(AH4 v) +{ +#if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + AH4 v0 = v; + AH4 v1 = subgroupQuadSwapHorizontal(v); + AH4 v2 = subgroupQuadSwapVertical(v); + AH4 v3 = subgroupQuadSwapDiagonal(v); + return SpdReduce4H(v0, v1, v2, v3); +#elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + // requires SM6.0 + AU1 quad = WaveGetLaneIndex() & (~0x3); + AH4 v0 = v; + AH4 v1 = WaveReadLaneAt(v, quad | 1); + AH4 v2 = WaveReadLaneAt(v, quad | 2); + AH4 v3 = WaveReadLaneAt(v, quad | 3); + return SpdReduce4H(v0, v1, v2, v3); + /* + // if SM6.0 is not available, you can use the AMD shader intrinsics + // works for DX11 + AH4 v0 = v; + AH4 v1; + v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + AH4 v2; + v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + AH4 v3; + v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + return SpdReduce4H(v0, v1, v2, v3); + */ +#endif + return AH4(0.0, 0.0, 0.0, 0.0); + +} + +AH4 SpdReduceIntermediateH(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AH4 v0 = SpdLoadIntermediateH(i0.x, i0.y); + AH4 v1 = SpdLoadIntermediateH(i1.x, i1.y); + AH4 v2 = SpdLoadIntermediateH(i2.x, i2.y); + AH4 v3 = SpdLoadIntermediateH(i3.x, i3.y); + return SpdReduce4H(v0, v1, v2, v3); +} + +AH4 SpdReduceLoad4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AH4 v0 = SpdLoadH(ASU2(i0)); + AH4 v1 = SpdLoadH(ASU2(i1)); + AH4 v2 = SpdLoadH(ASU2(i2)); + AH4 v3 = SpdLoadH(ASU2(i3)); + return 
SpdReduce4H(v0, v1, v2, v3); +} + +AH4 SpdReduceLoad4H(AU2 base) +{ + return SpdReduceLoad4H( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1))); +} + +AH4 SpdReduceLoadSourceImage4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AH4 v0 = SpdLoadSourceImageH(ASU2(i0)); + AH4 v1 = SpdLoadSourceImageH(ASU2(i1)); + AH4 v2 = SpdLoadSourceImageH(ASU2(i2)); + AH4 v3 = SpdLoadSourceImageH(ASU2(i3)); + return SpdReduce4H(v0, v1, v2, v3); +} + +AH4 SpdReduceLoadSourceImage4H(AU2 base) +{ +#ifdef SPD_LINEAR_SAMPLER + return SpdLoadSourceImageH(ASU2(base)); +#else + return SpdReduceLoadSourceImage4H( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1))); +#endif +} + +void SpdDownsampleMips_0_1_IntrinsicsH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips) +{ + AH4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImage4H(tex); + SpdStoreH(pix, v[0], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImage4H(tex); + SpdStoreH(pix, v[1], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImage4H(tex); + SpdStoreH(pix, v[2], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage4H(tex); + SpdStoreH(pix, v[3], 0); + + if (mips <= 1) + return; + + v[0] = SpdReduceQuadH(v[0]); + v[1] = SpdReduceQuadH(v[1]); + v[2] = SpdReduceQuadH(v[2]); + v[3] = SpdReduceQuadH(v[3]); + + if ((localInvocationIndex % 4) == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2, y / 2), v[0], 1); + SpdStoreIntermediateH(x / 2, y / 2, v[0]); + + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2 + 8, y / 2), v[1], 1); + SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]); + + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2, y / 2 + 8), v[2], 1); + SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]); + + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x / 2 + 8, y / 2 + 8), v[3], 1); + SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1_LDSH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips) +{ + AH4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImage4H(tex); + SpdStoreH(pix, v[0], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImage4H(tex); + SpdStoreH(pix, v[1], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImage4H(tex); + SpdStoreH(pix, v[2], 0); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage4H(tex); + SpdStoreH(pix, v[3], 0); + + if (mips <= 1) + return; + + for (int i = 0; i < 4; i++) + { + SpdStoreIntermediateH(x, y, v[i]); + SpdWorkgroupShuffleBarrier(); + if (localInvocationIndex < 64) + { + v[i] = SpdReduceIntermediateH( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStoreH(ASU2(workGroupID.xy * 
16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1); + } + SpdWorkgroupShuffleBarrier(); + } + + if (localInvocationIndex < 64) + { + SpdStoreIntermediateH(x + 0, y + 0, v[0]); + SpdStoreIntermediateH(x + 8, y + 0, v[1]); + SpdStoreIntermediateH(x + 0, y + 8, v[2]); + SpdStoreIntermediateH(x + 8, y + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips); +#else + SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips); +#endif +} + + +void SpdDownsampleMip_2H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 64) + { + AH4 v = SpdReduceIntermediateH( + AU2(x * 2 + 0 + 0, y * 2 + 0), + AU2(x * 2 + 0 + 1, y * 2 + 0), + AU2(x * 2 + 0 + 0, y * 2 + 1), + AU2(x * 2 + 0 + 1, y * 2 + 1) + ); + SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip); + // store to LDS, try to reduce bank conflicts + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // ... + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v); + } +#else + AH4 v = SpdLoadIntermediateH(x, y); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x / 2, y / 2), v, mip); + SpdStoreIntermediateH(x + (y / 2) % 2, y, v); + } +#endif +} + +void SpdDownsampleMip_3H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 16) + { + // x 0 x 0 + // 0 0 0 0 + // 0 x 0 x + // 0 0 0 0 + AH4 v = SpdReduceIntermediateH( + AU2(x * 4 + 0 + 0, y * 4 + 0), + AU2(x * 4 + 2 + 0, y * 4 + 0), + AU2(x * 4 + 0 + 1, y * 4 + 2), + AU2(x * 4 + 2 + 1, y * 4 + 2) + ); + SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip); + // store to LDS + // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 + // ... + // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 + // ... + // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x + // ... + SpdStoreIntermediateH(x * 4 + y, y * 4, v); + } +#else + if (localInvocationIndex < 64) + { + AH4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x / 2, y / 2), v, mip); + SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v); + } + } +#endif +} + +void SpdDownsampleMip_4H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 4) + { + // x 0 0 0 x 0 0 0 + // ... + // 0 x 0 0 0 x 0 0 + AH4 v = SpdReduceIntermediateH( + AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), + AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4) + ); + SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip); + // store to LDS + // x x x x 0 ... + // 0 ... 
+ SpdStoreIntermediateH(x + y * 2, 0, v); + } +#else + if (localInvocationIndex < 16) + { + AH4 v = SpdLoadIntermediateH(x * 4 + y, y * 4); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x / 2, y / 2), v, mip); + SpdStoreIntermediateH(x / 2 + y, 0, v); + } + } +#endif +} + +void SpdDownsampleMip_5H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 1) + { + // x x x x 0 ... + // 0 ... + AH4 v = SpdReduceIntermediateH( + AU2(0, 0), + AU2(1, 0), + AU2(2, 0), + AU2(3, 0) + ); + SpdStoreH(ASU2(workGroupID.xy), v, mip); + } +#else + if (localInvocationIndex < 4) + { + AH4 v = SpdLoadIntermediateH(localInvocationIndex, 0); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy), v, mip); + } + } +#endif +} + +void SpdDownsampleMips_6_7H(AU1 x, AU1 y, AU1 mips) +{ + ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0); + ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0); + AH4 v0 = SpdReduceLoad4H(tex); + SpdStoreH(pix, v0, 6); + + tex = ASU2(x * 4 + 2, y * 4 + 0); + pix = ASU2(x * 2 + 1, y * 2 + 0); + AH4 v1 = SpdReduceLoad4H(tex); + SpdStoreH(pix, v1, 6); + + tex = ASU2(x * 4 + 0, y * 4 + 2); + pix = ASU2(x * 2 + 0, y * 2 + 1); + AH4 v2 = SpdReduceLoad4H(tex); + SpdStoreH(pix, v2, 6); + + tex = ASU2(x * 4 + 2, y * 4 + 2); + pix = ASU2(x * 2 + 1, y * 2 + 1); + AH4 v3 = SpdReduceLoad4H(tex); + SpdStoreH(pix, v3, 6); + + if (mips < 8) return; + // no barrier needed, working on values only from the same thread + + AH4 v = SpdReduce4H(v0, v1, v2, v3); + SpdStoreH(ASU2(x, y), v, 7); + SpdStoreIntermediateH(x, y, v); +} + +void SpdDownsampleNextFourH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips) +{ + if (mips <= baseMip) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip); + + if (mips <= baseMip + 1) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1); + + if (mips <= baseMip + 2) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2); + + if (mips <= baseMip + 3) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_5H(x, y, workGroupID, localInvocationIndex, baseMip + 3); +} + +void SpdDownsampleH( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups +) { + AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64); + AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); + AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7)); + + SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips); + + SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips); + + if (mips < 7) return; + + if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex)) return; + + // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels. 
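+    // Mips 6 and 7 are written by SpdDownsampleMips_6_7H below from values this workgroup loads itself; the
+    // following SpdDownsampleNextFourH call then reuses the mip 2..5 helpers with baseMip 8 to emit the
+    // remaining mips 8 to 11, as permitted by the requested mip count.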
+ SpdDownsampleMips_6_7H(x, y, mips); + + SpdDownsampleNextFourH(x, y, AU2(0, 0), localInvocationIndex, 8, mips); +} + +#endif \ No newline at end of file diff --git a/sample/src/VK/Sources/SampleRenderer.cpp b/sample/src/VK/Sources/SampleRenderer.cpp new file mode 100644 index 0000000..e29908f --- /dev/null +++ b/sample/src/VK/Sources/SampleRenderer.cpp @@ -0,0 +1,1934 @@ +// AMD SampleVK sample code +// +// Copyright(c) 2018 Advanced Micro Devices, Inc.All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "stdafx.h" + +#include "SampleRenderer.h" +#include + +#undef max +#undef min + +void SSRLoggingFunction(const char* pMessage, void* pUserData) +{ + char buffer[4096]; + snprintf(buffer, sizeof(buffer), "%s\n", pMessage); + MessageBox(NULL, buffer, "RtShadows Error", MB_OK | MB_ICONERROR); + exit(-1); +} + +//-------------------------------------------------------------------------------------- +// +// OnCreate +// +//-------------------------------------------------------------------------------------- +void SampleRenderer::OnCreate(Device* pDevice, SwapChain *pSwapChain) +{ + m_pDevice = pDevice; + m_CurrentFrame = 0; + + // Initialize helpers + + // Create all the heaps for the resources views + const uint32_t cbvDescriptorCount = 2000; + const uint32_t srvDescriptorCount = 2000; + const uint32_t uavDescriptorCount = 10; + const uint32_t samplerDescriptorCount = 20; + m_ResourceViewHeaps.OnCreate(pDevice, cbvDescriptorCount, srvDescriptorCount, uavDescriptorCount, samplerDescriptorCount); + + // Create a commandlist ring for the Direct queue + uint32_t commandListsPerBackBuffer = 8; + m_CommandListRing.OnCreate(pDevice, backBufferCount, commandListsPerBackBuffer); + + // Create a 'dynamic' constant buffer + const uint32_t constantBuffersMemSize = 200 * 1024 * 1024; + m_ConstantBufferRing.OnCreate(pDevice, backBufferCount, constantBuffersMemSize, "Uniforms"); + + // Create a 'static' pool for vertices and indices + const uint32_t staticGeometryMemSize = 5 * 128 * 1024 * 1024; + const uint32_t systemGeometryMemSize = 32 * 1024; + m_VidMemBufferPool.OnCreate(pDevice, staticGeometryMemSize, USE_VID_MEM, "StaticGeom"); + m_SysMemBufferPool.OnCreate(pDevice, systemGeometryMemSize, false, "PostProcGeom"); + + // initialize the GPU time stamps module + m_GPUTimer.OnCreate(pDevice, backBufferCount); + + // Quick helper to upload resources, it has it's own commandList and uses suballocation. 
+ // for 4K textures we'll need 100Megs + const uint32_t uploadHeapMemSize = 1000 * 1024 * 1024; + m_UploadHeap.OnCreate(pDevice, staticGeometryMemSize); // initialize an upload heap (uses suballocation for faster results) + + CreateApplyReflectionsPipeline(); + CreateDepthDownsamplePipeline(); + + + // Create a command buffer for upload + m_CommandListRing.OnBeginFrame(); + VkCommandBuffer uploadCommandBuffer = BeginNewCommandBuffer(); + + FfxSssrVkCreateContextInfo vkContextInfo = {}; + vkContextInfo.device = m_pDevice->GetDevice(); + vkContextInfo.physicalDevice = m_pDevice->GetPhysicalDevice(); + vkContextInfo.uploadCommandBuffer = uploadCommandBuffer; + + FfxSssrLoggingCallbacks loggingCallbacks = {}; + loggingCallbacks.pUserData = this; + loggingCallbacks.pfnLogging = SSRLoggingFunction; + + FfxSssrCreateContextInfo contextInfo = {}; + contextInfo.apiVersion = FFX_SSSR_API_VERSION; + contextInfo.frameCountBeforeMemoryReuse = backBufferCount; + contextInfo.maxReflectionViewCount = 1; + contextInfo.pVkCreateContextInfo = &vkContextInfo; + contextInfo.pLoggingCallbacks = &loggingCallbacks; + contextInfo.uploadBufferSize = 8 * 1024 * 1024; + contextInfo.pRoughnessTextureFormat = L"float4"; + contextInfo.pUnpackRoughnessSnippet = L"float FfxSssrUnpackRoughness(FFX_SSSR_ROUGHNESS_TEXTURE_FORMAT packed) { return packed.w; }"; + contextInfo.pNormalsTextureFormat = L"float4"; + contextInfo.pUnpackNormalsSnippet = L"float3 FfxSssrUnpackNormals(FFX_SSSR_NORMALS_TEXTURE_FORMAT packed) { return 2 * packed.xyz - 1; }"; + contextInfo.pSceneTextureFormat = L"float4"; + contextInfo.pUnpackSceneRadianceSnippet = L"float3 FfxSssrUnpackSceneRadiance(FFX_SSSR_SCENE_TEXTURE_FORMAT packed) { return packed.xyz; }"; + contextInfo.pDepthTextureFormat = L"float"; + contextInfo.pUnpackDepthSnippet = L"float FfxSssrUnpackDepth(FFX_SSSR_DEPTH_TEXTURE_FORMAT packed) { return packed.x; }"; + contextInfo.pMotionVectorFormat = L"float2"; + contextInfo.pUnpackMotionVectorsSnippet = L"float2 FfxSssrUnpackMotionVectors(FFX_SSSR_MOTION_VECTOR_TEXTURE_FORMAT packed) { return packed.xy * float2(0.5, -0.5); }"; + + FfxSssrStatus status = ffxSssrCreateContext(&contextInfo, &m_SssrContext); + if (status != FFX_SSSR_STATUS_OK) + { + Trace("ffxSssrCreateContext failed."); + } + + // Wait for the upload to finish; + SubmitCommandBuffer(uploadCommandBuffer); + m_pDevice->GPUFlush(); + + VkSamplerCreateInfo samplerCreateInfo = { VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO }; + samplerCreateInfo.pNext = nullptr; + samplerCreateInfo.flags = 0; + samplerCreateInfo.magFilter = VK_FILTER_LINEAR; + samplerCreateInfo.minFilter = VK_FILTER_LINEAR; + samplerCreateInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; + samplerCreateInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + samplerCreateInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + samplerCreateInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + samplerCreateInfo.mipLodBias = 0; + samplerCreateInfo.anisotropyEnable = false; + samplerCreateInfo.maxAnisotropy = 0; + samplerCreateInfo.compareEnable = false; + samplerCreateInfo.compareOp = VK_COMPARE_OP_NEVER; + samplerCreateInfo.minLod = 0; + samplerCreateInfo.maxLod = 16; + samplerCreateInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; + samplerCreateInfo.unnormalizedCoordinates = false; + if (VK_SUCCESS != vkCreateSampler(m_pDevice->GetDevice(), &samplerCreateInfo, nullptr, &m_LinearSampler)) + { + Trace("Failed to create linear sampler."); + } + + // Create a 2Kx2K Shadowmap atlas to hold 4 
cascades/spotlights + m_ShadowMap.InitDepthStencil(m_pDevice, 2 * 1024, 2 * 1024, VK_FORMAT_D32_SFLOAT, VK_SAMPLE_COUNT_1_BIT, "ShadowMap"); + m_ShadowMap.CreateSRV(&m_ShadowMapSRV); + m_ShadowMap.CreateDSV(&m_ShadowMapDSV); + + // Create render pass shadow + // + { + VkAttachmentDescription depthAttachments; + AttachClearBeforeUse(m_ShadowMap.GetFormat(), VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, &depthAttachments); + m_RenderPassShadow = CreateRenderPassOptimal(m_pDevice->GetDevice(), 0, NULL, &depthAttachments); + + // Create frame buffer + // + VkImageView attachmentViews[1] = { m_ShadowMapDSV }; + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.pNext = NULL; + framebufferInfo.renderPass = m_RenderPassShadow; + framebufferInfo.attachmentCount = 1; + framebufferInfo.pAttachments = attachmentViews; + framebufferInfo.width = m_ShadowMap.GetWidth(); + framebufferInfo.height = m_ShadowMap.GetHeight(); + framebufferInfo.layers = 1; + VkResult res = vkCreateFramebuffer(m_pDevice->GetDevice(), &framebufferInfo, NULL, &m_FramebufferShadows); + assert(res == VK_SUCCESS); + } + + // Create motion vector render pass + // + { + VkAttachmentDescription colorAttachments[2], depthAttachment; + // motion vector RT + AttachClearBeforeUse(VK_FORMAT_R16G16_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, &colorAttachments[0]); + // normals RT + AttachClearBeforeUse(VK_FORMAT_A2B10G10R10_UNORM_PACK32, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, &colorAttachments[1]); + // depth RT + AttachClearBeforeUse(VK_FORMAT_D32_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, &depthAttachment); + m_RenderPassMV = CreateRenderPassOptimal(m_pDevice->GetDevice(), _countof(colorAttachments), colorAttachments, &depthAttachment); + } + + // Create HDR render pass color with color clear + // + { + VkAttachmentDescription colorAttachments[1], depthAttachment; + // color RT + AttachClearBeforeUse(VK_FORMAT_R16G16B16A16_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, &colorAttachments[0]); + // depth RT + AttachBlending(VK_FORMAT_D32_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, &depthAttachment); + m_RenderPassClearHDR = CreateRenderPassOptimal(m_pDevice->GetDevice(), _countof(colorAttachments), colorAttachments, &depthAttachment); + } + + // Create PBR render pass + // + { + VkAttachmentDescription colorAttachments[2], depthAttachment; + // color RT + AttachBlending(VK_FORMAT_R16G16B16A16_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, &colorAttachments[0]); + // specular roughness RT + AttachClearBeforeUse(VK_FORMAT_R8G8B8A8_UNORM, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, &colorAttachments[1]); + // depth RT + AttachBlending(VK_FORMAT_D32_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, &depthAttachment); + m_RenderPassPBR = CreateRenderPassOptimal(m_pDevice->GetDevice(), _countof(colorAttachments), colorAttachments, &depthAttachment); + } + + // Create HDR render pass color without clear + // + { + 
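        // Color and depth are preserved here (no clear) and both attachments finish in SHADER_READ_ONLY_OPTIMAL,
+        // so the scene color and depth can be sampled afterwards, e.g. by the SSSR and post-processing passes. +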
VkAttachmentDescription colorAttachments[1], depthAttachment; + // color RT + AttachBlending(VK_FORMAT_R16G16B16A16_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, &colorAttachments[0]); + // depth RT + AttachBlending(VK_FORMAT_D32_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, &depthAttachment); + m_RenderPassHDR = CreateRenderPassOptimal(m_pDevice->GetDevice(), _countof(colorAttachments), colorAttachments, &depthAttachment); + } + + m_SkyDome.OnCreate(pDevice, m_RenderPassHDR, &m_UploadHeap, VK_FORMAT_R16G16B16A16_SFLOAT, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool, "..\\media\\envmaps\\papermill\\diffuse.dds", "..\\media\\envmaps\\papermill\\specular.dds", VK_SAMPLE_COUNT_1_BIT); + m_AmbientLight.OnCreate(pDevice, m_RenderPassHDR, &m_UploadHeap, VK_FORMAT_R16G16B16A16_SFLOAT, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool, "..\\media\\envmaps\\white\\diffuse.dds", "..\\media\\envmaps\\white\\specular.dds", VK_SAMPLE_COUNT_1_BIT); + m_SkyDomeProc.OnCreate(pDevice, m_RenderPassHDR, &m_UploadHeap, VK_FORMAT_R16G16B16A16_SFLOAT, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool, VK_SAMPLE_COUNT_1_BIT); + m_Wireframe.OnCreate(pDevice, m_RenderPassHDR, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool, VK_SAMPLE_COUNT_1_BIT); + m_WireframeBox.OnCreate(pDevice, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool); + m_DownSample.OnCreate(pDevice, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool, VK_FORMAT_R16G16B16A16_SFLOAT); + m_Bloom.OnCreate(pDevice, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool, VK_FORMAT_R16G16B16A16_SFLOAT); + + // Create tonemapping pass + m_ToneMapping.OnCreate(m_pDevice, pSwapChain->GetRenderPass(), &m_ResourceViewHeaps, &m_SysMemBufferPool, &m_ConstantBufferRing); + + // Initialize UI rendering resources + m_ImGUI.OnCreate(m_pDevice, pSwapChain->GetRenderPass(), &m_UploadHeap, &m_ConstantBufferRing); + + m_BrdfLut.InitFromFile(pDevice, &m_UploadHeap, "BrdfLut.dds", false); // LUT images are stored as linear + m_BrdfLut.CreateSRV(&m_BrdfLutSRV); + + // Make sure upload heap has finished uploading before continuing +#if (USE_VID_MEM==true) + m_VidMemBufferPool.UploadData(m_UploadHeap.GetCommandList()); + m_UploadHeap.FlushAndFinish(); +#endif +} + +//-------------------------------------------------------------------------------------- +// +// OnDestroy +// +//-------------------------------------------------------------------------------------- +void SampleRenderer::OnDestroy() +{ + m_ImGUI.OnDestroy(); + m_ToneMapping.OnDestroy(); + m_Bloom.OnDestroy(); + m_DownSample.OnDestroy(); + m_WireframeBox.OnDestroy(); + m_Wireframe.OnDestroy(); + m_SkyDomeProc.OnDestroy(); + m_SkyDome.OnDestroy(); + m_AmbientLight.OnDestroy(); + m_ShadowMap.OnDestroy(); + m_BrdfLut.OnDestroy(); + + ffxSssrDestroyContext(m_SssrContext); + + VkDevice device = m_pDevice->GetDevice(); + + vkDestroySampler(device, m_LinearSampler, nullptr); + vkDestroyImageView(device, m_BrdfLutSRV, nullptr); + vkDestroyImageView(device, m_ShadowMapDSV, nullptr); + vkDestroyImageView(device, m_ShadowMapSRV, nullptr); + + vkDestroyPipeline(device, m_DepthDownsamplePipeline, nullptr); + vkDestroyPipelineLayout(device, m_DepthDownsamplePipelineLayout, nullptr); + vkDestroyDescriptorSetLayout(device, m_DepthDownsampleDescriptorSetLayout, nullptr); + 
m_ResourceViewHeaps.FreeDescriptor(m_DepthDownsampleDescriptorSet); + + vkDestroyPipeline(device, m_ApplyPipeline, nullptr); + vkDestroyPipelineLayout(device, m_ApplyPipelineLayout, nullptr); + vkDestroyDescriptorSetLayout(device, m_ApplyPipelineDescriptorSetLayout, nullptr); + + for (int i = 0; i < backBufferCount; ++i) + { + m_ResourceViewHeaps.FreeDescriptor(m_ApplyPipelineDescriptorSet[i]); + } + + vkDestroyRenderPass(device, m_RenderPassShadow, nullptr); + vkDestroyRenderPass(device, m_RenderPassClearHDR, nullptr); + vkDestroyRenderPass(device, m_RenderPassHDR, nullptr); + vkDestroyRenderPass(device, m_RenderPassPBR, nullptr); + vkDestroyRenderPass(device, m_RenderPassMV, nullptr); + vkDestroyRenderPass(device, m_RenderPassApply, nullptr); + + vkDestroyFramebuffer(device, m_FramebufferShadows, nullptr); + + m_UploadHeap.OnDestroy(); + m_GPUTimer.OnDestroy(); + m_VidMemBufferPool.OnDestroy(); + m_SysMemBufferPool.OnDestroy(); + m_ConstantBufferRing.OnDestroy(); + m_ResourceViewHeaps.OnDestroy(); + m_CommandListRing.OnDestroy(); +} + +//-------------------------------------------------------------------------------------- +// +// OnCreateWindowSizeDependentResources +// +//-------------------------------------------------------------------------------------- +void SampleRenderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, uint32_t Width, uint32_t Height) +{ + m_Width = Width; + m_Height = Height; + + // Set the viewport + // + m_Viewport.x = 0; + m_Viewport.y = (float)m_Height; + m_Viewport.width = (float)m_Width; + m_Viewport.height = -(float)(m_Height); + m_Viewport.minDepth = (float)0.0f; + m_Viewport.maxDepth = (float)1.0f; + + // Create scissor rectangle + // + m_Scissor.extent.width = m_Width; + m_Scissor.extent.height = m_Height; + m_Scissor.offset.x = 0; + m_Scissor.offset.y = 0; + + // Create depth buffer + // + m_DepthBuffer.InitDepthStencil(m_pDevice, m_Width, m_Height, VK_FORMAT_D32_SFLOAT, VK_SAMPLE_COUNT_1_BIT, "DepthBuffer"); + m_DepthBuffer.CreateSRV(&m_DepthBufferSRV); + m_DepthBuffer.CreateDSV(&m_DepthBufferDSV); + + // Create Texture + RTV + // + m_HDR.InitRenderTarget(m_pDevice, m_Width, m_Height, VK_FORMAT_R16G16B16A16_SFLOAT, VK_SAMPLE_COUNT_1_BIT, (VkImageUsageFlags)(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT), false, "HDR"); + m_HDR.CreateSRV(&m_HDRSRV); + + VkImageCreateInfo imageCreateInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO }; + imageCreateInfo.pNext = nullptr; + imageCreateInfo.arrayLayers = 1; + imageCreateInfo.extent = { m_Width, m_Height, 1 }; + imageCreateInfo.format = VK_FORMAT_R16G16B16A16_SFLOAT; + imageCreateInfo.imageType = VK_IMAGE_TYPE_2D; + imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imageCreateInfo.mipLevels = 1; + imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL; + imageCreateInfo.usage = (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + imageCreateInfo.flags = 0; + m_SssrOutputBuffer.Init(m_pDevice, &imageCreateInfo, "m_SssrOutputBuffer"); + + m_NormalBuffer.InitRenderTarget(m_pDevice, m_Width, m_Height, VK_FORMAT_A2B10G10R10_UNORM_PACK32, VK_SAMPLE_COUNT_1_BIT, (VkImageUsageFlags)(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT), false, "m_NormalBuffer"); + m_NormalBuffer.CreateSRV(&m_NormalBufferSRV); + + imageCreateInfo.format = 
VK_FORMAT_A2B10G10R10_UNORM_PACK32; + m_NormalHistoryBuffer.Init(m_pDevice, &imageCreateInfo, "m_NormalHistoryBuffer"); + + m_SpecularRoughnessHistory.InitRenderTarget(m_pDevice, m_Width, m_Height, VK_FORMAT_R8G8B8A8_UNORM, VK_SAMPLE_COUNT_1_BIT, (VkImageUsageFlags)(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT), false, "m_SpecularRoughnessHistory"); + m_SpecularRoughness.InitRenderTarget(m_pDevice, m_Width, m_Height, VK_FORMAT_R8G8B8A8_UNORM, VK_SAMPLE_COUNT_1_BIT, (VkImageUsageFlags)(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT), false, "m_SpecularRoughness"); + m_SpecularRoughness.CreateSRV(&m_SpecularRoughnessSRV); + + m_MotionVectors.InitRenderTarget(m_pDevice, m_Width, m_Height, VK_FORMAT_R16G16_SFLOAT, VK_SAMPLE_COUNT_1_BIT, (VkImageUsageFlags)(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT), false, "m_MotionVector"); + m_MotionVectors.CreateSRV(&m_MotionVectorsSRV); + + // Create framebuffer for the RT + // + { + VkImageView hdrAttachments[2] = { m_HDRSRV, m_DepthBufferDSV }; + + VkFramebufferCreateInfo hdrFramebufferInfo = {}; + hdrFramebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + hdrFramebufferInfo.pNext = NULL; + hdrFramebufferInfo.renderPass = m_RenderPassHDR; + hdrFramebufferInfo.attachmentCount = _countof(hdrAttachments); + hdrFramebufferInfo.pAttachments = hdrAttachments; + hdrFramebufferInfo.width = m_Width; + hdrFramebufferInfo.height = m_Height; + hdrFramebufferInfo.layers = 1; + + VkResult res = vkCreateFramebuffer(m_pDevice->GetDevice(), &hdrFramebufferInfo, NULL, &m_FramebufferHDR); + assert(res == VK_SUCCESS); + } + + { + VkImageView pbrAttachments[3] = { m_HDRSRV, m_SpecularRoughnessSRV, m_DepthBufferDSV }; + + VkFramebufferCreateInfo pbrFramebufferInfo = {}; + pbrFramebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + pbrFramebufferInfo.pNext = NULL; + pbrFramebufferInfo.renderPass = m_RenderPassPBR; + pbrFramebufferInfo.attachmentCount = _countof(pbrAttachments); + pbrFramebufferInfo.pAttachments = pbrAttachments; + pbrFramebufferInfo.width = m_Width; + pbrFramebufferInfo.height = m_Height; + pbrFramebufferInfo.layers = 1; + + VkResult res = vkCreateFramebuffer(m_pDevice->GetDevice(), &pbrFramebufferInfo, NULL, &m_FramebufferPBR); + assert(res == VK_SUCCESS); + } + + { + VkImageView mvAttachments[3] = { m_MotionVectorsSRV, m_NormalBufferSRV, m_DepthBufferDSV }; + + VkFramebufferCreateInfo mvFramebufferInfo = {}; + mvFramebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + mvFramebufferInfo.pNext = NULL; + mvFramebufferInfo.renderPass = m_RenderPassMV; + mvFramebufferInfo.attachmentCount = _countof(mvAttachments); + mvFramebufferInfo.pAttachments = mvAttachments; + mvFramebufferInfo.width = m_Width; + mvFramebufferInfo.height = m_Height; + mvFramebufferInfo.layers = 1; + + VkResult res = vkCreateFramebuffer(m_pDevice->GetDevice(), &mvFramebufferInfo, NULL, &m_FramebufferMV); + assert(res == VK_SUCCESS); + } + + { + m_HDR.CreateRTV(&m_ApplyPipelineRTV); + VkImageView attachmentViews[1] = { m_ApplyPipelineRTV }; + + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.pNext = NULL; + framebufferInfo.renderPass = m_RenderPassApply; + framebufferInfo.attachmentCount = 1; + framebufferInfo.pAttachments = attachmentViews; + framebufferInfo.width = m_Width; + framebufferInfo.height = m_Height; + 
framebufferInfo.layers = 1; + + VkResult res = vkCreateFramebuffer(m_pDevice->GetDevice(), &framebufferInfo, NULL, &m_FramebufferApply); + assert(res == VK_SUCCESS); + } + + // update bloom and downscaling effect + // + m_DownSample.OnCreateWindowSizeDependentResources(m_Width, m_Height, &m_HDR, 6); + m_Bloom.OnCreateWindowSizeDependentResources(m_Width / 2, m_Height / 2, m_DownSample.GetTexture(), 6, &m_HDR); + + // update the pipelines if the swapchain render pass has changed (for example when the format of the swapchain changes) + // + m_ToneMapping.UpdatePipelines(pSwapChain->GetRenderPass()); + m_ImGUI.UpdatePipeline(pSwapChain->GetRenderPass()); + + // Depth downsampling pass with single CS + { + m_DepthMipLevelCount = static_cast<UINT>(std::log2(std::max(m_Width, m_Height))) + 1; + + // Downsampled depth buffer + imageCreateInfo.format = VK_FORMAT_R32_SFLOAT; + imageCreateInfo.mipLevels = m_DepthMipLevelCount; + m_DepthHierarchy.Init(m_pDevice, &imageCreateInfo, "m_DepthHierarchy"); + for (UINT i = 0; i < std::min(13u, m_DepthMipLevelCount); ++i) + { + m_DepthHierarchy.CreateSRV(&m_DepthHierarchyDescriptors[i], i); + } + + // Atomic counter + + VkBufferCreateInfo bufferCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; + bufferCreateInfo.pNext = nullptr; + bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + bufferCreateInfo.size = 4; + bufferCreateInfo.usage = VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + + VmaAllocationCreateInfo allocCreateInfo = {}; + allocCreateInfo.memoryTypeBits = 0; + allocCreateInfo.pool = VK_NULL_HANDLE; + allocCreateInfo.preferredFlags = 0; + allocCreateInfo.pUserData = "m_AtomicCounter"; + allocCreateInfo.requiredFlags = 0; + allocCreateInfo.usage = VMA_MEMORY_USAGE_UNKNOWN; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_AtomicCounter, &m_AtomicCounterAllocation, nullptr)) + { + Trace("Failed to create buffer for atomic counter"); + } + + VkBufferViewCreateInfo bufferViewCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO }; + bufferViewCreateInfo.buffer = m_AtomicCounter; + bufferViewCreateInfo.format = VK_FORMAT_R32_UINT; + bufferViewCreateInfo.range = VK_WHOLE_SIZE; + bufferViewCreateInfo.flags = 0; + if (VK_SUCCESS != vkCreateBufferView(m_pDevice->GetDevice(), &bufferViewCreateInfo, nullptr, &m_AtomicCounterUAV)) + { + Trace("Failed to create buffer view for atomic counter"); + } + } + + // Setup SSR + // + m_HDR.CreateSRV(&m_SssrSceneSRV); + m_DepthHierarchy.CreateSRV(&m_SssrDepthBufferHierarchySRV); + m_MotionVectors.CreateSRV(&m_SssrMotionBufferSRV); + m_NormalBuffer.CreateSRV(&m_SssrNormalBufferSRV); + m_NormalHistoryBuffer.CreateSRV(&m_SssrNormalHistoryBufferSRV); + m_SpecularRoughness.CreateSRV(&m_SssrRoughnessBufferSRV); + m_SpecularRoughnessHistory.CreateSRV(&m_SssrRoughnessHistoryBufferSRV); + m_SssrOutputBuffer.CreateSRV(&m_SssrOutputBufferUAV); + m_SssrEnvironmentMapSRV = m_SkyDome.GetCubeSpecularTextureView(); + m_SssrEnvironmentMapSampler = m_SkyDome.GetCubeSpecularTextureSampler(); + + m_CommandListRing.OnBeginFrame(); + VkCommandBuffer cb = BeginNewCommandBuffer(); + + FfxSssrVkCreateReflectionViewInfo vkReflectionViewInfo = {}; + vkReflectionViewInfo.depthBufferHierarchySRV = m_SssrDepthBufferHierarchySRV; + vkReflectionViewInfo.motionBufferSRV = m_SssrMotionBufferSRV; + vkReflectionViewInfo.normalBufferSRV = m_SssrNormalBufferSRV; + vkReflectionViewInfo.roughnessBufferSRV = m_SssrRoughnessBufferSRV;
+ vkReflectionViewInfo.normalHistoryBufferSRV = m_SssrNormalHistoryBufferSRV; + vkReflectionViewInfo.roughnessHistoryBufferSRV = m_SssrRoughnessHistoryBufferSRV; + vkReflectionViewInfo.reflectionViewUAV = m_SssrOutputBufferUAV; + vkReflectionViewInfo.sceneFormat = m_SssrOutputBuffer.GetFormat(); + vkReflectionViewInfo.sceneSRV = m_SssrSceneSRV; + vkReflectionViewInfo.environmentMapSRV = m_SssrEnvironmentMapSRV; + vkReflectionViewInfo.environmentMapSampler = m_SssrEnvironmentMapSampler; + vkReflectionViewInfo.uploadCommandBuffer = cb; + + FfxSssrCreateReflectionViewInfo reflectionViewInfo = {}; + reflectionViewInfo.flags = FFX_SSSR_CREATE_REFLECTION_VIEW_FLAG_ENABLE_PERFORMANCE_COUNTERS; + reflectionViewInfo.outputWidth = m_Width; + reflectionViewInfo.outputHeight = m_Height; + reflectionViewInfo.pVkCreateReflectionViewInfo = &vkReflectionViewInfo; + + FfxSssrStatus status = ffxSssrCreateReflectionView(m_SssrContext, &reflectionViewInfo, &m_SssrReflectionView); + if (status != FFX_SSSR_STATUS_OK) + { + Trace("ffxSssrCreateReflectionView failed."); + } + m_SssrCreatedReflectionView = true; + + // Fill apply reflections descriptor set + VkDescriptorImageInfo applyReflectionsImageInfos[5]; + applyReflectionsImageInfos[0].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + applyReflectionsImageInfos[0].imageView = m_SssrOutputBufferUAV; + applyReflectionsImageInfos[0].sampler = VK_NULL_HANDLE; + applyReflectionsImageInfos[1].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + applyReflectionsImageInfos[1].imageView = m_NormalBufferSRV; + applyReflectionsImageInfos[1].sampler = VK_NULL_HANDLE; + applyReflectionsImageInfos[2].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + applyReflectionsImageInfos[2].imageView = m_SpecularRoughnessSRV; + applyReflectionsImageInfos[2].sampler = VK_NULL_HANDLE; + applyReflectionsImageInfos[3].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + applyReflectionsImageInfos[3].imageView = m_BrdfLutSRV; + applyReflectionsImageInfos[3].sampler = VK_NULL_HANDLE; + applyReflectionsImageInfos[4].imageLayout = VK_IMAGE_LAYOUT_UNDEFINED; + applyReflectionsImageInfos[4].imageView = VK_NULL_HANDLE; + applyReflectionsImageInfos[4].sampler = m_LinearSampler; + + for (int i = 0; i < backBufferCount; ++i) + { + VkWriteDescriptorSet applyReflectionsWriteDescSets[5]; + applyReflectionsWriteDescSets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + applyReflectionsWriteDescSets[0].pNext = nullptr; + applyReflectionsWriteDescSets[0].descriptorCount = 1; + applyReflectionsWriteDescSets[0].dstArrayElement = 0; + applyReflectionsWriteDescSets[0].dstSet = m_ApplyPipelineDescriptorSet[i]; + applyReflectionsWriteDescSets[0].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + applyReflectionsWriteDescSets[0].dstBinding = 0; + applyReflectionsWriteDescSets[0].pImageInfo = &applyReflectionsImageInfos[0]; + + applyReflectionsWriteDescSets[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + applyReflectionsWriteDescSets[1].pNext = nullptr; + applyReflectionsWriteDescSets[1].descriptorCount = 1; + applyReflectionsWriteDescSets[1].dstArrayElement = 0; + applyReflectionsWriteDescSets[1].dstSet = m_ApplyPipelineDescriptorSet[i]; + applyReflectionsWriteDescSets[1].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + applyReflectionsWriteDescSets[1].dstBinding = 1; + applyReflectionsWriteDescSets[1].pImageInfo = &applyReflectionsImageInfos[1]; + + applyReflectionsWriteDescSets[2].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + applyReflectionsWriteDescSets[2].pNext 
= nullptr; + applyReflectionsWriteDescSets[2].descriptorCount = 1; + applyReflectionsWriteDescSets[2].dstArrayElement = 0; + applyReflectionsWriteDescSets[2].dstSet = m_ApplyPipelineDescriptorSet[i]; + applyReflectionsWriteDescSets[2].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + applyReflectionsWriteDescSets[2].dstBinding = 2; + applyReflectionsWriteDescSets[2].pImageInfo = &applyReflectionsImageInfos[2]; + + applyReflectionsWriteDescSets[3].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + applyReflectionsWriteDescSets[3].pNext = nullptr; + applyReflectionsWriteDescSets[3].descriptorCount = 1; + applyReflectionsWriteDescSets[3].dstArrayElement = 0; + applyReflectionsWriteDescSets[3].dstSet = m_ApplyPipelineDescriptorSet[i]; + applyReflectionsWriteDescSets[3].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + applyReflectionsWriteDescSets[3].dstBinding = 3; + applyReflectionsWriteDescSets[3].pImageInfo = &applyReflectionsImageInfos[3]; + + applyReflectionsWriteDescSets[4].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + applyReflectionsWriteDescSets[4].pNext = nullptr; + applyReflectionsWriteDescSets[4].descriptorCount = 1; + applyReflectionsWriteDescSets[4].dstArrayElement = 0; + applyReflectionsWriteDescSets[4].dstSet = m_ApplyPipelineDescriptorSet[i]; + applyReflectionsWriteDescSets[4].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; + applyReflectionsWriteDescSets[4].dstBinding = 4; + applyReflectionsWriteDescSets[4].pImageInfo = &applyReflectionsImageInfos[4]; + + vkUpdateDescriptorSets(m_pDevice->GetDevice(), _countof(applyReflectionsWriteDescSets), applyReflectionsWriteDescSets, 0, nullptr); + } + + // Fill depth downsample descriptor set + VkDescriptorImageInfo downsampleImageInfos[15]; + downsampleImageInfos[0].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + downsampleImageInfos[0].imageView = m_DepthBufferDSV; + downsampleImageInfos[0].sampler = VK_NULL_HANDLE; + + uint32_t i = 0; + for (; i < m_DepthMipLevelCount; ++i) + { + uint32_t idx = i + 1; + downsampleImageInfos[idx].imageLayout = VK_IMAGE_LAYOUT_GENERAL; + downsampleImageInfos[idx].imageView = m_DepthHierarchyDescriptors[i]; + downsampleImageInfos[idx].sampler = VK_NULL_HANDLE; + } + + VkWriteDescriptorSet depthDownsampleWriteDescSets[15]; + depthDownsampleWriteDescSets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + depthDownsampleWriteDescSets[0].pNext = nullptr; + depthDownsampleWriteDescSets[0].descriptorCount = 1; + depthDownsampleWriteDescSets[0].dstArrayElement = 0; + depthDownsampleWriteDescSets[0].dstSet = m_DepthDownsampleDescriptorSet; + depthDownsampleWriteDescSets[0].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + depthDownsampleWriteDescSets[0].dstBinding = 0; + depthDownsampleWriteDescSets[0].pImageInfo = &downsampleImageInfos[0]; + + i = 0; + for (; i < m_DepthMipLevelCount; ++i) + { + uint32_t idx = i + 1; + depthDownsampleWriteDescSets[idx].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + depthDownsampleWriteDescSets[idx].pNext = nullptr; + depthDownsampleWriteDescSets[idx].descriptorCount = 1; + depthDownsampleWriteDescSets[idx].dstArrayElement = i; + depthDownsampleWriteDescSets[idx].dstSet = m_DepthDownsampleDescriptorSet; + depthDownsampleWriteDescSets[idx].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + depthDownsampleWriteDescSets[idx].dstBinding = 1; + depthDownsampleWriteDescSets[idx].pImageInfo = &downsampleImageInfos[idx]; + } + + // Map the remaining mip levels to the lowest mip + for (; i < 13; ++i) + { + uint32_t idx = i + 1; + 
depthDownsampleWriteDescSets[idx].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + depthDownsampleWriteDescSets[idx].pNext = nullptr; + depthDownsampleWriteDescSets[idx].descriptorCount = 1; + depthDownsampleWriteDescSets[idx].dstArrayElement = i; + depthDownsampleWriteDescSets[idx].dstSet = m_DepthDownsampleDescriptorSet; + depthDownsampleWriteDescSets[idx].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + depthDownsampleWriteDescSets[idx].dstBinding = 1; + depthDownsampleWriteDescSets[idx].pImageInfo = &downsampleImageInfos[m_DepthMipLevelCount]; + } + + depthDownsampleWriteDescSets[14].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + depthDownsampleWriteDescSets[14].pNext = nullptr; + depthDownsampleWriteDescSets[14].descriptorCount = 1; + depthDownsampleWriteDescSets[14].dstArrayElement = 0; + depthDownsampleWriteDescSets[14].dstSet = m_DepthDownsampleDescriptorSet; + depthDownsampleWriteDescSets[14].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER; + depthDownsampleWriteDescSets[14].dstBinding = 2; + depthDownsampleWriteDescSets[14].pTexelBufferView = &m_AtomicCounterUAV; + + vkUpdateDescriptorSets(m_pDevice->GetDevice(), _countof(depthDownsampleWriteDescSets), depthDownsampleWriteDescSets, 0, nullptr); + + // Initial layout transitions + Barriers(cb, { + Transition(m_NormalHistoryBuffer.Resource(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT), + Transition(m_SpecularRoughnessHistory.Resource(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT), + Transition(m_DepthHierarchy.Resource(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT, m_DepthMipLevelCount), + Transition(m_DownSample.GetTexture()->Resource(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT, 6), + Transition(m_SssrOutputBuffer.Resource(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT), + }); + + SubmitCommandBuffer(cb); +} + +//-------------------------------------------------------------------------------------- +// +// OnDestroyWindowSizeDependentResources +// +//-------------------------------------------------------------------------------------- +void SampleRenderer::OnDestroyWindowSizeDependentResources() +{ + m_Bloom.OnDestroyWindowSizeDependentResources(); + m_DownSample.OnDestroyWindowSizeDependentResources(); + + m_MotionVectors.OnDestroy(); + m_SpecularRoughness.OnDestroy(); + m_SpecularRoughnessHistory.OnDestroy(); + m_NormalBuffer.OnDestroy(); + m_NormalHistoryBuffer.OnDestroy(); + m_SssrOutputBuffer.OnDestroy(); + + VkDevice device = m_pDevice->GetDevice(); + + vkDestroyImageView(device, m_SssrSceneSRV, nullptr); + vkDestroyImageView(device, m_SssrDepthBufferHierarchySRV, nullptr); + vkDestroyImageView(device, m_SssrMotionBufferSRV, nullptr); + vkDestroyImageView(device, m_SssrNormalBufferSRV, nullptr); + vkDestroyImageView(device, m_SssrRoughnessBufferSRV, nullptr); + vkDestroyImageView(device, m_SssrNormalHistoryBufferSRV, nullptr); + vkDestroyImageView(device, m_SssrRoughnessHistoryBufferSRV, nullptr); + vkDestroyImageView(device, m_SssrOutputBufferUAV, nullptr); + vkDestroyImageView(device, m_ApplyPipelineRTV, nullptr); + vkDestroyImageView(device, m_DepthBufferSRV, nullptr); + for (int i = 0; i < 13; ++i) + { + if (m_DepthHierarchyDescriptors[i] != VK_NULL_HANDLE) + { + vkDestroyImageView(device, m_DepthHierarchyDescriptors[i], nullptr); + } + 
m_DepthHierarchyDescriptors[i] = VK_NULL_HANDLE; + } + vkDestroyImageView(device, m_HDRSRV, nullptr); + vkDestroyImageView(device, m_SpecularRoughnessSRV, nullptr); + vkDestroyImageView(device, m_NormalBufferSRV, nullptr); + vkDestroyImageView(device, m_MotionVectorsSRV, nullptr); + vkDestroyImageView(device, m_DepthBufferDSV, nullptr); + vkDestroyBufferView(device, m_AtomicCounterUAV, nullptr); + + if (m_SssrCreatedReflectionView) + { + ffxSssrDestroyReflectionView(m_SssrContext, m_SssrReflectionView); + } + + m_HDR.OnDestroy(); + m_DepthBuffer.OnDestroy(); + m_DepthHierarchy.OnDestroy(); + + vkDestroyFramebuffer(device, m_FramebufferHDR, nullptr); + vkDestroyFramebuffer(device, m_FramebufferPBR, nullptr); + vkDestroyFramebuffer(device, m_FramebufferMV, nullptr); + vkDestroyFramebuffer(device, m_FramebufferApply, nullptr); + + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_AtomicCounter, m_AtomicCounterAllocation); +} + +//-------------------------------------------------------------------------------------- +// +// LoadScene +// +//-------------------------------------------------------------------------------------- +int SampleRenderer::LoadScene(GLTFCommon *pGLTFCommon, int stage) +{ + // show loading progress + // + ImGui::OpenPopup("Loading"); + if (ImGui::BeginPopupModal("Loading", NULL, ImGuiWindowFlags_AlwaysAutoResize)) + { + float progress = (float)stage / 13.0f; + ImGui::ProgressBar(progress, ImVec2(0.f, 0.f), NULL); + ImGui::EndPopup(); + } + + AsyncPool* pAsyncPool = &m_AsyncPool; + + // Loading stages + // + if (stage == 0) + { + } + else if (stage == 5) + { + Profile p("m_pGltfLoader->Load"); + + m_pGLTFTexturesAndBuffers = new GLTFTexturesAndBuffers(); + m_pGLTFTexturesAndBuffers->OnCreate(m_pDevice, pGLTFCommon, &m_UploadHeap, &m_VidMemBufferPool, &m_ConstantBufferRing); + } + else if (stage == 6) + { + Profile p("LoadTextures"); + + // here we are loading onto the GPU all the textures and the inverse matrices + // this data will be used to create the PBR and Depth passes + m_pGLTFTexturesAndBuffers->LoadTextures(pAsyncPool); + } + else if (stage == 7) + { + Profile p("m_gltfDepth->OnCreate"); + + //create the glTF's textures, VBs, IBs, shaders and descriptors for this particular pass + m_gltfDepth = new GltfDepthPass(); + m_gltfDepth->OnCreate( + m_pDevice, + m_RenderPassShadow, + &m_UploadHeap, + &m_ResourceViewHeaps, + &m_ConstantBufferRing, + &m_VidMemBufferPool, + m_pGLTFTexturesAndBuffers, + pAsyncPool + ); + } + else if (stage == 8) + { + Profile p("m_gltfMotionVectors->OnCreate"); + + m_gltfMotionVectors = new GltfMotionVectorsPass(); + m_gltfMotionVectors->OnCreate( + m_pDevice, + m_RenderPassMV, + &m_UploadHeap, + &m_ResourceViewHeaps, + &m_ConstantBufferRing, + &m_VidMemBufferPool, + m_pGLTFTexturesAndBuffers, + m_MotionVectors.GetFormat(), + m_NormalBuffer.GetFormat(), + pAsyncPool + ); + } + else if (stage == 9) + { + Profile p("m_gltfPBR->OnCreate"); + + // same thing as above but for the PBR pass + m_gltfPBR = new GltfPbrPass(); + m_gltfPBR->OnCreate( + m_pDevice, + m_RenderPassPBR, + &m_UploadHeap, + &m_ResourceViewHeaps, + &m_ConstantBufferRing, + &m_VidMemBufferPool, + m_pGLTFTexturesAndBuffers, + &m_AmbientLight, + false, + m_ShadowMapSRV, + true, true, true, false, + VK_SAMPLE_COUNT_1_BIT, + pAsyncPool + ); + + +#if (USE_VID_MEM==true) + // we are borrowing the upload heap command list for uploading to the GPU the IBs and VBs + m_VidMemBufferPool.UploadData(m_UploadHeap.GetCommandList()); + m_UploadHeap.FlushAndFinish(); +#endif + } + else if (stage == 
10) + { + Profile p("m_gltfBBox->OnCreate"); + + // just a bounding box pass that will draw boundingboxes instead of the geometry itself + m_gltfBBox = new GltfBBoxPass(); + m_gltfBBox->OnCreate( + m_pDevice, + m_RenderPassHDR, + &m_ResourceViewHeaps, + &m_ConstantBufferRing, + &m_VidMemBufferPool, + m_pGLTFTexturesAndBuffers, + &m_Wireframe + ); +#if (USE_VID_MEM==true) + // we are borrowing the upload heap command list for uploading to the GPU the IBs and VBs + m_VidMemBufferPool.UploadData(m_UploadHeap.GetCommandList()); + m_UploadHeap.FlushAndFinish(); +#endif + } + else if (stage == 11) + { + Profile p("Flush"); + + m_UploadHeap.FlushAndFinish(); + +#if (USE_VID_MEM==true) + //once everything is uploaded we dont need he upload heaps anymore + m_VidMemBufferPool.FreeUploadHeap(); +#endif + + // tell caller that we are done loading the map + return 0; + } + + stage++; + return stage; +} + +//-------------------------------------------------------------------------------------- +// +// UnloadScene +// +//-------------------------------------------------------------------------------------- +void SampleRenderer::UnloadScene() +{ + if (m_gltfPBR) + { + m_gltfPBR->OnDestroy(); + delete m_gltfPBR; + m_gltfPBR = NULL; + } + + if (m_gltfMotionVectors) + { + m_gltfMotionVectors->OnDestroy(); + delete m_gltfMotionVectors; + m_gltfMotionVectors = NULL; + } + + if (m_gltfDepth) + { + m_gltfDepth->OnDestroy(); + delete m_gltfDepth; + m_gltfDepth = NULL; + } + + if (m_gltfBBox) + { + m_gltfBBox->OnDestroy(); + delete m_gltfBBox; + m_gltfBBox = NULL; + } + + if (m_pGLTFTexturesAndBuffers) + { + m_pGLTFTexturesAndBuffers->OnDestroy(); + delete m_pGLTFTexturesAndBuffers; + m_pGLTFTexturesAndBuffers = NULL; + } +} + +void SampleRenderer::CreateApplyReflectionsPipeline() +{ + VkDevice device = m_pDevice->GetDevice(); + + VkDescriptorSetLayoutBinding bindings[6]; + bindings[0].binding = 0; + bindings[0].descriptorCount = 1; + bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT; + bindings[0].pImmutableSamplers = nullptr; + + bindings[1].binding = 1; + bindings[1].descriptorCount = 1; + bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + bindings[1].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT; + bindings[1].pImmutableSamplers = nullptr; + + bindings[2].binding = 2; + bindings[2].descriptorCount = 1; + bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + bindings[2].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT; + bindings[2].pImmutableSamplers = nullptr; + + bindings[3].binding = 3; + bindings[3].descriptorCount = 1; + bindings[3].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + bindings[3].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT; + bindings[3].pImmutableSamplers = nullptr; + + bindings[4].binding = 4; + bindings[4].descriptorCount = 1; + bindings[4].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; + bindings[4].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT; + bindings[4].pImmutableSamplers = nullptr; + + bindings[5].binding = 5; + bindings[5].descriptorCount = 1; + bindings[5].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + bindings[5].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT; + bindings[5].pImmutableSamplers = nullptr; + + VkDescriptorSetLayoutCreateInfo descSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; + 
descSetLayoutCreateInfo.pNext = nullptr; + descSetLayoutCreateInfo.bindingCount = _countof(bindings); + descSetLayoutCreateInfo.pBindings = bindings; + descSetLayoutCreateInfo.flags = 0; + + if (VK_SUCCESS != vkCreateDescriptorSetLayout(device, &descSetLayoutCreateInfo, nullptr, &m_ApplyPipelineDescriptorSetLayout)) + { + Trace("Failed to create set layout for apply reflections pipeline."); + } + + VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; + pipelineLayoutCreateInfo.flags = 0; + pipelineLayoutCreateInfo.pNext = nullptr; + pipelineLayoutCreateInfo.setLayoutCount = 1; + pipelineLayoutCreateInfo.pSetLayouts = &m_ApplyPipelineDescriptorSetLayout; + pipelineLayoutCreateInfo.pushConstantRangeCount = 0; + pipelineLayoutCreateInfo.pPushConstantRanges = nullptr; + + if (VK_SUCCESS != vkCreatePipelineLayout(device, &pipelineLayoutCreateInfo, nullptr, &m_ApplyPipelineLayout)) + { + Trace("Failed to create pipeline layout for apply reflections pipeline."); + } + + DefineList defines; + VkPipelineShaderStageCreateInfo vs, fs; + VKCompileFromFile(device, VK_SHADER_STAGE_VERTEX_BIT, "ApplyReflections.hlsl", "vs_main", "-T vs_6_0", &defines, &vs); + VKCompileFromFile(device, VK_SHADER_STAGE_FRAGMENT_BIT, "ApplyReflections.hlsl", "ps_main", "-T ps_6_0", &defines, &fs); + + VkPipelineVertexInputStateCreateInfo vertexInputStateInfo = { VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO }; + vertexInputStateInfo.pNext = nullptr; + vertexInputStateInfo.flags = 0; + vertexInputStateInfo.vertexBindingDescriptionCount = 0; + vertexInputStateInfo.pVertexBindingDescriptions = nullptr; + vertexInputStateInfo.vertexAttributeDescriptionCount = 0; + vertexInputStateInfo.pVertexAttributeDescriptions = nullptr; + + VkPipelineColorBlendAttachmentState pipelineColorBlendAttachmentState = {}; + pipelineColorBlendAttachmentState.blendEnable = VK_TRUE; + pipelineColorBlendAttachmentState.srcColorBlendFactor = VK_BLEND_FACTOR_ONE; + pipelineColorBlendAttachmentState.dstColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA; + pipelineColorBlendAttachmentState.colorBlendOp = VK_BLEND_OP_ADD; + pipelineColorBlendAttachmentState.srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE; + pipelineColorBlendAttachmentState.dstAlphaBlendFactor = VK_BLEND_FACTOR_ONE; + pipelineColorBlendAttachmentState.alphaBlendOp = VK_BLEND_OP_ADD; + pipelineColorBlendAttachmentState.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + + VkPipelineColorBlendStateCreateInfo colorBlendStateCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO }; + colorBlendStateCreateInfo.pNext = nullptr; + colorBlendStateCreateInfo.flags = 0; + colorBlendStateCreateInfo.logicOpEnable = false; + colorBlendStateCreateInfo.attachmentCount = 1; + colorBlendStateCreateInfo.pAttachments = &pipelineColorBlendAttachmentState; + + VkDynamicState dynamicStates[] = { VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }; + VkPipelineDynamicStateCreateInfo pipelineDynamicStateInfo = { VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO }; + pipelineDynamicStateInfo.pNext = nullptr; + pipelineDynamicStateInfo.flags = 0; + pipelineDynamicStateInfo.dynamicStateCount = _countof(dynamicStates); + pipelineDynamicStateInfo.pDynamicStates = dynamicStates; + + VkPipelineRasterizationStateCreateInfo pipelineRasterizationStateCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO }; + pipelineRasterizationStateCreateInfo.pNext = 
nullptr; + pipelineRasterizationStateCreateInfo.flags = 0; + pipelineRasterizationStateCreateInfo.depthClampEnable = VK_FALSE; + pipelineRasterizationStateCreateInfo.rasterizerDiscardEnable = VK_FALSE; + pipelineRasterizationStateCreateInfo.polygonMode = VK_POLYGON_MODE_FILL; + pipelineRasterizationStateCreateInfo.cullMode = VK_CULL_MODE_NONE; + pipelineRasterizationStateCreateInfo.frontFace = VK_FRONT_FACE_CLOCKWISE; + pipelineRasterizationStateCreateInfo.depthBiasEnable = VK_FALSE; + pipelineRasterizationStateCreateInfo.depthBiasConstantFactor = 0; + pipelineRasterizationStateCreateInfo.depthBiasClamp = 0; + pipelineRasterizationStateCreateInfo.depthBiasSlopeFactor = 0; + pipelineRasterizationStateCreateInfo.lineWidth = 0; + + VkPipelineMultisampleStateCreateInfo multisampleStateInfo = { VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO }; + multisampleStateInfo.pNext = nullptr; + multisampleStateInfo.flags = 0; + multisampleStateInfo.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + multisampleStateInfo.sampleShadingEnable = VK_FALSE; + multisampleStateInfo.minSampleShading = 0; + multisampleStateInfo.pSampleMask = nullptr; + multisampleStateInfo.alphaToCoverageEnable = VK_FALSE; + multisampleStateInfo.alphaToOneEnable = VK_FALSE; + + VkPipelineViewportStateCreateInfo viewportStateInfo = {}; + viewportStateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportStateInfo.pNext = nullptr; + viewportStateInfo.flags = 0; + viewportStateInfo.viewportCount = 1; + viewportStateInfo.scissorCount = 1; + viewportStateInfo.pScissors = nullptr; + viewportStateInfo.pViewports = nullptr; + + VkPipelineInputAssemblyStateCreateInfo inputAssemblyState = { VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO }; + inputAssemblyState.pNext = nullptr; + inputAssemblyState.flags = 0; + inputAssemblyState.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + inputAssemblyState.primitiveRestartEnable = VK_FALSE; + + VkAttachmentDescription colorAttachments[1]; + // m_HDR + AttachBlending(VK_FORMAT_R16G16B16A16_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, &colorAttachments[0]); + m_RenderPassApply = CreateRenderPassOptimal(m_pDevice->GetDevice(), _countof(colorAttachments), colorAttachments, nullptr); + + VkPipelineShaderStageCreateInfo stages[] = { vs, fs }; + + VkGraphicsPipelineCreateInfo pipelineCreateInfo = { VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO }; + pipelineCreateInfo.pNext = nullptr; + pipelineCreateInfo.flags = 0; + pipelineCreateInfo.basePipelineHandle = VK_NULL_HANDLE; + pipelineCreateInfo.basePipelineIndex = 0; + pipelineCreateInfo.layout = m_ApplyPipelineLayout; + pipelineCreateInfo.pColorBlendState = &colorBlendStateCreateInfo; + pipelineCreateInfo.pDepthStencilState = nullptr; + pipelineCreateInfo.pDynamicState = &pipelineDynamicStateInfo; + pipelineCreateInfo.pInputAssemblyState = &inputAssemblyState; + pipelineCreateInfo.pMultisampleState = &multisampleStateInfo; + pipelineCreateInfo.pRasterizationState = &pipelineRasterizationStateCreateInfo; + pipelineCreateInfo.stageCount = _countof(stages); + pipelineCreateInfo.pStages = stages; + pipelineCreateInfo.pTessellationState = nullptr; + pipelineCreateInfo.pVertexInputState = &vertexInputStateInfo; + pipelineCreateInfo.pViewportState = &viewportStateInfo; + pipelineCreateInfo.renderPass = m_RenderPassApply; + pipelineCreateInfo.subpass = 0; + + if (VK_SUCCESS != vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineCreateInfo, 
nullptr, &m_ApplyPipeline)) + { + Trace("Failed to create pipeline for the apply reflection target pass."); + } + + for (int i = 0; i < backBufferCount; ++i) + { + m_ResourceViewHeaps.AllocDescriptor(m_ApplyPipelineDescriptorSetLayout, &m_ApplyPipelineDescriptorSet[i]); + } +} + +void SampleRenderer::CreateDepthDownsamplePipeline() +{ + VkDevice device = m_pDevice->GetDevice(); + + VkDescriptorSetLayoutBinding bindings[3]; + bindings[0].binding = 0; + bindings[0].descriptorCount = 1; + bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[0].pImmutableSamplers = nullptr; + + bindings[1].binding = 1; + bindings[1].descriptorCount = 13; + bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[1].pImmutableSamplers = nullptr; + + bindings[2].binding = 2; + bindings[2].descriptorCount = 1; + bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER; + bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[2].pImmutableSamplers = nullptr; + + VkDescriptorSetLayoutCreateInfo descSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; + descSetLayoutCreateInfo.pNext = nullptr; + descSetLayoutCreateInfo.bindingCount = _countof(bindings); + descSetLayoutCreateInfo.pBindings = bindings; + descSetLayoutCreateInfo.flags = 0; + + if (VK_SUCCESS != vkCreateDescriptorSetLayout(device, &descSetLayoutCreateInfo, nullptr, &m_DepthDownsampleDescriptorSetLayout)) + { + Trace("Failed to create descriptor set layout for depth downsampling pipeline."); + } + + VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; + pipelineLayoutCreateInfo.flags = 0; + pipelineLayoutCreateInfo.pNext = nullptr; + pipelineLayoutCreateInfo.setLayoutCount = 1; + pipelineLayoutCreateInfo.pSetLayouts = &m_DepthDownsampleDescriptorSetLayout; + pipelineLayoutCreateInfo.pushConstantRangeCount = 0; + pipelineLayoutCreateInfo.pPushConstantRanges = nullptr; + + if (VK_SUCCESS != vkCreatePipelineLayout(device, &pipelineLayoutCreateInfo, nullptr, &m_DepthDownsamplePipelineLayout)) + { + Trace("Failed to create pipeline layout for depth downsampling pipeline."); + } + + DefineList defines; + VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo; + VKCompileFromFile(device, VK_SHADER_STAGE_COMPUTE_BIT, "DepthDownsample.hlsl", "main", "-T cs_6_0", &defines, &pipelineShaderStageCreateInfo); + + VkComputePipelineCreateInfo pipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; + pipelineCreateInfo.pNext = nullptr; + pipelineCreateInfo.basePipelineHandle = VK_NULL_HANDLE; + pipelineCreateInfo.basePipelineIndex = 0; + pipelineCreateInfo.flags = 0; + pipelineCreateInfo.layout = m_DepthDownsamplePipelineLayout; + pipelineCreateInfo.stage = pipelineShaderStageCreateInfo; + + if (VK_SUCCESS != vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &pipelineCreateInfo, nullptr, &m_DepthDownsamplePipeline)) + { + Trace("Failed to create pipeline for depth downsampling pipeline."); + } + + m_ResourceViewHeaps.AllocDescriptor(m_DepthDownsampleDescriptorSetLayout, &m_DepthDownsampleDescriptorSet); +} + +void SampleRenderer::StallFrame(float targetFrametime) +{ + // Simulate lower frame rates + static std::chrono::system_clock::time_point last = std::chrono::system_clock::now(); + std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); + std::chrono::duration diff = now - 
last;
+    last = now;
+    float deltaTime = 1000 * static_cast<float>(diff.count());
+    if (deltaTime < targetFrametime)
+    {
+        int deltaCount = static_cast<int>(targetFrametime - deltaTime);
+        std::this_thread::sleep_for(std::chrono::milliseconds(deltaCount));
+    }
+}
+
+void SampleRenderer::BeginFrame(VkCommandBuffer cb)
+{
+    m_CurrentFrame = (m_CurrentFrame + 1) % backBufferCount;
+    FfxSssrStatus status = ffxSssrAdvanceToNextFrame(m_SssrContext);
+    if (status != FFX_SSSR_STATUS_OK)
+    {
+        Trace("ffxSssrAdvanceToNextFrame failed.");
+    }
+
+    // Timing values
+    //
+    double nanosecondsBetweenGPUTicks = m_pDevice->GetPhysicalDeviceProperries().limits.timestampPeriod;
+    m_MillisecondsBetweenGpuTicks = 1e-6 * nanosecondsBetweenGPUTicks;
+
+    // Let our resource managers do some house keeping
+    //
+    m_ConstantBufferRing.OnBeginFrame();
+    m_GPUTimer.OnBeginFrame(cb, &m_TimeStamps);
+}
+
+VkBufferMemoryBarrier SampleRenderer::BufferBarrier(VkBuffer buffer)
+{
+    VkBufferMemoryBarrier barrier = { VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER };
+    barrier.pNext = nullptr;
+    barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
+    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.buffer = buffer;
+    barrier.offset = 0;
+    barrier.size = VK_WHOLE_SIZE;
+    return barrier;
+}
+
+VkImageMemoryBarrier SampleRenderer::Transition(VkImage image, VkImageLayout before, VkImageLayout after, VkImageAspectFlags aspectMask, int mipCount)
+{
+    VkImageMemoryBarrier barrier = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER };
+    barrier.pNext = nullptr;
+    barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_MEMORY_WRITE_BIT;
+    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_MEMORY_READ_BIT;
+    barrier.oldLayout = before;
+    barrier.newLayout = after;
+    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.image = image;
+
+    VkImageSubresourceRange subresourceRange = {};
+    subresourceRange.aspectMask = aspectMask; // VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT;
+    subresourceRange.baseArrayLayer = 0;
+    subresourceRange.layerCount = 1;
+    subresourceRange.baseMipLevel = 0;
+    subresourceRange.levelCount = mipCount;
+
+    barrier.subresourceRange = subresourceRange;
+    return barrier;
+}
+
+void SampleRenderer::Barriers(VkCommandBuffer cb, const std::vector<VkImageMemoryBarrier>& imageBarriers)
+{
+    vkCmdPipelineBarrier(cb,
+        VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+        VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+        0,
+        0, nullptr,
+        0, nullptr,
+        static_cast<uint32_t>(imageBarriers.size()), imageBarriers.data());
+}
+
+VkCommandBuffer SampleRenderer::BeginNewCommandBuffer()
+{
+    VkCommandBuffer cb = m_CommandListRing.GetNewCommandList();
+    VkCommandBufferBeginInfo commandBufferBeginInfo = {};
+    commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    commandBufferBeginInfo.pNext = NULL;
+    commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+    commandBufferBeginInfo.pInheritanceInfo = NULL;
+    VkResult res = vkBeginCommandBuffer(cb, &commandBufferBeginInfo);
+    assert(res == VK_SUCCESS);
+    return cb;
+}
+
+void SampleRenderer::SubmitCommandBuffer(VkCommandBuffer cb, VkSemaphore* waitSemaphore, VkSemaphore* signalSemaphores, VkFence fence)
+{
+    VkResult res = vkEndCommandBuffer(cb);
+    assert(res == VK_SUCCESS);
+
+    VkPipelineStageFlags submitWaitStage =
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + + VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; + submitInfo.pNext = NULL; + submitInfo.waitSemaphoreCount = waitSemaphore ? 1 : 0; + submitInfo.pWaitSemaphores = waitSemaphore; + submitInfo.pWaitDstStageMask = waitSemaphore ? &submitWaitStage : NULL; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &cb; + submitInfo.signalSemaphoreCount = signalSemaphores ? 1 : 0; + submitInfo.pSignalSemaphores = signalSemaphores; + res = vkQueueSubmit(m_pDevice->GetGraphicsQueue(), 1, &submitInfo, fence); + assert(res == VK_SUCCESS); +} + +per_frame * SampleRenderer::FillFrameConstants(State *pState) +{ + // Sets the perFrame data (Camera and lights data), override as necessary and set them as constant buffers -------------- + // + per_frame *pPerFrame = NULL; + if (m_pGLTFTexturesAndBuffers) + { + pPerFrame = m_pGLTFTexturesAndBuffers->m_pGLTFCommon->SetPerFrameData(pState->camera); + + //override gltf camera with ours + pPerFrame->mCameraViewProj = pState->camera.GetView() * pState->camera.GetProjection(); + pPerFrame->cameraPos = pState->camera.GetPosition(); + pPerFrame->emmisiveFactor = pState->emmisiveFactor; + pPerFrame->iblFactor = pState->iblFactor; + + //if the gltf doesn't have any lights set a directional light + if (pPerFrame->lightCount == 0) + { + pPerFrame->lightCount = 1; + pPerFrame->lights[0].color[0] = pState->lightColor.x; + pPerFrame->lights[0].color[1] = pState->lightColor.y; + pPerFrame->lights[0].color[2] = pState->lightColor.z; + GetXYZ(pPerFrame->lights[0].position, pState->lightCamera.GetPosition()); + GetXYZ(pPerFrame->lights[0].direction, pState->lightCamera.GetDirection()); + + pPerFrame->lights[0].range = 30.0f; // in meters + pPerFrame->lights[0].type = LightType_Spot; + pPerFrame->lights[0].intensity = pState->lightIntensity; + pPerFrame->lights[0].innerConeCos = cosf(pState->lightCamera.GetFovV() * 0.9f / 2.0f); + pPerFrame->lights[0].outerConeCos = cosf(pState->lightCamera.GetFovV() / 2.0f); + pPerFrame->lights[0].mLightViewProj = pState->lightCamera.GetView() * pState->lightCamera.GetProjection(); + } + + // Up to 4 spotlights can have shadowmaps. 
Each spot the light has a shadowMap index which is used to find the shadowmap in the atlas + uint32_t shadowMapIndex = 0; + for (uint32_t i = 0; i < pPerFrame->lightCount; i++) + { + if ((shadowMapIndex < 4) && (pPerFrame->lights[i].type == LightType_Spot)) + { + pPerFrame->lights[i].shadowMapIndex = shadowMapIndex++; // set the shadowmap index so the color pass knows which shadow map to use + pPerFrame->lights[i].depthBias = 20.0f / 100000.0f; + } + else if ((shadowMapIndex < 4) && (pPerFrame->lights[i].type == LightType_Directional)) + { + pPerFrame->lights[i].shadowMapIndex = shadowMapIndex++; // same as above + pPerFrame->lights[i].depthBias = 100.0f / 100000.0f; + } + else + { + pPerFrame->lights[i].shadowMapIndex = -1; // no shadow for this light + } + } + + m_pGLTFTexturesAndBuffers->SetPerFrameConstants(); + + m_pGLTFTexturesAndBuffers->SetSkinningMatricesForSkeletons(); + } + + return pPerFrame; +} + +void SampleRenderer::RenderSpotLights(VkCommandBuffer cb, per_frame * pPerFrame) +{ + VkClearValue clearValue = {}; + clearValue.depthStencil.depth = 1; + clearValue.depthStencil.stencil = 0; + + VkRenderPassBeginInfo beginInfo = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO }; + beginInfo.pNext = nullptr; + beginInfo.clearValueCount = 1; + beginInfo.pClearValues = &clearValue; + beginInfo.renderArea = { 0, 0, m_ShadowMap.GetWidth(), m_ShadowMap.GetHeight() }; + beginInfo.renderPass = m_RenderPassShadow; + beginInfo.framebuffer = m_FramebufferShadows; + vkCmdBeginRenderPass(cb, &beginInfo, VK_SUBPASS_CONTENTS_INLINE); + + for (uint32_t i = 0; i < pPerFrame->lightCount; i++) + { + if (!(pPerFrame->lights[i].type == LightType_Spot || pPerFrame->lights[i].type == LightType_Directional)) + continue; + + // Set the RT's quadrant where to render the shadomap (these viewport offsets need to match the ones in shadowFiltering.h) + uint32_t viewportOffsetsX[4] = { 0, 1, 0, 1 }; + uint32_t viewportOffsetsY[4] = { 0, 0, 1, 1 }; + uint32_t viewportWidth = m_ShadowMap.GetWidth() / 2; + uint32_t viewportHeight = m_ShadowMap.GetHeight() / 2; + SetViewportAndScissor(cb, viewportOffsetsX[i] * viewportWidth, viewportOffsetsY[i] * viewportHeight, viewportWidth, viewportHeight); + + GltfDepthPass::per_frame *cbDepthPerFrame = m_gltfDepth->SetPerFrameConstants(); + cbDepthPerFrame->mViewProj = pPerFrame->lights[i].mLightViewProj; + + m_gltfDepth->Draw(cb); + + m_GPUTimer.GetTimeStamp(cb, "Shadow map"); + } + + vkCmdEndRenderPass(cb); +} + +void SampleRenderer::RenderMotionVectors(VkCommandBuffer cb, per_frame * pPerFrame, State * pState) +{ + vkCmdSetViewport(cb, 0, 1, &m_Viewport); + vkCmdSetScissor(cb, 0, 1, &m_Scissor); + + GltfMotionVectorsPass::per_frame *cbDepthPerFrame = m_gltfMotionVectors->SetPerFrameConstants(); + cbDepthPerFrame->mCurrViewProj = pPerFrame->mCameraViewProj; + cbDepthPerFrame->mPrevViewProj = pState->camera.GetPrevView() * pState->camera.GetProjection(); + + m_gltfMotionVectors->Draw(cb); + m_GPUTimer.GetTimeStamp(cb, "Motion vectors"); +} + + +void SampleRenderer::RenderSkydome(VkCommandBuffer cb, per_frame * pPerFrame, State * pState) +{ + VkClearValue clearValues[1]; + clearValues[0].color.float32[0] = 0; + clearValues[0].color.float32[1] = 0; + clearValues[0].color.float32[2] = 0; + clearValues[0].color.float32[3] = 0; + + VkRenderPassBeginInfo beginInfo = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO }; + beginInfo.pNext = nullptr; + beginInfo.clearValueCount = _countof(clearValues); + beginInfo.pClearValues = clearValues; + beginInfo.renderArea = { 0, 0, m_Width, m_Height }; + 
beginInfo.renderPass = m_RenderPassClearHDR; + beginInfo.framebuffer = m_FramebufferHDR; + vkCmdBeginRenderPass(cb, &beginInfo, VK_SUBPASS_CONTENTS_INLINE); + + vkCmdSetViewport(cb, 0, 1, &m_Viewport); + vkCmdSetScissor(cb, 0, 1, &m_Scissor); + + if (pState->skyDomeType == 1) + { + XMMATRIX clipToView = XMMatrixInverse(NULL, pPerFrame->mCameraViewProj); + m_SkyDome.Draw(cb, clipToView); + m_GPUTimer.GetTimeStamp(cb, "Skydome"); + } + else if (pState->skyDomeType == 0) + { + SkyDomeProc::Constants skyDomeConstants; + skyDomeConstants.invViewProj = XMMatrixInverse(NULL, pPerFrame->mCameraViewProj); + skyDomeConstants.vSunDirection = XMVectorSet(1.0f, 0.05f, 0.0f, 0.0f); + skyDomeConstants.turbidity = 10.0f; + skyDomeConstants.rayleigh = 2.0f; + skyDomeConstants.mieCoefficient = 0.005f; + skyDomeConstants.mieDirectionalG = 0.8f; + skyDomeConstants.luminance = 1.0f; + skyDomeConstants.sun = false; + m_SkyDomeProc.Draw(cb, skyDomeConstants); + m_GPUTimer.GetTimeStamp(cb, "Skydome proc"); + } + + vkCmdEndRenderPass(cb); +} + +void SampleRenderer::RenderScene(VkCommandBuffer cb) +{ + VkClearValue clearValues[2]; + clearValues[0].color.float32[0] = 0; + clearValues[0].color.float32[1] = 0; + clearValues[0].color.float32[2] = 0; + clearValues[0].color.float32[3] = 0; + clearValues[1].color.float32[0] = 1; + clearValues[1].color.float32[1] = 1; + clearValues[1].color.float32[2] = 1; + clearValues[1].color.float32[3] = 1; + + VkRenderPassBeginInfo beginInfo = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO }; + beginInfo.pNext = nullptr; + beginInfo.clearValueCount = _countof(clearValues); + beginInfo.pClearValues = clearValues; + beginInfo.renderArea = { 0, 0, m_Width, m_Height }; + beginInfo.renderPass = m_RenderPassPBR; + beginInfo.framebuffer = m_FramebufferPBR; + vkCmdBeginRenderPass(cb, &beginInfo, VK_SUBPASS_CONTENTS_INLINE); + + //set per frame constant buffer values + m_gltfPBR->Draw(cb); + + vkCmdEndRenderPass(cb); +} + +void SampleRenderer::RenderBoundingBoxes(VkCommandBuffer cb, per_frame * pPerFrame) +{ + m_gltfBBox->Draw(cb, pPerFrame->mCameraViewProj); + m_GPUTimer.GetTimeStamp(cb, "Bounding Box"); +} + + +void SampleRenderer::RenderLightFrustums(VkCommandBuffer cb, per_frame * pPerFrame, State * pState) +{ + SetPerfMarkerBegin(cb, "Light frustrums"); + + XMVECTOR vCenter = XMVectorSet(0.0f, 0.0f, 0.0f, 0.0f); + XMVECTOR vRadius = XMVectorSet(1.0f, 1.0f, 1.0f, 0.0f); + XMVECTOR vColor = XMVectorSet(1.0f, 1.0f, 1.0f, 1.0f); + for (uint32_t i = 0; i < pPerFrame->lightCount; i++) + { + XMMATRIX spotlightMatrix = XMMatrixInverse(NULL, pPerFrame->lights[i].mLightViewProj); + XMMATRIX worldMatrix = spotlightMatrix * pPerFrame->mCameraViewProj; + m_WireframeBox.Draw(cb, &m_Wireframe, worldMatrix, vCenter, vRadius, vColor); + } + + m_GPUTimer.GetTimeStamp(cb, "Light frustums"); + SetPerfMarkerEnd(cb); +} + + +void SampleRenderer::DownsampleDepthBuffer(VkCommandBuffer cb) +{ + // Clear m_AtomicCounter to 0 + vkCmdFillBuffer(cb, m_AtomicCounter, 0, VK_WHOLE_SIZE, 0); + + SetPerfMarkerBegin(cb, "Downsample Depth"); + + vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, m_DepthDownsamplePipeline); + vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, m_DepthDownsamplePipelineLayout, 0, 1, &m_DepthDownsampleDescriptorSet, 0, nullptr); + + // Each threadgroup works on 64x64 texels + uint32_t dimX = (m_Width + 63) / 64; + uint32_t dimY = (m_Height + 63) / 64; + vkCmdDispatch(cb, dimX, dimY, 1); + + m_GPUTimer.GetTimeStamp(cb, "Downsample Depth"); + SetPerfMarkerEnd(cb); +} + + +void 
SampleRenderer::RenderScreenSpaceReflections(VkCommandBuffer cb, State * pState) +{ + Barriers(cb, { + Transition(m_SssrOutputBuffer.Resource(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT), + Transition(m_DepthHierarchy.Resource(), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT, m_DepthMipLevelCount), + }); + + SetPerfMarkerBegin(cb, "FidelityFX SSSR"); + + const Camera * camera = &pState->camera; + XMMATRIX view = camera->GetView(); + XMMATRIX proj = camera->GetProjection(); + + XMFLOAT4X4 cameraView; + XMStoreFloat4x4(&cameraView, XMMatrixTranspose(view)); + XMFLOAT4X4 cameraProj; + XMStoreFloat4x4(&cameraProj, XMMatrixTranspose(proj)); + + FfxSssrStatus status; + status = ffxSssrReflectionViewSetCameraParameters(m_SssrContext, m_SssrReflectionView, &cameraView.m[0][0], &cameraProj.m[0][0]); + if (status != FFX_SSSR_STATUS_OK) + { + Trace("ffxSssrReflectionViewSetCameraParameters failed."); + } + + VkClearColorValue clearValue = {}; + clearValue.float32[0] = 0; + clearValue.float32[1] = 0; + clearValue.float32[2] = 0; + clearValue.float32[3] = 0; + + VkImageSubresourceRange subresourceRange = {}; + subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + subresourceRange.baseArrayLayer = 0; + subresourceRange.baseMipLevel = 0; + subresourceRange.layerCount = 1; + subresourceRange.levelCount = 1; + vkCmdClearColorImage(cb, m_SssrOutputBuffer.Resource(), VK_IMAGE_LAYOUT_GENERAL, &clearValue, 1, &subresourceRange); + + // Ensure the image is cleared + VkMemoryBarrier barrier = { VK_STRUCTURE_TYPE_MEMORY_BARRIER }; + barrier.pNext = nullptr; + barrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + vkCmdPipelineBarrier(cb, + VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + 0, + 1, &barrier, + 0, nullptr, + 0, nullptr); + + + FfxSssrVkCommandEncodeInfo vkEncodeInfo = {}; + vkEncodeInfo.commandBuffer = cb; + + FfxSssrResolveReflectionViewInfo resolveInfo = {}; + resolveInfo.flags = pState->bShowIntersectionResults ? 0 : FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_DENOISE; + resolveInfo.flags |= pState->bEnableVarianceGuidedTracing ? FFX_SSSR_RESOLVE_REFLECTION_VIEW_FLAG_ENABLE_VARIANCE_GUIDED_TRACING : 0; + resolveInfo.pVkCommandEncodeInfo = &vkEncodeInfo; + resolveInfo.temporalStabilityScale = pState->temporalStability; + resolveInfo.maxTraversalIterations = pState->maxTraversalIterations; + resolveInfo.mostDetailedDepthHierarchyMipLevel = pState->mostDetailedDepthHierarchyMipLevel; + resolveInfo.depthBufferThickness = pState->depthBufferThickness; + resolveInfo.minTraversalOccupancy = pState->minTraversalOccupancy; + resolveInfo.samplesPerQuad = pState->samplesPerQuad == 4 ? FFX_SSSR_RAY_SAMPLES_PER_QUAD_4 : (pState->samplesPerQuad == 2 ? 
FFX_SSSR_RAY_SAMPLES_PER_QUAD_2 : FFX_SSSR_RAY_SAMPLES_PER_QUAD_1);
+    resolveInfo.roughnessThreshold = pState->roughnessThreshold;
+
+    status = ffxSssrEncodeResolveReflectionView(m_SssrContext, m_SssrReflectionView, &resolveInfo);
+    if (status != FFX_SSSR_STATUS_OK)
+    {
+        Trace("ffxSssrEncodeResolveReflectionView failed.");
+    }
+
+    // Query timings
+    uint64_t tileClassificationTime;
+    status = ffxSssrReflectionViewGetTileClassificationElapsedTime(m_SssrContext, m_SssrReflectionView, &tileClassificationTime);
+    if (status != FFX_SSSR_STATUS_OK)
+    {
+        Trace("ffxSssrReflectionViewGetTileClassificationElapsedTime failed.");
+    }
+
+    static std::deque<float> tileClassificationTimes(100);
+    tileClassificationTimes.pop_front();
+    tileClassificationTimes.push_back(static_cast<float>(tileClassificationTime * m_MillisecondsBetweenGpuTicks));
+    pState->tileClassificationTime = 0;
+    for (auto& time : tileClassificationTimes)
+    {
+        pState->tileClassificationTime += time;
+    }
+    pState->tileClassificationTime /= tileClassificationTimes.size();
+
+    uint64_t intersectionTime;
+    status = ffxSssrReflectionViewGetIntersectionElapsedTime(m_SssrContext, m_SssrReflectionView, &intersectionTime);
+    if (status != FFX_SSSR_STATUS_OK)
+    {
+        Trace("ffxSssrReflectionViewGetIntersectionElapsedTime failed.");
+    }
+
+    static std::deque<float> intersectionTimes(100);
+    intersectionTimes.pop_front();
+    intersectionTimes.push_back(static_cast<float>(intersectionTime * m_MillisecondsBetweenGpuTicks));
+    pState->intersectionTime = 0;
+    for (auto& time : intersectionTimes)
+    {
+        pState->intersectionTime += time;
+    }
+    pState->intersectionTime /= intersectionTimes.size();
+
+    uint64_t denoisingTime;
+    status = ffxSssrReflectionViewGetDenoisingElapsedTime(m_SssrContext, m_SssrReflectionView, &denoisingTime);
+    if (status != FFX_SSSR_STATUS_OK)
+    {
+        Trace("ffxSssrReflectionViewGetDenoisingElapsedTime failed.");
+    }
+
+    static std::deque<float> denoisingTimes(100);
+    denoisingTimes.pop_front();
+    denoisingTimes.push_back(static_cast<float>(denoisingTime * m_MillisecondsBetweenGpuTicks));
+    pState->denoisingTime = 0;
+    for (auto& time : denoisingTimes)
+    {
+        pState->denoisingTime += time;
+    }
+    pState->denoisingTime /= denoisingTimes.size();
+
+    m_GPUTimer.GetTimeStamp(cb, "FidelityFX SSSR");
+    SetPerfMarkerEnd(cb);
+
+    Barriers(cb, {
+        Transition(m_SssrOutputBuffer.Resource(), VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT),
+        Transition(m_DepthHierarchy.Resource(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT, m_DepthMipLevelCount),
+        });
+}
+
+void SampleRenderer::CopyHistorySurfaces(VkCommandBuffer cb)
+{
+    Barriers(cb, {
+        Transition(m_NormalBuffer.Resource(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT),
+        Transition(m_SpecularRoughness.Resource(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT),
+        Transition(m_NormalHistoryBuffer.Resource(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT),
+        Transition(m_SpecularRoughnessHistory.Resource(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT),
+        });
+
+    SetPerfMarkerBegin(cb, "Copy History Normals and Roughness");
+    // Keep copy of normal roughness buffer for next frame
+    CopyToTexture(cb, &m_NormalBuffer, &m_NormalHistoryBuffer);
+    CopyToTexture(cb, &m_SpecularRoughness, &m_SpecularRoughnessHistory);
+
SetPerfMarkerEnd(cb); + + Barriers(cb, { + Transition(m_NormalBuffer.Resource(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT), + Transition(m_SpecularRoughness.Resource(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT), + Transition(m_NormalHistoryBuffer.Resource(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT), + Transition(m_SpecularRoughnessHistory.Resource(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT), + }); +} + +void SampleRenderer::ApplyReflectionTarget(VkCommandBuffer cb, State * pState) +{ + VkRenderPassBeginInfo beginInfo = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO }; + beginInfo.pNext = nullptr; + beginInfo.clearValueCount = 0; + beginInfo.pClearValues = nullptr; + beginInfo.renderArea = { 0, 0, m_Width, m_Height }; + beginInfo.renderPass = m_RenderPassApply; + beginInfo.framebuffer = m_FramebufferApply; + vkCmdBeginRenderPass(cb, &beginInfo, VK_SUBPASS_CONTENTS_INLINE); + + SetPerfMarkerBegin(cb, "Apply Reflection View"); + + struct PassConstants + { + XMFLOAT4 viewDir; + UINT showReflectionTarget; + UINT drawReflections; + } constants; + + XMVECTOR view = pState->camera.GetDirection(); + XMStoreFloat4(&constants.viewDir, view); + constants.showReflectionTarget = pState->showReflectionTarget ? 1 : 0; + constants.drawReflections = pState->bDrawScreenSpaceReflections ? 1 : 0; + + VkDescriptorBufferInfo uniformBufferInfo = m_ConstantBufferRing.AllocConstantBuffer(sizeof(PassConstants), &constants); + VkWriteDescriptorSet uniformBufferWriteDescSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + uniformBufferWriteDescSet.pNext = nullptr; + uniformBufferWriteDescSet.descriptorCount = 1; + uniformBufferWriteDescSet.dstArrayElement = 0; + uniformBufferWriteDescSet.dstSet = m_ApplyPipelineDescriptorSet[m_CurrentFrame]; + uniformBufferWriteDescSet.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + uniformBufferWriteDescSet.dstBinding = 5; + uniformBufferWriteDescSet.pBufferInfo = &uniformBufferInfo; + + vkUpdateDescriptorSets(m_pDevice->GetDevice(), 1, &uniformBufferWriteDescSet, 0, nullptr); + + vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_GRAPHICS, m_ApplyPipeline); + vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_GRAPHICS, m_ApplyPipelineLayout, 0, 1, &m_ApplyPipelineDescriptorSet[m_CurrentFrame], 0, nullptr); + vkCmdSetViewport(cb, 0, 1, &m_Viewport); + vkCmdSetScissor(cb, 0, 1, &m_Scissor); + + vkCmdDraw(cb, 3, 1, 0, 0); + + m_GPUTimer.GetTimeStamp(cb, "Apply Reflection View"); + SetPerfMarkerEnd(cb); + + vkCmdEndRenderPass(cb); +} + +void SampleRenderer::DownsampleScene(VkCommandBuffer cb) +{ + m_DownSample.Draw(cb); + m_GPUTimer.GetTimeStamp(cb, "Downsample"); +} + +void SampleRenderer::RenderBloom(VkCommandBuffer cb) +{ + m_Bloom.Draw(cb); + m_GPUTimer.GetTimeStamp(cb, "Bloom"); +} + +void SampleRenderer::ApplyTonemapping(VkCommandBuffer cb, State * pState, SwapChain *pSwapChain) +{ + vkCmdSetViewport(cb, 0, 1, &m_Viewport); + vkCmdSetScissor(cb, 0, 1, &m_Scissor); + + m_ToneMapping.Draw(cb, m_HDRSRV, pState->exposure, pState->toneMapper); + m_GPUTimer.GetTimeStamp(cb, "Tone mapping"); +} + +void SampleRenderer::RenderHUD(VkCommandBuffer cb, SwapChain *pSwapChain) +{ + vkCmdSetViewport(cb, 0, 1, &m_Viewport); + vkCmdSetScissor(cb, 0, 1, &m_Scissor); + + m_ImGUI.Draw(cb); + + m_GPUTimer.GetTimeStamp(cb, "ImGUI rendering"); +} + 
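+// Note: CopyToTexture below records a raw vkCmdCopyImage over the full m_Width x m_Height extent,
+// so source and target are assumed to share the same format and resolution, and the caller is
+// responsible for moving both images into TRANSFER_SRC/TRANSFER_DST_OPTIMAL first (as
+// CopyHistorySurfaces does with its surrounding Barriers calls).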
+void SampleRenderer::CopyToTexture(VkCommandBuffer cb, Texture * source, Texture * target)
+{
+    VkImageCopy region = {};
+    region.dstOffset = { 0, 0, 0 };
+    region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+    region.dstSubresource.baseArrayLayer = 0;
+    region.dstSubresource.layerCount = 1;
+    region.dstSubresource.mipLevel = 0;
+    region.extent = {m_Width, m_Height, 1};
+    region.srcOffset = {0, 0, 0};
+    region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+    region.srcSubresource.baseArrayLayer = 0;
+    region.srcSubresource.layerCount = 1;
+    region.srcSubresource.mipLevel = 0;
+    vkCmdCopyImage(cb, source->Resource(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, target->Resource(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
+}
+
+//--------------------------------------------------------------------------------------
+//
+// OnRender
+//
+//--------------------------------------------------------------------------------------
+void SampleRenderer::OnRender(State *pState, SwapChain *pSwapChain)
+{
+    StallFrame(pState->targetFrametime);
+
+    VkCommandBuffer cb1 = BeginNewCommandBuffer();
+    BeginFrame(cb1);
+
+    per_frame *pPerFrame = FillFrameConstants(pState);
+
+    // Clears happen in the render passes -----------------------------------------------------------------------
+
+    // Render to shadow map atlas for spot lights ------------------------------------------
+    //
+    if (m_gltfDepth && pPerFrame)
+    {
+        RenderSpotLights(cb1, pPerFrame);
+    }
+
+    VkClearValue clearValues[3];
+    clearValues[0].color.float32[0] = 0;
+    clearValues[0].color.float32[1] = 0;
+    clearValues[0].color.float32[2] = 0;
+    clearValues[0].color.float32[3] = 0;
+    clearValues[1].color.float32[0] = 0;
+    clearValues[1].color.float32[1] = 0;
+    clearValues[1].color.float32[2] = 0;
+    clearValues[1].color.float32[3] = 0;
+    clearValues[2].depthStencil.depth = 1;
+    clearValues[2].depthStencil.stencil = 0;
+
+    VkRenderPassBeginInfo beginInfo = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO };
+    beginInfo.pNext = nullptr;
+    beginInfo.clearValueCount = _countof(clearValues);
+    beginInfo.pClearValues = clearValues;
+    beginInfo.renderArea = { 0, 0, m_Width, m_Height };
+    beginInfo.renderPass = m_RenderPassMV;
+    beginInfo.framebuffer = m_FramebufferMV;
+    vkCmdBeginRenderPass(cb1, &beginInfo, VK_SUBPASS_CONTENTS_INLINE);
+
+    // Motion vectors ---------------------------------------------------------------------------
+    //
+    if (m_gltfMotionVectors && pPerFrame)
+    {
+        RenderMotionVectors(cb1, pPerFrame, pState);
+    }
+
+    vkCmdEndRenderPass(cb1);
+
+    // Render Scene to the HDR RT ------------------------------------------------
+    //
+
+    if (pPerFrame)
+    {
+        RenderSkydome(cb1, pPerFrame, pState);
+
+        // Render scene to color buffer
+        if (m_gltfPBR)
+        {
+            RenderScene(cb1);
+        }
+
+        beginInfo = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO };
+        beginInfo.pNext = nullptr;
+        beginInfo.clearValueCount = 0;
+        beginInfo.pClearValues = nullptr;
+        beginInfo.renderArea = { 0, 0, m_Width, m_Height };
+        beginInfo.renderPass = m_RenderPassHDR;
+        beginInfo.framebuffer = m_FramebufferHDR;
+        vkCmdBeginRenderPass(cb1, &beginInfo, VK_SUBPASS_CONTENTS_INLINE);
+
+        // Draw object bounding boxes
+        if (m_gltfBBox && pState->bDrawBoundingBoxes)
+        {
+            RenderBoundingBoxes(cb1, pPerFrame);
+        }
+
+        // Draw light frustum
+        if (pState->bDrawLightFrustum)
+        {
+            RenderLightFrustums(cb1, pPerFrame, pState);
+        }
+
+        vkCmdEndRenderPass(cb1);
+
+        m_GPUTimer.GetTimeStamp(cb1, "Rendering scene");
+    }
+
+    // Downsample depth buffer
+    if (m_gltfMotionVectors && pPerFrame)
+    {
+ DownsampleDepthBuffer(cb1); + } + + if (m_gltfPBR && pPerFrame) + { + // Stochastic SSR + RenderScreenSpaceReflections(cb1, pState); + + // Keep this frames results for next frame + CopyHistorySurfaces(cb1); + + // Apply the result of SSR + ApplyReflectionTarget(cb1, pState); + } + + if (pPerFrame && pState->bDrawBloom) + { + DownsampleScene(cb1); + RenderBloom(cb1); + } + + SubmitCommandBuffer(cb1); + + // Wait for swapchain (we are going to render to it) ----------------------------------- + // + int imageIndex = pSwapChain->WaitForSwapChain(); + m_CommandListRing.OnBeginFrame(); + + VkCommandBuffer cb2 = BeginNewCommandBuffer(); + + beginInfo = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO }; + beginInfo.pNext = nullptr; + beginInfo.clearValueCount = 0; + beginInfo.pClearValues = nullptr; + beginInfo.renderArea = { 0, 0, m_Width, m_Height }; + beginInfo.renderPass = pSwapChain->GetRenderPass(); + beginInfo.framebuffer = pSwapChain->GetFramebuffer(imageIndex); + vkCmdBeginRenderPass(cb2, &beginInfo, VK_SUBPASS_CONTENTS_INLINE); + + if (pPerFrame) + { + // Tonemapping + ApplyTonemapping(cb2, pState, pSwapChain); + } + + // Render HUD + RenderHUD(cb2, pSwapChain); + + m_GPUTimer.OnEndFrame(); + + vkCmdEndRenderPass(cb2); + + VkSemaphore imageAvailableSemaphore = VK_NULL_HANDLE; + VkSemaphore renderFinishedSemaphores = VK_NULL_HANDLE; + VkFence cmdBufExecutedFences = VK_NULL_HANDLE; + pSwapChain->GetSemaphores(&imageAvailableSemaphore, &renderFinishedSemaphores, &cmdBufExecutedFences); + + SubmitCommandBuffer(cb2, &imageAvailableSemaphore, &renderFinishedSemaphores, cmdBufExecutedFences); + + // Update previous camera matrices + pState->camera.UpdatePreviousMatrices(); +} diff --git a/sample/src/VK/Sources/SampleRenderer.h b/sample/src/VK/Sources/SampleRenderer.h new file mode 100644 index 0000000..47c0b21 --- /dev/null +++ b/sample/src/VK/Sources/SampleRenderer.h @@ -0,0 +1,256 @@ +// AMD SampleVK sample code +// +// Copyright(c) 2018 Advanced Micro Devices, Inc.All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +#pragma once + +#include + +// We are queuing (backBufferCount + 0.5) frames, so we need to triple buffer the resources that get modified each frame +static const int backBufferCount = 3; + +#define USE_VID_MEM true + +using namespace CAULDRON_VK; + +// +// This class deals with the GPU side of the sample. 
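+// It owns the render passes, framebuffers, pipelines and descriptor sets used by the sample,
+// as well as the FidelityFX SSSR context and reflection view, and records the command buffers
+// that OnRender submits each frame.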
+// + +class SampleRenderer +{ +public: + struct State + { + float time; + Camera camera; + + float exposure; + float emmisiveFactor; + float iblFactor; + float lightIntensity; + XMFLOAT3 lightColor; + Camera lightCamera; + + int toneMapper; + int skyDomeType; + bool bDrawBoundingBoxes; + bool bDrawLightFrustum; + bool bDrawBloom; + bool bDrawScreenSpaceReflections; + + float targetFrametime; + + bool bShowIntersectionResults; + float temporalStability; + int maxTraversalIterations; + int mostDetailedDepthHierarchyMipLevel; + float depthBufferThickness; + int minTraversalOccupancy; + int samplesPerQuad; + bool bEnableVarianceGuidedTracing; + float roughnessThreshold; + + float tileClassificationTime; + float intersectionTime; + float denoisingTime; + + bool showReflectionTarget; + bool isBenchmarking; + }; + + void OnCreate(Device *pDevice, SwapChain *pSwapChain); + void OnDestroy(); + + void OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, uint32_t Width, uint32_t Height); + void OnDestroyWindowSizeDependentResources(); + + int LoadScene(GLTFCommon *pGLTFCommon, int stage = 0); + void UnloadScene(); + + const std::vector &GetTimingValues() { return m_TimeStamps; } + + void OnRender(State *pState, SwapChain *pSwapChain); + +private: + void CreateApplyReflectionsPipeline(); + void CreateDepthDownsamplePipeline(); + void StallFrame(float targetFrametime); + void BeginFrame(VkCommandBuffer cb); + VkBufferMemoryBarrier BufferBarrier(VkBuffer buffer); + VkImageMemoryBarrier Transition(VkImage image, VkImageLayout before, VkImageLayout after, VkImageAspectFlags aspectMask, int mipCount = 1); + void Barriers(VkCommandBuffer cb, const std::vector& imageBarriers); + + VkCommandBuffer BeginNewCommandBuffer(); + void SubmitCommandBuffer(VkCommandBuffer cb, VkSemaphore* waitSemaphore = NULL, VkSemaphore* signalSemaphores = NULL, VkFence fence = VK_NULL_HANDLE); + + per_frame * FillFrameConstants(State * pState); + void RenderSpotLights(VkCommandBuffer cb, per_frame * pPerFrame); + void RenderMotionVectors(VkCommandBuffer cb, per_frame * pPerFrame, State * pState); + void RenderSkydome(VkCommandBuffer cb, per_frame * pPerFrame, State * pState); + void RenderScene(VkCommandBuffer cb); + void RenderBoundingBoxes(VkCommandBuffer cb, per_frame * pPerFrame); + void RenderLightFrustums(VkCommandBuffer cb, per_frame * pPerFrame, State * pState); + void DownsampleDepthBuffer(VkCommandBuffer cb); + void RenderScreenSpaceReflections(VkCommandBuffer cb, State * pState); + void CopyHistorySurfaces(VkCommandBuffer cb); + void ApplyReflectionTarget(VkCommandBuffer cb, State * pState); + void DownsampleScene(VkCommandBuffer cb); + void RenderBloom(VkCommandBuffer cb); + void ApplyTonemapping(VkCommandBuffer cb, State * pState, SwapChain * pSwapChain); + void RenderHUD(VkCommandBuffer cb, SwapChain * pSwapChain); + void CopyToTexture(VkCommandBuffer cb, Texture * source, Texture * target); + +private: + Device * m_pDevice; + + uint32_t m_Width; + uint32_t m_Height; + + uint32_t m_CurrentFrame; + + VkViewport m_Viewport; + VkRect2D m_Scissor; + + // Initialize helper classes + ResourceViewHeaps m_ResourceViewHeaps; + UploadHeap m_UploadHeap; + DynamicBufferRing m_ConstantBufferRing; + StaticBufferPool m_VidMemBufferPool; + StaticBufferPool m_SysMemBufferPool; + CommandListRing m_CommandListRing; + GPUTimestamps m_GPUTimer; + + //gltf passes + GltfPbrPass * m_gltfPBR; + GltfBBoxPass * m_gltfBBox; + GltfDepthPass * m_gltfDepth; + GltfMotionVectorsPass * m_gltfMotionVectors; + GLTFTexturesAndBuffers * 
m_pGLTFTexturesAndBuffers; + + // effects + Bloom m_Bloom; + SkyDome m_SkyDome; + SkyDome m_AmbientLight; + DownSamplePS m_DownSample; + SkyDomeProc m_SkyDomeProc; + ToneMapping m_ToneMapping; + + // Samplers + VkSampler m_LinearSampler; + + // BRDF LUT + Texture m_BrdfLut; + VkImageView m_BrdfLutSRV; + + // GUI + ImGUI m_ImGUI; + + // Temporary render targets + + // depth buffer + Texture m_DepthBuffer; + VkImageView m_DepthBufferDSV; + + // Motion Vectors resources + Texture m_MotionVectors; + VkImageView m_MotionVectorsSRV; + + // Normal buffer + Texture m_NormalBuffer; + VkImageView m_NormalBufferSRV; + Texture m_NormalHistoryBuffer; + + // Specular roughness target + Texture m_SpecularRoughness; + VkImageView m_SpecularRoughnessSRV; + Texture m_SpecularRoughnessHistory; + + // shadowmaps + Texture m_ShadowMap; + VkImageView m_ShadowMapDSV; + VkImageView m_ShadowMapSRV; + + // Resolved RT + Texture m_HDR; + VkImageView m_HDRSRV; + + // widgets + Wireframe m_Wireframe; + WireframeBox m_WireframeBox; + + std::vector m_TimeStamps; + + // SSR Effect + FfxSssrContext m_SssrContext; + FfxSssrReflectionView m_SssrReflectionView; + bool m_SssrCreatedReflectionView = false; + VkImageView m_SssrSceneSRV; + VkImageView m_SssrDepthBufferHierarchySRV; + VkImageView m_SssrMotionBufferSRV; + VkImageView m_SssrNormalBufferSRV; + VkImageView m_SssrRoughnessBufferSRV; + VkImageView m_SssrNormalHistoryBufferSRV; + VkImageView m_SssrRoughnessHistoryBufferSRV; + VkImageView m_SssrOutputBufferUAV; + VkImageView m_SssrEnvironmentMapSRV; + VkSampler m_SssrEnvironmentMapSampler; + Texture m_SssrOutputBuffer; + + // Pass to apply reflection target + VkPipeline m_ApplyPipeline; + VkPipelineLayout m_ApplyPipelineLayout; + VkDescriptorSetLayout m_ApplyPipelineDescriptorSetLayout; + VkDescriptorSet m_ApplyPipelineDescriptorSet[backBufferCount]; + + VkImageView m_ApplyPipelineRTV; + + // Depth downsampling with single CS + VkPipeline m_DepthDownsamplePipeline; + VkPipelineLayout m_DepthDownsamplePipelineLayout; + VkDescriptorSetLayout m_DepthDownsampleDescriptorSetLayout; + VkDescriptorSet m_DepthDownsampleDescriptorSet; + + VkImageView m_DepthBufferSRV; + VkImageView m_DepthHierarchyDescriptors[13]; + Texture m_DepthHierarchy; + VkBuffer m_AtomicCounter; + VmaAllocation m_AtomicCounterAllocation; + VkBufferView m_AtomicCounterUAV; + UINT m_DepthMipLevelCount = 0; + + double m_MillisecondsBetweenGpuTicks; + + // Renderpasses + VkRenderPass m_RenderPassShadow; + VkRenderPass m_RenderPassClearHDR; + VkRenderPass m_RenderPassHDR; + VkRenderPass m_RenderPassMV; + VkRenderPass m_RenderPassPBR; + VkRenderPass m_RenderPassApply; + + // Framebuffers + VkFramebuffer m_FramebufferShadows; + VkFramebuffer m_FramebufferHDR; + VkFramebuffer m_FramebufferMV; + VkFramebuffer m_FramebufferPBR; + VkFramebuffer m_FramebufferApply; + + // For multithreaded texture loading + AsyncPool m_AsyncPool; +}; + diff --git a/sample/src/VK/Sources/SssrSample.cpp b/sample/src/VK/Sources/SssrSample.cpp new file mode 100644 index 0000000..9277d64 --- /dev/null +++ b/sample/src/VK/Sources/SssrSample.cpp @@ -0,0 +1,591 @@ +// AMD SampleVK sample code +// +// Copyright(c) 2018 Advanced Micro Devices, Inc.All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "stdafx.h" + +#include "SssrSample.h" +#include "base/ShaderCompilerCache.h" +#include "base/Instance.h" + +SssrSample::SssrSample(LPCSTR name) : FrameworkWindows(name) +{ + m_LastFrameTime = MillisecondsNow(); + m_Time = 0; + m_bPlay = true; + m_bShowUI = true; + + m_CameraControlSelected = 0; // select WASD on start up + + m_pGltfLoader = NULL; +} + +//-------------------------------------------------------------------------------------- +// +// OnCreate +// +//-------------------------------------------------------------------------------------- +void SssrSample::OnCreate(HWND hWnd) +{ + // get the list of scenes + for (const auto& scene : m_JsonConfigFile["scenes"]) + m_SceneNames.push_back(scene["name"]); + + DWORD dwAttrib = GetFileAttributes("..\\media\\"); + if ((dwAttrib == INVALID_FILE_ATTRIBUTES) || ((dwAttrib & FILE_ATTRIBUTE_DIRECTORY)) == 0) + { + MessageBox(NULL, "Media files not found!\n\nPlease check the readme on how to get the media files.", "Cauldron Panic!", MB_ICONERROR); + exit(0); + } + + // Create Device + // +#ifdef _DEBUG + bool cpuValidationLayerEnabled = true; + bool gpuValidationLayerEnabled = false; +#else + bool cpuValidationLayerEnabled = false; + bool gpuValidationLayerEnabled = false; +#endif + + // Create the device + InstanceProperties ip; + ip.Init(); + m_Device.SetEssentialInstanceExtensions(cpuValidationLayerEnabled, gpuValidationLayerEnabled, &ip); + + // Create instance + VkInstance vulkanInstance; + VkPhysicalDevice physicalDevice; + CreateInstance("SssrSample", "Cauldron", &vulkanInstance, &physicalDevice, &ip); + + DeviceProperties dp; + dp.Init(physicalDevice); + m_Device.SetEssentialDeviceExtensions(&dp); + dp.AddDeviceExtensionName(VK_KHR_PIPELINE_EXECUTABLE_PROPERTIES_EXTENSION_NAME); + + bool addedSubgroupSizeControl = dp.AddDeviceExtensionName(VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME); + + VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroupSizeControlFeatures = {}; + subgroupSizeControlFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT; + subgroupSizeControlFeatures.pNext = nullptr; + subgroupSizeControlFeatures.subgroupSizeControl = true; + subgroupSizeControlFeatures.computeFullSubgroups = false; + if (addedSubgroupSizeControl) + { + dp.SetNewNext(&subgroupSizeControlFeatures); + } + + // Create device + m_Device.OnCreateEx(vulkanInstance, physicalDevice, hWnd, &dp); + + m_Device.CreatePipelineCache(); + + // Init the 
shader compiler + InitDirectXCompiler(); + CreateShaderCache(); + + // Create Swapchain + // + uint32_t dwNumberOfBackBuffers = 2; + m_SwapChain.OnCreate(&m_Device, dwNumberOfBackBuffers, hWnd); + + // Create a instance of the renderer and initialize it, we need to do that for each GPU + // + m_Node = new SampleRenderer(); + m_Node->OnCreate(&m_Device, &m_SwapChain); + + // init GUI (non gfx stuff) + // + ImGUI_Init((void *)hWnd); + + // Init Camera, looking at the origin + // + m_Yaw = 0.0f; + m_Pitch = 0.0f; + m_Distance = 3.5f; + + // init GUI state + m_State.toneMapper = 2; + m_State.skyDomeType = 1; + m_State.exposure = 1.0f; + m_State.emmisiveFactor = 1.0f; + m_State.iblFactor = 1.0f; + m_State.bDrawBoundingBoxes = false; + m_State.bDrawLightFrustum = false; + m_State.bDrawBloom = false; + m_State.camera.LookAt(m_Yaw, m_Pitch, m_Distance, XMVectorSet(0, 0, 0, 0)); + m_State.lightIntensity = 10.f; + m_State.lightCamera.SetFov(XM_PI / 6.0f, 1024, 1024, 0.1f, 20.0f); + m_State.lightCamera.LookAt(XM_PI / 2.0f, 0.58f, 3.5f, XMVectorSet(0, 0, 0, 0)); + m_State.lightColor = XMFLOAT3(1, 1, 1); + m_State.targetFrametime = 0; + m_State.temporalStability = 0.99f; + m_State.maxTraversalIterations = 128; + m_State.mostDetailedDepthHierarchyMipLevel = 1; + m_State.depthBufferThickness = 0.015f; + m_State.minTraversalOccupancy = 4; + m_State.samplesPerQuad = 1; + m_State.bEnableVarianceGuidedTracing = true; + m_State.bShowIntersectionResults = false; + m_State.roughnessThreshold = 0.2f; + m_State.showReflectionTarget = false; + m_State.bDrawScreenSpaceReflections = true; +} + +//-------------------------------------------------------------------------------------- +// +// OnDestroy +// +//-------------------------------------------------------------------------------------- +void SssrSample::OnDestroy() +{ + ImGUI_Shutdown(); + + m_Device.GPUFlush(); + + // Fullscreen state should always be false before exiting the app. 
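+    // (dropping back to windowed mode here means the swapchain below is not torn down while the
+    // window still holds exclusive fullscreen, which can otherwise leave the display in a bad state)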
+ m_SwapChain.SetFullScreen(false); + + m_Node->UnloadScene(); + m_Node->OnDestroyWindowSizeDependentResources(); + m_Node->OnDestroy(); + + delete m_Node; + + m_SwapChain.OnDestroyWindowSizeDependentResources(); + m_SwapChain.OnDestroy(); + + //shut down the shader compiler + DestroyShaderCache(&m_Device); + + m_Device.DestroyPipelineCache(); + + if (m_pGltfLoader) + { + delete m_pGltfLoader; + m_pGltfLoader = NULL; + } + + m_Device.OnDestroy(); +} + +//-------------------------------------------------------------------------------------- +// +// OnEvent, forward Win32 events to ImGUI +// +//-------------------------------------------------------------------------------------- +bool SssrSample::OnEvent(MSG msg) +{ + if (ImGUI_WndProcHandler(msg.hwnd, msg.message, msg.wParam, msg.lParam)) + return true; + + return true; +} + +//-------------------------------------------------------------------------------------- +// +// SetFullScreen +// +//-------------------------------------------------------------------------------------- +void SssrSample::SetFullScreen(bool fullscreen) +{ + m_Device.GPUFlush(); + + m_SwapChain.SetFullScreen(fullscreen); +} + +void SssrSample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight, bool* pbFullScreen) +{ + // First load configuration + std::ifstream f("config.json"); + if (!f) + { + MessageBox(NULL, "Config file not found!\n", "Cauldron Panic!", MB_ICONERROR); + exit(-1); + } + f >> m_JsonConfigFile; + + // Parse command line and override the config file + try + { + if (strlen(lpCmdLine) > 0) + { + auto j3 = json::parse(lpCmdLine); + m_JsonConfigFile.merge_patch(j3); + } + } + catch (json::parse_error) + { + Trace("Error parsing commandline\n"); + exit(0); + } + + // Set values + *pWidth = m_JsonConfigFile.value("width", 1920); + *pHeight = m_JsonConfigFile.value("height", 1080); + *pbFullScreen = m_JsonConfigFile.value("fullScreen", false); + m_State.isBenchmarking = m_JsonConfigFile.value("benchmark", false); +} + +void SssrSample::BuildUI() +{ + ImGuiStyle& style = ImGui::GetStyle(); + style.FrameBorderSize = 1.0f; + + bool opened = true; + ImGui::Begin("Stats", &opened); + + if (ImGui::CollapsingHeader("Info", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Resolution : %ix%i", m_Width, m_Height); + } + + if (ImGui::CollapsingHeader("Animation")) + { + ImGui::Checkbox("Play", &m_bPlay); + ImGui::SliderFloat("Time", &m_Time, 0, 30); + } + + if (ImGui::CollapsingHeader("Model Selection", ImGuiTreeNodeFlags_DefaultOpen)) + { + static int selectedScene = 0; + auto getterLambda = [](void* data, int idx, const char** out_str)->bool { *out_str = ((std::vector *)data)->at(idx).c_str(); return true; }; + if (ImGui::Combo("model", &selectedScene, getterLambda, &m_SceneNames, (int)m_SceneNames.size()) || (m_pGltfLoader == NULL)) + { + LoadScene(selectedScene); + + // bail out as we need to reload everything + ImGui::End(); + ImGui::EndFrame(); + return; + } + + char *cameraControl[] = { "WASD", "Orbit", "cam #0", "cam #1", "cam #2", "cam #3" , "cam #4", "cam #5" }; + if (m_CameraControlSelected >= m_pGltfLoader->m_cameras.size() + 2) + m_CameraControlSelected = 0; + ImGui::Combo("Camera", &m_CameraControlSelected, cameraControl, (int)(m_pGltfLoader->m_cameras.size() + 2)); + + ImGui::Checkbox("Show Bounding Boxes", &m_State.bDrawBoundingBoxes); + } + + if (ImGui::CollapsingHeader("Lighting")) + { + const char * tonemappers[] = { "Timothy", "DX11DSK", "Reinhard", "Uncharted2Tonemap", "ACES", "No tonemapper" }; + ImGui::Combo("Tonemapper", 
&m_State.toneMapper, tonemappers, _countof(tonemappers));
+
+        const char * skyDomeType[] = { "Procedural Sky", "cubemap", "Simple clear" };
+        ImGui::Combo("SkyDome", &m_State.skyDomeType, skyDomeType, _countof(skyDomeType));
+
+        ImGui::SliderFloat("IBL Factor", &m_State.iblFactor, 0.0f, 10.0f, NULL, 1.0f);
+        ImGui::SliderFloat("Emissive", &m_State.emmisiveFactor, 1.0f, 1000.0f, NULL, 1.0f);
+        ImGui::SliderFloat("Exposure", &m_State.exposure, 0.0f, 4.0f);
+        ImGui::Checkbox("Show Light Frustums", &m_State.bDrawLightFrustum);
+        ImGui::Checkbox("Draw Bloom", &m_State.bDrawBloom);
+    }
+
+    if (ImGui::CollapsingHeader("Reflections", ImGuiTreeNodeFlags_DefaultOpen))
+    {
+        ImGui::Checkbox("Draw Screen Space Reflections", &m_State.bDrawScreenSpaceReflections);
+        ImGui::Checkbox("Show Reflection Target", &m_State.showReflectionTarget);
+        ImGui::Checkbox("Show Intersection Results", &m_State.bShowIntersectionResults);
+        ImGui::SliderFloat("Target Frametime in ms", &m_State.targetFrametime, 0.0f, 50.0f);
+        ImGui::SliderInt("Max Traversal Iterations", &m_State.maxTraversalIterations, 0, 256);
+        ImGui::SliderInt("Min Traversal Occupancy", &m_State.minTraversalOccupancy, 0, 32);
+        ImGui::SliderInt("Most Detailed Level", &m_State.mostDetailedDepthHierarchyMipLevel, 0, 5);
+        ImGui::SliderFloat("Depth Buffer Thickness", &m_State.depthBufferThickness, 0.0f, 0.03f);
+        ImGui::SliderFloat("Roughness Threshold", &m_State.roughnessThreshold, 0.0f, 1.f);
+        ImGui::SliderFloat("Temporal Stability", &m_State.temporalStability, 0.0f, 1.0f);
+        ImGui::Checkbox("Enable Variance Guided Tracing", &m_State.bEnableVarianceGuidedTracing);
+
+        ImGui::Text("Samples Per Quad"); ImGui::SameLine();
+        ImGui::RadioButton("1", &m_State.samplesPerQuad, 1); ImGui::SameLine();
+        ImGui::RadioButton("2", &m_State.samplesPerQuad, 2); ImGui::SameLine();
+        ImGui::RadioButton("4", &m_State.samplesPerQuad, 4);
+
+        ImGui::Value("Tile Classification Elapsed Time", 1000 * m_State.tileClassificationTime, "%.1f us");
+        ImGui::Value("Intersection Elapsed Time", 1000 * m_State.intersectionTime, "%.1f us");
+        ImGui::Value("Denoising Elapsed Time", 1000 * m_State.denoisingTime, "%.1f us");
+    }
+
+    if (ImGui::CollapsingHeader("Profiler"))
+    {
+        const std::vector<TimeStamp>& timeStamps = m_Node->GetTimingValues();
+        if (timeStamps.size() > 0)
+        {
+            for (uint32_t i = 0; i < timeStamps.size(); i++)
+            {
+                ImGui::Text("%-22s: %7.1f", timeStamps[i].m_label.c_str(), timeStamps[i].m_microseconds);
+            }
+
+            //scrolling data and average computing
+            static float values[128];
+            values[127] = timeStamps.back().m_microseconds;
+            for (uint32_t i = 0; i < 128 - 1; i++) { values[i] = values[i + 1]; }
+            ImGui::PlotLines("", values, 128, 0, "GPU frame time (us)", 0.0f, 30000.0f, ImVec2(0, 80));
+        }
+    }
+
+    ImGui::Text("'X' to show/hide GUI");
+    ImGui::End();
+}
+
+void SssrSample::HandleInput()
+{
+    // If the mouse was not used by the GUI then it's for the camera
+    //
+    ImGuiIO& io = ImGui::GetIO();
+
+    static std::chrono::system_clock::time_point last = std::chrono::system_clock::now();
+    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    std::chrono::duration<double> diff = now - last;
+    last = now;
+
+    io.DeltaTime = static_cast<float>(diff.count());
+
+    if (ImGui::IsKeyPressed('X'))
+    {
+        m_bShowUI = !m_bShowUI;
+        ShowCursor(m_bShowUI);
+    }
+
+    if (io.WantCaptureMouse == false || !m_bShowUI)
+    {
+        if ((io.KeyCtrl == false) && (io.MouseDown[0] == true))
+        {
+            m_Yaw -= io.MouseDelta.x / 100.f;
+            m_Pitch += io.MouseDelta.y / 100.f;
+        }
+
+        // Choose camera movement
depending on setting + // + if (m_CameraControlSelected == 0) + { + // WASD + // + m_State.camera.UpdateCameraWASD(m_Yaw, m_Pitch, io.KeysDown, io.DeltaTime); + } + else if (m_CameraControlSelected == 1) + { + // Orbiting + // + m_Distance -= (float)io.MouseWheel / 3.0f; + m_Distance = std::max(m_Distance, 0.1f); + + bool panning = (io.KeyCtrl == true) && (io.MouseDown[0] == true); + + m_State.camera.UpdateCameraPolar(m_Yaw, m_Pitch, panning ? -io.MouseDelta.x / 100.0f : 0.0f, panning ? io.MouseDelta.y / 100.0f : 0.0f, m_Distance); + } + else + { + // Use a camera from the GLTF + // + m_pGltfLoader->GetCamera(m_CameraControlSelected - 2, &m_State.camera); + m_Yaw = m_State.camera.GetYaw(); + m_Pitch = m_State.camera.GetPitch(); + } + } +} + +void SssrSample::LoadScene(int sceneIndex) +{ + json scene = m_JsonConfigFile["scenes"][sceneIndex]; + if (m_pGltfLoader != NULL) + { + //free resources, unload the current scene, and load new scene... + m_Device.GPUFlush(); + + m_Node->UnloadScene(); + m_Node->OnDestroyWindowSizeDependentResources(); + m_Node->OnDestroy(); + m_pGltfLoader->Unload(); + m_Node->OnCreate(&m_Device, &m_SwapChain); + m_Node->OnCreateWindowSizeDependentResources(&m_SwapChain, m_Width, m_Height); + } + + delete(m_pGltfLoader); + m_pGltfLoader = new GLTFCommon(); + + if (m_pGltfLoader->Load(scene["directory"], scene["filename"]) == false) + { + MessageBox(NULL, "The selected model couldn't be found, please check the documentation", "Cauldron Panic!", MB_ICONERROR); + exit(0); + } + + // Load the UI settings, and also some defaults cameras and lights, in case the GLTF has none + { +#define LOAD(j, key, val) val = j.value(key, val) + + // global settings + LOAD(scene, "toneMapper", m_State.toneMapper); + LOAD(scene, "skyDomeType", m_State.skyDomeType); + LOAD(scene, "exposure", m_State.exposure); + LOAD(scene, "iblFactor", m_State.iblFactor); + LOAD(scene, "emmisiveFactor", m_State.emmisiveFactor); + LOAD(scene, "skyDomeType", m_State.skyDomeType); + + // default light + m_State.lightIntensity = scene.value("intensity", 1.0f); + + // default camera (in case the gltf has none) + json camera = scene["camera"]; + LOAD(camera, "yaw", m_Yaw); + LOAD(camera, "pitch", m_Pitch); + LOAD(camera, "distance", m_Distance); + XMVECTOR lookAt = GetVector(GetElementJsonArray(camera, "lookAt", { 0.0, 0.0, 0.0 })); + m_State.camera.LookAt(m_Yaw, m_Pitch, m_Distance, lookAt); + + // set benchmarking state if enabled + if (m_State.isBenchmarking) + { + BenchmarkConfig(scene["BenchmarkSettings"], -1, m_pGltfLoader); + } + + // indicate the mainloop we started loading a GLTF and it needs to load the rest (textures and geometry) + m_bLoadingScene = true; + } +} + +//-------------------------------------------------------------------------------------- +// +// OnResize +// +//-------------------------------------------------------------------------------------- +void SssrSample::OnResize(uint32_t width, uint32_t height) +{ + if (m_Width != width || m_Height != height) + { + // Flush GPU + // + m_Device.GPUFlush(); + + // If resizing but no minimizing + // + if (m_Width > 0 && m_Height > 0) + { + if (m_Node != NULL) + { + m_Node->OnDestroyWindowSizeDependentResources(); + } + m_SwapChain.OnDestroyWindowSizeDependentResources(); + } + + m_Width = width; + m_Height = height; + + // if resizing but not minimizing the recreate it with the new size + // + if (m_Width > 0 && m_Height > 0) + { + m_SwapChain.OnCreateWindowSizeDependentResources(m_Width, m_Height, false, DISPLAYMODE_SDR); + if (m_Node != NULL) 
+            {
+                m_Node->OnCreateWindowSizeDependentResources(&m_SwapChain, m_Width, m_Height);
+            }
+        }
+    }
+    m_State.camera.SetFov(XM_PI / 4, m_Width, m_Height, 0.1f, 1000.0f);
+}
+
+//--------------------------------------------------------------------------------------
+//
+// OnRender: updates the state from the UI, animates, transforms and renders the scene
+//
+//--------------------------------------------------------------------------------------
+void SssrSample::OnRender()
+{
+    // Get timings
+    //
+    double timeNow = MillisecondsNow();
+    m_DeltaTime = timeNow - m_LastFrameTime;
+    m_LastFrameTime = timeNow;
+
+    // Build the UI and set the scene state. Note that the rendering of the UI happens later.
+    //
+    ImGUI_UpdateIO();
+    ImGui::NewFrame();
+
+    if (m_bLoadingScene)
+    {
+        static int loadingStage = 0;
+        // LoadScene needs to be called a number of times; the scene is not fully loaded until it returns 0.
+        // This is done so we can display a progress bar while the scene is loading.
+        loadingStage = m_Node->LoadScene(m_pGltfLoader, loadingStage);
+        if (loadingStage == 0)
+        {
+            m_Time = 0;
+            m_bLoadingScene = false;
+        }
+    }
+    else if (m_pGltfLoader && m_State.isBenchmarking)
+    {
+        const std::vector<TimeStamp>& timeStamps = m_Node->GetTimingValues();
+        const std::string * screenshotName;
+        m_Time = BenchmarkLoop(timeStamps, &m_State.camera, &screenshotName);
+    }
+    else
+    {
+        if (m_bShowUI)
+        {
+            BuildUI();
+        }
+
+        if (!m_bLoadingScene)
+        {
+            HandleInput();
+        }
+    }
+
+    // Set animation time
+    //
+    if (m_bPlay)
+    {
+        m_Time += (float)m_DeltaTime / 1000.0f;
+    }
+
+    // Animate and transform the scene
+    //
+    if (m_pGltfLoader)
+    {
+        m_pGltfLoader->SetAnimationTime(0, m_Time);
+        m_pGltfLoader->TransformScene(0, XMMatrixIdentity());
+    }
+
+    m_State.time = m_Time;
+
+    // Do Render frame using AFR
+    //
+    m_Node->OnRender(&m_State, &m_SwapChain);
+
+    m_SwapChain.Present();
+}
+
+//--------------------------------------------------------------------------------------
+//
+// WinMain
+//
+//--------------------------------------------------------------------------------------
+int WINAPI WinMain(HINSTANCE hInstance,
+    HINSTANCE hPrevInstance,
+    LPSTR lpCmdLine,
+    int nCmdShow)
+{
+    LPCSTR Name = "Stochastic Screen Space Reflection Sample VK v1.0";
+
+    // create a new sample
+    return RunFramework(hInstance, lpCmdLine, nCmdShow, new SssrSample(Name));
+}
\ No newline at end of file
diff --git a/sample/src/VK/Sources/SssrSample.h b/sample/src/VK/Sources/SssrSample.h
new file mode 100644
index 0000000..913964c
--- /dev/null
+++ b/sample/src/VK/Sources/SssrSample.h
@@ -0,0 +1,81 @@
+// AMD SampleVK sample code
+//
+// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+#pragma once
+
+#include "SampleRenderer.h"
+
+//
+// This is the main class; it manages the state of the sample and does all the high-level work without touching the GPU directly.
+// This class uses the GPU via the SampleRenderer class. We would have one SampleRenderer instance per GPU.
+//
+// This class takes care of:
+//
+//    - loading a scene (just the CPU data)
+//    - updating the camera
+//    - keeping track of time
+//    - handling the keyboard
+//    - updating the animation
+//    - building the UI (but it does not render it)
+//    - using the SampleRenderer to update all the state on the GPU and do the rendering
+//
+
+class SssrSample : public FrameworkWindows
+{
+public:
+    SssrSample(LPCSTR name);
+    void OnCreate(HWND hWnd) override;
+    void OnDestroy() override;
+    void OnRender() override;
+    bool OnEvent(MSG msg) override;
+    void OnResize(uint32_t Width, uint32_t Height) override;
+    void OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight, bool* pbFullScreen) override;
+
+    void SetFullScreen(bool fullscreen);
+
+private:
+    void BuildUI();
+    void HandleInput();
+    void LoadScene(int sceneIndex);
+
+    Device m_Device;
+    SwapChain m_SwapChain;
+
+    GLTFCommon *m_pGltfLoader = NULL;
+    bool m_bLoadingScene = false;
+
+    SampleRenderer *m_Node = NULL;
+    SampleRenderer::State m_State;
+
+    float m_Distance;
+    float m_Yaw;
+    float m_Pitch;
+
+    float m_Time;        // Wall clock time in seconds.
+    double m_DeltaTime;  // The elapsed time in milliseconds since the previous frame.
+    double m_LastFrameTime;
+
+    // json config file
+    json m_JsonConfigFile;
+    std::vector<std::string> m_SceneNames;
+
+    bool m_bPlay;
+    bool m_bShowUI;
+
+    int m_CameraControlSelected;
+};
\ No newline at end of file
diff --git a/sample/src/VK/Sources/stdafx.cpp b/sample/src/VK/Sources/stdafx.cpp
new file mode 100644
index 0000000..583f95d
--- /dev/null
+++ b/sample/src/VK/Sources/stdafx.cpp
@@ -0,0 +1,10 @@
+// stdafx.cpp : source file that includes just the standard includes
+// ObjRendererD3D12.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
+
+
diff --git a/sample/src/VK/Sources/stdafx.h b/sample/src/VK/Sources/stdafx.h
new file mode 100644
index 0000000..266b6b7
--- /dev/null
+++ b/sample/src/VK/Sources/stdafx.h
@@ -0,0 +1,66 @@
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+//
+#pragma once
+
+#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
+// Windows Header Files:
+#include <windows.h>
+#include <windowsx.h>
+
+// C RunTime Header Files
+#include <malloc.h>
+#include <map>
+#include <vector>
+#include <mutex>
+#include <fstream>
+
+#include "vulkan/vulkan.h"
+
+// we are using DirectXMath
+#include <DirectXMath.h>
+using namespace DirectX;
+
+// TODO: reference additional headers your program requires here
+#include "Base/Imgui.h"
+#include "Base/ImguiHelper.h"
+#include "Base/Device.h"
+#include "Base/Helper.h"
+#include "Base/Texture.h"
+#include "Base/SwapChain.h"
+#include "Base/UploadHeap.h"
+#include "Base/GPUTimeStamps.h"
+#include "Base/ExtDebugMarkers.h"
+#include "Base/CommandListRing.h"
+#include "Base/StaticBufferPool.h"
+#include "Base/DynamicBufferRing.h"
+#include "Base/ResourceViewHeaps.h"
+#include "Base/ShaderCompilerHelper.h"
+
+#include "Misc/Misc.h"
+#include "Misc/Camera.h"
+#include "Misc/FrameworkWindows.h"
+
+#include "PostProc/Bloom.h"
+#include "PostProc/BlurPS.h"
+#include "PostProc/SkyDome.h"
+#include "PostProc/ToneMapping.h"
+#include "PostProc/SkyDomeProc.h"
+#include "PostProc/DownSamplePS.h"
+#include "PostProc/PostProcCS.h"
+
+#include "GLTF/GltfPbrPass.h"
+#include "GLTF/GltfBBoxPass.h"
+#include "GLTF/GltfDepthPass.h"
+#include "GLTF/GltfMotionVectorsPass.h"
+
+#include "Widgets/Axis.h"
+#include "Widgets/CheckerBoardFloor.h"
+#include "Widgets/WireframeBox.h"
+#include "Widgets/WireframeSphere.h"
+
+using namespace CAULDRON_VK;
+
+#include "ffx_sssr.h"
+#include "ffx_sssr_vk.h"
diff --git a/sample/src/VK/dpiawarescaling.manifest b/sample/src/VK/dpiawarescaling.manifest
new file mode 100644
index 0000000..8dd8cd9
--- /dev/null
+++ b/sample/src/VK/dpiawarescaling.manifest
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0" xmlns:asmv3="urn:schemas-microsoft-com:asm.v3">
+  <asmv3:application>
+    <asmv3:windowsSettings xmlns="http://schemas.microsoft.com/SMI/2005/WindowsSettings">
+      <dpiAware>true/PM</dpiAware>
+    </asmv3:windowsSettings>
+  </asmv3:application>
+</assembly>
\ No newline at end of file
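
Note on the LOAD idiom used in SssrSample::LoadScene above: the macro wraps json::value(key, fallback), which returns the fallback when the key is missing, so every setting keeps its current value as its default. The standalone sketch below is illustrative only and not part of this patch; it assumes the json type is nlohmann::json (as Cauldron uses), and the scene entry and default values are invented for the example.

    #include <cstdio>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    // Same idiom as SssrSample::LoadScene: overwrite 'val' only if 'key' is present.
    #define LOAD(j, key, val) val = j.value(key, val)

    int main()
    {
        // Hypothetical scene entry, shaped like the "scenes" array the sample reads.
        json scene = json::parse(R"({ "exposure": 2.5, "camera": { "yaw": 1.0 } })");

        float exposure  = 1.0f;  // overwritten, because "exposure" is present
        float iblFactor = 0.5f;  // kept, because "iblFactor" is absent
        LOAD(scene, "exposure", exposure);
        LOAD(scene, "iblFactor", iblFactor);

        float yaw = 0.0f;
        LOAD(scene["camera"], "yaw", yaw);

        std::printf("exposure=%.2f iblFactor=%.2f yaw=%.2f\n", exposure, iblFactor, yaw);
        return 0;
    }

The same fallback behaviour is what lets a scene entry omit most keys: anything not specified simply inherits the state the sample was already in.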