Skip to content

Commit

Permalink
vkd3d: Implement an adaptive system for internal latency handle.
Browse files Browse the repository at this point in the history
Optimize for a smoother frame pace rather than strict adherence to the
present wait interval. The scenario where this can greatly help
is on FRR displays where the GPU is rendering slower than the refresh rate.
In this situation, we can observe that DXGI Present blocks spuriously
for a full frame interval, which can cause starvation issues, since
the CPU might become too late to submit more work to the GPU in time,
which is especially devastating on Deck due to its power management
scheme.

Signed-off-by: Hans-Kristian Arntzen <[email protected]>
  • Loading branch information
HansKristian-Work committed Apr 8, 2024
1 parent dbb4bc5 commit 01dc996
Showing 1 changed file with 119 additions and 6 deletions.
125 changes: 119 additions & 6 deletions libs/vkd3d/swapchain.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ struct present_wait_entry
{
uint64_t id;
uint64_t begin_frame_time_ns;
uint64_t complete_timeline;
};

struct dxgi_vk_swap_chain
Expand Down Expand Up @@ -212,6 +213,9 @@ struct dxgi_vk_swap_chain
pthread_mutex_t lock;
bool active;
bool skip_waits;

uint64_t estimated_delay_from_gpu_complete_to_flip_ns;
uint32_t present_wait_fifo_bound_estimate;
} wait_thread;
};

Expand Down Expand Up @@ -384,7 +388,8 @@ static void dxgi_vk_swap_chain_drain_user_images(struct dxgi_vk_swap_chain *chai
dxgi_vk_swap_chain_drain_complete_semaphore(chain, chain->user.blit_count);
}

static void dxgi_vk_swap_chain_push_present_id(struct dxgi_vk_swap_chain *chain, uint64_t present_id, uint64_t begin_frame_time_ns)
static void dxgi_vk_swap_chain_push_present_id(struct dxgi_vk_swap_chain *chain,
uint64_t present_id, uint64_t begin_frame_time_ns, uint64_t complete_timeline)
{
struct present_wait_entry *entry;
pthread_mutex_lock(&chain->wait_thread.lock);
Expand All @@ -393,6 +398,7 @@ static void dxgi_vk_swap_chain_push_present_id(struct dxgi_vk_swap_chain *chain,
entry = &chain->wait_thread.wait_queue[chain->wait_thread.wait_queue_count++];
entry->id = present_id;
entry->begin_frame_time_ns = begin_frame_time_ns;
entry->complete_timeline = complete_timeline;
pthread_cond_signal(&chain->wait_thread.cond);
pthread_mutex_unlock(&chain->wait_thread.lock);
}
Expand All @@ -404,7 +410,7 @@ static void dxgi_vk_swap_chain_cleanup(struct dxgi_vk_swap_chain *chain)

if (chain->wait_thread.active)
{
dxgi_vk_swap_chain_push_present_id(chain, 0, 0);
dxgi_vk_swap_chain_push_present_id(chain, 0, 0, 0);
pthread_join(chain->wait_thread.thread, NULL);
pthread_mutex_destroy(&chain->wait_thread.lock);
pthread_cond_destroy(&chain->wait_thread.cond);
Expand Down Expand Up @@ -2434,7 +2440,9 @@ static void dxgi_vk_swap_chain_signal_waitable_handle(struct dxgi_vk_swap_chain

if (chain->present.present_id_valid)
{
dxgi_vk_swap_chain_push_present_id(chain, chain->present.present_id, chain->request.begin_frame_time_ns);
dxgi_vk_swap_chain_push_present_id(chain, chain->present.present_id,
chain->request.begin_frame_time_ns,
chain->present.complete_count);
}
else
{
Expand Down Expand Up @@ -2543,15 +2551,96 @@ static void dxgi_vk_swap_chain_present_callback(void *chain_)
#endif
}

#define PRESENT_WAIT_DELAY_ESTIMATE_SHIFT 4
#define PRESENT_WAIT_FIFO_BOUND_ESTIMATE_SHIFT 3
#define PRESENT_WAIT_FIFO_BOUND_FIFO_BOUND_INCREMENT (1024 >> PRESENT_WAIT_FIFO_BOUND_ESTIMATE_SHIFT)
#define PRESENT_WAIT_FIFO_BOUND_THRESHOLD 512

/* Releases the internal frame-latency handle at the "right" time to balance latency
 * against frame-pacing smoothness. Waits for the blit GPU work to complete, then
 * waits for the present wait with a bounded timeout (the running estimate of
 * GPU-complete -> flip delay), unless the adaptive filter has decided we are
 * FIFO-bound, in which case the present wait runs with an infinite timeout.
 * On return, *gpu_complete_ns holds the CPU timestamp observed once GPU work was
 * done (only written on the !skip_waits path; callers are expected to pre-initialize it). */
static void dxgi_vk_swap_chain_wait_internal_latency(struct dxgi_vk_swap_chain *chain,
uint64_t present_id, uint64_t complete_timeline, uint64_t *gpu_complete_ns)
{
const struct vkd3d_vk_device_procs *vk_procs = &chain->queue->device->vk_procs;
uint64_t present_wait_timeout;
bool is_fifo_bound;

if (!chain->wait_thread.skip_waits)
{
/* If GPU is already done rendering as soon as we're done waiting for previous frame,
 * it means we're spending at least a full refresh cycle
 * just waiting for FIFO queue to catch up. At this point, we have a strong signal
 * that we're not GPU bound, but rather FIFO bound.
 * If we observe this signal often enough, we'll just wait for the actual present wait to complete.
 * That should be smoothly paced. */
is_fifo_bound = dxgi_vk_swap_chain_query_complete_semaphore(chain) >= complete_timeline;
if (!is_fifo_bound)
dxgi_vk_swap_chain_drain_complete_semaphore(chain, complete_timeline);

/* If GPU was already done rendering by the time we got here, this won't be an accurate estimate
 * unless we add yet another thread that waits on GPU completion events.
 * However, if we hit this condition often enough, we'll hit the FIFO-bound path anyway, so
 * this is fine. */
*gpu_complete_ns = vkd3d_get_current_time_ns();

/* Basic exponential average. The filter will cap out at 1024. */
chain->wait_thread.present_wait_fifo_bound_estimate -=
chain->wait_thread.present_wait_fifo_bound_estimate >> PRESENT_WAIT_FIFO_BOUND_ESTIMATE_SHIFT;
if (is_fifo_bound)
chain->wait_thread.present_wait_fifo_bound_estimate += PRESENT_WAIT_FIFO_BOUND_FIFO_BOUND_INCREMENT;

if (chain->debug_latency)
{
INFO("Updating FIFO bound estimate to %u (threshold %u).\n",
chain->wait_thread.present_wait_fifo_bound_estimate, PRESENT_WAIT_FIFO_BOUND_THRESHOLD);
}

/* Re-evaluate against the filtered estimate rather than this frame's raw observation,
 * so a single noisy frame cannot flip the wait policy. */
is_fifo_bound = chain->wait_thread.present_wait_fifo_bound_estimate >= PRESENT_WAIT_FIFO_BOUND_THRESHOLD;

/* FIFO-bound: trust present wait fully for smooth pacing.
 * GPU-bound: bound the wait by the estimated queue delay so a missed vblank
 * does not stall the latency handle for a whole refresh cycle. */
present_wait_timeout = is_fifo_bound ?
UINT64_MAX : chain->wait_thread.estimated_delay_from_gpu_complete_to_flip_ns;

/* We don't care if we timed out or not. */
VK_CALL(vkWaitForPresentKHR(chain->queue->device->vk_device, chain->present.vk_swapchain,
present_id, present_wait_timeout));
}

/* Unblock one frame's worth of the internal latency handle; in skip_waits mode
 * this is the only thing the function does. */
vkd3d_native_sync_handle_release(chain->frame_latency_event_internal, 1);
}

/* Feeds one observation into the exponential moving average of the delay between
 * GPU render completion and the flip (present wait signal).
 * A gpu_complete_ns of 0 means no timestamp was captured this frame; the estimate
 * still decays so stale delay data ages out. */
static void dxgi_vk_swap_chain_update_internal_latency(struct dxgi_vk_swap_chain *chain, uint64_t gpu_complete_ns)
{
    uint64_t flip_time_ns, observed_delay_ns;
    uint64_t estimate;

    /* Standard EMA step: decay the accumulator, then blend in the new sample. */
    estimate = chain->wait_thread.estimated_delay_from_gpu_complete_to_flip_ns;
    estimate -= estimate >> PRESENT_WAIT_DELAY_ESTIMATE_SHIFT;

    if (gpu_complete_ns)
    {
        flip_time_ns = vkd3d_get_current_time_ns();
        /* Clamp so the subtraction below can never underflow if the clock
         * reads earlier than the captured GPU timestamp. */
        if (flip_time_ns < gpu_complete_ns)
            flip_time_ns = gpu_complete_ns;
        observed_delay_ns = flip_time_ns - gpu_complete_ns;
        estimate += observed_delay_ns >> PRESENT_WAIT_DELAY_ESTIMATE_SHIFT;
    }

    chain->wait_thread.estimated_delay_from_gpu_complete_to_flip_ns = estimate;

    if (gpu_complete_ns && chain->debug_latency)
    {
        INFO("Updated GPU complete to FLIP delay to %"PRIu64" us.\n",
                chain->wait_thread.estimated_delay_from_gpu_complete_to_flip_ns / 1000);
    }
}

static void *dxgi_vk_swap_chain_wait_worker(void *chain_)
{
struct dxgi_vk_swap_chain *chain = chain_;

struct vkd3d_queue_timeline_trace *timeline_trace = &chain->queue->device->queue_timeline_trace;
const struct vkd3d_vk_device_procs *vk_procs = &chain->queue->device->vk_procs;
struct vkd3d_queue_timeline_trace_cookie cookie;
uint64_t next_complete_timeline = 0;
uint64_t begin_frame_time_ns = 0;
uint64_t end_frame_time_ns = 0;
uint64_t gpu_complete_ns = 0;
uint64_t next_wait_id = 0;
int previous_semaphore;

Expand All @@ -2564,19 +2653,46 @@ static void *dxgi_vk_swap_chain_wait_worker(void *chain_)
pthread_cond_wait(&chain->wait_thread.cond, &chain->wait_thread.lock);
next_wait_id = chain->wait_thread.wait_queue[0].id;
begin_frame_time_ns = chain->wait_thread.wait_queue[0].begin_frame_time_ns;
next_complete_timeline = chain->wait_thread.wait_queue[0].complete_timeline;
pthread_mutex_unlock(&chain->wait_thread.lock);

/* Sentinel for swapchain teardown. */
if (!next_wait_id)
break;

cookie = vkd3d_queue_timeline_trace_register_present_wait(timeline_trace, next_wait_id);

/* For the internal latency handle, we should be more careful than just waiting for present naively.
* In a GPU bound scenario on FRR displays where FPS < refresh rate, WaitForPresentKHR will be very noisy
* as vblanks are missed in unpredictable ways.
* This stutter will propagate to the application in ::Present(), and blocking the CPU timeline
* for a longer time in application threads can lead to an effect where GPU goes idle, despite being GPU bound.
* To combat this, we reformulate the internal latency fence to be released between these two timepoints:
* - blit timeline completes
* - WaitForPresentKHR signals
* The delta between these events is unwanted presentation delay.
* For VRR at a rate below refresh rate, the expectation is that this presentation delay is ~0 ms,
* so this algorithm won't do much, but on FRR, we want to unblock the internal latency fence
* while optimizing for smooth frame pacing. To achieve this we compute a timeout estimate for PresentWait.
* To unblock the latency handle, we will wait for GPU to complete render, then wait up to N ms in a present wait.
* If present wait gets delayed for longer than expected, just unblock the latency fence to avoid stutter.
* In cases where we can observe that we are fully FIFO bound, rather than GPU bound,
* we will use the present wait as-is. */
if (vkd3d_native_sync_handle_is_valid(chain->frame_latency_event_internal))
{
dxgi_vk_swap_chain_wait_internal_latency(chain,
next_wait_id, next_complete_timeline, &gpu_complete_ns);
}

/* In skip wait mode we just need to make sure that we signal latency fences properly. */
if (!chain->wait_thread.skip_waits)
{
/* We don't really care if we observed OUT_OF_DATE or something here. */
VK_CALL(vkWaitForPresentKHR(chain->queue->device->vk_device, chain->present.vk_swapchain,
next_wait_id, UINT64_MAX));

if (vkd3d_native_sync_handle_is_valid(chain->frame_latency_event_internal))
dxgi_vk_swap_chain_update_internal_latency(chain, gpu_complete_ns);
}
vkd3d_queue_timeline_trace_complete_present_wait(timeline_trace, cookie);

Expand All @@ -2598,9 +2714,6 @@ static void *dxgi_vk_swap_chain_wait_worker(void *chain_)
WARN("Failed to increment swapchain semaphore. Did application forget to acquire?\n");
}

if (vkd3d_native_sync_handle_is_valid(chain->frame_latency_event_internal))
vkd3d_native_sync_handle_release(chain->frame_latency_event_internal, 1);

if (begin_frame_time_ns)
INFO("vkWaitForPresentKHR frame latency: %.3f ms.\n", 1e-6 * (end_frame_time_ns - begin_frame_time_ns));

Expand Down

0 comments on commit 01dc996

Please sign in to comment.