defrag: allow defrag to start during AOF loading

Addresses #1393. During AOF loading or a long-running script, this allows defrag to be initiated. Signed-off-by: Jim Brunner <[email protected]>
valkey-io · Dec 10, 2024 · 4688575 · 4688575
1 parent 7e56488
commit 4688575
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 5 deletions.
diff --git a/src/defrag.c b/src/defrag.c
@@ -84,7 +84,7 @@ struct DefragContext {
 
     long long timeproc_id;      // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID)
     monotime timeproc_end_time; // Ending time of previous timerproc execution
-    long timeproc_overage_us;   // A correction value if over/under target CPU percent
+    long timeproc_overage_us;   // A correction value if over target CPU percent
 };
 static struct DefragContext defrag;
 
@@ -1189,7 +1189,7 @@ static int computeDefragCycleUs(void) {
          *  the starvation of the timer. */
         dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent);
 
-        // Also adjust for any accumulated overage(underage).
+        // Also adjust for any accumulated overage.
         dutyCycleUs -= defrag.timeproc_overage_us;
         defrag.timeproc_overage_us = 0;
 
@@ -1208,8 +1208,11 @@ static int computeDefragCycleUs(void) {
  * computeDefragCycleUs computation. */
 static int computeDelayMs(monotime intendedEndtime) {
     defrag.timeproc_end_time = getMonotonicUs();
-    int overage = defrag.timeproc_end_time - intendedEndtime;
+    long overage = defrag.timeproc_end_time - intendedEndtime;
     defrag.timeproc_overage_us += overage; // track over/under desired CPU
+    /* Allow negative overage (underage) to count against existing overage, but don't allow
+     * underage (from short stages) to be accumulated.  */
+    if (defrag.timeproc_overage_us < 0) defrag.timeproc_overage_us = 0;
 
     int targetCpuPercent = server.active_defrag_cpu_percent;
     serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100);
@@ -1221,7 +1224,7 @@ static int computeDelayMs(monotime intendedEndtime) {
     long totalCycleTimeUs = server.active_defrag_cycle_us * 100 / targetCpuPercent;
     long delayUs = totalCycleTimeUs - server.active_defrag_cycle_us;
     // Only increase delay by the fraction of the overage that would be non-duty-cycle
-    delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; // "overage" might be negative
+    delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100;
     if (delayUs < 0) delayUs = 0;
     long delayMs = delayUs / 1000; // round down
     return delayMs;
@@ -1286,6 +1289,9 @@ static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long i
  * actions.  This interface allows defrag to continue running, avoiding a single long defrag step
  * after the long operation completes. */
 void defragWhileBlocked(void) {
+    // This is called infrequently, while timers are not active.  We might need to start defrag.
+    if (!defragIsRunning()) monitorActiveDefrag();
+
     if (!defragIsRunning()) return;
 
     // Save off the timeproc_id.  If we have a normal termination, it will be cleared.

diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
@@ -138,8 +138,12 @@ run_solo {defrag} {
                 # reset stats and load the AOF file
                 r config resetstat
                 r config set key-load-delay -25 ;# sleep on average 1/25 usec
+                # Note: This test is checking if defrag is working DURING AOF loading (while
+                #       timers are not active).  So we don't give any extra time, and we deactivate
+                #       defrag immediately after the AOF loading is complete.  During loading,
+                #       defrag will get invoked less often, which triggers starvation prevention.
+                #       We should expect longer latency measurements.
                 r debug loadaof
-                after 1000 ;# give defrag a chance to work before turning it off
                 r config set activedefrag no
 
                 # measure hits and misses right after aof loading