From 67e0a07024c61cb45e2099d9b2dd56d5e6cb6e3a Mon Sep 17 00:00:00 2001 From: TB Schardl Date: Mon, 25 Jul 2022 19:53:07 +0000 Subject: [PATCH] Version 2.0.1 release --- include/cilk/cilk_api.h | 5 +- include/cilk/cilk_stub.h | 2 + runtime/init.c | 9 +++ runtime/local.h | 1 + runtime/pedigree-internal.h | 16 ++-- runtime/rts-config.h | 8 +- runtime/sched_stats.c | 40 +++++----- runtime/scheduler.c | 32 ++++---- runtime/worker_coord.h | 44 ++++++----- runtime/worker_sleep.h | 151 ++++++++++++++++++++++++++++-------- 10 files changed, 206 insertions(+), 102 deletions(-) diff --git a/include/cilk/cilk_api.h b/include/cilk/cilk_api.h index 112c4471..7320789d 100644 --- a/include/cilk/cilk_api.h +++ b/include/cilk/cilk_api.h @@ -4,7 +4,10 @@ #include /* size_t */ #ifdef __cplusplus +#define __CILKRTS_NOTHROW noexcept extern "C" { +#else +#define __CILKRTS_NOTHROW #endif extern int __cilkrts_is_initialized(void); @@ -20,7 +23,7 @@ typedef struct __cilkrts_pedigree { struct __cilkrts_pedigree *parent; } __cilkrts_pedigree; extern __cilkrts_pedigree __cilkrts_get_pedigree(void); -extern void __cilkrts_bump_worker_rank(void); +extern void __cilkrts_bump_worker_rank(void) __CILKRTS_NOTHROW; extern void __cilkrts_dprand_set_seed(uint64_t seed); extern void __cilkrts_init_dprng(void); extern uint64_t __cilkrts_get_dprand(void); diff --git a/include/cilk/cilk_stub.h b/include/cilk/cilk_stub.h index 08a849db..983c9a15 100644 --- a/include/cilk/cilk_stub.h +++ b/include/cilk/cilk_stub.h @@ -2,3 +2,5 @@ #define cilk_spawn /* empty */ #define cilk_sync /* empty */ #define cilk_scope /* empty */ + +#define cilk_reducer(I,R) /* empty */ diff --git a/runtime/init.c b/runtime/init.c index f76c5e7f..c8ca5bbb 100644 --- a/runtime/init.c +++ b/runtime/init.c @@ -50,6 +50,7 @@ static local_state *worker_local_init(local_state *l, global_state *g) { l->state = WORKER_IDLE; l->provably_good_steal = false; l->rand_next = 0; /* will be reset in scheduler loop */ + l->wake_val = 0; cilk_sched_stats_init(&(l->stats)); return l; @@ -118,6 +119,7 @@ __cilkrts_worker *__cilkrts_init_tls_worker(worker_id i, global_state *g) { return w; } +#if ENABLE_WORKER_PINNING #ifdef CPU_SETSIZE static void move_bit(int cpu, cpu_set_t *to, cpu_set_t *from) { if (CPU_ISSET(cpu, from)) { @@ -126,11 +128,13 @@ static void move_bit(int cpu, cpu_set_t *to, cpu_set_t *from) { } } #endif +#endif // ENABLE_WORKER_PINNING static void threads_init(global_state *g) { /* TODO: Mac OS has a better interface allowing the application to request that two threads run as far apart as possible by giving them distinct "affinity tags". */ +#if ENABLE_WORKER_PINNING #ifdef CPU_SETSIZE // Affinity setting, from cilkplus-rts cpu_set_t process_mask; @@ -171,6 +175,7 @@ static void threads_init(global_state *g) { break; } #endif +#endif // ENABLE_WORKER_PINNING int n_threads = g->nworkers; CILK_ASSERT_G(n_threads > 0); @@ -178,6 +183,7 @@ static void threads_init(global_state *g) { cilkrts_alert(BOOT, NULL, "(threads_init) Setting up threads"); +#if ENABLE_WORKER_PINNING #ifdef CPU_SETSIZE /* Three cases: core count at least twice worker count, allocate groups of floor(worker count / core count) CPUs. 
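/*
 * [Editor's note -- illustrative sketch, not part of the patch.]  The
 * __CILKRTS_NOTHROW macro added to cilk_api.h above follows a common
 * pattern for C headers that are also consumed from C++: expand to
 * `noexcept` under a C++ compiler and to nothing under plain C.  A minimal
 * standalone example of the same pattern, using hypothetical MYLIB_* names
 * rather than anything defined by this patch:
 */
#ifdef __cplusplus
#define MYLIB_NOTHROW noexcept
extern "C" {
#else
#define MYLIB_NOTHROW
#endif

/* C callers see a plain declaration; C++ callers get a noexcept guarantee. */
void mylib_bump_counter(void) MYLIB_NOTHROW;

#ifdef __cplusplus
} /* extern "C" */
#endif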
@@ -201,6 +207,7 @@ static void threads_init(global_state *g) { } } #endif +#endif // ENABLE_WORKER_PINNING int worker_start = #if BOSS_THIEF 1 @@ -216,6 +223,7 @@ static void threads_init(global_state *g) { cilkrts_bug(NULL, "Cilk: thread creation (%u) failed: %s", w, strerror(status)); +#if ENABLE_WORKER_PINNING #ifdef CPU_SETSIZE if (available_cores > 0) { /* Skip to the next active CPU ID. */ @@ -243,6 +251,7 @@ static void threads_init(global_state *g) { CILK_ASSERT_G(err == 0); } #endif +#endif // ENABLE_WORKER_PINNING } } diff --git a/runtime/local.h b/runtime/local.h index aafd7947..85360aff 100644 --- a/runtime/local.h +++ b/runtime/local.h @@ -14,6 +14,7 @@ struct local_state { unsigned short state; /* __cilkrts_worker_state */ bool provably_good_steal; unsigned int rand_next; + uint32_t wake_val; jmpbuf rts_ctx; struct cilk_fiber_pool fiber_pool; diff --git a/runtime/pedigree-internal.h b/runtime/pedigree-internal.h index bd6a794e..e9901174 100644 --- a/runtime/pedigree-internal.h +++ b/runtime/pedigree-internal.h @@ -17,22 +17,22 @@ typedef struct __pedigree_frame { int64_t dprng_depth; } __pedigree_frame; -typedef struct __pedigree_frame_storage_t { - size_t next_pedigree_frame; - __pedigree_frame* frames; -} __pedigree_frame_storage_t; - - /////////////////////////////////////////////////////////////////////////// // Helper methods static inline __attribute__((malloc)) __pedigree_frame * push_pedigree_frame(__cilkrts_worker *w) { +#if ENABLE_EXTENSION return __cilkrts_push_ext_stack(w, sizeof(__pedigree_frame)); +#else + return NULL; +#endif } static inline void pop_pedigree_frame(__cilkrts_worker *w) { +#if ENABLE_EXTENSION __cilkrts_pop_ext_stack(w, sizeof(__pedigree_frame)); +#endif } static inline uint64_t __cilkrts_dprng_swap_halves(uint64_t x) { @@ -63,11 +63,15 @@ static inline uint64_t __cilkrts_dprng_sum_mod_p(uint64_t a, uint64_t b) { // Helper method to advance the pedigree and dprng states. 
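/*
 * [Editor's note -- sketch under stated assumptions, not part of the
 * patch.]  The ENABLE_EXTENSION guards added above make
 * push_pedigree_frame and pop_pedigree_frame degrade to no-ops when
 * extension support is compiled out, so callers need no #ifdefs of their
 * own.  The same shape in isolation, with a hypothetical FEATURE_X flag
 * and real_push helper:
 */
#include <stddef.h>

#define FEATURE_X 0 /* feature compiled out in this example build */

static inline void *feature_x_push(size_t bytes) {
#if FEATURE_X
    return real_push(bytes); /* only referenced when the feature is enabled */
#else
    (void)bytes;
    return NULL; /* disabled build: callers simply see no frame */
#endif
}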
static inline __attribute__((always_inline)) __pedigree_frame * bump_worker_rank(void) { +#if ENABLE_EXTENSION __pedigree_frame *frame = (__pedigree_frame *)(__cilkrts_get_extension()); frame->rank++; frame->dprng_dotproduct = __cilkrts_dprng_sum_mod_p( frame->dprng_dotproduct, __pedigree_dprng_m_array[frame->dprng_depth]); return frame; +#else + return NULL; +#endif } #endif // _PEDIGREE_INTERNAL_H diff --git a/runtime/rts-config.h b/runtime/rts-config.h index f04daef5..33d4ca04 100644 --- a/runtime/rts-config.h +++ b/runtime/rts-config.h @@ -44,13 +44,7 @@ #define ENABLE_EXTENSION 1 -#if defined __linux__ -#define CILK_PAGE_SIZE 0 /* page size not available at compile time */ -#elif defined __APPLE__ -#define CILK_PAGE_SIZE 4096 /* Apple implies x86 or ARM */ -#else -#include -#endif +#define ENABLE_WORKER_PINNING 0 #define MIN_NUM_PAGES_PER_STACK 4 #define MAX_NUM_PAGES_PER_STACK 2000 diff --git a/runtime/sched_stats.c b/runtime/sched_stats.c index bb8e60c9..1c55ef05 100644 --- a/runtime/sched_stats.c +++ b/runtime/sched_stats.c @@ -8,6 +8,7 @@ #include "internal-malloc-impl.h" #include "local.h" #include "sched_stats.h" +#include "types.h" #if SCHED_STATS static const char *enum_to_str(enum timing_type t) { @@ -157,14 +158,15 @@ void cilk_exit_worker_timing(struct global_state *g) { static void sched_stats_reset_worker(__cilkrts_worker *w, void *data __attribute__((unused))) { + local_state *l = w->l; for (int t = 0; t < NUMBER_OF_STATS; t++) { - w->l->stats.time[t] = 0; - w->l->stats.count[t] = 0; + l->stats.time[t] = 0; + l->stats.count[t] = 0; } - w->l->stats.steals = 0; - w->l->stats.repos = 0; - w->l->stats.reeng_rqsts = 0; - w->l->stats.onesen_rqsts = 0; + l->stats.steals = 0; + l->stats.repos = 0; + l->stats.reeng_rqsts = 0; + l->stats.onesen_rqsts = 0; } #define COL_DESC "%15s" @@ -177,22 +179,24 @@ static void sched_stats_reset_worker(__cilkrts_worker *w, static void sched_stats_print_worker(__cilkrts_worker *w, void *data) { FILE *fp = (FILE *)data; fprintf(fp, WORKER_HDR_DESC, "Worker", w->self); + global_state *g = w->g; + local_state *l = w->l; for (int t = 0; t < NUMBER_OF_STATS; t++) { - double tmp = nsec_to_sec(w->l->stats.time[t]); - w->g->stats.time[t] += (double)tmp; - uint64_t tmp_count = w->l->stats.count[t]; - w->g->stats.count[t] += tmp_count; + double tmp = nsec_to_sec(l->stats.time[t]); + g->stats.time[t] += (double)tmp; + uint64_t tmp_count = l->stats.count[t]; + g->stats.count[t] += tmp_count; fprintf(fp, FIELD_DESC, tmp, tmp_count); } - w->g->stats.steals += w->l->stats.steals; - w->g->stats.repos += w->l->stats.repos; - w->g->stats.reeng_rqsts += w->l->stats.reeng_rqsts; - w->g->stats.onesen_rqsts += w->l->stats.onesen_rqsts; + g->stats.steals += l->stats.steals; + g->stats.repos += l->stats.repos; + g->stats.reeng_rqsts += l->stats.reeng_rqsts; + g->stats.onesen_rqsts += l->stats.onesen_rqsts; - fprintf(stderr, COUNT_DESC, w->l->stats.steals); - fprintf(stderr, COUNT_DESC, w->l->stats.repos); - fprintf(stderr, COUNT_DESC, w->l->stats.reeng_rqsts); - fprintf(stderr, COUNT_DESC, w->l->stats.onesen_rqsts); + fprintf(stderr, COUNT_DESC, l->stats.steals); + fprintf(stderr, COUNT_DESC, l->stats.repos); + fprintf(stderr, COUNT_DESC, l->stats.reeng_rqsts); + fprintf(stderr, COUNT_DESC, l->stats.onesen_rqsts); fprintf(fp, "\n"); } diff --git a/runtime/scheduler.c b/runtime/scheduler.c index cc24c4a1..5ee99048 100644 --- a/runtime/scheduler.c +++ b/runtime/scheduler.c @@ -172,6 +172,14 @@ static void setup_for_sync(__cilkrts_worker *w, Closure *t) { 
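/*
 * [Editor's note -- sketch; an assumption about intent, not taken from this
 * patch.]  The rts-config.h hunk above drops the compile-time
 * CILK_PAGE_SIZE constant; a runtime can instead query the page size once
 * at startup.  One portable POSIX way to do that:
 */
#include <stddef.h>
#include <unistd.h>

static size_t query_page_size(void) {
    long ps = sysconf(_SC_PAGESIZE);
    return ps > 0 ? (size_t)ps : 4096; /* fall back to a common default */
}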
CILK_ASSERT_POINTER_EQUAL(w, w->current_stack_frame, t->frame); SP(t->frame) = (void *)t->orig_rsp; + if (USE_EXTENSION) { + // Set the worker's extension (analogous to updating the worker's stack + // pointer). + w->extension = t->frame->extension; + // Set the worker's extension stack to be the start of the saved + // extension fiber. + w->ext_stack = sysdep_get_stack_start(t->ext_fiber); + } t->orig_rsp = NULL; // unset once we have sync-ed atomic_store_explicit(&t->frame->worker, w, memory_order_relaxed); } @@ -1513,10 +1521,9 @@ void worker_scheduler(__cilkrts_worker *w) { // Get the number of workers. We don't currently support changing the // number of workers dynamically during execution of a Cilkified region. unsigned int nworkers = rts->nworkers; - // Initialize count of consecutive failed steal attempts. Effectively, - // every worker is active upon entering this routine. - unsigned int fails = 0; - unsigned int request_threshold = SENTINEL_THRESHOLD; + // Initialize count of consecutive failed steal attempts. + unsigned int fails = init_fails(w->l->wake_val, rts); + unsigned int sample_threshold = SENTINEL_THRESHOLD; // Local history information of the state of the system, for sentinel // workers to use to determine when to disengage and how many workers to // reengage. @@ -1566,19 +1573,14 @@ void worker_scheduler(__cilkrts_worker *w) { index_to_worker[get_rand(rand_state) % stealable]; rand_state = update_rand_state(rand_state); while (victim == self) { - busy_loop_pause(); victim = index_to_worker[get_rand(rand_state) % stealable]; rand_state = update_rand_state(rand_state); } // Attempt to steal from that victim. t = Closure_steal(workers, deques, w, victim); if (!t) { - // Pause inside this busy loop. We perform many pause - // instructions in order to limit how much memory bandwidth - // the theif consumes. - for (int i = 0; i < STEAL_BUSY_PAUSE; ++i) { - busy_loop_pause(); - } + // Pause inside this busy loop. + steal_short_pause(); } } while (!t && --attempt > 0); @@ -1593,7 +1595,7 @@ void worker_scheduler(__cilkrts_worker *w) { } #endif fails = go_to_sleep_maybe( - rts, self, nworkers, w, t, fails, &request_threshold, + rts, self, nworkers, w, t, fails, &sample_threshold, &inefficient_history, &efficient_history, sentinel_count_history, &sentinel_count_history_tail, &recent_sentinel_count); @@ -1622,14 +1624,14 @@ void worker_scheduler(__cilkrts_worker *w) { // Decrement the count of failed steal attempts based on the // amount of work done. fails = decrease_fails_by_work(rts, w, fails, elapsed, - &request_threshold); + &sample_threshold); if (fails < SENTINEL_THRESHOLD) { inefficient_history = 0; efficient_history = 0; } } else { fails = 0; - request_threshold = SENTINEL_THRESHOLD; + sample_threshold = SENTINEL_THRESHOLD; } #endif // ENABLE_THIEF_SLEEP t = NULL; @@ -1696,7 +1698,7 @@ void *scheduler_thread_proc(void *arg) { #endif if (thief_should_wait(rts)) { disengage_worker(rts, nworkers, self); - thief_wait(rts); + w->l->wake_val = thief_wait(rts); reengage_worker(rts, nworkers, self); } #if !BOSS_THIEF diff --git a/runtime/worker_coord.h b/runtime/worker_coord.h index ebde70e2..19b1e150 100644 --- a/runtime/worker_coord.h +++ b/runtime/worker_coord.h @@ -4,6 +4,7 @@ // Routines for coordinating workers, specifically, putting workers to sleep and // waking workers when execution enters and leaves cilkified regions. +#include #include #ifdef __linux__ @@ -34,8 +35,8 @@ // Convenience wrapper for futex syscall. 
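/*
 * [Editor's note -- sketch, assumed definition.]  The scheduler changes
 * above funnel the thief's busy-waiting through busy_loop_pause() and the
 * new steal_short_pause() helper.  busy_loop_pause() itself is not shown in
 * this patch; a typical definition of such a spin-wait hint (an assumption,
 * not OpenCilk's actual code) looks like:
 */
static inline void example_busy_loop_pause(void) {
#if defined(__x86_64__) || defined(__i386__)
    __builtin_ia32_pause(); /* x86 PAUSE: be polite to the sibling hyperthread */
#elif defined(__aarch64__)
    __asm__ volatile("yield"); /* AArch64 yield hint */
#else
    /* no-op on other targets */
#endif
}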
static inline long futex(_Atomic uint32_t *uaddr, int futex_op, uint32_t val, - const struct timespec *timeout, uint32_t *uaddr2, - uint32_t val3) { + const struct timespec *timeout, uint32_t *uaddr2, + uint32_t val3) { return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr2, val3); } @@ -74,7 +75,7 @@ static inline void fbroadcast(_Atomic uint32_t *futexp) { // Called by a worker thread. Causes the worker thread to wait on the given // flag-futex pair. static inline void worker_wait(volatile atomic_bool *flag, - _Atomic uint32_t *flag_futex) { + _Atomic uint32_t *flag_futex) { while (!atomic_load_explicit(flag, memory_order_acquire)) { fwait(flag_futex); } @@ -82,7 +83,7 @@ static inline void worker_wait(volatile atomic_bool *flag, // Start all workers waiting on the given flag-futex pair. static inline void worker_start_broadcast(volatile atomic_bool *flag, - _Atomic uint32_t *flag_futex) { + _Atomic uint32_t *flag_futex) { atomic_store_explicit(flag, 1, memory_order_release); fbroadcast(flag_futex); } @@ -90,7 +91,7 @@ static inline void worker_start_broadcast(volatile atomic_bool *flag, // Reset the given flag-futex pair, so that workers will eventually resume // waiting on that flag-futex pair. static inline void worker_clear_start(volatile atomic_bool *flag, - _Atomic uint32_t *flag_futex) { + _Atomic uint32_t *flag_futex) { atomic_store_explicit(flag, 0, memory_order_relaxed); atomic_store_explicit(flag_futex, 0, memory_order_relaxed); } @@ -103,8 +104,8 @@ static inline void worker_clear_start(volatile atomic_bool *flag, // Called by a worker thread. Causes the worker thread to wait on the given // flag and associated mutex and condition variable. static inline void worker_wait(volatile atomic_bool *flag, - pthread_mutex_t *flag_lock, - pthread_cond_t *flag_cond_var) { + pthread_mutex_t *flag_lock, + pthread_cond_t *flag_cond_var) { pthread_mutex_lock(flag_lock); while (!atomic_load_explicit(flag, memory_order_acquire)) { pthread_cond_wait(flag_cond_var, flag_lock); @@ -115,8 +116,8 @@ static inline void worker_wait(volatile atomic_bool *flag, // Start all workers waiting on the given flag and associated mutex and // condition variable. static inline void worker_start_broadcast(volatile atomic_bool *flag, - pthread_mutex_t *flag_lock, - pthread_cond_t *flag_cond_var) { + pthread_mutex_t *flag_lock, + pthread_cond_t *flag_cond_var) { pthread_mutex_lock(flag_lock); atomic_store_explicit(flag, 1, memory_order_release); pthread_cond_broadcast(flag_cond_var); @@ -352,7 +353,7 @@ static inline void request_more_thieves(global_state *g, uint32_t count) { } #if USE_FUTEX -static inline void thief_disengage_futex(_Atomic uint32_t *futexp) { +static inline uint32_t thief_disengage_futex(_Atomic uint32_t *futexp) { // This step synchronizes with calls to request_more_thieves. while (true) { // Decrement the futex when woken up. 
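/*
 * [Editor's note -- Linux-only sketch with assumed details.]  The futex()
 * wrapper and the fwait/fbroadcast helpers touched above boil down to the
 * following self-contained pattern: FUTEX_WAIT blocks while *uaddr still
 * equals the expected value, and FUTEX_WAKE wakes up to INT_MAX waiters.
 * Names prefixed example_ are hypothetical.
 */
#include <limits.h>
#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static void example_fwait(_Atomic uint32_t *uaddr, uint32_t expected) {
    /* Returns when *uaddr != expected, on a wake-up, or spuriously. */
    syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, expected, NULL, NULL, 0);
}

static void example_fbroadcast(_Atomic uint32_t *uaddr) {
    syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0);
}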
The loop and compare-exchange are @@ -363,7 +364,7 @@ static inline void thief_disengage_futex(_Atomic uint32_t *futexp) { if (atomic_compare_exchange_strong_explicit(futexp, &val, val - 1, memory_order_release, memory_order_acquire)) { - return; + return val; } } @@ -374,9 +375,9 @@ static inline void thief_disengage_futex(_Atomic uint32_t *futexp) { } } #else -static inline void thief_disengage_cond_var(_Atomic uint32_t *count, - pthread_mutex_t *lock, - pthread_cond_t *cond_var) { +static inline uint32_t thief_disengage_cond_var(_Atomic uint32_t *count, + pthread_mutex_t *lock, + pthread_cond_t *cond_var) { // This step synchronizes with calls to request_more_thieves. pthread_mutex_lock(lock); while (true) { @@ -384,18 +385,19 @@ static inline void thief_disengage_cond_var(_Atomic uint32_t *count, if (val > 0) { atomic_store_explicit(count, val - 1, memory_order_release); pthread_mutex_unlock(lock); - return; + return val; } pthread_cond_wait(cond_var, lock); } } #endif -static inline void thief_disengage(global_state *g) { +static inline uint32_t thief_disengage(global_state *g) { #if USE_FUTEX - thief_disengage_futex(&g->disengaged_thieves_futex); + return thief_disengage_futex(&g->disengaged_thieves_futex); #else - thief_disengage_cond_var(&g->disengaged_thieves_futex, &g->disengaged_lock, - &g->disengaged_cond_var); + return thief_disengage_cond_var(&g->disengaged_thieves_futex, + &g->disengaged_lock, + &g->disengaged_cond_var); #endif } @@ -425,8 +427,8 @@ static inline void sleep_thieves(global_state *g) { // Called by a thief thread. Causes the thief thread to wait for a signal to // start work-stealing. -static inline void thief_wait(global_state *g) { - thief_disengage(g); +static inline uint32_t thief_wait(global_state *g) { + return thief_disengage(g); } // Called by a thief thread. Check if the thief should start waiting for the diff --git a/runtime/worker_sleep.h b/runtime/worker_sleep.h index 236c2774..f3d074ca 100644 --- a/runtime/worker_sleep.h +++ b/runtime/worker_sleep.h @@ -1,7 +1,10 @@ #ifndef _WORKER_SLEEP_H #define _WORKER_SLEEP_H +#include + #include "cilk-internal.h" +#include "sched_stats.h" #include "worker_coord.h" #if defined(__APPLE__) && defined(__aarch64__) @@ -14,10 +17,11 @@ // Nanoseconds that a sentinel worker should sleep if it reaches the disengage // threshold but does not disengage. -/* #define SLEEP_NSEC 12500 */ +/* #define NAP_NSEC 12500 */ #define NAP_NSEC 25000 -/* #define SLEEP_NSEC 50000 */ -#define SLEEP_NSEC 4 * NAP_NSEC +/* #define NAP_NSEC 50000 */ +/* #define SLEEP_NSEC 4 * NAP_NSEC */ +#define SLEEP_NSEC NAP_NSEC // Ratio of active workers over sentinels that the system aims to maintain. #define AS_RATIO 2 @@ -48,15 +52,44 @@ typedef uint32_t history_t; // attempts don't take too much memory bandwidth away from the workers doing // work. #define STEAL_BUSY_PAUSE 16 +#define LONG_STEAL_BUSY_PAUSE 200 static inline __attribute__((always_inline)) uint64_t gettime_fast(void) { + // __builtin_readcyclecounter triggers "illegal instruction" errors on ARM64 + // chips, unless user-level access to the cycle counter has been enabled in + // the kernel. Since we cannot rely on that, we use other means to measure + // the time. #ifdef APPLE_ARM64 - // __builtin_readcyclecounter triggers "illegal instruction" runtime errors - // on Apple M1s. 
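/*
 * [Editor's note -- sketch with hypothetical names.]  A key change above is
 * that thief_disengage/thief_wait now return the counter value the thief
 * observed when it claimed its wake-up token; the scheduler stores it in
 * wake_val and later feeds it to init_fails.  The "claim one token, report
 * what you saw" idea, stripped down to a condition-variable version:
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

static uint32_t example_claim_token(_Atomic uint32_t *count,
                                    pthread_mutex_t *lock,
                                    pthread_cond_t *cv) {
    pthread_mutex_lock(lock);
    uint32_t val;
    while ((val = atomic_load(count)) == 0)
        pthread_cond_wait(cv, lock); /* sleep until a token is posted */
    atomic_store(count, val - 1);    /* consume exactly one token */
    pthread_mutex_unlock(lock);
    return val; /* the observed count doubles as a hint for the caller */
}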
return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); +#elif defined(__aarch64__) + struct timespec res; + clock_gettime(CLOCK_MONOTONIC_RAW, &res); + return (res.tv_sec * 1e9) + (res.tv_nsec); #else return __builtin_readcyclecounter(); -#endif // #if APPLE_ARM64 +#endif +} + +static inline __attribute__((always_inline)) void steal_short_pause(void) { + // We perform many pause instructions in order to limit how much memory + // bandwidth and other computing resources the thief consumes. +#if defined(__aarch64__) + for (int i = 0; i < STEAL_BUSY_PAUSE; ++i) { + busy_loop_pause(); + } +#else + uint64_t start = __builtin_readcyclecounter(); + do { + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + } while ((__builtin_readcyclecounter() - start) < 1600); +#endif } typedef struct worker_counts { @@ -193,7 +226,7 @@ is_efficient(worker_counts counts) { // Convert the elapsed time spent working into a fail count. __attribute__((const, always_inline)) static inline unsigned int get_scaled_elapsed(unsigned int elapsed) { -#ifdef APPLE_ARM64 +#if defined(__aarch64__) return ((elapsed * (1 * SENTINEL_THRESHOLD) / (16 * 65536)) / ATTEMPTS) * ATTEMPTS; #else @@ -208,7 +241,7 @@ __attribute__((always_inline)) static inline unsigned int maybe_reengage_workers(global_state *const rts, worker_id self, unsigned int nworkers, __cilkrts_worker *const w, unsigned int fails, - unsigned int *const request_threshold, + unsigned int *const sample_threshold, history_t *const inefficient_history, history_t *const efficient_history, unsigned int *const sentinel_count_history, @@ -230,7 +263,7 @@ maybe_reengage_workers(global_state *const rts, worker_id self, history_t my_efficient_history = *efficient_history; history_t my_inefficient_history = *inefficient_history; unsigned int my_sentinel_count = *recent_sentinel_count; - if (fails >= *request_threshold) { + if (fails >= *sample_threshold) { // Update the inefficient history. history_t curr_ineff = is_inefficient(counts); my_inefficient_history = (my_inefficient_history >> 1) | @@ -286,11 +319,13 @@ maybe_reengage_workers(global_state *const rts, worker_id self, // Make sure at least 1 worker is requested if we're about to run // out of sentinels. if (request == 0 && counts.sentinels == 0 && - counts.active < (int32_t)nworkers && - !atomic_load_explicit(&rts->disengaged_thieves_futex, - memory_order_relaxed)) { - request = (counts.active + 3) / 4; - WHEN_SCHED_STATS(w->l->stats.onesen_rqsts += request); + counts.active < (int32_t)nworkers) { + int32_t current_request = atomic_load_explicit( + &rts->disengaged_thieves_futex, memory_order_relaxed); + if (current_request < ((counts.active + 3) / 4)) { + request = ((counts.active + 3) / 4) - current_request; + WHEN_SCHED_STATS(w->l->stats.onesen_rqsts += request); + } } if (request > 0) { @@ -298,13 +333,14 @@ maybe_reengage_workers(global_state *const rts, worker_id self, } // Set a cap on the fail count. - if (fails > DISENGAGE_THRESHOLD) - fails = DISENGAGE_THRESHOLD; + if (fails > SENTINEL_THRESHOLD) { + fails = SENTINEL_THRESHOLD; + } // Update request threshold so that, in case this worker ends up // executing a small task, it still adds samples to its history that // are spread out in time. 
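/*
 * [Editor's note -- sketch; the window length is an assumption.]  The
 * efficient/inefficient histories updated above are shift registers of
 * recent boolean samples: each new sample is pushed in at the top bit and
 * the oldest sample falls off the bottom.  Illustration with an assumed
 * 32-sample window:
 */
#include <stdint.h>

#define EXAMPLE_HISTORY_LENGTH 32

static inline uint32_t example_push_sample(uint32_t history, int sample) {
    return (history >> 1) |
           ((uint32_t)(sample != 0) << (EXAMPLE_HISTORY_LENGTH - 1));
}

static inline int example_recent_true_count(uint32_t history) {
    return __builtin_popcount(history); /* how many of the last 32 were true */
}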
- *request_threshold = fails + (SENTINEL_THRESHOLD / 1); + *sample_threshold = fails + (SENTINEL_THRESHOLD / 1); } return fails; @@ -350,7 +386,7 @@ __attribute__((always_inline)) static inline unsigned int handle_failed_steal_attempts(global_state *const rts, worker_id self, unsigned int nworkers, __cilkrts_worker *const w, unsigned int fails, - unsigned int *const request_threshold, + unsigned int *const sample_threshold, history_t *const inefficient_history, history_t *const efficient_history, unsigned int *const sentinel_count_history, @@ -382,8 +418,7 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self, #if BOSS_THIEF if (is_boss_thread) { if (fails % NAP_THRESHOLD == 0) { - // The boss thread should never disengage. Sleep - // instead. + // The boss thread should never disengage. Sleep instead. const struct timespec sleeptime = { .tv_sec = 0, .tv_nsec = @@ -394,6 +429,7 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self, #else { #endif +#if ENABLE_THIEF_SLEEP // Check if the current worker counts. uint64_t disengaged_sentinel = atomic_load_explicit( &rts->disengaged_sentinel, memory_order_acquire); @@ -455,13 +491,16 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self, } // Update fail count - if (scaled_elapsed < SENTINEL_THRESHOLD) + if (scaled_elapsed < SENTINEL_THRESHOLD) { fails -= scaled_elapsed; - else { + } else { fails = DISENGAGE_THRESHOLD - SENTINEL_THRESHOLD; } - *request_threshold = SENTINEL_THRESHOLD; + *sample_threshold = SENTINEL_THRESHOLD; } +#else + if (false) { +#endif } else if (fails % NAP_THRESHOLD == 0) { // We have enough active workers to keep this worker out of // disengage, but this worker was still unable to steal @@ -470,24 +509,48 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self, // approximately 50 us. const struct timespec sleeptime = { .tv_sec = 0, - .tv_nsec = + .tv_nsec = (fails > SLEEP_THRESHOLD) ? SLEEP_NSEC : NAP_NSEC}; nanosleep(&sleeptime, NULL); } else { // We perform many pause instructions to reduce the thief's // load on the system in a lightweight manner. - for (int i = 0; i < 8 * ATTEMPTS; ++i) { +#if defined(__aarch64__) + for (int i = 0; i < STEAL_BUSY_PAUSE; ++i) { busy_loop_pause(); } +#else + uint64_t start = __builtin_readcyclecounter(); + do { + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + } while ((__builtin_readcyclecounter() - start) < 800); +#endif } } } } else { // We perform many pause instructions to reduce the thief's load on // the system in a lightweight manner. 
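/*
 * [Editor's note -- minimal sketch.]  The nap/sleep paths above back off
 * with nanosleep for NAP_NSEC or SLEEP_NSEC nanoseconds instead of
 * disengaging.  The same call in isolation:
 */
#include <time.h>

static void example_nap(long nsec) {
    struct timespec ts = {.tv_sec = 0, .tv_nsec = nsec};
    nanosleep(&ts, NULL); /* ignore early wake-ups (EINTR) in this sketch */
}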
- for (int i = 0; i < 32 * ATTEMPTS; ++i) { +#if defined(__aarch64__) + for (int i = 0; i < LONG_STEAL_BUSY_PAUSE; ++i) { busy_loop_pause(); } +#else + uint64_t start = __builtin_readcyclecounter(); + do { + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + } while ((__builtin_readcyclecounter() - start) < 3600); +#endif } CILK_STOP_TIMING(w, INTERVAL_SLEEP); return fails; @@ -498,7 +561,7 @@ static unsigned int go_to_sleep_maybe(global_state *const rts, worker_id self, unsigned int nworkers, __cilkrts_worker *const w, Closure *const t, unsigned int fails, - unsigned int *const request_threshold, + unsigned int *const sample_threshold, history_t *const inefficient_history, history_t *const efficient_history, unsigned int *const sentinel_count_history, @@ -506,12 +569,12 @@ static unsigned int go_to_sleep_maybe(global_state *const rts, worker_id self, unsigned int *const recent_sentinel_count) { if (t) { return maybe_reengage_workers( - rts, self, nworkers, w, fails, request_threshold, + rts, self, nworkers, w, fails, sample_threshold, inefficient_history, efficient_history, sentinel_count_history, sentinel_count_history_tail, recent_sentinel_count); } else { return handle_failed_steal_attempts( - rts, self, nworkers, w, fails, request_threshold, + rts, self, nworkers, w, fails, sample_threshold, inefficient_history, efficient_history, sentinel_count_history, sentinel_count_history_tail, recent_sentinel_count); } @@ -521,7 +584,7 @@ static unsigned int go_to_sleep_maybe(global_state *const rts, worker_id self, __attribute__((always_inline)) static unsigned int decrease_fails_by_work(global_state *const rts, __cilkrts_worker *const w, unsigned int fails, uint64_t elapsed, - unsigned int *const request_threshold) { + unsigned int *const sample_threshold) { uint64_t scaled_elapsed = get_scaled_elapsed(elapsed); // Decrease the number of fails based on the work done. @@ -535,10 +598,10 @@ decrease_fails_by_work(global_state *const rts, __cilkrts_worker *const w, // work. CILK_ASSERT(w, fails % ATTEMPTS == 0); - if (scaled_elapsed > (uint64_t)(*request_threshold) - SENTINEL_THRESHOLD) - *request_threshold = SENTINEL_THRESHOLD; + if (scaled_elapsed > (uint64_t)(*sample_threshold) - SENTINEL_THRESHOLD) + *sample_threshold = SENTINEL_THRESHOLD; else - *request_threshold -= scaled_elapsed; + *sample_threshold -= scaled_elapsed; // If this worker is still sentinel, update sentinel-worker count. if (fails >= SENTINEL_THRESHOLD) @@ -548,6 +611,26 @@ decrease_fails_by_work(global_state *const rts, __cilkrts_worker *const w, } #endif // ENABLE_THIEF_SLEEP +__attribute__((always_inline)) static unsigned int +init_fails(uint32_t wake_val, global_state *rts) { + // It's possible that a disengaged worker is woken up by a call to + // request_more_thieves, in which case it should be a sentinel. But there + // isn't a direct way to tell how whether the worker should be active or a + // sentinel when it's woken up. Since the maximum value of the futex when + // sentinels are engaging and disengaging during Cilk execution is + // nworkers/2, we simply assume that if the value of the futex is less than + // that value, then it should be a sentinel. + // + // As a result, when workers are woken up to start executing any new Cilk + // function, half of them will be active, and half sentinels. 
+ if (wake_val <= (rts->nworkers / 2)) { + atomic_fetch_add_explicit(&rts->disengaged_sentinel, 1, + memory_order_release); + return SENTINEL_THRESHOLD; + } + return 0; +} + __attribute__((always_inline)) static unsigned int reset_fails(global_state *rts, unsigned int fails) { if (fails >= SENTINEL_THRESHOLD) { @@ -574,7 +657,7 @@ disengage_worker(global_state *g, unsigned int nworkers, worker_id self) { cilk_mutex_unlock(&g->index_lock); } - __attribute__((always_inline)) static inline void +__attribute__((always_inline)) static inline void reengage_worker(global_state *g, unsigned int nworkers, worker_id self) { cilk_mutex_lock(&g->index_lock); uint64_t disengaged_sentinel = atomic_fetch_sub_explicit(
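/*
 * [Editor's note -- sketch; the packing layout is an assumption, not stated
 * in this patch.]  init_fails, disengage_worker, and reengage_worker above
 * all touch rts->disengaged_sentinel, a single 64-bit atomic that appears
 * to pack two 32-bit counts so both can be read consistently in one load.
 * The packing idiom in isolation, with hypothetical helpers:
 */
#include <stdint.h>

static inline uint32_t example_high32(uint64_t packed) {
    return (uint32_t)(packed >> 32);         /* e.g., count of disengaged workers */
}

static inline uint32_t example_low32(uint64_t packed) {
    return (uint32_t)(packed & 0xFFFFFFFFu); /* e.g., count of sentinel workers */
}

static inline uint64_t example_pack(uint32_t high, uint32_t low) {
    return ((uint64_t)high << 32) | low;
}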