From 67e0a07024c61cb45e2099d9b2dd56d5e6cb6e3a Mon Sep 17 00:00:00 2001 From: TB Schardl Date: Mon, 25 Jul 2022 19:53:07 +0000 Subject: [PATCH] Version 2.0.1 release --- include/cilk/cilk_api.h | 5 +- include/cilk/cilk_stub.h | 2 + runtime/init.c | 9 +++ runtime/local.h | 1 + runtime/pedigree-internal.h | 16 ++-- runtime/rts-config.h | 8 +- runtime/sched_stats.c | 40 +++++----- runtime/scheduler.c | 32 ++++---- runtime/worker_coord.h | 44 ++++++----- runtime/worker_sleep.h | 151 ++++++++++++++++++++++++++++-------- 10 files changed, 206 insertions(+), 102 deletions(-) diff --git a/include/cilk/cilk_api.h b/include/cilk/cilk_api.h index 112c4471..7320789d 100644 --- a/include/cilk/cilk_api.h +++ b/include/cilk/cilk_api.h @@ -4,7 +4,10 @@ #include /* size_t */ #ifdef __cplusplus +#define __CILKRTS_NOTHROW noexcept extern "C" { +#else +#define __CILKRTS_NOTHROW #endif extern int __cilkrts_is_initialized(void); @@ -20,7 +23,7 @@ typedef struct __cilkrts_pedigree { struct __cilkrts_pedigree *parent; } __cilkrts_pedigree; extern __cilkrts_pedigree __cilkrts_get_pedigree(void); -extern void __cilkrts_bump_worker_rank(void); +extern void __cilkrts_bump_worker_rank(void) __CILKRTS_NOTHROW; extern void __cilkrts_dprand_set_seed(uint64_t seed); extern void __cilkrts_init_dprng(void); extern uint64_t __cilkrts_get_dprand(void); diff --git a/include/cilk/cilk_stub.h b/include/cilk/cilk_stub.h index 08a849db..983c9a15 100644 --- a/include/cilk/cilk_stub.h +++ b/include/cilk/cilk_stub.h @@ -2,3 +2,5 @@ #define cilk_spawn /* empty */ #define cilk_sync /* empty */ #define cilk_scope /* empty */ + +#define cilk_reducer(I,R) /* empty */ diff --git a/runtime/init.c b/runtime/init.c index f76c5e7f..c8ca5bbb 100644 --- a/runtime/init.c +++ b/runtime/init.c @@ -50,6 +50,7 @@ static local_state *worker_local_init(local_state *l, global_state *g) { l->state = WORKER_IDLE; l->provably_good_steal = false; l->rand_next = 0; /* will be reset in scheduler loop */ + l->wake_val = 0; cilk_sched_stats_init(&(l->stats)); return l; @@ -118,6 +119,7 @@ __cilkrts_worker *__cilkrts_init_tls_worker(worker_id i, global_state *g) { return w; } +#if ENABLE_WORKER_PINNING #ifdef CPU_SETSIZE static void move_bit(int cpu, cpu_set_t *to, cpu_set_t *from) { if (CPU_ISSET(cpu, from)) { @@ -126,11 +128,13 @@ static void move_bit(int cpu, cpu_set_t *to, cpu_set_t *from) { } } #endif +#endif // ENABLE_WORKER_PINNING static void threads_init(global_state *g) { /* TODO: Mac OS has a better interface allowing the application to request that two threads run as far apart as possible by giving them distinct "affinity tags". */ +#if ENABLE_WORKER_PINNING #ifdef CPU_SETSIZE // Affinity setting, from cilkplus-rts cpu_set_t process_mask; @@ -171,6 +175,7 @@ static void threads_init(global_state *g) { break; } #endif +#endif // ENABLE_WORKER_PINNING int n_threads = g->nworkers; CILK_ASSERT_G(n_threads > 0); @@ -178,6 +183,7 @@ static void threads_init(global_state *g) { cilkrts_alert(BOOT, NULL, "(threads_init) Setting up threads"); +#if ENABLE_WORKER_PINNING #ifdef CPU_SETSIZE /* Three cases: core count at least twice worker count, allocate groups of floor(worker count / core count) CPUs. 
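/*
 * [Editor's note -- illustrative sketch, not part of the patch.]  The
 * __CILKRTS_NOTHROW macro added to cilk_api.h above follows a common
 * pattern for C headers that are also consumed from C++: expand to
 * `noexcept` under a C++ compiler and to nothing under plain C.  A minimal
 * standalone example of the same pattern, using hypothetical MYLIB_* names
 * rather than anything defined by this patch:
 */
#ifdef __cplusplus
#define MYLIB_NOTHROW noexcept
extern "C" {
#else
#define MYLIB_NOTHROW
#endif

/* C callers see a plain declaration; C++ callers get a noexcept guarantee. */
void mylib_bump_counter(void) MYLIB_NOTHROW;

#ifdef __cplusplus
} /* extern "C" */
#endif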
@@ -201,6 +207,7 @@ static void threads_init(global_state *g) { } } #endif +#endif // ENABLE_WORKER_PINNING int worker_start = #if BOSS_THIEF 1 @@ -216,6 +223,7 @@ static void threads_init(global_state *g) { cilkrts_bug(NULL, "Cilk: thread creation (%u) failed: %s", w, strerror(status)); +#if ENABLE_WORKER_PINNING #ifdef CPU_SETSIZE if (available_cores > 0) { /* Skip to the next active CPU ID. */ @@ -243,6 +251,7 @@ static void threads_init(global_state *g) { CILK_ASSERT_G(err == 0); } #endif +#endif // ENABLE_WORKER_PINNING } } diff --git a/runtime/local.h b/runtime/local.h index aafd7947..85360aff 100644 --- a/runtime/local.h +++ b/runtime/local.h @@ -14,6 +14,7 @@ struct local_state { unsigned short state; /* __cilkrts_worker_state */ bool provably_good_steal; unsigned int rand_next; + uint32_t wake_val; jmpbuf rts_ctx; struct cilk_fiber_pool fiber_pool; diff --git a/runtime/pedigree-internal.h b/runtime/pedigree-internal.h index bd6a794e..e9901174 100644 --- a/runtime/pedigree-internal.h +++ b/runtime/pedigree-internal.h @@ -17,22 +17,22 @@ typedef struct __pedigree_frame { int64_t dprng_depth; } __pedigree_frame; -typedef struct __pedigree_frame_storage_t { - size_t next_pedigree_frame; - __pedigree_frame* frames; -} __pedigree_frame_storage_t; - - /////////////////////////////////////////////////////////////////////////// // Helper methods static inline __attribute__((malloc)) __pedigree_frame * push_pedigree_frame(__cilkrts_worker *w) { +#if ENABLE_EXTENSION return __cilkrts_push_ext_stack(w, sizeof(__pedigree_frame)); +#else + return NULL; +#endif } static inline void pop_pedigree_frame(__cilkrts_worker *w) { +#if ENABLE_EXTENSION __cilkrts_pop_ext_stack(w, sizeof(__pedigree_frame)); +#endif } static inline uint64_t __cilkrts_dprng_swap_halves(uint64_t x) { @@ -63,11 +63,15 @@ static inline uint64_t __cilkrts_dprng_sum_mod_p(uint64_t a, uint64_t b) { // Helper method to advance the pedigree and dprng states. 
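/*
 * [Editor's note -- sketch under stated assumptions, not part of the
 * patch.]  The ENABLE_EXTENSION guards added above make
 * push_pedigree_frame and pop_pedigree_frame degrade to no-ops when
 * extension support is compiled out, so callers need no #ifdefs of their
 * own.  The same shape in isolation, with a hypothetical FEATURE_X flag
 * and real_push helper:
 */
#include <stddef.h>

#define FEATURE_X 0 /* feature compiled out in this example build */

static inline void *feature_x_push(size_t bytes) {
#if FEATURE_X
    return real_push(bytes); /* only referenced when the feature is enabled */
#else
    (void)bytes;
    return NULL; /* disabled build: callers simply see no frame */
#endif
}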
static inline __attribute__((always_inline)) __pedigree_frame * bump_worker_rank(void) { +#if ENABLE_EXTENSION __pedigree_frame *frame = (__pedigree_frame *)(__cilkrts_get_extension()); frame->rank++; frame->dprng_dotproduct = __cilkrts_dprng_sum_mod_p( frame->dprng_dotproduct, __pedigree_dprng_m_array[frame->dprng_depth]); return frame; +#else + return NULL; +#endif } #endif // _PEDIGREE_INTERNAL_H diff --git a/runtime/rts-config.h b/runtime/rts-config.h index f04daef5..33d4ca04 100644 --- a/runtime/rts-config.h +++ b/runtime/rts-config.h @@ -44,13 +44,7 @@ #define ENABLE_EXTENSION 1 -#if defined __linux__ -#define CILK_PAGE_SIZE 0 /* page size not available at compile time */ -#elif defined __APPLE__ -#define CILK_PAGE_SIZE 4096 /* Apple implies x86 or ARM */ -#else -#include -#endif +#define ENABLE_WORKER_PINNING 0 #define MIN_NUM_PAGES_PER_STACK 4 #define MAX_NUM_PAGES_PER_STACK 2000 diff --git a/runtime/sched_stats.c b/runtime/sched_stats.c index bb8e60c9..1c55ef05 100644 --- a/runtime/sched_stats.c +++ b/runtime/sched_stats.c @@ -8,6 +8,7 @@ #include "internal-malloc-impl.h" #include "local.h" #include "sched_stats.h" +#include "types.h" #if SCHED_STATS static const char *enum_to_str(enum timing_type t) { @@ -157,14 +158,15 @@ void cilk_exit_worker_timing(struct global_state *g) { static void sched_stats_reset_worker(__cilkrts_worker *w, void *data __attribute__((unused))) { + local_state *l = w->l; for (int t = 0; t < NUMBER_OF_STATS; t++) { - w->l->stats.time[t] = 0; - w->l->stats.count[t] = 0; + l->stats.time[t] = 0; + l->stats.count[t] = 0; } - w->l->stats.steals = 0; - w->l->stats.repos = 0; - w->l->stats.reeng_rqsts = 0; - w->l->stats.onesen_rqsts = 0; + l->stats.steals = 0; + l->stats.repos = 0; + l->stats.reeng_rqsts = 0; + l->stats.onesen_rqsts = 0; } #define COL_DESC "%15s" @@ -177,22 +179,24 @@ static void sched_stats_reset_worker(__cilkrts_worker *w, static void sched_stats_print_worker(__cilkrts_worker *w, void *data) { FILE *fp = (FILE *)data; fprintf(fp, WORKER_HDR_DESC, "Worker", w->self); + global_state *g = w->g; + local_state *l = w->l; for (int t = 0; t < NUMBER_OF_STATS; t++) { - double tmp = nsec_to_sec(w->l->stats.time[t]); - w->g->stats.time[t] += (double)tmp; - uint64_t tmp_count = w->l->stats.count[t]; - w->g->stats.count[t] += tmp_count; + double tmp = nsec_to_sec(l->stats.time[t]); + g->stats.time[t] += (double)tmp; + uint64_t tmp_count = l->stats.count[t]; + g->stats.count[t] += tmp_count; fprintf(fp, FIELD_DESC, tmp, tmp_count); } - w->g->stats.steals += w->l->stats.steals; - w->g->stats.repos += w->l->stats.repos; - w->g->stats.reeng_rqsts += w->l->stats.reeng_rqsts; - w->g->stats.onesen_rqsts += w->l->stats.onesen_rqsts; + g->stats.steals += l->stats.steals; + g->stats.repos += l->stats.repos; + g->stats.reeng_rqsts += l->stats.reeng_rqsts; + g->stats.onesen_rqsts += l->stats.onesen_rqsts; - fprintf(stderr, COUNT_DESC, w->l->stats.steals); - fprintf(stderr, COUNT_DESC, w->l->stats.repos); - fprintf(stderr, COUNT_DESC, w->l->stats.reeng_rqsts); - fprintf(stderr, COUNT_DESC, w->l->stats.onesen_rqsts); + fprintf(stderr, COUNT_DESC, l->stats.steals); + fprintf(stderr, COUNT_DESC, l->stats.repos); + fprintf(stderr, COUNT_DESC, l->stats.reeng_rqsts); + fprintf(stderr, COUNT_DESC, l->stats.onesen_rqsts); fprintf(fp, "\n"); } diff --git a/runtime/scheduler.c b/runtime/scheduler.c index cc24c4a1..5ee99048 100644 --- a/runtime/scheduler.c +++ b/runtime/scheduler.c @@ -172,6 +172,14 @@ static void setup_for_sync(__cilkrts_worker *w, Closure *t) { 
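/*
 * [Editor's note -- sketch; an assumption about intent, not taken from this
 * patch.]  The rts-config.h hunk above drops the compile-time
 * CILK_PAGE_SIZE constant; a runtime can instead query the page size once
 * at startup.  One portable POSIX way to do that:
 */
#include <stddef.h>
#include <unistd.h>

static size_t query_page_size(void) {
    long ps = sysconf(_SC_PAGESIZE);
    return ps > 0 ? (size_t)ps : 4096; /* fall back to a common default */
}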
CILK_ASSERT_POINTER_EQUAL(w, w->current_stack_frame, t->frame); SP(t->frame) = (void *)t->orig_rsp; + if (USE_EXTENSION) { + // Set the worker's extension (analogous to updating the worker's stack + // pointer). + w->extension = t->frame->extension; + // Set the worker's extension stack to be the start of the saved + // extension fiber. + w->ext_stack = sysdep_get_stack_start(t->ext_fiber); + } t->orig_rsp = NULL; // unset once we have sync-ed atomic_store_explicit(&t->frame->worker, w, memory_order_relaxed); } @@ -1513,10 +1521,9 @@ void worker_scheduler(__cilkrts_worker *w) { // Get the number of workers. We don't currently support changing the // number of workers dynamically during execution of a Cilkified region. unsigned int nworkers = rts->nworkers; - // Initialize count of consecutive failed steal attempts. Effectively, - // every worker is active upon entering this routine. - unsigned int fails = 0; - unsigned int request_threshold = SENTINEL_THRESHOLD; + // Initialize count of consecutive failed steal attempts. + unsigned int fails = init_fails(w->l->wake_val, rts); + unsigned int sample_threshold = SENTINEL_THRESHOLD; // Local history information of the state of the system, for sentinel // workers to use to determine when to disengage and how many workers to // reengage. @@ -1566,19 +1573,14 @@ void worker_scheduler(__cilkrts_worker *w) { index_to_worker[get_rand(rand_state) % stealable]; rand_state = update_rand_state(rand_state); while (victim == self) { - busy_loop_pause(); victim = index_to_worker[get_rand(rand_state) % stealable]; rand_state = update_rand_state(rand_state); } // Attempt to steal from that victim. t = Closure_steal(workers, deques, w, victim); if (!t) { - // Pause inside this busy loop. We perform many pause - // instructions in order to limit how much memory bandwidth - // the theif consumes. - for (int i = 0; i < STEAL_BUSY_PAUSE; ++i) { - busy_loop_pause(); - } + // Pause inside this busy loop. + steal_short_pause(); } } while (!t && --attempt > 0); @@ -1593,7 +1595,7 @@ void worker_scheduler(__cilkrts_worker *w) { } #endif fails = go_to_sleep_maybe( - rts, self, nworkers, w, t, fails, &request_threshold, + rts, self, nworkers, w, t, fails, &sample_threshold, &inefficient_history, &efficient_history, sentinel_count_history, &sentinel_count_history_tail, &recent_sentinel_count); @@ -1622,14 +1624,14 @@ void worker_scheduler(__cilkrts_worker *w) { // Decrement the count of failed steal attempts based on the // amount of work done. fails = decrease_fails_by_work(rts, w, fails, elapsed, - &request_threshold); + &sample_threshold); if (fails < SENTINEL_THRESHOLD) { inefficient_history = 0; efficient_history = 0; } } else { fails = 0; - request_threshold = SENTINEL_THRESHOLD; + sample_threshold = SENTINEL_THRESHOLD; } #endif // ENABLE_THIEF_SLEEP t = NULL; @@ -1696,7 +1698,7 @@ void *scheduler_thread_proc(void *arg) { #endif if (thief_should_wait(rts)) { disengage_worker(rts, nworkers, self); - thief_wait(rts); + w->l->wake_val = thief_wait(rts); reengage_worker(rts, nworkers, self); } #if !BOSS_THIEF diff --git a/runtime/worker_coord.h b/runtime/worker_coord.h index ebde70e2..19b1e150 100644 --- a/runtime/worker_coord.h +++ b/runtime/worker_coord.h @@ -4,6 +4,7 @@ // Routines for coordinating workers, specifically, putting workers to sleep and // waking workers when execution enters and leaves cilkified regions. +#include #include #ifdef __linux__ @@ -34,8 +35,8 @@ // Convenience wrapper for futex syscall. 
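/*
 * [Editor's note -- sketch, assumed definition.]  The scheduler changes
 * above funnel the thief's busy-waiting through busy_loop_pause() and the
 * new steal_short_pause() helper.  busy_loop_pause() itself is not shown in
 * this patch; a typical definition of such a spin-wait hint (an assumption,
 * not OpenCilk's actual code) looks like:
 */
static inline void example_busy_loop_pause(void) {
#if defined(__x86_64__) || defined(__i386__)
    __builtin_ia32_pause(); /* x86 PAUSE: be polite to the sibling hyperthread */
#elif defined(__aarch64__)
    __asm__ volatile("yield"); /* AArch64 yield hint */
#else
    /* no-op on other targets */
#endif
}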
static inline long futex(_Atomic uint32_t *uaddr, int futex_op, uint32_t val, - const struct timespec *timeout, uint32_t *uaddr2, - uint32_t val3) { + const struct timespec *timeout, uint32_t *uaddr2, + uint32_t val3) { return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr2, val3); } @@ -74,7 +75,7 @@ static inline void fbroadcast(_Atomic uint32_t *futexp) { // Called by a worker thread. Causes the worker thread to wait on the given // flag-futex pair. static inline void worker_wait(volatile atomic_bool *flag, - _Atomic uint32_t *flag_futex) { + _Atomic uint32_t *flag_futex) { while (!atomic_load_explicit(flag, memory_order_acquire)) { fwait(flag_futex); } @@ -82,7 +83,7 @@ static inline void worker_wait(volatile atomic_bool *flag, // Start all workers waiting on the given flag-futex pair. static inline void worker_start_broadcast(volatile atomic_bool *flag, - _Atomic uint32_t *flag_futex) { + _Atomic uint32_t *flag_futex) { atomic_store_explicit(flag, 1, memory_order_release); fbroadcast(flag_futex); } @@ -90,7 +91,7 @@ static inline void worker_start_broadcast(volatile atomic_bool *flag, // Reset the given flag-futex pair, so that workers will eventually resume // waiting on that flag-futex pair. static inline void worker_clear_start(volatile atomic_bool *flag, - _Atomic uint32_t *flag_futex) { + _Atomic uint32_t *flag_futex) { atomic_store_explicit(flag, 0, memory_order_relaxed); atomic_store_explicit(flag_futex, 0, memory_order_relaxed); } @@ -103,8 +104,8 @@ static inline void worker_clear_start(volatile atomic_bool *flag, // Called by a worker thread. Causes the worker thread to wait on the given // flag and associated mutex and condition variable. static inline void worker_wait(volatile atomic_bool *flag, - pthread_mutex_t *flag_lock, - pthread_cond_t *flag_cond_var) { + pthread_mutex_t *flag_lock, + pthread_cond_t *flag_cond_var) { pthread_mutex_lock(flag_lock); while (!atomic_load_explicit(flag, memory_order_acquire)) { pthread_cond_wait(flag_cond_var, flag_lock); @@ -115,8 +116,8 @@ static inline void worker_wait(volatile atomic_bool *flag, // Start all workers waiting on the given flag and associated mutex and // condition variable. static inline void worker_start_broadcast(volatile atomic_bool *flag, - pthread_mutex_t *flag_lock, - pthread_cond_t *flag_cond_var) { + pthread_mutex_t *flag_lock, + pthread_cond_t *flag_cond_var) { pthread_mutex_lock(flag_lock); atomic_store_explicit(flag, 1, memory_order_release); pthread_cond_broadcast(flag_cond_var); @@ -352,7 +353,7 @@ static inline void request_more_thieves(global_state *g, uint32_t count) { } #if USE_FUTEX -static inline void thief_disengage_futex(_Atomic uint32_t *futexp) { +static inline uint32_t thief_disengage_futex(_Atomic uint32_t *futexp) { // This step synchronizes with calls to request_more_thieves. while (true) { // Decrement the futex when woken up. 
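/*
 * [Editor's note -- Linux-only sketch with assumed details.]  The futex()
 * wrapper and the fwait/fbroadcast helpers touched above boil down to the
 * following self-contained pattern: FUTEX_WAIT blocks while *uaddr still
 * equals the expected value, and FUTEX_WAKE wakes up to INT_MAX waiters.
 * Names prefixed example_ are hypothetical.
 */
#include <limits.h>
#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static void example_fwait(_Atomic uint32_t *uaddr, uint32_t expected) {
    /* Returns when *uaddr != expected, on a wake-up, or spuriously. */
    syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, expected, NULL, NULL, 0);
}

static void example_fbroadcast(_Atomic uint32_t *uaddr) {
    syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0);
}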
The loop and compare-exchange are @@ -363,7 +364,7 @@ static inline void thief_disengage_futex(_Atomic uint32_t *futexp) { if (atomic_compare_exchange_strong_explicit(futexp, &val, val - 1, memory_order_release, memory_order_acquire)) { - return; + return val; } } @@ -374,9 +375,9 @@ static inline void thief_disengage_futex(_Atomic uint32_t *futexp) { } } #else -static inline void thief_disengage_cond_var(_Atomic uint32_t *count, - pthread_mutex_t *lock, - pthread_cond_t *cond_var) { +static inline uint32_t thief_disengage_cond_var(_Atomic uint32_t *count, + pthread_mutex_t *lock, + pthread_cond_t *cond_var) { // This step synchronizes with calls to request_more_thieves. pthread_mutex_lock(lock); while (true) { @@ -384,18 +385,19 @@ static inline void thief_disengage_cond_var(_Atomic uint32_t *count, if (val > 0) { atomic_store_explicit(count, val - 1, memory_order_release); pthread_mutex_unlock(lock); - return; + return val; } pthread_cond_wait(cond_var, lock); } } #endif -static inline void thief_disengage(global_state *g) { +static inline uint32_t thief_disengage(global_state *g) { #if USE_FUTEX - thief_disengage_futex(&g->disengaged_thieves_futex); + return thief_disengage_futex(&g->disengaged_thieves_futex); #else - thief_disengage_cond_var(&g->disengaged_thieves_futex, &g->disengaged_lock, - &g->disengaged_cond_var); + return thief_disengage_cond_var(&g->disengaged_thieves_futex, + &g->disengaged_lock, + &g->disengaged_cond_var); #endif } @@ -425,8 +427,8 @@ static inline void sleep_thieves(global_state *g) { // Called by a thief thread. Causes the thief thread to wait for a signal to // start work-stealing. -static inline void thief_wait(global_state *g) { - thief_disengage(g); +static inline uint32_t thief_wait(global_state *g) { + return thief_disengage(g); } // Called by a thief thread. Check if the thief should start waiting for the diff --git a/runtime/worker_sleep.h b/runtime/worker_sleep.h index 236c2774..f3d074ca 100644 --- a/runtime/worker_sleep.h +++ b/runtime/worker_sleep.h @@ -1,7 +1,10 @@ #ifndef _WORKER_SLEEP_H #define _WORKER_SLEEP_H +#include + #include "cilk-internal.h" +#include "sched_stats.h" #include "worker_coord.h" #if defined(__APPLE__) && defined(__aarch64__) @@ -14,10 +17,11 @@ // Nanoseconds that a sentinel worker should sleep if it reaches the disengage // threshold but does not disengage. -/* #define SLEEP_NSEC 12500 */ +/* #define NAP_NSEC 12500 */ #define NAP_NSEC 25000 -/* #define SLEEP_NSEC 50000 */ -#define SLEEP_NSEC 4 * NAP_NSEC +/* #define NAP_NSEC 50000 */ +/* #define SLEEP_NSEC 4 * NAP_NSEC */ +#define SLEEP_NSEC NAP_NSEC // Ratio of active workers over sentinels that the system aims to maintain. #define AS_RATIO 2 @@ -48,15 +52,44 @@ typedef uint32_t history_t; // attempts don't take too much memory bandwidth away from the workers doing // work. #define STEAL_BUSY_PAUSE 16 +#define LONG_STEAL_BUSY_PAUSE 200 static inline __attribute__((always_inline)) uint64_t gettime_fast(void) { + // __builtin_readcyclecounter triggers "illegal instruction" errors on ARM64 + // chips, unless user-level access to the cycle counter has been enabled in + // the kernel. Since we cannot rely on that, we use other means to measure + // the time. #ifdef APPLE_ARM64 - // __builtin_readcyclecounter triggers "illegal instruction" runtime errors - // on Apple M1s. 
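/*
 * [Editor's note -- sketch with hypothetical names.]  A key change above is
 * that thief_disengage/thief_wait now return the counter value the thief
 * observed when it claimed its wake-up token; the scheduler stores it in
 * wake_val and later feeds it to init_fails.  The "claim one token, report
 * what you saw" idea, stripped down to a condition-variable version:
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

static uint32_t example_claim_token(_Atomic uint32_t *count,
                                    pthread_mutex_t *lock,
                                    pthread_cond_t *cv) {
    pthread_mutex_lock(lock);
    uint32_t val;
    while ((val = atomic_load(count)) == 0)
        pthread_cond_wait(cv, lock); /* sleep until a token is posted */
    atomic_store(count, val - 1);    /* consume exactly one token */
    pthread_mutex_unlock(lock);
    return val; /* the observed count doubles as a hint for the caller */
}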
return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); +#elif defined(__aarch64__) + struct timespec res; + clock_gettime(CLOCK_MONOTONIC_RAW, &res); + return (res.tv_sec * 1e9) + (res.tv_nsec); #else return __builtin_readcyclecounter(); -#endif // #if APPLE_ARM64 +#endif +} + +static inline __attribute__((always_inline)) void steal_short_pause(void) { + // We perform many pause instructions in order to limit how much memory + // bandwidth and other computing resources the thief consumes. +#if defined(__aarch64__) + for (int i = 0; i < STEAL_BUSY_PAUSE; ++i) { + busy_loop_pause(); + } +#else + uint64_t start = __builtin_readcyclecounter(); + do { + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + } while ((__builtin_readcyclecounter() - start) < 1600); +#endif } typedef struct worker_counts { @@ -193,7 +226,7 @@ is_efficient(worker_counts counts) { // Convert the elapsed time spent working into a fail count. __attribute__((const, always_inline)) static inline unsigned int get_scaled_elapsed(unsigned int elapsed) { -#ifdef APPLE_ARM64 +#if defined(__aarch64__) return ((elapsed * (1 * SENTINEL_THRESHOLD) / (16 * 65536)) / ATTEMPTS) * ATTEMPTS; #else @@ -208,7 +241,7 @@ __attribute__((always_inline)) static inline unsigned int maybe_reengage_workers(global_state *const rts, worker_id self, unsigned int nworkers, __cilkrts_worker *const w, unsigned int fails, - unsigned int *const request_threshold, + unsigned int *const sample_threshold, history_t *const inefficient_history, history_t *const efficient_history, unsigned int *const sentinel_count_history, @@ -230,7 +263,7 @@ maybe_reengage_workers(global_state *const rts, worker_id self, history_t my_efficient_history = *efficient_history; history_t my_inefficient_history = *inefficient_history; unsigned int my_sentinel_count = *recent_sentinel_count; - if (fails >= *request_threshold) { + if (fails >= *sample_threshold) { // Update the inefficient history. history_t curr_ineff = is_inefficient(counts); my_inefficient_history = (my_inefficient_history >> 1) | @@ -286,11 +319,13 @@ maybe_reengage_workers(global_state *const rts, worker_id self, // Make sure at least 1 worker is requested if we're about to run // out of sentinels. if (request == 0 && counts.sentinels == 0 && - counts.active < (int32_t)nworkers && - !atomic_load_explicit(&rts->disengaged_thieves_futex, - memory_order_relaxed)) { - request = (counts.active + 3) / 4; - WHEN_SCHED_STATS(w->l->stats.onesen_rqsts += request); + counts.active < (int32_t)nworkers) { + int32_t current_request = atomic_load_explicit( + &rts->disengaged_thieves_futex, memory_order_relaxed); + if (current_request < ((counts.active + 3) / 4)) { + request = ((counts.active + 3) / 4) - current_request; + WHEN_SCHED_STATS(w->l->stats.onesen_rqsts += request); + } } if (request > 0) { @@ -298,13 +333,14 @@ maybe_reengage_workers(global_state *const rts, worker_id self, } // Set a cap on the fail count. - if (fails > DISENGAGE_THRESHOLD) - fails = DISENGAGE_THRESHOLD; + if (fails > SENTINEL_THRESHOLD) { + fails = SENTINEL_THRESHOLD; + } // Update request threshold so that, in case this worker ends up // executing a small task, it still adds samples to its history that // are spread out in time. 
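/*
 * [Editor's note -- sketch; the window length is an assumption.]  The
 * efficient/inefficient histories updated above are shift registers of
 * recent boolean samples: each new sample is pushed in at the top bit and
 * the oldest sample falls off the bottom.  Illustration with an assumed
 * 32-sample window:
 */
#include <stdint.h>

#define EXAMPLE_HISTORY_LENGTH 32

static inline uint32_t example_push_sample(uint32_t history, int sample) {
    return (history >> 1) |
           ((uint32_t)(sample != 0) << (EXAMPLE_HISTORY_LENGTH - 1));
}

static inline int example_recent_true_count(uint32_t history) {
    return __builtin_popcount(history); /* how many of the last 32 were true */
}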
- *request_threshold = fails + (SENTINEL_THRESHOLD / 1); + *sample_threshold = fails + (SENTINEL_THRESHOLD / 1); } return fails; @@ -350,7 +386,7 @@ __attribute__((always_inline)) static inline unsigned int handle_failed_steal_attempts(global_state *const rts, worker_id self, unsigned int nworkers, __cilkrts_worker *const w, unsigned int fails, - unsigned int *const request_threshold, + unsigned int *const sample_threshold, history_t *const inefficient_history, history_t *const efficient_history, unsigned int *const sentinel_count_history, @@ -382,8 +418,7 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self, #if BOSS_THIEF if (is_boss_thread) { if (fails % NAP_THRESHOLD == 0) { - // The boss thread should never disengage. Sleep - // instead. + // The boss thread should never disengage. Sleep instead. const struct timespec sleeptime = { .tv_sec = 0, .tv_nsec = @@ -394,6 +429,7 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self, #else { #endif +#if ENABLE_THIEF_SLEEP // Check if the current worker counts. uint64_t disengaged_sentinel = atomic_load_explicit( &rts->disengaged_sentinel, memory_order_acquire); @@ -455,13 +491,16 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self, } // Update fail count - if (scaled_elapsed < SENTINEL_THRESHOLD) + if (scaled_elapsed < SENTINEL_THRESHOLD) { fails -= scaled_elapsed; - else { + } else { fails = DISENGAGE_THRESHOLD - SENTINEL_THRESHOLD; } - *request_threshold = SENTINEL_THRESHOLD; + *sample_threshold = SENTINEL_THRESHOLD; } +#else + if (false) { +#endif } else if (fails % NAP_THRESHOLD == 0) { // We have enough active workers to keep this worker out of // disengage, but this worker was still unable to steal @@ -470,24 +509,48 @@ handle_failed_steal_attempts(global_state *const rts, worker_id self, // approximately 50 us. const struct timespec sleeptime = { .tv_sec = 0, - .tv_nsec = + .tv_nsec = (fails > SLEEP_THRESHOLD) ? SLEEP_NSEC : NAP_NSEC}; nanosleep(&sleeptime, NULL); } else { // We perform many pause instructions to reduce the thief's // load on the system in a lightweight manner. - for (int i = 0; i < 8 * ATTEMPTS; ++i) { +#if defined(__aarch64__) + for (int i = 0; i < STEAL_BUSY_PAUSE; ++i) { busy_loop_pause(); } +#else + uint64_t start = __builtin_readcyclecounter(); + do { + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + } while ((__builtin_readcyclecounter() - start) < 800); +#endif } } } } else { // We perform many pause instructions to reduce the thief's load on // the system in a lightweight manner. 
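/*
 * [Editor's note -- minimal sketch.]  The nap/sleep paths above back off
 * with nanosleep for NAP_NSEC or SLEEP_NSEC nanoseconds instead of
 * disengaging.  The same call in isolation:
 */
#include <time.h>

static void example_nap(long nsec) {
    struct timespec ts = {.tv_sec = 0, .tv_nsec = nsec};
    nanosleep(&ts, NULL); /* ignore early wake-ups (EINTR) in this sketch */
}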
- for (int i = 0; i < 32 * ATTEMPTS; ++i) { +#if defined(__aarch64__) + for (int i = 0; i < LONG_STEAL_BUSY_PAUSE; ++i) { busy_loop_pause(); } +#else + uint64_t start = __builtin_readcyclecounter(); + do { + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + busy_loop_pause(); + } while ((__builtin_readcyclecounter() - start) < 3600); +#endif } CILK_STOP_TIMING(w, INTERVAL_SLEEP); return fails; @@ -498,7 +561,7 @@ static unsigned int go_to_sleep_maybe(global_state *const rts, worker_id self, unsigned int nworkers, __cilkrts_worker *const w, Closure *const t, unsigned int fails, - unsigned int *const request_threshold, + unsigned int *const sample_threshold, history_t *const inefficient_history, history_t *const efficient_history, unsigned int *const sentinel_count_history, @@ -506,12 +569,12 @@ static unsigned int go_to_sleep_maybe(global_state *const rts, worker_id self, unsigned int *const recent_sentinel_count) { if (t) { return maybe_reengage_workers( - rts, self, nworkers, w, fails, request_threshold, + rts, self, nworkers, w, fails, sample_threshold, inefficient_history, efficient_history, sentinel_count_history, sentinel_count_history_tail, recent_sentinel_count); } else { return handle_failed_steal_attempts( - rts, self, nworkers, w, fails, request_threshold, + rts, self, nworkers, w, fails, sample_threshold, inefficient_history, efficient_history, sentinel_count_history, sentinel_count_history_tail, recent_sentinel_count); } @@ -521,7 +584,7 @@ static unsigned int go_to_sleep_maybe(global_state *const rts, worker_id self, __attribute__((always_inline)) static unsigned int decrease_fails_by_work(global_state *const rts, __cilkrts_worker *const w, unsigned int fails, uint64_t elapsed, - unsigned int *const request_threshold) { + unsigned int *const sample_threshold) { uint64_t scaled_elapsed = get_scaled_elapsed(elapsed); // Decrease the number of fails based on the work done. @@ -535,10 +598,10 @@ decrease_fails_by_work(global_state *const rts, __cilkrts_worker *const w, // work. CILK_ASSERT(w, fails % ATTEMPTS == 0); - if (scaled_elapsed > (uint64_t)(*request_threshold) - SENTINEL_THRESHOLD) - *request_threshold = SENTINEL_THRESHOLD; + if (scaled_elapsed > (uint64_t)(*sample_threshold) - SENTINEL_THRESHOLD) + *sample_threshold = SENTINEL_THRESHOLD; else - *request_threshold -= scaled_elapsed; + *sample_threshold -= scaled_elapsed; // If this worker is still sentinel, update sentinel-worker count. if (fails >= SENTINEL_THRESHOLD) @@ -548,6 +611,26 @@ decrease_fails_by_work(global_state *const rts, __cilkrts_worker *const w, } #endif // ENABLE_THIEF_SLEEP +__attribute__((always_inline)) static unsigned int +init_fails(uint32_t wake_val, global_state *rts) { + // It's possible that a disengaged worker is woken up by a call to + // request_more_thieves, in which case it should be a sentinel. But there + // isn't a direct way to tell how whether the worker should be active or a + // sentinel when it's woken up. Since the maximum value of the futex when + // sentinels are engaging and disengaging during Cilk execution is + // nworkers/2, we simply assume that if the value of the futex is less than + // that value, then it should be a sentinel. + // + // As a result, when workers are woken up to start executing any new Cilk + // function, half of them will be active, and half sentinels. 
+ if (wake_val <= (rts->nworkers / 2)) { + atomic_fetch_add_explicit(&rts->disengaged_sentinel, 1, + memory_order_release); + return SENTINEL_THRESHOLD; + } + return 0; +} + __attribute__((always_inline)) static unsigned int reset_fails(global_state *rts, unsigned int fails) { if (fails >= SENTINEL_THRESHOLD) { @@ -574,7 +657,7 @@ disengage_worker(global_state *g, unsigned int nworkers, worker_id self) { cilk_mutex_unlock(&g->index_lock); } - __attribute__((always_inline)) static inline void +__attribute__((always_inline)) static inline void reengage_worker(global_state *g, unsigned int nworkers, worker_id self) { cilk_mutex_lock(&g->index_lock); uint64_t disengaged_sentinel = atomic_fetch_sub_explicit(
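/*
 * [Editor's note -- sketch; the packing layout is an assumption, not stated
 * in this patch.]  init_fails, disengage_worker, and reengage_worker above
 * all touch rts->disengaged_sentinel, a single 64-bit atomic that appears
 * to pack two 32-bit counts so both can be read consistently in one load.
 * The packing idiom in isolation, with hypothetical helpers:
 */
#include <stdint.h>

static inline uint32_t example_high32(uint64_t packed) {
    return (uint32_t)(packed >> 32);         /* e.g., count of disengaged workers */
}

static inline uint32_t example_low32(uint64_t packed) {
    return (uint32_t)(packed & 0xFFFFFFFFu); /* e.g., count of sentinel workers */
}

static inline uint64_t example_pack(uint32_t high, uint32_t low) {
    return ((uint64_t)high << 32) | low;
}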