From 82a5c4b9ef5eb5e3fa98a56b12e93063aeea66e8 Mon Sep 17 00:00:00 2001
From: taylor-bsg
Date: Fri, 3 Sep 2021 15:04:00 -0700
Subject: [PATCH 1/3] Merge pull request #570 from bespoke-silicon-group/mcs_mutex

MCS mutex
---
 software/bsg_manycore_lib/bsg_mcs_mutex.S    | 130 +++++++++++++++++
 software/bsg_manycore_lib/bsg_mcs_mutex.h    |  60 ++++++++
 software/bsg_manycore_lib/bsg_mcs_mutex.hpp  | 140 +++++++++++++++++++
 software/spmd/bsg_mcs_mutex_test/Makefile    |  19 +++
 software/spmd/bsg_mcs_mutex_test/main.cpp    |  43 ++++++
 software/spmd/bsg_simple_mutex_test/Makefile |  19 +++
 software/spmd/bsg_simple_mutex_test/main.cpp |  52 +++++++
 7 files changed, 463 insertions(+)
 create mode 100644 software/bsg_manycore_lib/bsg_mcs_mutex.S
 create mode 100644 software/bsg_manycore_lib/bsg_mcs_mutex.h
 create mode 100644 software/bsg_manycore_lib/bsg_mcs_mutex.hpp
 create mode 100644 software/spmd/bsg_mcs_mutex_test/Makefile
 create mode 100644 software/spmd/bsg_mcs_mutex_test/main.cpp
 create mode 100644 software/spmd/bsg_simple_mutex_test/Makefile
 create mode 100644 software/spmd/bsg_simple_mutex_test/main.cpp

diff --git a/software/bsg_manycore_lib/bsg_mcs_mutex.S b/software/bsg_manycore_lib/bsg_mcs_mutex.S
new file mode 100644
index 000000000..6918805e6
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_mcs_mutex.S
@@ -0,0 +1,130 @@
+// MCS mutex
+// Author: Max
+//
+// This is an implementation of the MCS mutex, inspired in part by Mellor-Crummey and Scott's 1991 paper
+// "Algorithms for Scalable Synchronization on Shared-Memory Multiprocessors".
+//
+// This is a spinlock mutex, but unlike a simple spinlock, in which all threads update and spin on
+// a single memory location, the MCS lock builds a linked list of memory locations local to each core.
+//
+// Cores atomically append their local memory region to the global list using an unconditional
+// amoswap operation. They then spin on their local memories, waiting for a predecessor in the queue
+// to notify them that they now hold the lock.
+//
+// Once a core has completed its critical region, it checks for a successor and releases the lock to it.
+//
+// The advantages of this mutex over a simple spinlock on the manycore are twofold:
+//
+// (1) It greatly reduces the number of memory requests on the network, and it mitigates the extent to which
+// a single memory bank becomes a hot-spot. The number of requests issued to the memory bank containing the
+// lock object is linear in the number of times an acquire operation is executed.
+//
+// (2) The lock approximates a FIFO, which improves fairness. A simple spinlock on the manycore
+// will favor threads topologically closer to the memory bank in which the lock resides and can lead to
+// starvation of the other cores.
+//
+// This lock is by no means perfect. For locks with low contention, a simple spinlock may result in better performance.
+
+        .text
+        .globl bsg_mcs_mutex_acquire
+        // Refer to bsg_mcs_mutex.h for a detailed description of usage.
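+        //
+        // For orientation, the routine below implements roughly the following
+        // C-level logic (an illustrative sketch only; 'amoswap_aq' stands in
+        // for the amoswap.w.aq instruction):
+        //
+        //   lcl->next     = 0;
+        //   lcl->unlocked = 0;
+        //   pred = amoswap_aq(mtx, lcl_as_glbl); // enqueue ourselves at the tail
+        //   if (pred != 0) {                     // someone is ahead of us
+        //       pred->next = lcl_as_glbl;        // ask them to notify us
+        //       while (!lcl->unlocked);          // spin on our own DMEM word
+        //   }
+        //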
+        // a0 = mtx         : bsg_mcs_mutex_t*, points to DRAM
+        // a1 = lcl         : bsg_mcs_mutex_node_t*, local pointer that points to DMEM
+        // a2 = lcl_as_glbl : bsg_mcs_mutex_node_t*, global pointer to the same location as 'lcl'
+bsg_mcs_mutex_acquire:
+        sw x0, 4(a1)                       // lcl->next = 0
+        sw x0, 0(a1)                       // lcl->unlocked = 0
+        amoswap.w.aq t0, a2, 0(a0)         // predecessor = swap(&mtx, lcl_as_glbl)
+        beqz t0, bsg_mcs_mutex_acquire_ret // return if predecessor == 0
+        sw a2, 4(t0)                       // predecessor->next = lcl_as_glbl
+bsg_mcs_mutex_acquire_loop:
+        // Here we use the load-on-broken-reservation semantics to avoid
+        // busy waiting. This reduces the dynamic energy of the core
+        // and removes contention on our local memory from updates by
+        // other cores, including the update from our predecessor
+        // when it releases the lock to us.
+        //
+        // The expected wait time for this load is arbitrarily long, as it depends
+        // on (1) the time it takes client code to complete the critical region
+        // and (2) the contention on this lock.
+        // We expect the wait time to be on the order of 20-100 cycles in the
+        // case where there is low contention on the lock.
+        lr.w t0, (a1)                      // unlocked = lcl->unlocked
+        bnez t0, bsg_mcs_mutex_acquire_ret // return if unlocked
+        lr.w.aq t0, (a1)                   // unlocked = lcl->unlocked
+        // MBT: a backwards predict-not-taken branch variant would be helpful here
+        //
+        // MBT: if we supported context switching the reservation register, then we would
+        // not even need this branch (i.e. a blind synchronization); but currently, if there were
+        // a context switch, the reservation would be cleared and we would need this branch
+        // to go back to the lr.w to reprime the reservation.
+        //
+        // MBT: if lr.w.aq re-primed the reservation AND for some reason we did need to examine
+        // the sentinel value AND we supported context switching the reservation register, then we could
+        // just jump back to the lr.w.aq instruction
+        beqz t0, bsg_mcs_mutex_acquire_loop // while (!unlocked)
+bsg_mcs_mutex_acquire_ret:
+        ret
+
+        .globl bsg_mcs_mutex_release
+        // Refer to bsg_mcs_mutex.h for a detailed description of usage.
+        // a0 = mtx         : bsg_mcs_mutex_t*, points to DRAM
+        // a1 = lcl         : bsg_mcs_mutex_node_t*, local pointer that points to DMEM
+        // a2 = lcl_as_glbl : bsg_mcs_mutex_node_t*, global pointer to the same location as 'lcl'
+bsg_mcs_mutex_release:
+        lw t0, 4(a1)                       // next = lcl->next
+        li t1, 1                           // t1 = 1
+        beqz t0, bsg_mcs_mutex_release_no_successor // branch if no successor
+        // this is the case where there is a successor;
+        // we need only unlock the successor and return
+        fence                              // fence to implement release semantics
+        sw t1, 0(t0)                       // successor->unlocked = 1
+        ret
+bsg_mcs_mutex_release_no_successor:
+        // this is the case where there is no known successor:
+        // attempt to swap out the tail pointer with 0
+        //
+        // Max: the more common version of this mutex assumes a compare-and-swap (CAS)
+        // instruction is supported by the architecture. The semantics of CAS are as follows:
+        //
+        //   CAS success, test_value, write_value, address
+        //
+        // atomically reads mem[address] and, only if it is equal to test_value, writes write_value
+        // to mem[address]. success is set to one if the swap occurred, and to zero to indicate failure.
+        //
+        // Here CAS could be used as follows: CAS pointed_to_me, lcl_as_glbl, nullptr, &mtx
+        // This would prevent us from accidentally removing victims from the queue,
+        // and would allow us to just check the swap condition; if it failed, set our
+        // successor's unlocked bit to one and return.
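+        //
+        // In C-like pseudocode, the CAS-based release sketched above would read
+        // roughly as follows ('cas' is a hypothetical helper, shown only for
+        // illustration; no such instruction exists on this architecture):
+        //
+        //   if (!cas(&mtx, /*test=*/lcl_as_glbl, /*write=*/0)) {
+        //       // swap failed: a successor enqueued itself behind us
+        //       while (lcl->next == 0);  // wait for it to publish its node
+        //       lcl->next->unlocked = 1; // hand the lock to our successor
+        //   }
+        //   // swap succeeded: the queue is empty and the lock is free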
+        //
+        // We don't support a CAS instruction for a couple of reasons. First, it's not
+        // part of the RISC-V ISA, which specifies that lr-sc be used instead.
+        // We don't believe lr-sc maps well to a manycore architecture.
+        // Second, a CAS instruction would require a big refactor of our network links,
+        // because we would need to send an extra data word (the test value).
+        amoswap.w.rl t2, x0, 0(a0)         // victim_tail = swap(&mtx, 0)
+        bne t2, a2, bsg_mcs_mutex_release_exists_victim // branch if victim_tail != lcl_as_glbl
+        ret                                // there really is no successor -- return
+bsg_mcs_mutex_release_exists_victim:
+        // someone added themselves to the queue and we have removed them;
+        // we need to put them back
+        amoswap.w t2, t2, 0(a0)            // usurper = swap(&mtx, victim_tail)
+bsg_mcs_mutex_release_wait_on_successor:
+        // Here we do not use the load-on-broken-reservation instructions,
+        // because if we are executing this code then there is a successor
+        // that has executed the `amoswap.w.aq` instruction found in the acquire
+        // function and is in the process of updating the 'next' pointer
+        // that we are polling.
+        // We expect the wait time here to be on the order of tens of cycles at worst.
+        // Additionally, this is a corner case that we don't expect to execute often,
+        // and the use of the LBR semantics would increase the instruction footprint by
+        // three ops.
+        lw t0, 4(a1)                       // next = lcl->next
+        beqz t0, bsg_mcs_mutex_release_wait_on_successor // while (lcl->next == 0)
+        bnez t2, bsg_mcs_mutex_release_exists_usurper // was there an usurper?
+        // no usurper exists -- unlock our successor
+        sw t1, 0(t0)                       // next->unlocked = 1
+        ret
+bsg_mcs_mutex_release_exists_usurper:
+        // an usurper exists -- set the victims as its successors
+        sw t0, 4(t2)                       // usurper->next = next
+        ret
diff --git a/software/bsg_manycore_lib/bsg_mcs_mutex.h b/software/bsg_manycore_lib/bsg_mcs_mutex.h
new file mode 100644
index 000000000..3b4ecf105
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_mcs_mutex.h
@@ -0,0 +1,60 @@
+#pragma once
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    // Must live in the tile's local memory (DMEM).
+    // Do not reorder the members of this struct:
+    // the assembly code in bsg_mcs_mutex.S depends on this ordering.
+    typedef struct bsg_mcs_mutex_node {
+        int unlocked;
+        struct bsg_mcs_mutex_node *next;
+    } bsg_mcs_mutex_node_t;
+
+    // Must live in DRAM
+    typedef bsg_mcs_mutex_node_t* bsg_mcs_mutex_t;
+
+    /**
+     * Acquire the mutex; returns when the lock has been acquired.
+     * @param mtx         A pointer to an MCS mutex (must be in DRAM)
+     * @param lcl         A local pointer to a node allocated in the tile's local memory
+     * @param lcl_as_glbl A global pointer to the same location as lcl
+     *
+     * lcl_as_glbl must point to the same memory as lcl, and it must be addressable by the other cores
+     * with whom the mutex is to be shared.
+     *
+     * The most common use case would be a mutex shared within a tile group, in which case a
+     * tile-group shared pointer should be used (see bsg_tile_group_remote_ptr).
+     *
+     * However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
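+     *
+     * For illustration, a tile-group-shared acquire/release pair might look
+     * like this (mirroring the usage in bsg_mcs_mutex_test/main.cpp):
+     *
+     *   bsg_mcs_mutex_t mtx __attribute__((section(".dram")));
+     *   bsg_mcs_mutex_node_t lcl;
+     *   bsg_mcs_mutex_node_t *lcl_as_glbl = (bsg_mcs_mutex_node_t*)
+     *       bsg_tile_group_remote_ptr(int, bsg_x, bsg_y, &lcl);
+     *   bsg_mcs_mutex_acquire(&mtx, &lcl, lcl_as_glbl);
+     *   // ... critical region ...
+     *   bsg_mcs_mutex_release(&mtx, &lcl, lcl_as_glbl);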
+     *
+     * Pointer casting macros can be found in bsg_manycore_arch.h
+     */
+    void bsg_mcs_mutex_acquire(bsg_mcs_mutex_t *mtx                //!< A pointer to an MCS mutex in DRAM
+                               , bsg_mcs_mutex_node_t *lcl         //!< A local pointer to a node allocated in the tile's local memory
+                               , bsg_mcs_mutex_node_t *lcl_as_glbl //!< A global pointer to a node allocated in the tile's local memory
+                               );
+
+    /**
+     * Release the mutex; returns when the lock has been released and the calling core no longer holds the lock.
+     * @param mtx         A pointer to an MCS mutex (must be in DRAM)
+     * @param lcl         A local pointer to a node allocated in the tile's local memory
+     * @param lcl_as_glbl A global pointer to the same location as lcl
+     *
+     * lcl_as_glbl must point to the same memory as lcl, and it must be addressable by the other cores
+     * with whom the mutex is to be shared.
+     *
+     * The most common use case would be a mutex shared within a tile group, in which case a
+     * tile-group shared pointer should be used (see bsg_tile_group_remote_ptr).
+     *
+     * However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
+     *
+     * Pointer casting macros can be found in bsg_manycore_arch.h
+     */
+    void bsg_mcs_mutex_release(bsg_mcs_mutex_t *mtx                //!< A pointer to an MCS mutex in DRAM
+                               , bsg_mcs_mutex_node_t *lcl         //!< A local pointer to a node allocated in the tile's local memory
+                               , bsg_mcs_mutex_node_t *lcl_as_glbl //!< A global pointer to a node allocated in the tile's local memory
+                               );
+#ifdef __cplusplus
+}
+#endif
diff --git a/software/bsg_manycore_lib/bsg_mcs_mutex.hpp b/software/bsg_manycore_lib/bsg_mcs_mutex.hpp
new file mode 100644
index 000000000..434ec179c
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_mcs_mutex.hpp
@@ -0,0 +1,140 @@
+// MCS mutex
+// Author: Max
+//
+// This is an implementation of the MCS mutex, inspired in part by Mellor-Crummey and Scott's 1991 paper
+// "Algorithms for Scalable Synchronization on Shared-Memory Multiprocessors".
+//
+// This is a spinlock mutex, but unlike a simple spinlock, in which all threads update and spin on
+// a single memory location, the MCS lock builds a linked list of memory locations local to each core.
+//
+// Cores atomically append their local memory region to the global list using an unconditional
+// amoswap operation. They then spin on their local memories, waiting for a predecessor in the queue
+// to notify them that they now hold the lock.
+//
+// Once a core has completed its critical region, it checks for a successor and releases the lock to it.
+//
+// The advantages of this mutex over a simple spinlock on the manycore are twofold:
+//
+// (1) It greatly reduces the number of memory requests on the network, and it mitigates the extent to which
+// a single memory bank becomes a hot-spot. The number of requests issued to the memory bank containing the
+// lock object is linear in the number of times an acquire operation is executed.
+//
+// (2) The lock approximates a FIFO, which improves fairness. A simple spinlock on the manycore
+// will favor threads topologically closer to the memory bank in which the lock resides and can lead to
+// starvation of the other cores.
+//
+// This lock is by no means perfect. For locks with low contention, a simple spinlock may result in better performance.
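+//
+// A note on the implementation below: the std::atomic exchanges with
+// memory_order_acquire/memory_order_release are intended to mirror the
+// amoswap.w.aq/amoswap.w.rl instructions used in bsg_mcs_mutex.S (this
+// assumes the compiler lowers the exchanges to RV32A amoswap, as GCC's
+// RISC-V backend does).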
+
+#pragma once
+#include <atomic>
+#include "bsg_manycore.h"
+#include "bsg_tile_config_vars.h"
+#include "bsg_tile_group_barrier.h"
+
+template <typename T>
+static T atomic_load(volatile T *ptr) {
+    return *ptr;
+}
+
+// Must live in the tile's local memory (DMEM).
+// Do not reorder the members of this struct:
+// the assembly code in bsg_mcs_mutex.S depends on this ordering.
+typedef struct bsg_mcs_mutex_node {
+    int unlocked;
+    struct bsg_mcs_mutex_node* next;
+} bsg_mcs_mutex_node_t;
+
+/**
+ * This object must live in global memory (DRAM).
+ */
+typedef std::atomic<bsg_mcs_mutex_node_t*> bsg_mcs_mutex_t;
+
+/**
+ * Acquire the mutex; returns when the lock has been acquired.
+ * @param mtx         A pointer to an MCS mutex (must be in DRAM)
+ * @param lcl         A local pointer to a node allocated in the tile's local memory
+ * @param lcl_as_glbl A global pointer to the same location as lcl
+ *
+ * lcl_as_glbl must point to the same memory as lcl, and it must be addressable by the other cores
+ * with whom the mutex is to be shared.
+ *
+ * The most common use case would be a mutex shared within a tile group, in which case a
+ * tile-group shared pointer should be used (see bsg_tile_group_remote_ptr).
+ *
+ * However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
+ *
+ * Pointer casting macros can be found in bsg_manycore_arch.h
+ */
+static void bsg_mcs_mutex_acquire(bsg_mcs_mutex_t *mtx
+                                  , bsg_mcs_mutex_node_t *lcl
+                                  , bsg_mcs_mutex_node_t *lcl_as_glbl)
+{
+    bsg_mcs_mutex_node_t *pred; // who's before us
+
+    lcl->next     = nullptr;
+    lcl->unlocked = 0;
+
+    pred = mtx->exchange(lcl_as_glbl, std::memory_order_acquire);
+    // was there someone before us in line?
+    if (pred != nullptr) {
+        // tell our predecessor to notify us when done
+        pred->next = lcl_as_glbl;
+        // wait on our unlocked variable
+        bsg_wait_local_int_asm(&lcl->unlocked, 1);
+    }
+}
+
+/**
+ * Release the mutex; returns when the lock has been released and the calling core no longer holds the lock.
+ * @param mtx         A pointer to an MCS mutex (must be in DRAM)
+ * @param lcl         A local pointer to a node allocated in the tile's local memory
+ * @param lcl_as_glbl A global pointer to the same location as lcl
+ *
+ * lcl_as_glbl must point to the same memory as lcl, and it must be addressable by the other cores
+ * with whom the mutex is to be shared.
+ *
+ * The most common use case would be a mutex shared within a tile group, in which case a
+ * tile-group shared pointer should be used (see bsg_tile_group_remote_ptr).
+ *
+ * However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
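+ *
+ * An illustrative corner case handled below: if this core sees next == nullptr
+ * and swaps the tail out, but the swap returns some other core's node, then a
+ * "victim" enqueued itself between our check and our swap. The victim queue
+ * must be re-attached -- behind an "usurper" that grabbed the freed tail if one
+ * appeared, or by unlocking the victim directly if none did.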
+ *
+ * Pointer casting macros can be found in bsg_manycore_arch.h
+ */
+static void bsg_mcs_mutex_release(bsg_mcs_mutex_t *mtx
+                                  , bsg_mcs_mutex_node_t *lcl
+                                  , bsg_mcs_mutex_node_t *lcl_as_glbl)
+{
+    // successor exists, unlock and return
+    if (lcl->next != nullptr) {
+        // fence and release
+        bsg_fence();
+        lcl->next->unlocked = 1;
+        return;
+    }
+
+    // no successor, the tail may still point to us:
+    // attempt to swap out the tail pointer with null
+    bsg_mcs_mutex_node_t *vic_tail;
+    vic_tail = mtx->exchange(nullptr, std::memory_order_release);
+    if (vic_tail == lcl_as_glbl) {
+        // there's still no successor
+        return;
+    }
+
+    // a successor added itself to the queue;
+    // we have to put it back
+    bsg_mcs_mutex_node_t *usurper;
+    usurper = mtx->exchange(vic_tail, std::memory_order_release);
+
+    // wait for our next pointer to point to the head of the victim queue
+    while (atomic_load(&lcl->next) == nullptr);
+
+    // did someone else get in line in the meantime?
+    if (usurper == nullptr) {
+        lcl->next->unlocked = 1;
+        return;
+    }
+
+    // an usurper exists -- add the victims behind it
+    usurper->next = lcl->next;
+}
diff --git a/software/spmd/bsg_mcs_mutex_test/Makefile b/software/spmd/bsg_mcs_mutex_test/Makefile
new file mode 100644
index 000000000..e3c2be048
--- /dev/null
+++ b/software/spmd/bsg_mcs_mutex_test/Makefile
@@ -0,0 +1,19 @@
+export BSG_MANYCORE_DIR := $(shell git rev-parse --show-toplevel)
+
+# Running tests on full manycore array. Uncomment and modify for a smaller array
+# bsg_tiles_X = 2
+# bsg_tiles_Y = 2
+
+RISCV_GXX_EXTRA_OPTS += -DITERS=16
+
+include $(BSG_MANYCORE_DIR)/software/mk/Makefile.master
+include $(BSG_MANYCORE_DIR)/software/mk/Makefile.tail_rules
+
+OBJECT_FILES=main.o
+
+all: main.run
+
+main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o
+	$(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS)
+
+main.o: Makefile
diff --git a/software/spmd/bsg_mcs_mutex_test/main.cpp b/software/spmd/bsg_mcs_mutex_test/main.cpp
new file mode 100644
index 000000000..067f66e1c
--- /dev/null
+++ b/software/spmd/bsg_mcs_mutex_test/main.cpp
@@ -0,0 +1,43 @@
+#include "bsg_manycore.h"
+#include "bsg_set_tile_x_y.h"
+#include "bsg_manycore_atomic.h"
+
+#ifndef ITERS
+#error "define ITERS"
+#endif
+
+#define BSG_TILE_GROUP_X_DIM bsg_tiles_X
+#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y
+#include "bsg_tile_group_barrier.h"
+#include "bsg_mcs_mutex.hpp"
+
+INIT_TILE_GROUP_BARRIER(r_barrier, c_barrier, 0, bsg_tiles_X-1, 0, bsg_tiles_Y-1);
+
+volatile int data __attribute__((section(".dram"))) = 0;
+
+bsg_mcs_mutex_t mtx __attribute__((section(".dram")));
+
+int main()
+{
+
+    bsg_set_tile_x_y();
+
+    bsg_mcs_mutex_node_t lcl, *lcl_as_glbl = (bsg_mcs_mutex_node_t*)bsg_tile_group_remote_ptr(int, bsg_x, bsg_y, &lcl);
+
+    for (int i = 0; i < ITERS; i++) {
+        bsg_mcs_mutex_acquire(&mtx, &lcl, lcl_as_glbl);
+        data += 1;
+        bsg_mcs_mutex_release(&mtx, &lcl, lcl_as_glbl);
+    }
+
+    bsg_tile_group_barrier(&r_barrier, &c_barrier);
+    if (bsg_x == 0 && bsg_y == 0) {
+        bsg_print_int(data);
+        if (data != ITERS * bsg_tiles_X * bsg_tiles_Y)
+            bsg_fail();
+        else
+            bsg_finish();
+    }
+
+    bsg_wait_while(1);
+}
diff --git a/software/spmd/bsg_simple_mutex_test/Makefile b/software/spmd/bsg_simple_mutex_test/Makefile
new file mode 100644
index 000000000..b0253390d
--- /dev/null
+++ b/software/spmd/bsg_simple_mutex_test/Makefile
@@ -0,0 +1,19 @@
+export BSG_MANYCORE_DIR := $(shell git rev-parse --show-toplevel)
+
+# Running tests on full manycore array. 
Uncomment and modify for a smaller array +# bsg_tiles_X = 2 +# bsg_tiles_Y = 2 + +RISCV_GXX_EXTRA_OPTS = -DITERS=16 + +include $(BSG_MANYCORE_DIR)/software/mk/Makefile.master +include $(BSG_MANYCORE_DIR)/software/mk/Makefile.tail_rules + +OBJECT_FILES=main.o + +all: main.run + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + +main.o: Makefile diff --git a/software/spmd/bsg_simple_mutex_test/main.cpp b/software/spmd/bsg_simple_mutex_test/main.cpp new file mode 100644 index 000000000..9f3a5e100 --- /dev/null +++ b/software/spmd/bsg_simple_mutex_test/main.cpp @@ -0,0 +1,52 @@ +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" +#include "bsg_manycore_atomic.h" + +#ifndef ITERS +#error "define ITERS" +#endif + +#define BSG_TILE_GROUP_X_DIM bsg_tiles_X +#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y +#include "bsg_tile_group_barrier.h" +INIT_TILE_GROUP_BARRIER(r_barrier, c_barrier, 0, bsg_tiles_X-1, 0, bsg_tiles_Y-1); + +volatile int data __attribute__((section(".dram"))) = 0; + +int mtx __attribute__((section(".dram"))) = 0; + +static void acquire() +{ + int v = 1; + do { + v = bsg_amoswap_aq(&mtx, 1); + } while (v != 0); +} + +static void release() +{ + bsg_amoswap_rl(&mtx, 0); +} + +int main() +{ + + bsg_set_tile_x_y(); + + for (int i = 0; i < ITERS; i++) { + acquire(); + data += 1; + release(); + } + + bsg_tile_group_barrier(&r_barrier, &c_barrier); + if (bsg_x == 0 && bsg_y == 0) { + bsg_print_int(data); + if (data != ITERS*bsg_tiles_X*bsg_tiles_Y) + bsg_fail(); + else + bsg_finish(); + } + + bsg_wait_while(1); +} From 1b4fb2627b9ea0de24e6d25a0d09b1a331d6b6e2 Mon Sep 17 00:00:00 2001 From: Max Date: Thu, 22 Sep 2022 14:06:29 -0700 Subject: [PATCH 2/3] AMOADD Barrier --- .../bsg_manycore_lib/bsg_barrier_amoadd.S | 73 +++++++++++++++++++ software/spmd/bsg_mcs_mutex_test/Makefile | 3 +- 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 software/bsg_manycore_lib/bsg_barrier_amoadd.S diff --git a/software/bsg_manycore_lib/bsg_barrier_amoadd.S b/software/bsg_manycore_lib/bsg_barrier_amoadd.S new file mode 100644 index 000000000..dacd189da --- /dev/null +++ b/software/bsg_manycore_lib/bsg_barrier_amoadd.S @@ -0,0 +1,73 @@ +// AMOADD barrier + +// a0 = amo lock addr (in DRAM). Initialized to 0 +// a1 = sense word addr (in DMEM). Initialized to 1 + +// void bsg_barrier_amoadd(int*, int*); + +.text +.globl bsg_barrier_amoadd +bsg_barrier_amoadd: + // t0 - sense val + // t1 - amo result + // t2 - check val + // t3 - wakeup val + // t4 - y index + // t5 - x index + + // send amoadd + lw t0, 0(a1) + amoadd.w t1, t0, 0(a0) + + + // is sense -1 or +1? 
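+  // (sense reversal: the counter at a0 accumulates each core's sense value,
+  // +1 or -1, alternating per barrier round. In a +1 round the last of N
+  // arrivals reads N-1 from the amoadd; in the following -1 round the counter
+  // drains from N back to 0 and the last arrival reads 1. The last core then
+  // wakes the others by writing the negated sense into every tile's DMEM
+  // sense word, which also flips each core's sense for the next round.)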
+ // set wakeup val + sub t3, x0, t0 + + // set check val + blt x0, t0, bsg_barrier_amoadd_plus1 + + // -1 case + li t2, 1 + j bsg_barrier_amoadd_check + +bsg_barrier_amoadd_plus1: + // +1 case + li t2, (bsg_tiles_X*bsg_tiles_Y)-1 + +bsg_barrier_amoadd_check: + bne t2, t1, bsg_barrier_amoadd_sleep + +bsg_barrier_amoadd_wakeup: + li t4, bsg_tiles_Y-1 + +bsg_barrier_amoadd_wakeup_loop_y: + li t5, bsg_tiles_X-1 + +bsg_barrier_amoadd_wakeup_loop_x: + // calculate the tile-group addr for the sense val + li t6, 0x20000000 + slli a2, t4, 24 + add t6, t6, a2 + slli a2, t5, 18 + add t6, t6, a2 + add t6, t6, a1 + sw t3, 0(t6) + addi t5, t5, -1 + bge t5, x0, bsg_barrier_amoadd_wakeup_loop_x + addi t4, t4, -1 + bge t4, x0, bsg_barrier_amoadd_wakeup_loop_y + j bsg_barrier_amoadd_end + + +bsg_barrier_amoadd_sleep: + lr.w t0, 0(a1) + beq t3, t0, bsg_barrier_amoadd_end + // we need to check this, in order to support the reservation + // being cleared by a context switch + lr.w.aq t0, 0(a1) + beq t3, t0, bsg_barrier_amoadd_sleep + + +bsg_barrier_amoadd_end: + ret diff --git a/software/spmd/bsg_mcs_mutex_test/Makefile b/software/spmd/bsg_mcs_mutex_test/Makefile index e3c2be048..d54671616 100644 --- a/software/spmd/bsg_mcs_mutex_test/Makefile +++ b/software/spmd/bsg_mcs_mutex_test/Makefile @@ -6,12 +6,13 @@ export BSG_MANYCORE_DIR := $(shell git rev-parse --show-toplevel) RISCV_GXX_EXTRA_OPTS += -DITERS=16 +all: main.run + include $(BSG_MANYCORE_DIR)/software/mk/Makefile.master include $(BSG_MANYCORE_DIR)/software/mk/Makefile.tail_rules OBJECT_FILES=main.o -all: main.run main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) From 792e0ec052770217e376c301b2c36aab2fe6285d Mon Sep 17 00:00:00 2001 From: Max Date: Thu, 22 Sep 2022 14:15:38 -0700 Subject: [PATCH 3/3] CUDA HW Barrier API (defaults to AMOADD) --- .../bsg_manycore_lib/bsg_barrier_amoadd.h | 12 +++++ .../bsg_manycore_lib/bsg_cuda_lite_barrier.c | 4 ++ .../bsg_manycore_lib/bsg_cuda_lite_barrier.h | 49 +++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 software/bsg_manycore_lib/bsg_barrier_amoadd.h create mode 100644 software/bsg_manycore_lib/bsg_cuda_lite_barrier.c create mode 100644 software/bsg_manycore_lib/bsg_cuda_lite_barrier.h diff --git a/software/bsg_manycore_lib/bsg_barrier_amoadd.h b/software/bsg_manycore_lib/bsg_barrier_amoadd.h new file mode 100644 index 000000000..03a729dd9 --- /dev/null +++ b/software/bsg_manycore_lib/bsg_barrier_amoadd.h @@ -0,0 +1,12 @@ +#ifndef BSG_BARRIER_AMOADD_H +#define BSG_BARRIER_AMOADD_H +#ifdef __cplusplus +extern "C" { +#endif + +extern void bsg_barrier_amoadd(int*, int*); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/software/bsg_manycore_lib/bsg_cuda_lite_barrier.c b/software/bsg_manycore_lib/bsg_cuda_lite_barrier.c new file mode 100644 index 000000000..b178c0316 --- /dev/null +++ b/software/bsg_manycore_lib/bsg_cuda_lite_barrier.c @@ -0,0 +1,4 @@ +int *__cuda_barrier_cfg; +#ifndef BSG_ARCH_HW_BARRIER +int __cuda_barrier_sense = 1; +#endif diff --git a/software/bsg_manycore_lib/bsg_cuda_lite_barrier.h b/software/bsg_manycore_lib/bsg_cuda_lite_barrier.h new file mode 100644 index 000000000..1c0f926a7 --- /dev/null +++ b/software/bsg_manycore_lib/bsg_cuda_lite_barrier.h @@ -0,0 +1,49 @@ +#ifndef BSG_CUDA_LITE_BARRIER_H +#define BSG_CUDA_LITE_BARRIER_H +#include "bsg_barrier_amoadd.h" +#ifdef BSG_ARCH_HW_BARRIER +#include 
"bsg_hw_barrier.h" +#endif +#include "bsg_tile_config_vars.h" +#ifdef __cplusplus +extern "C" { +#endif +extern int *__cuda_barrier_cfg; +#ifndef BSG_ARCH_HW_BARRIER +extern int __cuda_barrier_sense; +#endif + +/** + * Initialize the tile-group barrier. + * This function should only be called once for the lifetime of the tile-group. + */ +static inline void bsg_barrier_hw_tile_group_init() +{ +#ifdef BSG_ARCH_HW_BARRIER + int sense = 1; + // initalize csr + int cfg = __cuda_barrier_cfg[1+__bsg_id]; + asm volatile ("csrrw x0, 0xfc1, %0" : : "r" (cfg)); + // reset Pi + asm volatile ("csrrwi x0, 0xfc2, 0"); + // sync with amoadd barrier + bsg_barrier_amoadd(&__cuda_barrier_cfg[0], &sense); +#endif +} + +/** + * Invoke the tile-group barrier. + */ +static inline void bsg_barrier_hw_tile_group_sync() +{ +#ifdef BSG_ARCH_HW_BARRIER + bsg_barsend(); + bsg_barrecv(); +#else + bsg_barrier_amoadd(&__cuda_barrier_cfg[0], &__cuda_barrier_sense); +#endif +} +#ifdef __cplusplus +} +#endif +#endif