Bigblade pod fpga sw updates #664

Open · wants to merge 3 commits into base: bigblade-pod-fpga
73 changes: 73 additions & 0 deletions software/bsg_manycore_lib/bsg_barrier_amoadd.S
@@ -0,0 +1,73 @@
// AMOADD barrier

// a0 = amo lock addr (in DRAM). Initialized to 0
// a1 = sense word addr (in DMEM). Initialized to 1

// void bsg_barrier_amoadd(int*, int*);
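//
// Algorithm sketch (sense-reversing counting barrier): each of the
// N = bsg_tiles_X*bsg_tiles_Y tiles amoadds its sense val (+1 or -1) to the
// shared lock word. The last tile to arrive -- the one that sees old val
// N-1 when adding +1, or old val 1 when adding -1 -- wakes every tile by
// storing the negated sense into each tile's DMEM sense word; the other
// tiles sleep on their local sense word until that store arrives.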

.text
.globl bsg_barrier_amoadd
bsg_barrier_amoadd:
// t0 - sense val
// t1 - amo result
// t2 - check val
// t3 - wakeup val
// t4 - y index
// t5 - x index

// send amoadd: add our sense val to the shared lock word
lw t0, 0(a1) // t0 = sense val
amoadd.w t1, t0, 0(a0) // t1 = old lock val


// is sense -1 or +1?
// the wakeup val is the negated sense
sub t3, x0, t0

// set check val: the old lock val that only the last tile to arrive sees
blt x0, t0, bsg_barrier_amoadd_plus1

// -1 case: lock counts down from N; the last arriver sees old val 1
li t2, 1
j bsg_barrier_amoadd_check

bsg_barrier_amoadd_plus1:
// +1 case: lock counts up from 0; the last arriver sees old val N-1
li t2, (bsg_tiles_X*bsg_tiles_Y)-1

bsg_barrier_amoadd_check:
bne t2, t1, bsg_barrier_amoadd_sleep

bsg_barrier_amoadd_wakeup:
li t4, bsg_tiles_Y-1

bsg_barrier_amoadd_wakeup_loop_y:
li t5, bsg_tiles_X-1

bsg_barrier_amoadd_wakeup_loop_x:
// calculate the tile-group remote addr for this tile's sense word
li t6, 0x20000000 // tile-group remote base (bit 29)
slli a2, t4, 24 // y index into bits [24+]
add t6, t6, a2
slli a2, t5, 18 // x index into bits [18+]
add t6, t6, a2
add t6, t6, a1 // plus the DMEM offset of the sense word
sw t3, 0(t6) // write the wakeup val to that tile
addi t5, t5, -1
bge t5, x0, bsg_barrier_amoadd_wakeup_loop_x
addi t4, t4, -1
bge t4, x0, bsg_barrier_amoadd_wakeup_loop_y
j bsg_barrier_amoadd_end


bsg_barrier_amoadd_sleep:
lr.w t0, 0(a1)
beq t3, t0, bsg_barrier_amoadd_end
// sleep until the reservation is broken, then re-check the value;
// we need this check in order to support the reservation
// being cleared by a context switch
lr.w.aq t0, 0(a1)
bne t3, t0, bsg_barrier_amoadd_sleep


bsg_barrier_amoadd_end:
ret
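
A minimal usage sketch (hypothetical, not part of this diff), following the register comments above: the lock word is placed in DRAM and initialized to 0, and each tile's sense word lives in DMEM and starts at 1. The `.dram` section name is an assumption about the linker script.

#include "bsg_barrier_amoadd.h"

// amo lock word in DRAM, initialized to 0 (section name is an assumption)
__attribute__((section(".dram"))) int barrier_lock = 0;

int main()
{
    int sense = 1; // per-tile sense word; locals live in DMEM on the manycore
    // ... phase 1 work ...
    bsg_barrier_amoadd(&barrier_lock, &sense); // the barrier flips every tile's sense word
    // ... phase 2 work ...
    return 0;
}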
12 changes: 12 additions & 0 deletions software/bsg_manycore_lib/bsg_barrier_amoadd.h
@@ -0,0 +1,12 @@
#ifndef BSG_BARRIER_AMOADD_H
#define BSG_BARRIER_AMOADD_H
#ifdef __cplusplus
extern "C" {
#endif

extern void bsg_barrier_amoadd(int*, int*);

#ifdef __cplusplus
}
#endif
#endif
4 changes: 4 additions & 0 deletions software/bsg_manycore_lib/bsg_cuda_lite_barrier.c
@@ -0,0 +1,4 @@
int *__cuda_barrier_cfg;
#ifndef BSG_ARCH_HW_BARRIER
int __cuda_barrier_sense = 1;
#endif
49 changes: 49 additions & 0 deletions software/bsg_manycore_lib/bsg_cuda_lite_barrier.h
@@ -0,0 +1,49 @@
#ifndef BSG_CUDA_LITE_BARRIER_H
#define BSG_CUDA_LITE_BARRIER_H
#include "bsg_barrier_amoadd.h"
#ifdef BSG_ARCH_HW_BARRIER
#include "bsg_hw_barrier.h"
#endif
#include "bsg_tile_config_vars.h"
#ifdef __cplusplus
extern "C" {
#endif
extern int *__cuda_barrier_cfg;
#ifndef BSG_ARCH_HW_BARRIER
extern int __cuda_barrier_sense;
#endif

/**
* Initialize the tile-group barrier.
* This function should only be called once for the lifetime of the tile-group.
*/
static inline void bsg_barrier_hw_tile_group_init()
{
#ifdef BSG_ARCH_HW_BARRIER
int sense = 1;
// initialize the barrier config CSR
int cfg = __cuda_barrier_cfg[1+__bsg_id];
asm volatile ("csrrw x0, 0xfc1, %0" : : "r" (cfg));
// reset Pi
asm volatile ("csrrwi x0, 0xfc2, 0");
// sync with amoadd barrier
bsg_barrier_amoadd(&__cuda_barrier_cfg[0], &sense);
#endif
}

/**
* Invoke the tile-group barrier.
*/
static inline void bsg_barrier_hw_tile_group_sync()
{
#ifdef BSG_ARCH_HW_BARRIER
bsg_barsend();
bsg_barrecv();
#else
bsg_barrier_amoadd(&__cuda_barrier_cfg[0], &__cuda_barrier_sense);
#endif
}
#ifdef __cplusplus
}
#endif
#endif
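
For reference, a hypothetical kernel using this API (the kernel name and phases are illustrative; __cuda_barrier_cfg is assumed to be populated by the host loader before the kernel runs):

#include "bsg_cuda_lite_barrier.h"

int kernel_example()
{
    bsg_barrier_hw_tile_group_init(); // call exactly once per tile-group lifetime
    // ... phase 1 ...
    bsg_barrier_hw_tile_group_sync(); // every tile arrives before any tile proceeds
    // ... phase 2 ...
    bsg_barrier_hw_tile_group_sync();
    return 0;
}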
130 changes: 130 additions & 0 deletions software/bsg_manycore_lib/bsg_mcs_mutex.S
@@ -0,0 +1,130 @@
// MCS mutex
// Author: Max
//
// This is an implementation of the MCS mutex inspired in part by Mellor-Crummey and Scott in their 1991 paper
// “Algorithms for Scalable Synchronization on Shared-Memory Multiprocessors”
//
// This is a spinlock mutex, but unlike a simple spinlock in which all threads update and spin on
// a single memory location, the MCS lock builds a linked-list of memory locations local to each core.
//
// Cores atomically append their local memory region to the global list using an unconditional
// amoswap operation. They then spin on their local memory, waiting for their predecessor in the
// queue to notify them that they now hold the lock.
//
// Once a core has completed its critical region, it checks for a successor and releases the lock to it.
//
// The advantages of this mutex over a simple spinlock on the manycore are twofold:
//
// (1) It greatly reduces the number of memory requests on the network and mitigates the extent to which
// a single memory bank becomes a hot-spot. The number of requests issued to the memory bank containing the
// lock object is linear in the number of acquire operations executed.
//
// (2) The lock approximates FIFO ordering, which improves fairness. A simple spinlock on the manycore
// will favor threads topologically closer to the memory bank in which the lock resides and can lead to
// starvation of the other cores.
//
// This lock is by no means perfect. For locks with low contention, a simple spinlock may perform better.
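//
// In C-like pseudocode (a sketch, not part of this file), the two routines are:
//
//   acquire(mtx, lcl, lcl_as_glbl):
//     lcl->next = 0; lcl->unlocked = 0;
//     pred = amoswap(mtx, lcl_as_glbl);            // append self at the tail
//     if (pred == 0) return;                       // lock was free
//     pred->next = lcl_as_glbl;                    // link in behind predecessor
//     while (!lcl->unlocked) { /* sleep */ }       // wait for handoff
//
//   release(mtx, lcl, lcl_as_glbl):
//     if (lcl->next) { lcl->next->unlocked = 1; return; }
//     if (amoswap(mtx, 0) == lcl_as_glbl) return;  // we were the tail: no successor
//     // otherwise we just removed someone else's node ("victim") from the tail;
//     // put it back and stitch the queue together (the corner case handled below)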

.text
.globl bsg_mcs_mutex_acquire
// Refer to bsg_mcs_mutex.h for detailed description of usage.
// a0 = mtx : bsg_mcs_mutex_t*, points to DRAM
// a1 = lcl : bsg_mcs_mutex_node_t*, local pointer that points to DMEM
// a2 = lcl_as_glbl : bsg_mcs_mutex_node_t*, global pointer to same location as 'lcl'
bsg_mcs_mutex_acquire:
sw x0, 4(a1) // lcl->next = 0
sw x0, 0(a1) // lcl->unlocked = 0
amoswap.w.aq t0, a2, 0(a0) // predecessor = swap (&mtx, lcl_as_glbl)
beqz t0, bsg_mcs_mutex_acquire_ret // return if predecessor = 0
sw a2, 4(t0) // predecessor->next = lcl_as_glbl
bsg_mcs_mutex_acquire_loop:
// Here we use the load-on-broken-reservation semantics to avoid
// busy waiting. This reduces the dynamic energy of the core
// and removes contention on our local memory from updates by
// other cores, including an update from our predecessor
// for when they release the lock to us.
//
// The expected wait time for this load is arbitrarily long as it depends
// on (1) the time it takes client code to complete the critical region
// and (2) the contention on this lock.
// We expect the wait time to be on the order of 20-100 cycles in the
// case where there is low contention on the lock.
lr.w t0, (a1) // unlocked = lcl->unlocked
bnez t0, bsg_mcs_mutex_acquire_ret // return if unlocked
lr.w.aq t0, (a1) // unlocked = lcl->unlocked
// MBT: backwards predict not taken branch variant would be helpful here
//
// MBT: if we supported context switching the reservation register, then we would
// not even need this branch (i.e. a blind synchronization); but currently if there were
// a context switch, then the reservation would be cleared and we would need this
// to go back to the lr.w to reprime the reservation.
//
// MBT: if lr.w.aq re-primed the reservation AND for some reason we did need to examine
// the sentinel value AND we supported context switching the reservation register, then we could
// just jump back to the lr.w.aq instruction
beqz t0, bsg_mcs_mutex_acquire_loop // while !unlocked
bsg_mcs_mutex_acquire_ret:
ret

.globl bsg_mcs_mutex_release
// Refer to bsg_mcs_mutex.h for detailed description of usage.
// a0 = mtx : bsg_mcs_mutex_t*, points to DRAM
// a1 = lcl : bsg_mcs_mutex_node_t*, local pointer that points to DMEM
// a2 = lcl_as_glbl : bsg_mcs_mutex_node_t*, global pointer to same location as 'lcl'
bsg_mcs_mutex_release:
lw t0, 4(a1) // next = lcl->next
li t1, 1 // t1 = 1
beqz t0, bsg_mcs_mutex_release_no_successor // branch if no successor
// this is the case where there is a successor
// we need only unlock the successor and return
fence // fence to implement release semantics
sw t1, 0(t0) // successor->unlocked = 1
ret
bsg_mcs_mutex_release_no_successor:
// this is the case where there is no known successor
// attempt to swap out the tail pointer with 0
//
// Max: the more common version of this mutex assumes a compare-and-swap (CAS)
// instruction is supported by the architecture. The semantics of CAS are as follows:
//
// CAS success, test_value, write_value, address
// atomically reads mem[address] and, only if it is equal to test_value, writes write_value
// to mem[address]. success is set to one if the swap occurred, and zero to indicate failure
//
// Here CAS can be used as follows: CAS pointed_to_me, lcl_as_glbl, nullptr, &mtx
// This would prevent us from accidentally removing victims from the queue
// and would allow us to just check the swap condition, if it failed set our successor's unlocked
// bit to one, and return.
//
// We don't currently support a CAS instruction, for two reasons. First, it's not
// part of the RISC-V ISA, which specifies that lr/sc be used instead.
// We don't believe lr/sc maps well to a manycore architecture.
// Second, a CAS instruction would require a big refactor of our network links
// because we would need to send an extra data word (the test value).
amoswap.w.rl t2, x0, 0(a0) // victim_tail = swap(&mtx, 0)
bne t2, a2, bsg_mcs_mutex_release_exists_victim // branch if victim_tail != lcl_as_glbl
ret // we really were the tail -- no successor -- return
bsg_mcs_mutex_release_exists_victim:
// someone added themselves to the queue and we have removed them
// we need to put them back
amoswap.w t2, t2, 0(a0) // usurper = swap(&mtx, victim_tail)
bsg_mcs_mutex_release_wait_on_successor:
// Here we do not use the load-on-broken-reservation instructions
// because if we are executing this code then there is a successor
// that has executed the `amoswap.w.aq` instruction found in the acquire
// function, and is in the process of updating the 'next' pointer
// that we are polling.
// We expect the wait time here to be on the order of 10s of cycles at worst.
// Additionally, this is a corner case that we don't expect to execute often,
// and the use of the LBR semantics increases the instruction footprint by
// three ops.
lw t0, 4(a1) // next = lcl->next
beqz t0, bsg_mcs_mutex_release_wait_on_successor // while (lcl->next == 0)
bnez t2, bsg_mcs_mutex_release_exists_usurper // was there an usurper?
// no usurper exists -- unlock our successor
sw t1, 0(t0) // next->unlocked = 1
ret
bsg_mcs_mutex_release_exists_usurper:
// an usurper exists -- make our victim successor the usurper's successor
sw t0, 4(t2) // usurper->next = next
ret
60 changes: 60 additions & 0 deletions software/bsg_manycore_lib/bsg_mcs_mutex.h
@@ -0,0 +1,60 @@
#pragma once
#ifdef __cplusplus
extern "C" {
#endif

// Must live in tile's local memory (DMEM)
// Do not reorder the members in this struct
// The assembly code in bsg_mcs_mutex.S depends on this ordering.
typedef struct bsg_mcs_mutex_node {
int unlocked;
struct bsg_mcs_mutex_node *next;
} bsg_mcs_mutex_node_t;

// Must live in dram
typedef bsg_mcs_mutex_node_t* bsg_mcs_mutex_t;

/**
* Acquire the mutex, returns when the lock has been acquired.
* @param mtx A pointer to an MCS mutex (must be in DRAM)
* @param lcl A local pointer to a node allocated in tile's local memory
* @param lcl_as_glbl A global pointer to the same location as lcl
*
* lcl_as_glbl must point to the same memory as lcl and it must be addressable by other cores
* with whom the mutex is to be shared.
*
* The most common use case would be a mutex for sharing within a tile group, in which case a
* tile group shared pointer should be used (see bsg_tile_group_remote_ptr).
*
* However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
*
* Pointer casting macros can be found in bsg_manycore_arch.h
*/
void bsg_mcs_mutex_acquire(bsg_mcs_mutex_t *mtx //!< A pointer to an MCS mutex in DRAM
, bsg_mcs_mutex_node_t *lcl //!< A local pointer to a node allocated in tile's local memory
, bsg_mcs_mutex_node_t *lcl_as_glbl //!< A global pointer to a node allocated in tile's local memory
);

/**
* Release the mutex, returns when the lock has been released and the calling core no longer holds the lock.
* @param mtx A pointer to an MCS mutex (must be in DRAM)
* @param lcl A local pointer to a node allocated in tile's local memory
* @param lcl_as_glbl A global pointer to the same location as lcl
*
* lcl_as_glbl must point to the same memory as lcl and it must be addressable by other cores
* with whom the mutex is to be shared.
*
* The most common use case would be a mutex for sharing within a tile group, in which case a
* tile group shared pointer should be used (see bsg_tile_group_remote_ptr).
*
* However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
*
* Pointer casting macros can be found in bsg_manycore_arch.h
*/
void bsg_mcs_mutex_release(bsg_mcs_mutex_t *mtx //!< A pointer to an MCS mutex in DRAM
, bsg_mcs_mutex_node_t *lcl //!< A local pointer to a node allocated in tile's local memory
, bsg_mcs_mutex_node_t *lcl_as_glbl //!< A global pointer to a node allocated in tile's local memory
);
#ifdef __cplusplus
}
#endif
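
A hypothetical usage sketch (the BSG_TILE_GROUP_PTR macro below is invented for illustration; the real pointer casting macros live in bsg_manycore_arch.h as noted above, and the `.dram` section name is an assumption):

#include "bsg_mcs_mutex.h"

__attribute__((section(".dram"))) bsg_mcs_mutex_t mtx = 0; // tail pointer lives in DRAM

void atomic_increment(int *counter) // counter in DRAM, shared by the tile group
{
    bsg_mcs_mutex_node_t node; // on the stack, which lives in DMEM
    // hypothetical macro standing in for the casting macros in bsg_manycore_arch.h:
    // forms a global pointer to 'node' that other tiles in the group can write
    bsg_mcs_mutex_node_t *node_glbl = BSG_TILE_GROUP_PTR(&node, __bsg_x, __bsg_y);
    bsg_mcs_mutex_acquire(&mtx, &node, node_glbl);
    *counter += 1; // critical region
    bsg_mcs_mutex_release(&mtx, &node, node_glbl);
}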