diff --git a/software/bsg_manycore_lib/bsg_barrier_amoadd.S b/software/bsg_manycore_lib/bsg_barrier_amoadd.S
new file mode 100644
index 000000000..dacd189da
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_barrier_amoadd.S
@@ -0,0 +1,73 @@
+//  AMOADD barrier
+
+//  a0 = amo lock addr (in DRAM). Initialized to 0
+//  a1 = sense word addr (in DMEM).  Initialized to 1
+
+//  void bsg_barrier_amoadd(int*, int*);
+
+.text
+.globl bsg_barrier_amoadd
+bsg_barrier_amoadd:
+  // t0 - sense val (+1 or -1), loaded from the local sense word
+  // t1 - amo result (value amoadd.w returned for the shared counter)
+  // t2 - check val (amo result that identifies the last tile to arrive)
+  // t3 - wakeup val (the negated sense), stored to every tile's sense word
+  // t4 - y index
+  // t5 - x index
+  // t6, a2 - scratch used to build each tile's sense-word address
+  // send amoadd: add our sense value to the shared counter
+  lw t0, 0(a1)
+  amoadd.w t1, t0, 0(a0)
+
+
+  // is sense -1 or +1?
+  // set wakeup val (0 - sense)
+  sub t3, x0, t0
+
+  // set check val: branch when sense > 0
+  blt x0, t0, bsg_barrier_amoadd_plus1
+
+  // -1 case: the last tile to arrive is the one whose amo result is 1
+  li t2, 1
+  j bsg_barrier_amoadd_check
+
+bsg_barrier_amoadd_plus1:
+  // +1 case: the last tile to arrive is the one whose amo result is (number of tiles - 1)
+  li t2, (bsg_tiles_X*bsg_tiles_Y)-1
+
+bsg_barrier_amoadd_check:
+  bne t2, t1, bsg_barrier_amoadd_sleep  // not the last arriver: go wait for the wakeup
+
+bsg_barrier_amoadd_wakeup:
+  li t4, bsg_tiles_Y-1                  // y counts from bsg_tiles_Y-1 down to 0
+
+bsg_barrier_amoadd_wakeup_loop_y:
+  li t5, bsg_tiles_X-1                  // x counts from bsg_tiles_X-1 down to 0
+
+bsg_barrier_amoadd_wakeup_loop_x:
+  // calculate the tile-group addr for the sense val: 0x20000000 + (y << 24) + (x << 18) + a1
+  li t6, 0x20000000
+  slli a2, t4, 24
+  add t6, t6, a2
+  slli a2, t5, 18
+  add t6, t6, a2
+  add t6, t6, a1
+  sw t3, 0(t6)                          // write the wakeup val into tile (x, y)'s sense word
+  addi t5, t5, -1
+  bge t5, x0, bsg_barrier_amoadd_wakeup_loop_x
+  addi t4, t4, -1
+  bge t4, x0, bsg_barrier_amoadd_wakeup_loop_y
+  j bsg_barrier_amoadd_end              // every tile (this one included) has been written
+
+
+bsg_barrier_amoadd_sleep:
+  lr.w t0, 0(a1)                         // read the sense word and prime the reservation
+  beq t3, t0, bsg_barrier_amoadd_end     // it already holds the wakeup val: barrier is done
+  // Re-check the value after waking up: the reservation may have been
+  // cleared by a context switch rather than by the wakeup store.
+  lr.w.aq t0, 0(a1)                      // wait on the reservation, then re-read the sense word
+  bne t3, t0, bsg_barrier_amoadd_sleep   // not the wakeup val yet: re-prime and wait again
+
+  // fall through: the sense word now holds the wakeup val
+bsg_barrier_amoadd_end:
+  ret
diff --git a/software/bsg_manycore_lib/bsg_barrier_amoadd.h b/software/bsg_manycore_lib/bsg_barrier_amoadd.h
new file mode 100644
index 000000000..03a729dd9
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_barrier_amoadd.h
@@ -0,0 +1,12 @@
+#ifndef BSG_BARRIER_AMOADD_H
+#define BSG_BARRIER_AMOADD_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void bsg_barrier_amoadd(int *amo_lock, int *sense); // amo_lock: counter word in DRAM (starts at 0); sense: sense word in DMEM (starts at 1)
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/software/bsg_manycore_lib/bsg_cuda_lite_barrier.c b/software/bsg_manycore_lib/bsg_cuda_lite_barrier.c
new file mode 100644
index 000000000..b178c0316
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_cuda_lite_barrier.c
@@ -0,0 +1,4 @@
+int *__cuda_barrier_cfg;        // barrier config array: [0] is the amoadd counter word, [1+__bsg_id] is read as a tile's CSR config
+#ifndef BSG_ARCH_HW_BARRIER
+int  __cuda_barrier_sense = 1;  // sense word passed to bsg_barrier_amoadd when there is no hardware barrier
+#endif
diff --git a/software/bsg_manycore_lib/bsg_cuda_lite_barrier.h b/software/bsg_manycore_lib/bsg_cuda_lite_barrier.h
new file mode 100644
index 000000000..1c0f926a7
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_cuda_lite_barrier.h
@@ -0,0 +1,49 @@
+#ifndef BSG_CUDA_LITE_BARRIER_H
+#define BSG_CUDA_LITE_BARRIER_H
+#include "bsg_barrier_amoadd.h"
+#ifdef BSG_ARCH_HW_BARRIER
+#include "bsg_hw_barrier.h"
+#endif
+#include "bsg_tile_config_vars.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern int *__cuda_barrier_cfg;
+#ifndef BSG_ARCH_HW_BARRIER
+extern int __cuda_barrier_sense;
+#endif
+
+/**
+ * Initialize the tile-group barrier.
+ * This function should only be called once for the lifetime of the tile-group.
+ * It compiles to an empty function unless BSG_ARCH_HW_BARRIER is defined.
+ */
+static inline void bsg_barrier_hw_tile_group_init(void)
+{
+#ifdef BSG_ARCH_HW_BARRIER
+    int sense = 1;  // sense word for the one-off amoadd barrier below
+    // initialize csr 0xfc1 with this tile's entry of the config array
+    int cfg = __cuda_barrier_cfg[1+__bsg_id];
+    asm volatile ("csrrw x0, 0xfc1, %0" : : "r" (cfg));
+    asm volatile ("csrrwi x0, 0xfc2, 0");  // reset Pi
+    // sync with amoadd barrier
+    bsg_barrier_amoadd(&__cuda_barrier_cfg[0], &sense);
+#endif
+}
+
+/**
+ * Invoke the tile-group barrier: send/receive on the hardware barrier when BSG_ARCH_HW_BARRIER is defined, otherwise the amoadd barrier.
+ */
+static inline void bsg_barrier_hw_tile_group_sync(void)
+{
+#ifdef BSG_ARCH_HW_BARRIER
+    bsg_barsend();
+    bsg_barrecv();
+#else
+    bsg_barrier_amoadd(&__cuda_barrier_cfg[0], &__cuda_barrier_sense);
+#endif
+}
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/software/bsg_manycore_lib/bsg_mcs_mutex.S b/software/bsg_manycore_lib/bsg_mcs_mutex.S
new file mode 100644
index 000000000..6918805e6
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_mcs_mutex.S
@@ -0,0 +1,130 @@
+// MCS mutex
+// Author: Max
+//
+// This is an implementation of the MCS mutex inspired in part by Mellor-Crummey and Scott in their 1991 paper
+// “Algorithms for Scalable Synchronization on Shared-Memory Multiprocessors”
+//
+// This is a spinlock mutex, but unlike a simple spinlock in which all threads update and spin on
+// a single memory location, the MCS lock builds a linked-list of memory locations local to each core.
+//
+// Cores atomically append their local memory region to the global list using an unconditional
+// amoswap operation. They then spin on their local memories for a predecessor in the queue
+// to notify them that they now hold the lock.
+//
+// Once a core has completed its critical region, it checks for a successor and, if one exists, releases the lock to it.
+//
+// The advantages of this mutex over a simple spin lock on the manycore are two fold:
+//
+// (1) It greatly reduces the number of memory requests on the network and it mitigates the extent to which
+// a single memory bank becomes a hot-spot. The number of requests issued to a memory bank containing the
+// lock object is linear with the number of times an acquire operation is executed.
+//
+// (2) The lock approximates a FIFO-ish structure, which improves fairness. A simple spinlock on the manycore
+// will favor threads topologically closer to the memory bank in which the lock resides and can lead to
+// starvation of the other cores.
+//
+// This lock is by no means perfect. For locks with low contention, a simple spinlock may result in better performance.
+
+        .text
+        .globl bsg_mcs_mutex_acquire
+        // Refer to bsg_mcs_mutex.h for detailed description of usage.
+        // a0 = mtx         : bsg_mcs_mutex_t*, points to DRAM
+        // a1 = lcl         : bsg_mcs_mutex_node_t*, local pointer that points to DMEM
+        // a2 = lcl_as_glbl : bsg_mcs_mutex_node_t*, global pointer to same location as 'lcl'
+bsg_mcs_mutex_acquire:
+        sw      x0, 4(a1)                      // lcl->next = 0 (no successor yet)
+        sw      x0, 0(a1)                      // lcl->unlocked = 0 (lock not handed to us yet)
+        amoswap.w.aq t0, a2, 0(a0)             // predecessor = swap (&mtx, lcl_as_glbl); we are now the tail
+        beqz    t0, bsg_mcs_mutex_acquire_ret  // return if predecessor = 0 (nobody ahead of us)
+        sw      a2, 4(t0)                      // predecessor->next = lcl_as_glbl
+bsg_mcs_mutex_acquire_loop:
+        // Here we use the load-on-broken-reservation semantics to avoid
+        // busy waiting. This reduces the dynamic energy of the core
+        // and removes contention on our local memory from updates by
+        // other cores, including the update from our predecessor
+        // when it releases the lock to us.
+        //
+        // The expected wait time for this load is arbitrarily long as it depends
+        // on (1) the time it takes client code to complete the critical region
+        // and (2) the contention on this lock.
+        // We expect the wait time to be on the order of 20-100 cycles in the
+        // case where there is low contention on the lock.
+        lr.w    t0, (a1)                       // prime the reservation; unlocked = lcl->unlocked
+        bnez    t0, bsg_mcs_mutex_acquire_ret  // return if unlocked
+        lr.w.aq t0, (a1)                       // wait on the reservation; unlocked = lcl->unlocked
+        // MBT: backwards predict not taken branch variant would be helpful here
+        //
+        // MBT: if we supported context switching the reservation register, then we would
+        // not even need this branch (i.e. a blind synchronization); but currently if there were
+        // a context switch, then the reservation would be cleared and we would need this
+        // to go back to the lr.w to reprime the reservation.
+        //
+        // MBT: if lr.w.aq re-primed the reservation AND for some reason we did need to examine
+        // the sentinel value AND we supported context switching the reservation register, then we could
+        // just jump back to the lr.w.aq instruction
+        beqz    t0, bsg_mcs_mutex_acquire_loop // while !unlocked: re-prime and wait again
+bsg_mcs_mutex_acquire_ret:
+        ret
+
+        .globl bsg_mcs_mutex_release
+        // Refer to bsg_mcs_mutex.h for detailed description of usage.
+        // a0 = mtx         : bsg_mcs_mutex_t*, points to DRAM
+        // a1 = lcl         : bsg_mcs_mutex_node_t*, local pointer that points to DMEM
+        // a2 = lcl_as_glbl : bsg_mcs_mutex_node_t*, global pointer to same location as 'lcl'        
+bsg_mcs_mutex_release:
+        lw      t0, 4(a1)                                   // next = lcl->next
+        li      t1, 1                                       // t1 = 1 (the value stored to 'unlocked')
+        beqz    t0, bsg_mcs_mutex_release_no_successor      // branch if no successor
+        // this is the case where there is a successor
+        // we need only unlock the successor and return
+        fence                                               // fence to implement release semantics
+        sw      t1, 0(t0)                                   // successor->unlocked = 1
+        ret
+bsg_mcs_mutex_release_no_successor:
+        // this is the case where there is no known successor
+        // attempt to swap out the tail pointer with 0
+        //
+        // Max: the more common version of this mutex assumes a compare-and-swap (CAS)
+        // instruction is supported by the architecture. The semantics of CAS are as follows:
+        //
+        // CAS success, test_value, write_value, address
+        // atomically reads mem[address] and, only if it is equal to test_value, writes write_value
+        // to mem[address]. success is set to one if the swap occurred, and zero to indicate failure
+        //
+        // Here CAS can be used as follows: CAS pointed_to_me, lcl_as_glbl, nullptr, &mtx
+        // This would prevent us from accidentally removing victims from the queue
+        // and would allow us to just check the swap condition and, if it failed, set our
+        // successor's unlocked word to one, and return.
+        //
+        // We don't support a CAS instruction now for a couple of reasons. First, it's not
+        // part of the RISCV ISA, which specifies that lr-sc should be used instead.
+        // We don't believe lr-sc maps well to a manycore architecture.
+        // Second, a CAS instruction would require a big refactor of our network links
+        // because we would need to send an extra data word (the test value).
+        amoswap.w.rl t2, x0, 0(a0)                          // victim_tail = swap(&mtx, 0)
+        bne     t2, a2, bsg_mcs_mutex_release_exists_victim // victim_tail == lcl_as_glbl?
+        ret                                                 // there really is no successor -- return
+bsg_mcs_mutex_release_exists_victim:
+        // someone added themselves to the queue and we have removed them
+        // we need to put them back
+        amoswap.w t2, t2, 0(a0)                             // usurper = swap(&mtx, victim_tail)
+bsg_mcs_mutex_release_wait_on_successor:
+        // Here we do not use the load-on-broken-reservation instructions
+        // because if we are executing this code then there is a successor
+        // that has executed the `amoswap.w.aq` instruction found in the acquire
+        // function, and is in the process of updating the 'next' pointer
+        // that we are polling.
+        // We expect the wait time here to be on the order of 10s of cycles at worst.
+        // Additionally, this is a corner case that we don't expect to execute often,
+        // and the use of the LBR semantics increases the instruction footprint by
+        // three ops.
+        lw      t0, 4(a1)                                   // next = lcl->next
+        beqz    t0, bsg_mcs_mutex_release_wait_on_successor // while (lcl->next == 0)
+        bnez    t2, bsg_mcs_mutex_release_exists_usurper    // was there a usurper?
+        // no usurper exists -- unlock our successor
+        sw      t1, 0(t0)                                   // next->unlocked = 1
+        ret
+bsg_mcs_mutex_release_exists_usurper:
+        // usurper exists: link the victims in behind it instead of unlocking them
+        sw      t0, 4(t2)                                   // usurper->next = next
+        ret
diff --git a/software/bsg_manycore_lib/bsg_mcs_mutex.h b/software/bsg_manycore_lib/bsg_mcs_mutex.h
new file mode 100644
index 000000000..3b4ecf105
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_mcs_mutex.h
@@ -0,0 +1,60 @@
+#pragma once
+#ifdef __cplusplus
+extern "C" {
+#endif    
+
+    // Must live in tile's local memory (DMEM)
+    // Do not reorder the members in this struct
+    // The assembly code in bsg_mcs_mutex.S depends on this ordering.
+    typedef struct bsg_mcs_mutex_node {
+        int unlocked;                    // the assembly accesses this at offset 0; a releasing core stores 1 here
+        struct bsg_mcs_mutex_node *next; // the assembly accesses this at offset 4; node queued behind this one, or 0
+    } bsg_mcs_mutex_node_t;
+
+    // Must live in dram. Holds the tail pointer of the waiter queue; acquire swaps a node in, release swaps 0 in.
+    typedef bsg_mcs_mutex_node_t* bsg_mcs_mutex_t;
+
+    /**
+     * Acquire the mutex, returns when the lock has been acquired.
+     * @param mtx         A pointer to a MCS mutex (must be in DRAM)
+     * @param lcl         A local pointer to a node allocated in tile's local memory
+     * @param lcl_as_glbl A global pointer to the same location as lcl
+     *
+     * lcl_as_glbl must point to the same memory as lcl and it must be addressable by other cores
+     * with whom the mutex is to be shared.
+     *
+     * The most common use case would be a mutex for sharing within a tile group, in which case a
+     * tile group shared pointer should be used (see bsg_tile_group_remote_ptr).
+     *
+     * However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
+     *
+     * Pointer casting macros can be found in bsg_manycore_arch.h
+     */
+    void bsg_mcs_mutex_acquire(
+        bsg_mcs_mutex_t *mtx,               /**< the MCS mutex to lock; must be in DRAM */
+        bsg_mcs_mutex_node_t *lcl,          /**< local pointer to a node in this tile's local memory */
+        bsg_mcs_mutex_node_t *lcl_as_glbl); /**< global pointer to the same node as lcl */
+
+    /**
+     * Release the mutex, returns when the lock has been released and the calling core no longer holds the lock.
+     * @param mtx         A pointer to a MCS mutex (must be in DRAM)
+     * @param lcl         A local pointer to a node allocated in tile's local memory
+     * @param lcl_as_glbl A global pointer to the same location as lcl
+     *
+     * lcl_as_glbl must point to the same memory as lcl and it must be addressable by other cores
+     * with whom the mutex is to be shared.
+     *
+     * The most common use case would be a mutex for sharing within a tile group, in which case a
+     * tile group shared pointer should be used (see bsg_tile_group_remote_ptr).
+     *
+     * However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
+     *
+     * Pointer casting macros can be found in bsg_manycore_arch.h
+     */
+    void bsg_mcs_mutex_release(
+        bsg_mcs_mutex_t *mtx,               /**< the MCS mutex to unlock; must be in DRAM */
+        bsg_mcs_mutex_node_t *lcl,          /**< local pointer to a node in this tile's local memory */
+        bsg_mcs_mutex_node_t *lcl_as_glbl); /**< global pointer to the same node as lcl */
+#ifdef __cplusplus
+}
+#endif
diff --git a/software/bsg_manycore_lib/bsg_mcs_mutex.hpp b/software/bsg_manycore_lib/bsg_mcs_mutex.hpp
new file mode 100644
index 000000000..434ec179c
--- /dev/null
+++ b/software/bsg_manycore_lib/bsg_mcs_mutex.hpp
@@ -0,0 +1,140 @@
+// MCS mutex
+// Author: Max
+//
+// This is an implementation of the MCS mutex inspired in part by Mellor-Crummey and Scott in their 1991 paper
+// “Algorithms for Scalable Synchronization on Shared-Memory Multiprocessors”
+//
+// This is a spinlock mutex, but unlike a simple spinlock in which all threads update and spin on
+// a single memory location, the MCS lock builds a linked-list of memory locations local to each core.
+//
+// Cores atomically append their local memory region to the global list using an unconditional
+// amoswap operation. They then spin on their local memories for a predecessor in the queue
+// to notify them that they now hold the lock.
+//
+// Once a core has completed its critical region, it checks for a successor and, if one exists, releases the lock to it.
+//
+// The advantages of this mutex over a simple spin lock on the manycore are two fold:
+//
+// (1) It greatly reduces the number of memory requests on the network and it mitigates the extent to which
+// a single memory bank becomes a hot-spot. The number of requests issued to a memory bank containing the
+// lock object is linear with the number of times an acquire operation is executed.
+//
+// (2) The lock approximates a FIFO-ish structure, which improves fairness. A simple spinlock on the manycore
+// will favor threads topologically closer to the memory bank in which the lock resides and can lead to
+// starvation of the other cores.
+//
+// This lock is by no means perfect. For locks with low contention, a simple spinlock may result in better performance.
+
+#pragma once
+#include <atomic>
+#include "bsg_manycore.h"
+#include "bsg_tile_config_vars.h"
+#include "bsg_tile_group_barrier.h"
+
+template <typename T>
+static T atomic_load(volatile T *ptr) {
+    return *ptr;  // one read through the volatile-qualified pointer
+}
+
+// Must live in tile's local memory (DMEM)
+// Do not reorder the members in this struct
+// The assembly code in bsg_mcs_mutex.S depends on this ordering.
+typedef struct bsg_mcs_mutex_node {
+    int                  unlocked;     // set to 0 by acquire; the releasing core stores 1 here to pass the lock
+    struct bsg_mcs_mutex_node* next;   // node queued behind this one; nullptr while there is none
+} bsg_mcs_mutex_node_t;
+
+/**
+ * Tail pointer of the waiter queue. This object must live in global memory (DRAM).
+ */
+typedef std::atomic<bsg_mcs_mutex_node*> bsg_mcs_mutex_t;
+
+/**
+ * Acquire the mutex, returns when the lock has been acquired.
+ * @param mtx         A pointer to a MCS mutex (must be in DRAM)
+ * @param lcl         A local pointer to a node allocated in tile's local memory
+ * @param lcl_as_glbl A global pointer to the same location as lcl
+ *
+ * lcl_as_glbl must point to the same memory as lcl and it must be addressable by other cores
+ * with whom the mutex is to be shared.
+ *
+ * The most common use case would be a mutex for sharing within a tile group, in which case a
+ * tile group shared pointer should be used (see bsg_tile_group_remote_ptr).
+ *
+ * However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
+ *
+ * Pointer casting macros can be found in bsg_manycore_arch.h
+ */
+static void bsg_mcs_mutex_acquire(bsg_mcs_mutex_t *mtx
+                                  , bsg_mcs_mutex_node_t *lcl
+                                  , bsg_mcs_mutex_node_t *lcl_as_glbl)
+{
+    // reset our node before other cores can reach it through the mutex
+    lcl->next = nullptr;
+    lcl->unlocked = 0;
+
+    // swap our node in as the queue tail; the old tail is our predecessor
+    bsg_mcs_mutex_node_t *const predecessor =
+        mtx->exchange(lcl_as_glbl, std::memory_order_acquire);
+
+    // nobody was ahead of us in line: nothing to wait for
+    if (predecessor == nullptr)
+        return;
+    predecessor->next = lcl_as_glbl;            // tell our predecessor to notify us when done
+    bsg_wait_local_int_asm(&lcl->unlocked, 1);  // wait on our own unlocked word
+}
+
+/**
+ * Release the mutex, returns when the lock has been released and the calling core no longer holds the lock.
+ * @param mtx         A pointer to a MCS mutex (must be in DRAM)
+ * @param lcl         A local pointer to a node allocated in tile's local memory
+ * @param lcl_as_glbl A global pointer to the same location as lcl
+ *
+ * lcl_as_glbl must point to the same memory as lcl and it must be addressable by other cores
+ * with whom the mutex is to be shared.
+ *
+ * The most common use case would be a mutex for sharing within a tile group, in which case a
+ * tile group shared pointer should be used (see bsg_tile_group_remote_ptr).
+ *
+ * However, lcl_as_glbl can also be a global pointer to support sharing across tile groups (see bsg_global_pod_ptr).
+ *
+ * Pointer casting macros can be found in bsg_manycore_arch.h
+ */
+static void bsg_mcs_mutex_release(bsg_mcs_mutex_t *mtx
+                                  , bsg_mcs_mutex_node_t *lcl
+                                  , bsg_mcs_mutex_node_t *lcl_as_glbl)
+{
+    // Fast path: a successor has already linked itself behind us.
+    bsg_mcs_mutex_node_t *successor = lcl->next;
+    if (successor != nullptr) {
+        bsg_fence();                  // fence and release
+        successor->unlocked = 1;
+        return;
+    }
+
+    // No known successor, so the tail may still point to us.
+    // Swap nullptr into the tail and look at what was actually there.
+    bsg_mcs_mutex_node_t *const victim_tail =
+        mtx->exchange(nullptr, std::memory_order_release);
+    if (victim_tail == lcl_as_glbl)
+        return;                       // there's still no successor
+
+    // Others queued up behind us and we just removed them ("victims").
+    // Put their tail back; whatever was swapped in meanwhile is the "usurper".
+    bsg_mcs_mutex_node_t *const usurper =
+        mtx->exchange(victim_tail, std::memory_order_release);
+
+    // Poll until our next pointer refers to the head of the victims.
+    do {
+        successor = atomic_load(&lcl->next);
+    } while (successor == nullptr);
+
+    // Hand over according to whether someone else got in line in the meantime.
+    if (usurper != nullptr) {
+        // add victims behind usurper
+        usurper->next = successor;
+    } else {
+        // nobody else showed up: unlock the first victim
+        successor->unlocked = 1;
+    }
+}
diff --git a/software/spmd/bsg_mcs_mutex_test/Makefile b/software/spmd/bsg_mcs_mutex_test/Makefile
new file mode 100644
index 000000000..d54671616
--- /dev/null
+++ b/software/spmd/bsg_mcs_mutex_test/Makefile
@@ -0,0 +1,20 @@
+export BSG_MANYCORE_DIR := $(shell git rev-parse --show-toplevel)
+
+# Running tests on full manycore array. Uncomment and modify for a smaller array
+# bsg_tiles_X = 2
+# bsg_tiles_Y = 2
+# main.cpp refuses to compile unless ITERS is defined.
+RISCV_GXX_EXTRA_OPTS += -DITERS=16
+
+all: main.run
+
+include $(BSG_MANYCORE_DIR)/software/mk/Makefile.master
+include $(BSG_MANYCORE_DIR)/software/mk/Makefile.tail_rules
+
+OBJECT_FILES=main.o
+
+# Link the test objects with the common SPMD objects and the manycore library.
+main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o
+	$(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS)
+# Rebuild main.o whenever this Makefile changes.
+main.o: Makefile
diff --git a/software/spmd/bsg_mcs_mutex_test/main.cpp b/software/spmd/bsg_mcs_mutex_test/main.cpp
new file mode 100644
index 000000000..067f66e1c
--- /dev/null
+++ b/software/spmd/bsg_mcs_mutex_test/main.cpp
@@ -0,0 +1,43 @@
+#include "bsg_manycore.h"
+#include "bsg_set_tile_x_y.h"
+#include "bsg_manycore_atomic.h"
+
+#ifndef ITERS
+#error "define ITERS"
+#endif
+
+#define BSG_TILE_GROUP_X_DIM bsg_tiles_X
+#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y
+#include "bsg_tile_group_barrier.h"
+#include "bsg_mcs_mutex.hpp"
+
+INIT_TILE_GROUP_BARRIER(r_barrier, c_barrier, 0, bsg_tiles_X-1, 0, bsg_tiles_Y-1);
+// Shared counter in the .dram section; main() increments it while holding the mutex.
+volatile int data __attribute__((section(".dram"))) = 0;
+// The MCS mutex under test, placed in the .dram section.
+bsg_mcs_mutex_t mtx __attribute__((section(".dram")));
+
+int main()
+{
+  bsg_set_tile_x_y();
+  // local queue node, plus a tile-group pointer to it for the other tiles
+  bsg_mcs_mutex_node_t lcl;
+  bsg_mcs_mutex_node_t *lcl_as_glbl =
+      (bsg_mcs_mutex_node_t*)bsg_tile_group_remote_ptr(int, bsg_x, bsg_y, &lcl);
+
+  // each tile increments the shared counter ITERS times under the mutex
+  for (int iter = 0; iter < ITERS; ++iter) {
+      bsg_mcs_mutex_acquire(&mtx, &lcl, lcl_as_glbl);
+      data += 1;
+      bsg_mcs_mutex_release(&mtx, &lcl, lcl_as_glbl);
+  }
+
+  // after the barrier, tile (0,0) prints the counter and checks the total
+  bsg_tile_group_barrier(&r_barrier, &c_barrier);
+  if (bsg_x == 0 && bsg_y == 0) {
+      bsg_print_int(data);
+      if (data == ITERS * bsg_tiles_X * bsg_tiles_Y) bsg_finish();
+      else bsg_fail();
+  }
+  bsg_wait_while(1);
+}
diff --git a/software/spmd/bsg_simple_mutex_test/Makefile b/software/spmd/bsg_simple_mutex_test/Makefile
new file mode 100644
index 000000000..b0253390d
--- /dev/null
+++ b/software/spmd/bsg_simple_mutex_test/Makefile
@@ -0,0 +1,19 @@
+export BSG_MANYCORE_DIR := $(shell git rev-parse --show-toplevel)
+
+# Running tests on full manycore array. Uncomment and modify for a smaller array
+# bsg_tiles_X = 2
+# bsg_tiles_Y = 2
+# main.cpp refuses to compile unless ITERS is defined. Append so options set elsewhere are kept.
+RISCV_GXX_EXTRA_OPTS += -DITERS=16
+
+include $(BSG_MANYCORE_DIR)/software/mk/Makefile.master
+include $(BSG_MANYCORE_DIR)/software/mk/Makefile.tail_rules
+
+OBJECT_FILES=main.o
+
+all: main.run
+
+main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o
+	$(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS)
+# Rebuild main.o whenever this Makefile changes.
+main.o: Makefile
diff --git a/software/spmd/bsg_simple_mutex_test/main.cpp b/software/spmd/bsg_simple_mutex_test/main.cpp
new file mode 100644
index 000000000..9f3a5e100
--- /dev/null
+++ b/software/spmd/bsg_simple_mutex_test/main.cpp
@@ -0,0 +1,52 @@
+#include "bsg_manycore.h"
+#include "bsg_set_tile_x_y.h"
+#include "bsg_manycore_atomic.h"
+
+#ifndef ITERS
+#error "define ITERS"
+#endif
+
+#define BSG_TILE_GROUP_X_DIM bsg_tiles_X
+#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y
+#include "bsg_tile_group_barrier.h"
+INIT_TILE_GROUP_BARRIER(r_barrier, c_barrier, 0, bsg_tiles_X-1, 0, bsg_tiles_Y-1);
+// Shared counter in the .dram section; main() increments it between acquire() and release().
+volatile int data __attribute__((section(".dram"))) = 0;
+// Lock word in the .dram section: acquire() swaps 1 in, release() swaps 0 in.
+int mtx __attribute__((section(".dram"))) = 0;
+
+static void acquire()
+{
+    // Keep swapping 1 into mtx until the swap returns 0;
+    // release() is what swaps the 0 back in.
+    while (bsg_amoswap_aq(&mtx, 1) != 0) {
+    }
+}
+
+static void release()
+{
+    (void)bsg_amoswap_rl(&mtx, 0);  // swap 0 back into mtx; the returned value is not needed
+}
+
+int main()
+{
+  bsg_set_tile_x_y();
+
+  // each tile increments the shared counter ITERS times between acquire() and release()
+  for (int iter = 0; iter < ITERS; ++iter) {
+      acquire();
+      data += 1;
+      release();
+  }
+
+  // after the barrier, tile (0,0) prints the counter and checks the total
+  bsg_tile_group_barrier(&r_barrier, &c_barrier);
+  if (bsg_x == 0 && bsg_y == 0) {
+      bsg_print_int(data);
+      if (data == ITERS*bsg_tiles_X*bsg_tiles_Y)
+          bsg_finish();
+      else
+          bsg_fail();
+  }
+  bsg_wait_while(1);
+}