diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f39441d7c..f3035bc7b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -5,7 +5,9 @@ variables: stages: - checkout - build - - test + - test_1x1 + - test_1x1_hbm2 + - test_4x4 build-repos: stage: checkout @@ -19,6 +21,7 @@ build-repos: - git clone --recursive -b $CI_COMMIT_REF_NAME https://github.com/bespoke-silicon-group/bsg_manycore.git - echo "pwd" - echo $CI_PROJECT_DIR + - echo $CI_COMMIT_REF_NAME - > if [ -e cache/ ]; then echo "Pulling toolchain installation from cache..."; @@ -100,7 +103,7 @@ build-toolchain: retry: 2 test-spmd: - stage: test + stage: test_1x1 tags: - bsg - vcs @@ -125,7 +128,7 @@ test-spmd: test-beebs: - stage: test + stage: test_1x1 tags: - bsg - vcs @@ -147,3 +150,138 @@ test-beebs: - /^ci_.*$/ - master retry: 2 + + +test-interrupt: + stage: test_1x1 + tags: + - bsg + - vcs + script: + - echo "Running interrupt regression..." + - cd bsg_manycore + - pwd + - ./ci/interrupt.sh + cache: + key: $CI_COMMIT_REF_NAME + paths: + - $CI_PROJECT_DIR/bsg_cadenv + - $CI_PROJECT_DIR/basejump_stl + - $CI_PROJECT_DIR/bsg_manycore + - $CI_PROJECT_DIR/bsg_bladerunner + policy: pull + only: + refs: + - /^ci_.*$/ + - master + retry: 2 + + + +test-spmd-hbm2: + stage: test_1x1_hbm2 + tags: + - bsg + - vcs + script: + - export BSG_MACHINE_PATH=$CI_PROJECT_DIR/bsg_manycore/machines/pod_1x1_hbm2 + - echo "Running Manycore regression..." + - cd bsg_manycore + - pwd + - ./ci/spmd.sh + cache: + key: $CI_COMMIT_REF_NAME + paths: + - $CI_PROJECT_DIR/bsg_cadenv + - $CI_PROJECT_DIR/basejump_stl + - $CI_PROJECT_DIR/bsg_manycore + - $CI_PROJECT_DIR/bsg_bladerunner + policy: pull + only: + refs: + - /^ci_.*$/ + - master + retry: 2 + + +test-interrupt-hbm2: + stage: test_1x1_hbm2 + tags: + - bsg + - vcs + script: + - export BSG_MACHINE_PATH=$CI_PROJECT_DIR/bsg_manycore/machines/pod_1x1_hbm2 + - echo "Running interrupt regression..." 
+ - cd bsg_manycore + - pwd + - ./ci/interrupt.sh + cache: + key: $CI_COMMIT_REF_NAME + paths: + - $CI_PROJECT_DIR/bsg_cadenv + - $CI_PROJECT_DIR/basejump_stl + - $CI_PROJECT_DIR/bsg_manycore + - $CI_PROJECT_DIR/bsg_bladerunner + policy: pull + only: + refs: + - /^ci_.*$/ + - master + retry: 2 + + + + + + + +test-spmd-4x4: + stage: test_4x4 + tags: + - bsg + - vcs + script: + - export BSG_MACHINE_PATH=$CI_PROJECT_DIR/bsg_manycore/machines/pod_4x4 + - echo "Running Manycore regression..." + - cd bsg_manycore + - pwd + - ./ci/spmd.sh + cache: + key: $CI_COMMIT_REF_NAME + paths: + - $CI_PROJECT_DIR/bsg_cadenv + - $CI_PROJECT_DIR/basejump_stl + - $CI_PROJECT_DIR/bsg_manycore + - $CI_PROJECT_DIR/bsg_bladerunner + policy: pull + only: + refs: + - /^ci_.*$/ + - master + retry: 2 + + +test-interrupt-4x4: + stage: test_4x4 + tags: + - bsg + - vcs + script: + - export BSG_MACHINE_PATH=$CI_PROJECT_DIR/bsg_manycore/machines/pod_4x4 + - echo "Running interrupt regression..." + - cd bsg_manycore + - pwd + - ./ci/interrupt.sh + cache: + key: $CI_COMMIT_REF_NAME + paths: + - $CI_PROJECT_DIR/bsg_cadenv + - $CI_PROJECT_DIR/basejump_stl + - $CI_PROJECT_DIR/bsg_manycore + - $CI_PROJECT_DIR/bsg_bladerunner + policy: pull + only: + refs: + - /^ci_.*$/ + - master + retry: 2 diff --git a/Makefile b/Makefile index 5a4c3599b..3e40caa15 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ .DEFAULT_GOAL = nothing +.PHONY: machines nothing: @@ -8,8 +9,12 @@ checkout_submodules: git submodule update --init --recursive machines: - make -C machines/ + make -j 3 -C machines/ tools: make -C software/riscv-tools checkout-all make -C software/riscv-tools build-all + +# helpful grep rule that allows you to skip large compiled riscv-tools and imports directories +%.grep: + grep -r "$*" --exclude-dir=imports --exclude-dir=riscv-tools diff --git a/README.md b/README.md index 0db56a2e9..bb296c3b4 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,11 @@ This repo contains the **bsg\_manycore** source code 
with contributions from the The tile based architecture is designed for computing efficiency, scalability and generality. The two main components are: -* **Computing Node:** Purpose-designed RISC-V 32IM compatible core runs at 1.4GHz@16nm, but nodes also can be any other accelerators. +* **Computing Node:** Purpose-designed RISC-V 32IMF compatible core runs at 1.4GHz@16nm, but nodes also can be any other accelerators. * **Mesh Network :** Dimension ordered, single flit network with inter-nodes synchronization primitives (mutex, barrier etc.) -Without any customized circuit, a 16nm prototype chip that holds 16x31 tiles on a 4.5x3.4 mm^2 die space achieves **812,350** -aggregated [CoreMark](https://www.eembc.org/coremark/) score. +Without any custom circuits, a 16nm prototype chip with 16x31 tiles on a 4.5x3.4 mm^2 die space achieves **812,350** +aggregated [CoreMark](https://www.eembc.org/coremark/) score, a world record. Many improvements have been made since this previous version. # Documentation @@ -24,12 +24,18 @@ aggregated [CoreMark](https://www.eembc.org/coremark/) score. # Initial Setup for running programs +Above this directory: + +- Checkout `basejump_stl`; cd into imports directory and type `make DRAMSim3` +- Checkout `bsg_cadenv` + In this directory: - `make checkout_submodules`: To update all submodules in `imports/`. -- `make tools`: To install software toolchain required running programs on BSG Manycore. +- `make tools`: To install software toolchain required for running programs on BSG Manycore. (This build uses 12-16 threads by default.) - `make machines`: Compile simulation executables in `machines/`. - Edit `BSG_MACHINE_PATH` in `software/mk/Makefile.paths` to choose the machine to run somd programs on. +- go into `software/spmd/bsg_barrier` and type `make` to run a test! 
# Contributions diff --git a/ci/beebs.sh b/ci/beebs.sh index ede46b8d9..bee7fa076 100755 --- a/ci/beebs.sh +++ b/ci/beebs.sh @@ -3,5 +3,5 @@ cd software/spmd/beebs make clean -make -j 6 > /dev/null 2>&1 +make -j 8 > /dev/null 2>&1 make check_finish diff --git a/ci/interrupt.sh b/ci/interrupt.sh new file mode 100755 index 000000000..e499df95f --- /dev/null +++ b/ci/interrupt.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +cd software/spmd/interrupt_tests + +make clean +make regress > /dev/null 2>&1 +make summary diff --git a/machines/.gitignore b/machines/.gitignore index 77c8bd599..9b1ce2ed6 100644 --- a/machines/.gitignore +++ b/machines/.gitignore @@ -1,8 +1,15 @@ simv-debug simv-debug.daidir +simv-profile +simv-profile.daidir +simv-debug vc_hdrs.h stack.info.* +*/build-profile */build-debug */build */csrc */csrc-debug +*/csrc-profile +*/*.tr +*/bsg_tag_boot_rom.v diff --git a/machines/16x8_crossbar/Makefile.machine.include b/machines/16x8_crossbar/Makefile.machine.include deleted file mode 100644 index 7b2ef8af0..000000000 --- a/machines/16x8_crossbar/Makefile.machine.include +++ /dev/null @@ -1,23 +0,0 @@ -BSG_MACHINE_GLOBAL_X = 16 -BSG_MACHINE_GLOBAL_Y = 9 - -BSG_MACHINE_VCACHE_SET = 128 -BSG_MACHINE_VCACHE_WAY = 8 -BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 -BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 -BSG_MACHINE_VCACHE_MISS_FIFO_ELS = 32 - -BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 # 2^29 words (2GB) -BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 16777216 -BSG_MACHINE_DRAM_INCLUDED = 1 - -BSG_MACHINE_MAX_EPA_WIDTH = 28 - -# supported memory cfg -# e_vcache_blocking_dramsim3_hbm2 -BSG_MACHINE_MEM_CFG = e_vcache_blocking_dramsim3_hbm2 -BSG_MACHINE_BRANCH_TRACE_EN = 0 -BSG_MACHINE_HETERO_TYPE_VEC = default:0 - -BSG_MACHINE_NETWORK_CFG = e_network_crossbar -BSG_MACHINE_RUCHE_FACTOR_X = 0 diff --git a/machines/16x8_mesh/Makefile.machine.include b/machines/16x8_mesh/Makefile.machine.include deleted file mode 100644 index 95c2f6e46..000000000 --- a/machines/16x8_mesh/Makefile.machine.include +++ 
/dev/null @@ -1,23 +0,0 @@ -BSG_MACHINE_GLOBAL_X = 16 -BSG_MACHINE_GLOBAL_Y = 9 - -BSG_MACHINE_VCACHE_SET = 128 -BSG_MACHINE_VCACHE_WAY = 8 -BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 -BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 -BSG_MACHINE_VCACHE_MISS_FIFO_ELS = 32 - -BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 # 2^29 words (2GB) -BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 16777216 -BSG_MACHINE_DRAM_INCLUDED = 1 - -BSG_MACHINE_MAX_EPA_WIDTH = 28 - -# supported memory cfg -# e_vcache_blocking_dramsim3_hbm2 -BSG_MACHINE_MEM_CFG = e_vcache_blocking_dramsim3_hbm2 -BSG_MACHINE_BRANCH_TRACE_EN = 0 -BSG_MACHINE_HETERO_TYPE_VEC = default:0 - -BSG_MACHINE_NETWORK_CFG = e_network_mesh -BSG_MACHINE_RUCHE_FACTOR_X = 0 diff --git a/machines/16x8_ruche/Makefile.machine.include b/machines/16x8_ruche/Makefile.machine.include deleted file mode 100644 index d87bf508e..000000000 --- a/machines/16x8_ruche/Makefile.machine.include +++ /dev/null @@ -1,23 +0,0 @@ -BSG_MACHINE_GLOBAL_X = 16 -BSG_MACHINE_GLOBAL_Y = 9 - -BSG_MACHINE_VCACHE_SET = 128 -BSG_MACHINE_VCACHE_WAY = 8 -BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 -BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 -BSG_MACHINE_VCACHE_MISS_FIFO_ELS = 32 - -BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 # 2^29 words (2GB) -BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 16777216 -BSG_MACHINE_DRAM_INCLUDED = 1 - -BSG_MACHINE_MAX_EPA_WIDTH = 28 - -# supported memory cfg -# e_vcache_blocking_dramsim3_hbm2 -BSG_MACHINE_MEM_CFG = e_vcache_blocking_dramsim3_hbm2 -BSG_MACHINE_BRANCH_TRACE_EN = 0 -BSG_MACHINE_HETERO_TYPE_VEC = default:0 - -BSG_MACHINE_NETWORK_CFG = e_network_half_ruche_x -BSG_MACHINE_RUCHE_FACTOR_X = 3 diff --git a/machines/32x16_ruche/Makefile.machine.include b/machines/32x16_ruche/Makefile.machine.include deleted file mode 100644 index 9f3bbad1c..000000000 --- a/machines/32x16_ruche/Makefile.machine.include +++ /dev/null @@ -1,23 +0,0 @@ -BSG_MACHINE_GLOBAL_X = 32 -BSG_MACHINE_GLOBAL_Y = 17 - -BSG_MACHINE_VCACHE_SET = 64 -BSG_MACHINE_VCACHE_WAY = 4 
-BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 -BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 -BSG_MACHINE_VCACHE_MISS_FIFO_ELS = 32 - -BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 # 2^29 words (2GB) -BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 8388608 -BSG_MACHINE_DRAM_INCLUDED = 1 - -BSG_MACHINE_MAX_EPA_WIDTH = 28 - -# supported memory cfg -# e_vcache_blocking_dramsim3_hbm2 -BSG_MACHINE_MEM_CFG = e_vcache_blocking_dramsim3_hbm2 -BSG_MACHINE_BRANCH_TRACE_EN = 0 -BSG_MACHINE_HETERO_TYPE_VEC = default:0 - -BSG_MACHINE_NETWORK_CFG = e_network_half_ruche_x -BSG_MACHINE_RUCHE_FACTOR_X = 3 diff --git a/machines/4x4_fast_fake/Makefile.machine.include b/machines/4x4_fast_fake/Makefile.machine.include deleted file mode 100644 index 95def9af0..000000000 --- a/machines/4x4_fast_fake/Makefile.machine.include +++ /dev/null @@ -1,24 +0,0 @@ -BSG_MACHINE_GLOBAL_X = 4 -BSG_MACHINE_GLOBAL_Y = 5 - -BSG_MACHINE_VCACHE_SET = 128 -BSG_MACHINE_VCACHE_WAY = 8 -BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 -BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 -BSG_MACHINE_VCACHE_MISS_FIFO_ELS = 32 - -BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 # 2^29 words (2GB) -BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 67108864 -BSG_MACHINE_DRAM_INCLUDED = 1 - -BSG_MACHINE_MAX_EPA_WIDTH = 28 - -# supported memory cfg -BSG_MACHINE_MEM_CFG = e_infinite_mem - -BSG_MACHINE_BRANCH_TRACE_EN = 0 - -BSG_MACHINE_HETERO_TYPE_VEC = default:0 - -BSG_MACHINE_NETWORK_CFG = e_network_mesh -BSG_MACHINE_RUCHE_FACTOR_X = 0 diff --git a/machines/4x4_gs/Makefile.machine.include b/machines/4x4_gs/Makefile.machine.include deleted file mode 100644 index 75ba788c5..000000000 --- a/machines/4x4_gs/Makefile.machine.include +++ /dev/null @@ -1,32 +0,0 @@ -BSG_MACHINE_GLOBAL_X = 4 -BSG_MACHINE_GLOBAL_Y = 5 - -BSG_MACHINE_VCACHE_SET = 128 -BSG_MACHINE_VCACHE_WAY = 8 -BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 -BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 -BSG_MACHINE_VCACHE_MISS_FIFO_ELS = 32 - -BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 # 2^29 words (2GB) -BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 
67108864 -BSG_MACHINE_DRAM_INCLUDED = 1 - -BSG_MACHINE_MAX_EPA_WIDTH = 28 - -# supported memory cfg -# e_infinite_mem -# e_vcache_blocking_axi4_nonsynth_mem -# e_vcache_non_blocking_axi4_nonsynth_mem -BSG_MACHINE_MEM_CFG = e_vcache_blocking_axi4_nonsynth_mem - -BSG_MACHINE_BRANCH_TRACE_EN = 0 - -# it has a gather-scatterer at the bottom-right corner. -BSG_MACHINE_HETERO_TYPE_VEC = 0,0,0,0,\ - 0,0,0,0,\ - 0,0,0,0,\ - 0,0,0,1 - - -BSG_MACHINE_NETWORK_CFG = e_network_mesh -BSG_MACHINE_RUCHE_FACTOR_X = 0 diff --git a/machines/4x4_no_dram/Makefile.machine.include b/machines/4x4_no_dram/Makefile.machine.include deleted file mode 100644 index f48baf319..000000000 --- a/machines/4x4_no_dram/Makefile.machine.include +++ /dev/null @@ -1,25 +0,0 @@ -BSG_MACHINE_GLOBAL_X = 4 -BSG_MACHINE_GLOBAL_Y = 5 - -BSG_MACHINE_VCACHE_SET = 128 -BSG_MACHINE_VCACHE_WAY = 8 -BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 -BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 -BSG_MACHINE_VCACHE_MISS_FIFO_ELS = 32 - -BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 # 2^29 words (2GB) -BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 67108864 -BSG_MACHINE_DRAM_INCLUDED = 0 - -BSG_MACHINE_MAX_EPA_WIDTH = 28 - -# supported memory cfg -BSG_MACHINE_MEM_CFG = e_vcache_blocking_axi4_nonsynth_mem -#BSG_MACHINE_MEM_CFG = e_vcache_non_blocking_axi4_nonsynth_mem - -BSG_MACHINE_BRANCH_TRACE_EN = 0 - -BSG_MACHINE_HETERO_TYPE_VEC = default:0 - -BSG_MACHINE_NETWORK_CFG = e_network_mesh -BSG_MACHINE_RUCHE_FACTOR_X = 0 diff --git a/machines/8x4/Makefile.machine.include b/machines/8x4/Makefile.machine.include deleted file mode 100644 index 76764b608..000000000 --- a/machines/8x4/Makefile.machine.include +++ /dev/null @@ -1,23 +0,0 @@ -BSG_MACHINE_GLOBAL_X = 8 -BSG_MACHINE_GLOBAL_Y = 5 - -BSG_MACHINE_VCACHE_SET = 128 -BSG_MACHINE_VCACHE_WAY = 8 -BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 -BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 -BSG_MACHINE_VCACHE_MISS_FIFO_ELS = 32 - -BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 # 2^29 words (2GB) 
-BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 33554432 -BSG_MACHINE_DRAM_INCLUDED = 1 - -BSG_MACHINE_MAX_EPA_WIDTH = 28 - -BSG_MACHINE_MEM_CFG = e_vcache_blocking_axi4_nonsynth_mem - -BSG_MACHINE_BRANCH_TRACE_EN = 0 - -BSG_MACHINE_HETERO_TYPE_VEC = default:0 - -BSG_MACHINE_NETWORK_CFG = e_network_half_ruche_x -BSG_MACHINE_RUCHE_FACTOR_X = 3 diff --git a/machines/Makefile b/machines/Makefile index 4421c66da..cababb723 100644 --- a/machines/Makefile +++ b/machines/Makefile @@ -15,12 +15,15 @@ VCS_INCLUDES += $(foreach inc,$(VINCLUDES),+incdir+"$(inc)") VCS_DEFINES += $(foreach def,$(VDEFINES),+define+"$(def)") VCS_SOURCES += $(VHEADERS) $(VSOURCES) VCS_FLAGS += +v2k -sverilog -full64 -timescale=1ps/1ps \ - +lint=all,noSVA-UA,noSVA-NSVU,noVCDE,noNS -top spmd_testbench -licqueue \ - -reportstats + +lint=all,noSVA-UA,noSVA-NSVU,noVCDE,noNS -top spmd_testbench +VCS_FLAGS += -licqueue +VCS_FLAGS += -reportstats +VCS_FLAGS += -assert svaext # needed for "assert final" #VCS_FLAGS += +vcs+loopreport VCS_CFLAGS = -cpp g++ VCS_CFLAGS += -CFLAGS "-std=c++11 -g -Wall" +VCS_CFLAGS += -CFLAGS "-O2" VCS_CFLAGS += -CFLAGS "-fPIC" VCS_CFLAGS += -CFLAGS "-I$(BASEJUMP_STL_DIR)/imports/DRAMSim3/src" VCS_CFLAGS += -CFLAGS "-I$(BASEJUMP_STL_DIR)/imports/DRAMSim3/ext/headers" @@ -29,13 +32,15 @@ VCS_CFLAGS += -CFLAGS "-I$(BASEJUMP_STL_DIR)/bsg_test" VCS_CFLAGS += -CFLAGS "-DFMT_HEADER_ONLY=1" VCS_CFLAGS += -CFLAGS "-DBASEJUMP_STL_DIR=$(BASEJUMP_STL_DIR)" -DEFAULT_MACHINES = 16x8_ruche 16x8_mesh 16x8_crossbar 4x4_gs 4x4_no_dram 4x4_fast_fake 8x4 +DEFAULT_MACHINES = pod_1x1 pod_1x1_hbm2 pod_4x4 .PHONY: all clean DEFAULT_TARGETS = $(foreach machine, $(DEFAULT_MACHINES),$(machine)/simv) DEFAULT_DEBUG_TARGETS = $(foreach machine, $(DEFAULT_MACHINES),$(machine)/simv-debug) -all: $(DEFAULT_TARGETS) $(DEFAULT_DEBUG_TARGETS) +DEFAULT_PROFILE_TARGETS = $(foreach machine, $(DEFAULT_MACHINES),$(machine)/simv-profile) + +all: $(DEFAULT_TARGETS) $(DEFAULT_DEBUG_TARGETS) $(DEFAULT_PROFILE_TARGETS) # 
set_machine_variables includes the Makefile.machine.include file and sets the # VCS_DEFINES that define the architecture for a machine. This should be called @@ -43,8 +48,15 @@ all: $(DEFAULT_TARGETS) $(DEFAULT_DEBUG_TARGETS) # machine so that the *correct* Makfile.machine.include can be included define set_machine_variables $(eval include $1) - $(eval VCS_DEFINES := +define+BSG_MACHINE_GLOBAL_X=${BSG_MACHINE_GLOBAL_X}) + $(eval VCS_DEFINES := +define+BSG_MACHINE_PODS_X=${BSG_MACHINE_PODS_X}) + $(eval VCS_DEFINES += +define+BSG_MACHINE_PODS_Y=${BSG_MACHINE_PODS_Y}) + $(eval VCS_DEFINES += +define+BSG_MACHINE_ORIGIN_X_CORD=${BSG_MACHINE_ORIGIN_X_CORD}) + $(eval VCS_DEFINES += +define+BSG_MACHINE_ORIGIN_Y_CORD=${BSG_MACHINE_ORIGIN_Y_CORD}) + $(eval VCS_DEFINES += +define+BSG_MACHINE_GLOBAL_X=${BSG_MACHINE_GLOBAL_X}) $(eval VCS_DEFINES += +define+BSG_MACHINE_GLOBAL_Y=${BSG_MACHINE_GLOBAL_Y}) + $(eval VCS_DEFINES += +define+BSG_MACHINE_HOST_X_CORD=${BSG_MACHINE_HOST_X_CORD}) + $(eval VCS_DEFINES += +define+BSG_MACHINE_HOST_Y_CORD=${BSG_MACHINE_HOST_Y_CORD}) + # (duplicate BSG_MACHINE_ORIGIN_Y_CORD define removed; it is already set above) $(eval VCS_DEFINES += +define+BSG_MACHINE_VCACHE_SET=${BSG_MACHINE_VCACHE_SET}) $(eval VCS_DEFINES += +define+BSG_MACHINE_VCACHE_WAY=${BSG_MACHINE_VCACHE_WAY}) $(eval VCS_DEFINES += +define+BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS=${BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS}) @@ -58,6 +70,14 @@ define set_machine_variables $(eval VCS_DEFINES += +define+BSG_MACHINE_HETERO_TYPE_VEC="${BSG_MACHINE_HETERO_TYPE_VEC}") $(eval VCS_DEFINES += +define+BSG_MACHINE_NETWORK_CFG="${BSG_MACHINE_NETWORK_CFG}") $(eval VCS_DEFINES += +define+BSG_MACHINE_RUCHE_FACTOR_X="${BSG_MACHINE_RUCHE_FACTOR_X}") + $(eval VCS_DEFINES += +define+BSG_MACHINE_SUBARRAY_X="${BSG_MACHINE_SUBARRAY_X}") + $(eval VCS_DEFINES += +define+BSG_MACHINE_SUBARRAY_Y="${BSG_MACHINE_SUBARRAY_Y}") + $(eval VCS_DEFINES += +define+BSG_MACHINE_NUM_VCACHE_ROWS="${BSG_MACHINE_NUM_VCACHE_ROWS}") 
+ $(eval VCS_DEFINES += +define+BSG_MACHINE_NUM_VCACHES_PER_CHANNEL="${BSG_MACHINE_NUM_VCACHES_PER_CHANNEL}") + $(eval VCS_DEFINES += +define+BSG_MACHINE_DRAMSIM3_PKG="${BSG_MACHINE_DRAMSIM3_PKG}") + # specify where the host module is instantiated for profiler trigger (print_stat). + # relative to $root + $(eval VCS_DEFINES += +define+HOST_MODULE_PATH=spmd_testbench) # These define are required by mobile_ddr.v. # density = 2048 Mbit # speed grade = 5 @@ -66,17 +86,46 @@ define set_machine_variables $(eval VCS_DEFINES += +define+den2048Mb+sg5+x16+FULL_MEM) endef +# boot tag rom gen +POD_TRACE_GEN_PY = $(BSG_MANYCORE_DIR)/testbenches/py/pod_trace_gen.py +ASCII_TO_ROM_PY = $(BASEJUMP_STL_DIR)/bsg_mem/bsg_ascii_to_rom.py +# build simv +# without debug option for faster simulation. %/simv : %/Makefile.machine.include $(VSOURCES) $(CSOURCES) $(VINCLUDES) $(VHEADERS) $(call set_machine_variables,$<) - $(VCS) $(VCS_FLAGS) $(VCS_CFLAGS) -o $@ $(VCS_INCLUDES) $(VCS_DEFINES) $(CSOURCES) $(VCS_SOURCES) -l $*/build.log -Mdir=$*/csrc + python $(POD_TRACE_GEN_PY) $(BSG_MACHINE_PODS_X) $(BSG_MACHINE_PODS_Y) > $*/pod_trace.tr + python $(ASCII_TO_ROM_PY) $*/pod_trace.tr bsg_tag_boot_rom > $*/bsg_tag_boot_rom.v + $(VCS) $(VCS_FLAGS) $(VCS_CFLAGS) -o $@ \ + $(VCS_INCLUDES) $(VCS_DEFINES) $(CSOURCES) $(VCS_SOURCES) $*/bsg_tag_boot_rom.v \ + -l $*/build.log -Mdir=$*/csrc +# build simv-debug +# with debug option for waveform generation. 
%/simv-debug : %/Makefile.machine.include $(VSOURCES) $(CSOURCES) $(VINCLUDES) $(VHEADERS) $(call set_machine_variables,$<) + python $(POD_TRACE_GEN_PY) $(BSG_MACHINE_PODS_X) $(BSG_MACHINE_PODS_Y) > $*/pod_trace.tr + python $(ASCII_TO_ROM_PY) $*/pod_trace.tr bsg_tag_boot_rom > $*/bsg_tag_boot_rom.v $(eval VCS_FLAGS += -debug_pp +vcs+vcdpluson) # Debug adds these two variables to generate waveforms - $(VCS) $(VCS_FLAGS) $(VCS_CFLAGS) -o $@ $(VCS_INCLUDES) $(VCS_DEFINES) $(CSOURCES) $(VCS_SOURCES) -l $*/build-debug.log -Mdir=$*/csrc-debug + $(VCS) $(VCS_FLAGS) $(VCS_CFLAGS) -o $@ \ + $(VCS_INCLUDES) $(VCS_DEFINES) $(CSOURCES) $(VCS_SOURCES) $*/bsg_tag_boot_rom.v \ + -l $*/build-debug.log -Mdir=$*/csrc-debug + +# building simv-profile +# with trace-based profiler for bloodgraphs etc +%/simv-profile : %/Makefile.machine.include $(VSOURCES) $(CSOURCES) $(VINCLUDES) $(VHEADERS) + $(call set_machine_variables,$<) + python $(POD_TRACE_GEN_PY) $(BSG_MACHINE_PODS_X) $(BSG_MACHINE_PODS_Y) > $*/pod_trace.tr + python $(ASCII_TO_ROM_PY) $*/pod_trace.tr bsg_tag_boot_rom > $*/bsg_tag_boot_rom.v + $(eval VCS_FLAGS += +define+BSG_ENABLE_PROFILING) # Profiling adds these two flags to enable the trace-based profiler + $(eval VCS_CFLAGS += -CFLAGS "-DBLOOD_GRAPH") + $(VCS) $(VCS_FLAGS) $(VCS_CFLAGS) -o $@ \ + $(VCS_INCLUDES) $(VCS_DEFINES) $(CSOURCES) $(VCS_SOURCES) $*/bsg_tag_boot_rom.v \ + -l $*/build-profile.log -Mdir=$*/csrc-profile + clean: - rm -rf csrc - rm -rf */*.log */*.daidir */simv */simv-debug + rm -rf */csrc* + rm -rf */*.log */*.daidir */simv */simv-debug */simv-profile rm -rf stack.info.* - rm -f vc_hdrs.h + rm -f */vc_hdrs.h + rm -f */*.tr */bsg_tag_boot_rom.v diff --git a/machines/README.md b/machines/README.md index c990ad39b..235d0f15a 100644 --- a/machines/README.md +++ b/machines/README.md @@ -3,10 +3,17 @@ Machine.machine.include parameters - BSG_MACHINE_GLOBAL_X = x-dimension of manycore array. 
- BSG_MACHINE_GLOBAL_Y = y-dimension of manycore array (including the io router row). + +- BSG_MACHINE_NUM_VCACHE_ROWS = number of vcache rows on each side of pod (north and south). + (allowed val = 1,2,4) - BSG_MACHINE_VCACHE_SET = number of sets in each vcache - BSG_MACHINE_VCACHE_WAY = number of ways in each vcache - BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = number of words in each vcache block. - BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = vcache dma interface data width. + (constraint: 32 <= DMA_DATA_WIDTH <= BLOCK_SIZE*32) +- BSG_MACHINE_NUM_VCACHES_PER_CHANNEL = number of vcaches allocated per one HBM2 channel (only for e_vcache_hbm2) + (constraint for single pod: NUM_VCACHES_PER_CHANNEL <= 2*NUM_VCACHE_ROWS*BSG_MACHINE_GLOBAL_X) + - BSG_MACHINE_VCACHE_MISS_FIFO_ELS = number of entries in miss fifo (non-blocking vcache only). - BSG_MACHINE_DRAM_SIZE_WORDS = the total size of main memory. 2GB max, but it can be set to lower. - BSG_MACHINE_DRAM_BANK_SIZE_WORDS = the size of address space spanned by each bank. @@ -16,6 +23,7 @@ Machine.machine.include parameters meaning that the vcache is only used as block memory. - BSG_MACHINE_MAX_EPA_WIDTH = Width of word address on the mesh network. - BSG_MACHINE_MEM_CFG = e_vcache_non_blocking_axi4_nonsynth_mem +- BSG_MACHINE_DRAMSIM3_PKG = Specify the dramsim3 setting. (only applicable if BSG_MACHINE_MEM_CFG is hbm2) - BSG_MACHINE_BRANCH_TRACE_EN = Enable branch trace. - BSG_MACHINE_HETERO_TYPE_VEC = Hetero type vector. Default configuration is 'default:0'. Use this to instantiate custom accelerator instead of vanilla core. 
diff --git a/machines/arch_filelist.mk b/machines/arch_filelist.mk index c3017001a..9b47cfee5 100644 --- a/machines/arch_filelist.mk +++ b/machines/arch_filelist.mk @@ -5,6 +5,7 @@ VINCLUDES += $(BASEJUMP_STL_DIR)/bsg_misc VINCLUDES += $(BASEJUMP_STL_DIR)/bsg_cache VINCLUDES += $(BASEJUMP_STL_DIR)/bsg_noc +VINCLUDES += $(BASEJUMP_STL_DIR)/bsg_tag VINCLUDES += $(BSG_MANYCORE_DIR)/v VINCLUDES += $(BSG_MANYCORE_DIR)/v/vanilla_bean VINCLUDES += $(BSG_MANYCORE_DIR)/imports/HardFloat/source @@ -13,7 +14,9 @@ VINCLUDES += $(BSG_MANYCORE_DIR)/imports/HardFloat/source/RISCV VHEADERS += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_defines.v VHEADERS += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_noc_pkg.v VHEADERS += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_mesh_router_pkg.v +VHEADERS += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_wormhole_router_pkg.v VHEADERS += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_noc_links.vh +VHEADERS += $(BASEJUMP_STL_DIR)/bsg_tag/bsg_tag_pkg.v VHEADERS += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_pkg.v VHEADERS += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking_pkg.v VHEADERS += $(BASEJUMP_STL_DIR)/bsg_fpu/bsg_fpu_defines.vh @@ -37,6 +40,7 @@ VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_mux_bitwise.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_dff.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_dff_chain.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_dff_en_bypass.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_dff_reset_en_bypass.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_dff_en.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_dff_reset.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_dff_reset_en.v @@ -47,6 +51,7 @@ VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_decode_with_v.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_decode.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_counter_clear_up.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_counter_up_down.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_counter_set_down.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_round_robin_arb.v VSOURCES += 
$(BASEJUMP_STL_DIR)/bsg_misc/bsg_arb_round_robin.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_circular_ptr.v @@ -68,6 +73,10 @@ VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_id_pool.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_concentrate_static.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_array_concentrate_static.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_unconcentrate_static.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_misc/bsg_mux2_gatestack.v + + +VHEADERS += $(BASEJUMP_STL_DIR)/bsg_tag/bsg_tag_client.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_dataflow/bsg_fifo_1r1w_large.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_dataflow/bsg_fifo_1rw_large.v @@ -101,6 +110,13 @@ VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_mesh_stitch.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_mesh_router.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_mesh_router_decoder_dor.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_mesh_router_buffered.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_wormhole_router.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_wormhole_router_decoder_dor.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_wormhole_router_input_control.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_wormhole_router_output_control.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_wormhole_concentrator.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_wormhole_concentrator_in.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_noc/bsg_wormhole_concentrator_out.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_async/bsg_launch_sync_sync.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_async/bsg_sync_sync.v @@ -116,17 +132,6 @@ VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_sbuf_queue.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_link_to_cache.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_vcache_blocking.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking_decode.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking_miss_fifo.v -VSOURCES 
+= $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking_data_mem.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking_stat_mem.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking_tag_mem.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking_dma.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking_mhu.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_non_blocking_tl_stage.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_link_to_cache_non_blocking.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_vcache_non_blocking.v VSOURCES += $(BSG_MANYCORE_DIR)/imports/HardFloat/source/fNToRecFN.v VSOURCES += $(BSG_MANYCORE_DIR)/imports/HardFloat/source/compareRecFN.v @@ -154,6 +159,7 @@ VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/fpu_float_aux.v VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/fpu_int.v VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/fpu_int_fclass.v VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/fcsr.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/mcsr.v VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/fpu_fdiv_fsqrt.v VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/fpu_fmin_fmax.v VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/icache.v @@ -164,19 +170,25 @@ VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/regfile.v VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/regfile_synth.v VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/regfile_hard.v VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/scoreboard.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/hash_function.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_top_ruche.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_top_mesh.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_tile_ruche.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_tile_mesh.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_pod_ruche_array.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_pod_ruche.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_pod_ruche_row.v +VSOURCES += 
$(BSG_MANYCORE_DIR)/v/bsg_manycore_tile_compute_array_ruche.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_tile_compute_ruche.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_tile_vcache_array.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_tile_vcache.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_cache_dma_to_wormhole.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_hetero_socket.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_mesh_node.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_endpoint.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_endpoint_standard.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_lock_ctrl.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_reg_id_decode.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_1hold.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_eva_to_npa.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_dram_hash_function.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_link_sif_tieoff.v VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_ruche_x_link_sif_tieoff.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_manycore_gather_scatter.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_ruche_buffer.v +VSOURCES += $(BSG_MANYCORE_DIR)/v/bsg_ruche_anti_buffer.v diff --git a/machines/pod_1x1/Makefile.machine.include b/machines/pod_1x1/Makefile.machine.include new file mode 100644 index 000000000..eb774b4e8 --- /dev/null +++ b/machines/pod_1x1/Makefile.machine.include @@ -0,0 +1,38 @@ +BSG_MACHINE_PODS_X = 1 +BSG_MACHINE_PODS_Y = 1 + +BSG_MACHINE_GLOBAL_X = 16 +BSG_MACHINE_GLOBAL_Y = 8 + +BSG_MACHINE_X_CORD_WIDTH = 7 +BSG_MACHINE_Y_CORD_WIDTH = 7 + +BSG_MACHINE_RUCHE_FACTOR_X = 3 + +BSG_MACHINE_NUM_VCACHE_ROWS = 1 +BSG_MACHINE_VCACHE_SET = 64 +BSG_MACHINE_VCACHE_WAY = 4 +BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 +BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 +BSG_MACHINE_NUM_VCACHES_PER_CHANNEL = 16 + + +BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 +BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 16777216 +BSG_MACHINE_DRAM_INCLUDED = 1 + +BSG_MACHINE_MAX_EPA_WIDTH = 28 
+BSG_MACHINE_BRANCH_TRACE_EN = 0 +BSG_MACHINE_HETERO_TYPE_VEC = default:0 + +BSG_MACHINE_ORIGIN_Y_CORD = 8 +BSG_MACHINE_ORIGIN_X_CORD = 16 + +BSG_MACHINE_HOST_Y_CORD = 0 +BSG_MACHINE_HOST_X_CORD = 16 + +BSG_MACHINE_MEM_CFG = e_vcache_test_mem +BSG_MACHINE_DRAMSIM3_PKG = bsg_dramsim3_hbm2_8gb_x128_pkg + +BSG_MACHINE_SUBARRAY_X = 1 +BSG_MACHINE_SUBARRAY_Y = 1 diff --git a/machines/pod_1x1_hbm2/Makefile.machine.include b/machines/pod_1x1_hbm2/Makefile.machine.include new file mode 100644 index 000000000..f49fefda7 --- /dev/null +++ b/machines/pod_1x1_hbm2/Makefile.machine.include @@ -0,0 +1,37 @@ +BSG_MACHINE_PODS_X = 1 +BSG_MACHINE_PODS_Y = 1 + +BSG_MACHINE_GLOBAL_X = 16 +BSG_MACHINE_GLOBAL_Y = 8 + +BSG_MACHINE_X_CORD_WIDTH = 7 +BSG_MACHINE_Y_CORD_WIDTH = 7 + +BSG_MACHINE_RUCHE_FACTOR_X = 3 + +BSG_MACHINE_NUM_VCACHE_ROWS = 2 +BSG_MACHINE_VCACHE_WAY = 4 +BSG_MACHINE_VCACHE_SET = 64 +BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 16 +BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 64 +BSG_MACHINE_NUM_VCACHES_PER_CHANNEL = 32 + +BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 +BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 16777216 +BSG_MACHINE_DRAM_INCLUDED = 1 + +BSG_MACHINE_MAX_EPA_WIDTH = 28 +BSG_MACHINE_BRANCH_TRACE_EN = 0 +BSG_MACHINE_HETERO_TYPE_VEC = default:0 + +BSG_MACHINE_ORIGIN_Y_CORD = 8 +BSG_MACHINE_ORIGIN_X_CORD = 16 + +BSG_MACHINE_HOST_Y_CORD = 0 +BSG_MACHINE_HOST_X_CORD = 16 + +BSG_MACHINE_MEM_CFG = e_vcache_hbm2 +BSG_MACHINE_DRAMSIM3_PKG = bsg_dramsim3_hbm2_8gb_x128_pkg + +BSG_MACHINE_SUBARRAY_X = 1 +BSG_MACHINE_SUBARRAY_Y = 1 diff --git a/machines/pod_4x4/Makefile.machine.include b/machines/pod_4x4/Makefile.machine.include new file mode 100644 index 000000000..8703f531f --- /dev/null +++ b/machines/pod_4x4/Makefile.machine.include @@ -0,0 +1,40 @@ +BSG_MACHINE_PODS_X = 4 +BSG_MACHINE_PODS_Y = 4 + +BSG_MACHINE_GLOBAL_X = 16 +BSG_MACHINE_GLOBAL_Y = 8 + +BSG_MACHINE_X_CORD_WIDTH = 7 +BSG_MACHINE_Y_CORD_WIDTH = 7 + +BSG_MACHINE_RUCHE_FACTOR_X = 3 + +BSG_MACHINE_NUM_VCACHE_ROWS = 1 
+BSG_MACHINE_VCACHE_SET = 64 +BSG_MACHINE_VCACHE_WAY = 4 +BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 8 +BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 32 +BSG_MACHINE_NUM_VCACHES_PER_CHANNEL = 16 + + +BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 +BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 16777216 +BSG_MACHINE_DRAM_INCLUDED = 1 + +BSG_MACHINE_MAX_EPA_WIDTH = 28 +BSG_MACHINE_BRANCH_TRACE_EN = 0 +BSG_MACHINE_HETERO_TYPE_VEC = default:0 + +# possible origin Y = 8, 24,40,56 +# possible origin X = 16,32,48,64 +BSG_MACHINE_ORIGIN_Y_CORD = 8 +BSG_MACHINE_ORIGIN_X_CORD = 16 + +BSG_MACHINE_HOST_Y_CORD = 0 +BSG_MACHINE_HOST_X_CORD = 16 + +BSG_MACHINE_MEM_CFG = e_vcache_test_mem +BSG_MACHINE_DRAMSIM3_PKG = bsg_dramsim3_hbm2_8gb_x128_pkg + +BSG_MACHINE_SUBARRAY_X = 1 +BSG_MACHINE_SUBARRAY_Y = 1 diff --git a/machines/pod_4x4_hbm2/Makefile.machine.include b/machines/pod_4x4_hbm2/Makefile.machine.include new file mode 100644 index 000000000..89fe0af29 --- /dev/null +++ b/machines/pod_4x4_hbm2/Makefile.machine.include @@ -0,0 +1,39 @@ +BSG_MACHINE_PODS_X = 4 +BSG_MACHINE_PODS_Y = 4 + +BSG_MACHINE_GLOBAL_X = 16 +BSG_MACHINE_GLOBAL_Y = 8 + +BSG_MACHINE_X_CORD_WIDTH = 7 +BSG_MACHINE_Y_CORD_WIDTH = 7 + +BSG_MACHINE_RUCHE_FACTOR_X = 3 + +BSG_MACHINE_NUM_VCACHE_ROWS = 2 +BSG_MACHINE_VCACHE_WAY = 4 +BSG_MACHINE_VCACHE_SET = 64 +BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = 16 +BSG_MACHINE_VCACHE_DMA_DATA_WIDTH = 64 +BSG_MACHINE_NUM_VCACHES_PER_CHANNEL = 32 + +BSG_MACHINE_DRAM_SIZE_WORDS = 536870912 +BSG_MACHINE_DRAM_BANK_SIZE_WORDS = 16777216 +BSG_MACHINE_DRAM_INCLUDED = 1 + +BSG_MACHINE_MAX_EPA_WIDTH = 28 +BSG_MACHINE_BRANCH_TRACE_EN = 0 +BSG_MACHINE_HETERO_TYPE_VEC = default:0 + +# possible origin Y = 8, 24,40,56 +# possible origin X = 16,32,48,64 +BSG_MACHINE_ORIGIN_Y_CORD = 8 +BSG_MACHINE_ORIGIN_X_CORD = 16 + +BSG_MACHINE_HOST_Y_CORD = 0 +BSG_MACHINE_HOST_X_CORD = 16 + +BSG_MACHINE_MEM_CFG = e_vcache_hbm2 +BSG_MACHINE_DRAMSIM3_PKG = bsg_dramsim3_hbm2_8gb_x128_pkg + +BSG_MACHINE_SUBARRAY_X = 1 +BSG_MACHINE_SUBARRAY_Y = 1 
diff --git a/machines/sim_filelist.mk b/machines/sim_filelist.mk index 88db60fa5..e39e00219 100644 --- a/machines/sim_filelist.mk +++ b/machines/sim_filelist.mk @@ -1,8 +1,6 @@ # This file contains a list of non-synthesizable files used in manycore # simulation. These augment the sythesizable files in core.include. -VINCLUDES += $(BASEJUMP_STL_DIR)/testing/bsg_dmc/lpddr_verilog_model - VHEADERS += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_manycore_mem_cfg_pkg.v VHEADERS += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_manycore_network_cfg_pkg.v VHEADERS += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_manycore_profile_pkg.v @@ -19,22 +17,11 @@ VSOURCES += $(BASEJUMP_STL_DIR)/bsg_dataflow/bsg_serial_in_parallel_out_full.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_dataflow/bsg_round_robin_1_to_n.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_dataflow/bsg_one_fifo.v -VSOURCES += $(BASEJUMP_STL_DIR)/testing/bsg_dmc/lpddr_verilog_model/mobile_ddr.v - -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_tag/bsg_tag_pkg.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_dmc/bsg_dmc_pkg.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_dmc/bsg_dmc.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_dmc/bsg_dmc_controller.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_dmc/bsg_dmc_phy.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_to_dram_ctrl.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_to_dram_ctrl_rx.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_to_dram_ctrl_tx.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_fsb/bsg_fsb_node_trace_replay.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_tag/bsg_tag_trace_replay.v +VSOURCES += $(BASEJUMP_STL_DIR)/bsg_tag/bsg_tag_master.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/hash_function_reverse.v -VSOURCES += $(BSG_MANYCORE_DIR)/v/vanilla_bean/bsg_cache_to_axi_hashed.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_to_axi_rx.v -VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_to_axi_tx.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_mem/bsg_nonsynth_mem_1r1w_sync_mask_write_byte_dma.v VSOURCES 
+= $(BASEJUMP_STL_DIR)/bsg_mem/bsg_nonsynth_mem_1r1w_sync_dma.v @@ -57,6 +44,7 @@ CSOURCES += $(BASEJUMP_STL_DIR)/imports/DRAMSim3/src/memory_system.cc CSOURCES += $(BASEJUMP_STL_DIR)/imports/DRAMSim3/src/refresh.cc CSOURCES += $(BASEJUMP_STL_DIR)/imports/DRAMSim3/src/simple_stats.cc CSOURCES += $(BASEJUMP_STL_DIR)/imports/DRAMSim3/src/timing.cc +CSOURCES += $(BASEJUMP_STL_DIR)/imports/DRAMSim3/src/blood_graph.cc VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_to_test_dram.v VSOURCES += $(BASEJUMP_STL_DIR)/bsg_cache/bsg_cache_to_test_dram_tx.v @@ -71,9 +59,14 @@ VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/vanilla_core_profiler.v VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/vcache_profiler.v VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/vcache_non_blocking_profiler.v VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/infinite_mem_profiler.v +VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_nonsynth_manycore_tag_master.v VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_nonsynth_manycore_io_complex.v VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_nonsynth_manycore_spmd_loader.v VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_nonsynth_manycore_monitor.v +VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_nonsynth_wormhole_test_mem.v +VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_manycore_vcache_wh_to_cache_dma.v +VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/bsg_nonsynth_manycore_testbench.v +VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/vcache_dma_to_dram_channel_map.v VSOURCES += $(BSG_MANYCORE_DIR)/testbenches/common/v/spmd_testbench.v diff --git a/software/bsg_manycore_lib/bsg_barrier.h b/software/bsg_manycore_lib/bsg_barrier.h index 15b0f9b80..ca6f17c9a 100644 --- a/software/bsg_manycore_lib/bsg_barrier.h +++ b/software/bsg_manycore_lib/bsg_barrier.h @@ -49,7 +49,7 @@ static inline void bsg_barrier_wait( bsg_barrier * p_local_barrier, int barrie 
//------------------------------------------------------------------ static inline void bsg_barrier_wait( bsg_barrier * p_local_barrier, int barrier_x_cord, int barrier_y_cord ){ - +#warning bsg_barrier is a very slow barrier; use the tilegroup barrier instead bsg_barrier * p_remote_barrier = (bsg_barrier *) bsg_remote_ptr( barrier_x_cord, \ barrier_y_cord, \ p_local_barrier); diff --git a/software/bsg_manycore_lib/bsg_manycore.h b/software/bsg_manycore_lib/bsg_manycore.h index 0202ba431..95f885672 100644 --- a/software/bsg_manycore_lib/bsg_manycore.h +++ b/software/bsg_manycore_lib/bsg_manycore.h @@ -13,6 +13,10 @@ int bsg_printf(const char *fmt, ...); } #endif + + + +// remote pointer types typedef volatile int *bsg_remote_int_ptr; typedef volatile float *bsg_remote_float_ptr; typedef volatile unsigned char *bsg_remote_uint8_ptr; @@ -20,8 +24,6 @@ typedef volatile unsigned short *bsg_remote_uint16_ptr; typedef volatile unsigned *bsg_remote_uint32_ptr; typedef volatile void *bsg_remote_void_ptr; - - #define bsg_remote_flt_store(x,y,local_addr,val) do { *(bsg_remote_flt_ptr((x),(y),(local_addr))) = (float) (val); } while (0) #define bsg_remote_flt_load(x,y,local_addr,val) do { val = *(bsg_remote_flt_ptr((x),(y),(local_addr))) ; } while (0) @@ -34,6 +36,9 @@ typedef volatile void *bsg_remote_void_ptr; #define bsg_global_float_store(x,y,local_addr,val) do { *(bsg_global_float_ptr((x),(y),(local_addr))) = (float) (val); } while (0) #define bsg_global_float_load(x,y,local_addr,val) do { val = *(bsg_global_float_ptr((x),(y),(local_addr))) ; } while (0) +#define bsg_global_pod_store(px,py,x,y,local_addr,val) do { *(bsg_global_pod_ptr(px,py,(x),(y),(local_addr))) = (int) (val); } while (0) +#define bsg_global_pod_load(px,py,x,y,local_addr,val) do { val = *(bsg_global_pod_ptr(px,py,(x),(y),(local_addr))) ; } while (0) + #define bsg_dram_store(dram_addr,val) do { *(bsg_dram_ptr((dram_addr))) = (int) (val); } while (0) #define bsg_dram_load(dram_addr,val) do { val = 
*(bsg_dram_ptr((dram_addr))) ; } while (0) @@ -118,6 +123,12 @@ inline int bsg_lr_aq(int *p) { int tmp; __asm__ __volatile__("lr.w.aq %0,%1\n" : #elif defined(__GNUC__) || defined(__GNUG__) inline int bsg_lr(int *p) { int tmp; __asm__ __volatile__("lr.w %0,%1\n" : "=r" (tmp) : "A" (*p)); return tmp; } inline int bsg_lr_aq(int *p) { int tmp; __asm__ __volatile__("lr.w.aq %0,%1\n" : "=r" (tmp) : "A" (*p)); return tmp; } + +inline int bsg_li(int constant_val) { int result; asm("li %0, %1" : "=r"(result) : "i"(constant_val)); return result; } +inline int bsg_div(int a, int b) { int result; __asm__ __volatile__("divu %0,%1,%2" : "=r"(result) : "r" (a), "r" (b)); return result; } +inline int bsg_mulu(int a, int b) { int result; __asm__ __volatile__("mul %0,%1,%2" : "=r"(result) : "r" (a), "r" (b)); return result; } + + #else #error Unsupported Compiler! #endif @@ -217,6 +228,11 @@ inline void bsg_fence() { __asm__ __volatile__("fence" :::); } #define BSG_CUDA_PRINT_STAT_X_MASK ((1 << BSG_CUDA_PRINT_STAT_X_WIDTH) - 1) // 0x3F #define BSG_CUDA_PRINT_STAT_Y_MASK ((1 << BSG_CUDA_PRINT_STAT_Y_WIDTH) - 1) // 0x3F +//Macros for triggering saif generation +#define bsg_saif_start() asm volatile ("addi zero,zero,1") + +#define bsg_saif_end() asm volatile ("addi zero,zero,2") + #define bsg_print_stat(tag) do { bsg_remote_int_ptr ptr = bsg_remote_ptr_io(IO_X_INDEX,0xd0c); *ptr = tag; } while (0) diff --git a/software/bsg_manycore_lib/bsg_manycore.hpp b/software/bsg_manycore_lib/bsg_manycore.hpp index 4d8ceda2c..f2141685a 100644 --- a/software/bsg_manycore_lib/bsg_manycore.hpp +++ b/software/bsg_manycore_lib/bsg_manycore.hpp @@ -16,9 +16,9 @@ */ template T *bsg_tile_group_remote_pointer(unsigned char x, unsigned char y, T* local_addr) { - uintptr_t remote_prefix = (REMOTE_EPA_PREFIX << REMOTE_EPA_MASK_SHIFTS); - uintptr_t y_bits = ((y) << Y_CORD_SHIFTS); - uintptr_t x_bits = ((x) << X_CORD_SHIFTS); + uintptr_t remote_prefix = (1 << REMOTE_PREFIX_SHIFT); + uintptr_t y_bits = ((y) << 
REMOTE_Y_CORD_SHIFT); + uintptr_t x_bits = ((x) << REMOTE_X_CORD_SHIFT); uintptr_t local_bits = reinterpret_cast(local_addr); return reinterpret_cast(remote_prefix | y_bits | x_bits | local_bits); } diff --git a/software/bsg_manycore_lib/bsg_manycore_arch.h b/software/bsg_manycore_lib/bsg_manycore_arch.h index b90bb186c..acdd89f17 100644 --- a/software/bsg_manycore_lib/bsg_manycore_arch.h +++ b/software/bsg_manycore_lib/bsg_manycore_arch.h @@ -2,21 +2,7 @@ #define _BSG_MANYCORE_ARCH_H //------------------------------------------------------ -// 0. basic SoC definitaion -//------------------------------------------------------ -#define IO_X_INDEX (0) -#define IO_Y_INDEX (1) -//in words. -#define EPA_ADDR_BITS 18 - -// The CSR Addr configurations -// bsg_manycore/v/parameters.vh for definition in RTL -#define CSR_BASE_ADDR (1<< (EPA_ADDR_BITS-1)) -#define CSR_FREEZE 0x0 -#define CSR_TGO_X 0x4 -#define CSR_TGO_Y 0x8 -//------------------------------------------------------ -// 1. X/Y dimention setting/Checking. +// X/Y dimension setting/Checking.
//------------------------------------------------------ #ifndef bsg_global_X #error bsg_global_X must be defined @@ -34,82 +20,100 @@ #error bsg_tiles_Y must be defined #endif -#if ( bsg_tiles_Y + 1 ) > (bsg_global_Y ) -#error bsg_tiles_Y must 1 less than bsg_global_Y +#if ( bsg_tiles_Y ) > (bsg_global_Y ) +#error bsg_tiles_Y is greater than bsg_global_Y #endif + //------------------------------------------------------ -// 2.Tile Address Mapping Configuation +// Basic EVA Format Definition //------------------------------------------------------ -#define MAX_X_CORD_BITS 6 -#define MAX_Y_CORD_BITS 6 +// Global EPA = {01YY_YYYY_YXXX_XXXX_PPPP_PPPP_PPPP_PPPP} +// Remote EPA = {001Y_YYYY_XXXX_XXPP_PPPP_PPPP_PPPP_PPPP} +// DRAM Addr = {1PPP_PPPP_PPPP_PPPP_PPPP_PPPP_PPPP_PPPP} +//------------------------------------------------------ +#define GLOBAL_EPA_WIDTH 16 +#define GLOBAL_X_CORD_WIDTH 7 +#define GLOBAL_Y_CORD_WIDTH 7 +#define GLOBAL_X_CORD_SHIFT (GLOBAL_EPA_WIDTH) +#define GLOBAL_Y_CORD_SHIFT (GLOBAL_X_CORD_SHIFT+GLOBAL_X_CORD_WIDTH) +#define GLOBAL_PREFIX_SHIFT (GLOBAL_Y_CORD_SHIFT+GLOBAL_Y_CORD_WIDTH) -#define X_CORD_SHIFTS (EPA_ADDR_BITS) -#define Y_CORD_SHIFTS (X_CORD_SHIFTS + MAX_X_CORD_BITS) +#define REMOTE_EPA_WIDTH 18 +#define REMOTE_X_CORD_WIDTH 6 +#define REMOTE_Y_CORD_WIDTH 5 +#define REMOTE_X_CORD_SHIFT (REMOTE_EPA_WIDTH) +#define REMOTE_Y_CORD_SHIFT (REMOTE_X_CORD_SHIFT+REMOTE_X_CORD_WIDTH) +#define REMOTE_PREFIX_SHIFT (REMOTE_Y_CORD_SHIFT+REMOTE_Y_CORD_WIDTH) -#define REMOTE_EPA_PREFIX 0x1 -#define GLOBAL_EPA_PREFIX 0x1 -#define REMOTE_EPA_MASK_BITS (32 - EPA_ADDR_BITS - MAX_X_CORD_BITS - MAX_Y_CORD_BITS) -#define REMOTE_EPA_MASK ((1< $*.nbf + $(bsg_tiles_X) $(bsg_tiles_Y) $(BSG_ELF_OFF_CHIP_MEM) \ + $(BSG_MACHINE_ORIGIN_X_CORD) $(BSG_MACHINE_ORIGIN_Y_CORD) \ + $(BSG_MACHINE_PODS_X) $(BSG_MACHINE_PODS_Y) \ + $(bsg_pods_X) $(bsg_pods_Y) \ + $(BSG_MACHINE_NUM_VCACHE_ROWS) > $*.nbf %.bin: %.hex diff --git a/software/mk/Makefile.dimensions 
b/software/mk/Makefile.dimensions index 58c5c2ce7..bf4825a67 100644 --- a/software/mk/Makefile.dimensions +++ b/software/mk/Makefile.dimensions @@ -1,19 +1,28 @@ +# number of pods to deploy the SPMD programs +bsg_pods_X ?= 1 +bsg_pods_Y ?= 1 +bsg_pods_size ?= $(call bsg-times-fn,$(bsg_pods_X),$(bsg_pods_Y)) -#$(warning testing $(call bsg-max-fn,2,3)) -#$(warning testing $(call bsg-minus-fn,3,2)) -#$(warning testing $(call bsg-min-fn,3,2)) -#$(warning testing $(call bsg-times-fn,3,2)) -#$(warning testing $(call bsg-plus-fn,3,2)) +# By default, one tile-group running each pod sends exactly one finish packet to the host. +# The host will wait for the total number of finish packets equal to the number of pods that has spmd programs launched. +NUM_FINISH ?= $(bsg_pods_size) +# tile-group origin +# relative to the top-left tile in a pod bsg_tiles_org_X ?=0 -bsg_tiles_org_Y ?=2 +bsg_tiles_org_Y ?=0 ifdef BSG_MACHINE_GLOBAL_X -bsg_tiles_X ?= $(call bsg-min-fn,2,$(BSG_MACHINE_GLOBAL_X)) -bsg_tiles_Y ?= $(call bsg-min-fn,2,$(call bsg-minus-fn,$(BSG_MACHINE_GLOBAL_Y),1)) + +# tile group size +# By default, the tile groups size is the full array. 
+bsg_tiles_X ?= $(BSG_MACHINE_GLOBAL_X) +bsg_tiles_Y ?= $(BSG_MACHINE_GLOBAL_Y) bsg_group_size ?= $(call bsg-times-fn,$(bsg_tiles_X),$(bsg_tiles_Y)) + +# size of a pod bsg_global_X ?= $(BSG_MACHINE_GLOBAL_X) bsg_global_Y ?= $(BSG_MACHINE_GLOBAL_Y) diff --git a/software/mk/Makefile.llvminstall b/software/mk/Makefile.llvminstall index ef1109bf5..a28cc3b71 100644 --- a/software/mk/Makefile.llvminstall +++ b/software/mk/Makefile.llvminstall @@ -3,7 +3,14 @@ ifndef LLVM_DIR endif # devtoolset-8 -HOST_TOOLCHAIN ?= /opt/rh/devtoolset-8/root/usr/bin +GCCVERSION = $(shell gcc -dumpversion) +ifeq ($(shell expr $(GCCVERSION) \> 7), 1) +HOST_TOOLCHAIN ?= +$(info Using default GCCVERSION $(GCCVERSION)) +else +HOST_TOOLCHAIN ?= /opt/rh/devtoolset-8/root/usr/bin/ +$(info Default GCCVERSION $(GCCVERSION) is too old; using $(HOST_TOOLCHAIN)) +endif # We need cmake3. On older RHEL systems, cmake is version 2 and cmake3 # is version 3. On newer systems cmake is version3. Default to cmake3 @@ -36,8 +43,8 @@ llvm-install: && $(CMAKE) -G $(GENERATOR) -DCMAKE_BUILD_TYPE="Debug" \ -DLLVM_ENABLE_PROJECTS="clang" \ -DCMAKE_INSTALL_PREFIX="$(LLVM_DIR)/llvm-install" \ - -DCMAKE_C_COMPILER=$(HOST_TOOLCHAIN)/gcc \ - -DCMAKE_CXX_COMPILER=$(HOST_TOOLCHAIN)/g++ \ + -DCMAKE_C_COMPILER=$(HOST_TOOLCHAIN)gcc \ + -DCMAKE_CXX_COMPILER=$(HOST_TOOLCHAIN)g++ \ -DLLVM_TARGETS_TO_BUILD="X86;RISCV" \ -DBUILD_SHARED_LIBS=True \ -DLLVM_USE_SPLIT_DWARF=True \ diff --git a/software/mk/Makefile.paths b/software/mk/Makefile.paths index 3e53e0228..111d6102f 100644 --- a/software/mk/Makefile.paths +++ b/software/mk/Makefile.paths @@ -1,5 +1,4 @@ BSG_IP_CORES_DIR ?= $(BSG_MANYCORE_DIR)/../basejump_stl RISCV_BIN_DIR ?= $(BSG_MANYCORE_DIR)/software/riscv-tools/riscv-install/bin CAD_DIR ?= $(BSG_MANYCORE_DIR)/../bsg_cadenv -BSG_MACHINE_PATH ?= $(BSG_MANYCORE_DIR)/machines/8x4 - +BSG_MACHINE_PATH ?= $(BSG_MANYCORE_DIR)/machines/pod_1x1 diff --git a/software/mk/Makefile.tail_rules b/software/mk/Makefile.tail_rules index 
89a345c55..affcb2ea3 100644 --- a/software/mk/Makefile.tail_rules +++ b/software/mk/Makefile.tail_rules @@ -32,11 +32,17 @@ spike_call_log: pp_spike_trace $(BSG_MANYCORE_DIR)/software/py/func_call_log.py spike_pc.log $(PROG_NAME).riscv > $@ +# DRAMsim3 blood graph +dramsim3.%: blood_graph_%.log + python $(BSG_MANYCORE_DIR)/software/py/dramsim3_blood_graph.py $< dramsim3_$*.png + + + clean: -rm -rf *.o *.a *.jou *.log *.pelog *.pb bsg_manycore_io_complex_rom.v *.riscv *.wdb *.bin *.hex *.ld -rm -rf xsim.dir *.mem stack.info.* -rm -rf ./simv csrc simv.daidir ucli.key DVEfiles *.vpd *.vdb coverage* constfile.txt -rm -rf build/ *.bc* *.ll* - -rm -rf lfs.c *.nbf *.csv - -rm -rf dramsim3epoch.json + -rm -rf lfs.c *.nbf *.csv *.png + -rm -rf dramsim3epoch.json pc_stats dramsim3.json dramsim3.txt dramsim3.tag.json -rm -rf dis diff --git a/software/mk/Makefile.verilog b/software/mk/Makefile.verilog index ac0001691..77198cad9 100644 --- a/software/mk/Makefile.verilog +++ b/software/mk/Makefile.verilog @@ -36,7 +36,11 @@ endif # Simulator setup ifeq ($(WAVE),0) +ifeq ($(TRACE),0) BSG_SIM_EXE = $(BSG_MACHINE_PATH)/simv +else + BSG_SIM_EXE = $(BSG_MACHINE_PATH)/simv-profile +endif else BSG_SIM_EXE = $(BSG_MACHINE_PATH)/simv-debug endif @@ -46,6 +50,7 @@ NBF_FILE = $(PROG_NAME).nbf BSG_SIM_OPTS = +nbf_file=$(NBF_FILE) \ +max_cycle=$(MAX_CYCLES) \ + +num_finish=$(NUM_FINISH) \ +vanilla_trace_en=$(TRACE) \ -licqueue -reportstats \ -l run.log @@ -63,7 +68,9 @@ ifeq ($(abspath $(BSG_MACHINE_PATH)/..),$(abspath $(BSG_MANYCORE_DIR)/machines)) $(MAKE) -C $(BSG_MACHINE_PATH)/.. $(BSG_SIM_EXE) endif $(BSG_SIM_EXE) $(BSG_SIM_OPTS) - +ifeq ($(TRACE),1) + PYTHONPATH=$(BSG_MANYCORE_DIR)/software/py/vanilla_parser/.. 
python3 -m vanilla_parser --generate-key +endif ################################################### # Need to move below to machines directory diff --git a/software/py/bsg_manycore_link_gen.py b/software/py/bsg_manycore_link_gen.py index e7cc4b2db..42de3b6f6 100755 --- a/software/py/bsg_manycore_link_gen.py +++ b/software/py/bsg_manycore_link_gen.py @@ -135,7 +135,7 @@ def script(self): # LMA (Load Memory Address) => NPA used by loader # VMA (Virtual Memory Address) => Logical address used by linker for # symbol resolutions - _DMEM_VMA_START = 0x1000 + _DMEM_VMA_START = 0x0000 _DMEM_VMA_SIZE = 0x1000 _DRAM_T_LMA_START = 0x80000000 _DRAM_T_LMA_SIZE = self._imem_size @@ -154,7 +154,8 @@ def script(self): # Format: # : [] ['.text.dram' , ['.crtbegin','.text','.text.startup','.text.*']], - ['.dmem' , ['.dmem','.dmem.*']], + # bsg-tommy: 8 bytes are allocated in.dmem.interrupt for interrupt handler to spill registers. + ['.dmem' , ['.dmem.interrupt', '.dmem','.dmem.*']], ['.data' , ['.data','.data*']], ['.sdata' , ['.sdata','.sdata.*','.sdata*','.sdata*.*' '.gnu.linkonce.s.*']], @@ -174,7 +175,7 @@ def script(self): # DMEM sections for i, m in enumerate(section_map): sec = m[0] - laddr = '0x1000' + laddr = "0x" + "{:0X}".format(_DMEM_VMA_START) in_sections = m[1] # Place objects into dmem if default data loc is dmem diff --git a/software/py/dramsim3_blood_graph.py b/software/py/dramsim3_blood_graph.py new file mode 100644 index 000000000..76a013c07 --- /dev/null +++ b/software/py/dramsim3_blood_graph.py @@ -0,0 +1,73 @@ +# +# dramsim3_blood_graph.py +# +# how to use +# python dramsim3_blood_graph.py {input.log} {output.png} +# +# Author: Tommy + + + +import sys +import csv +from PIL import Image, ImageDraw + +class BloodGraph: + + palette = { + "act" : (0xff,0xff,0x00), ## yellow = activate + "pre" : (0xff,0xa5,0x00), ## orange = precharge + "rd" : (0x00,0xff,0x00), ## green = read + "wr" : (0x00,0x88,0x00), ## dark green = write + "nop" : (0xff,0xaa,0xff), ## pink = no 
request in this bank + "conf" : (0xff,0x00,0x00), ## red = there is a row hit, but can't access due to various timing constraints (tWTR, tCCD_S, etc) + "closed" : (0x80,0x00,0x80), ## purple = there is a request in this bank, but the row is closed. + "ref" : (0x60,0x60,0x60), ## gray = refresh + "arb" : (0x00,0xff,0xff), ## cyan = there is a row hit, but other bank is accessing (arbitrated) + "row_miss" : (0xff,0x00,0xff) ## fuchsia = there is a request but row miss. + } + + def generate(self, input_file, output_file): + traces = [] + with open(input_file) as f: + csv_reader = csv.DictReader(f, delimiter=",") + for row in csv_reader: + trace = {} + trace["time"] = int(row["time"]) + trace["bank"] = int(row["bank"]) + trace["state"] = row["state"] + traces.append(trace) + + self.__get_stats(traces) + self.__init_image() + for trace in traces: + self.__mark_trace(trace) + self.img.save(output_file) + return + + def __get_stats(self, traces): + banks = list(map(lambda t: t["bank"], traces)) + times = list(map(lambda t: t["time"], traces)) + self.num_banks = 1+max(banks) + self.end_time = max(times) + return + + def __init_image(self): + self.img_width = 3900//2 + self.img_height = ((self.end_time+self.img_width)//self.img_width)*(2+self.num_banks) + self.img = Image.new("RGB", (self.img_width, self.img_height), "black") + self.pixel = self.img.load() + return + + def __mark_trace(self, trace): + col = trace["time"] % self.img_width + floor = trace["time"] // self.img_width + row = floor*(2+self.num_banks) + trace["bank"] + self.pixel[col,row] = self.palette[trace["state"]] + + +if __name__ == "__main__": + input_file = sys.argv[1] + output_file = sys.argv[2] + blood_graph = BloodGraph() + blood_graph.generate(input_file, output_file) diff --git a/software/py/nbf.py b/software/py/nbf.py index fffef6080..a382bfbba 100644 --- a/software/py/nbf.py +++ b/software/py/nbf.py @@ -3,21 +3,21 @@ # # ELF (.riscv) to Network Boot Format (.nbf) # - +# When there is a EVA to NPA 
mapping change in bsg_manycore_eva_to_npa.v, +# this file should also be updated accordingly. +# +# https://github.com/bespoke-silicon-group/bsg_manycore/blob/master/v/bsg_manycore_eva_to_npa.v +# +# import sys import math import os import subprocess -# -# NBF -# ################################ # EPA Constants -DMEM_BASE_EPA = 0x400 - ICACHE_BASE_EPA = 0x400000 CSR_BASE = 0x8000 @@ -51,7 +51,16 @@ def __init__(self, config): self.cache_block_size = config["cache_block_size"] self.dram_size = config["dram_size"] self.addr_width = config["addr_width"] - + self.origin_x_cord = config["origin_x_cord"] + self.origin_y_cord = config["origin_y_cord"] + # physical number of pods + self.machine_pods_x = config["machine_pods_x"] + self.machine_pods_y = config["machine_pods_y"] + # number of pods to launch program. + self.num_pods_x = config["num_pods_x"] + self.num_pods_y = config["num_pods_y"] + self.num_vcache_rows = config["num_vcache_rows"] + # software setting self.tgo_x = config["tgo_x"] self.tgo_y = config["tgo_y"] @@ -59,12 +68,14 @@ def __init__(self, config): self.tg_dim_y = config["tg_dim_y"] self.enable_dram = config["enable_dram"] + # derived params self.cache_size = self.cache_way * self.cache_set * self.cache_block_size # in words self.x_cord_width = self.safe_clog2(self.num_tiles_x) # process riscv self.get_data_end_addr() + self.get_start_addr() self.read_dmem() self.read_dram() @@ -162,6 +173,22 @@ def get_data_end_addr(self): if words[2] == "_bsg_data_end_addr": self.bsg_data_end_addr = (int(words[0]) >> 2) # make it word address + + # grab address for _start symbol. + # code earlier than that contains interrupt handler. + # we want to set the tile pc_init val to this address. 
+ # if _start is not found, which might be possible for some spmd assembly tests, just return 0 + def get_start_addr(self): + proc = subprocess.Popen(["nm", "--radix=d", self.riscv_file], stdout=subprocess.PIPE) + lines = proc.stdout.readlines() + self.start_addr = 0 + for line in lines: + stripped = line.strip() + words = stripped.split() + if words[2] == "_start": + self.start_addr = (int(words[0]) >> 2) # make it word address + + def select_bits(self, num, start, end): retval = 0 @@ -177,21 +204,55 @@ def select_bits(self, num, start, end): # set TGO x,y - def config_tile_group(self): + def config_tile_group(self, pod_origin_x, pod_origin_y): for x in range(self.tg_dim_x): for y in range(self.tg_dim_y): - x_eff = self.tgo_x + x - y_eff = self.tgo_y + y + x_eff = self.tgo_x + x + pod_origin_x + y_eff = self.tgo_y + y + pod_origin_y self.print_nbf(x_eff, y_eff, CSR_TGO_X, self.tgo_x) self.print_nbf(x_eff, y_eff, CSR_TGO_Y, self.tgo_y) - + + # initialize vcache wh dest cord + def init_vcache_wh_dest(self, pod_origin_x, pod_origin_y, px): + # top two MSBs are 1. + epa = 0b11 << (self.addr_width-2) + + # if there is only one pod in x-direction, split the pod in half. + if self.machine_pods_x == 1: + for r in range(self.num_vcache_rows): + for x in range(self.num_tiles_x): + east_not_west = 0 if x < (self.num_tiles_x/2) else 1 + # north vcache + x_eff = pod_origin_x + x + y_eff = pod_origin_y - 1 - r + self.print_nbf(x_eff, y_eff, epa, east_not_west) + # south vcache + x_eff = pod_origin_x + x + y_eff = pod_origin_y + self.num_tiles_y + r + self.print_nbf(x_eff, y_eff, epa, east_not_west) + # if there are more than one pod, then the left half of pods goes to west, and the right half to east. 
+ else: + east_not_west = 0 if (px < (self.machine_pods_x/2)) else 1 + for r in range(self.num_vcache_rows): + for x in range(self.num_tiles_x): + # north vcache + x_eff = pod_origin_x + x + y_eff = pod_origin_y - 1 - r + self.print_nbf(x_eff, y_eff, epa, east_not_west) + # south vcache + x_eff = pod_origin_x + x + y_eff = pod_origin_y + self.num_tiles_y + r + self.print_nbf(x_eff, y_eff, epa, east_not_west) + + + # initialize icache - def init_icache(self): + def init_icache(self, pod_origin_x, pod_origin_y): for x in range(self.tg_dim_x): for y in range(self.tg_dim_y): - x_eff = self.tgo_x + x - y_eff = self.tgo_y + y + x_eff = self.tgo_x + x + pod_origin_x + y_eff = self.tgo_y + y + pod_origin_y for k in sorted(self.dram_data.keys()): addr = k - 0x20000000 if addr < self.icache_size: @@ -200,7 +261,7 @@ def init_icache(self): # initialize dmem - def init_dmem(self): + def init_dmem(self, pod_origin_x, pod_origin_y): # if there is nothing in dmem, just return. if len(self.dmem_data.keys()) == 0: return @@ -208,10 +269,10 @@ def init_dmem(self): for x in range(self.tg_dim_x): for y in range(self.tg_dim_y): - x_eff = self.tgo_x + x - y_eff = self.tgo_y + y + x_eff = self.tgo_x + x + pod_origin_x + y_eff = self.tgo_y + y + pod_origin_y - for k in range(1024, self.bsg_data_end_addr): + for k in range(self.bsg_data_end_addr): if k in self.dmem_data.keys(): self.print_nbf(x_eff, y_eff, k, self.dmem_data[k]) else: @@ -219,16 +280,16 @@ def init_dmem(self): # disable dram mode - def disable_dram(self): + def disable_dram(self, pod_origin_x, pod_origin_y): for x in range(self.tg_dim_x): for y in range(self.tg_dim_y): - x_eff = self.tgo_x + x - y_eff = self.tgo_y + y + x_eff = self.tgo_x + x + pod_origin_x + y_eff = self.tgo_y + y + pod_origin_y self.print_nbf(x_eff, y_eff, CSR_ENABLE_DRAM, 0) # initialize vcache in no DRAM mode - def init_vcache(self): + def init_vcache(self, pod_origin_x, pod_origin_y): t_shift = self.safe_clog2(self.cache_block_size) @@ -236,33 +297,36 
@@ def init_vcache(self): for t in range(self.cache_way * self.cache_set): epa = (t << t_shift) | (1 << (self.addr_width-1)) data = (1 << (self.data_width-1)) | (t / self.cache_set) - self.print_nbf(x, 0, epa, data) - self.print_nbf(x, self.num_tiles_y+1, epa, data) + # top vcache + self.print_nbf(x+pod_origin_x, pod_origin_y-1, epa, data) + # bot vcache + self.print_nbf(x+pod_origin_x, pod_origin_y+self.num_tiles_y, epa, data) # init DRAM - def init_dram(self, enable_dram): + def init_dram(self, pod_origin_x, pod_origin_y): cache_size = self.cache_size lg_x = self.safe_clog2(self.num_tiles_x) lg_block_size = self.safe_clog2(self.cache_block_size) lg_set = self.safe_clog2(self.cache_set) - index_width = self.addr_width-1-lg_block_size-1 + lg_y = self.safe_clog2(2*self.num_vcache_rows) + index_width = 32-1-2-lg_block_size-lg_x-lg_y - if enable_dram == 1: + if self.enable_dram == 1: # dram enabled: # EVA space is striped across top and bottom vcaches. if self.num_tiles_x & (self.num_tiles_x-1) == 0: # hashing for power of 2 banks for k in sorted(self.dram_data.keys()): addr = k - 0x20000000 - x = self.select_bits(addr, lg_block_size, lg_block_size + lg_x - 1) - y = self.select_bits(addr, lg_block_size + lg_x, lg_block_size + lg_x) - index = self.select_bits(addr, lg_block_size+lg_x+1, lg_block_size+lg_x+1+index_width-1) + x = self.select_bits(addr, lg_block_size, lg_block_size + lg_x - 1) + pod_origin_x + y = self.select_bits(addr, lg_block_size + lg_x, lg_block_size + lg_x + lg_y-1) + index = self.select_bits(addr, lg_block_size+lg_x+lg_y, lg_block_size+lg_x+lg_y+index_width-1) epa = self.select_bits(addr, 0, lg_block_size-1) | (index << lg_block_size) - if y == 0: - self.print_nbf(x, 0, epa, self.dram_data[k]) #top + if y % 2 == 0: + self.print_nbf(x, pod_origin_y-1-(y/2), epa, self.dram_data[k]) #top else: - self.print_nbf(x, self.num_tiles_y+1, epa, self.dram_data[k]) #bot + self.print_nbf(x, pod_origin_y+self.num_tiles_y+(y/2), epa, self.dram_data[k]) #bot else: 
print("hash function not supported for x={0}.") sys.exit() @@ -271,21 +335,25 @@ def init_dram(self, enable_dram): # using vcache as block mem for k in sorted(self.dram_data.keys()): addr = k - 0x20000000 - x = addr / cache_size + x = (addr / cache_size) epa = addr % cache_size if (x < self.num_tiles_x): - self.print_nbf(x, 0, epa, self.dram_data[k]) + x_eff = x + pod_origin_x + y_eff = pod_origin_y -1 + self.print_nbf(x_eff, y_eff, epa, self.dram_data[k]) elif (x < self.num_tiles_x*2): - self.print_nbf(x, self.num_tiles_y+1, epa, self.dram_data[k]) + x_eff = (x % self.num_tiles_x) + pod_origin_x + y_eff = pod_origin_y + self.num_tiles_y + self.print_nbf(x_eff, y_eff, epa, self.dram_data[k]) else: print("## WARNING: NO DRAM MODE, DRAM DATA OUT OF RANGE!!!") # unfreeze tiles - def unfreeze_tiles(self): - tgo_x = self.tgo_x - tgo_y = self.tgo_y + def unfreeze_tiles(self, pod_origin_x, pod_origin_y): + tgo_x = self.tgo_x + pod_origin_x + tgo_y = self.tgo_y + pod_origin_y for y in range(self.tg_dim_y): for x in range(self.tg_dim_x): @@ -293,6 +361,22 @@ def unfreeze_tiles(self): y_eff = tgo_y + y self.print_nbf(x_eff, y_eff, CSR_FREEZE, 0) + + # set pc_init_val. + # if _start is not 0, then set the pc_init_val. + def set_pc_init_val(self, pod_origin_x, pod_origin_y): + if self.start_addr == 0: + return + tgo_x = self.tgo_x + pod_origin_x + tgo_y = self.tgo_y + pod_origin_y + + for y in range(self.tg_dim_y): + for x in range(self.tg_dim_x): + x_eff = tgo_x + x + y_eff = tgo_y + y + self.print_nbf(x_eff, y_eff, CSR_PC_INIT, self.start_addr) + + # print finish # when spmd loader sees, this it stops sending packets. def print_finish(self): @@ -304,22 +388,42 @@ def fence(self): self.print_nbf(0xff, 0xff, 0x0, 0x0) + + ##### LOADER ROUTINES END ##### + # public main function # users only have to call this function. 
def dump(self): - self.config_tile_group() - self.init_icache() - self.init_dmem() + # initialize all pods + for px in range(self.num_pods_x): + for py in range(self.num_pods_y): + pod_origin_x = self.origin_x_cord + (px*self.num_tiles_x) + pod_origin_y = self.origin_y_cord + (py*2*self.num_tiles_y) + self.config_tile_group(pod_origin_x, pod_origin_y) + self.init_icache(pod_origin_x, pod_origin_y) + self.init_dmem(pod_origin_x, pod_origin_y) + self.set_pc_init_val(pod_origin_x, pod_origin_y) + self.init_vcache_wh_dest(pod_origin_x, pod_origin_y, px) + + if self.enable_dram != 1: + self.disable_dram(pod_origin_x, pod_origin_y) + self.init_vcache(pod_origin_x, pod_origin_y) - if self.enable_dram != 1: - self.disable_dram() - self.init_vcache() + self.init_dram(pod_origin_x, pod_origin_y) - self.init_dram(self.enable_dram) + + # wait for all store credits to return. self.fence() - self.unfreeze_tiles() + # unfreeze all pods + for px in range(self.num_pods_x): + for py in range(self.num_pods_y): + pod_origin_x = self.origin_x_cord + (px*self.num_tiles_x) + pod_origin_y = self.origin_y_cord + (py*2*self.num_tiles_y) + self.unfreeze_tiles(pod_origin_x, pod_origin_y) + + # print finish nbf. 
self.print_finish() @@ -348,7 +452,7 @@ def dump(self): # if __name__ == "__main__": - if len(sys.argv) == 14: + if len(sys.argv) == 21: # config setting config = { "riscv_file" : sys.argv[1], @@ -365,6 +469,13 @@ def dump(self): "tg_dim_x" : int(sys.argv[11]), "tg_dim_y" : int(sys.argv[12]), "enable_dram" : int(sys.argv[13]), + "origin_x_cord" : int(sys.argv[14]), + "origin_y_cord" : int(sys.argv[15]), + "machine_pods_x" : int(sys.argv[16]), + "machine_pods_y" : int(sys.argv[17]), + "num_pods_x" : int(sys.argv[18]), + "num_pods_y" : int(sys.argv[19]), + "num_vcache_rows" : int(sys.argv[20]) } converter = NBF(config) @@ -373,7 +484,11 @@ def dump(self): print("USAGE:") command = "python nbf.py {program.riscv} " command += "{num_tiles_x} {num_tiles_y} " - command += "{cache_way} {cache_set} {cache_block_size} {dram_size} {max_epa_width}" - command += "{tgo_x} {tgo_y} {tg_dim_x} {tg_dim_y} {enable_dram}" + command += "{cache_way} {cache_set} {cache_block_size} {dram_size} {max_epa_width} " + command += "{tgo_x} {tgo_y} {tg_dim_x} {tg_dim_y} {enable_dram} " + command += "{origin_x_cord} {origin_y_cord} " + command += "{machine_pods_x} {machine_pods_y} " + command += "{num_pods_x} {num_pods_y} " + command += "{num_vcache_rows}" print(command) diff --git a/software/py/vanilla_parser/stats_parser.py b/software/py/vanilla_parser/stats_parser.py index 279e6ec30..04e78bb7e 100644 --- a/software/py/vanilla_parser/stats_parser.py +++ b/software/py/vanilla_parser/stats_parser.py @@ -23,14 +23,26 @@ import sys import argparse +import functools import os import re import csv import numpy as np + +# Pandas must be at least version 1.0.0 and tabulate must be installed +# This is not uncommon, but do this check to provide a better error message +import pandas as pd +import tabulate +try: + pd.DataFrame.to_markdown +except: + raise RuntimeError("Pandas version is not sufficient. Upgrade pandas to > 1.0.0") + from enum import Enum from collections import Counter from . 
import common + # CudaStatTag class # Is instantiated by a packet tag value that is recieved from a # bsg_cuda_print_stat(tag) insruction @@ -93,9 +105,22 @@ def tag(self): """ Get the tag associated with this object """ return ((self.__s >> self._TAG_INDEX) & self._TAG_MASK) + @property + def getTag(self): + """ Get the tag associated with this object """ + if(self.__type == self.StatType.KERNEL_START or + self.__type == self.StatType.KERNEL_END): + return "Kernel" + return ((self.__s >> self._TAG_INDEX) & self._TAG_MASK) + @property def tg_id(self): - """ Get the Tile-Group IP associated with this object """ + """ Get the Tile-Group ID associated with this object """ + return ((self.__s >> self._TG_ID_INDEX) & self._TG_ID_MASK) + + @property + def getTileGroupID(self): + """ Get the Tile-Group ID associated with this object """ return ((self.__s >> self._TG_ID_INDEX) & self._TG_ID_MASK) @property @@ -108,6 +133,11 @@ def y(self): """ Get the Y Coordinate associated with this object """ return ((self.__s >> self._Y_INDEX) & self._Y_MASK) + @property + def getAction(self): + """ Get the Action that this object defines""" + return "Start" if self.__type in {self.StatType.KERNEL_START, self.StatType.START} else "End" + @property def statType(self): """ Get the StatType that this object defines""" @@ -137,7 +167,761 @@ def isKernelEnd(self): bsg_cuda_print_stat_kernel_end """ return (self.__type == self.StatType.KERNEL_END) - +# Create the ManycoreCoordinate class, a surprisingly useful wrapper +# for a tuple. Access the y and x fields using var.y and var.x +from collections import namedtuple +ManycoreCoordinate = namedtuple('ManycoreCoordinate', ['y', 'x']) + +# The challenge in the victim cache parser is order of +# iterations. Simply described: +# +# Each time a tile executes a start/end call with a +# particular, it is an iteration of that tag. +# +# 0. Tiles iterate and can call start/end multiple times +# 1. 
Calls to start/end define an iteration order for that tile +# 2. Packets from a single tile arrive at the host in tile iteration order +# 3. Packets from multiple tiles arrive interleaved +# 4. The tile iteration number at the host is not necessarily monotonic +# (That is - the tiles are not necessarily executing the same iteration) +# +# An iteration-consistent order must be reconstructed +# so that start/end calls do not count operations +# outside of the window defined by their tag. +# +# The following lines enumerate an iteration order for +# start/end calls from each tile. +class CacheStatsParser: + + # The field_is_* methods return true if a field from the CSV + # is of the type requested. Use these for filtering operations + @classmethod + def field_is_stall(cls, op): + return op.startswith("stall_") + + @classmethod + def field_is_dma(cls, op): + return op.startswith("dma_") + + @classmethod + def field_is_miss(cls, op): + return op.startswith("miss_") + + @classmethod + def field_is_mgmt(cls, op): + return (op.startswith("instr_tag") + or (op.startswith("instr_a") + and not cls.field_is_amo(op))) + + @classmethod + def field_is_load(cls, op): + return op.startswith("instr_ld") + + @classmethod + def field_is_store(cls, op): + return op.startswith("instr_s") + + @classmethod + def field_is_amo(cls, op): + return op.startswith("instr_amo") + + @classmethod + def field_is_event_counter(cls, op): + return (cls.field_is_dma(op) or op == "total_dma" + or cls.field_is_miss(op) or op == "total_miss") + + @classmethod + def field_is_cycle_counter(cls, op): + return (cls.field_is_stall(op) or op == "total_stalls" + or cls.field_is_load(op) or op == "total_loads" + or cls.field_is_store(op) or op == "total_stores" + or cls.field_is_amo(op) or op == "total_atomics" + or cls.field_is_mgmt(op) or op == "total_mgmt" + or op == "global_ctr") + + # Parse the raw tag column into Tag, Action, and Tile Coordinate columns + @classmethod + def parse_raw_tag(cls, df): + # 
Parse raw_tag data using CudaStatTag + cst = df.raw_tag.map(CudaStatTag) + + p = pd.DataFrame() + # Update the table with information parsed from CudaStatTag + p["Tile Group ID"] = cst.map(lambda e: e.getTileGroupID) + p["Tag"] = cst.map(lambda e: e.getTag) + p["Action"] = cst.map(lambda e: e.getAction) + p["Tile Coordinate (Y,X)"] = cst.map(lambda e: ManycoreCoordinate(e.y, e.x)) + + return p + + # Get group-level characteristics: dim, origin + @classmethod + def parse_group_characteristics(cls, df): + dim = ManycoreCoordinate( + df["Tile Coordinate (Y,X)"].map(lambda l: l.y).max() + 1, + df["Tile Coordinate (Y,X)"].map(lambda l: l.x).max() + 1) + + origin = ManycoreCoordinate( + df["Tile Coordinate (Y,X)"].map(lambda l: l.y).min(), + df["Tile Coordinate (Y,X)"].map(lambda l: l.x).min()) + + return (dim, origin) + + # Map the vcache column (which contains indexes, embedded in + # strings) into ManycoreCoordinate objects, and return them as a + # new column + @classmethod + def parse_cache_coordinates(cls, df): + cache_names = df.vcache.unique() + ncaches = len(cache_names) + + if(ncaches % 2): + raise RuntimeError("Number of caches must be a multiple of two " + f"Got {ncaches}.") + + # The CSV contains a string representing the cache's + # path in the hierarchy, not the (Y,X) location, so we + # map the string to a (Top/Bottom,X) coordinate and create a + # new column in the table. + cache_ys = ["Top"] * (ncaches//2) + ["Bottom"] * (ncaches//2) + cache_xs = [*range(ncaches//2), *range(ncaches//2)] + cache_coords = zip(cache_ys, cache_xs) + cache_coord_map = {c:i for c, i in zip(cache_names,cache_coords)} + return df.vcache.map(cache_coord_map) + + # Label each line in the dataframe with the iteration + # number for that tile and tag. + # + # The way this is done is by hierarchical grouping in + # Pandas. We can think of each tile's iterations as a + # group of lines in the csv file, and we need to label + # each line with its iteration. 
+ @classmethod + def parse_label_iterations(cls, df): + # We create a hierarchy with the following + # levels (top to bottom): + hierarchy = ["Action", "Tag", "Tile Coordinate (Y,X)", "Cache Coordinate (Y,X)"] + + # At the bottom, the "Cache Coordinate (Y,X)" group + # will have n entries, where n is the number of times + # that action was called, with that tag, by the + # particular tile. + # + # Enumerating the n rows, produces a Tile-Tag + # Iteration number for each row. + + # We group the data hierarchically, as described above + groups = df.groupby(hierarchy) + + # Then we enumerate the iterations using cumcount(). + iterations = groups.cumcount() + + return iterations + + def __init__(self, vcache_input_file): + d = pd.read_csv(vcache_input_file) + + # Fail if the metadata is not in the header. + meta = ["vcache", "tag"] + if(not all(f in d.columns.values for f in meta)): + raise RuntimeError("Metadata fields not in header of CSV") + + # Rename from tag, to raw_tag to avoid confusion. "tag" in + # this context is the unparsed data from the packet. + d = d.rename(columns={"tag": "raw_tag"}) + + # Rename columns with totals to avoid confusion + d = d.rename(columns={"instr_ld": "total_loads", + "instr_st": "total_stores", + "instr_atomic": "total_atomics" + }) + + # Compute Stall, Mgmt, DMA, and Miss totals. These are not + # computed in Verilog, but Stores, Atomics, and load + # operations are. 
+ + # Parse out the operations we care about + header = d.columns.values + self._mgmt = [f for f in header if self.field_is_mgmt(f)] + + self._stalls = [f for f in header if self.field_is_stall(f)] + + self._misses = [f for f in header if self.field_is_miss(f)] + self._dmas = [f for f in header if self.field_is_dma(f)] + + d['total_stalls'] = d[self._stalls].sum(axis="columns") + d['total_mgmt'] = d[self._mgmt].sum(axis="columns") + d['total_miss'] = d[self._misses].sum(axis="columns") + d['total_dma'] = d[self._dmas].sum(axis="columns") + + + # Parse raw tag data into Action, Tag, and Tile Coordinate (Y,X), + # and Tile Group Columns + d = pd.concat([d, self.parse_raw_tag(d)], axis='columns') + + # Use the vcache column (which contains indexes, embedded in + # strings) into ManycoreCoordinate objects, and put them in a + # new column + d["Cache Coordinate (Y,X)"] = self.parse_cache_coordinates(d) + + # Create a column with the Tile-Tag iterations (see comment) + # All of the magic happens here. + d["Tile-Tag Iteration"] = self.parse_label_iterations(d) + + # Drop the columns that no longer contain useful data. + d = d.drop(["raw_tag", "vcache", "time"], axis="columns") + + # Parse the aggregate stats (for the device) + self.agg = AggregateCacheStats(d) + # Parse the aggregate stats (for each group) + self.group = GroupCacheStats(d) + + # Finally, save d and parse the Tag, Bank, and Group data + self.d = d.copy(); + +# Cache Stats is the parent class for CacheTagStats, CacheBankStats, +# It contains reusable functionality, but doesn't actually do any +# parsing or computation. 
+class CacheStats: + def __init__(self, name, df): + self._name = name + self._df = df.copy() + + header = df.columns.values + + # Classify operations in the header + self._loads = [f for f in header if CacheStatsParser.field_is_load(f)] + self._stores = [f for f in header if CacheStatsParser.field_is_store(f)] + self._atomics = [f for f in header if CacheStatsParser.field_is_amo(f)] + self._mgmt = [f for f in header if CacheStatsParser.field_is_mgmt(f)] + + self._stalls = [f for f in header if CacheStatsParser.field_is_stall(f)] + + self._misses = [f for f in header if CacheStatsParser.field_is_miss(f)] + self._dmas = [f for f in header if CacheStatsParser.field_is_dma(f)] + + self._ops = [*self._mgmt, *self._atomics, *self._stores, *self._loads] + + # Create a dictionary mapping operation to operation type + # global_ctr is just a cycle counter + self._op_type_map = dict({*[(l,"Load") for l in [*self._loads, "total_loads"]], + *[(s,"Store") for s in [*self._stores, "total_stores"]], + *[(t,"Management") for t in [*self._mgmt, "total_mgmt"]], + *[(a,"Atomic") for a in [*self._atomics, "total_atomics"]], + *[(s,"Stall") for s in [*self._stalls, "total_stalls"]], + *[(m,"Miss") for m in [*self._misses, "total_miss"]], + *[(d,"DMA") for d in [*self._dmas, "total_dma"]], + ("global_ctr", "Cycles") + }) + + # Find mismatched calls to start/end + # + # Returns mismatches, a MultiIndex containing the list of + # mismatches. + def find_mismatches(self, s, e): + # Subtracting the start and end dataframes will match + # groups. If a group in the start or end dataframe is + # missing a row/index then it will insert a row of + # NaNs at the corresponding index in the output that + # we can use to print an error. + diff = e.sort_index() - s.sort_index() + + # Find rows with NaNs + mismatches = diff[diff.isnull().any(axis="columns")].index + + # If mismatches is not empty, then there is a row of + # NaNs, described above. 
+ return list(mismatches) + + # Sort the tags so that "Kernel" is last, followed by the + # tags in sorted order. + @classmethod + def _sort_tags(cls, tags): + tags = list(tags) + + if "Kernel" in tags: + tags.remove("Kernel") + tags.sort() + tags = tags + ["Kernel"] + else: + tags.sort() + + return tags + + def __str__(self): + return self._name + + # Create a string of length l with n (name) centered in the + # middle, and padded by c (characters) + @classmethod + def _fill(cls, n, l, c): + if(len(c) != 1): + raise ValueError("Argument 'c' must be a character") + l -= len(n) + lpre = l // 2 + lpost = (l + 1) // 2 + s = (c * lpre) + n + (c * lpost) + return s + + # Create section separator, of length l with name n + @classmethod + def _make_sec_sep(cls, n, l): + return cls._fill(" " + n + " ", l, "#") + + # Create tag separator, of length l with name tag + @classmethod + def _make_tag_sep(cls, tag, l): + t = f" Tag: {tag} " + return cls._fill(t, l, "=") + + # Create sub-separator, of length l with name n + @classmethod + def _make_sub_sep(cls, n, l): + return cls._fill(n, l, "-") + + + +class CacheTagStats(CacheStats): + + def __init__(self, name, df): + super().__init__(name, df) + # Tag statistics are aggregated across banks. We use Tile-Tag + # Iteration at the bottom of the hierarchy so that lines that + # were printed by the same packet, i.e. different cache banks, + # are grouped together and can be aggregated. + # + # Then, for per-tag statistics we take the sum of the lowest + # group, to get aggregate the counters across all banks. + hierarchy = ["Action", "Tag", "Tile Coordinate (Y,X)", "Tile-Tag Iteration"] + banksums = df.groupby(hierarchy).sum() + + # Split into Start/End + starts = banksums.loc["Start"] + ends = banksums.loc["End"] + + # Find mismatched start and end pairs + mismatches = self.find_mismatches(starts, ends) + if(list(mismatches)): + raise RuntimeError("Unpaired calls to Start/End detected." 
+ f" Check the following: {tuple(mismatches)}:" + f"{list(mismatches)}") + + # For all tag iterations, find the minimum arrival time + # for the start packet for that iteration + + # We will use groups again to do this. Group together + # matching iterations at the bottom of the hierarchy, + # and find the earliest arrived packet (for Starts), + # and the latest arrived packet (for Ends). + + # The getmin/getmax functions below are general. They + # will do what is described above. BUT, they are slow, + # and unnecessary + + # getmin = lambda df: df.loc[df["global_ctr"].idxmin()] + # tag_starts = tag_starts.groupby(["tag", "Tile-Tag Iteration"]).apply(getmin) + # bank_starts = bank_starts.groupby(["tag", "Cache Coordinate (Y,X)", "Tile-Tag Iteration"]).apply(getmin) + + # Instead: Groupby maintains the relative order of rows within + # a group, and the rows in the table were already in order + # because they were already printed in order! + + # We use first() to get the first row, which + # is also the earliest. No need to search for min/max + # if an O(1) operation exists! + starts = starts.groupby(["Tag", "Tile-Tag Iteration"]).first() + + # Same as above. Slow, unnecessary: + # getmax = lambda df: df.loc[df["global_ctr"].idxmax()] + # tag_ends = tag_ends.groupby(["tag", "Tile-Tag Iteration"]).apply(getmax) + # bank_ends = bank_ends.groupby(["tag", "Cache Coordinate (Y,X)", "Tile-Tag Iteration"]).apply(getmax) + + # As above, groupby maintains order within groups so + # we can just use last(). + ends = ends.groupby(["Tag", "Tile-Tag Iteration"]).last() + + # Finally, subtract all ends from starts and sum. + results = (ends - starts).groupby("Tag").sum() + + # Save the result + self.df = results + + + # Parse the results into a pretty table + def __prettify(self, df): + doc = "" + # Transpose so that columns are tags. 
Then we can easily see sums + pretty = df.T + + # Sort into Events and Cycles + counter_map = dict({*[(e,"Event") for e in pretty.index + if CacheStatsParser.field_is_event_counter(e)], + *[(c, "Cycle") for c in pretty.index + if CacheStatsParser.field_is_cycle_counter(c)]}) + pretty["Counter Type"] = pretty.index.map(counter_map) + + # Classify operations by type + pretty['Operation Type'] = pretty.index.map(self._op_type_map) + + # Rename the rows that contain totals to "Total" + istotal = lambda op: op.startswith("total") or op == "global_ctr" + totals_map = {op:"Total" for op in pretty.index.values if istotal(op)} + pretty = pretty.rename(mapper=totals_map) + + # Re-index the table. This creates a hierarchical table where + # operations are grouped by Counter type (Event, or Cycle) and + # Operation type (e.g. Atomic). + pretty['Name'] = pretty.index + pretty = pretty.set_index(["Counter Type", "Operation Type", "Name"]) + pretty = pretty.sort_index(level=[0, 1, 2], ascending=[True, True, False]) + + # Sort the columns so that "Kernel" is last. 
+ srtd = self._sort_tags(pretty.columns) + + pretty = pretty.reindex(srtd, axis=1) + + doc += "Table Rows:\n" + doc += "\tLoad Operations:\n" + doc += "\t\t-instr_ld_l[wu,w,hu,h,du,d,bu,b]: Load [w]ord/[h]alf/[b]yte/[d]ouble [u]nsigned/[]signed\n" + doc += "\tStore Operations:\n" + doc += "\t\t-instr_sm_s[w,h,d,b]: Store [w]ord/[h]alf/[b]yte/[d]ouble\n" + doc += "\tCache Management Operations:\n" + doc += "\t\t-instr_tagst: Tag Store (Not caused by Vanilla Core)\n" + doc += "\t\t-instr_tagfl: Tag Flush (Not caused by Vanilla Core)\n" + doc += "\t\t-instr_taglv: Tag Load Valid (Not caused by Vanilla Core)\n" + doc += "\t\t-instr_tagla: Tag Load Address (Not caused by Vanilla Core)\n" + doc += "\t\t-instr_afl: Address Flush (Not caused by Vanilla Core)\n" + doc += "\t\t-instr_aflinv: Address Flush Invalidate (Not caused by Vanilla Core)\n" + doc += "\t\t-instr_ainv: Address Invalidate (Not caused by Vanilla Core)\n" + doc += "\t\t-instr_alock: Address Lock (Not caused by Vanilla Core)\n" + doc += "\t\t-instr_aunlock: Address Unlock (Not caused by Vanilla Core)\n" + doc += "\t RISC-V Atomic Operations:\n" + doc += "\t\t-instr_amoswap: Atomic Swap\n" + doc += "\t\t-instr_amoor: Atomic OR\n" + doc += "\t Cache Stall Operations:\n" + doc += "\t\t-stall_miss: Miss Operation (Stall)\n" + doc += "\t\t-stall_idle: Idle Operation (Stall)\n" + doc += "\t\t-stall_rsp: Response Network Congestion Stall\n" + doc += "\n" + doc += " *** All operations take one cycle. *** \n" + + return (pretty, doc) + + + # Compute the breakdowns for an operation type. + # Compute both intra group percentage, and total + @classmethod + def __cycle_breakdown(cls, tot_cyc, ds): + # Construct a new dataframe (the input is a series) + df = pd.DataFrame() + df["Count"] = ds + # Compute breakdowns. For anything that is NaN, just report 0 + df["% of Type Cycles"] = (100 * ds / ds.loc[:,"Total"]).fillna(0) + df["% of Total Cycles"] = 100 * ds / tot_cyc + return df + + # Formatting method for table index. 
+ @classmethod + def __index_tostr(cls, i): + if i[0] == "Cycles": + s =f"{i[1]} Cycles" + return s + if i[1] == "Total": + s =f"{i[0]} Operation {i[1]}" + return s + "\n" + "-" * len(s) + else: + return f"--{i[1]}" + + # Format the cycles table (operations) + @classmethod + def __cycle_tostr(cls, df): + # Create columns for Type, and Group percentages + tot_cyc = df.loc[("Cycles", "Total")] + f = functools.partial(cls.__cycle_breakdown, tot_cyc) + df = df.groupby(level=[0]).apply(f) + + # Format the final table... + + # Reorder the index + order = ["Load", "Store", "Atomic", "Management", "Stall", "Cycles"] + df = df.loc[(order),:] + + # Then prettify the by applying index_tostr + i = list(df.index.map(cls.__index_tostr)) + + # Specify the float precision + fmt = [".0f", ".0f", ".2f", ".2f"] + + # Finally, format the table with the pretty index + s = df.to_markdown(tablefmt="simple", floatfmt=fmt, index=i, numalign="right") + + return s + + # Format the events table (misses) + @classmethod + def __event_tostr(cls, ds, ld, st, atom): + # Construct a pretty dataframe to print + df = pd.DataFrame() + + # We only care about misses, so throw away DMA operations + ds = ds.loc["Miss"] + + # Create a column for miss counts + df["Misses"] = ds + + # Set up a "Type" column, to use as a new index, replacing the + # one from the CSV + df["Type"] = ds.index.map({"miss_st": "Stores", + "miss_ld": "Loads", + "miss_amo": "Atomics", + "Total": "Total"}) + + # Set up a column for access counts + df["Accesses"] = pd.Series(index = ds.index.values, + data = [st, ld, atom, atom + ld + st]) + + # Compute the miss rate by dividing the misses by the accesses + # Nans are expected -- 0/0. 
Just turn them into 0's + df["Miss Rate (%)"] = 100 * (df["Misses"] / df["Accesses"]).fillna(0) + + # Set index to the type + df = df.set_index(["Type"]) + + # Set the format for floats + fmt = [".0f"] * 3 + [".2f"] + s = df.to_markdown(tablefmt="simple", floatfmt=fmt, numalign="right") + return s + + # Get a pretty formatted table representation for a tag + @classmethod + def __tag_tostr(cls, df): + # Get load and store totals for miss statistics + ld_total = df.loc[("Cycle", ["Load"], "Total")][0] + st_total = df.loc[("Cycle", ["Store"], "Total")][0] + at_total = df.loc[("Cycle", ["Atomic"], "Total")][0] + + counts = cls.__cycle_tostr(df.loc["Cycle"]) + "\n" + l = len(counts.splitlines()[0]) + + events = cls.__event_tostr(df.loc["Event"], ld_total, st_total, at_total) + + s = "" + s += ("Operation Cycle Counts" + " " * l)[:l] + "\n" + s += cls._make_sub_sep("", l) + "\n" + s += counts + s += cls._make_sub_sep("", l) + "\n" + + # TODO: Bandwidth Utilization would go here: + s += "\n" + s += cls._make_sub_sep("", l) + "\n" + s += ("Miss Statistics" + " " * l)[:l] + "\n" + s += cls._make_sub_sep("", l) + "\n" + s += events + s += "\n" + s += cls._make_sub_sep("", l) + "\n" + + return s + + # Get a pretty formatted table representation for all tags + @classmethod + def __tostr(cls, df): + s = "" + for tag in df.columns: + tab= cls.__tag_tostr(df[tag]) + l = tab.splitlines()[0] + + s += cls._make_tag_sep(tag, len(l)) + s += "\n" + s += tab + s += "\n" + s += "\n" + return s + + # Define a string representation for Bank Statistics. 
+ # Returns a pretty table and the doc header + def __str__(self): + # Get name, and add spaces + n = super().__str__() + n = " " + n + " " + + # Get pretty dataframe, and documentation + df, doc = self.__prettify(self.df) + + # Get string-formatted table for all tags + tab = self.__tostr(df) + + # Get horizontal width of table + w = len(tab.splitlines()[0]) + + # Build separators + sep = self._make_sec_sep(n, w) + "\n" + end = self._make_sec_sep("End " + n, w) + "\n" + return sep + doc + tab + +class CacheBankStats(CacheStats): + # This class is highly similar to CacheTagStats. Detailed comments + # are in that class. + def __init__(self, name, df): + super().__init__(name, df) + + # Create a table where we'll compute the per-bank tagsums. Do + # not take the sum, because we are not aggregating here. + hierarchy = ["Action", "Tag", "Cache Coordinate (Y,X)", "Tile-Tag Iteration"] + banks = df.set_index(hierarchy) + + # Split into Start/End + starts = banks.loc["Start"] + ends = banks.loc["End"] + + # Find mismatched start and end pairs + mismatches = self.find_mismatches(starts, ends) + if(list(mismatches)): + raise RuntimeError("Unpaired calls to Start/End detected." + f" Check the following: {tuple(mismatches)}:" + f"{list(mismatches)}") + + # For all tag iterations, find the minimum arrival time + # for the start packet for that iteration + starts = starts.groupby(["Tag", "Cache Coordinate (Y,X)", "Tile-Tag Iteration"]).first() + + # As above, groupby maintains order within groups so + # we can just use last(). 
+ ends = ends.groupby(["Tag", "Cache Coordinate (Y,X)", "Tile-Tag Iteration"]).last() + + # Same for tags, except keep the cache coordinates + results = (ends - starts).groupby(["Tag", "Cache Coordinate (Y,X)"]).sum() + + # Save the result + self.df = results + + + # Parse the results into a pretty table + def __prettify(self, df): + + pretty = pd.DataFrame() + doc = "" + ops = self.df[self._ops].sum(axis="columns") + + # Compute pretty table + + # Fill nans as 0's where this is expected behaviour (i.e. 0/0) but leave infs. + doc += "Table Fields: \n" + + doc += "\t- Cache Coordinate (Y,X): Cache Coordinate within HammerBlade Pod\n" + + doc += "\t- Total Cycles: Total Cache Execution Cycles\n" + pretty["Total Cycles"] = self.df["global_ctr"] + + doc += "\t- # Misses: Total Number of Cache Misses\n" + pretty["# Misses"] = self.df["total_miss"] + + doc += "\t- Operations: Total Number of Cache Operations (Loads + Stores + Atomics + Management)\n" + pretty["# Operations"] = ops + + doc += "\t- Miss Rate: 100 * (Number of Misses / Number of Ops)\n" + pretty["Miss Rate (%)"] = 100 * (self.df["total_miss"] / ops) + + doc += "\t- Memory Access Latency: Average Memory Access Latency for Misses (Total Miss Cycles / Number of Misses)\n" + pretty["Mem. 
Latency"] = (self.df["stall_miss"] / self.df["total_miss"]).fillna(0) + + doc += "\t- Percent Miss Cycles: 100 * (Total Miss Cycles / Total Cycles)\n" + pretty["Percent Miss Cycles"] = 100 *(self.df["stall_miss"] / self.df["global_ctr"]) + + doc += "\t- Percent Idle Cycles: 100 * (Total Idle Cycles / Total Cycles)\n" + pretty["Percent Idle Cycles"] = 100 *(self.df["stall_idle"] / self.df["global_ctr"]) + + doc += "\t- Percent Response Stall Cycles: 100 * (Total Response Stall Cycles / Total Cycles)\n" + pretty["Percent Stall Cycles"] = 100 *(self.df["stall_rsp"] / self.df["global_ctr"]) + + doc += "\t- Percent Operations Cycles: 100 * (Total Operation Cycles / Total Cycles)\n" + pretty["Percent Ops."] = 100 * (ops / self.df["global_ctr"]) + + doc += "\n" + doc += "Note: inf (Infinite) occurs when a tag window captures miss stall cycles that bleed into its window, but has no misses" + + doc += "\n" + + return (pretty, doc) + + # Get a pretty formatted table representation for a tag + @classmethod + def __tag_tostr(cls, df): + # Dictate the format of floats to two decimal + # points. Everything else should be an integer. This isn't + # clean, but effective, and the only way + fmt = [".0f"] * 4 + [".2f"] * (len(df.columns) -3) + s = df.to_markdown(tablefmt="simple", floatfmt=fmt, numalign="right") + return s + + # Get a pretty formatted table representation for all tags + @classmethod + def __tostr(cls, df): + s = "" + for tag, sub in df.groupby(level=[0]): + tab = cls.__tag_tostr(sub.loc[tag]) + l = tab.splitlines()[0] + + s += cls._make_tag_sep(tag, len(l)) + s += "\n" + s += tab + s += "\n" + s += "\n" + return s + + # Define a string representation for Bank Statistics. 
+ # Returns a pretty table and the doc header + def __str__(self): + # Get name + n = super().__str__() + + # Get pretty dataframe, and documentation + df, doc = self.__prettify(self.df) + + # Get string-formatted table + tab = self.__tostr(df) + + # Get horizontal width of table + w = len(tab.splitlines()[0]) + + # Build separators + sep = self._make_sec_sep(n, w) + "\n" + end = self._make_sec_sep("End " + n, w) + "\n" + return sep + doc + tab + end + + +# Aggregate cache statistics for a particular dataframe. Can be reused +# for the device, or for a particular tile group (via GroupCacheStats) +class AggregateCacheStats(): + def __init__(self, df): + # Create tables with data specific to the parser that will use it + # Per-Tag Cache Parsing doesn't care about Tile Group ID + tagdata = df.drop(["Tile Group ID"], axis="columns") + + # Per-Bank Cache Parsing doesn't care about Tile Group ID, or Tile Coordinate + bankdata = df.drop(["Tile Group ID", "Tile Coordinate (Y,X)"], axis="columns") + + self.tag = CacheTagStats("Per-Tag Victim Cache Stats", tagdata) + self.bank = CacheBankStats("Per-Bank Victim Cache Stats", bankdata) + + def __str__(self): + s = str(self.tag) + s += str(self.bank) + return s + +# Aggregate cache statistics for each tile group within a dataframe +class GroupCacheStats(): + def __init__(self, df): + self._agg = dict() + + # Group the dataframe by Tile Group ID and then parse that + # group + for i, grp in df.groupby(["Tile Group ID"]): + self._agg[i] = AggregateCacheStats(grp) + + def __getitem__(self, i): + return self._agg[i] + + class VanillaStatsParser: # formatting parameters for aligned printing type_fmt = {"name" : "{:<35}", @@ -158,9 +942,6 @@ class VanillaStatsParser: "tile_timing_data": type_fmt["cord"] + type_fmt["int"] + type_fmt["int"] + type_fmt["float"] + type_fmt["float"] + type_fmt["percent"] + type_fmt["percent"] + "\n", "timing_data" : type_fmt["name"] + type_fmt["int"] + type_fmt["int"] + type_fmt["float"] + 
type_fmt["percent"] + type_fmt["percent"] + "\n", - "vcache_timing_header": type_fmt["name"] + type_fmt["type"] + type_fmt["type"] + type_fmt["type"] + type_fmt["type"] + type_fmt["type"] + "\n", - "vcache_timing_data" : type_fmt["name"] + type_fmt["int"] + type_fmt["int"] + type_fmt["int"] + type_fmt["int"] + type_fmt["float"] + "\n", - "instr_header" : type_fmt["name"] + type_fmt["int"] + type_fmt["type"] + "\n", "instr_data" : type_fmt["name"] + type_fmt["int"] + type_fmt["percent"] + "\n", "instr_data_indt" : type_fmt["name_indt"] + type_fmt["int"] + type_fmt["percent"] + "\n", @@ -181,11 +962,10 @@ class VanillaStatsParser: # default constructor - def __init__(self, per_tile_stat, per_tile_group_stat, per_vcache_stat, vanilla_input_file, vcache_input_file): + def __init__(self, per_tile_stat, per_tile_group_stat, vanilla_input_file, vcache_input_file): self.per_tile_stat = per_tile_stat self.per_tile_group_stat = per_tile_group_stat - self.per_vcache_stat = per_vcache_stat self.vcache = True if vcache_input_file else False self.traces = [] @@ -262,7 +1042,9 @@ def __init__(self, per_tile_stat, per_tile_group_stat, per_vcache_stat, vanilla_ # Raise exception and exit if there are no traces if not self.traces: - raise IOError("No Vanilla Stats Found: Use bsg_cuda_print_stat_kernel_start/end to generate runtime statistics") + print("## Warning: No Vanilla Stats Found: you can use bsg_cuda_print_stat_kernel_start/end to generate runtime statistics"); + sys.exit(0) + #raise IOError("No Vanilla Stats Found: Use bsg_cuda_print_stat_kernel_start/end to generate runtime statistics") # Save the active tiles in a list @@ -275,56 +1057,11 @@ def __init__(self, per_tile_stat, per_tile_group_stat, per_vcache_stat, vanilla_ # Calculate total aggregate stats for manycore by summing up per_tile stat counts self.manycore_stat = self.__generate_manycore_stats_all(self.tile_stat, self.manycore_cycle_parallel_cnt) - - - # Generate VCache Stats # If vcache stats file is given as 
input, also generate vcache stats if (self.vcache): - # If the victim cache stats file is found - if os.path.exists(vcache_input_file): - # Parse vcache input file's header to generate a list of all types of operations - self.vcache_stats, self.vcache_instrs, self.vcache_flops, self.vcache_misses, self.vcache_stalls, self.vcache_bubbles = self.parse_header(vcache_input_file) - - # Create a list of all types of opertaions for iteration - self.vcache_all_ops = self.vcache_stats + self.vcache_instrs + self.vcache_misses + self.vcache_stalls + self.vcache_bubbles - - # Use sets to determine the active vcache banks (without duplicates) - active_vcaches = set() - - # Parse vcache stats file line by line, and append the trace line to traces list. - with open(vcache_input_file) as f: - csv_reader = csv.DictReader (f, delimiter=",") - for row in csv_reader: - # Vcache bank name is a string that contains the vcache bank number - # The vcache bank number is extracted separately from other stats - # and manually added - trace = {op:int(row[op]) for op in self.vcache_all_ops if op != 'vcache'} - vcache_name = row['vcache'] - vcache_bank = int (vcache_name[vcache_name.find("[")+1: vcache_name.find("]")]) - trace['vcache'] = vcache_bank - active_vcaches.add((vcache_bank)) - self.vcache_traces.append(trace) - - self.active_vcaches = list(active_vcaches) - self.active_vcaches.sort() - - # generate timing stats for each vcache bank - self.vcache_tile_group_stat, self.vcache_stat = self.__generate_vcache_stats(self.vcache_traces, self.active_vcaches) - - # Calculate total aggregate vcache stats for manycore by summing up per vcache bank stat counts - self.manycore_vcache_stat = self.__generate_manycore_vcache_stats_all(self.vcache_stat) - - # Calculate total aggregate vcache stats for each tile group in manycore by summing up per vcache bank stat counts - self.manycore_vcache_tile_group_stat = self.__generate_tile_group_vcache_stats_all(self.vcache_tile_group_stat) - - # Victim cache 
stats is optional, if it's not found we throw a warning and skip - # vcache stats generation, but do not hault the vanilla stats generation - else: - self.vcache = False - print("Warning: vcache stats file not found, skipping victim cache stats generation.") - - + self.vparser = CacheStatsParser(vcache_input_file) + return @@ -334,7 +1071,6 @@ def __print_stat(self, stat_file, stat_type, *argv): return - # print instruction count, stall count, execution cycles for the entire manycore for each tag def __print_manycore_stats_tag(self, stat_file): stat_file.write("Per-Tag Stats\n") @@ -538,66 +1274,6 @@ def __print_manycore_stats_tile_timing(self, stat_file, header, tiles, manycore_ return - - - # print execution timing for the entire manycoree per vcache bank for a certain tag - def __print_manycore_tag_stats_vcache_timing(self, stat_file, tag): - self.__print_stat(stat_file, "tag_separator", tag) - - for vcache in self.active_vcaches: - - hit_cnt = self.vcache_stat[tag][vcache]["instr_total"] - miss_cnt = self.vcache_stat[tag][vcache]["miss_total"] - stall_cnt = self.vcache_stat[tag][vcache]["stall_total"] - cycle_cnt = self.vcache_stat[tag][vcache]["global_ctr"] - utilization = np.float64(hit_cnt + miss_cnt) / cycle_cnt - - self.__print_stat(stat_file, "vcache_timing_data" - ,vcache - ,hit_cnt - ,miss_cnt - ,stall_cnt - ,cycle_cnt - ,utilization) - - - hit_cnt = self.manycore_vcache_stat[tag]["instr_total"] - miss_cnt = self.manycore_vcache_stat[tag]["miss_total"] - stall_cnt = self.manycore_vcache_stat[tag]["stall_total"] - cycle_cnt = self.manycore_vcache_stat[tag]["global_ctr"] - utilization = np.float64(hit_cnt + miss_cnt) / cycle_cnt - - self.__print_stat(stat_file, "vcache_timing_data" - ,"total" - ,hit_cnt - ,miss_cnt - ,stall_cnt - ,cycle_cnt - ,utilization) - return - - - # Prints manycore timing stats per vcache bank for all tags - def __print_manycore_stats_vcache_timing(self, stat_file): - stat_file.write("Per-Vcache-Bank Timing Stats\n") - 
self.__print_stat(stat_file, "vcache_timing_header" - ,"Vcache Bank No." - ,"Hit Requests" - ,"Miss Requests" - ,"Stall Cycles" - ,"Total Cycles" - ,"Utilization") - self.__print_stat(stat_file, "start_lbreak") - - for tag in self.manycore_vcache_stat.keys(): - if(self.manycore_vcache_stat[tag]["global_ctr"]): - self.__print_manycore_tag_stats_vcache_timing(stat_file, tag) - self.__print_stat(stat_file, "end_lbreak") - return - - - - # print timing stats for each tile group in a separate file # tg_id is tile group id def __print_per_tile_group_tag_stats_timing(self, stat_file, tg_id, tag, manycore_stat, tile_group_stat): @@ -1130,154 +1806,9 @@ def __print_stats_miss(self, stat_file, header, item, stat, misses): if(stat[tag][item]["global_ctr"]): self.__print_tag_stats_miss(stat_file, item, tag, stat, misses) self.__print_stat(stat_file, "end_lbreak") - return - - - - - - # print victim cache instruction stats for the entire manycore - def __print_manycore_vcache_tag_stats_instr(self, stat_file, tag): - self.__print_stat(stat_file, "tag_separator", tag) - - # Print instruction stats for manycore - for instr in self.vcache_instrs: - self.__print_stat(stat_file, "instr_data", instr, - self.manycore_vcache_stat[tag][instr] - ,(100 * self.manycore_vcache_stat[tag][instr] / self.manycore_vcache_stat[tag]["instr_total"])) - return - - - # Prints victim cahe manycore instruction stats for all tags - def __print_manycore_vcache_stats_instr(self, stat_file): - stat_file.write("Vcache Per-Tag Instruction Stats\n") - self.__print_stat(stat_file, "instr_header", "Instruction", "Count", "% of Instructions") - self.__print_stat(stat_file, "start_lbreak") - for tag in self.manycore_vcache_stat.keys(): - if(self.manycore_vcache_stat[tag]["global_ctr"]): - self.__print_manycore_vcache_tag_stats_instr(stat_file, tag) - self.__print_stat(stat_file, "end_lbreak") - return - - - - - # print stall stats for the entire vcache - def __print_manycore_vcache_tag_stats_stall(self, stat_file, 
tag): - self.__print_stat(stat_file, "tag_separator", tag) - - # Print stall stats for manycore vcache - for stall in self.vcache_stalls: - stall_format = "stall_data" - self.__print_stat(stat_file, stall_format, stall, - self.manycore_vcache_stat[tag][stall], - (100 * np.float64(self.manycore_vcache_stat[tag][stall]) / self.manycore_vcache_stat[tag]["stall_total"]) - ,(100 * np.float64(self.manycore_vcache_stat[tag][stall]) / self.manycore_vcache_stat[tag]["global_ctr"])) - - return - - - # Prints manycore stall stats per vcache for all tags - def __print_manycore_vcache_stats_stall(self, stat_file): - stat_file.write("Vcache Per-Tag Stall Stats\n") - self.__print_stat(stat_file, "stall_header", "Stall Type", "Cycles", " % Stall Cycles", " % Total Cycles") - self.__print_stat(stat_file, "start_lbreak") - for tag in self.manycore_vcache_stat.keys(): - if(self.manycore_vcache_stat[tag]["global_ctr"]): - self.__print_manycore_vcache_tag_stats_stall(stat_file, tag) - self.__print_stat(stat_file, "end_lbreak") - return - - - - - # print instruction stats for each vcache in a separate file - def __print_per_vcache_tag_stats_instr(self, vcache, stat_file, tag): - self.__print_stat(stat_file, "tag_separator", tag) - - # Print instruction stats for vache - for instr in self.vcache_instrs: - self.__print_stat(stat_file, "instr_data", instr, - self.vcache_stat[tag][vcache][instr] - ,(100 * np.float64(self.vcache_stat[tag][vcache][instr]) / self.vcache_stat[tag][vcache]["instr_total"])) return - # print instr stats for each vcache in a separate file for all tags - def __print_per_vcache_stats_instr(self, vcache, stat_file): - stat_file.write("Vcache Instruction Stats\n") - self.__print_stat(stat_file, "instr_header", "Instruction", "Count", "% of Instructions") - self.__print_stat(stat_file, "start_lbreak") - for tag in self.vcache_stat.keys(): - if(self.vcache_stat[tag][vcache]["global_ctr"]): - self.__print_per_vcache_tag_stats_instr(vcache, stat_file, tag) - 
self.__print_stat(stat_file, "end_lbreak") - return - - - - - # print stall stats for each vcache in a separate file - def __print_per_vcache_tag_stats_stall(self, vcache, stat_file, tag): - self.__print_stat(stat_file, "tag_separator", tag) - - # Print stall stats for manycore - for stall in self.vcache_stalls: - stall_format = "stall_data" - self.__print_stat(stat_file, stall_format, stall, - self.vcache_stat[tag][vcache][stall], - (100 * np.float64(self.vcache_stat[tag][vcache][stall]) / self.vcache_stat[tag][vcache]["stall_total"]) - ,(100 * np.float64(self.vcache_stat[tag][vcache][stall]) / self.vcache_stat[tag][vcache]["global_ctr"])) - return - - - # print stall stats for each vcache in a separate file for all tags - def __print_per_vcache_stats_stall(self, vcache, stat_file): - stat_file.write("Per-Tile Stall Stats\n") - self.__print_stat(stat_file, "stall_header", "Stall Type", "Cycles", "% of Stall Cycles", "% of Total Cycles") - self.__print_stat(stat_file, "start_lbreak") - for tag in self.vcache_stat.keys(): - if(self.vcache_stat[tag][vcache]["global_ctr"]): - self.__print_per_vcache_tag_stats_stall(vcache, stat_file, tag) - self.__print_stat(stat_file, "start_lbreak") - return - - - - - # print miss stats for each vcache in a separate file - # vcache is the victim cache bank number - def __print_per_vcache_tag_stats_miss(self, vcache, stat_file, tag): - self.__print_stat(stat_file, "tag_separator", tag) - - for miss in self.vcache_misses: - # Find total number of operations for that miss - operation = miss.replace("miss_", "instr_") - operation_cnt = self.vcache_stat[tag][vcache][operation] - miss_cnt = self.vcache_stat[tag][vcache][miss] - hit_rate = 1 if operation_cnt == 0 else (1 - miss_cnt/operation_cnt) - - self.__print_stat(stat_file, "miss_data", miss, miss_cnt, operation_cnt, hit_rate ) - - return - - - # print miss for each vcache in a separate file for all tags - def __print_per_vcache_stats_miss(self, vcache, stat_file): - 
stat_file.write("Per-Vcache Miss Stats\n") - self.__print_stat(stat_file, "miss_header", "Miss Type", "miss", "total", "hit rate") - self.__print_stat(stat_file, "start_lbreak") - for tag in self.vcache_stat.keys(): - if(self.vcache_stat[tag][vcache]["global_ctr"]): - self.__print_per_vcache_tag_stats_miss(vcache, stat_file, tag) - self.__print_stat(stat_file, "end_lbreak") - return - - - - - - # prints all four types of stats, timing, instruction, # miss and stall for the entire manycore def print_manycore_stats_all(self): @@ -1295,10 +1826,8 @@ def print_manycore_stats_all(self): # If vcache stats is given as input, also print vcache stats if (self.vcache): - self.__print_manycore_stats_vcache_timing(manycore_stats_file) - self.__print_manycore_stats_miss(manycore_stats_file, "VCache Per-Tag Miss Stats", self.manycore_vcache_stat, self.vcache_misses) - self.__print_manycore_stats_stall(manycore_stats_file, "VCache Per-Tag Stall Stats", self.manycore_vcache_stat, self.vcache_stalls) - self.__print_manycore_stats_instr(manycore_stats_file, "VCache Per-Tag Instruction Stats", self.manycore_vcache_stat, self.vcache_instrs) + s = str(self.vparser.agg) + manycore_stats_file.write(s) manycore_stats_file.close() return @@ -1320,9 +1849,8 @@ def print_per_tile_group_stats_all(self): # If vcache stats is given as input if (self.vcache): - self.__print_per_tile_group_stats_miss(stat_file, "VCache Per-Tile-Group Miss Stats", tg_id, self.manycore_vcache_tile_group_stat, self.vcache_misses) - self.__print_per_tile_group_stats_stall(stat_file, "VCache Per-Tile-Group Stall Stats", tg_id, self.manycore_vcache_tile_group_stat, self.vcache_stalls) - self.__print_per_tile_group_stats_instr(stat_file, "VCache Per-Tile-Group Instruction Stats", tg_id, self.manycore_vcache_tile_group_stat, self.vcache_instrs) + s = str(self.vparser.group[tg_id]) + stat_file.write(s) stat_file.close() return @@ -1346,26 +1874,6 @@ def print_per_tile_stats_all(self): stat_file.close() - - # prints all 
four types of stats, timing, instruction, - # miss and stall for each vcache in a separate file - def print_per_vcache_stats_all(self): - # if Vcache stats is given as input - if (self.vcache): - stats_path = os.getcwd() + "/stats/vcache/" - if not os.path.exists(stats_path): - os.mkdir(stats_path) - for vcache in self.active_vcaches: - stat_file = open( (stats_path + "vcache_bank_" + str(vcache) + "_stats.log"), "w") - self.__print_stats_miss(stat_file, "Per-VCache Miss Stats", vcache, self.vcache_stat, self.vcache_misses) - self.__print_stats_stall(stat_file, "Per-VCache Stall Stats", vcache, self.vcache_stat, self.vcache_stalls) - self.__print_stats_instr(stat_file, "Per-VCache Instr Stats", vcache, self.vcache_stat, self.vcache_instrs) - stat_file.close() - return - - - - # go though the input traces and extract start and end stats # for each tile, and each tile group # return number of tile groups, tile group timing stats, tile stats, and cycle parallel cnt @@ -1399,12 +1907,12 @@ def __generate_tile_stats(self, traces, tiles): # For calculating manycore stats, all tiles are considerd to be involved # For calculating tile group stats, only tiles inside the tile group are considered # For manycore (all tiles that participate in tag are included) - manycore_cycle_parallel_earliest_start = {tag: traces[0]["global_ctr"] for tag in tags} + manycore_cycle_parallel_earliest_start = {tag: traces[-1]["global_ctr"] for tag in tags} manycore_cycle_parallel_latest_end = {tag: traces[0]["global_ctr"] for tag in tags} manycore_cycle_parallel_cnt = {tag: 0 for tag in tags} # For each tile group (only tiles in a tile group that participate in a tag are included) - tile_group_cycle_parallel_earliest_start = {tag: [traces[0]["global_ctr"] for tg_id in range(self.max_tile_groups)] for tag in tags} + tile_group_cycle_parallel_earliest_start = {tag: [traces[-1]["global_ctr"] for tg_id in range(self.max_tile_groups)] for tag in tags} tile_group_cycle_parallel_latest_end = {tag: 
[traces[0]["global_ctr"] for tg_id in range(self.max_tile_groups)] for tag in tags} tile_group_cycle_parallel_cnt = {tag: [traces[0]["global_ctr"] for tg_id in range(self.max_tile_groups)] for tag in tags} @@ -1564,179 +2072,6 @@ def __generate_tile_stats(self, traces, tiles): - - - - # go though the input traces and extract start and end stats for each vcache bank - # return vcache stats - # this function only counts the portion between two print_stat_start and end messages - # in practice, this excludes the time in between executions, - # i.e. when tiles are waiting to be loaded by the host. - # contrary to vanilla stats, vcache stats can be printed multiple times, every time - # a tile invokes print_stat, the stat for all vcache banks is printed - # Therefore, if multiple stats of the same vcache bank and the same tag are seen, - # or multiple stats of the same vcache bank, same tag and same tile group ID are seen, - # the earliest (latest) stat is chosen if it is a start (end) stat. - def __generate_vcache_stats(self, traces, vcaches): - tags = list(range(self.max_tags)) + ["kernel"] - num_tile_groups = {tag:0 for tag in tags} - - # Dictionary to contain operation count for every tag and vcache bank - # For aggregate vcache stats of the manycore, the vcache dimension of - # this dictionary is sum reduced in the generate_manycore_vcache_stats_all function - vcache_stat_start = {tag: {vcache:Counter() for vcache in vcaches} for tag in tags} - vcache_stat_end = {tag: {vcache:Counter() for vcache in vcaches} for tag in tags} - vcache_stat = {tag: {vcache:Counter() for vcache in vcaches} for tag in tags} - - # Dictionary to contain operation count for every tag, vcache bank - # and tile group ID. 
- # For aggregate vcache stats of a tile group, the tile group id dimension of - # this dictionary is sum reduced in the generate_tile_group_vcache_stats_all function - vcache_tile_group_stat_start = {tag: {tg_id: {vcache: Counter() for vcache in vcaches} for tg_id in range(max(self.num_tile_groups.values()))} for tag in tags} - vcache_tile_group_stat_end = {tag: {tg_id: {vcache: Counter() for vcache in vcaches} for tg_id in range(max(self.num_tile_groups.values()))} for tag in tags} - vcache_tile_group_stat = {tag: {tg_id: {vcache: Counter() for vcache in vcaches} for tg_id in range(max(self.num_tile_groups.values()))} for tag in tags} - - - tag_seen = {tag: {vcache:False for vcache in vcaches} for tag in tags} - - - for trace in traces: - cur_vcache = (trace['vcache']) - - # instantiate a CudaStatTag object with the tag value - cst = CudaStatTag(trace["tag"]) - - # Separate depending on stat type (start or end) - if(cst.isStart): - # If start stat for this tag is not already seen, or if it is an earlier start stat - if (not vcache_stat_start[cst.tag][cur_vcache] or vcache_stat_start[cst.tag][cur_vcache]['global_ctr'] > trace['global_ctr']): - for op in self.vcache_all_ops: - vcache_stat_start[cst.tag][cur_vcache][op] = trace[op] - - # If start stat for this tag and this tile group is not already seen, - # or if it is an earlier start stat - if (not vcache_tile_group_stat_start[cst.tag][cst.tg_id][cur_vcache] or vcache_tile_group_stat_start[cst.tag][cst.tg_id][cur_vcache]['global_ctr'] > trace['global_ctr']): - for op in self.vcache_all_ops: - vcache_tile_group_stat_start[cst.tag][cst.tg_id][cur_vcache][op] = trace[op] - - tag_seen[cst.tag][cur_vcache] = True - - - - elif (cst.isEnd): - # If end stat for this tag is not already seen, or if it is a later end stat - if (not vcache_stat_end[cst.tag][cur_vcache] or vcache_stat_end[cst.tag][cur_vcache]['global_ctr'] < trace['global_ctr']): - for op in self.vcache_all_ops: - vcache_stat_end[cst.tag][cur_vcache][op] = 
trace[op] - # If end stat for this tag and this tile group is not already seen, - # or if it is a later end stat - if (not vcache_tile_group_stat_end[cst.tag][cst.tg_id][cur_vcache] or vcache_tile_group_stat_end[cst.tag][cst.tg_id][cur_vcache]['global_ctr'] < trace['global_ctr']): - for op in self.vcache_all_ops: - vcache_tile_group_stat_end[cst.tag][cst.tg_id][cur_vcache][op] = trace[op] - tag_seen[cst.tag][cur_vcache] = False; - - - - # And depending on kernel start/end - if(cst.isKernelStart): - # If start stat for this tag is not already seen, or if it is an earlier start stat - if (not vcache_stat_start["kernel"][cur_vcache] or vcache_stat_start["kernel"][cur_vcache]['global_ctr'] > trace['global_ctr']): - for op in self.vcache_all_ops: - vcache_stat_start["kernel"][cur_vcache][op] = trace[op] - - # If start stat for this tag and this tile group is not already seen, - # or if it is an earlier start stat - if (not vcache_tile_group_stat_start["kernel"][cst.tg_id][cur_vcache] or vcache_tile_group_stat_start["kernel"][cst.tg_id][cur_vcache]['global_ctr'] > trace['global_ctr']): - for op in self.vcache_all_ops: - vcache_tile_group_stat_start["kernel"][cst.tg_id][cur_vcache][op] = trace[op] - tag_seen["kernel"][cur_vcache] = True - - - - elif (cst.isKernelEnd): - # If end stat for this tag is not already seen, or if it is a later end stat - if (not vcache_stat_end["kernel"][cur_vcache] or vcache_stat_end["kernel"][cur_vcache]['global_ctr'] < trace['global_ctr']): - for op in self.vcache_all_ops: - vcache_stat_end["kernel"][cur_vcache][op] = trace[op] - # If end stat for this tag and this tile group is not already seen, - # or if it is a later end stat - if (not vcache_tile_group_stat_end["kernel"][cst.tg_id][cur_vcache] or vcache_tile_group_stat_end["kernel"][cst.tg_id][cur_vcache]['global_ctr'] < trace['global_ctr']): - for op in self.vcache_all_ops: - vcache_tile_group_stat_end["kernel"][cst.tg_id][cur_vcache][op] = trace[op] - tag_seen["kernel"][cur_vcache] = 
False; - - - - # Generate vcache stats for every tag and every - # vcache bank by subtracting the start stat from - # the end stat of that vcache bank and that tag - for tag in tags: - for vcache in vcaches: - vcache_stat[tag][vcache] = vcache_stat_end[tag][vcache] - vcache_stat_start[tag][vcache] - - - # Generate all tile group vcache stats by - # subtracting start time from end time - for tag in tags: - for tg_id in range(self.num_tile_groups[tag]): - for vcache in vcaches: - vcache_tile_group_stat[tag][tg_id][vcache] = vcache_tile_group_stat_end[tag][tg_id][vcache] - vcache_tile_group_stat_start[tag][tg_id][vcache] - - - - # Generate total stats for entire vcache by summing all stats for all vcache banks - for tag in tags: - for vcache in vcaches: - for instr in self.vcache_instrs: - # different types of load/store/atomic instructions are already counted - # under the umbrella of instr_ld/st/atomic, so they are not summed to - # to avoid double counting - if (not instr.startswith('instr_ld_') and not instr.startswith('instr_sm_') and not instr.startswith('instr_amo')): - vcache_stat[tag][vcache]["instr_total"] += vcache_stat[tag][vcache][instr] - for stall in self.vcache_stalls: - vcache_stat[tag][vcache]["stall_total"] += vcache_stat[tag][vcache][stall] - for bubble in self.vcache_bubbles: - vcache_stat[tag][vcache]["bubble_total"] += vcache_stat[tag][vcache][bubble] - for miss in self.vcache_misses: - vcache_stat[tag][vcache]["miss_total"] += vcache_stat[tag][vcache][miss] - hit = miss.replace("miss_", "instr_") - vcache_stat[tag][vcache]["hit_total"] += vcache_stat[tag][vcache][hit] - - - - - # Generate total stats for each tile group by summing all vcache stats - for tag in tags: - for tg_id in range(self.num_tile_groups[tag]): - for vcache in vcaches: - for instr in self.vcache_instrs: - vcache_tile_group_stat[tag][tg_id][cur_vcache]["instr_total"] += vcache_tile_group_stat[tag][tg_id][cur_vcache][instr] - for stall in self.vcache_stalls: - 
vcache_tile_group_stat[tag][tg_id][cur_vcache]["stall_total"] += vcache_tile_group_stat[tag][tg_id][cur_vcache][stall] - for bubble in self.vcache_bubbles: - vcache_tile_group_stat[tag][tg_id][cur_vcache]["bubble_total"] += vcache_tile_group_stat[tag][tg_id][cur_vcache][bubble] - for miss in self.vcache_misses: - vcache_tile_group_stat[tag][tg_id][cur_vcache]["miss_total"] += vcache_tile_group_stat[tag][tg_id][cur_vcache][miss] - hit = miss.replace("miss_", "instr_") - vcache_tile_group_stat[tag][tg_id][cur_vcache]["hit_total"] += vcache_tile_group_stat[tag][tg_id][cur_vcache][hit] - - - - - self.vcache_instrs += ["instr_total"] - self.vcache_stalls += ["stall_total"] - self.vcache_bubbles += ["bubble_total"] - self.vcache_misses += ["miss_total"] - self.vcache_all_ops += ["instr_total", "stall_total", "bubble_total", "miss_total", "hit_total"] - - return vcache_tile_group_stat, vcache_stat - - - - - - - # Calculate aggregate manycore stats dictionary by summing # all per tile stats dictionaries def __generate_manycore_stats_all(self, tile_stat, manycore_cycle_parallel_cnt): @@ -1756,42 +2091,6 @@ def __generate_manycore_stats_all(self, tile_stat, manycore_cycle_parallel_cnt): return manycore_stat - # Calculate aggregate vcache stats dictionary by summing - # all per vcache bank dictionaries - def __generate_manycore_vcache_stats_all(self, vcache_stat): - # Create a dictionary and initialize elements to zero - tags = list(range(self.max_tags)) + ["kernel"] - manycore_vcache_stat = {tag: Counter() for tag in tags} - - for tag in tags: - for vcache in self.active_vcaches: - for op in self.vcache_all_ops: - manycore_vcache_stat[tag][op] += vcache_stat[tag][vcache][op] - - return manycore_vcache_stat - - - - - # Calculate aggregate vcache stats dictionary for every - # tile group by summing all per vcache bank dictionaries - # for all tiles belonding to a certain tile group - def __generate_tile_group_vcache_stats_all(self, vcache_tile_group_stat): - # Create a 
dictionary and initialize elements to zero - tags = list(range(self.max_tags)) + ["kernel"] - manycore_vcache_tile_group_stat = {tag: [Counter() for tg_id in range(max(self.num_tile_groups.values()))] for tag in tags} - - for tag in tags: - for tg_id in range(self.num_tile_groups[tag]): - for vcache in self.active_vcaches: - for op in self.vcache_all_ops: - manycore_vcache_tile_group_stat[tag][tg_id][op] += vcache_tile_group_stat[tag][tg_id][vcache][op] - - return manycore_vcache_tile_group_stat - - - - # Parses stat file's header to generate list of all # operations based on type (stat, instruction, miss, stall) def parse_header(self, f): @@ -1826,18 +2125,15 @@ def parse_header(self, f): # parses input arguments def add_args(parser): - parser.add_argument("--per-vcache", default=False, action='store_true', - help="Also generate separate stats files for each victim cache bank.") + pass def main(args): - st = VanillaStatsParser(args.tile, args.tile_group, args.per_vcache, args.stats, args.vcache_stats) + st = VanillaStatsParser(args.tile, args.tile_group, args.stats, args.vcache_stats) st.print_manycore_stats_all() if(st.per_tile_stat): st.print_per_tile_stats_all() if(st.per_tile_group_stat): st.print_per_tile_group_stats_all() - if(st.per_vcache_stat): - st.print_per_vcache_stats_all() # main() if __name__ == "__main__": diff --git a/software/spmd/.gitignore b/software/spmd/.gitignore index d12d9db0f..3242bdcfd 100644 --- a/software/spmd/.gitignore +++ b/software/spmd/.gitignore @@ -11,3 +11,6 @@ dramsim3epoch.json *.ld dis *.png +dramsim3.json +dramsim3.tag.json +dramsim3.txt diff --git a/software/spmd/Makefile b/software/spmd/Makefile index cc93112dd..a9c976e4f 100644 --- a/software/spmd/Makefile +++ b/software/spmd/Makefile @@ -14,7 +14,11 @@ NO-RECURSE = \ c++ fft memtest16 fhello_malloc striped_vector_sum striped_hello \ coremark-top \ deprecated \ - beebs + gather_scatter \ + gather_scatter_multi \ + striped_struct_vector \ + beebs saif \ + interrupt_tests 
# Define this variable on cmd line to run coverage analysis. Currently # supports VCS coverage: run "make COVERAGE=VCS" diff --git a/software/spmd/amoadd_test/Makefile b/software/spmd/amoadd_test/Makefile new file mode 100644 index 000000000..b37d0941d --- /dev/null +++ b/software/spmd/amoadd_test/Makefile @@ -0,0 +1,17 @@ +export BSG_MANYCORE_DIR := $(shell git rev-parse --show-toplevel) + +# Running tests on full manycore array. Uncomment and modify for a smaller array +# bsg_tiles_X = 4 +# bsg_tiles_Y = 4 + +include $(BSG_MANYCORE_DIR)/software/mk/Makefile.master +include $(BSG_MANYCORE_DIR)/software/mk/Makefile.tail_rules + +OBJECT_FILES=main.o + +all: main.run + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + +main.o: Makefile \ No newline at end of file diff --git a/software/spmd/amoadd_test/main.c b/software/spmd/amoadd_test/main.c new file mode 100644 index 000000000..e2283326e --- /dev/null +++ b/software/spmd/amoadd_test/main.c @@ -0,0 +1,73 @@ +/* + Description: + Test to check if atomic adds work + Every tile atomically updates the 2 counter variables in DRAM using 2 methods and tile 0 checks if the value is the same using both techniques and is equal to the sum of bsg_x_id * bsg_y_id +*/ + +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" +#include "bsg_manycore_atomic.h" + +#define BSG_TILE_GROUP_X_DIM bsg_tiles_X +#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y +#include "bsg_tile_group_barrier.h" +INIT_TILE_GROUP_BARRIER(r_barrier, c_barrier, 0, bsg_tiles_X-1, 0, bsg_tiles_Y-1); + +int data[2] __attribute__ ((section (".dram"))) = {0}; +int lock __attribute__ ((section (".dram"))) = {0}; + +void atomic_add() +{ + // Perform an atomic add using amoadd.w + int value0 = __bsg_x * __bsg_y; + int result = bsg_amoadd_aq(&data[0], value0); + + // Perform an atomic add using amoswaps + // Acquires a 
lock and then updates the memory location in the critical region + int lock_val = 1; + + // acquire + do { + lock_val = bsg_amoswap_aq(&lock, 1); + } while (lock_val != 0); + + // Critical region + int value1 = data[1] + __bsg_x *__bsg_y; + data[1] = value1; + + // release + bsg_amoswap_rl(&lock, 0); + + // Wait for all cores to finish + bsg_fence(); + bsg_tile_group_barrier(&r_barrier, &c_barrier); + + if (__bsg_id == 0) + { + bsg_printf("%d\n", data[0]); + bsg_printf("%d\n", data[1]); + + int expected = 0; + int sum = 0; + for (int i = 0; i < bsg_tiles_X; i++) + expected += i; + for (int i = 0; i < bsg_tiles_Y; i++) + sum += i; + expected *= sum; + + if ((data[0] == data[1]) && (data[0] == expected)) + bsg_finish(); + else + bsg_fail(); + } +} + +int main() +{ + + bsg_set_tile_x_y(); + + atomic_add(); + + bsg_wait_while(1); +} \ No newline at end of file diff --git a/software/spmd/asm_dmem_test/main.S b/software/spmd/asm_dmem_test/main.S index fdb0c28fa..178b680c6 100644 --- a/software/spmd/asm_dmem_test/main.S +++ b/software/spmd/asm_dmem_test/main.S @@ -5,10 +5,6 @@ #include "bsg_manycore_asm.h" #define N 1024 -// allocate 4KB in DMEM -.data - dmem_arr: .space (N*4) - // initialize .text li x1, 0 @@ -76,7 +72,8 @@ fcvt.s.w f29,x0 fcvt.s.w f30,x0 fcvt.s.w f31,x0 -la t0, dmem_arr +// write to DMEM +li t0, 0 li t1, 0 li t2, N write_loop: @@ -85,7 +82,8 @@ write_loop: addi t0, t0, 4 bne t1, t2, write_loop -la t0, dmem_arr +// read from DMEM +li t0, 0 li t1, 0 li t2, N read_loop: diff --git a/software/spmd/asm_flwadd_dram/Makefile b/software/spmd/asm_flwadd_dram/Makefile new file mode 100644 index 000000000..0bc89ea7f --- /dev/null +++ b/software/spmd/asm_flwadd_dram/Makefile @@ -0,0 +1,15 @@ +bsg_tiles_X= 1 +bsg_tiles_Y= 1 + +all: main.run + +include ../Makefile.include + +RISCV_LINK_OPTS = -march=rv32imaf -nostdlib -nostartfiles + +main.riscv: $(LINK_SCRIPT) main.o + $(RISCV_LINK) main.o -o $@ $(RISCV_LINK_OPTS) + + +include ../../mk/Makefile.tail_rules + diff 
--git a/software/spmd/asm_flwadd_dram/main.S b/software/spmd/asm_flwadd_dram/main.S new file mode 100644 index 000000000..033c7f860 --- /dev/null +++ b/software/spmd/asm_flwadd_dram/main.S @@ -0,0 +1,110 @@ +// testing flwadd + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" +#define N 1024 + +// allocate 4KB in DRAM +.section .dram, "aw" + dram_arr: .space (N*4) + +// initialize +.text +li x1, 0 +li x2, 0 +li x3, 0 +li x4, 0 +li x5, 0 +li x6, 0 +li x7, 0 +li x8, 0 +li x9, 0 +li x10,0 +li x11,0 +li x12,0 +li x13,0 +li x14,0 +li x15,0 +li x16,0 +li x17,0 +li x18,0 +li x19,0 +li x20,0 +li x21,0 +li x22,0 +li x23,0 +li x24,0 +li x25,0 +li x26,0 +li x27,0 +li x28,0 +li x29,0 +li x30,0 +li x31,0 + +fcvt.s.w f0, x0 +fcvt.s.w f1, x0 +fcvt.s.w f2, x0 +fcvt.s.w f3, x0 +fcvt.s.w f4, x0 +fcvt.s.w f5, x0 +fcvt.s.w f6, x0 +fcvt.s.w f7, x0 +fcvt.s.w f8, x0 +fcvt.s.w f9, x0 +fcvt.s.w f10,x0 +fcvt.s.w f11,x0 +fcvt.s.w f12,x0 +fcvt.s.w f13,x0 +fcvt.s.w f14,x0 +fcvt.s.w f15,x0 +fcvt.s.w f16,x0 +fcvt.s.w f17,x0 +fcvt.s.w f18,x0 +fcvt.s.w f19,x0 +fcvt.s.w f20,x0 +fcvt.s.w f21,x0 +fcvt.s.w f22,x0 +fcvt.s.w f23,x0 +fcvt.s.w f24,x0 +fcvt.s.w f25,x0 +fcvt.s.w f26,x0 +fcvt.s.w f27,x0 +fcvt.s.w f28,x0 +fcvt.s.w f29,x0 +fcvt.s.w f30,x0 +fcvt.s.w f31,x0 + + + +test1: + la x31, dram_arr + li x1, 0x3f800000 // x1 = 1.0 + sw x1, 0(x31) + li x1, 0x40000000 // x1 = 2.0 + sw x1, 4(x31) + + la x1, dram_arr // base + li x2, 4 // stride + + .word 0x0020f004 // flwadd f0, x2, 0(x1) + .word 0x0020f084 // flwadd f1, x2, 0(x1) + + fadd.s f2, f1, f0 + fcvt.w.s x3, f2 + li x4, 3 + bne x4, x3, fail + + la x4, dram_arr + addi x4, x4, 8 + bne x4, x1, fail + + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/asm_flwadd_test/Makefile b/software/spmd/asm_flwadd_test/Makefile new file mode 100644 index 000000000..0bc89ea7f --- /dev/null +++ 
b/software/spmd/asm_flwadd_test/Makefile @@ -0,0 +1,15 @@ +bsg_tiles_X= 1 +bsg_tiles_Y= 1 + +all: main.run + +include ../Makefile.include + +RISCV_LINK_OPTS = -march=rv32imaf -nostdlib -nostartfiles + +main.riscv: $(LINK_SCRIPT) main.o + $(RISCV_LINK) main.o -o $@ $(RISCV_LINK_OPTS) + + +include ../../mk/Makefile.tail_rules + diff --git a/software/spmd/asm_flwadd_test/main.S b/software/spmd/asm_flwadd_test/main.S new file mode 100644 index 000000000..f68da52c6 --- /dev/null +++ b/software/spmd/asm_flwadd_test/main.S @@ -0,0 +1,108 @@ +// testing flwadd + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" +#define N 1024 + +// allocate 4KB in DRAM +.section .dram, "aw" + dram_arr: .space (N*4) + +// initialize +.text +li x1, 0 +li x2, 0 +li x3, 0 +li x4, 0 +li x5, 0 +li x6, 0 +li x7, 0 +li x8, 0 +li x9, 0 +li x10,0 +li x11,0 +li x12,0 +li x13,0 +li x14,0 +li x15,0 +li x16,0 +li x17,0 +li x18,0 +li x19,0 +li x20,0 +li x21,0 +li x22,0 +li x23,0 +li x24,0 +li x25,0 +li x26,0 +li x27,0 +li x28,0 +li x29,0 +li x30,0 +li x31,0 + +fcvt.s.w f0, x0 +fcvt.s.w f1, x0 +fcvt.s.w f2, x0 +fcvt.s.w f3, x0 +fcvt.s.w f4, x0 +fcvt.s.w f5, x0 +fcvt.s.w f6, x0 +fcvt.s.w f7, x0 +fcvt.s.w f8, x0 +fcvt.s.w f9, x0 +fcvt.s.w f10,x0 +fcvt.s.w f11,x0 +fcvt.s.w f12,x0 +fcvt.s.w f13,x0 +fcvt.s.w f14,x0 +fcvt.s.w f15,x0 +fcvt.s.w f16,x0 +fcvt.s.w f17,x0 +fcvt.s.w f18,x0 +fcvt.s.w f19,x0 +fcvt.s.w f20,x0 +fcvt.s.w f21,x0 +fcvt.s.w f22,x0 +fcvt.s.w f23,x0 +fcvt.s.w f24,x0 +fcvt.s.w f25,x0 +fcvt.s.w f26,x0 +fcvt.s.w f27,x0 +fcvt.s.w f28,x0 +fcvt.s.w f29,x0 +fcvt.s.w f30,x0 +fcvt.s.w f31,x0 + + + +test1: + li x1, 0x3f800000 // x1 = 1.0 + sw x1, 0(x0) + li x1, 0x40000000 // x1 = 2.0 + sw x1, 4(x0) + + li x1, 0 // base + li x2, 4 // stride + + .word 0x0020f004 // flwadd f0, x2, 0(x1) + .word 0x0020f084 // flwadd f1, x2, 0(x1) + + fadd.s f2, f1, f0 + fcvt.w.s x3, f2 + li x4, 3 + bne x4, x3, fail + + li x4, 8 + bne x4, x1, fail + + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, 
x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/asm_memcpy_dram/main.S b/software/spmd/asm_memcpy_dram/main.S index 89fcd8624..3798b67c3 100644 --- a/software/spmd/asm_memcpy_dram/main.S +++ b/software/spmd/asm_memcpy_dram/main.S @@ -10,10 +10,6 @@ #include "bsg_manycore_asm.h" #define N 1024 -// allocate 4KB in DMEM -.data - dmem_arr: .space (N*4) - // allocate 4KB in DRAM .section .dram, "aw" dram_arr: .space (N*4) @@ -86,7 +82,7 @@ fcvt.s.w f30,x0 fcvt.s.w f31,x0 // initialize DMEM with 0.0~1023.0 -la t0, dmem_arr +li t0, 0 li t1, 0 li t2, N init_loop: @@ -96,9 +92,9 @@ init_loop: addi t0, t0, 4 bne t1, t2, init_loop - +bsg_asm_print_time(IO_X_INDEX, 0) // copy from DMEM to DRAM -la t0, dmem_arr +li t0, 0 la t1, dram_arr li t2, 0 li t3, (N/32) @@ -171,9 +167,9 @@ dmem_to_dram_loop: addi t1, t1, 128 addi t2, t2, 1 bne t2, t3, dmem_to_dram_loop - + bsg_asm_print_time(IO_X_INDEX, 1) // clear DMEM -la t0, dmem_arr +li t0, 0 li t1, 0 li t2, N fcvt.s.w f0, x0 @@ -183,8 +179,9 @@ clear_dmem_loop: addi t1, t1, 1 bne t1, t2, clear_dmem_loop +bsg_asm_print_time(IO_X_INDEX, 2) // copy from DRAM to DMEM -la t0, dmem_arr +li t0, 0 la t1, dram_arr li t2, 0 li t3, (N/32) @@ -257,9 +254,9 @@ dram_to_dmem_loop: addi t1, t1, 128 addi t2, t2, 1 bne t2, t3, dram_to_dmem_loop - + bsg_asm_print_time(IO_X_INDEX, 3) // validate -la t0, dmem_arr +li t0, 0 li t1, 0 li t2, N validate_loop: @@ -272,15 +269,12 @@ validate_loop: pass: - li t0, 0x4100ead0 - sw x0, 0(t0) - + bsg_asm_finish(IO_X_INDEX, 0) pass_loop: beq x0, x0, pass_loop fail: - li t0, 0x4100ead8 - sw x0, 0(t0) + bsg_asm_fail(IO_X_INDEX, 0) fail_loop: beq x0, x0, fail_loop diff --git a/software/spmd/branch_rv32/main.S b/software/spmd/branch_rv32/main.S index b88cdd1e1..2094af727 100644 --- a/software/spmd/branch_rv32/main.S +++ b/software/spmd/branch_rv32/main.S @@ -11,19 +11,21 @@ test_1: beqz zero, test_2; fail_1: bsg_asm_fail(IO_X_INDEX, 0x1); - + j end_loop 
// (MAX-4) branch forward test_3: beqz zero, test_4; j finish; fail_3: bsg_asm_fail(IO_X_INDEX, 0x3); + j end_loop // branch backward test_2: beqz zero, test_3; fail_2: bsg_asm_fail(IO_X_INDEX, 0x2); + j end_loop _code_section_1_end = . .org test_3+MAX_BRANCH-4 @@ -35,7 +37,10 @@ test_4: beqz zero, test_3+4; fail_4: bsg_asm_fail(IO_X_INDEX, 0x4); + j end_loop finish: bsg_asm_finish(IO_X_INDEX, 0); +end_loop: + j end_loop _code_section_2_end = . diff --git a/software/spmd/bsg_barrier/Makefile b/software/spmd/bsg_barrier/Makefile index 39a24ed4d..573b7a48b 100644 --- a/software/spmd/bsg_barrier/Makefile +++ b/software/spmd/bsg_barrier/Makefile @@ -1,6 +1,6 @@ -bsg_tiles_X = 3 -bsg_tiles_Y = 3 +bsg_tiles_X = 16 +bsg_tiles_Y = 8 RISCV_GCC_EXTRA_OPTS ?= -O2 -funroll-loops OBJECT_FILES=main.o diff --git a/software/spmd/bsg_barrier_time/Makefile b/software/spmd/bsg_barrier_time/Makefile new file mode 100644 index 000000000..3c085dcc8 --- /dev/null +++ b/software/spmd/bsg_barrier_time/Makefile @@ -0,0 +1,20 @@ + +bsg_tiles_X = 16 +bsg_tiles_Y = 8 + +RISCV_GCC_EXTRA_OPTS ?= -O2 -funroll-loops +OBJECT_FILES=main.o +include ../Makefile.include + +all: main.run + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) ../common/crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. 
"-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + +clean: + -rm -rf $(OBJECT_FILES) *.jou *.log *.pb bsg_rom_hello.v main.riscv *.wdb main.bin main.hex + -rm -rf xsim.dir csrc simv simv.daidir ucli.key vcdplus.vpd bsg_manycore_io_complex_rom.v + +main.o: Makefile + +include ../../mk/Makefile.tail_rules diff --git a/software/spmd/bsg_barrier_time/main.c b/software/spmd/bsg_barrier_time/main.c new file mode 100644 index 000000000..26ac59923 --- /dev/null +++ b/software/spmd/bsg_barrier_time/main.c @@ -0,0 +1,45 @@ +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" +#include "bsg_mutex.h" +#include "bsg_barrier.h" +//--------------------------------------------------------- + +#define N 10 +// MBT: This test performs N barriers in a row. +// It demonstrates that bsg_barrier_wait, while never very efficient +// is horribly inefficient for large numbers of tiles. This is because the internal +// implementation of bsg barrier essentially uses spinlocks +// which do not scale. + +#define BARRIER_X_END (bsg_tiles_X - 1) +#define BARRIER_Y_END (bsg_tiles_Y - 1) +#define BARRIER_TILES ( (BARRIER_X_END +1) * ( BARRIER_Y_END+1) ) + +bsg_barrier tile0_barrier = BSG_BARRIER_INIT(0, BARRIER_X_END, 0, BARRIER_Y_END); + +#define array_size(a) \ + (sizeof(a)/(sizeof((a)[0]))) + +volatile int data[bsg_tiles_X][bsg_tiles_Y] __attribute__((section (".dram"))); + +//////////////////////////////////////////////////////////////////// +int main() { + int i, j, id; + + bsg_set_tile_x_y(); + + id = bsg_x_y_to_id(bsg_x, bsg_y); + + for (int i = 0; i < N; i++) + { + if (bsg_x+bsg_y == 0) + bsg_print_time(); + bsg_barrier_wait( &tile0_barrier, 0, 0); + } + + if (id == 0) + bsg_finish(); + else + bsg_wait_while(1); +} + diff --git a/software/spmd/bsg_cuda_lite_runtime/dma/Makefile b/software/spmd/bsg_cuda_lite_runtime/dma/Makefile new file mode 100644 index 000000000..dbbd6227c --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/dma/Makefile @@ -0,0 +1,34 @@ 
+######################################################### +# Network Configuration +# If not configured, Will use default Values + bsg_global_X ?= $(bsg_tiles_X) + bsg_global_Y ?= $(bsg_tiles_Y)+1 + +######################################################### +#Tile group configuration +# If not configured, Will use default Values + bsg_tiles_org_X ?= 0 + bsg_tiles_org_Y ?= 1 + +# If not configured, Will use default Values + bsg_tiles_X ?= 2 + bsg_tiles_Y ?= 2 + + +all: main.run + + +KERNEL_NAME ?=kernel_dma + +OBJECT_FILES=main.o kernel_dma.o + +include ../../Makefile.include + + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) ../../common/crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + + +main.o: Makefile + +include ../../../mk/Makefile.tail_rules diff --git a/software/spmd/bsg_cuda_lite_runtime/dma/kernel_dma.cpp b/software/spmd/bsg_cuda_lite_runtime/dma/kernel_dma.cpp new file mode 100644 index 000000000..92014af96 --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/dma/kernel_dma.cpp @@ -0,0 +1,15 @@ +// DMA copy kernel: tile 0 copies n ints from A to B + +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" + +extern "C" __attribute__ ((noinline)) +int kernel_dma(int *A, int *B, int n) { + + if (__bsg_id == 0) { + for (int i = 0; i < n; i++) + B[i] = A[i]; + } + + return 0; +} diff --git a/software/spmd/bsg_cuda_lite_runtime/dma/main.c b/software/spmd/bsg_cuda_lite_runtime/dma/main.c new file mode 120000 index 000000000..24daac669 --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/dma/main.c @@ -0,0 +1 @@ +../main/main.c \ No newline at end of file diff --git a/software/spmd/bsg_cuda_lite_runtime/high_mem/Makefile b/software/spmd/bsg_cuda_lite_runtime/high_mem/Makefile new file mode 100644 index 000000000..d56fb27b9 --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/high_mem/Makefile @@ -0,0 +1,34 @@ +######################################################### +# 
Network Configuration +# If not configured, Will use default Values + bsg_global_X ?= $(bsg_tiles_X) + bsg_global_Y ?= $(bsg_tiles_Y)+1 + +######################################################### +#Tile group configuration +# If not configured, Will use default Values + bsg_tiles_org_X ?= 0 + bsg_tiles_org_Y ?= 1 + +# If not configured, Will use default Values + bsg_tiles_X ?= 2 + bsg_tiles_Y ?= 2 + + +all: main.run + + +KERNEL_NAME ?=kernel_high_mem + +OBJECT_FILES=main.o kernel_high_mem.o + +include ../../Makefile.include + + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) ../../common/crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + + +main.o: Makefile + +include ../../../mk/Makefile.tail_rules diff --git a/software/spmd/bsg_cuda_lite_runtime/high_mem/kernel_high_mem.cpp b/software/spmd/bsg_cuda_lite_runtime/high_mem/kernel_high_mem.cpp new file mode 100644 index 000000000..7940ad59a --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/high_mem/kernel_high_mem.cpp @@ -0,0 +1,10 @@ +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" + +extern "C" __attribute__ ((noinline)) +int kernel_high_mem(unsigned *A, unsigned *B, int N) { + for (int i = 0; i < N; ++i) { + B[i] = A[i]; + } + return 0; +} diff --git a/software/spmd/bsg_cuda_lite_runtime/high_mem/main.c b/software/spmd/bsg_cuda_lite_runtime/high_mem/main.c new file mode 120000 index 000000000..24daac669 --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/high_mem/main.c @@ -0,0 +1 @@ +../main/main.c \ No newline at end of file diff --git a/software/spmd/bsg_tile_group_barrier/Makefile b/software/spmd/bsg_tile_group_barrier/Makefile index 0aeea6a97..89522baa5 100644 --- a/software/spmd/bsg_tile_group_barrier/Makefile +++ b/software/spmd/bsg_tile_group_barrier/Makefile @@ -1,6 +1,7 @@ -bsg_tiles_X = 4 -bsg_tiles_Y = 4 + +bsg_tiles_X = 16 +bsg_tiles_Y = 8 RISCV_GCC_EXTRA_OPTS ?= -O2 
-funroll-loops OBJECT_FILES=main.o diff --git a/software/spmd/bsg_transpose/main.c b/software/spmd/bsg_transpose/main.c index 8d4a827a8..d3ba7fdee 100644 --- a/software/spmd/bsg_transpose/main.c +++ b/software/spmd/bsg_transpose/main.c @@ -75,7 +75,7 @@ void print_result( dest_array * p_dest ){ int col_id = i % SUB_ROW_NUM ; unsigned int data = * ( bsg_remote_ptr( x_cord, y_cord, &( p_dest[row_id][col_id] ) ) ); - bsg_remote_ptr_io_store( 0, 0, data); + bsg_remote_ptr_io_store(IO_X_INDEX, 0, data); } } } @@ -83,7 +83,7 @@ void print_result1( dest_array * p_dest ){ for( int i=0; i< SUB_COL_NUM ; i++){ for( int j=0; j< SUB_ROW_NUM; j++){ unsigned int data = * ( bsg_remote_ptr( 1, 0, &( p_dest[ i ][ j ] ) ) ); - bsg_remote_ptr_io_store( 0, 0, data); + bsg_remote_ptr_io_store(IO_X_INDEX, 0, data); } } } @@ -102,7 +102,7 @@ int main() { bsg_barrier_wait( &tile0_barrier, 0, 0); //start to transpose - if( id == 0) bsg_remote_ptr_io_store(0x0, 0x0, 0x0000cab0); + if( id == 0) bsg_remote_ptr_io_store(IO_X_INDEX, 0x0, 0x0000cab0); transpose( local_source, local_dest); @@ -110,7 +110,7 @@ int main() { bsg_barrier_wait( &tile0_trans_barrier, 0, 0); if( id == 0) { - bsg_remote_ptr_io_store(0x0, 0x0, 0x0000cab1); + bsg_remote_ptr_io_store(IO_X_INDEX, 0x0, 0x0000cab1); print_result( local_dest ); bsg_finish(); } diff --git a/software/spmd/common/crt.S b/software/spmd/common/crt.S index b0d346604..062b27ad2 100644 --- a/software/spmd/common/crt.S +++ b/software/spmd/common/crt.S @@ -1,6 +1,12 @@ # See LICENSE for license details. # MBT modified for bsg_manycore +# bsg-tommy: we reserve the first two words of DMEM for interrupt handlers. 
+.section .dmem.interrupt,"a" + .globl _interrupt_arr +_interrupt_arr: + .space(8) + .section .crtbegin,"a" .globl _start _start: diff --git a/software/spmd/coremark/ee_printf.c b/software/spmd/coremark/ee_printf.c index 3a3a6f0e4..b7f208942 100644 --- a/software/spmd/coremark/ee_printf.c +++ b/software/spmd/coremark/ee_printf.c @@ -15,10 +15,14 @@ limitations under the License. */ #include "bsg_manycore.h" +#include "bsg_mutex.h" #include #include + +static bsg_remote_int_ptr io_mutex_ptr= bsg_io_mutex_ptr( 0 ); + #define ZEROPAD (1<<0) /* Pad with zero */ #define SIGN (1<<1) /* Unsigned/signed long */ #define PLUS (1<<2) /* Show plus */ @@ -587,15 +591,21 @@ int ee_printf(const char *fmt, ...) va_list args; int n=0; + + va_start(args, fmt); ee_vsprintf(buf, fmt, args); va_end(args); p=buf; + + + bsg_mutex_lock( io_mutex_ptr ); while (*p) { uart_send_char(*p); n++; p++; } + bsg_mutex_unlock( io_mutex_ptr ); return n; } diff --git a/software/spmd/crc32/main.c b/software/spmd/crc32/main.c index 96ceb0f6d..f712879b3 100644 --- a/software/spmd/crc32/main.c +++ b/software/spmd/crc32/main.c @@ -219,12 +219,12 @@ int main() if (crc == ANSWER) { bsg_printf("crc: %X [PASSED]\n", crc); - bsg_finish_x(0); + bsg_finish(); } else { bsg_printf("crc: %X [FAILED]\n", crc); - bsg_fail_x(0); + bsg_fail(); } } diff --git a/software/spmd/fma_fdiv_waw_check/Makefile b/software/spmd/fma_fdiv_waw_check/Makefile new file mode 100644 index 000000000..0bc89ea7f --- /dev/null +++ b/software/spmd/fma_fdiv_waw_check/Makefile @@ -0,0 +1,15 @@ +bsg_tiles_X= 1 +bsg_tiles_Y= 1 + +all: main.run + +include ../Makefile.include + +RISCV_LINK_OPTS = -march=rv32imaf -nostdlib -nostartfiles + +main.riscv: $(LINK_SCRIPT) main.o + $(RISCV_LINK) main.o -o $@ $(RISCV_LINK_OPTS) + + +include ../../mk/Makefile.tail_rules + diff --git a/software/spmd/fma_fdiv_waw_check/main.S b/software/spmd/fma_fdiv_waw_check/main.S new file mode 100644 index 000000000..8ec6dece4 --- /dev/null +++ 
b/software/spmd/fma_fdiv_waw_check/main.S @@ -0,0 +1,115 @@ +// This tests that fma followed by special case fdiv that only takes one cycle (e.g. divide infinity by zero) +// does not violate WAW depency. + + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.text +li x0, 0 +li x1, 0 +li x2, 0 +li x3, 0 +li x4, 0 +li x5, 0 +li x6, 0 +li x7, 0 +li x8, 0 +li x9, 0 +li x10,0 +li x11,0 +li x12,0 +li x13,0 +li x14,0 +li x15,0 +li x16,0 +li x17,0 +li x18,0 +li x19,0 +li x20,0 +li x21,0 +li x22,0 +li x23,0 +li x24,0 +li x25,0 +li x26,0 +li x27,0 +li x28,0 +li x29,0 +li x30,0 +li x31,0 +fcvt.s.w f0, x0 +fcvt.s.w f1, x0 +fcvt.s.w f2, x0 +fcvt.s.w f3, x0 +fcvt.s.w f4, x0 +fcvt.s.w f5, x0 +fcvt.s.w f6, x0 +fcvt.s.w f7, x0 +fcvt.s.w f8, x0 +fcvt.s.w f9, x0 +fcvt.s.w f10,x0 +fcvt.s.w f11,x0 +fcvt.s.w f12,x0 +fcvt.s.w f13,x0 +fcvt.s.w f14,x0 +fcvt.s.w f15,x0 +fcvt.s.w f16,x0 +fcvt.s.w f17,x0 +fcvt.s.w f18,x0 +fcvt.s.w f19,x0 +fcvt.s.w f20,x0 +fcvt.s.w f21,x0 +fcvt.s.w f22,x0 +fcvt.s.w f23,x0 +fcvt.s.w f24,x0 +fcvt.s.w f25,x0 +fcvt.s.w f26,x0 +fcvt.s.w f27,x0 +fcvt.s.w f28,x0 +fcvt.s.w f29,x0 +fcvt.s.w f30,x0 +fcvt.s.w f31,x0 + + + + +li x1, 3 +li x2, 4 +li x3, 0x7f800000 +li x4, 7 +fmv.s.x f1, x1 +fmv.s.x f2, x2 +fmv.s.x f3, x3 + +nop +nop +nop +nop +nop + +// if fdiv is divide-by-infinite or some special situation, it could take much less +// cycles. Make sure that fadd.s does not write after fdiv.s +fadd.s f31, f1, f2 +fdiv.s f31, f1, f3 + +nop +nop +nop +nop +nop + +fmv.x.s x5, f31 +beq x4, x5, fail + + + + +pass: + bsg_asm_finish(IO_X_INDEX,0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/hello/main.c b/software/spmd/hello/main.c index a67954177..239612a46 100644 --- a/software/spmd/hello/main.c +++ b/software/spmd/hello/main.c @@ -6,6 +6,8 @@ Declear an array in DRAM. 
*************************************************************************/ int data[4] __attribute__ ((section (".dram"))) = { -1, 1, 0xF, 0x80000000}; +extern int _interrupt_arr[]; + int main() { @@ -51,6 +53,16 @@ i++; } + // interrupt array addr + bsg_printf("Interrupt array at 0x%x\n", _interrupt_arr); + + _interrupt_arr[0] = 3; + _interrupt_arr[1] = 4; + + int sum = _interrupt_arr[0] + _interrupt_arr[1]; + if (sum != 7) bsg_fail(); + + /************************************************************************ Terminates the Simulation *************************************************************************/ diff --git a/software/spmd/interrupt_tests/.gitignore b/software/spmd/interrupt_tests/.gitignore new file mode 100644 index 000000000..1fb4da228 --- /dev/null +++ b/software/spmd/interrupt_tests/.gitignore @@ -0,0 +1,3 @@ +*run +coverage/ +urgReport/ diff --git a/software/spmd/interrupt_tests/Makefile b/software/spmd/interrupt_tests/Makefile new file mode 100644 index 000000000..36f71cae1 --- /dev/null +++ b/software/spmd/interrupt_tests/Makefile @@ -0,0 +1,47 @@ +export BSG_MANYCORE_DIR := $(shell git rev-parse --show-toplevel) + +bsg_tiles_X= 1 +bsg_tiles_Y= 1 + +include $(BSG_MANYCORE_DIR)/software/mk/Makefile.master +include $(BSG_MANYCORE_DIR)/software/mk/Makefile.tail_rules +include $(BSG_MANYCORE_DIR)/software/spmd/interrupt_tests/Makefile.testlist + +# Name useful for coverage +CM_NAME ?= csr_test + +RISCV_LINK_OPTS = -march=rv32imaf -nostdlib -nostartfiles + +cov_setup: +ifneq ($(COVERAGE), 0) + $(shell mkdir -p $(BSG_MANYCORE_DIR)/software/spmd/interrupt_tests/coverage) +endif + +%.riscv: $(LINK_SCRIPT) %.o + $(RISCV_LINK) $*.o -o $@ $(RISCV_LINK_OPTS) + +%_run: cov_setup + mkdir -p $@ + cp $*.S $@/main.S + $(MAKE) -C $@ -f ../Makefile main.run CM_NAME=$* 2>&1 | /usr/bin/tee $@/$@.log + +%_wave: + $(DVE) -full64 -vpd $*/vcdplus.vpd & + +cov: + $(DVE) -full64 -cov -covdir coverage/simv.vdb & + +regress: clean + $(MAKE) $(foreach test, 
$(TESTS), $(test)_run) + +summary: + @$(foreach test, $(TESTS), grep -H --color -e "BSG_FINISH" -e "BSG_FATAL" -e "Error" -e \ + "BSG_ERROR" $(test)_run/$(test)_run.log;) + +clean.run: + rm -rf *run/ + +clean.build: + $(MAKE) -C $(BSG_MACHINE_PATH)/.. -f Makefile clean + +clean: clean.run diff --git a/software/spmd/interrupt_tests/Makefile.testlist b/software/spmd/interrupt_tests/Makefile.testlist new file mode 100644 index 000000000..bbf7a5137 --- /dev/null +++ b/software/spmd/interrupt_tests/Makefile.testlist @@ -0,0 +1,29 @@ +TESTS = csr_test \ + dual_handler_test \ + dual_source_test \ + dual_npc_mret_test \ + dual_remote_no_trace_test \ + dual_threading_test \ + remote_test \ + remote_fdiv_test \ + remote_float_test \ + remote_handler1_test \ + remote_handler2_test \ + remote_icache_miss_test \ + remote_idiv_test \ + remote_imul_test \ + remote_multiple_test \ + remote_remote_load_loop_test \ + trace_test \ + trace_fdiv_test \ + trace_idiv_test \ + trace_imul_test \ + trace_float_test \ + trace_icache_miss_test \ + trace_countdown_test \ + trace_branch_mispredict_loop_test \ + trace_jump_loop_test \ + trace_jump_loop_icache_test \ + trace_remote_load_loop_test \ + trace_handler1_test \ + trace_handler2_test diff --git a/software/spmd/interrupt_tests/README.md b/software/spmd/interrupt_tests/README.md new file mode 100644 index 000000000..177a038c2 --- /dev/null +++ b/software/spmd/interrupt_tests/README.md @@ -0,0 +1,19 @@ +# Interrupt Test Regression Suite + +This suite checks interaction of the vanilla core logic with trace and remote interrupts. + +## Summary of Files +1. `*.S`: Files containing RISC-V assembly code testing a specific function. +2. `Makefile.testlist`: A makefile defining a variable `TESTS` that lists all the tests for the regression. +3. `Makefile`: Top-level makefile with targets to run specific tests, the regression suite and observe waveform dumps (in DVE) and coverage reports (in DVE). 
This integrates with the already existing Makefile setup to run tests/regressions. + +## Commands +1. `make <test_name>_run [COVERAGE=1 | WAVE=1]`: Runs a test with name `<test_name>` and dumps the results and log files in a `<test_name>_run` folder. This target can optionally be run with `COVERAGE=1` to turn on coverage for the vanilla core program counter or `WAVE=1` to dump the simulation waveform.
+ Coverage statistics can be found in `coverage/simv.vdb`.
+ Waveform dump can be found in the `<test_name>_run` folder.
+ Example: `make csr_test_run COVERAGE=1`. +2. `make _run_wave`: Opens DVE to view the specified waveform. +3. `make cov`: Opens DVE to view coverage statistics. In case of a regression, this displays the merged coverage reports across all tests. +4. `make regress [COVERAGE=1 | WAVE=1]`: Runs a regression and creates per-test dumps in `_run` folders. The test suite used for the regression is given in `Makefile.testlist`. +5. `make summary`: Use this command after a regression to view a summary of outputs. This target `greps` the generated log files for critical keywords indicating pass/fail. +6. `make clean`: Cleans the current directory and all build directories. diff --git a/software/spmd/interrupt_tests/csr_test.S b/software/spmd/interrupt_tests/csr_test.S new file mode 100644 index 000000000..8d6649e87 --- /dev/null +++ b/software/spmd/interrupt_tests/csr_test.S @@ -0,0 +1,64 @@ +// Author: Tommy +// testing accessing mcsr using csr instructions. + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.text +.globl _start +_start: + bsg_asm_init_regfile + + +// test mstatus +li x1, 0x88 +csrrw x0, mstatus, x1 +csrrw x2, mstatus, x0 +bne x1, x2, fail + +csrrc x0, mstatus, x1 +csrrc x2, mstatus, x0 +bne x2, x0, fail + +li x1, 0x80 +csrrs x0, mstatus, x1 +csrrs x2, mstatus, x0 +bne x1, x2, fail + +li x1, 0xff +csrrc x0, mstatus, x1 +csrrc x2, mstatus, x0 +bne x2, x0, fail + + +# test mepc +li x1, 0xbeef0 +csrrw x0, mepc, x1 +csrrw x2, mepc, x0 +bne x2, x1, fail + + +# test mip +li x1, 0x30000 +csrrw x0, mip, x1 +csrrw x2, mip, x0 +bne x1, x2, fail + +csrrc x0, mip, x1 +csrrc x2, mip, x0 +bne x2, x0, fail + + + + +pass: + bsg_asm_finish(IO_X_INDEX, 0) + +pass_loop: + beq x0, x0, pass_loop + +fail: + bsg_asm_fail(IO_X_INDEX, 0) + +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/dual_handler_test.S b/software/spmd/interrupt_tests/dual_handler_test.S new file mode 100644 index 000000000..ff69eab99 --- /dev/null +++ 
b/software/spmd/interrupt_tests/dual_handler_test.S @@ -0,0 +1,62 @@ +// testing behavior of icache misses in the trace interrupt handler (make a function call to a different region but mret from same region) + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j _remote_interrupt_handler +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + call _count_trace_interrupt + mret + +_remote_interrupt_handler: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + j _count_remote_interrupt + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.remote + li x1, 0x30000 + csrrw x0, mie, x1 + + // send yourself a remote interrupt + li x3, bsg_tile_group_remote_interrupt_ptr(0,0) + li x4, 1 + sw x4, 0(x3) + + li x31, 0xbeef + bne x30, x31, fail + li x31, 0xdead + bne x29, x31, fail + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop + +// force an icache miss +.org 4096 +_count_remote_interrupt: + li x29, 0xdead + mret + +.org 4096*2 +_count_trace_interrupt: + li x30, 0xbeef + ret \ No newline at end of file diff --git a/software/spmd/interrupt_tests/dual_npc_mret_test.S b/software/spmd/interrupt_tests/dual_npc_mret_test.S new file mode 100644 index 000000000..75ea05466 --- /dev/null +++ b/software/spmd/interrupt_tests/dual_npc_mret_test.S @@ -0,0 +1,69 @@ +// stress testing the npc changes during an mret by making an interrupt pending at mret + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j _remote_interrupt_handler +_trace_interrupt: + // send a remote interrupt + sw x4, 0(x3) + + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + + // keep remote interrupt pending + nop + + mret + +_remote_interrupt_handler: + // clear mip.remote + li x1, 
0x10000 + csrrc x0, mip, x1 + + // increment counter + addi x31, x31, 1 + + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // setup for a remote interrupt + // Send a remote interrupt + li x3, bsg_tile_group_remote_interrupt_ptr(0,0) + li x4, 1 + + // loop variable + li x5, 10 + + // enable mie.remote and mie.trace + li x1, 0x30000 + csrrw x0, mie, x1 + + // bunch of nops here since we don't really care what happens, we are only testing interrupt-mret behavior +loop: + beq x5, x0, done + addi x5, x5, -1 + j loop + +done: + // even the following instruction gets traced so update the check accordingly + li x30, 32 + bne x30, x31, fail + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop \ No newline at end of file diff --git a/software/spmd/interrupt_tests/dual_remote_no_trace_test.S b/software/spmd/interrupt_tests/dual_remote_no_trace_test.S new file mode 100644 index 000000000..2bcb3f3ec --- /dev/null +++ b/software/spmd/interrupt_tests/dual_remote_no_trace_test.S @@ -0,0 +1,95 @@ +// this test attempts to establish the priority of remote interrupts over trace interrupts and gives a way we can use remote interrupts to +// Even though tracing is enabled, we don't want to see PC traces + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j _remote_interrupt_handler +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + addi t0, t0, 1 + mret + +_remote_interrupt_handler: + // clear mie.remote + li x1, 0x10000 + csrrc x0, mip, x1 + + // change MEPC to the next instruction (pseudo tracing) + lw x30, 0(sp) + beq x30, x15, t +nt: + // send out a new remote interrupt; should be pending on exit + sw x4, 0(x3) +t: + addi sp, sp, 4 + csrrw x0, mepc, x30 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, 
mstatus, x1 + + // enable mie.remote + li x1, 0x10000 + csrrw x0, mie, x1 + + // Saving all remote targets on stack + la x11, r1 + la x12, r2 + la x13, r3 + la x14, r4 + la x15, r5 + addi sp, sp, -20 + sw x11, 0(sp) + sw x12, 4(sp) + sw x13, 8(sp) + sw x14, 12(sp) + sw x15, 16(sp) + + // setup for a remote interrupt + li x3, bsg_tile_group_remote_interrupt_ptr(0,0) + li x4, 1 + sw x4, 0(x3) + + // enable mie.trace now + li x1, 0x20000 + csrrs x0, mie, x1 + + // trace interrupts should not be trigerred for these instructions + addi t1, t1, 1 +r1: + addi t1, t1, 1 +r2: + addi t1, t1, 1 +r3: + addi t1, t1, 1 +r4: + addi t1, t1, 1 +r5: + // tracing would be valid from here + li x4, 5 + // tracing shouldn't have been done before + beq x4, t0, fail + // first instance of the series of addi's above should be executed + li x4, 1 + // t1 should not have been incremented at all + bne x4, t1, fail + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop + + diff --git a/software/spmd/interrupt_tests/dual_source_test.S b/software/spmd/interrupt_tests/dual_source_test.S new file mode 100644 index 000000000..496fe5f52 --- /dev/null +++ b/software/spmd/interrupt_tests/dual_source_test.S @@ -0,0 +1,58 @@ +// Author: Tommy +// testing both trace and remote interrupt + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j _remote_interrupt_handler +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_remote_interrupt_handler: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x30000 + csrrw x0, mie, x1 + + // count down from 10 + li x2, 10 + li x4, 1 + +.org 4096 +countdown_loop: + beq x0, x2, pass + li x3, bsg_tile_group_remote_interrupt_ptr(0,0) + sw x4, 0(x3) + 
addi x2, x2, -1 + j countdown_loop + +// don't go here + j fail + + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/dual_threading_test.S b/software/spmd/interrupt_tests/dual_threading_test.S new file mode 100644 index 000000000..d494f6efb --- /dev/null +++ b/software/spmd/interrupt_tests/dual_threading_test.S @@ -0,0 +1,117 @@ +// this program creates a tiny threading package. The threads communicate with each other using remote interruts. +// A remote interrupt switches the MEPC register in the handler and moves to the other thread. + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +// Register usage +// t0 - Scratch pad - useful for loading masks and other things into CSRs +// t1, t2 - Registers used for sending remote interrupts +// t3, t4, t5, t6 - registers used for context switching +// s2, s3, s4 - Registers used by thread 1 and 2 + +.globl _start +_remote_interrupt: + j _remote_interrupt_handler +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_remote_interrupt_handler: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + + // Read current MEPC + csrrw t6, mepc, x0 + + // Read the stack for next thread info (GPR + MEPC) + lw t5, 8(sp) + lw t4, 4(sp) + lw t3, 0(sp) + // Write current thread info to stack + sw s2, 0(sp) + sw s3, 4(sp) + sw t6, 8(sp) + // Restore next thread's state + add s2, t3, x0 + add s3, t4, x0 + + // Write new MEPC + csrrw x0, mepc, t5 + + mret + +_start: + bsg_asm_init_regfile + +.org 2048 + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // Setup + // thread 1 + li s2, 10 // loop variable + + // thread 2 + // Write contents to stack + addi sp, sp, -12 + li t3, 10 // loop variable + li t4, 0 // result + la t5, t2_entry // start address for thread 2 + // Write thread 2 values to stack + sw 
t3, 0(sp) + sw t4, 4(sp) + sw t5, 8(sp) + + // remote interrupt + li t1, bsg_tile_group_remote_interrupt_ptr(0,0) + li t2, 1 + + // enable mie.remote and mie.trace + li x1, 0x30000 + csrrw x0, mie, x1 + +// t1 is a +10 loop +t1_entry: + beq x0, s2, t1_done + addi s3, s3, 10 + addi s2, s2, -1 + sw t2, 0(t1) + nop + nop + j t1_entry + +t1_done: + li s4, 100 + beq s4, s3, pass + j fail + +.org 4096 +// t2 is a +20 loop +t2_entry: + beq x0, s2, t2_done + addi s3, s3, 20 + addi s2, s2, -1 + sw t2, 0(t1) + nop + nop + j t2_entry + +t2_done: + li s4, 200 + beq s4, s3, pass + j fail + +.org 6144 +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop \ No newline at end of file diff --git a/software/spmd/interrupt_tests/remote_fdiv_test.S b/software/spmd/interrupt_tests/remote_fdiv_test.S new file mode 100644 index 000000000..c5b4ba4fa --- /dev/null +++ b/software/spmd/interrupt_tests/remote_fdiv_test.S @@ -0,0 +1,52 @@ +// testing entering remote interrupts during iterative floating point divider usage + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + li x31, 0x3 + mret +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.remote + li x1, 0x10000 + csrrw x0, mie, x1 + + // setup for fdiv + li x3, 0x16 + li x4, 0x7 + fcvt.s.w f0, x3 + fcvt.s.w f1, x4 + + // issue a floating point divide instruction (1000/3) + fdiv.s f2, f0, f1 + + // send a remote interrupt to yourself (should get through immediately since backend is not stalled) + li x1, bsg_tile_group_remote_interrupt_ptr(0,0) + li x2, 1 + sw x2, 0(x1) + + // will fail if the remote interrupt did not fire + fcvt.w.s x7, f2 + bne x7, x31, fail + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + 
bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/remote_float_test.S b/software/spmd/interrupt_tests/remote_float_test.S new file mode 100644 index 000000000..3b7f28e07 --- /dev/null +++ b/software/spmd/interrupt_tests/remote_float_test.S @@ -0,0 +1,52 @@ +// Author: Tommy +// taking remote interrupt while executing float instr. + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + mret +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.remote + li x1, 0x10000 + csrrw x0, mie, x1 + + // setup for fadd + li x2, 4 + li x3, 5 + fcvt.s.w f0, x2 + fcvt.s.w f1, x3 + // send yourself remote interrupt + li x1, bsg_tile_group_remote_interrupt_ptr(0,0) + li x2, 1 + sw x2, 0(x1) + fadd.s f2, f0, f1 + fcvt.w.s x4,f2 + li x31, 9 + bne x31, x4, fail + + + + + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/remote_handler1_test.S b/software/spmd/interrupt_tests/remote_handler1_test.S new file mode 100644 index 000000000..3e3ceda25 --- /dev/null +++ b/software/spmd/interrupt_tests/remote_handler1_test.S @@ -0,0 +1,50 @@ +// testing behavior of icache misses in the trace interrupt handler (mret from a different region) + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + j _count_remote_interrupt +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.remote + li x1, 0x10000 + csrrw x0, mie, x1 + + // send yourself a remote interrupt + li x1, bsg_tile_group_remote_interrupt_ptr(0,0) + li x2, 1 
+ sw x2, 0(x1) + + // wait for interrupt + nop + li x31, 0xbeef + bne x30, x31, fail + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop + +// force an icache miss +.org 4096 +_count_remote_interrupt: + li x30, 0xbeef + mret \ No newline at end of file diff --git a/software/spmd/interrupt_tests/remote_handler2_test.S b/software/spmd/interrupt_tests/remote_handler2_test.S new file mode 100644 index 000000000..280db7a62 --- /dev/null +++ b/software/spmd/interrupt_tests/remote_handler2_test.S @@ -0,0 +1,51 @@ +// testing behavior of icache misses in the trace interrupt handler (make a function call to a different region but mret from same region) + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + call _count_remote_interrupt + mret +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.remote + li x1, 0x10000 + csrrw x0, mie, x1 + + // send yourself a remote interrupt + li x1, bsg_tile_group_remote_interrupt_ptr(0,0) + li x2, 1 + sw x2, 0(x1) + + // wait for interrupt + nop + li x31, 0xbeef + bne x30, x31, fail + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop + +// force an icache miss +.org 4096 +_count_remote_interrupt: + li x30, 0xbeef + ret \ No newline at end of file diff --git a/software/spmd/interrupt_tests/remote_icache_miss_test.S b/software/spmd/interrupt_tests/remote_icache_miss_test.S new file mode 100644 index 000000000..da94a68ce --- /dev/null +++ b/software/spmd/interrupt_tests/remote_icache_miss_test.S @@ -0,0 +1,53 @@ +// Author: Tommy +// testing entering remote interrupt and exiting.. 
+// with icache miss + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +_remote_interrupt: + // clear pending bit + li x1, 0x10000 + csrrc x0, mip, x1 + li x31, 0xbeef + mret + +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + +.org 4096 +// enable mstatus.MIE +li x1, 0x8 +csrrw x0, mstatus, x1 + +// enable mie.remote +li x1, 0x10000 +csrrw x0, mie, x1 + +// send yourself remote interrupt +li x1, bsg_tile_group_remote_interrupt_ptr(0,0) +li x2, 1 +sw x2, 0(x1) + + +// WAIT FOR INTERRUPT +.org 4096*2 +li x30, 0xbeef +bne x30, x31, fail + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) + +pass_loop: + beq x0, x0, pass_loop + +fail: + bsg_asm_fail(IO_X_INDEX, 0) + +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/remote_idiv_test.S b/software/spmd/interrupt_tests/remote_idiv_test.S new file mode 100644 index 000000000..bf77e6ab6 --- /dev/null +++ b/software/spmd/interrupt_tests/remote_idiv_test.S @@ -0,0 +1,49 @@ +// testing entering remote interrupts during iterative divider usage + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + li x31, 0xc8 + mret +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.remote + li x1, 0x10000 + csrrw x0, mie, x1 + + // setup for div + li x3, 0x3e8 + li x4, 0x5 + + // issue a divide instruction (1000/5) + div x5, x3, x4 + + // send a remote interrupts to yourself (should get through immediately since backend is not stalled) + li x1, bsg_tile_group_remote_interrupt_ptr(0,0) + li x2, 1 + sw x2, 0(x1) + + // will fail if the remote interrupt did not fire + bne x7, x31, fail + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git 
a/software/spmd/interrupt_tests/remote_imul_test.S b/software/spmd/interrupt_tests/remote_imul_test.S new file mode 100644 index 000000000..85b67b363 --- /dev/null +++ b/software/spmd/interrupt_tests/remote_imul_test.S @@ -0,0 +1,49 @@ +// testing entering remote interrupts during integer multiplier usage + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + li x31, 0x2710 + mret +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.remote + li x1, 0x10000 + csrrw x0, mie, x1 + + // setup for imul + li x3, 0x3e8 + li x4, 0x5 + + // send yourself remote interrupt + li x1, bsg_tile_group_remote_interrupt_ptr(0,0) + li x2, 1 + sw x2, 0(x1) + + // issue a multiplication instruction (1000 * 5) + mul x6, x3, x4 + + // will fail if the remote interrupt did not fire + bne x7, x31, fail + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/remote_multiple_test.S b/software/spmd/interrupt_tests/remote_multiple_test.S new file mode 100644 index 000000000..88c13b430 --- /dev/null +++ b/software/spmd/interrupt_tests/remote_multiple_test.S @@ -0,0 +1,58 @@ +// Testing behaviour for a stream of remote interrupts (we expect the interrupts to override each other) + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + addi x30, x30, 1 + mret +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.remote + li x1, 0x10000 + csrrw x0, mie, x1 + + // count down from 4 + li x2, 4 + // setup for remote interrupts + li x3, bsg_tile_group_remote_interrupt_ptr(0,0) + li 
x4, 1 + +countdown_loop: + beq x0, x2, check + // send 4 back to back interrupts. Only 1 should be accepted + sw x4, 0(x3) + sw x4, 0(x3) + sw x4, 0(x3) + sw x4, 0(x3) + addi x2, x2, -1 + j countdown_loop + + // don't go here + j fail + +check: + li x31, 4 + bne x30, x31, fail + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/remote_remote_load_loop_test.S b/software/spmd/interrupt_tests/remote_remote_load_loop_test.S new file mode 100644 index 000000000..8b16a887c --- /dev/null +++ b/software/spmd/interrupt_tests/remote_remote_load_loop_test.S @@ -0,0 +1,78 @@ +// Author: Tommy +// testing trace interrupt while remote load looping + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + + +.data + _dmem_arr: .space 64 + +.section .dram, "aw" + dram_arr: .space 1024 + +.text +.globl _start +_remote_interrupt: + // clear mip.remote + li x1, 0x10000 + csrrc x0, mip, x1 + mret +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.remote + li x1, 0x10000 + csrrw x0, mie, x1 + + // count down from 20 + li x2, 20 + la x3, dram_arr + li x4, bsg_tile_group_remote_interrupt_ptr(0,0) + li x5, 1 +// load 16 words at a time +.org 4096 +countdown_loop: + beq x0, x2, pass + lw x10, 0(x3) + lw x11, 4(x3) + lw x12, 8(x3) + lw x13, 12(x3) + lw x14, 16(x3) + lw x15, 20(x3) + lw x16, 24(x3) + lw x17, 28(x3) + lw x18, 32(x3) + lw x19, 36(x3) + lw x20, 40(x3) + lw x21, 44(x3) + lw x22, 48(x3) + lw x23, 52(x3) + lw x24, 56(x3) + lw x25, 60(x3) + sw x5, 0(x4) + addi x3, x3, 64 + addi x2, x2, -1 + j countdown_loop + +// don't go here + j fail + + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop 
diff --git a/software/spmd/interrupt_tests/remote_test.S b/software/spmd/interrupt_tests/remote_test.S new file mode 100644 index 000000000..03c83d2e9 --- /dev/null +++ b/software/spmd/interrupt_tests/remote_test.S @@ -0,0 +1,57 @@ +// Author: Tommy +// testing entering remote interrupt and exiting.. + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +_remote_interrupt: + // clear pending bit + li x1, 0x10000 + csrrc x0, mip, x1 + li x31, 0xbeef + mret + +_trace_interrupt: + j fail + +_start: + bsg_asm_init_regfile + +// enable mstatus.MIE +li x1, 0x8 +csrrw x0, mstatus, x1 + +// enable mie.remote +li x1, 0x10000 +csrrw x0, mie, x1 + +// send yourself remote interrupt +li x1, bsg_tile_group_remote_interrupt_ptr(0,0) +li x2, 1 +sw x2, 0(x1) + + +// WAIT FOR INTERRUPT +nop +nop +nop +nop +nop +nop +nop +li x30, 0xbeef +bne x30, x31, fail + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) + +pass_loop: + beq x0, x0, pass_loop + +fail: + bsg_asm_fail(IO_X_INDEX, 0) + +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_branch_mispredict_loop_test.S b/software/spmd/interrupt_tests/trace_branch_mispredict_loop_test.S new file mode 100644 index 000000000..735ff27b1 --- /dev/null +++ b/software/spmd/interrupt_tests/trace_branch_mispredict_loop_test.S @@ -0,0 +1,53 @@ +// Author: Tommy +// testing entering trace interrupt and exiting.. 
+ +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + // count down from 10 + li x2, 10 + li x3, 1 + +.org 4096 +countdown_loop: + beq x0, x2, pass + beq x0, x0, loop1 // forward branch (predict not taken, but actually taken) +loop1: + beq x0, x3, countdown_loop // backward branch (predict taken, but not actually taken) + addi x2, x2, -1 + j countdown_loop + +// don't go here + j fail + + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_countdown_test.S b/software/spmd/interrupt_tests/trace_countdown_test.S new file mode 100644 index 000000000..203ace7d1 --- /dev/null +++ b/software/spmd/interrupt_tests/trace_countdown_test.S @@ -0,0 +1,46 @@ +// Author: Tommy +// testing entering trace interrupt and exiting.. 
+ +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + // count down from 10 + li x2, 10 + +countdown_loop: + beq x2, x0, pass + addi x2, x2, -1 + j countdown_loop + j fail + + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_fdiv_test.S b/software/spmd/interrupt_tests/trace_fdiv_test.S new file mode 100644 index 000000000..6a4de28fb --- /dev/null +++ b/software/spmd/interrupt_tests/trace_fdiv_test.S @@ -0,0 +1,53 @@ +// testing entering trace interrupts during iterative floating point divider usage + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + // setup for fdiv + li x3, 0x16 + li x4, 0x7 + li x5, 0x8 + li x6, 0x3 + fcvt.s.w f0, x3 + fcvt.s.w f1, x4 + fcvt.s.w f2, x5 + fcvt.s.w f3, x6 + + // issue a floating point divide instruction (1000/3) + fdiv.s f3, f0, f1 + // divider busy stall; trace interrupt should bot fire until first division completes + fdiv.s f6, f2, f3 + + // decode dependency stall; trace interrupt should not fire until divider completes + fcvt.w.s x7, f6 + li x31, 0x3 + bne x7, x31, fail + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git 
a/software/spmd/interrupt_tests/trace_float_test.S b/software/spmd/interrupt_tests/trace_float_test.S new file mode 100644 index 000000000..41c5f5584 --- /dev/null +++ b/software/spmd/interrupt_tests/trace_float_test.S @@ -0,0 +1,53 @@ +// Author: Tommy +// testing entering trace interrupt and exiting.. + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + // count down from 10.0 + fcvt.s.w f0, x0 + li x2, 10 + fcvt.s.w f2, x2 + li x3, 1 + fcvt.s.w f3, x3 + li x4, 1 + + +countdown_loop: + feq.s x2, f2, f0 + beq x2, x4, pass + fsub.s f2, f2, f3 + j countdown_loop + j fail + + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_handler1_test.S b/software/spmd/interrupt_tests/trace_handler1_test.S new file mode 100644 index 000000000..b9cd50b41 --- /dev/null +++ b/software/spmd/interrupt_tests/trace_handler1_test.S @@ -0,0 +1,43 @@ +// testing behavior of icache misses in the trace interrupt handler (mret from a different region) + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + j _count_trace_interrupt + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + li x31, 0xbeef + bne x30, x31, fail + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, 
x0, fail_loop + +// force an icache miss +.org 4096 +_count_trace_interrupt: + li x30, 0xbeef + mret \ No newline at end of file diff --git a/software/spmd/interrupt_tests/trace_handler2_test.S b/software/spmd/interrupt_tests/trace_handler2_test.S new file mode 100644 index 000000000..054cd6239 --- /dev/null +++ b/software/spmd/interrupt_tests/trace_handler2_test.S @@ -0,0 +1,44 @@ +// testing behavior of icache misses in the trace interrupt handler (make a function call to a different region but mret from same region) + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + call _count_trace_interrupt + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + li x31, 0xbeef + bne x30, x31, fail + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop + +// force an icache miss +.org 4096 +_count_trace_interrupt: + li x30, 0xbeef + ret \ No newline at end of file diff --git a/software/spmd/interrupt_tests/trace_icache_miss_test.S b/software/spmd/interrupt_tests/trace_icache_miss_test.S new file mode 100644 index 000000000..6919af57c --- /dev/null +++ b/software/spmd/interrupt_tests/trace_icache_miss_test.S @@ -0,0 +1,45 @@ +// Author: Tommy +// testing entering trace interrupt and exiting.. 
+ +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear trace enable/pending + li x1, 0x20000 + csrrc x0, mie, x1 + csrrc x0, mip, x1 + li x31, 0xbeef + mret + +_start: + bsg_asm_init_regfile + +// enable mstatus.MIE +.org 4096 +li x1, 0x8 +csrrw x0, mstatus, x1 + +// enable mie.trace +li x1, 0x20000 +csrrw x0, mie, x1 + +.org 4096*2 +li x30, 0xbeef +bne x30, x31, fail + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) + +pass_loop: + beq x0, x0, pass_loop + +fail: + bsg_asm_fail(IO_X_INDEX, 0) + +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_idiv_test.S b/software/spmd/interrupt_tests/trace_idiv_test.S new file mode 100644 index 000000000..680003837 --- /dev/null +++ b/software/spmd/interrupt_tests/trace_idiv_test.S @@ -0,0 +1,47 @@ +// testing entering trace interrupts during iterative divider usage + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + // use different operands because divider latches previous result + li x3, 0x3e8 + li x4, 0x5 + li x5, 0x2 + + // issue a divide instruction (1000/5) + div x6, x3, x4 + // divider busy stall; trace interrupt should not fire until first division completes + div x7, x3, x5 + + // decode dependency stall; trace interrupt should not fire until divider completes + li x31, 0x1f4 + bne x7, x31, fail + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_imul_test.S b/software/spmd/interrupt_tests/trace_imul_test.S new file mode 100644 index 
000000000..493edcd2a --- /dev/null +++ b/software/spmd/interrupt_tests/trace_imul_test.S @@ -0,0 +1,45 @@ +// testing entering trace interrupts during integer multiplier usage + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + li x3, 0x3e8 + li x4, 0x5 + li x5, 0x2 + + // issue a multiplication instruction (1000 * 5) + mul x6, x3, x4 + // multiplier dependency (not a real stall since multiplication is 1 cycle (uses fma), trace interrupt should fire as expected) + mul x7, x6, x5 + + li x31, 0x2710 + bne x7, x31, fail + +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_jump_loop_icache_test.S b/software/spmd/interrupt_tests/trace_jump_loop_icache_test.S new file mode 100644 index 000000000..b59df6377 --- /dev/null +++ b/software/spmd/interrupt_tests/trace_jump_loop_icache_test.S @@ -0,0 +1,55 @@ +// Author: Tommy +// testing entering trace interrupt and exiting.. 
+ +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + // count down from 10 + li x2, 10 + +.org 4096 +countdown_loop: + beq x0, x2, pass + j loop1 +loop2: + j loop3 +loop1: + j loop2 +loop3: + addi x2, x2, -1 + j countdown_loop + +// don't go here + j fail + + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_jump_loop_test.S b/software/spmd/interrupt_tests/trace_jump_loop_test.S new file mode 100644 index 000000000..7a5bba2ac --- /dev/null +++ b/software/spmd/interrupt_tests/trace_jump_loop_test.S @@ -0,0 +1,54 @@ +// Author: Tommy +// testing entering trace interrupt and exiting.. 
+ +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + // count down from 10 + li x2, 10 + +countdown_loop: + beq x0, x2, pass + j loop1 +loop2: + j loop3 +loop1: + j loop2 +loop3: + addi x2, x2, -1 + j countdown_loop + +// don't go here + j fail + + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_remote_load_loop_test.S b/software/spmd/interrupt_tests/trace_remote_load_loop_test.S new file mode 100644 index 000000000..4832ff742 --- /dev/null +++ b/software/spmd/interrupt_tests/trace_remote_load_loop_test.S @@ -0,0 +1,77 @@ +// Author: Tommy +// testing trace interrupt while remote load looping + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + + +.data + _dmem_arr: .space 64 + +.section .dram, "aw" + dram_arr: .space 1024 + +.text +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear mip.trace + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_start: + bsg_asm_init_regfile + + // enable mstatus.MIE + li x1, 0x8 + csrrw x0, mstatus, x1 + + // enable mie.trace + li x1, 0x20000 + csrrw x0, mie, x1 + + // count down from 10 + li x2, 20 + la x3, dram_arr + + +// load 16 words at a time +.org 4096 +countdown_loop: + beq x0, x2, pass + lw x10, 0(x3) + lw x11, 4(x3) + lw x12, 8(x3) + lw x13, 12(x3) + lw x14, 16(x3) + lw x15, 20(x3) + lw x16, 24(x3) + lw x17, 28(x3) + lw x18, 32(x3) + lw x19, 36(x3) + lw x20, 40(x3) + lw x21, 44(x3) + lw x22, 48(x3) + lw x23, 52(x3) + lw x24, 56(x3) + lw x25, 60(x3) + addi x3, x3, 64 + addi x2, x2, -1 + j countdown_loop + +// don't go 
here + j fail + + + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) +pass_loop: + beq x0, x0, pass_loop +fail: + bsg_asm_fail(IO_X_INDEX, 0) +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_tests/trace_test.S b/software/spmd/interrupt_tests/trace_test.S new file mode 100644 index 000000000..a6d3dd823 --- /dev/null +++ b/software/spmd/interrupt_tests/trace_test.S @@ -0,0 +1,43 @@ +// Author: Tommy +// testing entering trace interrupt and exiting.. + +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j fail +_trace_interrupt: + // clear trace enable/pending + li x1, 0x20000 + csrrc x0, mie, x1 + csrrc x0, mip, x1 + li x31, 0xbeef + mret + +_start: + bsg_asm_init_regfile + +// enable mstatus.MIE +li x1, 0x8 +csrrw x0, mstatus, x1 + +// enable mie.trace +li x1, 0x20000 +csrrw x0, mie, x1 + +li x30, 0xbeef +bne x30, x31, fail + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) + +pass_loop: + beq x0, x0, pass_loop + +fail: + bsg_asm_fail(IO_X_INDEX, 0) + +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/interrupt_trace_trigger_remote/Makefile b/software/spmd/interrupt_trace_trigger_remote/Makefile new file mode 100644 index 000000000..0bc89ea7f --- /dev/null +++ b/software/spmd/interrupt_trace_trigger_remote/Makefile @@ -0,0 +1,15 @@ +bsg_tiles_X= 1 +bsg_tiles_Y= 1 + +all: main.run + +include ../Makefile.include + +RISCV_LINK_OPTS = -march=rv32imaf -nostdlib -nostartfiles + +main.riscv: $(LINK_SCRIPT) main.o + $(RISCV_LINK) main.o -o $@ $(RISCV_LINK_OPTS) + + +include ../../mk/Makefile.tail_rules + diff --git a/software/spmd/interrupt_trace_trigger_remote/main.S b/software/spmd/interrupt_trace_trigger_remote/main.S new file mode 100644 index 000000000..92c6c1457 --- /dev/null +++ b/software/spmd/interrupt_trace_trigger_remote/main.S @@ -0,0 +1,117 @@ +// test trace interrupt triggering remote interrupt. 
+ +#include "bsg_manycore_arch.h" +#include "bsg_manycore_asm.h" + +.globl _start +_remote_interrupt: + j _remote_interrupt_handler +_trace_interrupt: + // send itself remote interrupt + li x1, bsg_tile_group_remote_interrupt_ptr(0,0) + li x2, 1 + sw x2, 0(x1) + // clear pending bit + li x1, 0x20000 + csrrc x0, mip, x1 + mret + +_remote_interrupt_handler: + // clear pending bit + li x1, 0x10000 + csrrc x0, mip, x1 + mret + + +_start: + li x1, 0 + li x2, 4096*2-4 + li x3, 0 + li x4, 0 + li x5, 0 + li x6, 0 + li x7, 0 + li x8, 0 + li x9, 0 + li x10,0 + li x11,0 + li x12,0 + li x13,0 + li x14,0 + li x15,0 + li x16,0 + li x17,0 + li x18,0 + li x19,0 + li x20,0 + li x21,0 + li x22,0 + li x23,0 + li x24,0 + li x25,0 + li x26,0 + li x27,0 + li x28,0 + li x29,0 + li x30,0 + li x31,0 + + fcvt.s.w f0, x0 + fcvt.s.w f1, x0 + fcvt.s.w f2, x0 + fcvt.s.w f3, x0 + fcvt.s.w f4, x0 + fcvt.s.w f5, x0 + fcvt.s.w f6, x0 + fcvt.s.w f7, x0 + fcvt.s.w f8, x0 + fcvt.s.w f9, x0 + fcvt.s.w f10,x0 + fcvt.s.w f11,x0 + fcvt.s.w f12,x0 + fcvt.s.w f13,x0 + fcvt.s.w f14,x0 + fcvt.s.w f15,x0 + fcvt.s.w f16,x0 + fcvt.s.w f17,x0 + fcvt.s.w f18,x0 + fcvt.s.w f19,x0 + fcvt.s.w f20,x0 + fcvt.s.w f21,x0 + fcvt.s.w f22,x0 + fcvt.s.w f23,x0 + fcvt.s.w f24,x0 + fcvt.s.w f25,x0 + fcvt.s.w f26,x0 + fcvt.s.w f27,x0 + fcvt.s.w f28,x0 + fcvt.s.w f29,x0 + fcvt.s.w f30,x0 + fcvt.s.w f31,x0 + +// enable mstatus.MIE +li x1, 0x8 +csrrw x0, mstatus, x1 + +// enable mie.trace and mie.remote +li x1, 0x30000 +csrrw x0, mie, x1 + +li x31, 5 +count_loop: + beq x31, x0, pass + addi x31, x31, -1 + j count_loop + +// returning from interrupt +pass: + bsg_asm_finish(IO_X_INDEX, 0) + +pass_loop: + beq x0, x0, pass_loop + +fail: + bsg_asm_fail(IO_X_INDEX, 0) + +fail_loop: + beq x0, x0, fail_loop diff --git a/software/spmd/jalr_rv32/main.S b/software/spmd/jalr_rv32/main.S index 8ea1d8986..683e9b3a3 100644 --- a/software/spmd/jalr_rv32/main.S +++ b/software/spmd/jalr_rv32/main.S @@ -24,6 +24,9 @@ li x10, 2 pass: 
bsg_asm_finish(IO_X_INDEX, 0) - +pass_loop: + j pass_loop fail: bsg_asm_fail(IO_X_INDEX, -1) +fail_loop: + j fail_loop diff --git a/software/spmd/load_dependency/main.S b/software/spmd/load_dependency/main.S index da7bd0292..3f0cfae44 100644 --- a/software/spmd/load_dependency/main.S +++ b/software/spmd/load_dependency/main.S @@ -9,30 +9,30 @@ li ra, 10; sum: // Store data in some locations - bsg_asm_remote_store(0, 0, 0x1000, 1); - bsg_asm_remote_store(0, 0, 0x1004, 2); - bsg_asm_remote_store(0, 0, 0x1008, 3); - bsg_asm_remote_store(0, 0, 0x100c, 4); - bsg_asm_remote_store(0, 0, 0x1010, 5); - bsg_asm_remote_store(0, 0, 0x1014, 6); - bsg_asm_remote_store(0, 0, 0x1018, 7); - bsg_asm_remote_store(0, 0, 0x101c, 8); - bsg_asm_local_store(0x1020, 9); - bsg_asm_local_store(0x1024, 10); - bsg_asm_local_store(0x1028, 11); - bsg_asm_local_store(0x102c, 12); - bsg_asm_local_store(0x1030, 13); - bsg_asm_local_store(0x1034, 14); - bsg_asm_local_store(0x1038, 15); + bsg_asm_remote_store(0, 0, 0x0000, 1); + bsg_asm_remote_store(0, 0, 0x0004, 2); + bsg_asm_remote_store(0, 0, 0x0008, 3); + bsg_asm_remote_store(0, 0, 0x000c, 4); + bsg_asm_remote_store(0, 0, 0x0010, 5); + bsg_asm_remote_store(0, 0, 0x0014, 6); + bsg_asm_remote_store(0, 0, 0x0018, 7); + bsg_asm_remote_store(0, 0, 0x001c, 8); + bsg_asm_local_store(0x0020, 9); + bsg_asm_local_store(0x0024, 10); + bsg_asm_local_store(0x0028, 11); + bsg_asm_local_store(0x002c, 12); + bsg_asm_local_store(0x0030, 13); + bsg_asm_local_store(0x0034, 14); + bsg_asm_local_store(0x0038, 15); fence; test_1: // Single cycle remote loads // nops to test zero delay remote loads - bsg_asm_remote_load(s0, 0, 0, 0x1000); // 1 - bsg_asm_remote_load(s1, 0, 0, 0x1004); // 2 - bsg_asm_remote_load(s2, 0, 0, 0x1008); // 3 - bsg_asm_remote_load(s3, 0, 0, 0x100c); // 4 + bsg_asm_remote_load(s0, 0, 0, 0x0000); // 1 + bsg_asm_remote_load(s1, 0, 0, 0x0004); // 2 + bsg_asm_remote_load(s2, 0, 0, 0x0008); // 3 + bsg_asm_remote_load(s3, 0, 0, 0x000c); // 4 nop; nop; 
nop; @@ -52,7 +52,7 @@ test_1: test_2: // Load write-back during stalls // A dummy multiply is used to create a stall - bsg_asm_remote_load(s1, 0, 0, 0x1010); // 5 + bsg_asm_remote_load(s1, 0, 0, 0x0010); // 5 nop; // nops to let load pass through the pipeline before stall nop; mul t0, t0, t0; @@ -62,8 +62,8 @@ test_2: test_3: // Dependency test - bsg_asm_remote_load(s1, 0, 0, 0x1014); // 6 - bsg_asm_remote_load(s2, 0, 0, 0x1018); // 7 + bsg_asm_remote_load(s1, 0, 0, 0x0014); // 6 + bsg_asm_remote_load(s2, 0, 0, 0x0018); // 7 add s1, s0, s1; add s0, s1, s2; bi(beq, s0, 28, test_4); @@ -71,14 +71,14 @@ test_3: test_4: // Contention between remote and local loads - li a0, 0x1028; - li a1, 0x102c; - li a2, 0x1030; - li a3, 0x1034; - li a4, 0x1038; - bsg_asm_remote_load(s1, 0, 0, 0x101c); // 8 - bsg_asm_remote_load(s2, 0, 0, 0x1020); // 9 - bsg_asm_remote_load(s3, 0, 0, 0x1024); // 10 + li a0, 0x0028; + li a1, 0x002c; + li a2, 0x0030; + li a3, 0x0034; + li a4, 0x0038; + bsg_asm_remote_load(s1, 0, 0, 0x001c); // 8 + bsg_asm_remote_load(s2, 0, 0, 0x0020); // 9 + bsg_asm_remote_load(s3, 0, 0, 0x0024); // 10 lw s4, 0x0(a0); // 11 lw s5, 0x0(a1); // 12 lw s6, 0x0(a2); // 13 diff --git a/software/spmd/memtest2020/Makefile b/software/spmd/memtest2020/Makefile new file mode 100644 index 000000000..a641d23a5 --- /dev/null +++ b/software/spmd/memtest2020/Makefile @@ -0,0 +1,17 @@ +bsg_tiles_X = 1 +bsg_tiles_Y = 1 + + +all: main.run + +OBJECT_FILES=main.o + +include ../Makefile.include + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. 
"-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + + +main.o: Makefile + +include ../../mk/Makefile.tail_rules diff --git a/software/spmd/memtest2020/main.c b/software/spmd/memtest2020/main.c new file mode 100644 index 000000000..5e52e5bc5 --- /dev/null +++ b/software/spmd/memtest2020/main.c @@ -0,0 +1,62 @@ +// This test the memory system by accessling a wide range of addresses using varying strides. +// The strides are 2^n-1 with n > 0. This exercises vcache replacement. + +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" + +int data __attribute__ ((section (".dram"))) = {0}; +#define N 256 + + +int get_stride(int n) +{ + int stride = 1; + for (int i = 0; i < n; i++) + { + stride = stride * 2; + } + return stride-1; +} + +int main() +{ + bsg_set_tile_x_y(); + + int idx; + int* dram_ptr = &data; + + for (int k = 0; k < 2; k++) + { + for (int n = 1; n < 21; n++) + { + int stride = get_stride(n); + + // store + idx = 0; + for (int i = 0; i < N; i++) + { + dram_ptr[idx] = i; + idx += stride; + } + + // load + int load_val[N]; + idx = 0; + for (int i = 0; i < N; i++) + { + load_val[i] = dram_ptr[idx]; + idx += stride; + } + + // validate + for (int i = 0; i < N; i++) + { + if (load_val[i] != i) bsg_fail(); + } + } + } + + bsg_finish(); + bsg_wait_while(1); +} + diff --git a/software/spmd/perf_test_barrier/Makefile b/software/spmd/perf_test_barrier/Makefile index b09451f82..1738f5b12 100644 --- a/software/spmd/perf_test_barrier/Makefile +++ b/software/spmd/perf_test_barrier/Makefile @@ -1,5 +1,4 @@ -export BSG_MACHINE_PATH = $(BSG_MANYCORE_DIR)/machines/8x4 - +TRACE=0 all: main.run diff --git a/software/spmd/perf_test_barrier_cpp/Makefile b/software/spmd/perf_test_barrier_cpp/Makefile index a9a6c6849..a90f9f831 100644 --- a/software/spmd/perf_test_barrier_cpp/Makefile +++ b/software/spmd/perf_test_barrier_cpp/Makefile @@ -1,4 +1,4 @@ -export BSG_MACHINE_PATH = $(BSG_MANYCORE_DIR)/machines/16x8_ruche +TRACE=0 all: main.run diff --git 
a/software/spmd/perf_test_conv3x3/Makefile b/software/spmd/perf_test_conv3x3/Makefile index 53d1e9fcf..8b27dc5ae 100644 --- a/software/spmd/perf_test_conv3x3/Makefile +++ b/software/spmd/perf_test_conv3x3/Makefile @@ -1,5 +1,3 @@ -export BSG_MACHINE_PATH = $(BSG_MANYCORE_DIR)/machines/16x8_ruche - all: main.run OBJECT_FILES=main.o diff --git a/software/spmd/perf_test_reduction/Makefile b/software/spmd/perf_test_reduction/Makefile index 53d1e9fcf..8b27dc5ae 100644 --- a/software/spmd/perf_test_reduction/Makefile +++ b/software/spmd/perf_test_reduction/Makefile @@ -1,5 +1,3 @@ -export BSG_MACHINE_PATH = $(BSG_MANYCORE_DIR)/machines/16x8_ruche - all: main.run OBJECT_FILES=main.o diff --git a/software/spmd/perf_test_scan/Makefile b/software/spmd/perf_test_scan/Makefile index 8e7c5467c..adf9ecca1 100644 --- a/software/spmd/perf_test_scan/Makefile +++ b/software/spmd/perf_test_scan/Makefile @@ -1,5 +1,3 @@ -export BSG_MACHINE_PATH = $(BSG_MANYCORE_DIR)/machines/16x8_ruche - bsg_tiles_X = 16 bsg_tiles_Y = 1 diff --git a/software/spmd/pod_barrier/Makefile b/software/spmd/pod_barrier/Makefile new file mode 100644 index 000000000..d165df7ec --- /dev/null +++ b/software/spmd/pod_barrier/Makefile @@ -0,0 +1,15 @@ +#TRACE=1 + +all: main.run + +OBJECT_FILES=main.o + +include ../Makefile.include + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. 
"-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + + +main.o: Makefile + +include ../../mk/Makefile.tail_rules diff --git a/software/spmd/pod_barrier/main.c b/software/spmd/pod_barrier/main.c new file mode 100644 index 000000000..52eb7062a --- /dev/null +++ b/software/spmd/pod_barrier/main.c @@ -0,0 +1,176 @@ +// does a pod barrier in 87 cycles + +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" + +#define BSG_TILE_GROUP_X_DIM bsg_tiles_X +#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y +//#define BSG_BARRIER_DEBUG +#include "bsg_tile_group_barrier.h" + + +typedef struct _bsg_pod_barrier_ { + int col[2]; + int row[4]; + int notify; +} bsg_pod_barrier_s; + +bsg_pod_barrier_s pod_barrier = { {0},{0},0 }; +int pod_barrier_sense = -1; + +#define kPodBarrier_CenterX 8 +#define kPodBarrier_CenterY 4 + +char *pod_barrier_parent = 0; +int *pod_barrier_buddy = 0; + +void bsg_pod_barrier_init() +{ + // send notification to my row + char *p = (char *) bsg_remote_ptr(kPodBarrier_CenterX, bsg_y, &pod_barrier.row); + pod_barrier_parent = &p[((bsg_x & 3)<<2) + ((bsg_x >> 2) & 3)]; + int buddy_x = bsg_x - 1; + + // set buddy to self if at the end of a group of 4 + if ((bsg_x & 3) == 0) + buddy_x = bsg_x; + + pod_barrier_buddy = (char *) bsg_remote_ptr(buddy_x, bsg_y, &pod_barrier.notify); +} + +void bsg_pod_barrier() +{ + int tmp_pod_barrier_sense = pod_barrier_sense; + int tmp_bsg_x = bsg_x; + *pod_barrier_parent = tmp_pod_barrier_sense; + + int *tmp_pod_barrier_buddy = pod_barrier_buddy; + int *pod_barrier_notify_ptr = &pod_barrier.notify; + int tmp_pod_barrier_sense_inv = ~tmp_pod_barrier_sense; + int tmp_bsg_y = bsg_y; + int inc4; + + tmp_bsg_x = bsg_mulu(tmp_bsg_x,1); + tmp_bsg_x = bsg_mulu(tmp_bsg_x,1); + // bsg_print_int(0x00BA0000+__bsg_id); + if (tmp_bsg_x == kPodBarrier_CenterX) + { + // bsg_print_int(0xBEEF); + char *q = (char *) bsg_remote_ptr(kPodBarrier_CenterX,kPodBarrier_CenterY,&pod_barrier.col[0]); + q = &q[tmp_bsg_y]; + + bsg_remote_int_ptr foo = 
&pod_barrier.row; + bsg_join4_relay(foo,tmp_pod_barrier_sense, q); + + if (tmp_bsg_y == kPodBarrier_CenterY) + { + // shift by 2 is because it is an int pointer + int incY = bsg_li((1 << REMOTE_Y_CORD_SHIFT) >> 2); + int *r = (int *) bsg_remote_ptr(kPodBarrier_CenterX-1,0,pod_barrier_notify_ptr); + int *s = (int *) bsg_remote_ptr(kPodBarrier_CenterX,0,pod_barrier_notify_ptr); + + bsg_remote_int_ptr addr = &pod_barrier.col; + + bsg_join2(addr,tmp_pod_barrier_sense); + // while (addr[0] != tmp_pod_barrier_sense); + // while (addr[1] != tmp_pod_barrier_sense); + + // barrier has completed!! + // wake up 16 tiles + *r = tmp_pod_barrier_sense; r += incY; // 0 + *r = tmp_pod_barrier_sense; r += incY; // 1 + *r = tmp_pod_barrier_sense; r += incY; // 2 + *r = tmp_pod_barrier_sense; r += incY; // 3 + *r = tmp_pod_barrier_sense; r += incY; // 4 + *r = tmp_pod_barrier_sense; r += incY; // 5 + *r = tmp_pod_barrier_sense; r += incY; // 6 + *r = tmp_pod_barrier_sense; r += incY; // 7 + *s = tmp_pod_barrier_sense; s += incY; // 0 + *s = tmp_pod_barrier_sense; s += incY; // 1 + *s = tmp_pod_barrier_sense; s += incY; // 2 + *s = tmp_pod_barrier_sense; s += incY; // 3 + *s = tmp_pod_barrier_sense; s += incY; // 4 + *s = tmp_pod_barrier_sense; s += incY; // 5 + *s = tmp_pod_barrier_sense; s += incY; // 6 + *s = tmp_pod_barrier_sense; s += incY; // 7 + + goto done; + } + } + else if (tmp_bsg_x == kPodBarrier_CenterX-1) + { + // shift by 2 is because it is an int pointer + inc4 = bsg_li((1 << REMOTE_X_CORD_SHIFT) >> 2)*4; + volatile int *s = (int *) bsg_remote_ptr(3,tmp_bsg_y,pod_barrier_notify_ptr); // 3 + volatile int *t = s + inc4+inc4; // 11 + volatile int *u = s + inc4*3; // 15 + + // wait for notification from the center tile, send out notification to other 3 quads + bsg_wait_local_int_asm_blind(pod_barrier_notify_ptr,tmp_pod_barrier_sense); + + *s = tmp_pod_barrier_sense; + *t = tmp_pod_barrier_sense; + *u = tmp_pod_barrier_sense; + + *tmp_pod_barrier_buddy = 
tmp_pod_barrier_sense; + + goto done; + } + + // wait for broadcast to return! + bsg_wait_local_int_asm_blind(pod_barrier_notify_ptr,tmp_pod_barrier_sense); + //bsg_print_int(0xBEEB); + + *tmp_pod_barrier_buddy = tmp_pod_barrier_sense; + // bsg_print_int(0x10BA0000+__bsg_id); + done: + // invert sense of pod barrier, ready to go for next barrier! + pod_barrier_sense= tmp_pod_barrier_sense_inv; +} + +INIT_TILE_GROUP_BARRIER(r_barrier, c_barrier, 0, bsg_tiles_X-1, 0, bsg_tiles_Y-1); + +int main() +{ + int val; + + bsg_set_tile_x_y(); + bsg_pod_barrier_init(); + + int id = __bsg_id; + + if (id == 0) + { + bsg_print_int(0xFACADE); + bsg_pod_barrier(); + bsg_print_int(0xFACADE); + bsg_pod_barrier(); + bsg_print_int(0xFACADE); + bsg_pod_barrier(); + bsg_print_int(0xFACADE); + bsg_pod_barrier(); + bsg_print_int(0xFACADE); + bsg_pod_barrier(); + bsg_print_int(0xFACADE); + bsg_pod_barrier(); + bsg_print_int(0xFACADE); + bsg_finish(); + } + else + { +#define STALL 0 + bsg_pod_barrier(); + if (STALL) val += bsg_div(val,2); + bsg_pod_barrier(); + if (STALL) val += bsg_div(val,2); + bsg_pod_barrier(); + if (STALL) val += bsg_div(val,2); + bsg_pod_barrier(); + if (STALL) val += bsg_div(val,2); + bsg_pod_barrier(); + if (STALL) val += bsg_div(val,2); + bsg_pod_barrier(); + if (STALL) val += bsg_div(val,2); + } + bsg_wait_while(1); +} diff --git a/software/spmd/prefetch_bandwidth_test2/Makefile b/software/spmd/prefetch_bandwidth_test2/Makefile new file mode 100644 index 000000000..ea88c26cb --- /dev/null +++ b/software/spmd/prefetch_bandwidth_test2/Makefile @@ -0,0 +1,21 @@ +MAX_CYCLES=100000000 +NUM_FINISH=32 # each tile sends a finish packet. + +bsg_tiles_org_X ?=0 +bsg_tiles_org_Y ?=3 +bsg_tiles_X=16 +bsg_tiles_Y=2 + +all: main.run + +OBJECT_FILES=main.o + +include ../Makefile.include + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. 
"-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + + +main.o: Makefile + +include ../../mk/Makefile.tail_rules diff --git a/software/spmd/prefetch_bandwidth_test2/main.c b/software/spmd/prefetch_bandwidth_test2/main.c new file mode 100644 index 000000000..e0fa60970 --- /dev/null +++ b/software/spmd/prefetch_bandwidth_test2/main.c @@ -0,0 +1,42 @@ +// prefetch bandwidth test + + +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" + +// number of tiles used +#define N (bsg_tiles_X*bsg_tiles_Y) +// number of cache lines fetched by each tile +#define M (16*65536/bsg_tiles_X/bsg_tiles_Y/VCACHE_BLOCK_SIZE_WORDS) + +// Tiles write to this array. +int data[1] __attribute__ ((section (".dram"))) = {0}; + + +int main() +{ + + // set tiles + bsg_set_tile_x_y(); + + + // Everyone prefetch data from DRAM to vcache. + int * data_ptr = &data[__bsg_id * VCACHE_BLOCK_SIZE_WORDS]; + int stride = N*VCACHE_BLOCK_SIZE_WORDS; + int i = 0; + + while (i < M) { + asm volatile ("lw x0, 0(%[data_ptr])" : : [data_ptr] "r" (data_ptr)); + data_ptr = data_ptr + stride; + i++; + } + + // fence + bsg_fence(); + + + bsg_finish(); + + bsg_wait_while(1); +} + diff --git a/software/spmd/quicksort/main.c b/software/spmd/quicksort/main.c index 936fb7b2a..683ad4898 100644 --- a/software/spmd/quicksort/main.c +++ b/software/spmd/quicksort/main.c @@ -122,7 +122,7 @@ void quicksort(int* lst, int n) } else { - bsg_fail_x(0); + bsg_fail(); } } @@ -146,7 +146,7 @@ int main() if (data_copy[bsg_x][bsg_y][i] > data_copy[bsg_x][bsg_y][i+1]) { - bsg_fail_x(0); + bsg_fail(); } } @@ -157,13 +157,13 @@ int main() else { bsg_printf("sum: %d, expected %d, [FAILED]\n", sum,ANSWER); - bsg_fail_x(0); + bsg_fail(); } bsg_barrier_wait( &tile0_barrier, 0, 0); if( bsg_x == 0 && bsg_y == 0) - bsg_finish_x(0); + bsg_finish(); bsg_wait_while(1); } diff --git a/software/spmd/quicktouch/main.c b/software/spmd/quicktouch/main.c index 4771f91f5..d59c864a3 100644 --- a/software/spmd/quicktouch/main.c +++ 
b/software/spmd/quicktouch/main.c @@ -14,6 +14,11 @@ #include "bsg_manycore.h" #include "bsg_set_tile_x_y.h" +#define VCACHE_BLOCK_SIZE_IN_WORDS 8 + +float data __attribute__ ((section (".dram"))) = {0}; + + int main() { @@ -59,27 +64,21 @@ int main() // store the float val to each vcache. float a = 1.1; float c = -0.32; - for (int x = 0; x < bsg_global_X; x++) + + float *dram_ptr = &data; + for (int x = 0; x < 2*bsg_global_X; x++) { float b = (float) x; - // bot vcache vcache_store_val[x] = (a*b)+c; - bsg_global_float_store(x,bsg_global_Y+1,0,vcache_store_val[x]); - // top vcache - vcache_store_val[bsg_global_X+x] = (a*b)-c; - bsg_global_float_store(x,0,0,vcache_store_val[bsg_global_X+x]); + dram_ptr[VCACHE_BLOCK_SIZE_IN_WORDS*x] = vcache_store_val[x]; } // load the float vals from the vcaches. float temp; - for (int x = 0; x < bsg_global_X; x++) + for (int x = 0; x < 2*bsg_global_X; x++) { - // bot vcache - bsg_global_float_load(x,bsg_global_Y+1,0,temp); + temp = dram_ptr[VCACHE_BLOCK_SIZE_IN_WORDS*x]; vcache_load_val[x] = temp; - // top vcache - bsg_global_float_load(x,0,0,temp); - vcache_load_val[bsg_global_X+x] = temp; } // validate diff --git a/software/spmd/saif/Makefile b/software/spmd/saif/Makefile new file mode 100644 index 000000000..715f7f171 --- /dev/null +++ b/software/spmd/saif/Makefile @@ -0,0 +1,19 @@ +bsg_tiles_X = 1 +bsg_tiles_Y = 1 + + +all: main.run + + +OBJECT_FILES=main.o + +include ../Makefile.include + + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. 
"-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + + +main.o: Makefile + +include ../../mk/Makefile.tail_rules diff --git a/software/spmd/saif/main.c b/software/spmd/saif/main.c new file mode 100644 index 000000000..3f79463a1 --- /dev/null +++ b/software/spmd/saif/main.c @@ -0,0 +1,65 @@ +/** + * main.c + * + * saif + * + * Tests that bsg_saif_start() and bsg_saif_end() work correctly + * calculates the sum of first N fibonacci sequence recursively. + * + * fib[0] = 0 + * fib[1] = 1 + * fib[n] = fib[n-1] + fib[n-2] + * + */ + +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" + +#define N 15 +#define ANSWER 986 + +int my_fib[N]; + +int fib(int n) +{ + if (n == 0) + { + return 0; + } + else if (n == 1) + { + return 1; + } + else + { + return fib(n-1) + fib(n-2); + } +} + +int main() +{ + + bsg_set_tile_x_y(); + int sum; + sum = 0; + + if ((__bsg_x == 0) && (__bsg_y == 0)) { + bsg_saif_start(); + for (int i = 0; i < N; i++) + { + my_fib[i] = fib(i); + bsg_printf("fib[%d] = %d\r\n", i, my_fib[i]); + sum += my_fib[i]; + } + + bsg_saif_end(); + if (sum == ANSWER) + bsg_finish(); + else + bsg_fail(); + } + + bsg_wait_while(1); + +} + diff --git a/software/spmd/stall_force_wb_bug/Makefile b/software/spmd/stall_force_wb_bug/Makefile index 36d483f75..7de3945c6 100644 --- a/software/spmd/stall_force_wb_bug/Makefile +++ b/software/spmd/stall_force_wb_bug/Makefile @@ -1,9 +1,9 @@ bsg_tiles_X= 1 bsg_tiles_Y= 1 bsg_tiles_org_X =0 -bsg_tiles_org_Y =2 +bsg_tiles_org_Y =0 -export BSG_MACHINE_PATH = $(BSG_MANYCORE_DIR)/machines/8x4 +#export BSG_MACHINE_PATH = $(BSG_MANYCORE_DIR)/machines/8x4 all: main.run diff --git a/software/spmd/stall_force_wb_bug/main.S b/software/spmd/stall_force_wb_bug/main.S index c68d917fd..0905a3c18 100644 --- a/software/spmd/stall_force_wb_bug/main.S +++ b/software/spmd/stall_force_wb_bug/main.S @@ -4,11 +4,11 @@ .text li x1, 0 li x2, 1 - li x3, 0x00001000 + li x3, 0x00000000 sw x1, 0(x3) sw x2, 4(x3) - li x3, 0x20001000 /* loading from the tile 
itself using remote load */ - li x6, 0x230c1000 /* loading from the distant tile which takes some cycles */ + li x3, bsg_asm_remote_ptr(0,0,0) /* loading from the tile itself using remote load */ + li x6, bsg_asm_remote_ptr(3,3,0) /* loading from the distant tile which takes some cycles */ add x0, x0, x0 add x0, x0, x0 add x0, x0, x0 diff --git a/software/spmd/store_bug/hello.c b/software/spmd/store_bug/hello.c index 2f5b5d1a7..01ff4d976 100644 --- a/software/spmd/store_bug/hello.c +++ b/software/spmd/store_bug/hello.c @@ -10,37 +10,37 @@ int main() { bsg_set_tile_x_y(); - bsg_remote_ptr_io_store(0,0,0); + bsg_remote_ptr_io_store(IO_X_INDEX,0,0); foo[0] = 1; // set foo[0] to 1 bsg_wait_while(foo[0]!=1); - bsg_remote_ptr_io_store(0,0,1); + bsg_remote_ptr_io_store(IO_X_INDEX,0,1); cp[0] = 2; bsg_wait_while(foo[0]!=2); - bsg_remote_ptr_io_store(0,0,2); + bsg_remote_ptr_io_store(IO_X_INDEX,0,2); cp[1] = 3; bsg_wait_while(foo[0]!=0x302); - bsg_remote_ptr_io_store(0,0,3); + bsg_remote_ptr_io_store(IO_X_INDEX,0,3); cp[2] = 4; bsg_wait_while(foo[0]!=0x040302); - bsg_remote_ptr_io_store(0,0,4); + bsg_remote_ptr_io_store(IO_X_INDEX,0,4); cp[3] = 5; bsg_wait_while(foo[0]!=0x05040302); - bsg_remote_ptr_io_store(0,0,5); + bsg_remote_ptr_io_store(IO_X_INDEX,0,5); sp[1] = 0x9080; bsg_wait_while(foo[0]!=0x90800302); - bsg_remote_ptr_io_store(0,0,6); + bsg_remote_ptr_io_store(IO_X_INDEX,0,6); bsg_finish(); } diff --git a/software/spmd/test_global_pod_ptr/Makefile b/software/spmd/test_global_pod_ptr/Makefile new file mode 100644 index 000000000..8699f8a18 --- /dev/null +++ b/software/spmd/test_global_pod_ptr/Makefile @@ -0,0 +1,19 @@ +override bsg_pods_X = 1 +override bsg_pods_Y = 1 +bsg_tiles_X = 1 +bsg_tiles_Y = 1 + + +all: main.run + +OBJECT_FILES=main.o + +include ../Makefile.include + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. 
"-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + + +main.o: Makefile + +include ../../mk/Makefile.tail_rules diff --git a/software/spmd/test_global_pod_ptr/main.c b/software/spmd/test_global_pod_ptr/main.c new file mode 100644 index 000000000..3adaa0bda --- /dev/null +++ b/software/spmd/test_global_pod_ptr/main.c @@ -0,0 +1,59 @@ +// store and load from every tile in the entire pod array using bsg_global_pod_ptr; + + +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" + + +volatile int data; + + +void test_pod(int px, int py) +{ + int load_val[bsg_global_X*bsg_global_Y]; + int store_val[bsg_global_X*bsg_global_Y]; + + // store + for (int x = 0; x < bsg_global_X; x++) { + for (int y = 0; y < bsg_global_Y; y++) { + int id = x + (y*bsg_global_X); + store_val[id] = (px+(py*3)+x)-(y*2); // some hash val + bsg_global_pod_store(px,py,x,y,&data,store_val[id]); + } + } + + // load + for (int x = 0; x < bsg_global_X; x++) { + for (int y = 0; y < bsg_global_Y; y++) { + int id = x + (y*bsg_global_X); + bsg_global_pod_load(px,py,x,y,&data,load_val[id]); + } + } + + // validate + for (int i = 0; i < bsg_global_X*bsg_global_Y; i++) { + if (store_val[i] != load_val[i]) { + bsg_fail(); + bsg_wait_while(1); + } + } +} + + + +int main() +{ + bsg_set_tile_x_y(); + + + for (int px = 0; px < bsg_pods_X; px++) { + for (int py = 0; py < bsg_pods_X; py++) { + test_pod(px,py); + } + } + + + bsg_finish(); + bsg_wait_while(1); +} + diff --git a/software/spmd/vcache_atomic_histogram/main.c b/software/spmd/vcache_atomic_histogram/main.c index 4429940f2..60a5e8b74 100644 --- a/software/spmd/vcache_atomic_histogram/main.c +++ b/software/spmd/vcache_atomic_histogram/main.c @@ -63,8 +63,15 @@ void do_histogram_work() // get the lock int lock_val = 1; + volatile int counter = 0; do { + // count to 100 before sending amoswap to prevent severe network congestion/lock contention. 
+ counter = 0; + for (int j = 0; j < 100; j++) { + counter++; + } + lock_val = bsg_amoswap_aq(&lock, 1); } while (lock_val != 0); diff --git a/software/spmd/vcache_atomic_inc/main.c b/software/spmd/vcache_atomic_inc/main.c index dcf0dcd09..478b6f707 100644 --- a/software/spmd/vcache_atomic_inc/main.c +++ b/software/spmd/vcache_atomic_inc/main.c @@ -13,14 +13,14 @@ void atomic_inc() do { lock_val = bsg_amoswap_aq(&lock, 1); } while (lock_val != 0); - bsg_printf("I got the lock! x=%d y=%d\n", __bsg_x, __bsg_y); + //bsg_printf("I got the lock! x=%d y=%d\n", __bsg_x, __bsg_y); // critical region int local_data = data; data = local_data+1; bsg_printf("%d\n",local_data); - bsg_printf("I'm releasing the lock... x=%d y=%d\n", __bsg_x, __bsg_y); + //bsg_printf("I'm releasing the lock... x=%d y=%d\n", __bsg_x, __bsg_y); // release bsg_amoswap_rl(&lock, 0); diff --git a/software/spmd/vcache_atomics/main.c b/software/spmd/vcache_atomics/main.c index 7f74826db..ce4ab1804 100644 --- a/software/spmd/vcache_atomics/main.c +++ b/software/spmd/vcache_atomics/main.c @@ -4,6 +4,7 @@ int lock __attribute__ ((section (".dram"))) = {0}; int lock2 __attribute__ ((section (".dram"))) = {0}; +int lock3 __attribute__ ((section (".dram"))) = {0}; int main() { @@ -48,6 +49,24 @@ int main() if (result2 != 15) bsg_fail(); bsg_printf("%d\n", result2); + + lock3= 10; + int result3 = bsg_amoadd(&lock3,10); + if (result3 != 10) bsg_fail(); + bsg_printf("%d\n", result3); + + result3 = bsg_amoadd_aq(&lock3,2); + if (result3 != 20) bsg_fail(); + bsg_printf("%d\n", result3); + + result3 = bsg_amoadd_rl(&lock3,18); + if (result3 != 22) bsg_fail(); + bsg_printf("%d\n", result3); + + result3 = bsg_amoadd_aqrl(&lock3,40); + if (result3 != 40) bsg_fail(); + bsg_printf("%d\n", result3); + bsg_finish(); } diff --git a/software/spmd/write_bandwidth_test/Makefile b/software/spmd/write_bandwidth_test/Makefile new file mode 100644 index 000000000..ea88c26cb --- /dev/null +++ 
b/software/spmd/write_bandwidth_test/Makefile @@ -0,0 +1,21 @@ +MAX_CYCLES=100000000 +NUM_FINISH=32 # each tile sends a finish packet. + +bsg_tiles_org_X ?=0 +bsg_tiles_org_Y ?=3 +bsg_tiles_X=16 +bsg_tiles_Y=2 + +all: main.run + +OBJECT_FILES=main.o + +include ../Makefile.include + +main.riscv: $(LINK_SCRIPT) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) $(BSG_MANYCORE_LIB) crt.o + $(RISCV_LINK) $(OBJECT_FILES) $(SPMD_COMMON_OBJECTS) -L. "-l:$(BSG_MANYCORE_LIB)" -o $@ $(RISCV_LINK_OPTS) + + +main.o: Makefile + +include ../../mk/Makefile.tail_rules diff --git a/software/spmd/write_bandwidth_test/main.c b/software/spmd/write_bandwidth_test/main.c new file mode 100644 index 000000000..97c236613 --- /dev/null +++ b/software/spmd/write_bandwidth_test/main.c @@ -0,0 +1,39 @@ +// write bandwidth test + + +#include "bsg_manycore.h" +#include "bsg_set_tile_x_y.h" + +// number of tiles used +#define N (bsg_tiles_X*bsg_tiles_Y) +// number of cache lines fetched by each tile +#define M (16*65536/bsg_tiles_X/bsg_tiles_Y/VCACHE_BLOCK_SIZE_WORDS) + +// Tiles write to this array. +int data[1] __attribute__ ((section (".dram"))) = {0}; + + +int main() +{ + + // set tiles + bsg_set_tile_x_y(); + + + // Everyone writes zero + int * data_ptr = &data[__bsg_id * VCACHE_BLOCK_SIZE_WORDS]; + int stride = N*VCACHE_BLOCK_SIZE_WORDS; + int i = 0; + while (i < M) { + asm volatile ("sw x0, 0(%[data_ptr])" : : [data_ptr] "r" (data_ptr)); + data_ptr = data_ptr + stride; + i++; + } + + + bsg_fence(); + bsg_finish(); + + bsg_wait_while(1); +} + diff --git a/testbenches/common/v/bsg_manycore_mem_cfg_pkg.v b/testbenches/common/v/bsg_manycore_mem_cfg_pkg.v index cc721243d..8c2dff5d4 100644 --- a/testbenches/common/v/bsg_manycore_mem_cfg_pkg.v +++ b/testbenches/common/v/bsg_manycore_mem_cfg_pkg.v @@ -38,34 +38,40 @@ package bsg_manycore_mem_cfg_pkg; typedef enum bit [lg_max_cfgs-1:0] { + + e_vcache_test_mem + + ,e_vcache_hbm2 + + // LEVEL 1) zero-latency, infinite capacity block mem. 
// (uses associative array) - e_infinite_mem + //e_infinite_mem // LEVEL 1) bsg_manycore_vcache (blocking) // LEVEL 2) bsg_cache_to_axi // LEVEL 3) bsg_nonsynth_manycore_axi_mem - , e_vcache_blocking_axi4_nonsynth_mem + //, e_vcache_blocking_axi4_nonsynth_mem // LEVEL 1) bsg_manycore_vcache (non-blocking) // LEVEL 2) bsg_cache_to_axi // LEVEL 3) bsg_nonsynth_manycore_axi_mem - , e_vcache_non_blocking_axi4_nonsynth_mem + //, e_vcache_non_blocking_axi4_nonsynth_mem // LEVEL 1) bsg_manycore_vcache (blocking) // LEVEL 2) bsg_cache_to_dram_ctrl // LEVEL 3) bsg_dmc (lpddr) (512 MB) - , e_vcache_blocking_dmc_lpddr + //, e_vcache_blocking_dmc_lpddr // LEVEL 1) bsg_manycore_vcache (non-blocking) // LEVEL 2) bsg_cache_to_dram_ctrl // LEVEL 3) bsg_dmc (lpddr) (512 MB) - , e_vcache_non_blocking_dmc_lpddr + //, e_vcache_non_blocking_dmc_lpddr // LEVEL 1) bsg_manycore_vcache (blocking) // LEVEL 2) bsg_cache_to_test_dram // LEVEL 3) bsg_nonsynth_dramsim3 (HBM2) - , e_vcache_blocking_dramsim3_hbm2 + //, e_vcache_blocking_dramsim3_hbm2 // placeholder for max enum val , e_max_val diff --git a/testbenches/common/v/bsg_manycore_vcache_wh_to_cache_dma.v b/testbenches/common/v/bsg_manycore_vcache_wh_to_cache_dma.v new file mode 100644 index 000000000..13c69978f --- /dev/null +++ b/testbenches/common/v/bsg_manycore_vcache_wh_to_cache_dma.v @@ -0,0 +1,446 @@ +/** + * bsg_manycore_vcache_wh_to_cache_dma.v + * + * this converts vcache wh link to an array of cache dma interface to that it can be interfaced to + * bsg_cache_to_test_dram.v + * + * Intended to be used for simulation only. 
+ */ + + +`include "bsg_noc_links.vh" + + +module bsg_manycore_vcache_wh_to_cache_dma + import bsg_cache_pkg::*; + import bsg_manycore_pkg::*; + #(parameter wh_flit_width_p="inv" + , parameter wh_cid_width_p="inv" + , parameter wh_len_width_p="inv" + , parameter wh_cord_width_p="inv" + , parameter wh_ruche_factor_p="inv" + + , parameter num_vcaches_p="inv" + , parameter vcache_addr_width_p="inv" + , parameter vcache_data_width_p="inv" + , parameter vcache_dma_data_width_p="inv" + , parameter vcache_block_size_in_words_p="inv" + + + , parameter num_pods_x_p ="inv" + , parameter pod_start_x_p = 0 + , parameter num_tiles_x_p = "inv" + , parameter lg_num_tiles_x_lp=`BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter lg_num_pods_x_lp = `BSG_SAFE_CLOG2(num_pods_x_p) + + , parameter no_concentration_p = 0 + + // FIFO parameters + , parameter in_fifo_els_p = 8 + + , parameter lg_num_vcaches_lp=`BSG_SAFE_CLOG2(num_vcaches_p) + , parameter data_len_lp=(vcache_data_width_p*vcache_block_size_in_words_p/vcache_dma_data_width_p) + , parameter count_width_lp = `BSG_SAFE_CLOG2(data_len_lp) + + , parameter wh_link_sif_width_lp=`bsg_ready_and_link_sif_width(wh_flit_width_p) + , parameter dma_pkt_width_lp=`bsg_cache_dma_pkt_width(vcache_addr_width_p) + + , parameter lg_wh_ruche_factor_lp = `BSG_SAFE_CLOG2(wh_ruche_factor_p) + ) + ( + input clk_i + , input reset_i + + + // wormhole link + , input [wh_link_sif_width_lp-1:0] wh_link_sif_i + , output [wh_link_sif_width_lp-1:0] wh_link_sif_o + + + // cache DMA + , output logic [num_vcaches_p-1:0][dma_pkt_width_lp-1:0] dma_pkt_o + , output logic [num_vcaches_p-1:0] dma_pkt_v_o + , input [num_vcaches_p-1:0] dma_pkt_yumi_i + + , input [num_vcaches_p-1:0][vcache_dma_data_width_p-1:0] dma_data_i + , input [num_vcaches_p-1:0] dma_data_v_i + , output logic [num_vcaches_p-1:0] dma_data_ready_o + + , output logic [num_vcaches_p-1:0][vcache_dma_data_width_p-1:0] dma_data_o + , output logic [num_vcaches_p-1:0] dma_data_v_o + , input [num_vcaches_p-1:0] 
dma_data_yumi_i + ); + + + // structs + `declare_bsg_ready_and_link_sif_s(wh_flit_width_p,wh_link_sif_s); + `declare_bsg_cache_dma_pkt_s(vcache_addr_width_p); + `declare_bsg_manycore_vcache_wh_header_flit_s(wh_flit_width_p,wh_cord_width_p,wh_len_width_p,wh_cid_width_p); + + + // cast wormhole links + wh_link_sif_s wh_link_sif_in; + wh_link_sif_s wh_link_sif_out; + assign wh_link_sif_in = wh_link_sif_i; + assign wh_link_sif_o = wh_link_sif_out; + + + // Buffer incoming flits. + logic [wh_flit_width_p-1:0] in_fifo_data_lo; + logic in_fifo_yumi_li; + logic in_fifo_v_lo; + + bsg_fifo_1r1w_small #( + .els_p(in_fifo_els_p) + ,.width_p(wh_flit_width_p) + ) in_fifo ( + .clk_i (clk_i) + ,.reset_i (reset_i) + + ,.v_i (wh_link_sif_in.v) + ,.data_i (wh_link_sif_in.data) + ,.ready_o (wh_link_sif_out.ready_and_rev) + + ,.v_o (in_fifo_v_lo) + ,.data_o (in_fifo_data_lo) + ,.yumi_i (in_fifo_yumi_li) + ); + + + // DMA pkt going out + bsg_cache_dma_pkt_s dma_pkt_out; + for (genvar i = 0; i < num_vcaches_p; i++) begin + assign dma_pkt_o[i] = dma_pkt_out; + end + + + // header flits coming in and going out + bsg_manycore_vcache_wh_header_flit_s header_flit_in, header_flit_out; + assign header_flit_in = in_fifo_data_lo; + + + // cid, src_cord table + logic [num_vcaches_p-1:0][wh_cid_width_p-1:0] cid_r; + logic [num_vcaches_p-1:0][wh_cord_width_p-1:0] src_cord_r; + logic [wh_cid_width_p-1:0] cid_n; + logic [wh_cord_width_p-1:0] src_cord_n; + logic [lg_num_vcaches_lp-1:0] table_w_addr; + logic table_we; + + always_ff @ (posedge clk_i) begin + if (reset_i) begin + cid_r <= '0; + src_cord_r <= '0; + end + else begin + if (table_we) begin + cid_r[table_w_addr] <= cid_n; + src_cord_r[table_w_addr] <= src_cord_n; + end + end + end + + + + // send FSM + // receives wh packets and cache dma pkts. 
+ typedef enum logic [1:0] { + SEND_RESET, + SEND_READY, + SEND_DMA_PKT, + SEND_EVICT_DATA + } send_state_e; + + send_state_e send_state_r, send_state_n; + logic write_not_read_r, write_not_read_n; + logic [lg_num_vcaches_lp-1:0] send_cache_id_r, send_cache_id_n; + + logic send_clear_li; + logic send_up_li; + logic [count_width_lp-1:0] send_count_lo; + bsg_counter_clear_up #( + .max_val_p(data_len_lp-1) + ,.init_val_p(0) + ) send_count ( + .clk_i(clk_i) + ,.reset_i(reset_i) + ,.clear_i(send_clear_li) + ,.up_i(send_up_li) + ,.count_o(send_count_lo) + ); + + wire [lg_num_vcaches_lp-1:0] send_cache_id; + + if (no_concentration_p) begin + assign send_cache_id = header_flit_in.src_cord[lg_wh_ruche_factor_lp+:lg_num_vcaches_lp]; + + end + else begin + if (num_pods_x_p == 1) begin + // For pod 1x1, there are 1 HBM on each side of west and east. + // Left half of top and bottom vcaches (16 total) maps to ch0 of HBM2 on west. + // Right half of top and bottom vcaches (16 total) maps to ch0 of HBM2 on east. + assign send_cache_id = { + (1)'(header_flit_in.cid/wh_ruche_factor_p), + header_flit_in.src_cord[lg_num_tiles_x_lp-2:0] + }; + end + else begin + // The left half of the pod array maps to HBM2 on the left side, and the right half on the right. + // HBM2 channels are allocated to pods starting from the top left corner. + // Within a pod, a row of vcaches (16) is allocated to a channel, so that there is one-to-one mapping from + // vcache to HBM2 bank. 
+ // + // + // For pod 4x4 + // + // [dev0-ch0] [dev0-ch2] [dev2-ch0] [dev2-ch2] + // [ m c ] [ mc ] [ m c ] [ mc ] + // [dev0-ch1] [dev0-ch3] [dev2-ch1] [dev2-ch3] + // + // [dev0-ch4] [dev0-ch6] [dev2-ch4] [dev2-ch6] + // [ m c ] [ mc ] [ m c ] [ mc ] + // [dev0-ch5] [dev0-ch7] [dev2-ch5] [dev2-ch7] + // + // [dev1-ch0] [dev1-ch2] [dev3-ch0] [dev3-ch2] + // [ m c ] [ mc ] [ m c ] [ mc ] + // [dev1-ch1] [dev0-ch3] [dev3-ch1] [dev3-ch3] + // + // [dev1-ch4] [dev1-ch6] [dev3-ch4] [dev3-ch6] + // [ m c ] [ mc ] [ m c ] [ mc ] + // [dev1-ch5] [dev1-ch7] [dev3-ch5] [dev3-ch7] + // + assign send_cache_id = { + (lg_num_pods_x_lp-1)'((header_flit_in.src_cord[wh_cord_width_p-1:lg_num_tiles_x_lp] - pod_start_x_p)%(num_pods_x_p/2)), + (1)'(header_flit_in.cid/wh_ruche_factor_p), + header_flit_in.src_cord[lg_num_tiles_x_lp-1:0] + }; + end + end + + + + always_comb begin + send_state_n = send_state_r; + write_not_read_n = write_not_read_r; + send_cache_id_n = send_cache_id_r; + table_we = 1'b0; + table_w_addr = '0; + src_cord_n = '0; + cid_n = '0; + + in_fifo_yumi_li = 1'b0; + + send_clear_li = 1'b0; + send_up_li = 1'b0; + dma_pkt_v_o = '0; + dma_pkt_out = '0; + + dma_data_v_o = '0; + dma_data_o = '0; + + case (send_state_r) + // coming out of reset + SEND_RESET: begin + send_state_n = SEND_READY; + end + + // wait for a header flit. + // store the write_not_read, src_cord. + // save the cid in a table. + SEND_READY: begin + if (in_fifo_v_lo) begin + in_fifo_yumi_li = 1'b1; + write_not_read_n = header_flit_in.write_not_read; + src_cord_n = header_flit_in.src_cord; + cid_n = header_flit_in.cid; + table_w_addr = send_cache_id; + table_we = 1'b1; + send_cache_id_n = send_cache_id; + send_state_n = SEND_DMA_PKT; + end + end + + // take the addr flit and send out the dma pkt. + // For read, return to SEND_READY. + // For write, move to SEND_EVICT_DATA to pass the evict data. 
+ SEND_DMA_PKT: begin + dma_pkt_v_o[send_cache_id_r] = in_fifo_v_lo; + dma_pkt_out.write_not_read = write_not_read_r; + dma_pkt_out.addr = vcache_addr_width_p'(in_fifo_data_lo); + + in_fifo_yumi_li = dma_pkt_yumi_i[send_cache_id_r]; + send_state_n = dma_pkt_yumi_i[send_cache_id_r] + ? (write_not_read_r ? SEND_EVICT_DATA : SEND_READY) + : SEND_DMA_PKT; + end + + // once all evict data has been passed along return to SEND_READY + SEND_EVICT_DATA: begin + dma_data_v_o[send_cache_id_r] = in_fifo_v_lo; + dma_data_o[send_cache_id_r] = in_fifo_data_lo; + if (dma_data_yumi_i[send_cache_id_r]) begin + in_fifo_yumi_li = 1'b1; + send_up_li = send_count_lo != data_len_lp-1; + send_clear_li = send_count_lo == data_len_lp-1; + send_state_n = (send_count_lo == data_len_lp-1) + ? SEND_READY + : SEND_EVICT_DATA; + end + end + + endcase + + end + + + + always_ff @ (posedge clk_i) begin + if (reset_i) begin + send_state_r <= SEND_RESET; + write_not_read_r <= 1'b0; + send_cache_id_r <= '0; + end + else begin + send_state_r <= send_state_n; + write_not_read_r <= write_not_read_n; + send_cache_id_r <= send_cache_id_n; + end + end + + + + // receiver FSM + // receives dma_data_i and send them to the vcaches using wh link. 
+ typedef enum logic [1:0] { + RECV_RESET, + RECV_READY, + RECV_HEADER, + RECV_FILL_DATA + } recv_state_e; + + recv_state_e recv_state_r, recv_state_n; + logic [lg_num_vcaches_lp-1:0] recv_cache_id_r, recv_cache_id_n; + + + logic rr_v_lo; + logic rr_yumi_li; + logic [lg_num_vcaches_lp-1:0] rr_addr_lo; + logic [num_vcaches_p-1:0] rr_grants_lo; + bsg_arb_round_robin #( + .width_p(num_vcaches_p) + ) rr0 ( + .clk_i(clk_i) + ,.reset_i(reset_i) + + ,.reqs_i(dma_data_v_i) + ,.grants_o(rr_grants_lo) + ,.yumi_i(rr_yumi_li) + ); + assign rr_v_lo = |dma_data_v_i; + bsg_encode_one_hot #( + .width_p(num_vcaches_p) + ) eoh ( + .i(rr_grants_lo) + ,.addr_o(rr_addr_lo) + ,.v_o() + ); + + + logic recv_clear_li; + logic recv_up_li; + logic [count_width_lp-1:0] recv_count_lo; + bsg_counter_clear_up #( + .max_val_p(data_len_lp-1) + ,.init_val_p(0) + ) recv_count ( + .clk_i(clk_i) + ,.reset_i(reset_i) + ,.clear_i(recv_clear_li) + ,.up_i(recv_up_li) + ,.count_o(recv_count_lo) + ); + + + + + always_comb begin + + wh_link_sif_out.v = 1'b0; + wh_link_sif_out.data = '0; + + rr_yumi_li = 1'b0; + + recv_state_n = recv_state_r; + recv_cache_id_n = recv_cache_id_r; + + recv_clear_li = 1'b0; + recv_up_li = 1'b0; + + header_flit_out.unused = '0; + header_flit_out.write_not_read = 1'b0; // dont matter + header_flit_out.src_cord = '0; // dont matter + header_flit_out.cid = cid_r[recv_cache_id_r]; + header_flit_out.len = data_len_lp; + header_flit_out.dest_cord = src_cord_r[recv_cache_id_r]; + + dma_data_ready_o = '0; + + case (recv_state_r) + + // coming out of reset + RECV_RESET: begin + recv_state_n = RECV_READY; + end + + // wait for one of dma_data_v_i to be 1. + // save the cache id. 
+ RECV_READY: begin + if (rr_v_lo) begin + rr_yumi_li = 1'b1; + recv_cache_id_n = rr_addr_lo; + recv_state_n = RECV_HEADER; + end + end + + // send out header to dest vcache + RECV_HEADER: begin + wh_link_sif_out.v = 1'b1; + wh_link_sif_out.data = header_flit_out; + if (wh_link_sif_in.ready_and_rev) begin + recv_state_n = RECV_FILL_DATA; + end + end + + // send the data flits to the vcache. + // once it's done, go back to RECV_READY. + RECV_FILL_DATA: begin + wh_link_sif_out.v = dma_data_v_i[recv_cache_id_r]; + wh_link_sif_out.data = dma_data_i[recv_cache_id_r]; + dma_data_ready_o[recv_cache_id_r] = wh_link_sif_in.ready_and_rev; + if (dma_data_v_i[recv_cache_id_r] & wh_link_sif_in.ready_and_rev) begin + recv_clear_li = (recv_count_lo == data_len_lp-1); + recv_up_li = (recv_count_lo != data_len_lp-1); + recv_state_n = (recv_count_lo == data_len_lp-1) + ? RECV_READY + : RECV_FILL_DATA; + end + end + + endcase + end + + + always_ff @ (posedge clk_i) begin + if (reset_i) begin + recv_state_r <= RECV_RESET; + recv_cache_id_r <= '0; + end + else begin + recv_state_r <= recv_state_n; + recv_cache_id_r <= recv_cache_id_n; + end + end + + +endmodule diff --git a/testbenches/common/v/bsg_nonsynth_manycore_io_complex.v b/testbenches/common/v/bsg_nonsynth_manycore_io_complex.v index 5160977e7..f2fb32175 100644 --- a/testbenches/common/v/bsg_nonsynth_manycore_io_complex.v +++ b/testbenches/common/v/bsg_nonsynth_manycore_io_complex.v @@ -13,9 +13,6 @@ module bsg_nonsynth_manycore_io_complex , parameter x_cord_width_p="inv" , parameter y_cord_width_p="inv" - , parameter num_tiles_x_p="inv" - , parameter num_tiles_y_p="inv" - , parameter io_x_cord_p=0 , parameter io_y_cord_p=1 @@ -43,9 +40,8 @@ module bsg_nonsynth_manycore_io_complex , output logic [data_width_p-1:0] print_stat_tag_o ); - initial begin - $display("## creating manycore io complex num_tiles."); - end + + // endpoint standard // @@ -112,9 +108,12 @@ module bsg_nonsynth_manycore_io_complex 
,.returned_yumi_i(returned_v_r_lo) // misc + ,.returned_credit_v_r_o() + ,.returned_credit_reg_id_r_o() ,.out_credits_o(out_credits_lo) - ,.my_x_i((x_cord_width_p)'(io_x_cord_p)) - ,.my_y_i((y_cord_width_p)'(io_y_cord_p)) + + ,.global_x_i((x_cord_width_p)'(io_x_cord_p)) + ,.global_y_i((y_cord_width_p)'(io_y_cord_p)) ); // monitor diff --git a/testbenches/common/v/bsg_nonsynth_manycore_monitor.v b/testbenches/common/v/bsg_nonsynth_manycore_monitor.v index 248a2a145..cc465f60c 100644 --- a/testbenches/common/v/bsg_nonsynth_manycore_monitor.v +++ b/testbenches/common/v/bsg_nonsynth_manycore_monitor.v @@ -43,13 +43,30 @@ module bsg_nonsynth_manycore_monitor int status; int max_cycle; + int num_finish; // Number of finish packets needs to be received to end the simulation. + // By default, number of pods running the SPMD program. Each pod sends one finish packet. + // However, you can set a different number, depending on the nature of the spmd program. + // For example, you can require a finish packet from each tile in 4x4 tile-group spmd program. + // In that case, you would set num_finish to 16. this helps with not requiring barrier to synchronize task completion of all tiles. initial begin status = $value$plusargs("max_cycle=%d", max_cycle); + status = $value$plusargs("num_finish=%d", num_finish); if (max_cycle == 0) begin max_cycle = 1000000; // default end end + // keep track of number of finish packets received. 
+ integer finish_count; + always_ff @ (negedge clk_i) begin + if (~reset_i) begin + if (finish_count == num_finish) begin + $display("[INFO][MONITOR] RECEIVED BSG_FINISH PACKET from all pods, time=%0t", $time); + $finish; + end + end + end + // cycle counter // logic [39:0] cycle_count; @@ -154,13 +171,16 @@ module bsg_nonsynth_manycore_monitor always_ff @ (negedge clk_i) begin - if (~reset_i) begin + if (reset_i) begin + finish_count <= 0; + end + else begin if (v_i & we_i) begin if (~addr_i[addr_width_p-1]) begin if (epa_addr == bsg_finish_epa_gp) begin - $display("[INFO][MONITOR] RECEIVED BSG_FINISH PACKET from tile y,x=%2d,%2d, data=%x, time=%0t", + $display("[INFO][MONITOR] RECEIVED a finish packet from tile y,x=%2d,%2d, data=%x, time=%0t", src_y_cord_i, src_x_cord_i, data_i, $time); - $finish; + finish_count <= finish_count + 1; end else if (epa_addr == bsg_time_epa_gp) begin $display("[INFO][MONITOR] RECEIVED TIME BSG_PACKET from tile y,x=%2d,%2d, data=%x, time=%0t", diff --git a/testbenches/common/v/bsg_nonsynth_manycore_spmd_loader.v b/testbenches/common/v/bsg_nonsynth_manycore_spmd_loader.v index 3406c75f7..eea60c031 100644 --- a/testbenches/common/v/bsg_nonsynth_manycore_spmd_loader.v +++ b/testbenches/common/v/bsg_nonsynth_manycore_spmd_loader.v @@ -20,6 +20,7 @@ module bsg_nonsynth_manycore_spmd_loader , parameter max_out_credits_p=200 , parameter credit_counter_width_lp=`BSG_SAFE_CLOG2(max_out_credits_p+1) + , parameter verbose_p = 0 ) ( input clk_i @@ -62,14 +63,14 @@ module bsg_nonsynth_manycore_spmd_loader assign curr_nbf = nbf[nbf_addr_r]; assign packet.addr = curr_nbf.epa[0+:addr_width_p]; - assign packet.op = e_remote_store; - assign packet.op_ex = 4'b1111; + assign packet.op_v2 = e_remote_store; assign packet.payload = curr_nbf.data; assign packet.src_y_cord = my_y_i; assign packet.src_x_cord = my_x_i; assign packet.y_cord = curr_nbf.y_cord[0+:y_cord_width_p]; assign packet.x_cord = curr_nbf.x_cord[0+:x_cord_width_p]; - assign packet.reg_id = 
'0; + assign packet.reg_id.store_mask_s.mask = '1; + assign packet.reg_id.store_mask_s.unused = 1'b0; integer status; string nbf_file; @@ -123,7 +124,7 @@ module bsg_nonsynth_manycore_spmd_loader if (loader_done) $display("[BSG_INFO][SPMD_LOADER] SPMD loader finished loading. t=%0t", $time); - if (v_o & ready_i) + if (v_o & ready_i & verbose_p) $display("[BSG_INFO][SPMD_LOADER] sending packet #%0d. x,y=%0d,%0d, addr=%x, data=%x, t=%0t", nbf_addr_r, packet.x_cord, packet.y_cord, diff --git a/testbenches/common/v/bsg_nonsynth_manycore_tag_master.v b/testbenches/common/v/bsg_nonsynth_manycore_tag_master.v new file mode 100644 index 000000000..b7fbc59c5 --- /dev/null +++ b/testbenches/common/v/bsg_nonsynth_manycore_tag_master.v @@ -0,0 +1,90 @@ +/** + * bsg_nonsynth_manycore_tag_master.v + * + */ + + + +module bsg_nonsynth_manycore_tag_master + import bsg_tag_pkg::*; + import bsg_noc_pkg::*; + #(parameter num_pods_x_p="inv" + , parameter num_pods_y_p="inv" + + , parameter wh_cord_width_p="inv" + ) + ( + input clk_i + , input reset_i + + // done signal for peripherals + , output logic tag_done_o + , output bsg_tag_s [num_pods_y_p-1:0][num_pods_x_p-1:0] pod_tags_o + ); + + // one tag client per pods + localparam num_clients_lp = (num_pods_y_p*num_pods_x_p); + localparam rom_addr_width_lp = 12; + localparam payload_width_lp = 1; // {reset} + localparam lg_payload_width_lp = `BSG_WIDTH(payload_width_lp); // number of bits used to represent the payload width + localparam max_payload_width_lp = (1<>3)-1:0] byte_offset; + } dram_ch_addr_s; + + dram_ch_addr_s [num_total_channels_lp-1:0] test_dram_ch_addr_lo; + logic [num_total_channels_lp-1:0][hbm2_channel_addr_width_p-1:0] test_dram_ch_addr_li; + + for (genvar i = 0; i < num_total_channels_lp; i++) begin + + bsg_cache_to_test_dram #( + .num_cache_p(num_vcaches_per_channel_p) + ,.addr_width_p(vcache_addr_width_p) + ,.data_width_p(vcache_data_width_p) + ,.block_size_in_words_p(vcache_block_size_in_words_p) + 
,.cache_bank_addr_width_p(cache_bank_addr_width_lp) + ,.dma_data_width_p(vcache_dma_data_width_p) + + ,.dram_channel_addr_width_p(hbm2_channel_addr_width_p) + ,.dram_data_width_p(hbm2_data_width_p) + ) cache_to_tram ( + .core_clk_i (clk_i) + ,.core_reset_i (reset_r) + + ,.dma_pkt_i (remapped_dma_pkt_lo[i]) + ,.dma_pkt_v_i (remapped_dma_pkt_v_lo[i]) + ,.dma_pkt_yumi_o (remapped_dma_pkt_yumi_li[i]) + + ,.dma_data_o (remapped_dma_data_li[i]) + ,.dma_data_v_o (remapped_dma_data_v_li[i]) + ,.dma_data_ready_i (remapped_dma_data_ready_lo[i]) + + ,.dma_data_i (remapped_dma_data_lo[i]) + ,.dma_data_v_i (remapped_dma_data_v_lo[i]) + ,.dma_data_yumi_o (remapped_dma_data_yumi_li[i]) + + + ,.dram_clk_i (clk_i) + ,.dram_reset_i (reset_r) + + ,.dram_req_v_o (dramsim3_v_li[i]) + ,.dram_write_not_read_o (dramsim3_write_not_read_li[i]) + ,.dram_ch_addr_o (test_dram_ch_addr_lo[i]) + ,.dram_req_yumi_i (dramsim3_yumi_lo[i]) + + ,.dram_data_v_o (dramsim3_data_v_li[i]) + ,.dram_data_o (dramsim3_data_li[i]) + ,.dram_data_yumi_i (dramsim3_data_yumi_lo[i]) + + ,.dram_data_v_i (dramsim3_data_v_lo[i]) + ,.dram_data_i (dramsim3_data_lo[i]) + ,.dram_ch_addr_i (test_dram_ch_addr_li[i]) + ); + + // manycore to dramsim3 address hashing + // dramsim3 uses ro-bg-ba-co-bo as address map, so we are changing the mapping here. + assign dramsim3_ch_addr_li[i] = { + test_dram_ch_addr_lo[i].ro, + test_dram_ch_addr_lo[i].bg, + test_dram_ch_addr_lo[i].ba, + test_dram_ch_addr_lo[i].co, + test_dram_ch_addr_lo[i].byte_offset + }; + + // dramsim3 to manycore address hashing + // address coming out of dramsim3 is also ro-bg-ba-co-bo, so we are changing it back to the format that cache dma uses. 
+ assign test_dram_ch_addr_li[i] = { + dramsim3_read_done_ch_addr_lo[i].ba, + dramsim3_read_done_ch_addr_lo[i].bg, + dramsim3_read_done_ch_addr_lo[i].ro, + dramsim3_read_done_ch_addr_lo[i].co, + dramsim3_read_done_ch_addr_lo[i].byte_offset + }; + end + end + + + + + + + + + + + + + + //// //// + //// TIE OFF //// + //// //// + + + // IO P tie off + for (genvar i = 1; i < num_pods_x_p*num_tiles_x_p; i++) begin + bsg_manycore_link_sif_tieoff #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ) io_p_tieoff ( + .clk_i(clk_i) + ,.reset_i(reset_r) + ,.link_sif_i(io_link_sif_lo[i][P]) + ,.link_sif_o(io_link_sif_li[i][P]) + ); + end + + // IO west end tieoff + bsg_manycore_link_sif_tieoff #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ) io_w_tieoff ( + .clk_i(clk_i) + ,.reset_i(reset_r) + ,.link_sif_i(io_link_sif_lo[0][W]) + ,.link_sif_o(io_link_sif_li[0][W]) + ); + + // IO east end tieoff + bsg_manycore_link_sif_tieoff #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ) io_e_tieoff ( + .clk_i(clk_i) + ,.reset_i(reset_r) + ,.link_sif_i(io_link_sif_lo[(num_pods_x_p*num_tiles_x_p)-1][E]) + ,.link_sif_o(io_link_sif_li[(num_pods_x_p*num_tiles_x_p)-1][E]) + ); + + + // SOUTH VER LINK TIE OFFS + for (genvar i = 0; i < num_pods_x_p*num_tiles_x_p; i++) begin + bsg_manycore_link_sif_tieoff #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ) ver_s_tieoff ( + .clk_i(clk_i) + ,.reset_i(reset_r) + ,.link_sif_i(ver_link_sif_lo[S][i]) + ,.link_sif_o(ver_link_sif_li[S][i]) + ); + end + + + // HOR TIEOFF (local link) + for (genvar i = W; i <= E; i++) begin + for (genvar j = 0; j < num_pods_y_p; j++) begin + for (genvar k = 0; k < num_tiles_y_p; k++) 
begin + bsg_manycore_link_sif_tieoff #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ) hor_tieoff ( + .clk_i(clk_i) + ,.reset_i(reset_r) + ,.link_sif_i(hor_link_sif_lo[i][j][k]) + ,.link_sif_o(hor_link_sif_li[i][j][k]) + ); + end + end + end + + + // RUCHE LINK TIEOFF (west) + for (genvar j = 0; j < num_pods_y_p; j++) begin + for (genvar k = 0; k < num_tiles_y_p; k++) begin + // hard coded for ruche factor 3 + assign ruche_link_li[W][j][k] = '0; + end + end + + // RUCHE LINK TIEOFF (east) + for (genvar j = 0; j < num_pods_y_p; j++) begin + for (genvar k = 0; k < num_tiles_y_p; k++) begin + // hard coded for ruche factor 3 + assign ruche_link_li[E][j][k] = '0; + end + end + + + + + + + + + +// // +// PROFILERS // +// // + +if (enable_vcore_profiling_p) begin + // vanilla core profiler + bind vanilla_core vanilla_core_profiler #( + .x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.icache_tag_width_p(icache_tag_width_p) + ,.icache_entries_p(icache_entries_p) + ,.data_width_p(data_width_p) + ,.origin_x_cord_p(`BSG_MACHINE_ORIGIN_X_CORD) + ,.origin_y_cord_p(`BSG_MACHINE_ORIGIN_Y_CORD) + ) vcore_prof ( + .* + ,.global_ctr_i($root.`HOST_MODULE_PATH.global_ctr) + ,.print_stat_v_i($root.`HOST_MODULE_PATH.print_stat_v) + ,.print_stat_tag_i($root.`HOST_MODULE_PATH.print_stat_tag) + ,.trace_en_i($root.`HOST_MODULE_PATH.trace_en) + ); + + bind network_tx remote_load_trace #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) + ,.num_tiles_x_p(num_tiles_x_p) + ,.num_tiles_y_p(num_tiles_y_p) + ,.origin_x_cord_p(`BSG_MACHINE_ORIGIN_X_CORD) + ,.origin_y_cord_p(`BSG_MACHINE_ORIGIN_Y_CORD) + ) rlt ( + .* + ,.global_ctr_i($root.`HOST_MODULE_PATH.global_ctr) + ,.trace_en_i($root.`HOST_MODULE_PATH.trace_en) + ); + 
+end + +if (enable_cache_profiling_p) begin + bind bsg_cache vcache_profiler #( + .data_width_p(data_width_p) + ,.addr_width_p(addr_width_p) + ,.header_print_p({`BSG_STRINGIFY(`HOST_MODULE_PATH),".testbench.DUT.py[0].px[0].pod.north_vc_x[0].north_vc_row.vc_y[0].vc_x[0].vc.cache.vcache_prof"}) + ,.ways_p(ways_p) + ) vcache_prof ( + // everything else + .* + // bsg_cache_miss + ,.chosen_way_n(miss.chosen_way_n) + // from testbench + ,.global_ctr_i($root.`HOST_MODULE_PATH.global_ctr) + ,.print_stat_v_i($root.`HOST_MODULE_PATH.print_stat_v) + ,.print_stat_tag_i($root.`HOST_MODULE_PATH.print_stat_tag) + ,.trace_en_i($root.`HOST_MODULE_PATH.trace_en) + ); +end + +if (enable_router_profiling_p) begin + bind bsg_mesh_router router_profiler #( + .x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.dims_p(dims_p) + ,.XY_order_p(XY_order_p) + ,.origin_x_cord_p(`BSG_MACHINE_ORIGIN_X_CORD) + ,.origin_y_cord_p(`BSG_MACHINE_ORIGIN_Y_CORD) + ) rp0 ( + .* + ,.global_ctr_i($root.`HOST_MODULE_PATH.global_ctr) + ,.trace_en_i($root.`HOST_MODULE_PATH.trace_en) + ,.print_stat_v_i($root.`HOST_MODULE_PATH.print_stat_v) + ); +end + + +endmodule diff --git a/testbenches/common/v/bsg_nonsynth_mem_infinite.v b/testbenches/common/v/bsg_nonsynth_mem_infinite.v index c5a1f44c1..29e35e0e3 100644 --- a/testbenches/common/v/bsg_nonsynth_mem_infinite.v +++ b/testbenches/common/v/bsg_nonsynth_mem_infinite.v @@ -114,6 +114,12 @@ module bsg_nonsynth_mem_infinite ,.load_data_o(load_data_lo) ); + wire [bsg_manycore_reg_id_width_gp-1:0] store_reg_id; + bsg_manycore_reg_id_decode pd0 ( + .data_i(packet_r.payload) + ,.mask_i(packet_r.reg_id.store_mask_s.mask) + ,.reg_id_o(store_reg_id) + ); // FSM // @@ -125,6 +131,11 @@ module bsg_nonsynth_mem_infinite state_e state_r, state_n; logic return_v_r, return_v_n; + wire is_amo = (packet_lo.op_v2 == e_remote_amoswap) + | (packet_lo.op_v2 == e_remote_amoor) + | (packet_lo.op_v2 == e_remote_amoadd); + + always_comb begin packet_yumi_li = 1'b0; @@ 
-146,12 +157,12 @@ module bsg_nonsynth_mem_infinite READY: begin - mem_w_li = (packet_lo.op == e_remote_store); + mem_w_li = packet_lo.op_v2 inside {e_remote_store, e_remote_sw}; mem_addr_li = packet_lo.addr[0+:mem_addr_width_lp]; mem_data_li = packet_lo.payload; - mem_mask_li = packet_lo.op_ex.store_mask; + mem_mask_li = packet_lo.op_v2 == e_remote_store ? packet_lo.reg_id.store_mask_s.mask : 4'hf; - if (packet_r.op == e_remote_store) begin + if (packet_r.op_v2 inside {e_remote_store, e_remote_sw}) begin return_packet_li.pkt_type = e_return_credit; end else begin @@ -163,12 +174,12 @@ module bsg_nonsynth_mem_infinite return_packet_li.pkt_type = e_return_int_wb; end - return_packet_li.data = (packet_r.op == e_remote_load) + return_packet_li.data = (packet_r.op_v2 == e_remote_load) ? load_data_lo : '0; - return_packet_li.reg_id = (packet_r.op == e_remote_load) + return_packet_li.reg_id = (packet_r.op_v2 inside {e_remote_load, e_remote_sw}) ? packet_r.reg_id - : '0; + : store_reg_id; return_packet_li.y_cord = packet_r.src_y_cord; return_packet_li.x_cord = packet_r.src_x_cord; @@ -181,9 +192,9 @@ module bsg_nonsynth_mem_infinite : packet_r; return_packet_v_li = 1'b1; return_v_n = return_packet_ready_lo - ? (packet_v_lo & (packet_lo.op != e_remote_amo)) + ? (packet_v_lo & ~is_amo) : return_v_r; - state_n = (return_packet_ready_lo & packet_v_lo & (packet_lo.op == e_remote_amo)) + state_n = (return_packet_ready_lo & packet_v_lo & is_amo) ? ATOMIC : READY; end @@ -194,8 +205,8 @@ module bsg_nonsynth_mem_infinite ? packet_lo : packet_r; return_packet_v_li = 1'b0; - return_v_n = packet_v_lo & (packet_lo.op != e_remote_amo); - state_n = (packet_v_lo & (packet_lo.op == e_remote_amo)) + return_v_n = packet_v_lo & ~is_amo; + state_n = (packet_v_lo & is_amo) ? 
ATOMIC : READY; end @@ -206,9 +217,10 @@ module bsg_nonsynth_mem_infinite mem_v_li = return_packet_ready_lo; mem_w_li = return_packet_ready_lo; mem_addr_li = packet_r.addr[0+:mem_addr_width_lp]; - case (packet_r.op_ex.amo_type) - e_amo_swap: mem_data_li = packet_r.payload; - e_amo_or: mem_data_li = packet_r.payload | mem_data_lo; + case (packet_r.op_v2) + e_remote_amoswap: mem_data_li = packet_r.payload; + e_remote_amoor: mem_data_li = packet_r.payload | mem_data_lo; + e_remote_amoadd: mem_data_li = packet_r.payload + mem_data_lo; default: mem_data_li = '0; // should never happen. endcase mem_mask_li = {data_mask_width_lp{1'b1}}; @@ -259,7 +271,7 @@ module bsg_nonsynth_mem_infinite if (~reset_i) begin if (packet_v_lo) begin - assert(packet_lo.op != e_cache_op) else $error("infinite mem does not support cache mgmt op."); + assert(packet_lo.op_v2 != e_cache_op) else $error("infinite mem does not support cache mgmt op."); end end diff --git a/testbenches/common/v/bsg_nonsynth_wormhole_test_mem.v b/testbenches/common/v/bsg_nonsynth_wormhole_test_mem.v new file mode 100644 index 000000000..ebeb8123b --- /dev/null +++ b/testbenches/common/v/bsg_nonsynth_wormhole_test_mem.v @@ -0,0 +1,231 @@ +module bsg_nonsynth_wormhole_test_mem + import bsg_manycore_pkg::*; + #(parameter vcache_data_width_p = "inv" + , parameter vcache_block_size_in_words_p="inv" + , parameter vcache_dma_data_width_p="inv" + , parameter num_vcaches_p = "inv" // how many vcaches are mapped to this test mem? 
+ , parameter lg_num_vcaches_lp = `BSG_SAFE_CLOG2(num_vcaches_p) + + , parameter wh_cid_width_p="inv" + , parameter wh_flit_width_p="inv" + , parameter wh_cord_width_p="inv" + , parameter wh_len_width_p="inv" + , parameter wh_ruche_factor_p="inv" + + // determines address hashing based on cid and src_cord + , parameter no_concentration_p=0 + + , parameter data_len_lp = (vcache_data_width_p*vcache_block_size_in_words_p/vcache_dma_data_width_p) + , parameter longint unsigned mem_size_p = "inv" // size of memory in bytes + , parameter mem_els_lp = mem_size_p/(vcache_dma_data_width_p/8) + , parameter mem_addr_width_lp = `BSG_SAFE_CLOG2(mem_els_lp) + + , parameter lg_wh_ruche_factor_lp = `BSG_SAFE_CLOG2(wh_ruche_factor_p) + + , parameter count_width_lp = `BSG_SAFE_CLOG2(data_len_lp) + + , parameter block_offset_width_lp = `BSG_SAFE_CLOG2((vcache_data_width_p>>3)*vcache_block_size_in_words_p) + + , parameter wh_link_sif_width_lp = + `bsg_ready_and_link_sif_width(wh_flit_width_p) + ) + ( + input clk_i + , input reset_i + + , input [wh_link_sif_width_lp-1:0] wh_link_sif_i + , output [wh_link_sif_width_lp-1:0] wh_link_sif_o + ); + + + // memory + logic mem_we; + logic [mem_addr_width_lp-1:0] mem_addr; + logic [vcache_dma_data_width_p-1:0] mem_w_data; + logic [vcache_dma_data_width_p-1:0] mem_r_data; + logic [vcache_dma_data_width_p-1:0] mem_r [mem_els_lp-1:0]; + + always_ff @ (posedge clk_i) begin + if (mem_we) begin + mem_r[mem_addr] <= mem_w_data; + end + end + + assign mem_r_data = mem_r[mem_addr]; + + + `declare_bsg_ready_and_link_sif_s(wh_flit_width_p, wh_link_sif_s); + wh_link_sif_s wh_link_sif_in; + wh_link_sif_s wh_link_sif_out; + assign wh_link_sif_in = wh_link_sif_i; + assign wh_link_sif_o = wh_link_sif_out; + + + `declare_bsg_manycore_vcache_wh_header_flit_s(wh_flit_width_p,wh_cord_width_p,wh_len_width_p,wh_cid_width_p); + + bsg_manycore_vcache_wh_header_flit_s header_flit_in; + assign header_flit_in = wh_link_sif_in.data; + + logic clear_li; + logic up_li; + 
logic [count_width_lp-1:0] count_lo; + + bsg_counter_clear_up #( + .max_val_p(data_len_lp-1) + ,.init_val_p(0) + ) count ( + .clk_i(clk_i) + ,.reset_i(reset_i) + ,.clear_i(clear_li) + ,.up_i(up_li) + ,.count_o(count_lo) + ); + + + typedef enum logic [2:0] { + RESET + ,READY + ,RECV_ADDR + ,RECV_EVICT_DATA + ,SEND_FILL_HEADER + ,SEND_FILL_DATA + } mem_state_e; + + mem_state_e mem_state_r, mem_state_n; + logic write_not_read_r, write_not_read_n; + logic [wh_flit_width_p-1:0] addr_r, addr_n; + logic [wh_cord_width_p-1:0] src_cord_r, src_cord_n; + logic [wh_cid_width_p-1:0] cid_r, cid_n; + + bsg_manycore_vcache_wh_header_flit_s header_flit_out; + assign header_flit_out.unused = '0; + assign header_flit_out.write_not_read = '0; // dont care + assign header_flit_out.src_cord = '0; // dont care + assign header_flit_out.cid = cid_r; + assign header_flit_out.len = wh_len_width_p'(data_len_lp); + assign header_flit_out.dest_cord = src_cord_r; + + always_comb begin + wh_link_sif_out = '0; + clear_li = 1'b0; + up_li = 1'b0; + + write_not_read_n = write_not_read_r; + addr_n = addr_r; + src_cord_n = src_cord_r; + cid_n = cid_r; + mem_state_n = mem_state_r; + + mem_we = 1'b0; + mem_w_data = wh_link_sif_in.data; + + + case (mem_state_r) + + RESET: begin + mem_state_n = READY; + end + + READY: begin + wh_link_sif_out.ready_and_rev = 1'b1; + if (wh_link_sif_in.v) begin + write_not_read_n = header_flit_in.write_not_read; + src_cord_n = header_flit_in.src_cord; + cid_n = header_flit_in.cid; + mem_state_n = RECV_ADDR; + end + end + + RECV_ADDR: begin + wh_link_sif_out.ready_and_rev = 1'b1; + if (wh_link_sif_in.v) begin + addr_n = wh_link_sif_in.data; + mem_state_n = write_not_read_r + ? RECV_EVICT_DATA + : SEND_FILL_HEADER; + end + end + + RECV_EVICT_DATA: begin + wh_link_sif_out.ready_and_rev = 1'b1; + if (wh_link_sif_in.v) begin + mem_we = 1'b1; + up_li = (count_lo != data_len_lp-1); + clear_li = (count_lo == data_len_lp-1); + mem_state_n = (count_lo == data_len_lp-1) + ? 
READY + : RECV_EVICT_DATA; + end + end + + SEND_FILL_HEADER: begin + wh_link_sif_out.v = 1'b1; + wh_link_sif_out.data = header_flit_out; + if (wh_link_sif_in.ready_and_rev) begin + mem_state_n = SEND_FILL_DATA; + end + end + + SEND_FILL_DATA: begin + wh_link_sif_out.v = 1'b1; + wh_link_sif_out.data = mem_r_data; + if (wh_link_sif_in.ready_and_rev) begin + clear_li = (count_lo == data_len_lp-1); + up_li = (count_lo != data_len_lp-1); + mem_state_n = (count_lo == data_len_lp-1) + ? READY + : SEND_FILL_DATA; + end + end + + default: begin + mem_state_n = READY; // never happens + end + + endcase + + + end + + + // address hashing + if (no_concentration_p) begin + // no concentration. each wh ruche link gets a test_mem. + assign mem_addr = { + src_cord_r[lg_wh_ruche_factor_lp+:lg_num_vcaches_lp], + addr_r[block_offset_width_lp+:mem_addr_width_lp-lg_num_vcaches_lp-count_width_lp], + count_lo + }; + end + else begin + // wh ruche links coming from top and bottom caches are concentrated into one link. 
+ assign mem_addr = { + (1)'(cid_r/wh_ruche_factor_p), // determine north or south vcache + src_cord_r[0+:(lg_num_vcaches_lp-1)], + addr_r[block_offset_width_lp+:mem_addr_width_lp-lg_num_vcaches_lp-count_width_lp], + count_lo + }; + end + + + + + + always_ff @ (posedge clk_i) begin + if (reset_i) begin + mem_state_r <= RESET; + write_not_read_r <= 1'b0; + src_cord_r <= '0; + cid_r <= '0; + addr_r <= '0; + end + else begin + mem_state_r <= mem_state_n; + write_not_read_r <= write_not_read_n; + src_cord_r <= src_cord_n; + cid_r <= cid_n; + addr_r <= addr_n; + end + end + +endmodule diff --git a/testbenches/common/v/infinite_mem_profiler.v b/testbenches/common/v/infinite_mem_profiler.v index 9456edfe7..fcdbacb5f 100644 --- a/testbenches/common/v/infinite_mem_profiler.v +++ b/testbenches/common/v/infinite_mem_profiler.v @@ -34,10 +34,10 @@ module infinite_mem_profiler assign packet_cast = packet_lo; wire inc = packet_v_lo & packet_yumi_li; - wire inc_ld = inc & (packet_cast.op == e_remote_load); - wire inc_st = inc & (packet_cast.op == e_remote_store); - wire inc_amoswap = inc & (packet_cast.op == e_remote_amo) & (packet_cast.op_ex.amo_type == e_amo_swap); - wire inc_amoor = inc & (packet_cast.op == e_remote_amo) & (packet_cast.op_ex.amo_type == e_amo_or); + wire inc_ld = inc & (packet_cast.op_v2 == e_remote_load); + wire inc_st = inc & (packet_cast.op_v2 == e_remote_store); + wire inc_amoswap = inc & (packet_cast.op_v2 == e_remote_amoswap); + wire inc_amoor = inc & (packet_cast.op_v2 == e_remote_amoor); integer ld_count_r; integer st_count_r; diff --git a/testbenches/common/v/remote_load_trace.v b/testbenches/common/v/remote_load_trace.v index 00948cef0..b7bd9e01d 100644 --- a/testbenches/common/v/remote_load_trace.v +++ b/testbenches/common/v/remote_load_trace.v @@ -28,11 +28,16 @@ module remote_load_trace , parameter data_width_p="inv" , parameter x_cord_width_p="inv" , parameter y_cord_width_p="inv" + , parameter pod_x_cord_width_p="inv" + , parameter 
pod_y_cord_width_p="inv" + , parameter num_tiles_x_p="inv" + , parameter num_tiles_y_p="inv" + , parameter x_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter y_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_y_p) , parameter origin_x_cord_p="inv" , parameter origin_y_cord_p="inv" - , parameter tracefile_p = "remote_load_trace.csv" , parameter packet_width_lp= @@ -56,15 +61,19 @@ module remote_load_trace // coord - , input [x_cord_width_p-1:0] my_x_i - , input [y_cord_width_p-1:0] my_y_i - + , input [x_subcord_width_lp-1:0] my_x_i + , input [y_subcord_width_lp-1:0] my_y_i + , input [pod_x_cord_width_p-1:0] pod_x_i + , input [pod_y_cord_width_p-1:0] pod_y_i // ctrl signal , input trace_en_i , input [31:0] global_ctr_i ); + wire [x_cord_width_p-1:0] global_x = {pod_x_i, my_x_i}; + wire [y_cord_width_p-1:0] global_y = {pod_y_i, my_y_i}; + // manycore packet `declare_bsg_manycore_packet_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); bsg_manycore_packet_s out_packet; @@ -86,13 +95,13 @@ module remote_load_trace remote_load_status_s icache_status_r; wire int_rl_v = out_v_o & ( - ((out_packet.op == e_remote_load) & ~load_info.icache_fetch & ~load_info.float_wb) - | (out_packet.op == e_remote_amo)); + ((out_packet.op_v2 == e_remote_load) & ~load_info.icache_fetch & ~load_info.float_wb) + | remote_req_i.is_amo_op); wire float_rl_v = out_v_o & ( - (out_packet.op == e_remote_load) & load_info.float_wb); + (out_packet.op_v2 == e_remote_load) & load_info.float_wb); wire icache_rl_v = out_v_o & ( - (out_packet.op == e_remote_load) & load_info.icache_fetch); + (out_packet.op_v2 == e_remote_load) & load_info.icache_fetch); logic [RV32_reg_els_gp-1:0] int_rl_we; logic [RV32_reg_els_gp-1:0] float_rl_we; @@ -159,8 +168,8 @@ module remote_load_trace // origin tile writes the csv header. 
always @ (negedge reset_i) begin - if ((my_x_i == x_cord_width_p'(origin_x_cord_p)) - & (my_y_i == y_cord_width_p'(origin_y_cord_p))) begin + if ((global_x == x_cord_width_p'(origin_x_cord_p)) + & (global_y == y_cord_width_p'(origin_y_cord_p))) begin fd = $fopen(tracefile_p, "a"); $fwrite(fd,"start_cycle,end_cycle,src_x,src_y,dest_x,dest_y,type,latency\n"); @@ -182,8 +191,8 @@ module remote_load_trace $fwrite(fd,"%0d,%0d,%0d,%0d,%0d,%0d,%s,%0d\n", int_rl_status_r[returned_reg_id_i].start_cycle, global_ctr_i, - my_x_i, - my_y_i, + global_x, + global_y, int_rl_status_r[returned_reg_id_i].x_cord, int_rl_status_r[returned_reg_id_i].y_cord, "int", @@ -197,8 +206,8 @@ module remote_load_trace $fwrite(fd,"%0d,%0d,%0d,%0d,%0d,%0d,%s,%0d\n", float_rl_status_r[returned_reg_id_i].start_cycle, global_ctr_i, - my_x_i, - my_y_i, + global_x, + global_y, float_rl_status_r[returned_reg_id_i].x_cord, float_rl_status_r[returned_reg_id_i].y_cord, "float", @@ -212,8 +221,8 @@ module remote_load_trace $fwrite(fd,"%0d,%0d,%0d,%0d,%0d,%0d,%s,%0d\n", icache_status_r.start_cycle, global_ctr_i, - my_x_i, - my_y_i, + global_x, + global_y, icache_status_r.x_cord, icache_status_r.y_cord, "icache", diff --git a/testbenches/common/v/spmd_testbench.v b/testbenches/common/v/spmd_testbench.v index b00e264a9..b36c27e12 100644 --- a/testbenches/common/v/spmd_testbench.v +++ b/testbenches/common/v/spmd_testbench.v @@ -1,1177 +1,183 @@ /** - * spmd_testbench.v + * spmd_testbench.v * - */ + */ -module spmd_testbench; - import bsg_noc_pkg::*; // {P=0, W, E, N, S} + +module spmd_testbench(); import bsg_manycore_pkg::*; import bsg_manycore_mem_cfg_pkg::*; - import bsg_manycore_network_cfg_pkg::*; - // defines from VCS - // rename it to something more familiar. 
+ parameter num_pods_x_p = `BSG_MACHINE_PODS_X; + parameter num_pods_y_p = `BSG_MACHINE_PODS_Y; parameter num_tiles_x_p = `BSG_MACHINE_GLOBAL_X; parameter num_tiles_y_p = `BSG_MACHINE_GLOBAL_Y; - parameter vcache_sets_p = `BSG_MACHINE_VCACHE_SET; - parameter vcache_ways_p = `BSG_MACHINE_VCACHE_WAY; - parameter vcache_block_size_in_words_p = `BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS; // in words - parameter vcache_dma_data_width_p = `BSG_MACHINE_VCACHE_DMA_DATA_WIDTH; // in bits - parameter bsg_dram_size_p = `BSG_MACHINE_DRAM_SIZE_WORDS; // in words - parameter bsg_dram_included_p = `BSG_MACHINE_DRAM_INCLUDED; - parameter bsg_max_epa_width_p = `BSG_MACHINE_MAX_EPA_WIDTH; - parameter bsg_manycore_mem_cfg_e bsg_manycore_mem_cfg_p = `BSG_MACHINE_MEM_CFG; - parameter bsg_branch_trace_en_p = `BSG_MACHINE_BRANCH_TRACE_EN; - parameter vcache_miss_fifo_els_p = `BSG_MACHINE_VCACHE_MISS_FIFO_ELS; - - parameter ruche_factor_X_p = `BSG_MACHINE_RUCHE_FACTOR_X; - parameter bsg_manycore_network_cfg_e bsg_manycore_network_cfg_p = `BSG_MACHINE_NETWORK_CFG; - - parameter int hetero_type_vec_p [0:((num_tiles_y_p-1)*num_tiles_x_p) - 1] = '{`BSG_MACHINE_HETERO_TYPE_VEC}; - - // constant params + parameter x_cord_width_p = 7; + parameter y_cord_width_p = 7; + parameter pod_x_cord_width_p = 3; + parameter pod_y_cord_width_p = 4; + parameter num_subarray_x_p = `BSG_MACHINE_SUBARRAY_X; + parameter num_subarray_y_p = `BSG_MACHINE_SUBARRAY_Y; parameter data_width_p = 32; + parameter addr_width_p = `BSG_MACHINE_MAX_EPA_WIDTH; // word addr parameter dmem_size_p = 1024; parameter icache_entries_p = 1024; parameter icache_tag_width_p = 12; + parameter ruche_factor_X_p = `BSG_MACHINE_RUCHE_FACTOR_X; - parameter axi_id_width_p = 6; - parameter axi_addr_width_p = 64; - parameter axi_data_width_p = 256; - parameter axi_burst_len_p = 1; - - `define ORIGIN_X_CORD_P 0 - `define ORIGIN_Y_CORD_P 2 - - // dmc param - parameter dram_ctrl_addr_width_p = 29; // 512 MB - - // dramsim3 HBM2 param - `define dram_pkg 
bsg_dramsim3_hbm2_8gb_x128_pkg - parameter hbm2_data_width_p = `dram_pkg::data_width_p; - parameter hbm2_channel_addr_width_p = `dram_pkg::channel_addr_width_p; - parameter hbm2_num_channels_p = `dram_pkg::num_channels_p; - parameter hbm2_num_cache_per_channel_p = 16; - parameter hbm2_num_channel_per_side_p = (num_tiles_x_p/hbm2_num_cache_per_channel_p); - - // derived param - parameter axi_strb_width_lp = (axi_data_width_p>>3); - parameter x_cord_width_lp = `BSG_SAFE_CLOG2(num_tiles_x_p); - parameter y_cord_width_lp = `BSG_SAFE_CLOG2(num_tiles_y_p + 2); + parameter num_vcache_rows_p = `BSG_MACHINE_NUM_VCACHE_ROWS; + parameter vcache_data_width_p = data_width_p; + parameter vcache_sets_p = `BSG_MACHINE_VCACHE_SET; + parameter vcache_ways_p = `BSG_MACHINE_VCACHE_WAY; + parameter vcache_block_size_in_words_p = `BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS; // in words + parameter vcache_dma_data_width_p = `BSG_MACHINE_VCACHE_DMA_DATA_WIDTH; // in bits + parameter vcache_size_p = vcache_sets_p*vcache_ways_p*vcache_block_size_in_words_p; + parameter vcache_addr_width_p=(addr_width_p-1+`BSG_SAFE_CLOG2(data_width_p>>3)); // in bytes + parameter num_vcaches_per_channel_p = `BSG_MACHINE_NUM_VCACHES_PER_CHANNEL; - parameter vcache_size_p = vcache_sets_p * vcache_ways_p * vcache_block_size_in_words_p; - parameter byte_offset_width_lp=`BSG_SAFE_CLOG2(data_width_p>>3); - parameter cache_addr_width_lp=(bsg_max_epa_width_p-1+byte_offset_width_lp); - parameter data_mask_width_lp=(data_width_p>>3); - parameter cache_bank_addr_width_lp = `BSG_SAFE_CLOG2(bsg_dram_size_p/(2*num_tiles_x_p)*4); // byte addr + parameter wh_flit_width_p = vcache_dma_data_width_p; + parameter wh_ruche_factor_p = 2; + parameter wh_cid_width_p = `BSG_SAFE_CLOG2(2*wh_ruche_factor_p); // no concentration in this testbench; cid is ignored. 
+ parameter wh_len_width_p = `BSG_SAFE_CLOG2(1+(vcache_block_size_in_words_p*vcache_data_width_p/vcache_dma_data_width_p)); // header + addr + data + parameter wh_cord_width_p = x_cord_width_p; - // print machine settings - initial begin - $display("MACHINE SETTINGS:"); - $display("[INFO][TESTBENCH] BSG_MACHINE_GLOBAL_X = %d", num_tiles_x_p); - $display("[INFO][TESTBENCH] BSG_MACHINE_GLOBAL_Y = %d", num_tiles_y_p); - $display("[INFO][TESTBENCH] BSG_MACHINE_VCACHE_SET = %d", vcache_sets_p); - $display("[INFO][TESTBENCH] BSG_MACHINE_VCACHE_WAY = %d", vcache_ways_p); - $display("[INFO][TESTBENCH] BSG_MACHINE_VCACHE_BLOCK_SIZE_WORDS = %d", vcache_block_size_in_words_p); - $display("[INFO][TESTBENCH] BSG_MACHINE_VCACHE_MISS_FIFO_ELS = %d", vcache_miss_fifo_els_p); - $display("[INFO][TESTBENCH] BSG_MACHINE_DRAM_SIZE_WORDS = %d", bsg_dram_size_p); - $display("[INFO][TESTBENCH] BSG_MACHINE_DRAM_INCLUDED = %d", bsg_dram_included_p); - $display("[INFO][TESTBENCH] BSG_MACHINE_MAX_EPA_WIDTH = %d", bsg_max_epa_width_p); - $display("[INFO][TESTBENCH] BSG_MACHINE_MEM_CFG = %s", bsg_manycore_mem_cfg_p.name()); - $display("[INFO][TESTBENCH] BSG_MACHINE_NETWORK_CFG = %s", bsg_manycore_network_cfg_p.name()); - $display("[INFO][TESTBENCH] BSG_MACHINE_RUCHE_FACTOR_X = %d", ruche_factor_X_p); - end + parameter bsg_dram_size_p = `BSG_MACHINE_DRAM_SIZE_WORDS; // in words + parameter bsg_dram_included_p = `BSG_MACHINE_DRAM_INCLUDED; + parameter bsg_manycore_mem_cfg_e bsg_manycore_mem_cfg_p = `BSG_MACHINE_MEM_CFG; + parameter reset_depth_p = 3; - // clock and reset generation - // + // clock and reset parameter core_clk_period_p = 1000; // 1000 ps == 1 GHz - bit core_clk; - bit reset; - + bit global_reset; bsg_nonsynth_clock_gen #( .cycle_time_p(core_clk_period_p) ) clock_gen ( .o(core_clk) ); - bsg_nonsynth_reset_gen #( .num_clocks_p(1) ,.reset_cycles_lo_p(0) ,.reset_cycles_hi_p(16) ) reset_gen ( .clk_i(core_clk) - ,.async_reset_o(reset) + ,.async_reset_o(global_reset) ); - // bsg_manycore 
has 3 flops that reset signal needs to go through. - // So we are trying to match that here. - logic [2:0] reset_r; + // testbench + `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + bsg_manycore_link_sif_s io_link_sif_li, io_link_sif_lo; + logic tag_done_lo; - always_ff @ (posedge core_clk) begin - reset_r[0] <= reset; - reset_r[1] <= reset_r[0]; - reset_r[2] <= reset_r[1]; - end - - - // instantiate manycore - `declare_bsg_manycore_link_sif_s(bsg_max_epa_width_p,data_width_p, - x_cord_width_lp,y_cord_width_lp); - - bsg_manycore_link_sif_s [S:N][num_tiles_x_p-1:0] ver_link_li, ver_link_lo; - bsg_manycore_link_sif_s [E:W][num_tiles_y_p-1:0] hor_link_li, hor_link_lo; - bsg_manycore_link_sif_s [num_tiles_x_p-1:0] io_link_li, io_link_lo; - - - // Configurable Network - localparam logic [e_network_max_val-1:0] network_cfg_lp = (1 << bsg_manycore_network_cfg_p); - - if (network_cfg_lp[e_network_crossbar]) begin: cnet - - bsg_manycore_top_crossbar #( - .dmem_size_p(dmem_size_p) - ,.icache_entries_p(icache_entries_p) - ,.icache_tag_width_p(icache_tag_width_p) - ,.vcache_size_p(vcache_size_p) - ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) - ,.vcache_sets_p(vcache_sets_p) - ,.data_width_p(data_width_p) - ,.addr_width_p(bsg_max_epa_width_p) - ,.num_tiles_x_p(num_tiles_x_p) - ,.num_tiles_y_p(num_tiles_y_p) - ) DUT ( - .clk_i(core_clk) - ,.reset_i(reset) - - ,.ver_link_sif_i(ver_link_li) - ,.ver_link_sif_o(ver_link_lo) - - ,.io_link_sif_i(io_link_li) - ,.io_link_sif_o(io_link_lo) - ); - - end - else if (network_cfg_lp[e_network_mesh]) begin: mnet - - bsg_manycore_top_mesh #( - .dmem_size_p(dmem_size_p) - ,.icache_entries_p(icache_entries_p) - ,.icache_tag_width_p(icache_tag_width_p) - ,.vcache_size_p(vcache_size_p) - ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) - ,.vcache_sets_p(vcache_sets_p) - ,.data_width_p(data_width_p) - ,.addr_width_p(bsg_max_epa_width_p) - ,.num_tiles_x_p(num_tiles_x_p) - 
,.num_tiles_y_p(num_tiles_y_p) - ,.branch_trace_en_p(bsg_branch_trace_en_p) - ,.hetero_type_vec_p(hetero_type_vec_p) - ) DUT ( - .clk_i(core_clk) - ,.reset_i(reset) - - ,.hor_link_sif_i(hor_link_li) - ,.hor_link_sif_o(hor_link_lo) - - ,.ver_link_sif_i(ver_link_li) - ,.ver_link_sif_o(ver_link_lo) - - ,.io_link_sif_i(io_link_li) - ,.io_link_sif_o(io_link_lo) - ); - - end - else if (network_cfg_lp[e_network_half_ruche_x]) begin: rnet - - `declare_bsg_manycore_ruche_x_link_sif_s(bsg_max_epa_width_p,data_width_p,x_cord_width_lp,y_cord_width_lp); - bsg_manycore_ruche_x_link_sif_s [E:W][num_tiles_y_p-2:0][ruche_factor_X_p-1:0] ruche_link_li, ruche_link_lo; - - bsg_manycore_top_ruche #( - .dmem_size_p(dmem_size_p) - ,.icache_entries_p(icache_entries_p) - ,.icache_tag_width_p(icache_tag_width_p) - ,.vcache_size_p(vcache_size_p) - ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) - ,.vcache_sets_p(vcache_sets_p) - ,.data_width_p(data_width_p) - ,.addr_width_p(bsg_max_epa_width_p) - ,.num_tiles_x_p(num_tiles_x_p) - ,.num_tiles_y_p(num_tiles_y_p) - ,.branch_trace_en_p(bsg_branch_trace_en_p) - ,.hetero_type_vec_p(hetero_type_vec_p) - ,.ruche_factor_X_p(ruche_factor_X_p) - ) DUT ( - .clk_i(core_clk) - ,.reset_i(reset) - - ,.hor_link_sif_i(hor_link_li) - ,.hor_link_sif_o(hor_link_lo) - - ,.ver_link_sif_i(ver_link_li) - ,.ver_link_sif_o(ver_link_lo) - - ,.io_link_sif_i(io_link_li) - ,.io_link_sif_o(io_link_lo) - - ,.ruche_link_i(ruche_link_li) - ,.ruche_link_o(ruche_link_lo) - ); - - - // tieoff ruche links - for (genvar i = 0; i < num_tiles_y_p-1; i++) begin: y - for (genvar j = 0; j < ruche_factor_X_p; j++) begin: r + bsg_nonsynth_manycore_testbench #( + .num_pods_x_p(num_pods_x_p) + ,.num_pods_y_p(num_pods_y_p) + ,.num_tiles_x_p(num_tiles_x_p) + ,.num_tiles_y_p(num_tiles_y_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) + ,.addr_width_p(addr_width_p) + 
,.data_width_p(data_width_p) + ,.dmem_size_p(dmem_size_p) + ,.icache_entries_p(icache_entries_p) + ,.icache_tag_width_p(icache_tag_width_p) + ,.ruche_factor_X_p(ruche_factor_X_p) + + ,.num_subarray_x_p(num_subarray_x_p) + ,.num_subarray_y_p(num_subarray_y_p) + + ,.num_vcache_rows_p(num_vcache_rows_p) + ,.vcache_data_width_p(vcache_data_width_p) + ,.vcache_sets_p(vcache_sets_p) + ,.vcache_ways_p(vcache_ways_p) + ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) + ,.vcache_dma_data_width_p(vcache_dma_data_width_p) + ,.vcache_size_p(vcache_size_p) + ,.vcache_addr_width_p(vcache_addr_width_p) + ,.num_vcaches_per_channel_p(num_vcaches_per_channel_p) + + ,.wh_flit_width_p(wh_flit_width_p) + ,.wh_ruche_factor_p(wh_ruche_factor_p) + ,.wh_cid_width_p(wh_cid_width_p) + ,.wh_len_width_p(wh_len_width_p) + ,.wh_cord_width_p(wh_cord_width_p) + + ,.bsg_manycore_mem_cfg_p(bsg_manycore_mem_cfg_p) + ,.bsg_dram_size_p(bsg_dram_size_p) + + ,.reset_depth_p(reset_depth_p) + +`ifdef BSG_ENABLE_PROFILING + ,.enable_vcore_profiling_p(1) + ,.enable_router_profiling_p(1) + ,.enable_cache_profiling_p(1) +`endif + ) tb ( + .clk_i(core_clk) + ,.reset_i(global_reset) - bsg_manycore_ruche_x_link_sif_tieoff #( - .addr_width_p(bsg_max_epa_width_p) - ,.data_width_p(data_width_p) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - ,.ruche_factor_X_p(ruche_factor_X_p) - ,.ruche_stage_p(j) - ) tieoff_re ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - ,.ruche_link_i(ruche_link_lo[E][i][j]) - ,.ruche_link_o(ruche_link_li[E][i][j]) - ); + ,.io_link_sif_i(io_link_sif_li) + ,.io_link_sif_o(io_link_sif_lo) - bsg_manycore_ruche_x_link_sif_tieoff #( - .addr_width_p(bsg_max_epa_width_p) - ,.data_width_p(data_width_p) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - ,.ruche_factor_X_p(ruche_factor_X_p) - ,.ruche_stage_p(j) - ) tieoff_rw ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - ,.ruche_link_i(ruche_link_lo[W][i][j]) - 
,.ruche_link_o(ruche_link_li[W][i][j]) - ); - end - end - end + ,.tag_done_o(tag_done_lo) + ); + // reset is deasserted when tag programming is done. + logic reset_r; + bsg_dff_chain #( + .width_p(1) + ,.num_stages_p(reset_depth_p) + ) reset_dff ( + .clk_i(core_clk) + ,.data_i(~tag_done_lo) + ,.data_o(reset_r) + ); - // instantiate the loader and moniter - // connects to P-port of (x,y)=(0,1) + // SPMD LOADER logic print_stat_v; logic [data_width_p-1:0] print_stat_tag; - bsg_nonsynth_manycore_io_complex #( - .addr_width_p(bsg_max_epa_width_p) + .addr_width_p(addr_width_p) ,.data_width_p(data_width_p) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - - ,.num_tiles_x_p(num_tiles_x_p) - ,.num_tiles_y_p(num_tiles_y_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.io_x_cord_p(`BSG_MACHINE_HOST_X_CORD) + ,.io_y_cord_p(`BSG_MACHINE_HOST_Y_CORD) ) io ( .clk_i(core_clk) - ,.reset_i(reset_r[2]) - ,.io_link_sif_i(io_link_lo[0]) - ,.io_link_sif_o(io_link_li[0]) + ,.reset_i(reset_r) + ,.io_link_sif_i(io_link_sif_lo) + ,.io_link_sif_o(io_link_sif_li) ,.print_stat_v_o(print_stat_v) ,.print_stat_tag_o(print_stat_tag) ,.loader_done_o() ); + // reset dff - // global counter - // - logic [31:0] global_ctr; - - bsg_cycle_counter global_cc ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - ,.ctr_r_o(global_ctr) - ); - - - // // - // Configurable Memory System // - // // - localparam logic [e_max_val-1:0] mem_cfg_lp = (1 << bsg_manycore_mem_cfg_p); - - if (mem_cfg_lp[e_vcache_blocking_axi4_nonsynth_mem] - | mem_cfg_lp[e_vcache_non_blocking_axi4_nonsynth_mem] - | mem_cfg_lp[e_vcache_blocking_dmc_lpddr] - | mem_cfg_lp[e_vcache_non_blocking_dmc_lpddr] - | mem_cfg_lp[e_vcache_blocking_dramsim3_hbm2] - ) begin: lv1_dma - - // for now blocking and non-blocking shares the same wire, since interface is - // the same. But it might change in the future. 
- import bsg_cache_pkg::*; - localparam dma_pkt_width_lp = `bsg_cache_dma_pkt_width(cache_addr_width_lp); - - logic [S:N][num_tiles_x_p-1:0][dma_pkt_width_lp-1:0] dma_pkt; - logic [S:N][num_tiles_x_p-1:0] dma_pkt_v_lo; - logic [S:N][num_tiles_x_p-1:0] dma_pkt_yumi_li; - - logic [S:N][num_tiles_x_p-1:0][vcache_dma_data_width_p-1:0] dma_data_li; - logic [S:N][num_tiles_x_p-1:0] dma_data_v_li; - logic [S:N][num_tiles_x_p-1:0] dma_data_ready_lo; - - logic [S:N][num_tiles_x_p-1:0][vcache_dma_data_width_p-1:0] dma_data_lo; - logic [S:N][num_tiles_x_p-1:0] dma_data_v_lo; - logic [S:N][num_tiles_x_p-1:0] dma_data_yumi_li; - - end - - // LEVEL 1 - if (mem_cfg_lp[e_infinite_mem]) begin: lv1_infty - localparam infmem_els_lp = 1<<(bsg_max_epa_width_p-x_cord_width_lp-1); - for (genvar j = N; j <= S; j++) begin: y - for (genvar i = 0; i < num_tiles_x_p; i++) begin: x - bsg_nonsynth_mem_infinite #( - .data_width_p(data_width_p) - ,.addr_width_p(bsg_max_epa_width_p) - ,.mem_els_p(infmem_els_lp) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - ,.id_p(num_tiles_x_p*j + i) - ) mem_infty ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - - ,.link_sif_i(ver_link_lo[j][i]) - ,.link_sif_o(ver_link_li[j][i]) - - ,.my_x_i((x_cord_width_lp)'(i)) - ,.my_y_i((y_cord_width_lp)'(0)) - ); - end - end - - - bind bsg_nonsynth_mem_infinite infinite_mem_profiler #( - .data_width_p(data_width_p) - ,.addr_width_p(addr_width_p) - ,.x_cord_width_p(x_cord_width_p) - ,.y_cord_width_p(y_cord_width_p) - ) infinite_mem_prof ( - .* - ,.global_ctr_i($root.spmd_testbench.global_ctr) - ,.print_stat_v_i($root.spmd_testbench.print_stat_v) - ,.print_stat_tag_i($root.spmd_testbench.print_stat_tag) - ); - - end - else if (mem_cfg_lp[e_vcache_blocking_axi4_nonsynth_mem] - | mem_cfg_lp[e_vcache_blocking_dmc_lpddr] - | mem_cfg_lp[e_vcache_blocking_dramsim3_hbm2] - ) begin: lv1_vcache - - for (genvar j = N; j <= S; j++) begin: y - for (genvar i = 0; i < num_tiles_x_p; i++) begin: x - 
bsg_manycore_vcache_blocking #( - .data_width_p(data_width_p) - ,.addr_width_p(bsg_max_epa_width_p) - ,.block_size_in_words_p(vcache_block_size_in_words_p) - ,.sets_p(vcache_sets_p) - ,.ways_p(vcache_ways_p) - ,.dma_data_width_p(vcache_dma_data_width_p) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - ) vcache ( - .clk_i(core_clk) - ,.reset_i(reset_r[1]) - - ,.link_sif_i(ver_link_lo[j][i]) - ,.link_sif_o(ver_link_li[j][i]) - - ,.dma_pkt_o(lv1_dma.dma_pkt[j][i]) - ,.dma_pkt_v_o(lv1_dma.dma_pkt_v_lo[j][i]) - ,.dma_pkt_yumi_i(lv1_dma.dma_pkt_yumi_li[j][i]) - - ,.dma_data_i(lv1_dma.dma_data_li[j][i]) - ,.dma_data_v_i(lv1_dma.dma_data_v_li[j][i]) - ,.dma_data_ready_o(lv1_dma.dma_data_ready_lo[j][i]) - - ,.dma_data_o(lv1_dma.dma_data_lo[j][i]) - ,.dma_data_v_o(lv1_dma.dma_data_v_lo[j][i]) - ,.dma_data_yumi_i(lv1_dma.dma_data_yumi_li[j][i]) - ); - end - end - - bind bsg_cache vcache_profiler #( - .data_width_p(data_width_p) - ,.addr_width_p(addr_width_p) - ,.header_print_p("y[3].x[0]") - ,.ways_p(ways_p) - ) vcache_prof ( - // everything else - .* - // bsg_cache_miss - ,.chosen_way_n(miss.chosen_way_n) - // from spmd_testbench - ,.global_ctr_i($root.spmd_testbench.global_ctr) - ,.print_stat_v_i($root.spmd_testbench.print_stat_v) - ,.print_stat_tag_i($root.spmd_testbench.print_stat_tag) - ,.trace_en_i($root.spmd_testbench.trace_en) - ); - - - end - else if (mem_cfg_lp[e_vcache_non_blocking_axi4_nonsynth_mem] - |mem_cfg_lp[e_vcache_non_blocking_dmc_lpddr]) begin: lv1_vcache_nb - - for (genvar j = N; j <= S; j++) begin: y - for (genvar i = 0; i < num_tiles_x_p; i++) begin: x - bsg_manycore_vcache_non_blocking #( - .data_width_p(data_width_p) - ,.addr_width_p(bsg_max_epa_width_p) - ,.block_size_in_words_p(vcache_block_size_in_words_p) - ,.sets_p(vcache_sets_p) - ,.ways_p(vcache_ways_p) - ,.miss_fifo_els_p(vcache_miss_fifo_els_p) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - ) vcache ( - .clk_i(core_clk) - 
,.reset_i(reset_r[1]) - - ,.link_sif_i(ver_link_lo[j][i]) - ,.link_sif_o(ver_link_li[j][i]) - - ,.dma_pkt_o(lv1_dma.dma_pkt[j][i]) - ,.dma_pkt_v_o(lv1_dma.dma_pkt_v_lo[j][i]) - ,.dma_pkt_yumi_i(lv1_dma.dma_pkt_yumi_li[j][i]) - - ,.dma_data_i(lv1_dma.dma_data_li[j][i]) - ,.dma_data_v_i(lv1_dma.dma_data_v_li[j][i]) - ,.dma_data_ready_o(lv1_dma.dma_data_ready_lo[j][i]) - - ,.dma_data_o(lv1_dma.dma_data_lo[j][i]) - ,.dma_data_v_o(lv1_dma.dma_data_v_lo[j][i]) - ,.dma_data_yumi_i(lv1_dma.dma_data_yumi_li[j][i]) - ); - end - end - - bind bsg_cache_non_blocking vcache_non_blocking_profiler #( - .data_width_p(data_width_p) - ,.addr_width_p(addr_width_p) - ,.sets_p(sets_p) - ,.ways_p(ways_p) - ,.id_width_p(id_width_p) - ,.block_size_in_words_p(block_size_in_words_p) - ,.header_print_p("y[3].x[0]") - ) vcache_prof ( - .* - ,.replacement_dirty(mhu0.replacement_dirty) - ,.replacement_valid(mhu0.replacement_valid) - ,.global_ctr_i($root.spmd_testbench.global_ctr) - ,.print_stat_v_i($root.spmd_testbench.print_stat_v) - ,.print_stat_tag_i($root.spmd_testbench.print_stat_tag) - ); - end - - - // LEVEL 2 - // - if (mem_cfg_lp[e_vcache_blocking_axi4_nonsynth_mem] - | mem_cfg_lp[e_vcache_non_blocking_axi4_nonsynth_mem]) begin: lv2_axi4 - - logic [S:N][axi_id_width_p-1:0] axi_awid; - logic [S:N][axi_addr_width_p-1:0] axi_awaddr; - logic [S:N][7:0] axi_awlen; - logic [S:N][2:0] axi_awsize; - logic [S:N][1:0] axi_awburst; - logic [S:N][3:0] axi_awcache; - logic [S:N][2:0] axi_awprot; - logic [S:N] axi_awlock; - logic [S:N] axi_awvalid; - logic [S:N] axi_awready; - - logic [S:N][axi_data_width_p-1:0] axi_wdata; - logic [S:N][axi_strb_width_lp-1:0] axi_wstrb; - logic [S:N] axi_wlast; - logic [S:N] axi_wvalid; - logic [S:N] axi_wready; - - logic [S:N][axi_id_width_p-1:0] axi_bid; - logic [S:N][1:0] axi_bresp; - logic [S:N] axi_bvalid; - logic [S:N] axi_bready; - - logic [S:N][axi_id_width_p-1:0] axi_arid; - logic [S:N][axi_addr_width_p-1:0] axi_araddr; - logic [S:N][7:0] axi_arlen; - logic 
[S:N][2:0] axi_arsize; - logic [S:N][1:0] axi_arburst; - logic [S:N][3:0] axi_arcache; - logic [S:N][2:0] axi_arprot; - logic [S:N] axi_arlock; - logic [S:N] axi_arvalid; - logic [S:N] axi_arready; - - logic [S:N][axi_id_width_p-1:0] axi_rid; - logic [S:N][axi_data_width_p-1:0] axi_rdata; - logic [S:N][1:0] axi_rresp; - logic [S:N] axi_rlast; - logic [S:N] axi_rvalid; - logic [S:N] axi_rready; - - for (genvar i = N; i <= S; i++) begin - // bsg_cache_to_axi_hashed does not support dma_data_width_p yet. - // For this configuration, we just expect dma_data_width_p to be 32. - bsg_cache_to_axi_hashed #( - .addr_width_p(cache_addr_width_lp) - ,.block_size_in_words_p(vcache_block_size_in_words_p) - ,.data_width_p(data_width_p) - ,.num_cache_p(num_tiles_x_p) - ,.axi_id_width_p(axi_id_width_p) - ,.axi_addr_width_p(axi_addr_width_p) - ,.axi_data_width_p(axi_data_width_p) - ,.axi_burst_len_p(axi_burst_len_p) - ) cache_to_axi0 ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - - ,.dma_pkt_i(lv1_dma.dma_pkt[i]) - ,.dma_pkt_v_i(lv1_dma.dma_pkt_v_lo[i]) - ,.dma_pkt_yumi_o(lv1_dma.dma_pkt_yumi_li[i]) - - ,.dma_data_o(lv1_dma.dma_data_li[i]) - ,.dma_data_v_o(lv1_dma.dma_data_v_li[i]) - ,.dma_data_ready_i(lv1_dma.dma_data_ready_lo[i]) - - ,.dma_data_i(lv1_dma.dma_data_lo[i]) - ,.dma_data_v_i(lv1_dma.dma_data_v_lo[i]) - ,.dma_data_yumi_o(lv1_dma.dma_data_yumi_li[i]) - - ,.axi_awid_o(axi_awid[i]) - ,.axi_awaddr_o(axi_awaddr[i]) - ,.axi_awlen_o(axi_awlen[i]) - ,.axi_awsize_o(axi_awsize[i]) - ,.axi_awburst_o(axi_awburst[i]) - ,.axi_awcache_o(axi_awcache[i]) - ,.axi_awprot_o(axi_awprot[i]) - ,.axi_awlock_o(axi_awlock[i]) - ,.axi_awvalid_o(axi_awvalid[i]) - ,.axi_awready_i(axi_awready[i]) - - ,.axi_wdata_o(axi_wdata[i]) - ,.axi_wstrb_o(axi_wstrb[i]) - ,.axi_wlast_o(axi_wlast[i]) - ,.axi_wvalid_o(axi_wvalid[i]) - ,.axi_wready_i(axi_wready[i]) - - ,.axi_bid_i(axi_bid[i]) - ,.axi_bresp_i(axi_bresp[i]) - ,.axi_bvalid_i(axi_bvalid[i]) - ,.axi_bready_o(axi_bready[i]) - - ,.axi_arid_o(axi_arid[i]) 
- ,.axi_araddr_o(axi_araddr[i]) - ,.axi_arlen_o(axi_arlen[i]) - ,.axi_arsize_o(axi_arsize[i]) - ,.axi_arburst_o(axi_arburst[i]) - ,.axi_arcache_o(axi_arcache[i]) - ,.axi_arprot_o(axi_arprot[i]) - ,.axi_arlock_o(axi_arlock[i]) - ,.axi_arvalid_o(axi_arvalid[i]) - ,.axi_arready_i(axi_arready[i]) - - ,.axi_rid_i(axi_rid[i]) - ,.axi_rdata_i(axi_rdata[i]) - ,.axi_rresp_i(axi_rresp[i]) - ,.axi_rlast_i(axi_rlast[i]) - ,.axi_rvalid_i(axi_rvalid[i]) - ,.axi_rready_o(axi_rready[i]) - ); - end - end - else if (mem_cfg_lp[e_vcache_blocking_dmc_lpddr] - | mem_cfg_lp[e_vcache_non_blocking_dmc_lpddr]) begin: lv2_dmc - - logic [S:N] app_en; - logic [S:N] app_rdy; - logic [S:N][2:0] app_cmd; - logic [S:N][dram_ctrl_addr_width_p-1:0] app_addr; - - logic [S:N] app_wdf_wren; - logic [S:N] app_wdf_rdy; - logic [S:N][data_width_p-1:0] app_wdf_data; - logic [S:N][data_mask_width_lp-1:0] app_wdf_mask; - logic [S:N] app_wdf_end; - - logic [S:N] app_rd_data_valid; - logic [S:N][data_width_p-1:0] app_rd_data; - logic [S:N] app_rd_data_end; - - for (genvar i = N; i <= S; i++) begin: cache_to_dmc - bsg_cache_to_dram_ctrl #( - .num_cache_p(num_tiles_x_p) - ,.addr_width_p(cache_addr_width_lp) - ,.data_width_p(data_width_p) - ,.block_size_in_words_p(vcache_block_size_in_words_p) - ,.dram_ctrl_burst_len_p(vcache_block_size_in_words_p) - ,.dram_ctrl_addr_width_p(dram_ctrl_addr_width_p) - ) cache_to_dram_ctrl ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - - ,.dram_size_i(3'b100) // 4Gb - - ,.dma_pkt_i(lv1_dma.dma_pkt[i]) - ,.dma_pkt_v_i(lv1_dma.dma_pkt_v_lo[i]) - ,.dma_pkt_yumi_o(lv1_dma.dma_pkt_yumi_li[i]) - - ,.dma_data_o(lv1_dma.dma_data_li[i]) - ,.dma_data_v_o(lv1_dma.dma_data_v_li[i]) - ,.dma_data_ready_i(lv1_dma.dma_data_ready_lo[i]) - - ,.dma_data_i(lv1_dma.dma_data_lo[i]) - ,.dma_data_v_i(lv1_dma.dma_data_v_lo[i]) - ,.dma_data_yumi_o(lv1_dma.dma_data_yumi_li[i]) - - ,.app_en_o(app_en[i]) - ,.app_rdy_i(app_rdy[i]) - ,.app_cmd_o(app_cmd[i]) - ,.app_addr_o(app_addr[i]) - - 
,.app_wdf_wren_o(app_wdf_wren[i]) - ,.app_wdf_rdy_i(app_wdf_rdy[i]) - ,.app_wdf_data_o(app_wdf_data[i]) - ,.app_wdf_mask_o(app_wdf_mask[i]) - ,.app_wdf_end_o(app_wdf_end[i]) - - ,.app_rd_data_valid_i(app_rd_data_valid[i]) - ,.app_rd_data_i(app_rd_data[i]) - ,.app_rd_data_end_i(app_rd_data_end[i]) - ); - end - end - else if (mem_cfg_lp[e_vcache_blocking_dramsim3_hbm2]) begin: lv2_hbm2 - - - typedef struct packed { - logic [1:0] bg; - logic [1:0] ba; - logic [14:0] ro; - logic [5:0] co; - logic [4:0] byte_offset; - } dram_ch_addr_s; - - logic [S:N][hbm2_num_channel_per_side_p-1:0] dram_req_v_lo; - logic [S:N][hbm2_num_channel_per_side_p-1:0] dram_write_not_read_lo; - dram_ch_addr_s [S:N][hbm2_num_channel_per_side_p-1:0] dram_ch_addr_lo; - logic [S:N][hbm2_num_channel_per_side_p-1:0] dram_req_yumi_li; - - logic [S:N][hbm2_num_channel_per_side_p-1:0] dram_data_v_lo; - logic [S:N][hbm2_num_channel_per_side_p-1:0][hbm2_data_width_p-1:0] dram_data_lo; - logic [S:N][hbm2_num_channel_per_side_p-1:0] dram_data_yumi_li; - - logic [S:N][hbm2_num_channel_per_side_p-1:0] dram_data_v_li; - logic [S:N][hbm2_num_channel_per_side_p-1:0][hbm2_data_width_p-1:0] dram_data_li; - dram_ch_addr_s [S:N][hbm2_num_channel_per_side_p-1:0] dram_ch_addr_li; - - - for (genvar i = N; i <= S; i++) begin - for (genvar j = 0; j < hbm2_num_channel_per_side_p; j++) begin - bsg_cache_to_test_dram #( - .num_cache_p(hbm2_num_cache_per_channel_p) - ,.addr_width_p(cache_addr_width_lp) - ,.data_width_p(data_width_p) - ,.block_size_in_words_p(vcache_block_size_in_words_p) - ,.cache_bank_addr_width_p(cache_bank_addr_width_lp) // byte addr - ,.dma_data_width_p(vcache_dma_data_width_p) - - ,.dram_channel_addr_width_p(hbm2_channel_addr_width_p) - ,.dram_data_width_p(hbm2_data_width_p) - ) cache_to_test_dram0 ( - .core_clk_i(core_clk) - ,.core_reset_i(reset_r[2]) - - ,.dma_pkt_i(lv1_dma.dma_pkt[i][j*hbm2_num_cache_per_channel_p+:hbm2_num_cache_per_channel_p]) - 
,.dma_pkt_v_i(lv1_dma.dma_pkt_v_lo[i][j*hbm2_num_cache_per_channel_p+:hbm2_num_cache_per_channel_p]) - ,.dma_pkt_yumi_o(lv1_dma.dma_pkt_yumi_li[i][j*hbm2_num_cache_per_channel_p+:hbm2_num_cache_per_channel_p]) - - ,.dma_data_o(lv1_dma.dma_data_li[i][j*hbm2_num_cache_per_channel_p+:hbm2_num_cache_per_channel_p]) - ,.dma_data_v_o(lv1_dma.dma_data_v_li[i][j*hbm2_num_cache_per_channel_p+:hbm2_num_cache_per_channel_p]) - ,.dma_data_ready_i(lv1_dma.dma_data_ready_lo[i][j*hbm2_num_cache_per_channel_p+:hbm2_num_cache_per_channel_p]) - - ,.dma_data_i(lv1_dma.dma_data_lo[i][j*hbm2_num_cache_per_channel_p+:hbm2_num_cache_per_channel_p]) - ,.dma_data_v_i(lv1_dma.dma_data_v_lo[i][j*hbm2_num_cache_per_channel_p+:hbm2_num_cache_per_channel_p]) - ,.dma_data_yumi_o(lv1_dma.dma_data_yumi_li[i][j*hbm2_num_cache_per_channel_p+:hbm2_num_cache_per_channel_p]) - - ,.dram_clk_i(core_clk) - ,.dram_reset_i(reset_r[2]) - - ,.dram_req_v_o(dram_req_v_lo[i][j]) - ,.dram_write_not_read_o(dram_write_not_read_lo[i][j]) - ,.dram_ch_addr_o(dram_ch_addr_lo[i][j]) - ,.dram_req_yumi_i(dram_req_yumi_li[i][j]) - - ,.dram_data_v_o(dram_data_v_lo[i][j]) - ,.dram_data_o(dram_data_lo[i][j]) - ,.dram_data_yumi_i(dram_data_yumi_li[i][j]) - - ,.dram_data_v_i(dram_data_v_li[i][j]) - ,.dram_data_i(dram_data_li[i][j]) - ,.dram_ch_addr_i(dram_ch_addr_li[i][j]) - ); - end - end - end - - - // LEVEL 3 - // - if (mem_cfg_lp[e_vcache_blocking_axi4_nonsynth_mem] - | mem_cfg_lp[e_vcache_non_blocking_axi4_nonsynth_mem]) begin: lv3_axi_mem - - for (genvar i = N; i <= S; i++) begin - bsg_nonsynth_manycore_axi_mem #( - .axi_id_width_p(axi_id_width_p) - ,.axi_addr_width_p(axi_addr_width_p) - ,.axi_data_width_p(axi_data_width_p) - ,.axi_burst_len_p(axi_burst_len_p) - ,.mem_els_p(bsg_dram_size_p/(2*axi_data_width_p/data_width_p)) - ,.bsg_dram_included_p(bsg_dram_included_p) - ) axi_mem0 ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - - ,.axi_awid_i(lv2_axi4.axi_awid[i]) - ,.axi_awaddr_i(lv2_axi4.axi_awaddr[i]) - 
,.axi_awvalid_i(lv2_axi4.axi_awvalid[i]) - ,.axi_awready_o(lv2_axi4.axi_awready[i]) - - ,.axi_wdata_i(lv2_axi4.axi_wdata[i]) - ,.axi_wstrb_i(lv2_axi4.axi_wstrb[i]) - ,.axi_wlast_i(lv2_axi4.axi_wlast[i]) - ,.axi_wvalid_i(lv2_axi4.axi_wvalid[i]) - ,.axi_wready_o(lv2_axi4.axi_wready[i]) - - ,.axi_bid_o(lv2_axi4.axi_bid[i]) - ,.axi_bresp_o(lv2_axi4.axi_bresp[i]) - ,.axi_bvalid_o(lv2_axi4.axi_bvalid[i]) - ,.axi_bready_i(lv2_axi4.axi_bready[i]) - - ,.axi_arid_i(lv2_axi4.axi_arid[i]) - ,.axi_araddr_i(lv2_axi4.axi_araddr[i]) - ,.axi_arvalid_i(lv2_axi4.axi_arvalid[i]) - ,.axi_arready_o(lv2_axi4.axi_arready[i]) - - ,.axi_rid_o(lv2_axi4.axi_rid[i]) - ,.axi_rdata_o(lv2_axi4.axi_rdata[i]) - ,.axi_rresp_o(lv2_axi4.axi_rresp[i]) - ,.axi_rlast_o(lv2_axi4.axi_rlast[i]) - ,.axi_rvalid_o(lv2_axi4.axi_rvalid[i]) - ,.axi_rready_i(lv2_axi4.axi_rready[i]) - ); - end - end - else if (mem_cfg_lp[e_vcache_blocking_dmc_lpddr] - | mem_cfg_lp[e_vcache_non_blocking_dmc_lpddr]) begin: lv3_dmc - - import bsg_dmc_pkg::*; - - bsg_dmc_s dmc_p; - assign dmc_p.trefi = 16'd1023; - assign dmc_p.tmrd = 4'd1; - assign dmc_p.trfc = 4'd15; - assign dmc_p.trc = 4'd10; - assign dmc_p.trp = 4'd2; - assign dmc_p.tras = 4'd7; - assign dmc_p.trrd = 4'd1; - assign dmc_p.trcd = 4'd2; - assign dmc_p.twr = 4'd7; - assign dmc_p.twtr = 4'd7; - assign dmc_p.trtp = 4'd3; - assign dmc_p.tcas = 4'd3; - assign dmc_p.col_width = 4'd11; - assign dmc_p.row_width = 4'd14; - assign dmc_p.bank_width = 2'd2; - assign dmc_p.dqs_sel_cal = 2'd3; - assign dmc_p.init_cmd_cnt = 4'd5; - - localparam ui_addr_width_p = 27; // word address (512 MB) - localparam ui_data_width_p = data_width_p; - localparam burst_data_width_p = data_width_p * vcache_block_size_in_words_p; - localparam dq_data_width_p = data_width_p; - localparam dq_group_lp = dq_data_width_p >> 3; - - localparam dfi_clk_period_p = 5000; // 200 MHz - localparam dfi_clk_2x_period_p = 2500; // 400 MHz - - bit dfi_clk; - bit dfi_clk_2x; - - bsg_nonsynth_clock_gen #( - 
.cycle_time_p(dfi_clk_period_p) - ) dfi_cg ( - .o(dfi_clk) - ); - - bsg_nonsynth_clock_gen #( - .cycle_time_p(dfi_clk_2x_period_p) - ) dfi_2x_cg ( - .o(dfi_clk_2x) - ); - - wire [S:N] ddr_ck_p; - wire [S:N] ddr_ck_n; - wire [S:N] ddr_cke; - wire [S:N] ddr_cs_n; - wire [S:N] ddr_ras_n; - wire [S:N] ddr_cas_n; - wire [S:N] ddr_we_n; - wire [S:N][2:0] ddr_ba; - wire [S:N][15:0] ddr_addr; - - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dm_oen_lo; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dm_lo; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_p_oen_lo; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_p_ien_lo; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_p_lo; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_p_li; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_n_oen_lo; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_n_ien_lo; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_n_lo; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_n_li; - wire [S:N][dq_data_width_p-1:0] ddr_dq_oen_lo; - wire [S:N][dq_data_width_p-1:0] ddr_dq_lo; - wire [S:N][dq_data_width_p-1:0] ddr_dq_li; - - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dm; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_p; - wire [S:N][(dq_data_width_p>>3)-1:0] ddr_dqs_n; - wire [S:N][dq_data_width_p-1:0] ddr_dq; - - for (genvar j = N; j <= S; j++) begin - bsg_dmc #( - .ui_addr_width_p(ui_addr_width_p) - ,.ui_data_width_p(ui_data_width_p) - ,.burst_data_width_p(burst_data_width_p) - ,.dq_data_width_p(dq_data_width_p) - ) dmc ( - .dmc_p_i(dmc_p) - ,.sys_rst_i(reset_r[2]) - - ,.app_addr_i(lv2_dmc.app_addr[j][2+:ui_addr_width_p]) // word_address - ,.app_cmd_i(lv2_dmc.app_cmd[j]) - ,.app_en_i(lv2_dmc.app_en[j]) - ,.app_rdy_o(lv2_dmc.app_rdy[j]) - - ,.app_wdf_wren_i(lv2_dmc.app_wdf_wren[j]) - ,.app_wdf_data_i(lv2_dmc.app_wdf_data[j]) - ,.app_wdf_mask_i(lv2_dmc.app_wdf_mask[j]) - ,.app_wdf_end_i(lv2_dmc.app_wdf_end[j]) - ,.app_wdf_rdy_o(lv2_dmc.app_wdf_rdy[j]) - - ,.app_rd_data_valid_o(lv2_dmc.app_rd_data_valid[j]) - 
,.app_rd_data_o(lv2_dmc.app_rd_data[j]) - ,.app_rd_data_end_o(lv2_dmc.app_rd_data_end[j]) - - ,.app_ref_req_i(1'b0) - ,.app_ref_ack_o() - ,.app_zq_req_i(1'b0) - ,.app_zq_ack_o() - ,.app_sr_req_i(1'b0) - ,.app_sr_active_o() - - ,.init_calib_complete_o() - - ,.ddr_ck_p_o(ddr_ck_p[j]) - ,.ddr_ck_n_o(ddr_ck_n[j]) - ,.ddr_cke_o(ddr_cke[j]) - ,.ddr_ba_o(ddr_ba[j]) - ,.ddr_addr_o(ddr_addr[j]) - ,.ddr_cs_n_o(ddr_cs_n[j]) - ,.ddr_ras_n_o(ddr_ras_n[j]) - ,.ddr_cas_n_o(ddr_cas_n[j]) - ,.ddr_we_n_o(ddr_we_n[j]) - ,.ddr_reset_n_o() - ,.ddr_odt_o() - - ,.ddr_dm_oen_o(ddr_dm_oen_lo[j]) - ,.ddr_dm_o(ddr_dm_lo[j]) - ,.ddr_dqs_p_oen_o(ddr_dqs_p_oen_lo[j]) - ,.ddr_dqs_p_ien_o(ddr_dqs_p_ien_lo[j]) - ,.ddr_dqs_p_o(ddr_dqs_p_lo[j]) - ,.ddr_dqs_p_i(ddr_dqs_p_li[j]) - - ,.ddr_dqs_n_oen_o() - ,.ddr_dqs_n_ien_o() - ,.ddr_dqs_n_o() - ,.ddr_dqs_n_i() - - ,.ddr_dq_oen_o(ddr_dq_oen_lo[j]) - ,.ddr_dq_o(ddr_dq_lo[j]) - ,.ddr_dq_i(ddr_dq_li[j]) - - ,.ui_clk_i(core_clk) - - ,.dfi_clk_2x_i(~dfi_clk_2x) // invert this clk, so the posedge of 1x and 2x clk are aligned. - ,.dfi_clk_i(dfi_clk) - - ,.ui_clk_sync_rst_o() - ,.device_temp_o() - ); - - for (genvar i = 0; i < 2; i++) begin - mobile_ddr ddr_inst ( - .Dq(ddr_dq[j][16*i+:16]) - ,.Dqs(ddr_dqs_p[j][2*i+:2]) - ,.Addr(ddr_addr[j][13:0]) - ,.Ba(ddr_ba[j][1:0]) - ,.Clk(ddr_ck_p[j]) - ,.Clk_n(ddr_ck_n[j]) - ,.Cke(ddr_cke[j]) - ,.Cs_n(ddr_cs_n[j]) - ,.Ras_n(ddr_ras_n[j]) - ,.Cas_n(ddr_cas_n[j]) - ,.We_n(ddr_we_n[j]) - ,.Dm(ddr_dm[j][2*i+:2]) - ); - end - - for (genvar i = 0; i< dq_group_lp; i++) begin - assign ddr_dm[j][i] = ddr_dm_oen_lo[j][i] ? 1'bz : ddr_dm_lo[j][i]; - assign ddr_dqs_p[j][i] = ddr_dqs_p_oen_lo[j][i] ? 1'bz : ddr_dqs_p_lo[j][i]; - assign ddr_dqs_p_li[j][i] = ddr_dqs_p_ien_lo[j][i] ? 1'b1 : ddr_dqs_p[j][i]; - end - - for (genvar i = 0; i < dq_data_width_p; i++) begin - assign ddr_dq[j][i] = ddr_dq_oen_lo[j][i] ? 
1'bz : ddr_dq_lo[j][i]; - assign ddr_dq_li[j][i] = ddr_dq[j][i]; - end - end - end - else if (mem_cfg_lp[e_vcache_blocking_dramsim3_hbm2]) begin: lv3_hbm2 - - typedef struct packed { - logic [14:0] ro; - logic [1:0] bg; - logic [1:0] ba; - logic [5:0] co; - logic [4:0] byte_offset; - } dram_ch_addr_rev_s; - - logic [hbm2_num_channels_p-1:0] dramsim3_v_li; - logic [hbm2_num_channels_p-1:0] dramsim3_write_not_read_li; - logic [hbm2_num_channels_p-1:0][hbm2_channel_addr_width_p-1:0] dramsim3_ch_addr_li; - logic [hbm2_num_channels_p-1:0] dramsim3_yumi_lo; - - logic [hbm2_num_channels_p-1:0][hbm2_data_width_p-1:0] dramsim3_data_li; - logic [hbm2_num_channels_p-1:0] dramsim3_data_v_li; - logic [hbm2_num_channels_p-1:0] dramsim3_data_yumi_lo; - - logic [hbm2_num_channels_p-1:0][hbm2_data_width_p-1:0] dramsim3_data_lo; - logic [hbm2_num_channels_p-1:0] dramsim3_data_v_lo; - dram_ch_addr_rev_s [hbm2_num_channels_p-1:0] dramsim3_ch_addr_lo; - - bsg_nonsynth_dramsim3 #( - .channel_addr_width_p(hbm2_channel_addr_width_p) - ,.data_width_p(hbm2_data_width_p) - ,.num_channels_p(hbm2_num_channels_p) - ,.num_columns_p(`dram_pkg::num_columns_p) - ,.num_rows_p(`dram_pkg::num_rows_p) - ,.num_ba_p(`dram_pkg::num_ba_p) - ,.num_bg_p(`dram_pkg::num_bg_p) - ,.num_ranks_p(`dram_pkg::num_ranks_p) - ,.address_mapping_p(`dram_pkg::address_mapping_p) - ,.size_in_bits_p(`dram_pkg::size_in_bits_p) - ,.config_p(`dram_pkg::config_p) - ,.init_mem_p(1) - ) hbm0 ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - - ,.v_i(dramsim3_v_li) - ,.write_not_read_i(dramsim3_write_not_read_li) - ,.ch_addr_i(dramsim3_ch_addr_li) - ,.mask_i('1) - ,.yumi_o(dramsim3_yumi_lo) - - ,.data_v_i(dramsim3_data_v_li) - ,.data_i(dramsim3_data_li) - ,.data_yumi_o(dramsim3_data_yumi_lo) - - ,.data_v_o(dramsim3_data_v_lo) - ,.data_o(dramsim3_data_lo) - ,.read_done_ch_addr_o(dramsim3_ch_addr_lo) - - ,.write_done_o() - ,.write_done_ch_addr_o() - ); - - for (genvar i = N; i <= S; i++) begin - for (genvar j = 0; j < 
hbm2_num_channel_per_side_p; j++) begin - localparam ch_idx_lp = j+(hbm2_num_channel_per_side_p*(i-N)); - - assign dramsim3_v_li[ch_idx_lp] = lv2_hbm2.dram_req_v_lo[i][j]; - assign dramsim3_write_not_read_li[ch_idx_lp] = lv2_hbm2.dram_write_not_read_lo[i][j]; - assign dramsim3_ch_addr_li[ch_idx_lp] = { - lv2_hbm2.dram_ch_addr_lo[i][j].ro, - lv2_hbm2.dram_ch_addr_lo[i][j].bg, - lv2_hbm2.dram_ch_addr_lo[i][j].ba, - lv2_hbm2.dram_ch_addr_lo[i][j].co, - lv2_hbm2.dram_ch_addr_lo[i][j].byte_offset - }; - assign lv2_hbm2.dram_req_yumi_li[i][j] = dramsim3_yumi_lo[ch_idx_lp]; - - assign dramsim3_data_v_li[ch_idx_lp] = lv2_hbm2.dram_data_v_lo[i][j]; - assign dramsim3_data_li[ch_idx_lp] = lv2_hbm2.dram_data_lo[i][j]; - assign lv2_hbm2.dram_data_yumi_li[i][j] = dramsim3_data_yumi_lo[ch_idx_lp]; - - assign lv2_hbm2.dram_data_v_li[i][j] = dramsim3_data_v_lo[ch_idx_lp]; - assign lv2_hbm2.dram_data_li[i][j] = dramsim3_data_lo[ch_idx_lp]; - assign lv2_hbm2.dram_ch_addr_li[i][j] = { - dramsim3_ch_addr_lo[ch_idx_lp].bg, - dramsim3_ch_addr_lo[ch_idx_lp].ba, - dramsim3_ch_addr_lo[ch_idx_lp].ro, - dramsim3_ch_addr_lo[ch_idx_lp].co, - dramsim3_ch_addr_lo[ch_idx_lp].byte_offset - }; - end - end - end - - - - // vanilla core tracer - // + // trace enable int status; int trace_arg; logic trace_en; - initial begin status = $value$plusargs("vanilla_trace_en=%d", trace_arg); assign trace_en = (trace_arg == 1); end - bind vanilla_core vanilla_core_trace #( - .x_cord_width_p(x_cord_width_p) - ,.y_cord_width_p(y_cord_width_p) - ,.icache_tag_width_p(icache_tag_width_p) - ,.icache_entries_p(icache_entries_p) - ,.data_width_p(data_width_p) - ,.dmem_size_p(dmem_size_p) - ) vtrace ( - .* - ,.trace_en_i($root.spmd_testbench.trace_en) - ); - - bind vanilla_core instr_trace #( - .x_cord_width_p(x_cord_width_p) - ,.y_cord_width_p(y_cord_width_p) - ) itrace( - .* - ,.trace_en_i($root.spmd_testbench.trace_en) - ); - - // profiler - // - bind vanilla_core vanilla_core_profiler #( - 
.x_cord_width_p(x_cord_width_p) - ,.y_cord_width_p(y_cord_width_p) - ,.icache_tag_width_p(icache_tag_width_p) - ,.icache_entries_p(icache_entries_p) - ,.data_width_p(data_width_p) - ,.origin_x_cord_p(`ORIGIN_X_CORD_P) - ,.origin_y_cord_p(`ORIGIN_Y_CORD_P) - ) vcore_prof ( - .* - ,.global_ctr_i($root.spmd_testbench.global_ctr) - ,.print_stat_v_i($root.spmd_testbench.print_stat_v) - ,.print_stat_tag_i($root.spmd_testbench.print_stat_tag) - ,.trace_en_i($root.spmd_testbench.trace_en) - ); - - - // router profiler - if (network_cfg_lp[e_network_mesh] | network_cfg_lp[e_network_half_ruche_x]) begin - bind bsg_mesh_router router_profiler #( - .x_cord_width_p(x_cord_width_p) - ,.y_cord_width_p(y_cord_width_p) - ,.dims_p(dims_p) - ,.XY_order_p(XY_order_p) - ,.origin_x_cord_p(`ORIGIN_X_CORD_P) - ,.origin_y_cord_p(`ORIGIN_Y_CORD_P) - ) rp0 ( - .* - ,.global_ctr_i($root.spmd_testbench.global_ctr) - ,.trace_en_i($root.spmd_testbench.trace_en) - ,.print_stat_v_i($root.spmd_testbench.print_stat_v) - ); - - end - - // remote load tracer - bind network_tx remote_load_trace #( - .addr_width_p(addr_width_p) - ,.data_width_p(data_width_p) - ,.x_cord_width_p(x_cord_width_p) - ,.y_cord_width_p(y_cord_width_p) - ,.origin_x_cord_p(`ORIGIN_X_CORD_P) - ,.origin_y_cord_p(`ORIGIN_Y_CORD_P) - ) rlt ( - .* - ,.global_ctr_i($root.spmd_testbench.global_ctr) - ,.trace_en_i($root.spmd_testbench.trace_en) + // global counter + logic [31:0] global_ctr; + bsg_cycle_counter global_cc ( + .clk_i(core_clk) + ,.reset_i(reset_r) + ,.ctr_r_o(global_ctr) ); - // tieoffs - // - for (genvar i = 0; i < num_tiles_y_p; i++) begin: we_tieoff - - bsg_manycore_link_sif_tieoff #( - .addr_width_p(bsg_max_epa_width_p) - ,.data_width_p(data_width_p) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - ) tieoff_w ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - ,.link_sif_i(hor_link_lo[W][i]) - ,.link_sif_o(hor_link_li[W][i]) - ); - - bsg_manycore_link_sif_tieoff #( - 
.addr_width_p(bsg_max_epa_width_p) - ,.data_width_p(data_width_p) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - ) tieoff_e ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - ,.link_sif_i(hor_link_lo[E][i]) - ,.link_sif_o(hor_link_li[E][i]) - ); - end - - for (genvar i = 1; i < num_tiles_x_p; i++) begin: io_tieoff - bsg_manycore_link_sif_tieoff #( - .addr_width_p(bsg_max_epa_width_p) - ,.data_width_p(data_width_p) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - ) tieoff_io ( - .clk_i(core_clk) - ,.reset_i(reset_r[2]) - ,.link_sif_i(io_link_lo[i]) - ,.link_sif_o(io_link_li[i]) - ); - end - - - - endmodule - - diff --git a/testbenches/common/v/vanilla_core_profiler.v b/testbenches/common/v/vanilla_core_profiler.v index 1cbdec7f7..e39a551b4 100644 --- a/testbenches/common/v/vanilla_core_profiler.v +++ b/testbenches/common/v/vanilla_core_profiler.v @@ -85,8 +85,8 @@ module vanilla_core_profiler , input exe_signals_s exe_r , input fp_exe_signals_s fp_exe_r - , input [x_cord_width_p-1:0] my_x_i - , input [y_cord_width_p-1:0] my_y_i + , input [x_cord_width_p-1:0] global_x_i + , input [y_cord_width_p-1:0] global_y_i , input [31:0] global_ctr_i , input print_stat_v_i @@ -102,7 +102,7 @@ module vanilla_core_profiler // task to print a line of operation trace task print_operation_trace(integer fd, string op, logic [data_width_p-1:0] pc); - $fwrite(fd, "%0d,%0d,%0d,%0h,%s\n", global_ctr_i, my_x_i - origin_x_cord_p, my_y_i - origin_y_cord_p, pc, op); + $fwrite(fd, "%0d,%0d,%0d,%0h,%s\n", global_ctr_i, global_x_i - origin_x_cord_p, global_y_i - origin_y_cord_p, pc, op); endtask @@ -177,8 +177,8 @@ module vanilla_core_profiler wire lr_inc = exe_r.decode.is_lr_op; wire lr_aq_inc = exe_r.decode.is_lr_aq_op; - wire amoswap_inc = exe_r.decode.is_amo_op & (exe_r.decode.amo_type == e_amo_swap); - wire amoor_inc = exe_r.decode.is_amo_op & (exe_r.decode.amo_type == e_amo_or); + wire amoswap_inc = exe_r.decode.is_amo_op & 
(exe_r.decode.amo_type == e_vanilla_amoswap); + wire amoor_inc = exe_r.decode.is_amo_op & (exe_r.decode.amo_type == e_vanilla_amoor); // branch & jump wire beq_inc = exe_r.decode.is_branch_op & (exe_r.instruction ==? `RV32_BEQ); @@ -900,7 +900,7 @@ module vanilla_core_profiler always @(negedge reset_i) begin // the origin tile opens the logfile and writes the csv header. - if ((my_x_i == x_cord_width_p'(origin_x_cord_p)) & (my_y_i == y_cord_width_p'(origin_y_cord_p))) begin + if ((global_x_i == x_cord_width_p'(origin_x_cord_p)) & (global_y_i == y_cord_width_p'(origin_y_cord_p))) begin fd = $fopen(logfile_lp, "a"); $fwrite(fd, "time,"); $fwrite(fd, "x,"); @@ -1057,13 +1057,13 @@ module vanilla_core_profiler always @(negedge clk_i) begin // stat printing - if (~reset_i & print_stat_v_i & print_stat_tag.y_cord == my_y_i & print_stat_tag.x_cord == my_x_i) begin - $display("[BSG_INFO][VCORE_PROFILER] t=%0t x,y=%02d,%02d printing stats.", $time, my_x_i, my_y_i); + if (~reset_i & print_stat_v_i & print_stat_tag.y_cord == global_y_i & print_stat_tag.x_cord == global_x_i) begin + $display("[BSG_INFO][VCORE_PROFILER] t=%0t x,y=%02d,%02d printing stats.", $time, global_x_i, global_y_i); fd = $fopen(logfile_lp, "a"); $fwrite(fd, "%0d,", $time); - $fwrite(fd, "%0d,", my_x_i - origin_x_cord_p); - $fwrite(fd, "%0d,", my_y_i - origin_y_cord_p); + $fwrite(fd, "%0d,", global_x_i - origin_x_cord_p); + $fwrite(fd, "%0d,", global_y_i - origin_y_cord_p); $fwrite(fd, "%0d,", pc_r); $fwrite(fd, "%0d,", pc_n); $fwrite(fd, "%0d,", print_stat_tag_i); diff --git a/testbenches/common/v/vanilla_core_saif_dumper.v b/testbenches/common/v/vanilla_core_saif_dumper.v new file mode 100644 index 000000000..249a33fcf --- /dev/null +++ b/testbenches/common/v/vanilla_core_saif_dumper.v @@ -0,0 +1,80 @@ +/* + * The vanilla_core_saif_dumper is a bind module that attaches to the + * vanilla core. 
It observes instructions in the execution stage and + * sets saif_en_o to 1 when SAIF_TRIGGER_START is executed (see + * v/vanilla_bean/bsg_vanilla_pkg.v for instruction definition) and sets + * saif_en_o to 0 when SAIF_TRIGGER_END is executed. + * + * saif_en_i comes from the module that declares the bind statment. It + * should be 1 if any vanilla_core_saif_dumper module has saif_en_o + * set to 1 and 0 otherwise. + * + * If SAIF_TRIGGER_START is executed and saif_en_i is 0, then this + * module calls functions to start tracking toggling for saif + * generation. + * + * If SAIF_TRIGGER_END is executed and saif_en_i is 0, then this + * module calls functions to write run.saif + * + */ +module vanilla_core_saif_dumper + import bsg_manycore_pkg::*; + import bsg_vanilla_pkg::*; + #(parameter debug_p = 1 // Turns on display statments + ) + (input clk_i + , input reset_i + , input stall_all + , input exe_signals_s exe_r + + , input saif_en_i + , output logic saif_en_o + ); + + wire trigger_start = (exe_r.instruction ==? `SAIF_TRIGGER_START) & ~stall_all; + wire trigger_end = (exe_r.instruction ==? 
`SAIF_TRIGGER_END) & ~stall_all; + + logic saif_en_r; + + always @ (negedge clk_i) begin + if (reset_i) begin + saif_en_o <= 1'b0; + saif_en_r <= 1'b0; + end + else begin + saif_en_r <= saif_en_i; + + if(trigger_start) begin + saif_en_o <= 1'b1; + if(debug_p) + $display("TRIGGER_START: i=%b, o=%b, r=%b t=%t (%m)",saif_en_i,saif_en_o, saif_en_o, $time); + end + + if (trigger_end) begin + saif_en_o <= 1'b0; + if(debug_p) + $display("TRIGGER_END: i=%b,o=%b t=%t (%m)", saif_en_i, saif_en_o, $time); + end + + end + end // always @ (posedge clk_i) + + always @(posedge clk_i) begin + if(saif_en_i ^ saif_en_r) begin + if(trigger_start) begin + if(debug_p) + $display("TRIGGER_ON t=%t (%m)", $time); + $set_gate_level_monitoring("rtl_on", "sv"); + $set_toggle_region(`HOST_MODULE_PATH.testbench.DUT); + $toggle_start(); + end + if(trigger_end) begin + if(debug_p) + $display("TRIGGER_OFF t=%t (%m)", $time); + $toggle_stop(); + $toggle_report("run.saif", 1.0e-12, `HOST_MODULE_PATH.testbench.DUT); + end + end // if (saif_en_i ^ saif_en_r) + end // always @ (negedge clk_i) + +endmodule diff --git a/testbenches/common/v/vcache_dma_to_dram_channel_map.v b/testbenches/common/v/vcache_dma_to_dram_channel_map.v new file mode 100644 index 000000000..a9cf15965 --- /dev/null +++ b/testbenches/common/v/vcache_dma_to_dram_channel_map.v @@ -0,0 +1,146 @@ +/** + * vcache_dma_to_dram_channel_map.v + * + */ + + + +module vcache_dma_to_dram_channel_map + import bsg_cache_pkg::*; + import bsg_noc_pkg::*; + #(parameter num_pods_y_p="inv" + , parameter num_pods_x_p="inv" + , parameter num_tiles_x_p="inv" + + , parameter wh_ruche_factor_p="inv" + + , parameter num_vcache_rows_p="inv" + , parameter vcache_addr_width_p="inv" + , parameter vcache_dma_data_width_p="inv" + + , parameter num_vcaches_per_link_lp = (num_tiles_x_p*num_pods_x_p)/wh_ruche_factor_p/2 + , parameter num_total_vcaches_lp = (num_pods_x_p*num_pods_y_p*2*num_tiles_x_p*num_vcache_rows_p) + + , parameter num_vcaches_per_slice_lp = 
(num_pods_x_p == 1) + ? (num_tiles_x_p/2) + : (num_tiles_x_p) + , parameter cache_dma_pkt_width_lp=`bsg_cache_dma_pkt_width(vcache_addr_width_p) + ) + ( + // unmapped + input [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][num_vcaches_per_link_lp-1:0][cache_dma_pkt_width_lp-1:0] dma_pkt_i + , input [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][num_vcaches_per_link_lp-1:0] dma_pkt_v_i + , output logic [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][num_vcaches_per_link_lp-1:0] dma_pkt_yumi_o + + , output logic [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][num_vcaches_per_link_lp-1:0][vcache_dma_data_width_p-1:0] dma_data_o + , output logic [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][num_vcaches_per_link_lp-1:0] dma_data_v_o + , input [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][num_vcaches_per_link_lp-1:0] dma_data_ready_i + + , input [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][num_vcaches_per_link_lp-1:0][vcache_dma_data_width_p-1:0] dma_data_i + , input [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][num_vcaches_per_link_lp-1:0] dma_data_v_i + , output logic [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][num_vcaches_per_link_lp-1:0] dma_data_yumi_o + + // remapped + , output logic [num_total_vcaches_lp-1:0][cache_dma_pkt_width_lp-1:0] remapped_dma_pkt_o + , output logic [num_total_vcaches_lp-1:0] remapped_dma_pkt_v_o + , input [num_total_vcaches_lp-1:0] remapped_dma_pkt_yumi_i + + , input [num_total_vcaches_lp-1:0][vcache_dma_data_width_p-1:0] remapped_dma_data_i + , input [num_total_vcaches_lp-1:0] remapped_dma_data_v_i + , output logic [num_total_vcaches_lp-1:0] remapped_dma_data_ready_o + + , output logic [num_total_vcaches_lp-1:0][vcache_dma_data_width_p-1:0] remapped_dma_data_o + , output logic 
[num_total_vcaches_lp-1:0] remapped_dma_data_v_o + , input [num_total_vcaches_lp-1:0] remapped_dma_data_yumi_i + + ); + + + `declare_bsg_cache_dma_pkt_s(vcache_addr_width_p); + + + // cache dma unruched mapping + bsg_cache_dma_pkt_s [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][(num_tiles_x_p*num_pods_x_p/2)-1:0] unruched_dma_pkt_lo; + logic [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][(num_tiles_x_p*num_pods_x_p/2)-1:0][vcache_dma_data_width_p-1:0] unruched_dma_data_li, unruched_dma_data_lo; + logic [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][(num_tiles_x_p*num_pods_x_p/2)-1:0] unruched_dma_pkt_v_lo, unruched_dma_pkt_yumi_li, + unruched_dma_data_v_li, unruched_dma_data_ready_lo, + unruched_dma_data_v_lo, unruched_dma_data_yumi_li; + + for (genvar i = W; i <= E; i++) begin + for (genvar j = 0; j < num_pods_y_p; j++) begin + for (genvar k = N; k <= S; k++) begin + for (genvar r = 0; r < num_vcache_rows_p; r++) begin + for (genvar l = 0; l < (num_tiles_x_p*num_pods_x_p/2); l++) begin + + assign unruched_dma_pkt_lo[i][j][k][r][l] = dma_pkt_i[i][j][k][r][l%wh_ruche_factor_p][l/wh_ruche_factor_p]; + assign unruched_dma_pkt_v_lo[i][j][k][r][l] = dma_pkt_v_i[i][j][k][r][l%wh_ruche_factor_p][l/wh_ruche_factor_p]; + assign dma_pkt_yumi_o[i][j][k][r][l%wh_ruche_factor_p][l/wh_ruche_factor_p] = unruched_dma_pkt_yumi_li[i][j][k][r][l]; + + assign dma_data_o[i][j][k][r][l%wh_ruche_factor_p][l/wh_ruche_factor_p] = unruched_dma_data_li[i][j][k][r][l]; + assign dma_data_v_o[i][j][k][r][l%wh_ruche_factor_p][l/wh_ruche_factor_p] = unruched_dma_data_v_li[i][j][k][r][l]; + assign unruched_dma_data_ready_lo[i][j][k][r][l] = dma_data_ready_i[i][j][k][r][l%wh_ruche_factor_p][l/wh_ruche_factor_p]; + + assign unruched_dma_data_lo[i][j][k][r][l] = dma_data_i[i][j][k][r][l%wh_ruche_factor_p][l/wh_ruche_factor_p]; + assign unruched_dma_data_v_lo[i][j][k][r][l] = dma_data_v_i[i][j][k][r][l%wh_ruche_factor_p][l/wh_ruche_factor_p]; + assign 
dma_data_yumi_o[i][j][k][r][l%wh_ruche_factor_p][l/wh_ruche_factor_p] = unruched_dma_data_yumi_li[i][j][k][r][l]; + + end + end + end + end + end + + // flatten rows + bsg_cache_dma_pkt_s [E:W][num_pods_y_p-1:0][(num_tiles_x_p*num_pods_x_p*num_vcache_rows_p)-1:0] flattened_dma_pkt_lo; + logic [E:W][num_pods_y_p-1:0][(num_tiles_x_p*num_pods_x_p*num_vcache_rows_p)-1:0][vcache_dma_data_width_p-1:0] flattened_dma_data_li, flattened_dma_data_lo; + logic [E:W][num_pods_y_p-1:0][(num_tiles_x_p*num_pods_x_p*num_vcache_rows_p)-1:0] flattened_dma_pkt_v_lo, flattened_dma_pkt_yumi_li, + flattened_dma_data_v_li, flattened_dma_data_ready_lo, + flattened_dma_data_v_lo, flattened_dma_data_yumi_li; + + for (genvar i = W; i <= E; i++) begin + for (genvar j = 0; j < num_pods_y_p; j++) begin + for (genvar k = N; k <= S; k++) begin + for (genvar r = 0; r < num_vcache_rows_p; r++) begin + for (genvar l = 0; l < (num_tiles_x_p*num_pods_x_p/2); l++) begin + + localparam idx = (l % num_vcaches_per_slice_lp) + + ((k == S) ? 
num_vcaches_per_slice_lp : 0) + + (2*r*num_vcaches_per_slice_lp) + + (l/num_vcaches_per_slice_lp)*(2*num_vcache_rows_p*num_vcaches_per_slice_lp); + + assign flattened_dma_pkt_lo[i][j][idx] = unruched_dma_pkt_lo[i][j][k][r][l]; + assign flattened_dma_pkt_v_lo[i][j][idx] = unruched_dma_pkt_v_lo[i][j][k][r][l]; + assign unruched_dma_pkt_yumi_li[i][j][k][r][l] = flattened_dma_pkt_yumi_li[i][j][idx]; + + assign unruched_dma_data_li[i][j][k][r][l] = flattened_dma_data_li[i][j][idx]; + assign unruched_dma_data_v_li[i][j][k][r][l] = flattened_dma_data_v_li[i][j][idx]; + assign flattened_dma_data_ready_lo[i][j][idx] = unruched_dma_data_ready_lo[i][j][k][r][l]; + + assign flattened_dma_data_lo[i][j][idx] = unruched_dma_data_lo[i][j][k][r][l]; + assign flattened_dma_data_v_lo[i][j][idx] = unruched_dma_data_v_lo[i][j][k][r][l]; + assign unruched_dma_data_yumi_li[i][j][k][r][l] = flattened_dma_data_yumi_li[i][j][idx]; + + end + end + end + end + end + + + // connect to remapped + assign remapped_dma_pkt_o = flattened_dma_pkt_lo; + assign remapped_dma_pkt_v_o = flattened_dma_pkt_v_lo; + assign flattened_dma_pkt_yumi_li = remapped_dma_pkt_yumi_i; + + assign flattened_dma_data_li = remapped_dma_data_i; + assign flattened_dma_data_v_li = remapped_dma_data_v_i; + assign remapped_dma_data_ready_o = flattened_dma_data_ready_lo; + + + assign remapped_dma_data_o = flattened_dma_data_lo; + assign remapped_dma_data_v_o = flattened_dma_data_v_lo; + assign flattened_dma_data_yumi_li = remapped_dma_data_yumi_i; + + + +endmodule diff --git a/testbenches/dpi/bsg_nonsynth_dpi_manycore.hpp b/testbenches/dpi/bsg_nonsynth_dpi_manycore.hpp index fbd1d06b2..0f81f6ed9 100644 --- a/testbenches/dpi/bsg_nonsynth_dpi_manycore.hpp +++ b/testbenches/dpi/bsg_nonsynth_dpi_manycore.hpp @@ -18,7 +18,8 @@ // corresponding function declarations in bsg_nonsynth_dpi_manycore.v // for additional information. 
extern "C" { - extern unsigned char bsg_dpi_credits_is_window(); + extern unsigned char bsg_dpi_is_window(); + extern unsigned char bsg_dpi_reset_is_done(); extern unsigned char bsg_dpi_tx_is_vacant(); extern int bsg_dpi_credits_get_cur(); extern int bsg_dpi_credits_get_max(); @@ -52,6 +53,7 @@ namespace bsg_nonsynth_dpi{ int max_credits = -1; // Current available credits (used for flow control, and fences) int cur_credits = 0; + bool reset_done = false; public: // Stores configuration data for the manycore DUT. // Each entry is an unsigned 32-bit value @@ -89,8 +91,17 @@ namespace bsg_nonsynth_dpi{ * window. */ int get_credits(int& credits){ + int res = BSG_NONSYNTH_DPI_SUCCESS; prev = svSetScope(scope); - if(!bsg_dpi_credits_is_window()){ + if(!reset_done) + res = reset_is_done(reset_done); + + if(res != BSG_NONSYNTH_DPI_SUCCESS){ + svSetScope(prev); + return res; + } + + if(!bsg_dpi_is_window()){ svSetScope(prev); return BSG_NONSYNTH_DPI_NOT_WINDOW; } @@ -112,7 +123,7 @@ namespace bsg_nonsynth_dpi{ */ int tx_is_vacant(bool& vacant){ prev = svSetScope(scope); - if(!bsg_dpi_credits_is_window()){ + if(!bsg_dpi_is_window()){ svSetScope(prev); return BSG_NONSYNTH_DPI_NOT_WINDOW; } @@ -122,6 +133,34 @@ namespace bsg_nonsynth_dpi{ return BSG_NONSYNTH_DPI_SUCCESS; } + /** + * Determines if reset is done + * + * @param[out] done Boolean value, true if reset is + * done + * + * @return BSG_NONSYNTH_DPI_SUCCESS on success, + * BSG_NONSYNTH_DPI_BUSY when not done + * window. + */ + int reset_is_done(bool& done){ + prev = svSetScope(scope); + if(!bsg_dpi_is_window()){ + svSetScope(prev); + return BSG_NONSYNTH_DPI_NOT_WINDOW; + } + + done = bsg_dpi_reset_is_done(); + if(!done){ + return BSG_NONSYNTH_DPI_BUSY; + svSetScope(prev); + } + + svSetScope(prev); + return BSG_NONSYNTH_DPI_SUCCESS; + } + + /** * Transmit a request packet onto the manycore network * using the DPI interface. 
@@ -131,6 +170,7 @@ namespace bsg_nonsynth_dpi{ * * @return BSG_NONSYNTH_DPI_SUCCESS on success * (Recoverable Errors) + * BSG_NONSYNTH_DPI_BUSY when reset is not done * BSG_NONSYNTH_DPI_NOT_WINDOW when not in valid clock window * BSG_NONSYNTH_DPI_NO_CREDITS when no transmit credits are available * BSG_NONSYNTH_DPI_NOT_READY when the packet was not transmitted (call again next cycle) @@ -138,6 +178,12 @@ namespace bsg_nonsynth_dpi{ int tx_req(const __m128i &data){ int res = BSG_NONSYNTH_DPI_SUCCESS; + if(!reset_done) + res = reset_is_done(reset_done); + + if(res != BSG_NONSYNTH_DPI_SUCCESS) + return res; + // Get credits checks for valid window if(!cur_credits) res = get_credits(cur_credits); @@ -166,10 +212,18 @@ namespace bsg_nonsynth_dpi{ * * @return BSG_NONSYNTH_DPI_SUCCESS on success * (Recoverable Errors) + * BSG_NONSYNTH_DPI_BUSY when reset is not done * BSG_NONSYNTH_DPI_NOT_WINDOW when not in valid clock window * BSG_NONSYNTH_DPI_NOT_VALID when no packet is available */ int rx_rsp(__m128i &data){ + int res = BSG_NONSYNTH_DPI_SUCCESS; + if(!reset_done) + res = reset_is_done(reset_done); + + if(res != BSG_NONSYNTH_DPI_SUCCESS) + return res; + return f2d_rsp.try_rx(data); } @@ -181,10 +235,18 @@ namespace bsg_nonsynth_dpi{ * * @return BSG_NONSYNTH_DPI_SUCCESS on success * (Recoverable Errors) + * BSG_NONSYNTH_DPI_BUSY when reset is not done * BSG_NONSYNTH_DPI_NOT_WINDOW when not in valid clock window * BSG_NONSYNTH_DPI_NOT_VALID when no packet is available */ int rx_req(__m128i &data){ + int res = BSG_NONSYNTH_DPI_SUCCESS; + if(!reset_done) + res = reset_is_done(reset_done); + + if(res != BSG_NONSYNTH_DPI_SUCCESS) + return res; + return f2d_req.try_rx(data); } diff --git a/testbenches/dpi/bsg_nonsynth_dpi_manycore.v b/testbenches/dpi/bsg_nonsynth_dpi_manycore.v index d371ac5cb..7e2a13f70 100644 --- a/testbenches/dpi/bsg_nonsynth_dpi_manycore.v +++ b/testbenches/dpi/bsg_nonsynth_dpi_manycore.v @@ -29,6 +29,8 @@ module bsg_nonsynth_dpi_manycore ,input 
[x_cord_width_p-1:0] my_x_i ,input [y_cord_width_p-1:0] my_y_i + + ,input reset_done_i ,output bit debug_o); @@ -220,7 +222,8 @@ module bsg_nonsynth_dpi_manycore export "DPI-C" function bsg_dpi_fini; export "DPI-C" function bsg_dpi_debug; export "DPI-C" function bsg_dpi_tx_is_vacant; - export "DPI-C" function bsg_dpi_credits_is_window; + export "DPI-C" function bsg_dpi_is_window; + export "DPI-C" function bsg_dpi_reset_is_done; export "DPI-C" function bsg_dpi_credits_get_cur; export "DPI-C" function bsg_dpi_credits_get_max; @@ -243,6 +246,10 @@ module bsg_nonsynth_dpi_manycore $fatal(1, "BSG ERROR (%M): credits_get_cur() called while reset_i === 1"); end + if(reset_done_i === 0) begin + $fatal(1, "BSG ERROR (%M): credits_get_cur() called while reset_done_i === 0"); + end + if(clk_i === 0) begin $fatal(1, "BSG ERROR (%M): credits_get_cur() must be called when clk_i == 1"); end @@ -266,6 +273,10 @@ module bsg_nonsynth_dpi_manycore $fatal(1, "BSG ERROR (%M): tx_is_vacant() called while reset_i === 1"); end + if(reset_done_i === 0) begin + $fatal(1, "BSG ERROR (%M): credits_get_cur() called while reset_done_i === 0"); + end + if(clk_i === 0) begin $fatal(1, "BSG ERROR (%M): tx_is_vacant() must be called when clk_i == 1"); end @@ -281,12 +292,18 @@ module bsg_nonsynth_dpi_manycore return ~host_req_v_lo; endfunction - // The function credits_is_window returns true if the interface is + // The function is_window returns true if the interface is // in a valid time-window to call read_credits() - function bit bsg_dpi_credits_is_window(); + function bit bsg_dpi_is_window(); return (clk_i & edgepol_l & ~reset_i); endfunction + // The function is_reset_done returns true if the clock is high, + // and reset is done, and the module is no longer in reset. 
+ function bit bsg_dpi_reset_is_done(); + return (clk_i & edgepol_l & ~reset_i & reset_done_i); + endfunction + // Initialize this Manycore DPI Interface function void bsg_dpi_init(); if(init_l) diff --git a/testbenches/py/bsg_tag_trace_gen.py b/testbenches/py/bsg_tag_trace_gen.py new file mode 100644 index 000000000..8b3efd1fe --- /dev/null +++ b/testbenches/py/bsg_tag_trace_gen.py @@ -0,0 +1,60 @@ +# +# bsg_tag_trace_gen.py +# + +import math + +SEND_OP = 0b0001 +DONE_OP = 0b0011 +COUNT_INIT_OP = 0b0110 +COUNT_WAIT_OP = 0b0101 + +class TagTraceGen: + + # Constructor + def __init__(self, num_masters_p, num_clients_p, max_payload_width_p): + self.num_masters_p = num_masters_p + self.num_clients_p = num_clients_p + self.max_payload_width_p = max_payload_width_p + self.client_id_width_lp = self.safe_clog2(num_clients_p) + self.length_width_lp = self.safe_clog2(max_payload_width_p+1) + + # BSG_SAFE_CLOG2(x) + def safe_clog2(self, x): + if x == 1: + return 1 + else: + return int(math.ceil(math.log(x,2))) + + # Get binary string + def get_bin_str(self, val, width): + return format(val, "0" + str(width) + "b") + + # Print trace. + def get_trace(self, opcode, masters, client_id, data_not_reset, length, data): + trace = self.get_bin_str(opcode, 4) + "_" + trace += self.get_bin_str(masters, self.num_masters_p) + "_" + trace += self.get_bin_str(client_id, self.client_id_width_lp) + "_" + trace += self.get_bin_str(data_not_reset, 1) + "_" + trace += self.get_bin_str(length, self.length_width_lp) + "_" + trace += self.get_bin_str(data, self.max_payload_width_p) + return trace + + # Send trace + def send(self, masters, client_id, data_not_reset, length, data): + print(self.get_trace(SEND_OP, masters, client_id, data_not_reset, length, data)) + + # Wait cycles. 
+ def wait(self, cycles): + count_width = self.num_masters_p + self.client_id_width_lp + 1 + self.length_width_lp + self.max_payload_width_p + trace = self.get_bin_str(COUNT_INIT_OP, 4) + "_" + trace += self.get_bin_str(cycles, count_width) + print(trace) + trace = self.get_bin_str(COUNT_WAIT_OP, 4) + "_" + trace += self.get_bin_str(0, count_width) + print(trace) + + # done + def done(self): + trace = self.get_trace(DONE_OP, 0,0,0,0,0) + print(trace) diff --git a/testbenches/py/pod_trace_gen.py b/testbenches/py/pod_trace_gen.py new file mode 100644 index 000000000..14928c278 --- /dev/null +++ b/testbenches/py/pod_trace_gen.py @@ -0,0 +1,47 @@ +from bsg_tag_trace_gen import * +import sys +import math + +if __name__ == "__main__": + num_pods_x = int(sys.argv[1]) + num_pods_y = int(sys.argv[2]) + + # each pod has one client for reset + num_clients = (num_pods_x*num_pods_y) + payload_width = 1 # reset + lg_payload_width = int(math.ceil(math.log(payload_width+1,2))) + max_payload_width = (1<>3)-1:0] in_mask_lo = packet_lo.op_ex; - bsg_manycore_amo_type_e in_amo_type; - assign in_amo_type = packet_lo.op_ex.amo_type; + wire[(data_width_p>>3)-1:0] in_mask_lo = (packet_lo.op_v2 == e_remote_sw) + ? 
4'b1111 + : packet_lo.reg_id.store_mask_s.mask; - wire pkt_remote_store = packet_v_lo & (packet_lo.op == e_remote_store ); - wire pkt_remote_load = packet_v_lo & (packet_lo.op == e_remote_load ); - wire pkt_remote_amo = packet_v_lo & (packet_lo.op == e_remote_amo ); + wire pkt_remote_store = packet_v_lo & ((packet_lo.op_v2 == e_remote_store ) | (packet_lo.op_v2 == e_remote_sw)); + wire pkt_remote_load = packet_v_lo & (packet_lo.op_v2 == e_remote_load ); + wire pkt_remote_amo = packet_v_lo & (packet_lo.op_v2 == e_remote_amoswap ); bsg_manycore_load_info_s pkt_load_info; assign pkt_load_info = packet_lo.payload.load_info_s.load_info; @@ -195,7 +199,6 @@ module bsg_manycore_endpoint_standard ,.in_addr_i (in_addr_lo ) ,.in_we_i (pkt_remote_store) ,.in_amo_op_i (pkt_remote_amo ) - ,.in_amo_type_i(in_amo_type) ,.in_x_cord_i (packet_lo.src_x_cord ) ,.in_y_cord_i (packet_lo.src_y_cord ) // combined incoming data interface @@ -231,19 +234,50 @@ module bsg_manycore_endpoint_standard returning_credit_info rc_fifo_li, rc_fifo_lo; - - assign rc_fifo_li = '{ - pkt_type : (packet_lo.op == e_remote_store) - ? e_return_credit - : (pkt_load_info.icache_fetch - ? e_return_ifetch - : (pkt_load_info.float_wb + // AND-OR 5 LSBs of each byte of payload to get the payload hash and return it as reg_id for e_remote_store. + wire [bsg_manycore_reg_id_width_gp-1:0] payload_reg_id; + bsg_manycore_reg_id_decode pd0 ( + .data_i(packet_lo.payload) + ,.mask_i(packet_lo.reg_id.store_mask_s.mask) + ,.reg_id_o(payload_reg_id) + ); + + + + always_comb begin + rc_fifo_li.y_cord = packet_lo.src_y_cord; + rc_fifo_li.x_cord = packet_lo.src_x_cord; + + case (packet_lo.op_v2) + e_remote_store: begin + rc_fifo_li.pkt_type = e_return_credit; + rc_fifo_li.reg_id = payload_reg_id; + end + + e_remote_sw: begin + rc_fifo_li.pkt_type = e_return_credit; + rc_fifo_li.reg_id = packet_lo.reg_id; + end + + e_remote_load: begin + rc_fifo_li.pkt_type = pkt_load_info.float_wb ? 
e_return_float_wb - : e_return_int_wb)) - ,y_cord : packet_lo.src_y_cord - ,x_cord : packet_lo.src_x_cord - ,reg_id : packet_lo.reg_id - }; + : e_return_int_wb; + rc_fifo_li.reg_id = packet_lo.reg_id; + end + + e_remote_amoswap: begin + rc_fifo_li.pkt_type = e_return_int_wb; + rc_fifo_li.reg_id = packet_lo.reg_id; + end + + // should not happen. + default: begin + rc_fifo_li.pkt_type = e_return_int_wb; + rc_fifo_li.reg_id = packet_lo.reg_id; + end + endcase + end bsg_two_fifo #( @@ -328,6 +362,11 @@ module bsg_manycore_endpoint_standard assign returned_yumi_li = returned_yumi_i | (returned_packet_v_lo & (returned_packet_lo.pkt_type == e_return_credit)); + + assign returned_credit_v_r_o = returned_credit; + assign returned_credit_reg_id_r_o = returned_packet_lo.reg_id; + + // // // Assertions // // // @@ -336,13 +375,29 @@ module bsg_manycore_endpoint_standard if (debug_p) begin always_ff @(negedge clk_i) begin if (returned_credit) - $display("## return packet received by (x,y)=%x,%x",my_x_i,my_y_i); + $display("## return packet received by (x,y)=%x,%x",global_x_i,global_y_i); if (out_v_i) $display("## attempting remote store send of data %x, out_credit_or_ready_o = %x (%m)",out_packet_i,out_credit_or_ready_o); end end + always_ff @ (negedge clk_i) begin + if (~reset_i) begin + if (packet_v_lo) begin + assert( + (packet_lo.op_v2 != e_remote_amoadd) + & (packet_lo.op_v2 != e_remote_amoxor) + & (packet_lo.op_v2 != e_remote_amoand) + & (packet_lo.op_v2 != e_remote_amoor) + & (packet_lo.op_v2 != e_remote_amomin) + & (packet_lo.op_v2 != e_remote_amomax) + & (packet_lo.op_v2 != e_remote_amominu) + & (packet_lo.op_v2 != e_remote_amomaxu) + ) else $error("[BSG_ERROR] Incoming packet has an unsupported amo type. 
op_v2=%d", packet_lo.op_v2); + end + end + end logic out_of_credits_warned = 0; @@ -353,7 +408,7 @@ module bsg_manycore_endpoint_standard if ( ~(reset_i) & ~out_of_credits_warned) assert (out_credits_o === 'X || out_credits_o > 0) else begin - $display("## out of remote store credits(=%d) x,y=%d,%d displaying only once (%m)",out_credits_o,my_x_i,my_y_i); + $display("## out of remote store credits(=%d) x,y=%d,%d displaying only once (%m)",out_credits_o,global_x_i,global_y_i); $display("## (this may be a performance problem; or normal behavior)"); out_of_credits_warned = 1; end @@ -370,19 +425,19 @@ module bsg_manycore_endpoint_standard always_ff @(posedge clk_i) reset_r <= reset_i; always_ff @(negedge clk_i) - assert ( (reset_r!==0) | ~link_sif_i_cast.rev.v | ({return_packet.y_cord, return_packet.x_cord} == {my_y_i, my_x_i})) + assert ( (reset_r!==0) | ~link_sif_i_cast.rev.v | ({return_packet.y_cord, return_packet.x_cord} == {global_y_i, global_x_i})) else begin $error("## errant credit packet v=%b for YX=%d,%d landed at YX=%d,%d (%m)" ,link_sif_i_cast.rev.v ,link_sif_i_cast.rev.data[x_cord_width_p+:y_cord_width_p] ,link_sif_i_cast.rev.data[0+:x_cord_width_p] - ,my_y_i,my_x_i); + ,global_y_i,global_x_i); $finish(); end always_ff @(negedge clk_i) begin if( (returned_v_r_o === 1'b1) && ( returned_yumi_i != 1'b1) && returned_fifo_full_o ) begin - $display("## Returned response will be dropped at YX=%d, %d (%m)", my_y_i, my_x_i); + $display("## Returned response will be dropped at YX=%d, %d (%m)", global_y_i, global_x_i); $finish(); end end diff --git a/v/bsg_manycore_eva_to_npa.v b/v/bsg_manycore_eva_to_npa.v index 49ceb2894..0b4e4b198 100644 --- a/v/bsg_manycore_eva_to_npa.v +++ b/v/bsg_manycore_eva_to_npa.v @@ -9,6 +9,11 @@ * 2) Global * 3) Tile-Group * + * Modifying this mapping requires the same change in the following files. 
+ * - Cuda-lite + * https://github.com/bespoke-silicon-group/bsg_replicant/blob/master/libraries/bsg_manycore_eva.cpp + * - SPMD testbench + * https://github.com/bespoke-silicon-group/bsg_manycore/blob/master/software/py/nbf.py */ @@ -18,20 +23,27 @@ module bsg_manycore_eva_to_npa , parameter addr_width_p="inv" , parameter x_cord_width_p="inv" , parameter y_cord_width_p="inv" - + , parameter pod_x_cord_width_p="inv" + , parameter pod_y_cord_width_p="inv" + , parameter num_tiles_x_p="inv" , parameter num_tiles_y_p="inv" + , parameter x_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter y_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_y_p) + , parameter num_vcache_rows_p = "inv" , parameter vcache_block_size_in_words_p="inv" // block size in vcache , parameter vcache_size_p="inv" // vcache capacity in words , parameter vcache_sets_p="inv" // number of sets in vcache - ) ( // EVA 32-bit virtual address used by vanilla core input [data_width_p-1:0] eva_i // byte addr - , input [x_cord_width_p-1:0] tgo_x_i // tile-group origin x - , input [y_cord_width_p-1:0] tgo_y_i // tile-group origin y + , input [x_subcord_width_lp-1:0] tgo_x_i // tile-group origin x + , input [y_subcord_width_lp-1:0] tgo_y_i // tile-group origin y + + , input [pod_x_cord_width_p-1:0] pod_x_i + , input [pod_y_cord_width_p-1:0] pod_y_i // When DRAM mode is enabled, DRAM EVA space is striped across vcaches at a cache line granularity. 
// When DRAM mode is disabled, vcaches are only used as block memory, and the striping is disabled, @@ -39,8 +51,8 @@ module bsg_manycore_eva_to_npa , input dram_enable_i // DRAM MODE enable // NPA (x,y,EPA) - , output logic [x_cord_width_p-1:0] x_cord_o // destination x_cord - , output logic [y_cord_width_p-1:0] y_cord_o // destination y_cord + , output logic [x_cord_width_p-1:0] x_cord_o // destination x_cord (global) + , output logic [y_cord_width_p-1:0] y_cord_o // destination y_cord (global) , output logic [addr_width_p-1:0] epa_o // endpoint physical address (word addr) // EVA does not map to any valid remote NPA location. @@ -51,12 +63,10 @@ module bsg_manycore_eva_to_npa // localparam vcache_word_offset_width_lp = `BSG_SAFE_CLOG2(vcache_block_size_in_words_p); localparam lg_vcache_size_lp = `BSG_SAFE_CLOG2(vcache_size_p); + localparam vcache_row_id_width_lp = `BSG_SAFE_CLOG2(2*num_vcache_rows_p); // figure out what type of EVA this is. - `declare_bsg_manycore_global_addr_s; - `declare_bsg_manycore_tile_group_addr_s; - bsg_manycore_global_addr_s global_addr; bsg_manycore_tile_group_addr_s tile_group_addr; @@ -67,64 +77,83 @@ module bsg_manycore_eva_to_npa wire is_global_addr = global_addr.remote == 2'b01; wire is_tile_group_addr = tile_group_addr.remote == 3'b001; - assign is_invalid_addr_o = ~(is_dram_addr | is_global_addr | is_tile_group_addr); // DRAM hash function - localparam hash_bank_input_width_lp = data_width_p-1-2-vcache_word_offset_width_lp; - localparam hash_bank_index_width_lp = $clog2(((2**hash_bank_input_width_lp)+(2*num_tiles_x_p)-1)/(num_tiles_x_p*2)); - - logic [hash_bank_input_width_lp-1:0] hash_bank_input; - logic [x_cord_width_p:0] hash_bank_lo; // {bot_not_top, x_cord} - logic [hash_bank_index_width_lp-1:0] hash_bank_index_lo; - - hash_function #( - .banks_p(num_tiles_x_p*2) - ,.width_p(hash_bank_input_width_lp) - ,.vcache_sets_p(vcache_sets_p) - ) hashb ( - .i(hash_bank_input) - ,.bank_o(hash_bank_lo) - ,.index_o(hash_bank_index_lo) + 
// DRAM space is striped across vcaches at a cache line granularity. + // Striping starts from the north vcaches, and alternates between north and south from inner layers to outer layers. + + logic [x_cord_width_p-1:0] dram_x_cord_lo; + logic [y_cord_width_p-1:0] dram_y_cord_lo; + logic [addr_width_p-1:0] dram_epa_lo; + + bsg_manycore_dram_hash_function #( + .data_width_p(data_width_p) + ,.addr_width_p(addr_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) + ,.x_subcord_width_p(x_subcord_width_lp) + ,.y_subcord_width_p(y_subcord_width_lp) + ,.num_vcache_rows_p(num_vcache_rows_p) + ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) + ) dram_hash ( + .eva_i(eva_i) + ,.pod_x_i(pod_x_i) + ,.pod_y_i(pod_y_i) + + ,.x_cord_o(dram_x_cord_lo) + ,.y_cord_o(dram_y_cord_lo) + ,.epa_o(dram_epa_lo) ); - assign hash_bank_input = eva_i[2+vcache_word_offset_width_lp+:hash_bank_input_width_lp]; + // NO DRAM mode hash function + wire [x_subcord_width_lp-1:0] no_dram_x_subcord = eva_i[2+lg_vcache_size_lp+:x_subcord_width_lp]; + wire [vcache_row_id_width_lp-1:0] no_dram_vcache_row_id = eva_i[2+lg_vcache_size_lp+x_subcord_width_lp+:vcache_row_id_width_lp]; + wire [y_subcord_width_lp-1:0] no_dram_y_subcord; + wire [pod_y_cord_width_p-1:0] no_dram_pod_y_cord = no_dram_vcache_row_id[0] + ? pod_y_cord_width_p'(pod_y_i+1) + : pod_y_cord_width_p'(pod_y_i-1); + + if (num_vcache_rows_p == 1) begin + assign no_dram_y_subcord = {y_subcord_width_lp{~no_dram_vcache_row_id[0]}}; + end + else begin + assign no_dram_y_subcord = { + {(y_subcord_width_lp+1-vcache_row_id_width_lp){~no_dram_vcache_row_id[0]}}, + (no_dram_vcache_row_id[0] + ? 
no_dram_vcache_row_id[vcache_row_id_width_lp-1:1] + : ~no_dram_vcache_row_id[vcache_row_id_width_lp-1:1]) + }; + end + + // EVA->NPA table always_comb begin + is_invalid_addr_o = 1'b0; + if (is_dram_addr) begin if (dram_enable_i) begin - y_cord_o = hash_bank_lo[x_cord_width_p] - ? (y_cord_width_p)'(num_tiles_y_p+1) // DRAM ports are directly below the manycore tiles. - : {y_cord_width_p{1'b0}}; - x_cord_o = hash_bank_lo[0+:x_cord_width_p]; + y_cord_o = dram_y_cord_lo; + x_cord_o = dram_x_cord_lo; + epa_o = dram_epa_lo; + end + else begin + // DRAM disabled mode is used when vcaches are used as block RAMs, + // in which case the tags will be manually written in the vcaches + // to prevent cache miss. + // striping is disabled. + y_cord_o = {no_dram_pod_y_cord, no_dram_y_subcord}; + x_cord_o = {pod_x_i, no_dram_x_subcord}; + epa_o = { 1'b0, - {(addr_width_p-1-vcache_word_offset_width_lp-hash_bank_index_width_lp){1'b0}}, - hash_bank_index_lo, - eva_i[2+:vcache_word_offset_width_lp] + {(addr_width_p-1-lg_vcache_size_lp){1'b0}}, + eva_i[2+:lg_vcache_size_lp] }; - - end - else begin - // DRAM disabled. - if (eva_i[30]) begin - y_cord_o = (y_cord_width_p)'(1); - x_cord_o = '0; - epa_o = {1'b1, eva_i[2+:addr_width_p-1]}; // HOST DRAM address - end - else begin - y_cord_o = eva_i[2+lg_vcache_size_lp+x_cord_width_p] - ? (y_cord_width_p)'(num_tiles_y_p+1) // DRAM ports are directly below the manycore tiles. - : {y_cord_width_p{1'b0}}; - x_cord_o = eva_i[2+lg_vcache_size_lp+:x_cord_width_p]; - epa_o = { - 1'b0, - {(addr_width_p-1-lg_vcache_size_lp){1'b0}}, - eva_i[2+:lg_vcache_size_lp] - }; - end end end else if (is_global_addr) begin @@ -132,20 +161,21 @@ module bsg_manycore_eva_to_npa // x,y-cord and EPA is directly encoded in EVA. 
y_cord_o = y_cord_width_p'(global_addr.y_cord); x_cord_o = x_cord_width_p'(global_addr.x_cord); - epa_o = {{(addr_width_p-epa_word_addr_width_gp){1'b0}}, global_addr.addr}; + epa_o = {{(addr_width_p-global_epa_word_addr_width_gp){1'b0}}, global_addr.addr}; end else if (is_tile_group_addr) begin // tile-group addr // tile-coordinate in the EVA is added to the tile-group origin register. - y_cord_o = y_cord_width_p'(tile_group_addr.y_cord + tgo_y_i); - x_cord_o = x_cord_width_p'(tile_group_addr.x_cord + tgo_x_i); - epa_o = {{(addr_width_p-epa_word_addr_width_gp){1'b0}}, tile_group_addr.addr}; + y_cord_o = {pod_y_i, y_subcord_width_lp'(tile_group_addr.y_cord + tgo_y_i)}; + x_cord_o = {pod_x_i, x_subcord_width_lp'(tile_group_addr.x_cord + tgo_x_i)}; + epa_o = {{(addr_width_p-tile_group_epa_word_addr_width_gp){1'b0}}, tile_group_addr.addr}; end else begin // should never happen y_cord_o = '0; x_cord_o = '0; epa_o = '0; + is_invalid_addr_o = 1'b1; end end diff --git a/v/bsg_manycore_hetero_socket.v b/v/bsg_manycore_hetero_socket.v index 8f4ef45e3..a4274e7a6 100644 --- a/v/bsg_manycore_hetero_socket.v +++ b/v/bsg_manycore_hetero_socket.v @@ -20,16 +20,18 @@ ,.data_width_p(data_width_p) \ ,.addr_width_p(addr_width_p) \ ,.dmem_size_p (dmem_size_p ) \ + ,.num_vcache_rows_p(num_vcache_rows_p) \ ,.vcache_size_p(vcache_size_p) \ ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) \ ,.vcache_sets_p(vcache_sets_p) \ ,.debug_p(debug_p) \ - ,.branch_trace_en_p(branch_trace_en_p) \ ,.icache_entries_p(icache_entries_p) \ ,.icache_tag_width_p (icache_tag_width_p) \ ,.max_out_credits_p(max_out_credits_p) \ ,.num_tiles_x_p(num_tiles_x_p) \ ,.num_tiles_y_p(num_tiles_y_p) \ + ,.pod_x_cord_width_p(pod_x_cord_width_p) \ + ,.pod_y_cord_width_p(pod_y_cord_width_p) \ ,.fwd_fifo_els_p(fwd_fifo_els_p) \ ) z \ (.clk_i \ @@ -38,6 +40,8 @@ ,.link_sif_o \ ,.my_x_i \ ,.my_y_i \ + ,.pod_x_i \ + ,.pod_y_i \ ); \ end @@ -50,13 +54,17 @@ module bsg_manycore_hetero_socket , parameter dmem_size_p 
= "inv" , parameter icache_entries_p = "inv" // in words , parameter icache_tag_width_p = "inv" + , parameter num_vcache_rows_p = "inv" , parameter vcache_size_p = "inv" , parameter debug_p = 0 - , parameter branch_trace_en_p = 0 , parameter max_out_credits_p = 32 , parameter int hetero_type_p = 0 + , parameter pod_x_cord_width_p="inv" + , parameter pod_y_cord_width_p="inv" , parameter num_tiles_x_p="inv" , parameter num_tiles_y_p="inv" + , parameter x_subcord_width_lp = `BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter y_subcord_width_lp = `BSG_SAFE_CLOG2(num_tiles_y_p) , parameter vcache_block_size_in_words_p="inv" , parameter vcache_sets_p="inv" , parameter fwd_fifo_els_p = "inv" @@ -73,8 +81,11 @@ module bsg_manycore_hetero_socket , output [bsg_manycore_link_sif_width_lp-1:0] link_sif_o // tile coordinates - , input [x_cord_width_p-1:0] my_x_i - , input [y_cord_width_p-1:0] my_y_i + , input [x_subcord_width_lp-1:0] my_x_i + , input [y_subcord_width_lp-1:0] my_y_i + + , input [pod_x_cord_width_p-1:0] pod_x_i + , input [pod_y_cord_width_p-1:0] pod_y_i ); // add as many types as you like... diff --git a/v/bsg_manycore_hor_io_router.v b/v/bsg_manycore_hor_io_router.v new file mode 100644 index 000000000..b06cc8176 --- /dev/null +++ b/v/bsg_manycore_hor_io_router.v @@ -0,0 +1,174 @@ +/** + * bsg_manycore_hor_io_router.v + * + * this router is used for attaching accelerators on the side of the pod. + * this router gets the x coordinate of the 1-minus of the leftmost tile x coordinate. + * this router needs to make a connection with a local link and a ruche link reaching this router. + * the accelerator connects to the P-port. + * + * use tieoff_west_p, if this router is attaching to the west side of the pod. + * use tieoff_east_p, if this router is attaching to the east side of the pod. 
+ */ + + +module bsg_manycore_hor_io_router + import bsg_noc_pkg::*; + import bsg_manycore_pkg::*; + import bsg_mesh_router_pkg::*; + #(parameter addr_width_p="inv" + , parameter data_width_p="inv" + , parameter x_cord_width_p="inv" + , parameter y_cord_width_p="inv" + + , parameter ruche_factor_X_p="inv" + + , parameter tieoff_west_p="inv" + , parameter tieoff_east_p="inv" + , parameter tieoff_proc_p=0 + + , parameter dims_lp=3 // only support 3 + + , parameter link_sif_width_lp = + `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + , parameter ruche_x_link_sif_width_lp = + `bsg_manycore_ruche_x_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + ) + ( + input clk_i + , input reset_i + + , input [S:W][link_sif_width_lp-1:0] link_sif_i + , output [S:W][link_sif_width_lp-1:0] link_sif_o + + , input [link_sif_width_lp-1:0] proc_link_sif_i + , output [link_sif_width_lp-1:0] proc_link_sif_o + + , input [E:W][ruche_x_link_sif_width_lp-1:0] ruche_link_i + , output [E:W][ruche_x_link_sif_width_lp-1:0] ruche_link_o + + , input [x_cord_width_p-1:0] global_x_i + , input [y_cord_width_p-1:0] global_y_i + ); + + // RE, RW, S, N, E, W + localparam stub_lp = { + (tieoff_east_p ? 1'b1 : 1'b0), // RE + (tieoff_west_p ? 1'b1 : 1'b0), // RW + 2'b00, // S, N + (tieoff_east_p ? 1'b1 : 1'b0), // E + (tieoff_west_p ? 
1'b1 : 1'b0) // W + }; + + + `declare_bsg_manycore_ruche_x_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + bsg_manycore_ruche_x_link_sif_s [E:W] ruche_link_in; + bsg_manycore_ruche_x_link_sif_s [E:W] ruche_link_out; + assign ruche_link_in = ruche_link_i; + assign ruche_link_o = ruche_link_out; + + + `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + bsg_manycore_link_sif_s [RE:W] link_sif_li; + bsg_manycore_link_sif_s [RE:W] link_sif_lo; + bsg_manycore_link_sif_s proc_link_sif_li; + bsg_manycore_link_sif_s proc_link_sif_lo; + + localparam fwd_use_credits_lp = 7'b0000000; + localparam int fwd_fifo_els_lp[dims_lp*2:0] = '{2,2,2,2,2,2,2}; + localparam rev_use_credits_lp = 7'b0000000; + localparam int rev_fifo_els_lp[dims_lp*2:0] = '{2,2,2,2,2,2,2}; + + + bsg_manycore_mesh_node #( + .x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.data_width_p(data_width_p) + ,.addr_width_p(addr_width_p) + ,.dims_p(dims_lp) + ,.ruche_factor_X_p(ruche_factor_X_p) + ,.stub_p(stub_lp) + + ,.fwd_use_credits_p(fwd_use_credits_lp) + ,.fwd_fifo_els_p(fwd_fifo_els_lp) + ,.rev_use_credits_p(rev_use_credits_lp) + ,.rev_fifo_els_p(rev_fifo_els_lp) + ) rtr ( + .clk_i(clk_i) + ,.reset_i(reset_i) + + ,.links_sif_i(link_sif_li) + ,.links_sif_o(link_sif_lo) + + ,.proc_link_sif_i(proc_link_sif_li) + ,.proc_link_sif_o(proc_link_sif_lo) + + ,.global_x_i(global_x_i) + ,.global_y_i(global_y_i) + ); + + if (tieoff_proc_p) begin + assign proc_link_sif_li = '0; + assign proc_link_sif_o = '0; + end + else begin + assign proc_link_sif_li = proc_link_sif_i; + assign proc_link_sif_o = proc_link_sif_lo; + end + + // connect N,S + assign link_sif_o[N] = link_sif_lo[N]; + assign link_sif_li[N] = link_sif_i[N]; + assign link_sif_o[S] = link_sif_lo[S]; + assign link_sif_li[S] = link_sif_i[S]; + + + // west link + if (tieoff_west_p) begin: tw + assign link_sif_li[W] = '0; + assign link_sif_li[RW] = '0; + assign ruche_link_out[W] = 
'0; + end + else begin: tnw + // local + assign link_sif_li[W] = link_sif_i[W]; + assign link_sif_o[W] = link_sif_lo[W]; + // ruche + assign link_sif_li[RW].fwd = tieoff_east_p + ? '{ready_and_rev: ruche_link_in[W].fwd.ready_and_rev, default: '0} + : `bsg_manycore_ruche_x_link_fwd_inject_src_y(x_cord_width_p,y_cord_width_p,ruche_link_in[W].fwd, global_y_i); + assign link_sif_li[RW].rev = + `bsg_manycore_ruche_x_link_rev_inject_dest_y(x_cord_width_p,y_cord_width_p,ruche_link_in[W].rev, global_y_i); + assign ruche_link_out[W].fwd = + `bsg_manycore_link_sif_fwd_filter_src_y(x_cord_width_p,y_cord_width_p,link_sif_lo[RW].fwd); + assign ruche_link_out[W].rev = tieoff_east_p + ? '{ready_and_rev: link_sif_lo[RW].rev.ready_and_rev, default: '0} + : `bsg_manycore_link_sif_rev_filter_dest_y(x_cord_width_p,y_cord_width_p,link_sif_lo[RW].rev); + end + + + + // east link + if (tieoff_east_p) begin + assign link_sif_li[E] = '0; + assign link_sif_li[RE] = '0; + assign ruche_link_out[E] = '0; + end + else begin + // local + assign link_sif_li[E] = link_sif_i[E]; + assign link_sif_o[E] = link_sif_lo[E]; + // ruche + assign link_sif_li[RE].fwd = tieoff_west_p + ? '{ready_and_rev: ruche_link_in[E].fwd.ready_and_rev, default: '0} + : `bsg_manycore_ruche_x_link_fwd_inject_src_y(x_cord_width_p,y_cord_width_p,ruche_link_in[E].fwd, global_y_i); + assign link_sif_li[RE].rev = + `bsg_manycore_ruche_x_link_rev_inject_dest_y(x_cord_width_p,y_cord_width_p,ruche_link_in[E].rev, global_y_i); + assign ruche_link_out[E].fwd = + `bsg_manycore_link_sif_fwd_filter_src_y(x_cord_width_p,y_cord_width_p,link_sif_lo[RE].fwd); + assign ruche_link_out[E].rev = tieoff_west_p + ? 
'{ready_and_rev: link_sif_lo[RE].rev.ready_and_rev, default: '0} + : `bsg_manycore_link_sif_rev_filter_dest_y(x_cord_width_p,y_cord_width_p,link_sif_lo[RE].rev); + end + + +endmodule diff --git a/v/bsg_manycore_hor_io_router_column.v b/v/bsg_manycore_hor_io_router_column.v new file mode 100644 index 000000000..de7ac65b4 --- /dev/null +++ b/v/bsg_manycore_hor_io_router_column.v @@ -0,0 +1,105 @@ +/** + * bsg_manycore_hor_io_router_column.v + * + * This modules instantiates a vertical chain of bsg_manycore_hor_io_router, + * which can attach to the side of the pods to provide accelerator connectivity. + */ + + +module bsg_manycore_hor_io_router_column + import bsg_noc_pkg::*; + import bsg_manycore_pkg::*; + #(parameter addr_width_p="inv" + , parameter data_width_p="inv" + , parameter x_cord_width_p="inv" + , parameter y_cord_width_p="inv" + , parameter ruche_factor_X_p="inv" + + , parameter num_row_p="inv" + , parameter bit [num_row_p-1:0] tieoff_west_p="inv" + , parameter bit [num_row_p-1:0] tieoff_east_p ="inv" + + + , parameter link_sif_width_lp = + `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + , parameter ruche_x_link_sif_width_lp = + `bsg_manycore_ruche_x_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + ) + ( + input clk_i + , input reset_i + + // vertical links + , input [S:N][link_sif_width_lp-1:0] ver_link_sif_i + , output [S:N][link_sif_width_lp-1:0] ver_link_sif_o + + , input [num_row_p-1:0][link_sif_width_lp-1:0] proc_link_sif_i + , output [num_row_p-1:0][link_sif_width_lp-1:0] proc_link_sif_o + + , input [num_row_p-1:0][E:W][link_sif_width_lp-1:0] hor_link_sif_i + , output [num_row_p-1:0][E:W][link_sif_width_lp-1:0] hor_link_sif_o + + , input [num_row_p-1:0][E:W][ruche_x_link_sif_width_lp-1:0] ruche_link_i + , output [num_row_p-1:0][E:W][ruche_x_link_sif_width_lp-1:0] ruche_link_o + + , input [x_cord_width_p-1:0] global_x_i + , input [num_row_p-1:0][y_cord_width_p-1:0] global_y_i + ); + + 
+ `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + bsg_manycore_link_sif_s [num_row_p-1:0][S:W] link_sif_li; + bsg_manycore_link_sif_s [num_row_p-1:0][S:W] link_sif_lo; + + + for (genvar i = 0; i < num_row_p; i++) begin: r + + bsg_manycore_hor_io_router #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.ruche_factor_X_p(ruche_factor_X_p) + + ,.tieoff_west_p(tieoff_west_p[i]) + ,.tieoff_east_p(tieoff_east_p[i]) + ) io_rtr ( + .clk_i(clk_i) + ,.reset_i(reset_i) + + ,.link_sif_i(link_sif_li[i]) + ,.link_sif_o(link_sif_lo[i]) + + ,.proc_link_sif_i(proc_link_sif_i[i]) + ,.proc_link_sif_o(proc_link_sif_o[i]) + + ,.ruche_link_i(ruche_link_i[i]) + ,.ruche_link_o(ruche_link_o[i]) + + ,.global_x_i(global_x_i) + ,.global_y_i(global_y_i[i]) + ); + + assign hor_link_sif_o[i][W] = link_sif_lo[i][W]; + assign link_sif_li[i][W] = hor_link_sif_i[i][W]; + assign hor_link_sif_o[i][E] = link_sif_lo[i][E]; + assign link_sif_li[i][E] = hor_link_sif_i[i][E]; + + if (i != num_row_p-1) begin + assign link_sif_li[i][S] = link_sif_lo[i+1][N]; + assign link_sif_li[i+1][N] = link_sif_lo[i][S]; + end + + end + + assign ver_link_sif_o[N] = link_sif_lo[0][N]; + assign link_sif_li[0][N] = ver_link_sif_i[N]; + + assign ver_link_sif_o[S] = link_sif_lo[num_row_p-1][S]; + assign link_sif_li[num_row_p-1][S] = ver_link_sif_i[S]; + + + + + +endmodule diff --git a/v/bsg_manycore_link_to_cache.v b/v/bsg_manycore_link_to_cache.v index f30ccd48a..51d6b1d9f 100644 --- a/v/bsg_manycore_link_to_cache.v +++ b/v/bsg_manycore_link_to_cache.v @@ -52,6 +52,8 @@ module bsg_manycore_link_to_cache , output logic yumi_o , input v_we_i + + , output logic wh_dest_east_not_west_o ); @@ -120,7 +122,8 @@ module bsg_manycore_link_to_cache state_e state_r, state_n; logic [lg_sets_lp+lg_ways_lp:0] tagst_sent_r, tagst_sent_n; logic [lg_sets_lp+lg_ways_lp:0] tagst_received_r, tagst_received_n; - + 
logic wh_dest_east_not_west_r, wh_dest_east_not_west_n; + assign wh_dest_east_not_west_o = wh_dest_east_not_west_r; // cache pipeline tracker // @@ -136,32 +139,46 @@ module bsg_manycore_link_to_cache bsg_manycore_return_packet_type_e return_pkt_type; always_comb begin - if (packet_lo.op == e_remote_store) begin - return_pkt_type = e_return_credit; - end - else if (packet_lo.op == e_remote_amo) begin - return_pkt_type = e_return_int_wb; - end - else if (packet_lo.op == e_cache_op) begin + unique case (packet_lo.op_v2) + e_remote_store, e_remote_sw: begin return_pkt_type = e_return_credit; - end - else begin - if (load_info.icache_fetch) - return_pkt_type = e_return_ifetch; - else if (load_info.float_wb) - return_pkt_type = e_return_float_wb; - else + end + e_remote_load: begin + if (load_info.icache_fetch) + return_pkt_type = e_return_ifetch; + else if (load_info.float_wb) + return_pkt_type = e_return_float_wb; + else + return_pkt_type = e_return_int_wb; + end + e_cache_op: begin + return_pkt_type = e_return_credit; + end + e_remote_amoswap, e_remote_amoor, e_remote_amoadd: begin return_pkt_type = e_return_int_wb; - end + end + // should never happen + default: begin + return_pkt_type = e_return_credit; + end + endcase end + wire [bsg_manycore_reg_id_width_gp-1:0] payload_reg_id; + bsg_manycore_reg_id_decode pd0 ( + .data_i(packet_lo.payload) + ,.mask_i(packet_lo.reg_id.store_mask_s.mask) + ,.reg_id_o(payload_reg_id) + ); always_ff @ (posedge clk_i) begin if (state_r == READY) begin if (v_o & ready_i) begin tl_info_r <= '{ pkt_type: return_pkt_type, - reg_id: packet_lo.reg_id, + reg_id : ((packet_lo.op_v2 == e_remote_store) | (packet_lo.op_v2 == e_cache_op)) + ? 
payload_reg_id + : packet_lo.reg_id, y_cord: packet_lo.src_y_cord, x_cord: packet_lo.src_x_cord }; @@ -172,6 +189,8 @@ module bsg_manycore_link_to_cache end end + + always_comb begin cache_pkt.mask = '0; @@ -180,6 +199,7 @@ module bsg_manycore_link_to_cache cache_pkt.opcode = TAGST; tagst_sent_n = tagst_sent_r; tagst_received_n = tagst_received_r; + wh_dest_east_not_west_n = wh_dest_east_not_west_r; v_o = 1'b0; yumi_o = 1'b0; state_n = state_r; @@ -225,59 +245,98 @@ module bsg_manycore_link_to_cache v_o = packet_v_lo; packet_yumi_li = packet_v_lo & ready_i; + + // if two MSBs are ones, then it maps to wh_dest_east_not_west. + // store-only; + if (packet_lo.addr[link_addr_width_p-1-:2] == 2'b11) begin + + case (packet_lo.op_v2) + e_remote_store: cache_pkt.opcode = TAGLA; // it injects TAGLA as noop; + default: begin + cache_pkt.opcode = TAGLA; + // synopsys translate_off + assert final(reset_i !== 1'b0 | ~packet_v_lo) + else $error("[BSG_ERROR] Invalid packet op for wh_dest_east_not_west EPA: %b", packet_lo.op_v2); + // synopsys translate_on + end + endcase + + // updated when nop packet is taken by the cache. + wh_dest_east_not_west_n = (packet_v_lo & ready_i) + ? packet_lo.payload[0] + : wh_dest_east_not_west_r; + end // if MSB of addr is one, then it maps to tag_mem // otherwise it's regular access to data_mem. // we want to expose read/write access to tag_mem on NPA // for extra debugging capability. 
- if (packet_lo.addr[link_addr_width_p-1]) begin - case (packet_lo.op) - e_remote_store: cache_pkt.opcode = TAGST; + else if (packet_lo.addr[link_addr_width_p-1]) begin + case (packet_lo.op_v2) + e_remote_store, e_remote_sw: cache_pkt.opcode = TAGST; e_remote_load: cache_pkt.opcode = TAGLA; e_cache_op: cache_pkt.opcode = TAGFL; default: cache_pkt.opcode = TAGLA; endcase end else begin - if (packet_lo.op == e_remote_store) begin - cache_pkt.opcode = SM; - end - else if (packet_lo.op == e_remote_amo) begin - case (packet_lo.op_ex.amo_type) - e_amo_swap: cache_pkt.opcode = AMOSWAP_W; - e_amo_or: cache_pkt.opcode = AMOOR_W; - default: cache_pkt.opcode = AMOSWAP_W; // this should never happen! - endcase - end - else if (packet_lo.op == e_cache_op) begin - case (packet_lo.op_ex.cache_op_type) - e_afl: cache_pkt.opcode = AFL; - e_aflinv: cache_pkt.opcode = AFLINV; - e_ainv: cache_pkt.opcode = AINV; - default: cache_pkt.opcode = AINV; // (what should the default be? shouldn't happen) - endcase - end - else begin - if (load_info.is_byte_op) - cache_pkt.opcode = load_info.is_unsigned_op - ? LBU - : LB; - else if (load_info.is_hex_op) - cache_pkt.opcode = load_info.is_unsigned_op - ? LHU - : LH; - else begin - cache_pkt.opcode = LW; + unique case (packet_lo.op_v2) + e_remote_store, e_remote_sw: begin + cache_pkt.opcode = SM; + end + + e_remote_amoswap: begin + cache_pkt.opcode = AMOSWAP_W; end - end + + e_remote_amoor: begin + cache_pkt.opcode = AMOOR_W; + end + + e_remote_amoadd: begin + cache_pkt.opcode = AMOADD_W; + end + + e_cache_op: begin + case (packet_lo.reg_id.cache_op) + e_afl: cache_pkt.opcode = AFL; + e_aflinv: cache_pkt.opcode = AFLINV; + e_ainv: cache_pkt.opcode = AINV; + default: begin + cache_pkt.opcode = AFL; // should never happen + end + endcase + end + + e_remote_load: begin + if (load_info.is_byte_op) + cache_pkt.opcode = load_info.is_unsigned_op + ? LBU + : LB; + else if (load_info.is_hex_op) + cache_pkt.opcode = load_info.is_unsigned_op + ? 
LHU + : LH; + else begin + cache_pkt.opcode = LW; + end + end + // this should never happen. + default: begin + cache_pkt.opcode = AFL; + // synopsys translate_off + assert final(reset_i !== 1'b0 | ~packet_v_lo) else $error("[BSG_ERROR] Invalid packet op: %b", packet_lo.op_v2); + // synopsys translate_on + end + endcase end cache_pkt.data = packet_lo.payload; - cache_pkt.mask = packet_lo.op_ex; + cache_pkt.mask = (packet_lo.op_v2 == e_remote_sw) + ? 4'b1111 + : packet_lo.reg_id.store_mask_s.mask; cache_pkt.addr = { packet_lo.addr[0+:link_addr_width_p-1], - (packet_lo.op == e_remote_store | packet_lo.op == e_remote_amo | packet_lo.op == e_cache_op) - ? 2'b00 - : load_info.part_sel + (packet_lo.op_v2 == e_remote_load) ? load_info.part_sel : 2'b00 }; // return pkt @@ -309,11 +368,13 @@ module bsg_manycore_link_to_cache state_r <= RESET; tagst_sent_r <= '0; tagst_received_r <= '0; + wh_dest_east_not_west_r <= 1'b0; end else begin state_r <= state_n; tagst_sent_r <= tagst_sent_n; tagst_received_r <= tagst_received_n; + wh_dest_east_not_west_r <= wh_dest_east_not_west_n; end end diff --git a/v/bsg_manycore_link_to_cache_non_blocking.v b/v/bsg_manycore_link_to_cache_non_blocking.v index 8e53fa5f9..c0de52312 100644 --- a/v/bsg_manycore_link_to_cache_non_blocking.v +++ b/v/bsg_manycore_link_to_cache_non_blocking.v @@ -188,19 +188,19 @@ module bsg_manycore_link_to_cache_non_blocking packet_yumi_li = packet_v_lo & ready_i; if (packet_lo.addr[addr_width_p-1]) begin - case (packet_lo.op) - e_remote_store: cache_pkt.opcode = TAGST; - e_remote_load: cache_pkt.opcode = TAGLA; - e_cache_op: cache_pkt.opcode = TAGFL; - default: cache_pkt.opcode = TAGLA; // should never happen - endcase + case (packet_lo.op_v2) + e_remote_store: cache_pkt.opcode = TAGST; + e_remote_load: cache_pkt.opcode = TAGLA; + e_cache_op: cache_pkt.opcode = TAGFL; + default: cache_pkt.opcode = TAGLA; // should never happen + endcase end else begin - if (packet_lo.op == e_remote_store) begin + if (packet_lo.op_v2 
== e_remote_store) begin cache_pkt.opcode = SM; end - else if (packet_lo.op == e_cache_op) begin - case (packet_lo.op_ex.cache_op_type) + else if (packet_lo.op_v2 == e_cache_op) begin + case (packet_lo.reg_id.cache_op) e_afl: cache_pkt.opcode = AFL; e_aflinv: cache_pkt.opcode = AFLINV; e_ainv: cache_pkt.opcode = AINV; @@ -227,7 +227,7 @@ module bsg_manycore_link_to_cache_non_blocking packet_lo.addr[0+:addr_width_p-1], (packet_lo.op == e_remote_store) | (packet_lo.op == e_cache_op) ? 2'b00 : load_info.part_sel }; - cache_pkt.mask = packet_lo.op_ex; + cache_pkt.mask = packet_lo.store_mask_s.mask; cache_pkt_id.src_x = packet_lo.src_x_cord; cache_pkt_id.src_y = packet_lo.src_y_cord; diff --git a/v/bsg_manycore_lock_ctrl.v b/v/bsg_manycore_lock_ctrl.v index 99d651950..369ce80b1 100644 --- a/v/bsg_manycore_lock_ctrl.v +++ b/v/bsg_manycore_lock_ctrl.v @@ -41,8 +41,7 @@ module bsg_manycore_lock_ctrl , input [(data_width_p>>3)-1:0] in_mask_i , input [addr_width_p-1:0] in_addr_i , input in_we_i - , input in_amo_op_i - , input bsg_manycore_amo_type_e in_amo_type_i + , input in_amo_op_i // amoswap is assumed , input [x_cord_width_p-1:0] in_x_cord_i , input [y_cord_width_p-1:0] in_y_cord_i @@ -77,7 +76,7 @@ module bsg_manycore_lock_ctrl end else begin // only support amoswap - if (amo_op_yumi & (in_amo_type_i == e_amo_swap)) + if (amo_op_yumi) amo_lock_r <= in_data_i[0]; end end diff --git a/v/bsg_manycore_mesh_node.v b/v/bsg_manycore_mesh_node.v index c052923fb..10dd223a1 100644 --- a/v/bsg_manycore_mesh_node.v +++ b/v/bsg_manycore_mesh_node.v @@ -50,9 +50,9 @@ module bsg_manycore_mesh_node , input [bsg_manycore_link_sif_width_lp-1:0] proc_link_sif_i , output [bsg_manycore_link_sif_width_lp-1:0] proc_link_sif_o - // tile coordinates - , input [x_cord_width_p-1:0] my_x_i - , input [y_cord_width_p-1:0] my_y_i + // tile coordinates (relative to entire array of pods) + , input [x_cord_width_p-1:0] global_x_i + , input [y_cord_width_p-1:0] global_y_i ); @@ -93,8 +93,8 @@ module 
bsg_manycore_mesh_node ,.link_i(link_fwd_sif_li) ,.link_o(link_fwd_sif_lo) - ,.my_x_i(my_x_i) - ,.my_y_i(my_y_i) + ,.my_x_i(global_x_i) + ,.my_y_i(global_y_i) ); @@ -124,8 +124,8 @@ module bsg_manycore_mesh_node ,.link_i(link_rev_sif_li) ,.link_o(link_rev_sif_lo) - ,.my_x_i(my_x_i) - ,.my_y_i(my_y_i) + ,.my_x_i(global_x_i) + ,.my_y_i(global_y_i) ); diff --git a/v/bsg_manycore_pkg.v b/v/bsg_manycore_pkg.v index 9ab0ab445..3c749258f 100644 --- a/v/bsg_manycore_pkg.v +++ b/v/bsg_manycore_pkg.v @@ -22,11 +22,20 @@ package bsg_manycore_pkg; // request packet type // - typedef enum logic [1:0] { + typedef enum logic [3:0] { e_remote_load - , e_remote_store - , e_remote_amo + , e_remote_store // masked store (reg_id is used as store mask) + , e_remote_sw // store word (reg_id is used as a tracking id) , e_cache_op // AFL, AFLINV, AINV for DRAM addresses - TAGFL for tag memory + , e_remote_amoswap + , e_remote_amoadd + , e_remote_amoxor + , e_remote_amoand + , e_remote_amoor + , e_remote_amomin + , e_remote_amomax + , e_remote_amominu + , e_remote_amomaxu } bsg_manycore_packet_op_e; @@ -58,39 +67,26 @@ package bsg_manycore_pkg; logic [1:0] part_sel; } bsg_manycore_load_info_s; - typedef enum logic [3:0] { - e_amo_swap - ,e_amo_add - ,e_amo_xor - ,e_amo_and - ,e_amo_or - ,e_amo_min - ,e_amo_max - ,e_amo_minu - ,e_amo_maxu - } bsg_manycore_amo_type_e; - typedef enum logic [3:0] { + // e_cache_op subop + typedef enum logic [bsg_manycore_reg_id_width_gp-1:0] { e_afl ,e_ainv ,e_aflinv ,e_tagfl } bsg_manycore_cache_op_type_e; - typedef union packed { - bsg_manycore_cache_op_type_e cache_op_type; // for operations applied to victim cache - bsg_manycore_amo_type_e amo_type; // for remote atomic packet - logic [3:0] store_mask; // for remote store packet - } bsg_manycore_packet_op_ex_u; - // Declare fwd and rev packet // // Request Packet (fwd) // addr : EPA (word address) // op : packet opcode - // op_ex : opcode extension; for store, this is store mask. for amo, this is amo type. 
for cache_op, this is the cache op type. - // reg_id : for amo and int/float load, this is the rd. for store, this should be zero. + // reg_id : This field is unionized with bsg_manycore_packet_reg_id_u. + // For remote load/atomic (e_remote_load/e_remote_amo*), this field contains reg_id (rd), which gets returned by the return packet. + // For e_cache_op, this field contains bsg_manycore_cache_op_type_e. + // For e_remote_sw, this field contains reg_id. + // For e_remote_store, this field contains store mask. reg_id for tracking is placed in unmasked bytes of payload. // payload : for store and amo, this is the store data. for load, it contains load info. // src_y_cord : y-cord, origin of this packet // src_x_cord : x_cord, origin of this packet @@ -100,7 +96,10 @@ package bsg_manycore_pkg; // Return Packet (rev) // pkt_type : return pkt type // data : load data - // reg_id : rd for int and float load/ + // reg_id : reg_id in all responses should return the same reg_id in the request packet, + // except for e_remote_store and e_cache_op, where reg_id is used as store mask, or cache op. + // For e_remote_store and e_cache_op, some AND-OR decoding is neede to retrieve the reg_id from the payload. + // Basically, OR all the 5 LSBs of unmasked bytes. 
// y_cord : y-cord of the destination // x_cord : x-cord of the destination `define declare_bsg_manycore_packet_s(addr_width_mp,data_width_mp,x_cord_width_mp,y_cord_width_mp) \ @@ -119,12 +118,20 @@ package bsg_manycore_pkg; bsg_manycore_load_info_s load_info; \ } load_info_s; \ } bsg_manycore_packet_payload_u; \ + \ + typedef union packed { \ + bsg_manycore_cache_op_type_e cache_op; \ + struct packed { \ + logic [bsg_manycore_reg_id_width_gp-(data_width_mp>>3)-1:0] unused; \ + logic [(data_width_mp>>3)-1:0] mask; \ + } store_mask_s; \ + logic [bsg_manycore_reg_id_width_gp-1:0] reg_id; \ + } bsg_manycore_packet_reg_id_u; \ \ typedef struct packed { \ logic [addr_width_mp-1:0] addr; \ - bsg_manycore_packet_op_e op; \ - bsg_manycore_packet_op_ex_u op_ex; \ - logic [bsg_manycore_reg_id_width_gp-1:0] reg_id; \ + bsg_manycore_packet_op_e op_v2; \ + bsg_manycore_packet_reg_id_u reg_id; \ bsg_manycore_packet_payload_u payload; \ logic [y_cord_width_mp-1:0] src_y_cord; \ logic [x_cord_width_mp-1:0] src_x_cord; \ @@ -136,7 +143,7 @@ package bsg_manycore_pkg; ($bits(bsg_manycore_return_packet_type_e)+data_width_mp+bsg_manycore_reg_id_width_gp+x_cord_width_mp+y_cord_width_mp) `define bsg_manycore_packet_width(addr_width_mp,data_width_mp,x_cord_width_mp,y_cord_width_mp) \ - (addr_width_mp+$bits(bsg_manycore_packet_op_e)+$bits(bsg_manycore_packet_op_ex_u)+bsg_manycore_reg_id_width_gp+data_width_mp+(2*(y_cord_width_mp+x_cord_width_mp))) + (addr_width_mp+$bits(bsg_manycore_packet_op_e)+bsg_manycore_reg_id_width_gp+data_width_mp+(2*(y_cord_width_mp+x_cord_width_mp))) @@ -209,35 +216,55 @@ package bsg_manycore_pkg; link_rev[$bits(link_rev)-1:x_cord_width_mp+y_cord_width_mp], \ link_rev[x_cord_width_mp-1:0] \ } - + + // vcache DMA wormhole header flit format + `define declare_bsg_manycore_vcache_wh_header_flit_s(wh_flit_width_mp,wh_cord_width_mp,wh_len_width_mp,wh_cid_width_mp) \ + typedef struct packed { \ + logic 
[wh_flit_width_mp-(wh_cord_width_mp*2)-1-wh_len_width_mp-wh_cid_width_mp-1:0] unused; \ + logic write_not_read; \ + logic [wh_cord_width_mp-1:0] src_cord; \ + logic [wh_cid_width_mp-1:0] cid; \ + logic [wh_len_width_mp-1:0] len; \ + logic [wh_cord_width_mp-1:0] dest_cord; \ + } bsg_manycore_vcache_wh_header_flit_s + + + // manycore POD bsg_tag_client payload + // contains reset + typedef struct packed { + logic reset; + } bsg_manycore_pod_tag_payload_s; // EVA Address Format - localparam epa_word_addr_width_gp = 16; // max EPA width on vanilla core. (word addr) - localparam max_x_cord_width_gp = 6; - localparam max_y_cord_width_gp = 6; // global - `define declare_bsg_manycore_global_addr_s \ - typedef struct packed { \ - logic [1:0] remote; \ - logic [max_y_cord_width_gp-1:0] y_cord; \ - logic [max_x_cord_width_gp-1:0] x_cord; \ - logic [epa_word_addr_width_gp-1:0] addr; \ - logic [1:0] low_bits; \ - } bsg_manycore_global_addr_s; + localparam global_epa_word_addr_width_gp = 14; // max EPA width on global EVA. (word addr) + localparam max_global_x_cord_width_gp = 7; + localparam max_global_y_cord_width_gp = 7; + + typedef struct packed { + logic [1:0] remote; + logic [max_global_y_cord_width_gp-1:0] y_cord; + logic [max_global_x_cord_width_gp-1:0] x_cord; + logic [global_epa_word_addr_width_gp-1:0] addr; + logic [1:0] low_bits; + } bsg_manycore_global_addr_s; // tile-group - `define declare_bsg_manycore_tile_group_addr_s \ - typedef struct packed { \ - logic [2:0] remote; \ - logic [max_y_cord_width_gp-2:0] y_cord; \ - logic [max_x_cord_width_gp-1:0] x_cord; \ - logic [epa_word_addr_width_gp-1:0] addr; \ - logic [1:0] low_bits; \ - } bsg_manycore_tile_group_addr_s; + localparam tile_group_epa_word_addr_width_gp = 16; // max EPA width on tile-group EVA. 
(word addr) + localparam max_tile_group_x_cord_width_gp = 6; + localparam max_tile_group_y_cord_width_gp = 5; + + typedef struct packed { + logic [2:0] remote; + logic [max_tile_group_y_cord_width_gp-1:0] y_cord; + logic [max_tile_group_x_cord_width_gp-1:0] x_cord; + logic [tile_group_epa_word_addr_width_gp-1:0] addr; + logic [1:0] low_bits; + } bsg_manycore_tile_group_addr_s; diff --git a/v/bsg_manycore_pod_ruche.v b/v/bsg_manycore_pod_ruche.v new file mode 100644 index 000000000..573e528a7 --- /dev/null +++ b/v/bsg_manycore_pod_ruche.v @@ -0,0 +1,427 @@ +/** + * bsg_manycore_pod_ruche.v + * + * manycore pod with ruche network. + * + */ + + +`include "bsg_noc_links.vh" + +module bsg_manycore_pod_ruche + import bsg_noc_pkg::*; + import bsg_tag_pkg::*; + import bsg_manycore_pkg::*; + #(// number of tiles in a pod + parameter num_tiles_x_p="inv" + , parameter num_tiles_y_p="inv" + , parameter pod_x_cord_width_p="inv" + , parameter pod_y_cord_width_p="inv" + , parameter x_cord_width_p="inv" + , parameter y_cord_width_p="inv" + , parameter addr_width_p="inv" + , parameter data_width_p="inv" + + // This determines how to divide the pod into smaller hierarchical blocks. 
+ , parameter num_subarray_x_p="inv" + , parameter num_subarray_y_p="inv" + // Number of tiles in a subarray + , parameter subarray_num_tiles_x_lp = (num_tiles_x_p/num_subarray_x_p) + , parameter subarray_num_tiles_y_lp = (num_tiles_y_p/num_subarray_y_p) + + // coordinate width within a pod + , parameter x_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter y_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_y_p) + + , parameter dmem_size_p="inv" + , parameter icache_entries_p="inv" + , parameter icache_tag_width_p="inv" + + , parameter num_vcache_rows_p="inv" + , parameter vcache_addr_width_p="inv" + , parameter vcache_data_width_p="inv" + , parameter vcache_ways_p="inv" + , parameter vcache_sets_p="inv" + , parameter vcache_block_size_in_words_p="inv" + , parameter vcache_size_p="inv" + , parameter vcache_dma_data_width_p="inv" + + , parameter ruche_factor_X_p="inv" + + , parameter wh_ruche_factor_p="inv" + , parameter wh_cid_width_p="inv" + , parameter wh_flit_width_p="inv" + , parameter wh_cord_width_p="inv" + , parameter wh_len_width_p="inv" + + // number of clock ports on vcache/tile subarray + , parameter num_clk_ports_p=1 + + , parameter manycore_link_sif_width_lp = + `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + + , parameter manycore_ruche_link_sif_width_lp = + `bsg_manycore_ruche_x_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + + , parameter wh_link_sif_width_lp = + `bsg_ready_and_link_sif_width(wh_flit_width_p) + + // This is used to define heterogeneous arrays. Each index defines + // the type of an X/Y coordinate in the array. This is a vector of + // num_tiles_x_p*num_tiles_y_p ints; type "0" is the + // default. See bsg_manycore_hetero_socket.v for more types. 
+ `ifndef SYNTHESIS + , parameter int hetero_type_vec_p [0:(num_tiles_y_p*num_tiles_x_p) - 1] = '{default:0} + `endif + ) + ( + // manycore + input clk_i + , input [num_tiles_x_p-1:0] reset_i + + , input [E:W][num_tiles_y_p-1:0][manycore_link_sif_width_lp-1:0] hor_link_sif_i + , output [E:W][num_tiles_y_p-1:0][manycore_link_sif_width_lp-1:0] hor_link_sif_o + + , input [S:N][num_tiles_x_p-1:0][manycore_link_sif_width_lp-1:0] ver_link_sif_i + , output [S:N][num_tiles_x_p-1:0][manycore_link_sif_width_lp-1:0] ver_link_sif_o + + , input [E:W][num_tiles_y_p-1:0][ruche_factor_X_p-1:0][manycore_ruche_link_sif_width_lp-1:0] ruche_link_i + , output [E:W][num_tiles_y_p-1:0][ruche_factor_X_p-1:0][manycore_ruche_link_sif_width_lp-1:0] ruche_link_o + + + // vcache + , input [E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][wh_link_sif_width_lp-1:0] north_wh_link_sif_i + , output [E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][wh_link_sif_width_lp-1:0] north_wh_link_sif_o + + , input [E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][wh_link_sif_width_lp-1:0] south_wh_link_sif_i + , output [E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][wh_link_sif_width_lp-1:0] south_wh_link_sif_o + + + // pod cord (should be all same value for all columns) + , input [num_tiles_x_p-1:0][x_cord_width_p-1:0] global_x_i + , input [num_tiles_x_p-1:0][y_cord_width_p-1:0] global_y_i + ); + + + `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + `declare_bsg_manycore_ruche_x_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + `declare_bsg_ready_and_link_sif_s(wh_flit_width_p, wh_link_sif_s); + + + // vcache row (north) + logic [num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0] north_vc_reset_lo; + wh_link_sif_s [num_subarray_x_p-1:0][E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0] north_vc_wh_link_sif_li; + wh_link_sif_s [num_subarray_x_p-1:0][E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0] north_vc_wh_link_sif_lo; + 
bsg_manycore_link_sif_s [num_subarray_x_p-1:0][S:N][subarray_num_tiles_x_lp-1:0] north_vc_ver_link_sif_li; + bsg_manycore_link_sif_s [num_subarray_x_p-1:0][S:N][subarray_num_tiles_x_lp-1:0] north_vc_ver_link_sif_lo; + logic [num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][x_cord_width_p-1:0] north_vc_global_x_li; + logic [num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][y_cord_width_p-1:0] north_vc_global_y_li; + logic [num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][x_cord_width_p-1:0] north_vc_global_x_lo; + logic [num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][y_cord_width_p-1:0] north_vc_global_y_lo; + + for (genvar x = 0; x < num_subarray_x_p; x++) begin: north_vc_x + bsg_manycore_tile_vcache_array #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) + + ,.num_tiles_x_p(num_tiles_x_p) + ,.num_tiles_y_p(num_tiles_y_p) + + ,.subarray_num_tiles_x_p(subarray_num_tiles_x_lp) + + ,.num_vcache_rows_p(num_vcache_rows_p) + ,.vcache_addr_width_p(vcache_addr_width_p) + ,.vcache_data_width_p(vcache_data_width_p) + ,.vcache_ways_p(vcache_ways_p) + ,.vcache_sets_p(vcache_sets_p) + ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) + ,.vcache_dma_data_width_p(vcache_dma_data_width_p) + + ,.wh_ruche_factor_p(wh_ruche_factor_p) + ,.wh_cid_width_p(wh_cid_width_p) + ,.wh_flit_width_p(wh_flit_width_p) + ,.wh_len_width_p(wh_len_width_p) + ,.wh_cord_width_p(wh_cord_width_p) + + ,.num_clk_ports_p(num_clk_ports_p) + ) north_vc_row ( + .clk_i({num_clk_ports_p{clk_i}}) + + ,.reset_i(reset_i[(subarray_num_tiles_x_lp*x)+:subarray_num_tiles_x_lp]) + ,.reset_o(north_vc_reset_lo[x]) + + ,.wh_link_sif_i(north_vc_wh_link_sif_li[x]) + ,.wh_link_sif_o(north_vc_wh_link_sif_lo[x]) + + ,.ver_link_sif_i(north_vc_ver_link_sif_li[x]) + ,.ver_link_sif_o(north_vc_ver_link_sif_lo[x]) + + 
,.global_x_i(north_vc_global_x_li[x]) + ,.global_y_i(north_vc_global_y_li[x]) + ,.global_x_o(north_vc_global_x_lo[x]) + ,.global_y_o(north_vc_global_y_lo[x]) + ); + + + // connect coordinates + assign north_vc_global_x_li[x] = global_x_i[x*subarray_num_tiles_x_lp+:subarray_num_tiles_x_lp]; + assign north_vc_global_y_li[x] = global_y_i[x*subarray_num_tiles_x_lp+:subarray_num_tiles_x_lp]; + + // connect north ver link + assign ver_link_sif_o[N][(x*subarray_num_tiles_x_lp)+:subarray_num_tiles_x_lp] = north_vc_ver_link_sif_lo[x][N]; + assign north_vc_ver_link_sif_li[x][N] = ver_link_sif_i[N][(x*subarray_num_tiles_x_lp)+:subarray_num_tiles_x_lp]; + + // connect wh link to west + if (x == 0) begin + assign north_wh_link_sif_o[W] = north_vc_wh_link_sif_lo[x][W]; + assign north_vc_wh_link_sif_li[x][W] = north_wh_link_sif_i[W]; + end + + // connect wh link to east + if (x == num_subarray_x_p-1) begin + assign north_wh_link_sif_o[E] = north_vc_wh_link_sif_lo[x][E]; + assign north_vc_wh_link_sif_li[x][E] = north_wh_link_sif_i[E]; + end + + // connect wh links between vc arrays + if (x < num_subarray_x_p-1) begin + assign north_vc_wh_link_sif_li[x+1][W] = north_vc_wh_link_sif_lo[x][E]; + assign north_vc_wh_link_sif_li[x][E] = north_vc_wh_link_sif_lo[x+1][W]; + end + + end + + + + // manycore subarray + bsg_manycore_link_sif_s [num_subarray_y_p-1:0][num_subarray_x_p-1:0][E:W][subarray_num_tiles_y_lp-1:0] mc_hor_link_sif_li; + bsg_manycore_link_sif_s [num_subarray_y_p-1:0][num_subarray_x_p-1:0][E:W][subarray_num_tiles_y_lp-1:0] mc_hor_link_sif_lo; + bsg_manycore_link_sif_s [num_subarray_y_p-1:0][num_subarray_x_p-1:0][S:N][subarray_num_tiles_x_lp-1:0] mc_ver_link_sif_li; + bsg_manycore_link_sif_s [num_subarray_y_p-1:0][num_subarray_x_p-1:0][S:N][subarray_num_tiles_x_lp-1:0] mc_ver_link_sif_lo; + bsg_manycore_ruche_x_link_sif_s [num_subarray_y_p-1:0][num_subarray_x_p-1:0][E:W][subarray_num_tiles_y_lp-1:0][ruche_factor_X_p-1:0] mc_ruche_link_li; + bsg_manycore_ruche_x_link_sif_s 
[num_subarray_y_p-1:0][num_subarray_x_p-1:0][E:W][subarray_num_tiles_y_lp-1:0][ruche_factor_X_p-1:0] mc_ruche_link_lo; + logic [num_subarray_y_p-1:0][num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][x_cord_width_p-1:0] mc_global_x_li; + logic [num_subarray_y_p-1:0][num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][y_cord_width_p-1:0] mc_global_y_li; + logic [num_subarray_y_p-1:0][num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][x_cord_width_p-1:0] mc_global_x_lo; + logic [num_subarray_y_p-1:0][num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][y_cord_width_p-1:0] mc_global_y_lo; + + logic [num_subarray_y_p-1:0][num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0] mc_reset_li; + logic [num_subarray_y_p-1:0][num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0] mc_reset_lo; + + // Split the hetero_type_vec_p array into sub-arrays. + `ifndef SYNTHESIS + typedef int hetero_type_sub_vec[0:(subarray_num_tiles_y_lp*subarray_num_tiles_x_lp) - 1]; + function hetero_type_sub_vec get_subarray_hetero_type_vec(int y, int x); + hetero_type_sub_vec vec; + for (int sy_i = 0; sy_i < subarray_num_tiles_y_lp; sy_i++) begin + for (int sx_i = 0; sx_i < subarray_num_tiles_x_lp; sx_i++) begin + vec[sy_i*subarray_num_tiles_x_lp + sx_i] = hetero_type_vec_p[(sy_i + y * subarray_num_tiles_y_lp) * num_tiles_x_p + x * subarray_num_tiles_x_lp + sx_i]; + end + end + return vec; + endfunction + `endif + + for (genvar y = 0; y < num_subarray_y_p; y++) begin: mc_y + for (genvar x = 0; x < num_subarray_x_p; x++) begin: mc_x + bsg_manycore_tile_compute_array_ruche #( + .dmem_size_p(dmem_size_p) + ,.icache_entries_p(icache_entries_p) + ,.icache_tag_width_p(icache_tag_width_p) + + ,.num_vcache_rows_p(num_vcache_rows_p) + ,.vcache_size_p(vcache_size_p) + ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) + ,.vcache_sets_p(vcache_sets_p) + ,.num_tiles_x_p(num_tiles_x_p) + ,.num_tiles_y_p(num_tiles_y_p) + + ,.subarray_num_tiles_x_p(subarray_num_tiles_x_lp) + 
,.subarray_num_tiles_y_p(subarray_num_tiles_y_lp) + + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.ruche_factor_X_p(ruche_factor_X_p) + `ifndef SYNTHESIS + ,.hetero_type_vec_p(get_subarray_hetero_type_vec(y, x)) + `endif + ,.num_clk_ports_p(num_clk_ports_p) + ) mc ( + .clk_i({num_clk_ports_p{clk_i}}) + + ,.reset_i(mc_reset_li[y][x]) + ,.reset_o(mc_reset_lo[y][x]) + + ,.hor_link_sif_i(mc_hor_link_sif_li[y][x]) + ,.hor_link_sif_o(mc_hor_link_sif_lo[y][x]) + + ,.ver_link_sif_i(mc_ver_link_sif_li[y][x]) + ,.ver_link_sif_o(mc_ver_link_sif_lo[y][x]) + + ,.ruche_link_i(mc_ruche_link_li[y][x]) + ,.ruche_link_o(mc_ruche_link_lo[y][x]) + + ,.global_x_i(mc_global_x_li[y][x]) + ,.global_y_i(mc_global_y_li[y][x]) + ,.global_x_o(mc_global_x_lo[y][x]) + ,.global_y_o(mc_global_y_lo[y][x]) + ); + + // connect to north vcache + if (y == 0) begin + // ver link + assign north_vc_ver_link_sif_li[x][S] = mc_ver_link_sif_lo[y][x][N]; + assign mc_ver_link_sif_li[y][x][N] = north_vc_ver_link_sif_lo[x][S]; + // coordinates + assign mc_global_x_li[y][x] = north_vc_global_x_lo[x]; + assign mc_global_y_li[y][x] = north_vc_global_y_lo[x]; + // reset + assign mc_reset_li[y][x] = north_vc_reset_lo[x]; + end + + // connect ver links to the next row + if (y < num_subarray_y_p-1) begin + // ver link + assign mc_ver_link_sif_li[y+1][x][N] = mc_ver_link_sif_lo[y][x][S]; + assign mc_ver_link_sif_li[y][x][S] = mc_ver_link_sif_lo[y+1][x][N]; + // coordinates + assign mc_global_x_li[y+1][x] = mc_global_x_lo[y][x]; + assign mc_global_y_li[y+1][x] = mc_global_y_lo[y][x]; + // reset + assign mc_reset_li[y+1][x] = mc_reset_lo[y][x]; + end + + // connect to west + if (x == 0) begin + // local link + assign hor_link_sif_o[W][y*subarray_num_tiles_y_lp+:subarray_num_tiles_y_lp] = mc_hor_link_sif_lo[y][x][W]; + assign 
mc_hor_link_sif_li[y][x][W] = hor_link_sif_i[W][y*subarray_num_tiles_y_lp+:subarray_num_tiles_y_lp]; + // ruche link + assign ruche_link_o[W][y*subarray_num_tiles_y_lp+:subarray_num_tiles_y_lp] = mc_ruche_link_lo[y][x][W]; + assign mc_ruche_link_li[y][x][W] = ruche_link_i[W][y*subarray_num_tiles_y_lp+:subarray_num_tiles_y_lp]; + end + + // connect hor links to the next col + if (x < num_subarray_x_p-1) begin + // local + assign mc_hor_link_sif_li[y][x+1][W] = mc_hor_link_sif_lo[y][x][E]; + assign mc_hor_link_sif_li[y][x][E] = mc_hor_link_sif_lo[y][x+1][W]; + // ruche + assign mc_ruche_link_li[y][x+1][W] = mc_ruche_link_lo[y][x][E]; + assign mc_ruche_link_li[y][x][E] = mc_ruche_link_lo[y][x+1][W]; + end + + // connect to east + if (x == num_subarray_x_p-1) begin + // local + assign hor_link_sif_o[E][y*subarray_num_tiles_y_lp+:subarray_num_tiles_y_lp] = mc_hor_link_sif_lo[y][x][E]; + assign mc_hor_link_sif_li[y][x][E] = hor_link_sif_i[E][y*subarray_num_tiles_y_lp+:subarray_num_tiles_y_lp]; + // ruche + assign ruche_link_o[E][y*subarray_num_tiles_y_lp+:subarray_num_tiles_y_lp] = mc_ruche_link_lo[y][x][E]; + assign mc_ruche_link_li[y][x][E] = ruche_link_i[E][y*subarray_num_tiles_y_lp+:subarray_num_tiles_y_lp]; + end + + end + end + + + // vcache row (south) + logic [num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0] south_vc_reset_li; + wh_link_sif_s [num_subarray_x_p-1:0][E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0] south_vc_wh_link_sif_li; + wh_link_sif_s [num_subarray_x_p-1:0][E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0] south_vc_wh_link_sif_lo; + bsg_manycore_link_sif_s [num_subarray_x_p-1:0][S:N][subarray_num_tiles_x_lp-1:0] south_vc_ver_link_sif_li; + bsg_manycore_link_sif_s [num_subarray_x_p-1:0][S:N][subarray_num_tiles_x_lp-1:0] south_vc_ver_link_sif_lo; + logic [num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][x_cord_width_p-1:0] south_vc_global_x_li; + logic [num_subarray_x_p-1:0][subarray_num_tiles_x_lp-1:0][y_cord_width_p-1:0] 
south_vc_global_y_li; + + for (genvar x = 0; x < num_subarray_x_p; x++) begin: south_vc_x + bsg_manycore_tile_vcache_array #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) + + ,.num_tiles_x_p(num_tiles_x_p) + ,.num_tiles_y_p(num_tiles_y_p) + + ,.subarray_num_tiles_x_p(subarray_num_tiles_x_lp) + + ,.num_vcache_rows_p(num_vcache_rows_p) + ,.vcache_addr_width_p(vcache_addr_width_p) + ,.vcache_data_width_p(vcache_data_width_p) + ,.vcache_ways_p(vcache_ways_p) + ,.vcache_sets_p(vcache_sets_p) + ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) + ,.vcache_dma_data_width_p(vcache_dma_data_width_p) + + ,.wh_ruche_factor_p(wh_ruche_factor_p) + ,.wh_cid_width_p(wh_cid_width_p) + ,.wh_flit_width_p(wh_flit_width_p) + ,.wh_len_width_p(wh_len_width_p) + ,.wh_cord_width_p(wh_cord_width_p) + + ,.num_clk_ports_p(num_clk_ports_p) + ) south_vc_row ( + .clk_i({num_clk_ports_p{clk_i}}) + ,.reset_i(south_vc_reset_li[x]) + ,.reset_o() + + ,.wh_link_sif_i(south_vc_wh_link_sif_li[x]) + ,.wh_link_sif_o(south_vc_wh_link_sif_lo[x]) + + ,.ver_link_sif_i(south_vc_ver_link_sif_li[x]) + ,.ver_link_sif_o(south_vc_ver_link_sif_lo[x]) + + ,.global_x_i(south_vc_global_x_li[x]) + ,.global_y_i(south_vc_global_y_li[x]) + ,.global_x_o() + ,.global_y_o() + ); + + // connect reset + assign south_vc_reset_li[x] = mc_reset_lo[num_subarray_y_p-1][x]; + + // connect ver link to manycore + assign south_vc_ver_link_sif_li[x][N] = mc_ver_link_sif_lo[num_subarray_y_p-1][x][S]; + assign mc_ver_link_sif_li[num_subarray_y_p-1][x][S] = south_vc_ver_link_sif_lo[x][N]; + + // connect ver link to south + assign ver_link_sif_o[S][x*subarray_num_tiles_x_lp+:subarray_num_tiles_x_lp] = south_vc_ver_link_sif_lo[x][S]; + assign south_vc_ver_link_sif_li[x][S] = ver_link_sif_i[S][x*subarray_num_tiles_x_lp+:subarray_num_tiles_x_lp]; + + // coordinate 
+ assign south_vc_global_x_li[x] = mc_global_x_lo[num_subarray_y_p-1][x]; + assign south_vc_global_y_li[x] = mc_global_y_lo[num_subarray_y_p-1][x]; + + // connect wh link to west + if (x == 0) begin + assign south_wh_link_sif_o[W] = south_vc_wh_link_sif_lo[x][W]; + assign south_vc_wh_link_sif_li[x][W] = south_wh_link_sif_i[W]; + end + + // connect wh link to east + if (x == num_subarray_x_p-1) begin + assign south_wh_link_sif_o[E] = south_vc_wh_link_sif_lo[x][E]; + assign south_vc_wh_link_sif_li[x][E] = south_wh_link_sif_i[E]; + end + + // connect wh links between vc arrays + if (x < num_subarray_x_p-1) begin + assign south_vc_wh_link_sif_li[x+1][W] = south_vc_wh_link_sif_lo[x][E]; + assign south_vc_wh_link_sif_li[x][E] = south_vc_wh_link_sif_lo[x+1][W]; + end + + end + + +endmodule diff --git a/v/bsg_manycore_pod_ruche_array.v b/v/bsg_manycore_pod_ruche_array.v new file mode 100644 index 000000000..536db3efa --- /dev/null +++ b/v/bsg_manycore_pod_ruche_array.v @@ -0,0 +1,251 @@ +/** + * bsg_manycore_pod_ruche_array.v + * + * this module instantiates an array of pods and io routers on the left and right sides. 
+ * + */ + + +`include "bsg_noc_links.vh" + + +module bsg_manycore_pod_ruche_array + import bsg_noc_pkg::*; + import bsg_tag_pkg::*; + import bsg_manycore_pkg::*; + #(parameter num_tiles_x_p="inv" + , parameter num_tiles_y_p="inv" + , parameter pod_x_cord_width_p="inv" + , parameter pod_y_cord_width_p="inv" + , parameter x_cord_width_p="inv" + , parameter y_cord_width_p="inv" + , parameter addr_width_p="inv" + , parameter data_width_p="inv" + , parameter ruche_factor_X_p=3 // only support 3 for now + + , parameter num_subarray_x_p=1 + , parameter num_subarray_y_p=1 + + , parameter dmem_size_p="inv" + , parameter icache_entries_p="inv" + , parameter icache_tag_width_p="inv" + + , parameter num_vcache_rows_p=1 + , parameter vcache_addr_width_p="inv" + , parameter vcache_data_width_p="inv" + , parameter vcache_ways_p="inv" + , parameter vcache_sets_p="inv" + , parameter vcache_block_size_in_words_p="inv" + , parameter vcache_size_p="inv" + , parameter vcache_dma_data_width_p="inv" + + , parameter wh_ruche_factor_p=2 // only support 2 for now + , parameter wh_cid_width_p="inv" + , parameter wh_flit_width_p="inv" + , parameter wh_cord_width_p="inv" + , parameter wh_len_width_p="inv" + + // number of pods to instantiate + , parameter num_pods_y_p="inv" + , parameter num_pods_x_p="inv" + + , parameter reset_depth_p=3 + + , parameter x_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter y_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_y_p) + + + , parameter manycore_link_sif_width_lp = + `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + , parameter wh_link_sif_width_lp = + `bsg_ready_and_link_sif_width(wh_flit_width_p) + , parameter ruche_x_link_sif_width_lp = + `bsg_manycore_ruche_x_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + + // This is used to define heterogeneous arrays. Each index defines + // the type of an X/Y coordinate in the array. 
This is a vector of + // num_tiles_x_p*num_tiles_y_p ints; type "0" is the + // default. See bsg_manycore_hetero_socket.v for more types. + , parameter int hetero_type_vec_p [0:(num_tiles_y_p*num_tiles_x_p) - 1] = '{default:0} + ) + ( + input clk_i + + // vertical router links + , input [S:N][num_pods_x_p-1:0][num_tiles_x_p-1:0][manycore_link_sif_width_lp-1:0] ver_link_sif_i + , output [S:N][num_pods_x_p-1:0][num_tiles_x_p-1:0][manycore_link_sif_width_lp-1:0] ver_link_sif_o + + // vcache wormhole links + , input [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][wh_link_sif_width_lp-1:0] wh_link_sif_i + , output [E:W][num_pods_y_p-1:0][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][wh_link_sif_width_lp-1:0] wh_link_sif_o + + // horizontal local links + , input [E:W][num_pods_y_p-1:0][num_tiles_y_p-1:0][manycore_link_sif_width_lp-1:0] hor_link_sif_i + , output [E:W][num_pods_y_p-1:0][num_tiles_y_p-1:0][manycore_link_sif_width_lp-1:0] hor_link_sif_o + + // horizontal ruche links + , input [E:W][num_pods_y_p-1:0][num_tiles_y_p-1:0][ruche_x_link_sif_width_lp-1:0] ruche_link_i + , output [E:W][num_pods_y_p-1:0][num_tiles_y_p-1:0][ruche_x_link_sif_width_lp-1:0] ruche_link_o + + + // bsg_tag interface + // Each pod has one tag client for reset. 
+ , input bsg_tag_s [num_pods_y_p-1:0][num_pods_x_p-1:0] pod_tags_i + ); + + + + `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + `declare_bsg_manycore_ruche_x_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + `declare_bsg_ready_and_link_sif_s(wh_flit_width_p, wh_link_sif_s); + + bsg_manycore_link_sif_s [num_pods_y_p-1:0][E:W][num_tiles_y_p-1:0] hor_link_sif_li; + bsg_manycore_link_sif_s [num_pods_y_p-1:0][E:W][num_tiles_y_p-1:0] hor_link_sif_lo; + bsg_manycore_link_sif_s [num_pods_y_p-1:0][S:N][num_pods_x_p-1:0][num_tiles_x_p-1:0] ver_link_sif_li; + bsg_manycore_link_sif_s [num_pods_y_p-1:0][S:N][num_pods_x_p-1:0][num_tiles_x_p-1:0] ver_link_sif_lo; + + bsg_manycore_ruche_x_link_sif_s [num_pods_y_p-1:0][E:W][num_tiles_y_p-1:0] ruche_link_li; + bsg_manycore_ruche_x_link_sif_s [num_pods_y_p-1:0][E:W][num_tiles_y_p-1:0] ruche_link_lo; + + wh_link_sif_s [num_pods_y_p-1:0][E:W][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0] wh_link_sif_li; + wh_link_sif_s [num_pods_y_p-1:0][E:W][S:N][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0] wh_link_sif_lo; + + logic [num_pods_y_p-1:0][(num_pods_x_p*num_tiles_x_p)-1:0][x_cord_width_p-1:0] global_x_li; + logic [num_pods_y_p-1:0][(num_pods_x_p*num_tiles_x_p)-1:0][y_cord_width_p-1:0] global_y_li; + + logic [num_pods_y_p-1:0][num_pods_x_p-1:0] reset_lo; + logic [num_pods_y_p-1:0][num_pods_x_p-1:0][num_tiles_x_p-1:0] reset_r; + + // Instantiate pod rows + for (genvar y = 0; y < num_pods_y_p; y++) begin: py + for (genvar x = 0; x < num_pods_x_p; x++) begin: px + bsg_tag_client #( + .width_p($bits(bsg_manycore_pod_tag_payload_s)) + ,.default_p(0) + ) btc ( + .bsg_tag_i(pod_tags_i[y][x]) + ,.recv_clk_i(clk_i) + ,.recv_reset_i(1'b0) + ,.recv_new_r_o() + ,.recv_data_r_o(reset_lo[y][x]) + ); + bsg_dff_chain #( + .width_p(num_tiles_x_p) + ,.num_stages_p(reset_depth_p-1) + ) reset_dff ( + .clk_i(clk_i) + ,.data_i({num_tiles_x_p{reset_lo[y][x]}}) + ,.data_o(reset_r[y][x]) + ); 
+ end + + bsg_manycore_pod_ruche_row #( + .num_tiles_x_p(num_tiles_x_p) + ,.num_tiles_y_p(num_tiles_y_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.ruche_factor_X_p(ruche_factor_X_p) + ,.num_pods_x_p(num_pods_x_p) + + ,.num_subarray_x_p(num_subarray_x_p) + ,.num_subarray_y_p(num_subarray_y_p) + + ,.dmem_size_p(dmem_size_p) + ,.icache_entries_p(icache_entries_p) + ,.icache_tag_width_p(icache_tag_width_p) + + ,.num_vcache_rows_p(num_vcache_rows_p) + ,.vcache_addr_width_p(vcache_addr_width_p) + ,.vcache_data_width_p(vcache_data_width_p) + ,.vcache_ways_p(vcache_ways_p) + ,.vcache_sets_p(vcache_sets_p) + ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) + ,.vcache_size_p(vcache_size_p) + ,.vcache_dma_data_width_p(vcache_dma_data_width_p) + + ,.wh_ruche_factor_p(wh_ruche_factor_p) + ,.wh_cid_width_p(wh_cid_width_p) + ,.wh_flit_width_p(wh_flit_width_p) + ,.wh_cord_width_p(wh_cord_width_p) + ,.wh_len_width_p(wh_len_width_p) + + ,.hetero_type_vec_p(hetero_type_vec_p) + ) podrow ( + .clk_i(clk_i) + ,.reset_i(reset_r[y]) + + ,.hor_link_sif_i(hor_link_sif_li[y]) + ,.hor_link_sif_o(hor_link_sif_lo[y]) + ,.ver_link_sif_i(ver_link_sif_li[y]) + ,.ver_link_sif_o(ver_link_sif_lo[y]) + ,.ruche_link_i(ruche_link_li[y]) + ,.ruche_link_o(ruche_link_lo[y]) + + ,.wh_link_sif_i(wh_link_sif_li[y]) + ,.wh_link_sif_o(wh_link_sif_lo[y]) + + ,.global_x_i(global_x_li[y]) + ,.global_y_i(global_y_li[y]) + ); + + // assign global_x/y + for (genvar i = 0; i < num_tiles_x_p*num_pods_x_p; i++) begin + assign global_x_li[y][i] = { (pod_x_cord_width_p)'((i/num_tiles_x_p)+1), (x_subcord_width_lp)'(i%num_tiles_x_p) }; + assign global_y_li[y][i] = { (pod_y_cord_width_p)'(y*2), (y_subcord_width_lp)'((1< 0) & (ruche_stage_p % 2 == ((ruche_factor_X_p % 2 == 0) ? 
1 : 0)) - + , parameter bit invert_output_lp = (ruche_stage_p > 0) + & (ruche_factor_even_lp + ? ~ruche_stage_even_lp + : (west_not_east_p + ? ruche_stage_even_lp + : ~ruche_stage_even_lp)) + , parameter bit invert_input_lp = (ruche_stage_p > 0) + & (ruche_factor_even_lp + ? ~ruche_stage_even_lp + : (west_not_east_p + ? ~ruche_stage_even_lp + : ruche_stage_even_lp)) +/* + , parameter bit invert_output_lp = (ruche_stage_p > 0) + & (west_not_east_p + ? (ruche_factor_X_even_lp ^ ruche_stage_even_lp) + : (ruche_factor_X_even_lp ^ ~ruche_stage_even_lp)) + , parameter bit invert_input_lp = (ruche_stage_p > 0) + & (west_not_east_p + ? (ruche_factor_X_even_lp ^ ~ruche_stage_even_lp) + : (ruche_factor_X_even_lp ^ ruche_stage_even_lp)) +*/ , parameter ruche_x_link_sif_width_lp= `bsg_manycore_ruche_x_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) ) @@ -40,7 +61,7 @@ module bsg_manycore_ruche_x_link_sif_tieoff `declare_bsg_manycore_ruche_x_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); bsg_manycore_ruche_x_link_sif_s ruche_link_in; assign ruche_link_in = ruche_link_i; - assign ruche_link_o = invert_lp ? '1 : '0; + assign ruche_link_o = invert_output_lp ? '1 : '0; // synopsys translate_off @@ -57,11 +78,11 @@ module bsg_manycore_ruche_x_link_sif_tieoff always_ff @ (negedge clk_i) begin if (~reset_i) begin - if (invert_lp ^ ruche_link_in.fwd.v) + if (invert_input_lp ^ ruche_link_in.fwd.v) $error("[BSG_ERROR] Errant fwd packet detected. src_x=%0d, dest_y=%0d, dest_x=%0d.", fwd_src_x, fwd_dest_y, fwd_dest_x); - if (invert_lp ^ ruche_link_in.rev.v) + if (invert_input_lp ^ ruche_link_in.rev.v) $error("[BSG_ERROR] Errant rev packet detected. 
dest_x=%0d.", rev_dest_x); end end diff --git a/v/bsg_manycore_top_mesh.v b/v/bsg_manycore_tile_compute_array_mesh.v similarity index 88% rename from v/bsg_manycore_top_mesh.v rename to v/bsg_manycore_tile_compute_array_mesh.v index 69af10173..e4ce940d5 100644 --- a/v/bsg_manycore_top_mesh.v +++ b/v/bsg_manycore_tile_compute_array_mesh.v @@ -1,10 +1,10 @@ /** - * bsg_manycore_top_mesh.v + * bsg_manycore_tile_compute_array_mesh.v * */ -module bsg_manycore_top_mesh +module bsg_manycore_tile_compute_array_mesh import bsg_manycore_pkg::*; import bsg_noc_pkg::*; // {P=0, W,E,N,S } #(parameter dmem_size_p = "inv" // number of words in DMEM @@ -36,14 +36,20 @@ module bsg_manycore_top_mesh // Enable branch/jalr trace , parameter branch_trace_en_p = 0 + // x-coordinate of the leftmost tiles + // This can be set to 1 or greater to allow attaching accelerators on the left side. + , parameter start_x_cord_p = 0 + // y = 0 top vcache // y = 1 IO routers // y = num_tiles_y_p+1 bottom vcache , parameter y_cord_width_lp = `BSG_SAFE_CLOG2(num_tiles_y_p+2) - , parameter x_cord_width_lp = `BSG_SAFE_CLOG2(num_tiles_x_p) + + // By default, x-coordinate is clog2(num_tiles_x_p), but it can be set to greater value to allow attaching accelerators on the side. + , parameter x_cord_width_p = `BSG_SAFE_CLOG2(start_x_cord_p+num_tiles_x_p) , parameter link_sif_width_lp = - `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_lp,y_cord_width_lp) + `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_lp) // The number of registers between the reset_i port and the reset sinks // Must be >= 1 @@ -116,7 +122,7 @@ module bsg_manycore_top_mesh // Instantiate tiles. 
- `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_lp,y_cord_width_lp); + `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_lp); bsg_manycore_link_sif_s [num_tiles_y_p-1:0][num_tiles_x_p-1:0][S:W] link_in; bsg_manycore_link_sif_s [num_tiles_y_p-1:0][num_tiles_x_p-1:0][S:W] link_out; @@ -128,7 +134,8 @@ module bsg_manycore_top_mesh ,.vcache_size_p (vcache_size_p) ,.icache_entries_p(icache_entries_p) ,.icache_tag_width_p(icache_tag_width_p) - ,.x_cord_width_p(x_cord_width_lp) + ,.start_x_cord_p(start_x_cord_p) + ,.x_cord_width_p(x_cord_width_p) ,.y_cord_width_p(y_cord_width_lp) ,.data_width_p(data_width_p) ,.addr_width_p(addr_width_p) @@ -146,7 +153,7 @@ module bsg_manycore_top_mesh ,.link_i(link_in[r-1][c]) ,.link_o(link_out[r-1][c]) - ,.my_x_i(x_cord_width_lp'(c)) + ,.my_x_i(x_cord_width_p'(c+start_x_cord_p)) ,.my_y_i(y_cord_width_lp'(r)) ); end @@ -156,7 +163,7 @@ module bsg_manycore_top_mesh // Instantiate IO routers. for (genvar c = 0; c < num_tiles_x_p; c=c+1) begin: io bsg_manycore_mesh_node #( - .x_cord_width_p (x_cord_width_lp ) + .x_cord_width_p (x_cord_width_p ) ,.y_cord_width_p (y_cord_width_lp ) ,.data_width_p (data_width_p ) ,.addr_width_p (addr_width_p ) @@ -171,7 +178,7 @@ module bsg_manycore_top_mesh ,.proc_link_sif_o ( io_link_sif_o [ c ]) // tile coordinates - ,.my_x_i ( x_cord_width_lp'(c)) + ,.my_x_i ( x_cord_width_p'(c+start_x_cord_p)) ,.my_y_i ( y_cord_width_lp'(1)) ); end diff --git a/v/bsg_manycore_tile_compute_array_ruche.v b/v/bsg_manycore_tile_compute_array_ruche.v new file mode 100644 index 000000000..5457b09a2 --- /dev/null +++ b/v/bsg_manycore_tile_compute_array_ruche.v @@ -0,0 +1,289 @@ +/** + * bsg_manycore_tile_compute_array_ruche.v + * + * A compute tile with 2D mesh router with half ruche x. 
+ * + */ + + +module bsg_manycore_tile_compute_array_ruche + import bsg_manycore_pkg::*; + import bsg_noc_pkg::*; // {P=0, W,E,N,S } + #(parameter dmem_size_p = "inv" // number of words in DMEM + , parameter icache_entries_p = "inv" // in words + , parameter icache_tag_width_p = "inv" + + , parameter num_vcache_rows_p = "inv" + , parameter vcache_size_p = "inv" // capacity per vcache in words + , parameter vcache_block_size_in_words_p ="inv" + , parameter vcache_sets_p = "inv" + + // change the default values from "inv" back to -1 + // since num_tiles_x_p and num_tiles_y_p will be used to define the size of 2D array + // hetero_type_vec_p, they should be int by default to avoid tool crash during + // synthesis (DC versions at least up to 2018.06) + + // Number of tiles in the entire pod + , parameter int num_tiles_x_p = -1 + , parameter int num_tiles_y_p = -1 + + // Number of tiles in this subarray. + , parameter subarray_num_tiles_x_p = -1 + , parameter subarray_num_tiles_y_p = -1 + + // This is used to define heterogeneous arrays. Each index defines + // the type of an X/Y coordinate in the array. This is a vector of + // num_tiles_x_p*num_tiles_y_p ints; type "0" is the + // default. See bsg_manycore_hetero_socket.v for more types. + , parameter int hetero_type_vec_p [0:(subarray_num_tiles_y_p*subarray_num_tiles_x_p) - 1] = '{default:0} + + // this is the addr width on the manycore network packet (word addr). + // also known as endpoint physical address (EPA). + , parameter addr_width_p = "inv" + , parameter data_width_p = "inv" // 32 + + // default ruche factor + , parameter ruche_factor_X_p=3 + + // global coordinate width + // global_x/y_i + // pod_*_cord_width_p and *_subcord_width_p should sum up to *_cord_width_p. 
+ , parameter y_cord_width_p = -1 + , parameter x_cord_width_p = -1 + + // pod coordinate width + // pod_x/y_i + , parameter pod_y_cord_width_p = -1 + , parameter pod_x_cord_width_p = -1 + + , parameter num_clk_ports_p=1 + + // coordinate within a pod + // my_x/y_i + // A multiple of these modules can be instantiated within a pod as a subarray to form a larger array. + , parameter y_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_y_p) + , parameter x_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_x_p) + + + , parameter link_sif_width_lp = + `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + + , parameter ruche_x_link_sif_width_lp = + `bsg_manycore_ruche_x_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + + // The number of registers between the reset_i port and the reset sinks + // Must be >= 1 + , parameter reset_depth_p = 3 + + // enable debugging + , parameter debug_p = 0 + ) + ( + input [num_clk_ports_p-1:0] clk_i + + , input [subarray_num_tiles_x_p-1:0] reset_i + , output logic [subarray_num_tiles_x_p-1:0] reset_o + + // horizontal -- {E,W} + , input [E:W][subarray_num_tiles_y_p-1:0][link_sif_width_lp-1:0] hor_link_sif_i + , output [E:W][subarray_num_tiles_y_p-1:0][link_sif_width_lp-1:0] hor_link_sif_o + + // vertical -- {S,N} + , input [S:N][subarray_num_tiles_x_p-1:0][link_sif_width_lp-1:0] ver_link_sif_i + , output [S:N][subarray_num_tiles_x_p-1:0][link_sif_width_lp-1:0] ver_link_sif_o + + // ruche link + , input [E:W][subarray_num_tiles_y_p-1:0][ruche_factor_X_p-1:0][ruche_x_link_sif_width_lp-1:0] ruche_link_i + , output [E:W][subarray_num_tiles_y_p-1:0][ruche_factor_X_p-1:0][ruche_x_link_sif_width_lp-1:0] ruche_link_o + + + , input [subarray_num_tiles_x_p-1:0][x_cord_width_p-1:0] global_x_i + , input [subarray_num_tiles_x_p-1:0][y_cord_width_p-1:0] global_y_i + , output [subarray_num_tiles_x_p-1:0][x_cord_width_p-1:0] global_x_o + , output [subarray_num_tiles_x_p-1:0][y_cord_width_p-1:0] global_y_o + ); 
+ + // synopsys translate_off + initial begin + assert ((subarray_num_tiles_x_p > 0) && (subarray_num_tiles_y_p > 0)) + else $error("subarray_num_tiles_x_p and subarray_num_tiles_y_p must be positive constants"); + $display("## ----------------------------------------------------------------"); + $display("## MANYCORE HETERO TYPE CONFIGURATIONS"); + $display("## ----------------------------------------------------------------"); + for (integer i=0; i < subarray_num_tiles_y_p; i++) begin + $write("## "); + for(integer j=0; j < subarray_num_tiles_x_p; j++) begin + $write("%0d,", hetero_type_vec_p[i * subarray_num_tiles_x_p + j]); + end + $write("\n"); + end + $display("## ----------------------------------------------------------------"); + end + // synopsys translate_on + + + + + + // Instantiate tiles. + `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + bsg_manycore_link_sif_s [subarray_num_tiles_y_p-1:0][subarray_num_tiles_x_p-1:0][S:W] link_in; + bsg_manycore_link_sif_s [subarray_num_tiles_y_p-1:0][subarray_num_tiles_x_p-1:0][S:W] link_out; + + `declare_bsg_manycore_ruche_x_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + bsg_manycore_ruche_x_link_sif_s [subarray_num_tiles_y_p-1:0][subarray_num_tiles_x_p-1:0][ruche_factor_X_p-1:0][E:W] ruche_link_in; + bsg_manycore_ruche_x_link_sif_s [subarray_num_tiles_y_p-1:0][subarray_num_tiles_x_p-1:0][ruche_factor_X_p-1:0][E:W] ruche_link_out; + + logic [subarray_num_tiles_y_p-1:0][subarray_num_tiles_x_p-1:0][x_cord_width_p-1:0] global_x_li, global_x_lo; + logic [subarray_num_tiles_y_p-1:0][subarray_num_tiles_x_p-1:0][y_cord_width_p-1:0] global_y_li, global_y_lo; + + logic [subarray_num_tiles_y_p-1:0][subarray_num_tiles_x_p-1:0] reset_li, reset_lo; + + for (genvar r = 0; r < subarray_num_tiles_y_p; r++) begin: y + for (genvar c = 0; c < subarray_num_tiles_x_p; c++) begin: x + bsg_manycore_tile_compute_ruche #( + .dmem_size_p(dmem_size_p) + ,.vcache_size_p(vcache_size_p) + 
,.icache_entries_p(icache_entries_p) + ,.icache_tag_width_p(icache_tag_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) + ,.data_width_p(data_width_p) + ,.addr_width_p(addr_width_p) + ,.hetero_type_p(hetero_type_vec_p[(r*subarray_num_tiles_x_p)+c]) + ,.debug_p(debug_p) + ,.num_tiles_x_p(num_tiles_x_p) + ,.num_tiles_y_p(num_tiles_y_p) + ,.num_vcache_rows_p(num_vcache_rows_p) + ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) + ,.vcache_sets_p(vcache_sets_p) + ,.ruche_factor_X_p(ruche_factor_X_p) + ) tile ( + .clk_i(clk_i[c/(subarray_num_tiles_x_p/num_clk_ports_p)]) + + ,.reset_i(reset_li[r][c]) + ,.reset_o(reset_lo[r][c]) + + ,.link_i(link_in[r][c]) + ,.link_o(link_out[r][c]) + + ,.ruche_link_i(ruche_link_in[r][c]) + ,.ruche_link_o(ruche_link_out[r][c]) + + ,.global_x_i(global_x_li[r][c]) + ,.global_y_i(global_y_li[r][c]) + + ,.global_x_o(global_x_lo[r][c]) + ,.global_y_o(global_y_lo[r][c]) + ); + + // connect north + if (r == 0) begin + assign global_x_li[r][c] = global_x_i[c]; + assign global_y_li[r][c] = global_y_i[c]; + + assign reset_li[r][c] = reset_i[c]; + end + + // connect south + if (r == subarray_num_tiles_y_p-1) begin + assign global_x_o[c] = global_x_lo[r][c]; + assign global_y_o[c] = global_y_lo[r][c]; + + assign reset_o[c] = reset_lo[r][c]; + end + + // connect between rows + if (r < subarray_num_tiles_y_p-1) begin + assign global_x_li[r+1][c] = global_x_lo[r][c]; + assign global_y_li[r+1][c] = global_y_lo[r][c]; + + assign reset_li[r+1][c] = reset_lo[r][c]; + end + + end + end + + + // stitch together all of the tiles into a mesh + bsg_mesh_stitch #( + .width_p(link_sif_width_lp) + ,.x_max_p(subarray_num_tiles_x_p) + ,.y_max_p(subarray_num_tiles_y_p) + ) link ( + .outs_i(link_out) + ,.ins_o(link_in) + ,.hor_i(hor_link_sif_i) + ,.hor_o(hor_link_sif_o) + ,.ver_i(ver_link_sif_i) + ,.ver_o(ver_link_sif_o) + ); + + + // stitch 
ruche links + for (genvar r = 0; r < subarray_num_tiles_y_p; r++) begin: rr + for (genvar c = 0; c < subarray_num_tiles_x_p; c++) begin: rc + for (genvar l = 0; l < ruche_factor_X_p; l++) begin: rl // ruche stage + if (c == subarray_num_tiles_x_p-1) begin: cl + bsg_ruche_buffer #( + .width_p(ruche_x_link_sif_width_lp) + ,.ruche_factor_p(ruche_factor_X_p) + ,.ruche_stage_p(l) + ,.harden_p(1) + ) rb_w ( + .i(ruche_link_i[E][r][l]) + ,.o(ruche_link_in[r][c][(l+ruche_factor_X_p-1) % ruche_factor_X_p][E]) + ); + + bsg_ruche_buffer #( + .width_p(ruche_x_link_sif_width_lp) + ,.ruche_factor_p(ruche_factor_X_p) + ,.ruche_stage_p(l) + ,.harden_p(1) + ) rb_e ( + .i(ruche_link_out[r][c][l][E]) + ,.o(ruche_link_o[E][r][(l+1)%ruche_factor_X_p]) + ); + end + else begin: cn + bsg_ruche_buffer #( + .width_p(ruche_x_link_sif_width_lp) + ,.ruche_factor_p(ruche_factor_X_p) + ,.ruche_stage_p(l) + ,.harden_p(1) + ) rb_w ( + .i(ruche_link_out[r][c+1][l][W]) + ,.o(ruche_link_in[r][c][(l+ruche_factor_X_p-1) % ruche_factor_X_p][E]) + ); + + bsg_ruche_buffer #( + .width_p(ruche_x_link_sif_width_lp) + ,.ruche_factor_p(ruche_factor_X_p) + ,.ruche_stage_p(l) + ,.harden_p(1) + ) rb_e ( + .i(ruche_link_out[r][c][l][E]) + ,.o(ruche_link_in[r][c+1][(l+1)%ruche_factor_X_p][W]) + ); + end + end + end + end + + + // edge ruche links + for (genvar r = 0; r < subarray_num_tiles_y_p; r++) begin: er + for (genvar l = 0; l < ruche_factor_X_p; l++) begin: el + // west + assign ruche_link_o[W][r][l] = ruche_link_out[r][0][l][W]; + assign ruche_link_in[r][0][l][W] = ruche_link_i[W][r][l]; + // east + //assign ruche_link_o[E][r][l] = ruche_link_out[r][num_tiles_x_p-1][l][E]; + //assign ruche_link_in[r][num_tiles_x_p-1][l][E] = ruche_link_i[E][r][l]; + end + end + + +endmodule diff --git a/v/bsg_manycore_tile_mesh.v b/v/bsg_manycore_tile_compute_mesh.v similarity index 95% rename from v/bsg_manycore_tile_mesh.v rename to v/bsg_manycore_tile_compute_mesh.v index 8cce9ee56..88d58011d 100644 --- 
a/v/bsg_manycore_tile_mesh.v +++ b/v/bsg_manycore_tile_compute_mesh.v @@ -1,15 +1,17 @@ /** - * bsg_manycore_tile_mesh.v + * bsg_manycore_tile_compute_mesh.v * + * A compute tile with 2D mesh router */ -module bsg_manycore_tile_mesh +module bsg_manycore_tile_compute_mesh import bsg_noc_pkg::*; // { P=0, W,E,N,S } import bsg_manycore_pkg::*; #(parameter dmem_size_p = "inv" , parameter vcache_size_p ="inv" , parameter icache_entries_p = "inv" , parameter icache_tag_width_p = "inv" + , parameter start_x_cord_p ="inv" , parameter x_cord_width_p = "inv" , parameter y_cord_width_p = "inv" , parameter num_tiles_x_p="inv" @@ -111,6 +113,7 @@ module bsg_manycore_tile_mesh ,.data_width_p(data_width_p) ,.addr_width_p(addr_width_p) + ,.start_x_cord_p(start_x_cord_p) ,.dmem_size_p(dmem_size_p) ,.vcache_size_p(vcache_size_p) ,.icache_entries_p(icache_entries_p) diff --git a/v/bsg_manycore_tile_ruche.v b/v/bsg_manycore_tile_compute_ruche.v similarity index 77% rename from v/bsg_manycore_tile_ruche.v rename to v/bsg_manycore_tile_compute_ruche.v index 9eb868ff8..47b768f59 100644 --- a/v/bsg_manycore_tile_ruche.v +++ b/v/bsg_manycore_tile_compute_ruche.v @@ -1,9 +1,9 @@ /** - * bsg_manycore_tile_ruche.v + * bsg_manycore_tile_compute_ruche.v * */ -module bsg_manycore_tile_ruche +module bsg_manycore_tile_compute_ruche import bsg_noc_pkg::*; // { P=0, W,E,N,S } import bsg_manycore_pkg::*; #(parameter dmem_size_p = "inv" @@ -12,14 +12,21 @@ module bsg_manycore_tile_ruche , parameter icache_tag_width_p = "inv" , parameter x_cord_width_p = "inv" , parameter y_cord_width_p = "inv" + , parameter pod_x_cord_width_p = "inv" + , parameter pod_y_cord_width_p = "inv" + + // Number of tiles in a pod , parameter num_tiles_x_p="inv" , parameter num_tiles_y_p="inv" + , parameter x_subcord_width_lp = `BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter y_subcord_width_lp = `BSG_SAFE_CLOG2(num_tiles_y_p) , parameter ruche_factor_X_p = 3 , parameter data_width_p = "inv" , parameter addr_width_p = "inv" + , 
parameter num_vcache_rows_p = "inv" , parameter vcache_block_size_in_words_p="inv" , parameter vcache_sets_p="inv" @@ -31,32 +38,29 @@ module bsg_manycore_tile_ruche , parameter hetero_type_p = 0 , parameter debug_p = 0 - , parameter branch_trace_en_p = 0 - , parameter link_sif_width_lp = `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) - , parameter ruche_x_link_sif_width_lp = `bsg_manycore_ruche_x_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) ) ( input clk_i , input reset_i - + , output logic reset_o // local links , input [S:W][link_sif_width_lp-1:0] link_i , output [S:W][link_sif_width_lp-1:0] link_o - // ruche links , input [ruche_factor_X_p-1:0][E:W][ruche_x_link_sif_width_lp-1:0] ruche_link_i , output [ruche_factor_X_p-1:0][E:W][ruche_x_link_sif_width_lp-1:0] ruche_link_o - // tile coordinates - , input [x_cord_width_p-1:0] my_x_i - , input [y_cord_width_p-1:0] my_y_i + , input [x_cord_width_p-1:0] global_x_i + , input [y_cord_width_p-1:0] global_y_i + , output logic [x_cord_width_p-1:0] global_x_o + , output logic [y_cord_width_p-1:0] global_y_o ); @@ -74,6 +78,34 @@ module bsg_manycore_tile_ruche ,.data_o(reset_r) ); + assign reset_o = reset_r; + + // feedthrough coordinate bits + logic [x_subcord_width_lp-1:0] my_x_r; + logic [y_subcord_width_lp-1:0] my_y_r; + logic [pod_x_cord_width_p-1:0] pod_x_r; + logic [pod_y_cord_width_p-1:0] pod_y_r; + + + bsg_dff #( + .width_p(x_cord_width_p) + ) dff_x ( + .clk_i(clk_i) + ,.data_i(global_x_i) + ,.data_o({pod_x_r, my_x_r}) + ); + + bsg_dff #( + .width_p(y_cord_width_p) + ) dff_y ( + .clk_i(clk_i) + ,.data_i(global_y_i) + ,.data_o({pod_y_r, my_y_r}) + ); + + assign global_x_o = {pod_x_r, my_x_r}; + assign global_y_o = (y_cord_width_p)'(({pod_y_r, my_y_r}) + 1); + // For vanilla core (hetero type = 0), it uses credit interface for the P ports, // which has three-element fifo because the credit returns with one extra cycle delay. 
@@ -113,16 +145,17 @@ module bsg_manycore_tile_ruche ,.links_sif_o(links_sif_lo) ,.proc_link_sif_i(proc_link_sif_li) ,.proc_link_sif_o(proc_link_sif_lo) - ,.my_x_i(my_x_i) - ,.my_y_i(my_y_i) + ,.global_x_i({pod_x_r, my_x_r}) + ,.global_y_i({pod_y_r, my_y_r}) ); bsg_manycore_hetero_socket #( .x_cord_width_p(x_cord_width_p) ,.y_cord_width_p(y_cord_width_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) ,.data_width_p(data_width_p) ,.addr_width_p(addr_width_p) - ,.dmem_size_p(dmem_size_p) ,.vcache_size_p(vcache_size_p) ,.icache_entries_p(icache_entries_p) @@ -130,12 +163,10 @@ module bsg_manycore_tile_ruche ,.hetero_type_p(hetero_type_p) ,.num_tiles_x_p(num_tiles_x_p) ,.num_tiles_y_p(num_tiles_y_p) + ,.num_vcache_rows_p(num_vcache_rows_p) ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) ,.vcache_sets_p(vcache_sets_p) ,.fwd_fifo_els_p(fwd_fifo_els_lp[0]) - - ,.branch_trace_en_p(branch_trace_en_p) - ,.debug_p(debug_p) ) proc ( .clk_i(clk_i) @@ -144,8 +175,11 @@ module bsg_manycore_tile_ruche ,.link_sif_i(proc_link_sif_lo) ,.link_sif_o(proc_link_sif_li) - ,.my_x_i(my_x_i) - ,.my_y_i(my_y_i) + ,.pod_x_i(pod_x_r) + ,.pod_y_i(pod_y_r) + + ,.my_x_i(my_x_r) + ,.my_y_i(my_y_r) ); @@ -164,13 +198,13 @@ module bsg_manycore_tile_ruche // For incoming fwd, inject my_y_i as src_y. // For incoming rev, inject my_y_i as dest_y. 
assign links_sif_li[5].fwd = - `bsg_manycore_ruche_x_link_fwd_inject_src_y(x_cord_width_p,y_cord_width_p,ruche_link_li[0][E].fwd,my_y_i); + `bsg_manycore_ruche_x_link_fwd_inject_src_y(x_cord_width_p,y_cord_width_p,ruche_link_li[0][E].fwd,{pod_y_r, my_y_r}); assign links_sif_li[5].rev = - `bsg_manycore_ruche_x_link_rev_inject_dest_y(x_cord_width_p,y_cord_width_p,ruche_link_li[0][E].rev,my_y_i); + `bsg_manycore_ruche_x_link_rev_inject_dest_y(x_cord_width_p,y_cord_width_p,ruche_link_li[0][E].rev,{pod_y_r, my_y_r}); assign links_sif_li[4].fwd = - `bsg_manycore_ruche_x_link_fwd_inject_src_y(x_cord_width_p,y_cord_width_p,ruche_link_li[0][W].fwd,my_y_i); + `bsg_manycore_ruche_x_link_fwd_inject_src_y(x_cord_width_p,y_cord_width_p,ruche_link_li[0][W].fwd,{pod_y_r, my_y_r}); assign links_sif_li[4].rev = - `bsg_manycore_ruche_x_link_rev_inject_dest_y(x_cord_width_p,y_cord_width_p,ruche_link_li[0][W].rev,my_y_i); + `bsg_manycore_ruche_x_link_rev_inject_dest_y(x_cord_width_p,y_cord_width_p,ruche_link_li[0][W].rev,{pod_y_r, my_y_r}); // For outgoing fwd, filter out src_y. diff --git a/v/bsg_manycore_tile_vcache.v b/v/bsg_manycore_tile_vcache.v new file mode 100644 index 000000000..4e783d789 --- /dev/null +++ b/v/bsg_manycore_tile_vcache.v @@ -0,0 +1,326 @@ +/** + * bsg_manycore_tile_vcache.v + * + * A vcache tile that contains bsg_cache, a vertical mesh router, a horizontal wormhole router, + * a bsg_manycore_link_to_cache adapter, and a bsg_cache DMA to wormhole adapter. + * This tile can connect to the top and bottom side of the compute tile array. + * The vcache DMA interface is connected to the horizontal 1D wormhole ruche network. 
+ */ + +module bsg_manycore_tile_vcache + import bsg_noc_pkg::*; + import bsg_cache_pkg::*; + import bsg_manycore_pkg::*; + #(parameter addr_width_p="inv" + , parameter data_width_p="inv" + , parameter x_cord_width_p="inv" + , parameter y_cord_width_p="inv" + + , parameter num_tiles_y_p="inv" + + , parameter vcache_addr_width_p="inv" + , parameter vcache_data_width_p="inv" + , parameter vcache_ways_p="inv" + , parameter vcache_sets_p="inv" + , parameter vcache_block_size_in_words_p="inv" + , parameter vcache_dma_data_width_p="inv" + + // wh_ruche_factor_p supported only for 2^n, n>0. + , parameter wh_ruche_factor_p="inv" + , parameter wh_cid_width_p="inv" + , parameter wh_flit_width_p="inv" + , parameter wh_len_width_p="inv" + , parameter wh_cord_width_p="inv" + , parameter int wh_cord_markers_pos_lp[1:0] = '{wh_cord_width_p, 0} + + , parameter req_fifo_els_p=4 + + , parameter lg_wh_ruche_factor_lp = `BSG_SAFE_CLOG2(wh_ruche_factor_p) + + , parameter y_subcord_width_lp = `BSG_SAFE_CLOG2(num_tiles_y_p) + + , parameter manycore_link_sif_width_lp = + `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + + , parameter wh_link_sif_width_lp = + `bsg_ready_and_link_sif_width(wh_flit_width_p) + + , parameter vcache_amo_support_p = (1 << e_cache_amo_swap) + | (1 << e_cache_amo_or) + | (1 << e_cache_amo_add) + ) + ( + input clk_i + , input reset_i + , output logic reset_o + + , input [wh_ruche_factor_p-1:0][E:W][wh_link_sif_width_lp-1:0] wh_link_sif_i + , output [wh_ruche_factor_p-1:0][E:W][wh_link_sif_width_lp-1:0] wh_link_sif_o + + , input [S:N][manycore_link_sif_width_lp-1:0] ver_link_sif_i + , output [S:N][manycore_link_sif_width_lp-1:0] ver_link_sif_o + + // manycore cord + , input [x_cord_width_p-1:0] global_x_i + , input [y_cord_width_p-1:0] global_y_i + + , output logic [x_cord_width_p-1:0] global_x_o + , output logic [y_cord_width_p-1:0] global_y_o + ); + + + 
`declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + `declare_bsg_ready_and_link_sif_s(wh_flit_width_p, wh_link_sif_s); + `declare_bsg_cache_pkt_s(vcache_addr_width_p,vcache_data_width_p); + `declare_bsg_cache_dma_pkt_s(vcache_addr_width_p); + + + // reset dff + logic reset_r; + bsg_dff #( + .width_p(1) + ) reset_dff ( + .clk_i(clk_i) + ,.data_i(reset_i) + ,.data_o(reset_r) + ); + + assign reset_o = reset_r; + + + // feedthrough coordinate bits + logic [x_cord_width_p-1:0] global_x_r; + logic [y_cord_width_p-1:0] global_y_r; + + bsg_dff #( + .width_p(x_cord_width_p) + ) x_dff ( + .clk_i(clk_i) + ,.data_i(global_x_i) + ,.data_o(global_x_r) + ); + + bsg_dff #( + .width_p(y_cord_width_p) + ) y_dff ( + .clk_i(clk_i) + ,.data_i(global_y_i) + ,.data_o(global_y_r) + ); + + assign global_x_o = global_x_r; + assign global_y_o = y_cord_width_p'(global_y_r+1); + + + + // mesh router + // vcache connects to P + bsg_manycore_link_sif_s [S:W] link_sif_li; + bsg_manycore_link_sif_s [S:W] link_sif_lo; + bsg_manycore_link_sif_s proc_link_sif_li; + bsg_manycore_link_sif_s proc_link_sif_lo; + + bsg_manycore_mesh_node #( + .x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.data_width_p(data_width_p) + ,.addr_width_p(addr_width_p) + // Because vcaches do not initiate packets, and there are no clients on the same Row, + // horizontal manycore links are unnecessary. 
+ ,.stub_p(4'b0011) // stub E and W + ) rtr ( + .clk_i(clk_i) + ,.reset_i(reset_r) + + ,.links_sif_i(link_sif_li) + ,.links_sif_o(link_sif_lo) + + ,.proc_link_sif_i(proc_link_sif_li) + ,.proc_link_sif_o(proc_link_sif_lo) + + ,.global_x_i(global_x_r) + ,.global_y_i(global_y_r) + ); + + assign ver_link_sif_o[S] = link_sif_lo[S]; + assign link_sif_li[S] = ver_link_sif_i[S]; + assign ver_link_sif_o[N] = link_sif_lo[N]; + assign link_sif_li[N] = ver_link_sif_i[N]; + + assign link_sif_li[E] = '0; + assign link_sif_li[W] = '0; + + // link_to_cache + bsg_cache_pkt_s cache_pkt; + logic cache_v_li; + logic cache_ready_lo; + logic [vcache_data_width_p-1:0] cache_data_lo; + logic cache_v_lo; + logic cache_yumi_li; + logic v_we_lo; + logic wh_dest_east_not_west_lo; + + bsg_manycore_link_to_cache #( + .link_addr_width_p(addr_width_p) // word addr + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + + ,.sets_p(vcache_sets_p) + ,.ways_p(vcache_ways_p) + ,.block_size_in_words_p(vcache_block_size_in_words_p) + + ,.fifo_els_p(req_fifo_els_p) + ) link_to_cache ( + .clk_i(clk_i) + ,.reset_i(reset_r) + + ,.link_sif_i(proc_link_sif_lo) + ,.link_sif_o(proc_link_sif_li) + + ,.cache_pkt_o(cache_pkt) + ,.v_o(cache_v_li) + ,.ready_i(cache_ready_lo) + + ,.data_i(cache_data_lo) + ,.v_i(cache_v_lo) + ,.yumi_o(cache_yumi_li) + + ,.v_we_i(v_we_lo) + + ,.wh_dest_east_not_west_o(wh_dest_east_not_west_lo) + ); + + + // vcache + bsg_cache_dma_pkt_s dma_pkt_lo; + logic dma_pkt_v_lo; + logic dma_pkt_yumi_li; + + logic [vcache_dma_data_width_p-1:0] dma_data_li; + logic dma_data_v_li; + logic dma_data_ready_lo; + + logic [vcache_dma_data_width_p-1:0] dma_data_lo; + logic dma_data_v_lo; + logic dma_data_yumi_li; + + + bsg_cache #( + .addr_width_p(vcache_addr_width_p) + ,.data_width_p(vcache_data_width_p) + ,.block_size_in_words_p(vcache_block_size_in_words_p) + ,.sets_p(vcache_sets_p) + ,.ways_p(vcache_ways_p) + ,.dma_data_width_p(vcache_dma_data_width_p) + 
,.amo_support_p(vcache_amo_support_p) + ) cache ( + .clk_i(clk_i) + ,.reset_i(reset_r) + + // to manycore + ,.cache_pkt_i(cache_pkt) + ,.v_i(cache_v_li) + ,.ready_o(cache_ready_lo) + + ,.data_o(cache_data_lo) + ,.v_o(cache_v_lo) + ,.yumi_i(cache_yumi_li) + + ,.v_we_o(v_we_lo) + + // to wormhole + ,.dma_pkt_o(dma_pkt_lo) + ,.dma_pkt_v_o(dma_pkt_v_lo) + ,.dma_pkt_yumi_i(dma_pkt_yumi_li) + + ,.dma_data_i(dma_data_li) + ,.dma_data_v_i(dma_data_v_li) + ,.dma_data_ready_o(dma_data_ready_lo) + + ,.dma_data_o(dma_data_lo) + ,.dma_data_v_o(dma_data_v_lo) + ,.dma_data_yumi_i(dma_data_yumi_li) + ); + + + // cache DMA to wormhole + wh_link_sif_s cache_wh_link_li; + wh_link_sif_s cache_wh_link_lo; + + bsg_cache_dma_to_wormhole #( + .vcache_addr_width_p(vcache_addr_width_p) + ,.vcache_data_width_p(vcache_data_width_p) + ,.vcache_dma_data_width_p(vcache_dma_data_width_p) + ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) + + ,.wh_flit_width_p(wh_flit_width_p) + ,.wh_cid_width_p(wh_cid_width_p) + ,.wh_len_width_p(wh_len_width_p) + ,.wh_cord_width_p(wh_cord_width_p) + ) dma_to_wh ( + .clk_i(clk_i) + ,.reset_i(reset_r) + + ,.dma_pkt_i(dma_pkt_lo) + ,.dma_pkt_v_i(dma_pkt_v_lo) + ,.dma_pkt_yumi_o(dma_pkt_yumi_li) + + ,.dma_data_o(dma_data_li) + ,.dma_data_v_o(dma_data_v_li) + ,.dma_data_ready_i(dma_data_ready_lo) + + ,.dma_data_i(dma_data_lo) + ,.dma_data_v_i(dma_data_v_lo) + ,.dma_data_yumi_o(dma_data_yumi_li) + + ,.wh_link_sif_i(cache_wh_link_li) + ,.wh_link_sif_o(cache_wh_link_lo) + + ,.my_wh_cord_i(global_x_r) + ,.dest_wh_cord_i({wh_cord_width_p{wh_dest_east_not_west_lo}}) + // concentrator id + // lower bits come from lower bits of global_x + // upper bits come from whether its north or south vc. 
+ ,.my_wh_cid_i({~global_y_r[y_subcord_width_lp-1], global_x_r[0+:lg_wh_ruche_factor_lp]}) + ); + + + // wormhole router + // vcache DMA connects to P + wh_link_sif_s [E:P] wh_link_li; + wh_link_sif_s [E:P] wh_link_lo; + + bsg_wormhole_router #( + .flit_width_p(wh_flit_width_p) + ,.dims_p(1) + ,.cord_markers_pos_p(wh_cord_markers_pos_lp) + ,.len_width_p(wh_len_width_p) + ) wh_rtr ( + .clk_i(clk_i) + ,.reset_i(reset_r) + + ,.link_i(wh_link_li) + ,.link_o(wh_link_lo) + + ,.my_cord_i(global_x_r) + ); + + assign wh_link_li[P] = cache_wh_link_lo; + assign cache_wh_link_li = wh_link_lo[P]; + + + // connect wh ruche links + assign wh_link_sif_o[0][E] = wh_link_lo[E]; + assign wh_link_li[E] = wh_link_sif_i[0][E]; + assign wh_link_sif_o[0][W] = wh_link_lo[W]; + assign wh_link_li[W] = wh_link_sif_i[0][W]; + + // feedthrough ruche links + for (genvar i = 1; i < wh_ruche_factor_p; i++) begin + assign wh_link_sif_o[i][E] = wh_link_sif_i[i][W]; + assign wh_link_sif_o[i][W] = wh_link_sif_i[i][E]; + end + + +endmodule diff --git a/v/bsg_manycore_tile_vcache_array.v b/v/bsg_manycore_tile_vcache_array.v new file mode 100644 index 000000000..dbacd6071 --- /dev/null +++ b/v/bsg_manycore_tile_vcache_array.v @@ -0,0 +1,222 @@ +/** + * bsg_manycore_tile_vcache_array.v + * + * This module instantiates vcaches and associated ruche buffers. 
+ */ + + + +module bsg_manycore_tile_vcache_array + import bsg_noc_pkg::*; + import bsg_manycore_pkg::*; + #(parameter addr_width_p="inv" + , parameter data_width_p="inv" + , parameter x_cord_width_p="inv" + , parameter y_cord_width_p="inv" + , parameter pod_x_cord_width_p="inv" + , parameter pod_y_cord_width_p="inv" + + // Number of tiles in a pod + , parameter num_tiles_x_p="inv" + , parameter num_tiles_y_p="inv" + + , parameter x_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter y_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_y_p) + + // Number of tiles in a subarray + , parameter subarray_num_tiles_x_p="inv" + + , parameter num_vcache_rows_p = "inv" + , parameter vcache_addr_width_p ="inv" + , parameter vcache_data_width_p ="inv" + , parameter vcache_ways_p="inv" + , parameter vcache_sets_p="inv" + , parameter vcache_block_size_in_words_p="inv" + , parameter vcache_dma_data_width_p="inv" + + , parameter wh_ruche_factor_p="inv" + , parameter wh_cid_width_p="inv" + , parameter wh_flit_width_p="inv" + , parameter wh_len_width_p="inv" + , parameter wh_cord_width_p="inv" + + , parameter num_clk_ports_p=1 + //, parameter reset_depth_p = 3 + + , parameter manycore_link_sif_width_lp = + `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p) + + , parameter wh_link_sif_width_lp = + `bsg_ready_and_link_sif_width(wh_flit_width_p) + ) + ( + input [num_clk_ports_p-1:0] clk_i + , input [subarray_num_tiles_x_p-1:0] reset_i + , output logic [subarray_num_tiles_x_p-1:0] reset_o + + , input [E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][wh_link_sif_width_lp-1:0] wh_link_sif_i + , output [E:W][num_vcache_rows_p-1:0][wh_ruche_factor_p-1:0][wh_link_sif_width_lp-1:0] wh_link_sif_o + + , input [S:N][subarray_num_tiles_x_p-1:0][manycore_link_sif_width_lp-1:0] ver_link_sif_i + , output [S:N][subarray_num_tiles_x_p-1:0][manycore_link_sif_width_lp-1:0] ver_link_sif_o + + // coord id + , input [subarray_num_tiles_x_p-1:0][x_cord_width_p-1:0] 
global_x_i + , input [subarray_num_tiles_x_p-1:0][y_cord_width_p-1:0] global_y_i + , output [subarray_num_tiles_x_p-1:0][x_cord_width_p-1:0] global_x_o + , output [subarray_num_tiles_x_p-1:0][y_cord_width_p-1:0] global_y_o + ); + + + `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + `declare_bsg_ready_and_link_sif_s(wh_flit_width_p, wh_link_sif_s); + + logic [num_vcache_rows_p-1:0][subarray_num_tiles_x_p-1:0] reset_li, reset_lo; + wh_link_sif_s [num_vcache_rows_p-1:0][subarray_num_tiles_x_p-1:0][wh_ruche_factor_p-1:0][E:W] wh_link_sif_li, wh_link_sif_lo; + bsg_manycore_link_sif_s [num_vcache_rows_p-1:0][subarray_num_tiles_x_p-1:0][S:N] ver_link_sif_li, ver_link_sif_lo; + logic [num_vcache_rows_p-1:0][subarray_num_tiles_x_p-1:0][x_cord_width_p-1:0] global_x_li, global_x_lo; + logic [num_vcache_rows_p-1:0][subarray_num_tiles_x_p-1:0][y_cord_width_p-1:0] global_y_li, global_y_lo; + + // instantiate vcaches. + for (genvar y = 0; y < num_vcache_rows_p; y++) begin: vc_y + for (genvar x = 0; x < subarray_num_tiles_x_p; x++) begin: vc_x + bsg_manycore_tile_vcache #( + .addr_width_p(addr_width_p) + ,.data_width_p(data_width_p) + ,.x_cord_width_p(x_cord_width_p) + ,.y_cord_width_p(y_cord_width_p) + ,.num_tiles_y_p(num_tiles_y_p) + + ,.vcache_addr_width_p(vcache_addr_width_p) + ,.vcache_data_width_p(vcache_data_width_p) + ,.vcache_ways_p(vcache_ways_p) + ,.vcache_sets_p(vcache_sets_p) + ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) + ,.vcache_dma_data_width_p(vcache_dma_data_width_p) + + ,.wh_ruche_factor_p(wh_ruche_factor_p) + ,.wh_cid_width_p(wh_cid_width_p) + ,.wh_flit_width_p(wh_flit_width_p) + ,.wh_len_width_p(wh_len_width_p) + ,.wh_cord_width_p(wh_cord_width_p) + ) vc ( + .clk_i(clk_i[x/(subarray_num_tiles_x_p/num_clk_ports_p)]) + ,.reset_i(reset_li[y][x]) + ,.reset_o(reset_lo[y][x]) + + ,.wh_link_sif_i(wh_link_sif_li[y][x]) + ,.wh_link_sif_o(wh_link_sif_lo[y][x]) + + ,.ver_link_sif_i(ver_link_sif_li[y][x]) + 
,.ver_link_sif_o(ver_link_sif_lo[y][x]) + + ,.global_x_i(global_x_li[y][x]) + ,.global_y_i(global_y_li[y][x]) + + ,.global_x_o(global_x_lo[y][x]) + ,.global_y_o(global_y_lo[y][x]) + + ); + + // connect north + if (y == 0) begin + assign reset_li[y][x] = reset_i[x]; + assign global_x_li[y][x] = global_x_i[x]; + assign global_y_li[y][x] = global_y_i[x]; + + assign ver_link_sif_o[N][x] = ver_link_sif_lo[y][x][N]; + assign ver_link_sif_li[y][x][N] = ver_link_sif_i[N][x]; + end + + // connect between rows + if (y < num_vcache_rows_p-1) begin + assign reset_li[y+1][x] = reset_lo[y][x]; + assign global_x_li[y+1][x] = global_x_lo[y][x]; + assign global_y_li[y+1][x] = global_y_lo[y][x]; + + assign ver_link_sif_li[y+1][x][N] = ver_link_sif_lo[y][x][S]; + assign ver_link_sif_li[y][x][S] = ver_link_sif_lo[y+1][x][N]; + end + + // connect south + if (y == num_vcache_rows_p-1) begin + assign reset_o[x] = reset_lo[y][x]; + assign global_x_o[x] = global_x_lo[y][x]; + assign global_y_o[x] = global_y_lo[y][x]; + + assign ver_link_sif_o[S][x] = ver_link_sif_lo[y][x][S]; + assign ver_link_sif_li[y][x][S] = ver_link_sif_i[S][x]; + end + + end + end + + + + // connect wh ruche link + for (genvar r = 0; r < num_vcache_rows_p; r++) begin: rr + for (genvar c = 0; c < subarray_num_tiles_x_p; c++) begin: rc + for (genvar l = 0; l < wh_ruche_factor_p; l++) begin: rl // ruche stage + if (c == subarray_num_tiles_x_p-1) begin: cl + + bsg_ruche_buffer #( + .width_p(wh_link_sif_width_lp) + ,.ruche_factor_p(wh_ruche_factor_p) + ,.ruche_stage_p(l) + ,.harden_p(1) + ) rb_w ( + .i(wh_link_sif_i[E][r][l]) + ,.o(wh_link_sif_li[r][c][(l+wh_ruche_factor_p-1) % wh_ruche_factor_p][E]) + ); + + bsg_ruche_buffer #( + .width_p(wh_link_sif_width_lp) + ,.ruche_factor_p(wh_ruche_factor_p) + ,.ruche_stage_p(l) + ,.harden_p(1) + ) rb_e ( + .i(wh_link_sif_lo[r][c][l][E]) + ,.o(wh_link_sif_o[E][r][(l+1) % wh_ruche_factor_p]) + ); + + end + else begin: cn + + bsg_ruche_buffer #( + .width_p(wh_link_sif_width_lp) + 
,.ruche_factor_p(wh_ruche_factor_p) + ,.ruche_stage_p(l) + ,.harden_p(1) + ) rb_w ( + .i(wh_link_sif_lo[r][c+1][l][W]) + ,.o(wh_link_sif_li[r][c][(l+wh_ruche_factor_p-1) % wh_ruche_factor_p][E]) + ); + + bsg_ruche_buffer #( + .width_p(wh_link_sif_width_lp) + ,.ruche_factor_p(wh_ruche_factor_p) + ,.ruche_stage_p(l) + ,.harden_p(1) + ) rb_e ( + .i(wh_link_sif_lo[r][c][l][E]) + ,.o(wh_link_sif_li[r][c+1][(l+1) % wh_ruche_factor_p][W]) + ); + + end + end + end + end + + + // connect edge ruche links + for (genvar r = 0; r < num_vcache_rows_p; r++) begin + for (genvar l = 0; l < wh_ruche_factor_p; l++) begin + // west + assign wh_link_sif_o[W][r][l] = wh_link_sif_lo[r][0][l][W]; + assign wh_link_sif_li[r][0][l][W] = wh_link_sif_i[W][r][l]; + end + end + + + +endmodule diff --git a/v/bsg_manycore_top_ruche.v b/v/bsg_manycore_top_ruche.v deleted file mode 100644 index d5f0ab4e0..000000000 --- a/v/bsg_manycore_top_ruche.v +++ /dev/null @@ -1,298 +0,0 @@ -/** - * bsg_manycore_top_ruche.v - * - * bsg_manycore toplevel - * - * Tiles in this array have half-ruche x network. - */ - - -module bsg_manycore_top_ruche - import bsg_manycore_pkg::*; - import bsg_noc_pkg::*; // {P=0, W,E,N,S } - #(parameter dmem_size_p = "inv" // number of words in DMEM - , parameter icache_entries_p = "inv" // in words - , parameter icache_tag_width_p = "inv" - - , parameter vcache_size_p = "inv" // capacity per vcache in words - , parameter vcache_block_size_in_words_p ="inv" - , parameter vcache_sets_p = "inv" - - // change the default values from "inv" back to -1 - // since num_tiles_x_p and num_tiles_y_p will be used to define the size of 2D array - // hetero_type_vec_p, they should be int by default to avoid tool crash during - // synthesis (DC versions at least up to 2018.06) - , parameter int num_tiles_x_p = -1 - , parameter int num_tiles_y_p = -1 - - // This is used to define heterogeneous arrays. Each index defines - // the type of an X/Y coordinate in the array. 
This is a vector of - // num_tiles_x_p*num_tiles_y_p ints; type "0" is the - // default. See bsg_manycore_hetero_socket.v for more types. - , parameter int hetero_type_vec_p [0:((num_tiles_y_p-1)*num_tiles_x_p) - 1] = '{default:0} - - // this is the addr width on the manycore network packet (word addr). - // also known as endpoint physical address (EPA). - , parameter addr_width_p = "inv" - , parameter data_width_p = "inv" // 32 - - , parameter ruche_factor_X_p=3 - - // Enable branch/jalr trace - , parameter branch_trace_en_p = 0 - - // y = 0 top vcache - // y = 1 IO routers - // y = num_tiles_y_p+1 bottom vcache - , parameter y_cord_width_lp = `BSG_SAFE_CLOG2(num_tiles_y_p+2) - , parameter x_cord_width_lp = `BSG_SAFE_CLOG2(num_tiles_x_p) - - , parameter link_sif_width_lp = - `bsg_manycore_link_sif_width(addr_width_p,data_width_p,x_cord_width_lp,y_cord_width_lp) - - , parameter ruche_x_link_sif_width_lp = - `bsg_manycore_ruche_x_link_sif_width(addr_width_p,data_width_p,x_cord_width_lp,y_cord_width_lp) - - // The number of registers between the reset_i port and the reset sinks - // Must be >= 1 - , parameter reset_depth_p = 3 - - // enable debugging - , parameter debug_p = 0 - ) - ( - input clk_i - , input reset_i - - // horizontal -- {E,W} - , input [E:W][num_tiles_y_p-1:0][link_sif_width_lp-1:0] hor_link_sif_i - , output [E:W][num_tiles_y_p-1:0][link_sif_width_lp-1:0] hor_link_sif_o - - // vertical -- {S,N} - , input [S:N][num_tiles_x_p-1:0][link_sif_width_lp-1:0] ver_link_sif_i - , output [S:N][num_tiles_x_p-1:0][link_sif_width_lp-1:0] ver_link_sif_o - - // IO-row p-ports - , input [num_tiles_x_p-1:0][link_sif_width_lp-1:0] io_link_sif_i - , output [num_tiles_x_p-1:0][link_sif_width_lp-1:0] io_link_sif_o - - - // ruche link - , input [E:W][num_tiles_y_p-2:0][ruche_factor_X_p-1:0][ruche_x_link_sif_width_lp-1:0] ruche_link_i - , output [E:W][num_tiles_y_p-2:0][ruche_factor_X_p-1:0][ruche_x_link_sif_width_lp-1:0] ruche_link_o - - ); - - // synopsys translate_off - 
initial begin - int i,j; - assert ((num_tiles_x_p > 0) && (num_tiles_y_p > 0)) - else $error("num_tiles_x_p and num_tiles_y_p must be positive constants"); - $display("## ----------------------------------------------------------------"); - $display("## MANYCORE HETERO TYPE CONFIGURATIONS"); - $display("## ----------------------------------------------------------------"); - for (i=0; i < num_tiles_y_p-1; i++) begin - $write("## "); - for(j=0; j< num_tiles_x_p; j++) begin - $write("%0d,", hetero_type_vec_p[i * num_tiles_x_p + j]); - end - $write("\n"); - end - $display("## ----------------------------------------------------------------"); - end - // synopsys translate_on - - - - // Pipeline the reset. The bsg_manycore_tile has a single pipeline register - // on reset already, so we only want to pipeline reset_depth_p-1 times. - logic [num_tiles_y_p-2:0][num_tiles_x_p-1:0] tile_reset_r; - logic [num_tiles_x_p-1:0] io_reset_r; - - bsg_dff_chain #( - .width_p(num_tiles_x_p*(num_tiles_y_p-1)) - ,.num_stages_p(reset_depth_p-1) - ) tile_reset ( - .clk_i(clk_i) - ,.data_i({(num_tiles_x_p*(num_tiles_y_p-1)){reset_i}}) - ,.data_o(tile_reset_r) - ); - - bsg_dff_chain #( - .width_p(num_tiles_x_p) - ,.num_stages_p(reset_depth_p) - ) io_reset ( - .clk_i(clk_i) - ,.data_i({num_tiles_x_p{reset_i}}) - ,.data_o(io_reset_r) - ); - - - // Instantiate tiles. 
- `declare_bsg_manycore_link_sif_s(addr_width_p,data_width_p,x_cord_width_lp,y_cord_width_lp); - bsg_manycore_link_sif_s [num_tiles_y_p-1:0][num_tiles_x_p-1:0][S:W] link_in; - bsg_manycore_link_sif_s [num_tiles_y_p-1:0][num_tiles_x_p-1:0][S:W] link_out; - - `declare_bsg_manycore_ruche_x_link_sif_s(addr_width_p,data_width_p,x_cord_width_lp,y_cord_width_lp); - bsg_manycore_ruche_x_link_sif_s [num_tiles_y_p-2:0][num_tiles_x_p-1:0][ruche_factor_X_p-1:0][E:W] ruche_link_in; - bsg_manycore_ruche_x_link_sif_s [num_tiles_y_p-2:0][num_tiles_x_p-1:0][ruche_factor_X_p-1:0][E:W] ruche_link_out; - - - for (genvar r = 2; r <= num_tiles_y_p; r++) begin: y - for (genvar c = 0; c < num_tiles_x_p; c++) begin: x - bsg_manycore_tile_ruche #( - .dmem_size_p (dmem_size_p) - ,.vcache_size_p (vcache_size_p) - ,.icache_entries_p(icache_entries_p) - ,.icache_tag_width_p(icache_tag_width_p) - ,.x_cord_width_p(x_cord_width_lp) - ,.y_cord_width_p(y_cord_width_lp) - ,.data_width_p(data_width_p) - ,.addr_width_p(addr_width_p) - ,.hetero_type_p( hetero_type_vec_p[(r-2) * num_tiles_x_p + c] ) - ,.debug_p(debug_p) - ,.branch_trace_en_p(branch_trace_en_p) - ,.num_tiles_x_p(num_tiles_x_p) - ,.num_tiles_y_p(num_tiles_y_p) - ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) - ,.vcache_sets_p(vcache_sets_p) - ,.ruche_factor_X_p(ruche_factor_X_p) - ) tile ( - .clk_i(clk_i) - ,.reset_i(tile_reset_r[r-2][c]) - - ,.link_i(link_in[r-1][c]) - ,.link_o(link_out[r-1][c]) - - ,.ruche_link_i(ruche_link_in[r-2][c]) - ,.ruche_link_o(ruche_link_out[r-2][c]) - - ,.my_x_i(x_cord_width_lp'(c)) - ,.my_y_i(y_cord_width_lp'(r)) - ); - end - end - - - // Instantiate IO routers. 
- for (genvar c = 0; c < num_tiles_x_p; c++) begin: io - bsg_manycore_mesh_node #( - .x_cord_width_p (x_cord_width_lp ) - ,.y_cord_width_p (y_cord_width_lp ) - ,.data_width_p (data_width_p ) - ,.addr_width_p (addr_width_p ) - ) io_router ( - .clk_i (clk_i ) - ,.reset_i (io_reset_r[c]) - - ,.links_sif_i ( link_in [0][ c ] ) - ,.links_sif_o ( link_out[0][ c ] ) - - ,.proc_link_sif_i ( io_link_sif_i [ c ]) - ,.proc_link_sif_o ( io_link_sif_o [ c ]) - - // tile coordinates - ,.my_x_i ( x_cord_width_lp'(c)) - ,.my_y_i ( y_cord_width_lp'(1)) - ); - end - - - - // stitch together all of the tiles into a mesh - bsg_mesh_stitch #( - .width_p(link_sif_width_lp) - ,.x_max_p(num_tiles_x_p) - ,.y_max_p(num_tiles_y_p) - ) link ( - .outs_i(link_out) - ,.ins_o(link_in) - ,.hor_i(hor_link_sif_i) - ,.hor_o(hor_link_sif_o) - ,.ver_i(ver_link_sif_i) - ,.ver_o(ver_link_sif_o) - ); - - - - - // stitch ruche links - for (genvar r = 0; r < num_tiles_y_p-1; r++) begin: rr - for (genvar c = 0; c < num_tiles_x_p-1; c++) begin: rc - for (genvar l = 0; l < ruche_factor_X_p; l++) begin: rl // ruche stage - if (l == 0) begin: l0 - if (ruche_factor_X_p % 2 == 0) begin: l0e - // if ruche factor is even, buffer with inverter - bsg_inv #( - .width_p(ruche_x_link_sif_width_lp) - ,.harden_p(1) - ) inv_e ( - .i(ruche_link_out[r][c+1][l][W]) - ,.o(ruche_link_in[r][c][l+1][E]) - ); - - bsg_inv #( - .width_p(ruche_x_link_sif_width_lp) - ,.harden_p(1) - ) inv_w ( - .i(ruche_link_out[r][c][l][E]) - ,.o(ruche_link_in[r][c+1][l+1][W]) - ); - - end - else begin: l0o - // if ruche factor is odd, buffer with buffer - bsg_buf #( - .width_p(ruche_x_link_sif_width_lp) - ,.harden_p(1) - ) buf_e ( - .i(ruche_link_out[r][c+1][l][W]) - ,.o(ruche_link_in[r][c][l+1][E]) - ); - - bsg_buf #( - .width_p(ruche_x_link_sif_width_lp) - ,.harden_p(1) - ) buf_w ( - .i(ruche_link_out[r][c][l][E]) - ,.o(ruche_link_in[r][c+1][l+1][W]) - ); - end - end - else begin: lnz - // inverters - bsg_inv #( - 
.width_p(ruche_x_link_sif_width_lp) - ,.harden_p(1) - ) inv_e ( - .i(ruche_link_out[r][c+1][l][W]) - ,.o(ruche_link_in[r][c][(l+1) % ruche_factor_X_p][E]) - ); - - bsg_inv #( - .width_p(ruche_x_link_sif_width_lp) - ,.harden_p(1) - ) inv_w ( - .i(ruche_link_out[r][c][l][E]) - ,.o(ruche_link_in[r][c+1][(l+1) % ruche_factor_X_p][W]) - ); - end - end - end - end - - - // edge ruche links - for (genvar r = 0; r < num_tiles_y_p-1; r++) begin: er - for (genvar l = 0; l < ruche_factor_X_p; l++) begin: el - // west - assign ruche_link_o[W][r][l] = ruche_link_out[r][0][l][W]; - assign ruche_link_in[r][0][l][W] = ruche_link_i[W][r][l]; - // east - assign ruche_link_o[E][r][l] = ruche_link_out[r][num_tiles_x_p-1][l][E]; - assign ruche_link_in[r][num_tiles_x_p-1][l][E] = ruche_link_i[E][r][l]; - end - end - - -endmodule diff --git a/v/bsg_ruche_anti_buffer.v b/v/bsg_ruche_anti_buffer.v new file mode 100644 index 000000000..d7ea091dc --- /dev/null +++ b/v/bsg_ruche_anti_buffer.v @@ -0,0 +1,69 @@ +/** + * bsg_ruche_anti_buffer.v + * + * used at the end of the ruche link, to bring all the buses to the positive polarity before making connection to other modules + * such as wormhole concentrators. + */ + + +module bsg_ruche_anti_buffer + #(parameter width_p="inv" + + , parameter ruche_factor_p ="inv" + , parameter ruche_stage_p ="inv" + , parameter bit west_not_east_p="inv" + , parameter bit input_not_output_p="inv" + + , parameter bit ruche_factor_even_lp = (ruche_factor_p % 2 == 0) + , parameter bit ruche_stage_even_lp = (ruche_stage_p % 2 == 0) + + , parameter bit invert_input_lp = (ruche_stage_p > 0) + & (ruche_factor_even_lp + ? ~ruche_stage_even_lp + : (west_not_east_p + ? ruche_stage_even_lp + : ~ruche_stage_even_lp)) + , parameter bit invert_output_lp = (ruche_stage_p > 0) + & (ruche_factor_even_lp + ? ~ruche_stage_even_lp + : (west_not_east_p + ? ~ruche_stage_even_lp + : ruche_stage_even_lp)) + + , parameter bit invert_lp = input_not_output_p + ? 
invert_input_lp + : invert_output_lp + + , parameter harden_p=1 + ) + ( + input [width_p-1:0] i + , output [width_p-1:0] o + ); + + + if (invert_lp) begin: inv + + bsg_inv #( + .width_p(width_p) + ,.harden_p(harden_p) + ) inv0 ( + .i(i) + ,.o(o) + ); + + end + else begin: bf + + bsg_buf #( + .width_p(width_p) + ,.harden_p(harden_p) + ) buf0 ( + .i(i) + ,.o(o) + ); + + end + + +endmodule diff --git a/v/bsg_ruche_buffer.v b/v/bsg_ruche_buffer.v new file mode 100644 index 000000000..49f678636 --- /dev/null +++ b/v/bsg_ruche_buffer.v @@ -0,0 +1,49 @@ +/** + * bsg_ruche_buffer.v + * + */ + + +module bsg_ruche_buffer + #(parameter width_p="inv" + , parameter ruche_factor_p="inv" + , parameter ruche_stage_p="inv" + + , parameter bit invert_lp = (ruche_stage_p == 0) + ? (ruche_factor_p % 2 == 0) + : 1'b1 + + , parameter harden_p=1 + ) + ( + input [width_p-1:0] i + , output [width_p-1:0] o + ); + + + if (invert_lp) begin: inv + + bsg_inv #( + .width_p(width_p) + ,.harden_p(harden_p) + ) inv0 ( + .i(i) + ,.o(o) + ); + + end + else begin: bf + + bsg_buf #( + .width_p(width_p) + ,.harden_p(harden_p) + ) buf0 ( + .i(i) + ,.o(o) + ); + + end + + + +endmodule diff --git a/v/bsg_ruche_link_sif_tieoff.v b/v/bsg_ruche_link_sif_tieoff.v new file mode 100644 index 000000000..d09478d38 --- /dev/null +++ b/v/bsg_ruche_link_sif_tieoff.v @@ -0,0 +1,69 @@ +/** + * bsg_ruche_link_sif_tieoff.v + * + * used for tieing off ruche links (wh) on the sides. + */ + + +`include "bsg_noc_links.vh" + +module bsg_ruche_link_sif_tieoff + #(parameter link_data_width_p="inv" + , parameter ruche_factor_p="inv" + , parameter ruche_stage_p="inv" + , parameter bit west_not_east_p="inv" // tie-off on west or east side?? + + + , parameter bit ruche_factor_even_lp = (ruche_factor_p % 2 == 0) + , parameter bit ruche_stage_even_lp = (ruche_stage_p % 2 == 0) + + , parameter bit invert_output_lp = (ruche_stage_p > 0) + & (ruche_factor_even_lp + ? ~ruche_stage_even_lp + : (west_not_east_p + ? 
ruche_stage_even_lp + : ~ruche_stage_even_lp)) + , parameter bit invert_input_lp = (ruche_stage_p > 0) + & (ruche_factor_even_lp + ? ~ruche_stage_even_lp + : (west_not_east_p + ? ~ruche_stage_even_lp + : ruche_stage_even_lp)) + + + , parameter link_width_lp=`bsg_ready_and_link_sif_width(link_data_width_p) + ) + ( + // debug only + input clk_i + , input reset_i + + , input [link_width_lp-1:0] ruche_link_i + , output [link_width_lp-1:0] ruche_link_o + ); + + + `declare_bsg_ready_and_link_sif_s(link_data_width_p, ruche_link_sif_s); + ruche_link_sif_s ruche_link_in; + assign ruche_link_in = ruche_link_i; + assign ruche_link_o = invert_output_lp ? '1 : '0; + + + // synopsys translate_off + // For debugging only + always_ff @ (negedge clk_i) begin + if (~reset_i) begin + + if (invert_input_lp ^ ruche_link_in.v) + $error("[BSG_ERROR] Errant packet detected at the tied off ruche link."); + + end + end + // synopsys translate_on + + + + + + +endmodule diff --git a/v/vanilla_bean/alu.v b/v/vanilla_bean/alu.v index b248987ba..0b037dacf 100644 --- a/v/vanilla_bean/alu.v +++ b/v/vanilla_bean/alu.v @@ -53,7 +53,7 @@ always_comb `RV32_AUIPC: result_o = `RV32_signext_Uimm(op_i) + pc_plus4_i - 3'b100; - `RV32_ADDI, `RV32_ADD: + `RV32_ADDI, `RV32_ADD, `RV32_FLWADD: begin result_o = sum[31:0]; sub_not_add = 1'b0; diff --git a/v/vanilla_bean/bsg_manycore_proc_vanilla.v b/v/vanilla_bean/bsg_manycore_proc_vanilla.v index eae17173a..7e65fe1bf 100644 --- a/v/vanilla_bean/bsg_manycore_proc_vanilla.v +++ b/v/vanilla_bean/bsg_manycore_proc_vanilla.v @@ -9,6 +9,8 @@ module bsg_manycore_proc_vanilla import bsg_vanilla_pkg::*; #(parameter x_cord_width_p = "inv" , parameter y_cord_width_p = "inv" + , parameter pod_x_cord_width_p = "inv" + , parameter pod_y_cord_width_p = "inv" , parameter data_width_p = "inv" , parameter addr_width_p = "inv" @@ -16,6 +18,7 @@ module bsg_manycore_proc_vanilla , parameter icache_entries_p = "inv" , parameter dmem_size_p = "inv" + , parameter num_vcache_rows_p = 
"inv" , parameter vcache_size_p = "inv" , parameter vcache_block_size_in_words_p="inv" , parameter vcache_sets_p = "inv" @@ -23,15 +26,15 @@ module bsg_manycore_proc_vanilla , parameter num_tiles_x_p="inv" , parameter num_tiles_y_p="inv" + , parameter x_subcord_width_lp = `BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter y_subcord_width_lp = `BSG_SAFE_CLOG2(num_tiles_y_p) + , parameter fwd_fifo_els_p="inv" // for FIFO credit counting. , parameter max_out_credits_p = 32 , parameter proc_fifo_els_p = 4 , parameter debug_p = 1 - - , parameter branch_trace_en_p = 0 - , parameter credit_counter_width_lp=$clog2(max_out_credits_p+1) , parameter icache_addr_width_lp = `BSG_SAFE_CLOG2(icache_entries_p) , parameter dmem_addr_width_lp = `BSG_SAFE_CLOG2(dmem_size_p) @@ -50,8 +53,13 @@ module bsg_manycore_proc_vanilla , input [link_sif_width_lp-1:0] link_sif_i , output logic [link_sif_width_lp-1:0] link_sif_o - , input [x_cord_width_p-1:0] my_x_i - , input [y_cord_width_p-1:0] my_y_i + // subcord within a pod + , input [x_subcord_width_lp-1:0] my_x_i + , input [y_subcord_width_lp-1:0] my_y_i + + // pod coordinate + , input [pod_x_cord_width_p-1:0] pod_x_i + , input [pod_y_cord_width_p-1:0] pod_y_i ); // endpoint standard @@ -82,7 +90,9 @@ module bsg_manycore_proc_vanilla logic returned_fifo_full_lo; logic [credit_counter_width_lp-1:0] out_credits_lo; - + logic [x_cord_width_p-1:0] src_x_cord_debug_lo; + logic [y_cord_width_p-1:0] src_y_cord_debug_lo; + bsg_manycore_endpoint_standard #( .x_cord_width_p(x_cord_width_p) ,.y_cord_width_p(y_cord_width_p) @@ -109,8 +119,8 @@ module bsg_manycore_proc_vanilla ,.in_mask_o(in_mask_lo) ,.in_yumi_i(in_yumi_li) ,.in_load_info_o(in_load_info_lo) - ,.in_src_x_cord_o() - ,.in_src_y_cord_o() + ,.in_src_x_cord_o(src_x_cord_debug_lo) + ,.in_src_y_cord_o(src_y_cord_debug_lo) ,.returning_v_i(returning_data_v_li) ,.returning_data_i(returning_data_li) @@ -127,10 +137,13 @@ module bsg_manycore_proc_vanilla ,.returned_fifo_full_o(returned_fifo_full_lo) 
,.returned_yumi_i(returned_yumi_li) + ,.returned_credit_v_r_o() + ,.returned_credit_reg_id_r_o() + ,.out_credits_o(out_credits_lo) - ,.my_x_i(my_x_i) - ,.my_y_i(my_y_i) + ,.global_x_i({pod_x_i, my_x_i}) + ,.global_y_i({pod_y_i, my_y_i}) ); @@ -150,11 +163,15 @@ module bsg_manycore_proc_vanilla logic icache_yumi_li; logic freeze; - logic [x_cord_width_p-1:0] tgo_x; - logic [y_cord_width_p-1:0] tgo_y; + logic [x_subcord_width_lp-1:0] tgo_x; + logic [y_subcord_width_lp-1:0] tgo_y; logic [pc_width_lp-1:0] pc_init_val; logic dram_enable; + logic remote_interrupt_set_lo; + logic remote_interrupt_clear_lo; + logic remote_interrupt_pending_bit_li; + network_rx #( .addr_width_p(addr_width_p) ,.data_width_p(data_width_p) @@ -163,6 +180,8 @@ module bsg_manycore_proc_vanilla ,.icache_entries_p(icache_entries_p) ,.x_cord_width_p(x_cord_width_p) ,.y_cord_width_p(y_cord_width_p) + ,.x_subcord_width_p(x_subcord_width_lp) + ,.y_subcord_width_p(y_subcord_width_lp) ) rx ( .clk_i(clk_i) ,.reset_i(reset_i) @@ -174,7 +193,9 @@ module bsg_manycore_proc_vanilla ,.mask_i(in_mask_lo) ,.load_info_i(in_load_info_lo) ,.yumi_o(in_yumi_li) - + ,.src_x_cord_debug_i(src_x_cord_debug_lo) + ,.src_y_cord_debug_i(src_y_cord_debug_lo) + ,.returning_data_o(returning_data_li) ,.returning_data_v_o(returning_data_v_li) @@ -197,8 +218,12 @@ module bsg_manycore_proc_vanilla ,.pc_init_val_o(pc_init_val) ,.dram_enable_o(dram_enable) - ,.my_x_i(my_x_i) - ,.my_y_i(my_y_i) + ,.remote_interrupt_set_o(remote_interrupt_set_lo) + ,.remote_interrupt_clear_o(remote_interrupt_clear_lo) + ,.remote_interrupt_pending_bit_i(remote_interrupt_pending_bit_li) + + ,.global_x_i({pod_x_i, my_x_i}) + ,.global_y_i({pod_y_i, my_y_i}) ); @@ -230,6 +255,9 @@ module bsg_manycore_proc_vanilla ,.addr_width_p(addr_width_p) ,.x_cord_width_p(x_cord_width_p) ,.y_cord_width_p(y_cord_width_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) + ,.num_vcache_rows_p(num_vcache_rows_p) 
,.vcache_size_p(vcache_size_p) ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) ,.vcache_sets_p(vcache_sets_p) @@ -260,6 +288,9 @@ module bsg_manycore_proc_vanilla ,.tgo_y_i(tgo_y) ,.dram_enable_i(dram_enable) + ,.pod_x_i(pod_x_i) + ,.pod_y_i(pod_y_i) + ,.my_x_i(my_x_i) ,.my_y_i(my_y_i) @@ -282,7 +313,6 @@ module bsg_manycore_proc_vanilla ,.int_remote_load_resp_force_o(int_remote_load_resp_force_lo) ,.int_remote_load_resp_yumi_i(int_remote_load_resp_yumi_li) - ,.invalid_eva_access_o(invalid_eva_access_lo) ); @@ -295,7 +325,6 @@ module bsg_manycore_proc_vanilla ,.icache_tag_width_p(icache_tag_width_p) ,.x_cord_width_p(x_cord_width_p) ,.y_cord_width_p(y_cord_width_p) - ,.branch_trace_en_p(branch_trace_en_p) ,.max_out_credits_p(max_out_credits_p) ,.fwd_fifo_els_p(fwd_fifo_els_p) ) vcore ( @@ -338,9 +367,13 @@ module bsg_manycore_proc_vanilla ,.out_credits_i(out_credits_lo) ,.invalid_eva_access_i(invalid_eva_access_lo) + + ,.remote_interrupt_set_i(remote_interrupt_set_lo) + ,.remote_interrupt_clear_i(remote_interrupt_clear_lo) + ,.remote_interrupt_pending_bit_o(remote_interrupt_pending_bit_li) - ,.my_x_i(my_x_i) - ,.my_y_i(my_y_i) + ,.global_x_i({pod_x_i, my_x_i}) + ,.global_y_i({pod_y_i, my_y_i}) ); endmodule diff --git a/v/vanilla_bean/bsg_vanilla_pkg.v b/v/vanilla_bean/bsg_vanilla_pkg.v index abb1d1fe6..7f3f83605 100644 --- a/v/vanilla_bean/bsg_vanilla_pkg.v +++ b/v/vanilla_bean/bsg_vanilla_pkg.v @@ -29,6 +29,10 @@ localparam fpu_recoded_exp_width_gp = 8; localparam fpu_recoded_sig_width_gp = 24; localparam fpu_recoded_data_width_gp = (1+fpu_recoded_exp_width_gp+fpu_recoded_sig_width_gp); +// Maximum EPA width for vanilla core (word addr) +localparam epa_word_addr_width_gp=16; + + // RV32 Instruction structure // Ideally represents a R-type instruction; these fields if // present in other types of instructions, appear at same positions @@ -55,11 +59,17 @@ typedef struct packed { // remote request from vanilla core // +typedef enum logic [1:0] { + 
e_vanilla_amoswap + , e_vanilla_amoor + , e_vanilla_amoadd +} bsg_vanilla_amo_type_e; + typedef struct packed { logic write_not_read; logic is_amo_op; - bsg_manycore_amo_type_e amo_type; + bsg_vanilla_amo_type_e amo_type; logic [3:0] mask; bsg_manycore_load_info_s load_info; logic [bsg_manycore_reg_id_width_gp-1:0] reg_id; @@ -92,16 +102,17 @@ typedef enum logic [1:0] { typedef struct packed { // int regfile - logic read_rs1; - logic read_rs2; - logic write_rd; + logic read_rs1; // Op reads integer rs1 + logic read_rs2; // Op reads integer rs2 + logic write_rd; // Op writes to integer rd // Load & Store - logic is_load_op; // Op loads data from memory - logic is_store_op; // Op stores data to memory + logic is_load_op; // Op is lw or flw + logic is_store_op; // Op is sw or fsw logic is_byte_op; // Op is byte load/store logic is_hex_op; // Op is hex load/store logic is_load_unsigned; // Op is unsigned load + logic is_flwadd_op; // flwadd // Branch & Jump logic is_branch_op; @@ -124,7 +135,7 @@ typedef struct packed { logic is_amo_op; logic is_amo_aq; logic is_amo_rl; - bsg_manycore_amo_type_e amo_type; + bsg_vanilla_amo_type_e amo_type; // FPU logic is_fp_op; // goes into FP_EXE @@ -136,6 +147,9 @@ typedef struct packed { // CSR logic is_csr_op; + // MRET + logic is_mret_op; + // This signal is for debugging only. // It shouldn't be used to synthesize any actual circuits. 
logic unsupported; @@ -223,6 +237,7 @@ typedef struct packed decode_s decode; // Decode signals fp_decode_s fp_decode; logic icache_miss; + logic valid; // valid instruction in ID } id_signals_s; // Execute stage signals @@ -234,10 +249,11 @@ typedef struct packed decode_s decode; // Decode signals logic [RV32_reg_data_width_gp-1:0] rs1_val; // RF output data from RS1 address logic [RV32_reg_data_width_gp-1:0] rs2_val; // RF output data from RS2 address + // CSR instructions use this register for loading CSR vals logic [RV32_reg_data_width_gp-1:0] mem_addr_op2; // the second operands to compute // memory address logic icache_miss; - fcsr_s fcsr_data; + logic valid; // valid instruction in EXE } exe_signals_s; @@ -245,6 +261,8 @@ typedef struct packed typedef struct packed { logic [RV32_reg_addr_width_gp-1:0] rd_addr; + logic [RV32_reg_addr_width_gp-1:0] frd_addr; + logic [RV32_reg_data_width_gp-1:0] exe_result; logic write_rd; logic write_frd; @@ -252,6 +270,7 @@ typedef struct packed logic is_hex_op; logic is_load_unsigned; logic local_load; + logic local_flwadd; logic [RV32_reg_data_width_gp-1:0] mem_addr_sent; logic icache_miss; } mem_signals_s; @@ -290,6 +309,26 @@ typedef struct packed +// MACHINE CSR structs, constants +// mstatus +typedef struct packed { + logic mpie; // machine previous interrupt enabler (using bit-7) + logic mie; // machine interrupt enable (using bit-3) +} csr_mstatus_s; +`define RV32_MSTATUS_MIE_BIT_IDX 3 +`define RV32_MSTATUS_MPIE_BIT_IDX 7 + +// machine interrupt pending/enable vector +typedef struct packed { + logic trace; // bit-17 + logic remote; // bit-16 +} csr_interrupt_vector_s; + + +`define REMOTE_INTERRUPT_JUMP_ADDR 0 // remote interrupt jump addr (word addr) +`define TRACE_INTERRUPT_JUMP_ADDR 1 // trace interrupt jump addr (word addr) +// remote + // // // RISCV // @@ -390,6 +429,10 @@ typedef struct packed `define RV32_FENCE_FUN3 3'b000 `define RV32_FENCE 
{4'b0000,4'b????,4'b????,5'b00000,`RV32_FENCE_FUN3,5'b00000,`RV32_MISC_MEM} +//TRIGGER SAIF DUMP defines +`define SAIF_TRIGGER_START {12'b000000000001,5'b00000,3'b000,5'b00000,`RV32_OP_IMM} +`define SAIF_TRIGGER_END {12'b000000000010,5'b00000,3'b000,5'b00000,`RV32_OP_IMM} + // CSR encoding `define RV32_CSRRW_FUN3 3'b001 `define RV32_CSRRS_FUN3 3'b010 @@ -405,9 +448,20 @@ typedef struct packed `define RV32_CSRRSI `RV32_Itype(`RV32_SYSTEM, `RV32_CSRRSI_FUN3) `define RV32_CSRRCI `RV32_Itype(`RV32_SYSTEM, `RV32_CSRRCI_FUN3) +// fcsr CSR addr `define RV32_CSR_FFLAGS_ADDR 12'h001 `define RV32_CSR_FRM_ADDR 12'h002 `define RV32_CSR_FCSR_ADDR 12'h003 +// machine CSR addr +`define RV32_CSR_MSTATUS_ADDR 12'h300 +`define RV32_CSR_MTVEC_ADDR 12'h305 +`define RV32_CSR_MIE_ADDR 12'h304 +`define RV32_CSR_MIP_ADDR 12'h344 +`define RV32_CSR_MEPC_ADDR 12'h341 + +// mret +// used for returning from the interrupt +`define RV32_MRET {7'b0011000, 5'b00010, 5'b00000, 3'b000, 5'b00000, `RV32_SYSTEM} // RV32M Instruction Encodings `define MD_MUL_FUN3 3'b000 @@ -432,6 +486,7 @@ typedef struct packed `define RV32_LR_W_AQ {5'b00010,2'b10,5'b00000,5'b?????,3'b010,5'b?????,`RV32_AMO_OP} `define RV32_AMOSWAP_W {5'b00001,2'b??,5'b?????,5'b?????,3'b010,5'b?????,`RV32_AMO_OP} `define RV32_AMOOR_W {5'b01000,2'b??,5'b?????,5'b?????,3'b010,5'b?????,`RV32_AMO_OP} +`define RV32_AMOADD_W {5'b00000,2'b??,5'b?????,5'b?????,3'b010,5'b?????,`RV32_AMO_OP} // RV32F Instruction Encodings `define RV32_OP_FP 7'b1010011 @@ -488,4 +543,25 @@ typedef struct packed `define RV32_FSQRT_S {7'b0101100, 5'b00000, 5'b?????, 3'b???, 5'b?????, 7'b1010011} +// // +// NON-STANDARD RISC-V Instructions // +// // + + +// [FLWADD] +// +// Assembly format +// flwadd fd, rs2, 0(rs1) +// +// Semantic: +// fd = *rs1; rs1 = rs1 + rs2; +// +// Machine Format: +// rs1 rs2 rd opcode +// 0000000_?????_?????_111_?????_0000100 +`define RV32_FLWADD_OP 7'b0000100 +`define RV32_FLWADD {7'b0000000, 5'b?????, 5'b?????, 3'b111, 5'b?????, 
`RV32_FLWADD_OP} + + + endpackage diff --git a/v/vanilla_bean/cl_decode.v b/v/vanilla_bean/cl_decode.v index 0b2b13c09..caf1b2838 100644 --- a/v/vanilla_bean/cl_decode.v +++ b/v/vanilla_bean/cl_decode.v @@ -22,36 +22,36 @@ import bsg_manycore_pkg::*; ); -// Op Writes RF -- register file write operation +// Op Writes Integer RF -- register file write operation always_comb begin - if (instruction_i.rd == 0) begin - decode_o.write_rd = 1'b0; // reg 0 is always 0 - end - else begin - unique casez (instruction_i.op) - `RV32_LUI_OP, `RV32_AUIPC_OP, - `RV32_JAL_OP, `RV32_JALR_OP, - `RV32_LOAD, `RV32_OP, - `RV32_OP_IMM, `RV32_AMO_OP: begin - decode_o.write_rd = 1'b1; - end - `RV32_OP_FP: begin - decode_o.write_rd = - (instruction_i.funct7 == `RV32_FCMP_S_FUN7) // FEQ, FLT, FLE - | ((instruction_i.funct7 == `RV32_FCLASS_S_FUN7) & (instruction_i.rs2 == 5'b00000)) // FCLASS, FMV.X.W - | ((instruction_i.funct7 == `RV32_FCVT_S_F2I_FUN7)); // FCVT.W.S, FCVT.WU.S - end - `RV32_SYSTEM: begin - decode_o.write_rd = 1'b1; // CSRRW, CSRRS - end - default: begin - decode_o.write_rd = 1'b0; - end - endcase - end + + unique casez (instruction_i.op) + `RV32_LUI_OP, `RV32_AUIPC_OP, + `RV32_JAL_OP, `RV32_JALR_OP, + `RV32_LOAD, `RV32_OP, + `RV32_OP_IMM, `RV32_AMO_OP: begin + decode_o.write_rd = (instruction_i.rd != '0); + end + `RV32_OP_FP: begin + decode_o.write_rd = (instruction_i.rd != '0) & + ((instruction_i.funct7 == `RV32_FCMP_S_FUN7) // FEQ, FLT, FLE + | ((instruction_i.funct7 == `RV32_FCLASS_S_FUN7) & (instruction_i.rs2 == 5'b00000)) // FCLASS, FMV.X.W + | (instruction_i.funct7 == `RV32_FCVT_S_F2I_FUN7)); // FCVT.W.S, FCVT.WU.S + end + `RV32_FLWADD_OP: begin + decode_o.write_rd = (instruction_i.funct7 == 7'b0000000) & (instruction_i.funct3 == 3'b111) & (instruction_i.rs1 != '0); + end + `RV32_SYSTEM: begin + decode_o.write_rd = (instruction_i.rd != '0); // CSRRW, CSRRS + end + default: begin + decode_o.write_rd = 1'b0; + end + endcase + end -// declares if OP reads from first port of 
register file +// declares if OP reads from first port of integer register file always_comb begin unique casez (instruction_i.op) `RV32_JALR_OP, `RV32_BRANCH, @@ -65,7 +65,7 @@ always_comb begin (instruction_i.funct7 == `RV32_FCVT_S_I2F_FUN7) // FCVT.S.W, FCVT.S.WU | (instruction_i.funct7 == `RV32_FMV_W_X_FUN7); // FMV.W.X end - `RV32_LOAD_FP, `RV32_STORE_FP: begin // FLW, FSW + `RV32_LOAD_FP, `RV32_STORE_FP, `RV32_FLWADD_OP: begin // FLW, FSW, FLWADD decode_o.read_rs1 = 1'b1; end `RV32_SYSTEM: begin @@ -84,16 +84,20 @@ always_comb begin endcase end -// declares if Op reads from second port of register file +// declares if Op reads from second port of integer register file always_comb begin unique casez (instruction_i.op) `RV32_BRANCH, `RV32_STORE, `RV32_OP: begin decode_o.read_rs2 = 1'b1; end + `RV32_FLWADD_OP: begin + decode_o.read_rs2 = 1'b1; + end `RV32_AMO_OP: begin // According the ISA, LR instruction don't read rs2 decode_o.read_rs2 = (instruction_i.funct7 ==? 7'b00001??) // amoswap - | (instruction_i.funct7 ==? 7'b01000??); // amoor + | (instruction_i.funct7 ==? 7'b01000??) // amoor + | (instruction_i.funct7 ==? 7'b00000??); // amoadd end default: begin decode_o.read_rs2 = 1'b0; @@ -102,6 +106,7 @@ always_comb begin end // Load & Store +assign decode_o.is_flwadd_op = (instruction_i ==? 
`RV32_FLWADD); assign decode_o.is_load_op = (instruction_i.op == `RV32_LOAD) | (instruction_i.op == `RV32_LOAD_FP); assign decode_o.is_store_op = (instruction_i.op == `RV32_STORE) | (instruction_i.op == `RV32_STORE_FP); @@ -144,6 +149,10 @@ always_comb begin end end +// MRET +assign decode_o.is_mret_op = (instruction_i == `RV32_MRET); + + //+---------------------------------------------- //| //| RISC-V edit: "M" STANDARD EXTENSION @@ -193,15 +202,19 @@ always_comb begin unique casez (instruction_i) `RV32_AMOSWAP_W: begin decode_o.is_amo_op = 1'b1; - decode_o.amo_type = e_amo_swap; + decode_o.amo_type = e_vanilla_amoswap; end `RV32_AMOOR_W: begin decode_o.is_amo_op = 1'b1; - decode_o.amo_type = e_amo_or; + decode_o.amo_type = e_vanilla_amoor; + end + `RV32_AMOADD_W: begin + decode_o.is_amo_op = 1'b1; + decode_o.amo_type = e_vanilla_amoadd; end default: begin decode_o.is_amo_op = 1'b0; - decode_o.amo_type = e_amo_swap; + decode_o.amo_type = e_vanilla_amoswap; end endcase end @@ -216,6 +229,8 @@ assign decode_o.is_amo_rl = instruction_i[25]; //| //+---------------------------------------------- + +// is_fp_op means that this instruction goes to FP_EXE stage. always_comb begin decode_o.read_frs1 = 1'b0; decode_o.read_frs2 = 1'b0; @@ -313,6 +328,14 @@ always_comb begin decode_o.write_frd = 1'b1; decode_o.is_fp_op = 1'b1; end + // FLWADD + `RV32_FLWADD: begin + decode_o.read_frs1 = 1'b0; + decode_o.read_frs2 = 1'b0; + decode_o.read_frs3 = 1'b0; + decode_o.write_frd = 1'b1; + decode_o.is_fp_op = 1'b0; + end default: begin decode_o.read_frs1 = 1'b0; decode_o.read_frs2 = 1'b0; diff --git a/v/vanilla_bean/fcsr.v b/v/vanilla_bean/fcsr.v index 1ed292704..1c4da0358 100644 --- a/v/vanilla_bean/fcsr.v +++ b/v/vanilla_bean/fcsr.v @@ -20,8 +20,10 @@ module fcsr , input [reg_addr_width_lp-1:0] rs1_i , input fcsr_s data_i , input [11:0] addr_i + , output fcsr_s data_o // data that goes to rd. - + , output logic data_v_o // 1, if addr_i matches fcsr addr. 
+ // exception accrue interface , input [1:0] fflags_v_i , input [1:0][fflags_width_lp-1:0] fflags_i @@ -152,13 +154,25 @@ module fcsr end end - + // output always_comb begin case (addr_i) - `RV32_CSR_FFLAGS_ADDR: data_o = {3'b0, fflags_r}; - `RV32_CSR_FRM_ADDR: data_o = {5'b0, frm_r}; - `RV32_CSR_FCSR_ADDR: data_o = {frm_r, fflags_r}; - default: data_o = '0; + `RV32_CSR_FFLAGS_ADDR: begin + data_o = {3'b0, fflags_r}; + data_v_o = 1'b1; + end + `RV32_CSR_FRM_ADDR: begin + data_o = {5'b0, frm_r}; + data_v_o = 1'b1; + end + `RV32_CSR_FCSR_ADDR: begin + data_o = {frm_r, fflags_r}; + data_v_o = 1'b1; + end + default: begin + data_o = '0; + data_v_o = 1'b0; + end endcase end @@ -168,7 +182,7 @@ module fcsr // synopsys translate_off always_ff @ (negedge clk_i) begin if (~reset_i) begin - if (v_i) begin + if (v_i & data_v_o) begin assert(~(|fflags_v_i)) else $error("Exception cannot be accrued while being written by fcsr op."); end end diff --git a/v/vanilla_bean/fpu_float_fma.v b/v/vanilla_bean/fpu_float_fma.v index faffa5b2e..aa498bbac 100644 --- a/v/vanilla_bean/fpu_float_fma.v +++ b/v/vanilla_bean/fpu_float_fma.v @@ -63,7 +63,17 @@ module fpu_float_fma fma_op_li = ePM_PB; is_fma_op = 1'b0; - if (fp_v_i) begin + // FPU gets imul inputs only when there is imul in EXE. + // so that it does not cause spurious toggles in FPU by normal integer ops. 
+ + // assumption: imul_v_i is coming straight of a register and does not glitch + if (imul_v_i) begin + fma_a_li = {1'b0, imul_rs1_i}; + fma_b_li = {1'b0, imul_rs2_i}; + fma_c_li = 33'h0; + fma_op_li = eIMUL; + end + else begin case (fpu_float_op_i) eFADD: begin fma_a_li = fp_rs1_i; @@ -121,12 +131,6 @@ module fpu_float_fma end endcase end - else begin - fma_a_li = {1'b0, imul_rs1_i}; - fma_b_li = {1'b0, imul_rs2_i}; - fma_c_li = 33'h0; - fma_op_li = eIMUL; - end end diff --git a/v/vanilla_bean/hash_function.v b/v/vanilla_bean/hash_function.v deleted file mode 100644 index 22f7d5a91..000000000 --- a/v/vanilla_bean/hash_function.v +++ /dev/null @@ -1,51 +0,0 @@ -/** - * hash_function.v - * - */ - -module hash_function - #(parameter banks_p="inv" - , parameter width_p="inv" - , parameter vcache_sets_p="inv" - - , parameter lg_banks_lp=`BSG_SAFE_CLOG2(banks_p) - , parameter index_width_lp=$clog2((2**width_p+banks_p-1)/banks_p) - , parameter lg_vcache_sets_lp=`BSG_SAFE_CLOG2(vcache_sets_p) - ) - ( - input [width_p-1:0] i - , output logic [lg_banks_lp-1:0] bank_o - , output logic [index_width_lp-1:0] index_o - ); - - - if (banks_p == 9) begin: b9 - - always_comb begin - // we want to pick i[lg_vcache_sets_lp+3] to XOR with i[3], - // since this is the first non-index bit used by vcache. 
- if (i[2:0] == {i[5:4], i[3] ^ i[lg_vcache_sets_lp+3]}) begin - bank_o = 'd8; - end - else begin - bank_o = {1'b0, i[2:0]}; - end - - index_o = i[width_p-1:3]; - end - - end - else if (`BSG_IS_POW2(banks_p)) begin: p2 - - assign bank_o = i[0+:lg_banks_lp]; - assign index_o = i[lg_banks_lp+:index_width_lp]; - end - else begin: unhandled - // synopsys translate_off - initial assert("banks_p" == "unhandled") else $error("unhandled case for %m"); - // synopsys translate_on - end - - - -endmodule diff --git a/v/vanilla_bean/lsu.v b/v/vanilla_bean/lsu.v index 390f6f780..53ed615bb 100644 --- a/v/vanilla_bean/lsu.v +++ b/v/vanilla_bean/lsu.v @@ -20,9 +20,6 @@ module lsu , parameter pc_width_p="inv" , parameter dmem_size_p="inv" - // Enables branch & jalr target-addr stream on stderr - , parameter branch_trace_en_p=0 - , localparam dmem_addr_width_lp=`BSG_SAFE_CLOG2(dmem_size_p) , localparam data_mask_width_lp=(data_width_p>>3) , localparam reg_addr_width_lp=RV32_reg_addr_width_gp @@ -39,9 +36,6 @@ module lsu , input [data_width_p-1:0] mem_offset_i , input [data_width_p-1:0] pc_plus4_i , input icache_miss_i - , input [pc_width_p-1:0] pc_target_i - - // from FP_EXE // to network TX , output remote_req_s remote_req_o @@ -59,16 +53,6 @@ module lsu ); - // Branch/jalr trace control signal - // - logic stream_target_pc; - - // Does a store of target pc on every branch/jalr instruction to stderr epa - if (branch_trace_en_p == 1) begin - assign stream_target_pc = (exe_decode_i.is_branch_op | exe_decode_i.is_jalr_op); - end else begin - assign stream_target_pc = 1'b0; // tied lo by default - end logic [data_width_p-1:0] mem_addr; @@ -83,11 +67,7 @@ module lsu logic [data_mask_width_lp-1:0] store_mask; always_comb begin - if (stream_target_pc) begin - store_data = data_width_p'(pc_target_i << 2); - store_mask = 4'b1111; - end - else if (exe_decode_i.is_byte_op) begin + if (exe_decode_i.is_byte_op) begin store_data = {4{exe_rs2_i[7:0]}}; store_mask = { mem_addr[1] & mem_addr[0], @@ 
-113,11 +93,11 @@ module lsu // to local DMEM // - wire is_local_dmem_addr = (mem_addr ==? 32'b00000000_00000000_0001????_????????); + wire is_local_dmem_addr = (mem_addr ==? 32'b00000000_00000000_0000????_????????); assign dmem_v_o = is_local_dmem_addr & (exe_decode_i.is_load_op | exe_decode_i.is_store_op | - exe_decode_i.is_lr_op | exe_decode_i.is_lr_aq_op); + exe_decode_i.is_lr_op | exe_decode_i.is_lr_aq_op | exe_decode_i.is_flwadd_op); assign dmem_w_o = exe_decode_i.is_store_op; assign dmem_addr_o = mem_addr[2+:dmem_addr_width_lp]; assign dmem_data_o = store_data; @@ -158,23 +138,21 @@ module lsu end remote_req_o = '{ - write_not_read : (exe_decode_i.is_store_op | stream_target_pc), + write_not_read : (exe_decode_i.is_store_op), is_amo_op : exe_decode_i.is_amo_op, amo_type : exe_decode_i.amo_type, mask: store_mask, load_info : load_info, reg_id : exe_rd_i, data : store_data, - addr : (stream_target_pc - ? bsg_branch_trace_npa_gp - : (icache_miss_i ? miss_addr : mem_addr)) + addr : (icache_miss_i ? miss_addr : mem_addr) }; end - assign remote_req_v_o = icache_miss_i | stream_target_pc | - ((exe_decode_i.is_load_op | exe_decode_i.is_store_op | exe_decode_i.is_amo_op) & ~is_local_dmem_addr); + assign remote_req_v_o = icache_miss_i | + ((exe_decode_i.is_load_op | exe_decode_i.is_store_op | exe_decode_i.is_amo_op | exe_decode_i.is_flwadd_op) & ~is_local_dmem_addr); // reserve // only valid on local DMEM diff --git a/v/vanilla_bean/mcsr.v b/v/vanilla_bean/mcsr.v new file mode 100644 index 000000000..40e7cee55 --- /dev/null +++ b/v/vanilla_bean/mcsr.v @@ -0,0 +1,285 @@ +/** + * mcsr.v + * + * machine CSR + */ + +// this contains the following. 
+// - mstatus (MIE, MPIE) (read-write) +// - mie and mip (read-write) +// - mepc (read-write) + + +module mcsr + import bsg_vanilla_pkg::*; + #(parameter reg_addr_width_lp = RV32_reg_addr_width_gp + , parameter reg_data_width_lp = RV32_reg_data_width_gp + , parameter pc_width_p="inv" + ) + ( + input clk_i + , input reset_i + + // remote interrupt set/clear (from network_rx) + , input remote_interrupt_set_i + , input remote_interrupt_clear_i + + + // csr instruction writes to this when moving from ID to EXE. + , input we_i + , input [11:0] addr_i + , input [2:0] funct3_i + , input [reg_data_width_lp-1:0] data_i // rs1 data + , input [reg_addr_width_lp-1:0] rs1_i // for immediate val write + , output logic [reg_data_width_lp-1:0] data_o + + // from between ID and EXE + , input instr_executed_i + + // from EXE + , input interrupt_entered_i + , input mret_called_i + , input [pc_width_p-1:0] npc_r_i + + // output + , output csr_mstatus_s mstatus_r_o + , output csr_interrupt_vector_s mip_r_o + , output csr_interrupt_vector_s mie_r_o + , output logic [pc_width_p-1:0] mepc_r_o + ); + + csr_mstatus_s mstatus_n, mstatus_r; + csr_interrupt_vector_s mie_n, mie_r; + csr_interrupt_vector_s mip_n, mip_r; + logic [pc_width_p-1:0] mepc_r, mepc_n; + + assign mstatus_r_o = mstatus_r; + assign mip_r_o = mip_r; + assign mie_r_o = mie_r; + assign mepc_r_o = mepc_r; + + + // mstatus + // priority (high to low) + // 1) mret + // 2) interrupt taken + // 3) csr instr + // *1,2 are mutually exclusive events. 
+ always_comb begin + mstatus_n = mstatus_r; + + if (mret_called_i) begin + mstatus_n.mie = mstatus_r.mpie; + mstatus_n.mpie = 1'b0; + end + else if (interrupt_entered_i) begin + mstatus_n.mie = 1'b0; + mstatus_n.mpie = mstatus_r.mie; + end + else if (we_i & (addr_i == `RV32_CSR_MSTATUS_ADDR)) begin + case (funct3_i) + `RV32_CSRRW_FUN3: begin + mstatus_n.mpie = data_i[`RV32_MSTATUS_MPIE_BIT_IDX]; + mstatus_n.mie = data_i[`RV32_MSTATUS_MIE_BIT_IDX]; + end + `RV32_CSRRS_FUN3: begin + mstatus_n.mpie = data_i[`RV32_MSTATUS_MPIE_BIT_IDX] + ? 1'b1 + : mstatus_r.mpie; + mstatus_n.mie = data_i[`RV32_MSTATUS_MIE_BIT_IDX] + ? 1'b1 + : mstatus_r.mie; + end + `RV32_CSRRC_FUN3: begin + mstatus_n.mpie = data_i[`RV32_MSTATUS_MPIE_BIT_IDX] + ? 1'b0 + : mstatus_r.mpie; + mstatus_n.mie = data_i[`RV32_MSTATUS_MIE_BIT_IDX] + ? 1'b0 + : mstatus_r.mie; + end + `RV32_CSRRWI_FUN3: begin + mstatus_n.mie = rs1_i[`RV32_MSTATUS_MIE_BIT_IDX]; + end + `RV32_CSRRSI_FUN3: begin + mstatus_n.mie = rs1_i[`RV32_MSTATUS_MIE_BIT_IDX] + ? 1'b1 + : mstatus_r.mie; + end + `RV32_CSRRCI_FUN3: begin + mstatus_n.mie = rs1_i[`RV32_MSTATUS_MIE_BIT_IDX] + ? 1'b0 + : mstatus_r.mie; + end + default: begin + mstatus_n = mstatus_r; + end + endcase + end + end + + + // mie + // this can be only modified by csr instr. + always_comb begin + mie_n = mie_r; + if (we_i & (addr_i == `RV32_CSR_MIE_ADDR)) begin + case (funct3_i) + `RV32_CSRRW_FUN3: begin + mie_n = data_i[17:16]; + end + `RV32_CSRRS_FUN3: begin + mie_n.trace = data_i[17] + ? 1'b1 + : mie_r.trace; + mie_n.remote = data_i[16] + ? 1'b1 + : mie_r.remote; + end + `RV32_CSRRC_FUN3: begin + mie_n.trace = data_i[17] + ? 1'b0 + : mie_r.trace; + mie_n.remote = data_i[16] + ? 1'b0 + : mie_r.remote; + end + default: mie_n = mie_r; + endcase + end + end + + + // mip + // mip.trace is set when an instruction is executed (ID->EXE), while outside interrupt. + // mip.remote can be set/clear by remote packet. + // Both can be modified by CSR instr, which has lower priority. 
+ always_comb begin + mip_n = mip_r; + + // trace + // if trace enable bit is low, then execute instruction signal does not set the trace pending bit. + if (instr_executed_i & mie_r.trace) begin + mip_n.trace = 1'b1; + end + else if (we_i & (addr_i == `RV32_CSR_MIP_ADDR)) begin + case (funct3_i) + `RV32_CSRRW_FUN3: begin + mip_n.trace = data_i[17]; + end + `RV32_CSRRS_FUN3: begin + mip_n.trace = data_i[17] + ? 1'b1 + : mip_r.trace; + end + `RV32_CSRRC_FUN3: begin + mip_n.trace = data_i[17] + ? 1'b0 + : mip_r.trace; + end + default: mip_n = mip_r; + endcase + end + + // remote + if (remote_interrupt_set_i) begin + mip_n.remote = 1'b1; + end + else if (remote_interrupt_clear_i) begin + mip_n.remote = 1'b0; + end + else if (we_i & (addr_i == `RV32_CSR_MIP_ADDR)) begin + case (funct3_i) + `RV32_CSRRW_FUN3: begin + mip_n.remote = data_i[16]; + end + `RV32_CSRRS_FUN3: begin + mip_n.remote = data_i[16] + ? 1'b1 + : mip_r.remote; + end + `RV32_CSRRC_FUN3: begin + mip_n.remote = data_i[16] + ? 1'b0 + : mip_r.remote; + end + default: mip_n = mip_r; + endcase + end + end + + + // mepc + // mepc is set when the interrupt is taken. + // when the interrupt is taken, ID stage will be flushed, so CSR instr in ID will not get a chance to modify. + always_comb begin + mepc_n = mepc_r; + + if (interrupt_entered_i) begin + mepc_n = npc_r_i; + end + else if (we_i & (addr_i == `RV32_CSR_MEPC_ADDR)) begin + case (funct3_i) + `RV32_CSRRW_FUN3: begin + mepc_n = data_i[2+:pc_width_p]; + end + `RV32_CSRRS_FUN3: begin + for (integer i = 0; i < pc_width_p; i++) begin + mepc_n[i] = data_i[2+i] + ? 1'b1 + : mepc_r[i]; + end + end + `RV32_CSRRC_FUN3: begin + for (integer i = 0; i < pc_width_p; i++) begin + mepc_n[i] = data_i[2+i] + ? 
1'b0 + : mepc_r[i]; + end + end + default: mepc_n = mepc_r; + endcase + end + + end + + // reading CSR values + always_comb begin + data_o = '0; + case (addr_i) + `RV32_CSR_MSTATUS_ADDR: begin + data_o[`RV32_MSTATUS_MPIE_BIT_IDX] = mstatus_r.mpie; + data_o[`RV32_MSTATUS_MIE_BIT_IDX] = mstatus_r.mie; + end + `RV32_CSR_MIE_ADDR: begin + data_o[17:16] = mie_r; + end + `RV32_CSR_MIP_ADDR: begin + data_o[17:16] = mip_r; + end + `RV32_CSR_MEPC_ADDR: begin + data_o[2+:pc_width_p] = mepc_r; + end + default: data_o = '0; + endcase + end + + + // sequential logic + always_ff @ (posedge clk_i) begin + if (reset_i) begin + mstatus_r <= '0; + mie_r <= '0; + mip_r <= '0; + mepc_r <= '0; + end + else begin + mstatus_r <= mstatus_n; + mie_r <= mie_n; + mip_r <= mip_n; + mepc_r <= mepc_n; + end + end + + +endmodule diff --git a/v/vanilla_bean/network_rx.v b/v/vanilla_bean/network_rx.v index 120d34c55..a6caf584f 100644 --- a/v/vanilla_bean/network_rx.v +++ b/v/vanilla_bean/network_rx.v @@ -16,12 +16,14 @@ module network_rx , parameter x_cord_width_p="inv" , parameter y_cord_width_p="inv" + , parameter x_subcord_width_p="inv" + , parameter y_subcord_width_p="inv" + , parameter tgo_x_init_val_p = 0 - , parameter tgo_y_init_val_p = 1 + , parameter tgo_y_init_val_p = 0 , parameter freeze_init_val_p = 1 , parameter default_pc_init_val_p = 0 - , localparam epa_word_addr_width_lp=epa_word_addr_width_gp , localparam data_mask_width_lp=(data_width_p>>3) , localparam dmem_addr_width_lp=`BSG_SAFE_CLOG2(dmem_size_p) , localparam icache_addr_width_lp=`BSG_SAFE_CLOG2(icache_entries_p) @@ -39,7 +41,9 @@ module network_rx , input [data_mask_width_lp-1:0] mask_i , input bsg_manycore_load_info_s load_info_i , output logic yumi_o - + , input [x_cord_width_p-1:0] src_x_cord_debug_i + , input [y_cord_width_p-1:0] src_y_cord_debug_i + , output logic [data_width_p-1:0] returning_data_o , output logic returning_data_v_o @@ -58,46 +62,58 @@ module network_rx , input icache_yumi_i , output logic freeze_o - , 
output logic [x_cord_width_p-1:0] tgo_x_o - , output logic [y_cord_width_p-1:0] tgo_y_o + , output logic [x_subcord_width_p-1:0] tgo_x_o + , output logic [y_subcord_width_p-1:0] tgo_y_o , output logic [pc_width_lp-1:0] pc_init_val_o , output logic dram_enable_o - , input [x_cord_width_p-1:0] my_x_i - , input [y_cord_width_p-1:0] my_y_i + // remote interrupt to core + , output logic remote_interrupt_set_o + , output logic remote_interrupt_clear_o + + // remote interrupt from core + , input remote_interrupt_pending_bit_i + + // for debugging + , input [x_cord_width_p-1:0] global_x_i + , input [y_cord_width_p-1:0] global_y_i ); `declare_bsg_manycore_packet_s(addr_width_p,data_width_p,x_cord_width_p,y_cord_width_p); + // address decoding - // - logic is_dmem_addr; - logic is_icache_addr; - - logic is_csr_addr; - logic is_freeze_addr; - logic is_tgo_x_addr; - logic is_tgo_y_addr; - logic is_pc_init_val_addr; - logic is_dram_enable_addr; + // dmem addr space starts from EPA = 0 + wire is_dmem_addr = (addr_i[addr_width_p-1:dmem_addr_width_lp] == '0); + // icache addr space (1024-entry, 12-bit tag): + // EPA = 0000_01tt_tttt_tttt_tt??_????_???? 
(word addr) + wire is_icache_addr = addr_i[pc_width_lp] & (addr_i[addr_width_p-1:pc_width_lp+1] == '0); + + wire is_csr_addr = addr_i[epa_word_addr_width_gp-1] + & (addr_i[addr_width_p-1:epa_word_addr_width_gp] == '0); + wire is_freeze_addr = is_csr_addr & (addr_i[epa_word_addr_width_gp-2:0] == 'd0); + wire is_tgo_x_addr = is_csr_addr & (addr_i[epa_word_addr_width_gp-2:0] == 'd1); + wire is_tgo_y_addr = is_csr_addr & (addr_i[epa_word_addr_width_gp-2:0] == 'd2); + wire is_pc_init_val_addr = is_csr_addr & (addr_i[epa_word_addr_width_gp-2:0] == 'd3); + wire is_dram_enable_addr = is_csr_addr & (addr_i[epa_word_addr_width_gp-2:0] == 'd4); - assign is_dmem_addr = addr_i[dmem_addr_width_lp] & (addr_i[addr_width_p-1:dmem_addr_width_lp+1] == '0); - assign is_icache_addr = addr_i[pc_width_lp] & (addr_i[addr_width_p-1:pc_width_lp+1] == '0); - assign is_csr_addr = addr_i[epa_word_addr_width_lp-1] - & (addr_i[addr_width_p-1:epa_word_addr_width_lp] == '0); - assign is_freeze_addr = is_csr_addr & (addr_i[epa_word_addr_width_lp-2:0] == 'd0); - assign is_tgo_x_addr = is_csr_addr & (addr_i[epa_word_addr_width_lp-2:0] == 'd1); - assign is_tgo_y_addr = is_csr_addr & (addr_i[epa_word_addr_width_lp-2:0] == 'd2); - assign is_pc_init_val_addr = is_csr_addr & (addr_i[epa_word_addr_width_lp-2:0] == 'd3); - assign is_dram_enable_addr = is_csr_addr & (addr_i[epa_word_addr_width_lp-2:0] == 'd4); + // Remote interrupt pending bit (mip.remote) + // For write, the write enable signal is sent to the core for one cycle. + // writing 1 sets the mip.remote. + // writing 0 clears the mip.remote. + // This can be also read by the remote packet. + // This bit can also be modified by the vanilla core using csr instructions. + // When a remote packet and csr instr both tries to modify mip.remote, the remote packet has higher priority. 
+ // EPA (word) = 3fff + wire is_remote_interrupt_addr = (addr_i == 'h3fff); // CSR registers // logic freeze_r; - logic [x_cord_width_p-1:0] tgo_x_r; - logic [y_cord_width_p-1:0] tgo_y_r; + logic [x_subcord_width_p-1:0] tgo_x_r; + logic [y_subcord_width_p-1:0] tgo_y_r; logic [pc_width_lp-1:0] pc_init_val_r; logic dram_enable_r; @@ -110,8 +126,8 @@ module network_rx // incoming request handling logic // logic freeze_n; - logic [x_cord_width_p-1:0] tgo_x_n; - logic [y_cord_width_p-1:0] tgo_y_n; + logic [x_subcord_width_p-1:0] tgo_x_n; + logic [y_subcord_width_p-1:0] tgo_y_n; logic [pc_width_lp-1:0] pc_init_val_n; logic dram_enable_n; @@ -123,6 +139,7 @@ module network_rx logic send_invalid_r, send_invalid_n; logic send_zero_r, send_zero_n; logic send_dram_enable_r, send_dram_enable_n; + logic send_remote_interrupt_r, send_remote_interrupt_n; bsg_manycore_load_info_s load_info_r, load_info_n; @@ -156,6 +173,9 @@ module network_rx icache_pc_o = addr_i[0+:pc_width_lp]; icache_instr_o = data_i; + remote_interrupt_clear_o = 1'b0; + remote_interrupt_set_o = 1'b0; + send_remote_interrupt_n = 1'b0; yumi_o = 1'b0; @@ -188,7 +208,7 @@ module network_rx send_zero_n = 1'b1; end else if (is_pc_init_val_addr) begin - pc_init_val_n = data_i[2+:pc_width_lp]; + pc_init_val_n = data_i[0+:pc_width_lp]; yumi_o = 1'b1; send_zero_n = 1'b1; end @@ -197,6 +217,12 @@ module network_rx yumi_o = 1'b1; send_zero_n = 1'b1; end + else if (is_remote_interrupt_addr) begin + remote_interrupt_clear_o = ~data_i[0]; + remote_interrupt_set_o = data_i[0]; + yumi_o = 1'b1; + send_zero_n = 1'b1; + end else begin yumi_o = 1'b1; send_invalid_n = 1'b1; @@ -232,6 +258,10 @@ module network_rx yumi_o = 1'b1; send_dram_enable_n = 1'b1; end + else if (is_remote_interrupt_addr) begin + yumi_o = 1'b1; + send_remote_interrupt_n = 1'b1; + end else begin yumi_o = 1'b1; send_invalid_n = 1'b1; @@ -261,7 +291,8 @@ module network_rx | send_pc_init_val_r | send_dram_enable_r | send_invalid_r - | send_zero_r; + | 
send_zero_r + | send_remote_interrupt_r; if (send_dmem_data_r) begin returning_data_o = load_data_lo; @@ -270,13 +301,13 @@ module network_rx returning_data_o = {{(data_width_p-1){1'b0}}, freeze_r}; end else if (send_tgo_x_r) begin - returning_data_o = {{(data_width_p-x_cord_width_p){1'b0}}, tgo_x_r}; + returning_data_o = {{(data_width_p-x_subcord_width_p){1'b0}}, tgo_x_r}; end else if (send_tgo_y_r) begin - returning_data_o = {{(data_width_p-y_cord_width_p){1'b0}}, tgo_y_r}; + returning_data_o = {{(data_width_p-y_subcord_width_p){1'b0}}, tgo_y_r}; end else if (send_pc_init_val_r) begin - returning_data_o = {{(data_width_p-pc_width_lp-2){1'b0}}, pc_init_val_r, 2'b00}; + returning_data_o = {{(data_width_p-pc_width_lp){1'b0}}, pc_init_val_r}; end else if (send_dram_enable_r) begin returning_data_o = {{(data_width_p-1){1'b0}}, dram_enable_r}; @@ -287,6 +318,9 @@ module network_rx else if (send_invalid_r) begin returning_data_o = 'hdead_beef; end + else if (send_remote_interrupt_r) begin + returning_data_o = {{(data_width_p-1){1'b0}}, remote_interrupt_pending_bit_i}; + end else begin returning_data_o = '0; end @@ -298,8 +332,8 @@ module network_rx always_ff @ (posedge clk_i) begin if (reset_i) begin freeze_r <= (1)'(freeze_init_val_p); - tgo_x_r <= (x_cord_width_p)'(tgo_x_init_val_p); - tgo_y_r <= (y_cord_width_p)'(tgo_y_init_val_p); + tgo_x_r <= (x_subcord_width_p)'(tgo_x_init_val_p); + tgo_y_r <= (y_subcord_width_p)'(tgo_y_init_val_p); pc_init_val_r <= (pc_width_lp)'(default_pc_init_val_p); dram_enable_r <= 1'b1; // DRAM is enabled by default. 
@@ -311,6 +345,7 @@ module network_rx send_invalid_r <= 1'b0; send_zero_r <= 1'b0; send_dram_enable_r <= 1'b0; + send_remote_interrupt_r <= 1'b0; load_info_r <= '0; end else begin @@ -328,6 +363,7 @@ module network_rx send_invalid_r <= send_invalid_n; send_zero_r <= send_zero_n; send_dram_enable_r <= send_dram_enable_n; + send_remote_interrupt_r <= send_remote_interrupt_n; load_info_r <= load_info_n; end end @@ -340,22 +376,30 @@ module network_rx assign is_valid_csr_addr = is_csr_addr & (is_freeze_addr | is_tgo_x_addr | is_tgo_y_addr | is_pc_init_val_addr | is_dram_enable_addr); - assign is_invalid_addr = ~(is_dmem_addr | is_icache_addr | is_valid_csr_addr); + assign is_invalid_addr = ~(is_dmem_addr | is_icache_addr | is_valid_csr_addr | is_remote_interrupt_addr); always_ff @ (negedge clk_i) begin if (~reset_i & v_i & is_invalid_addr) begin - $display("[ERROR][RX] Invalid EPA Access. t=%0t, x=%d, y=%d, we=%d, addr=%h, data=%h", - $time, my_x_i, my_y_i, w_i, addr_i, data_i); + $display("[ERROR][RX] Invalid EPA Access. t=%0t, x=%d, y=%d, src_x=%d, src_y=%d, we=%d, addr=%h, data=%h", + $time, global_x_i, global_y_i, src_x_cord_debug_i, src_y_cord_debug_i, w_i, addr_i, data_i); end + /* + // uncomment to trace packets between tiles + if (~reset_i & v_i & ~is_invalid_addr) begin + $display("[INFO][RX] EPA Access. 
t=%0t, x=%d, y=%d, src_x=%d, src_y=%d, we=%d, addr=%h, data=%h mask=%h", + $time, global_x_i, global_y_i, src_x_cord_debug_i, src_y_cord_debug_i, w_i, addr_i, data_i, mask_i); + end + */ + // FREEZE / UNFREEZE if (~reset_i) begin if (freeze_n & ~freeze_r) - $display("[INFO][RX] Freezing tile t=%0t, x=%d, y=%d", $time, my_x_i, my_y_i); + $display("[INFO][RX] Freezing tile t=%0t, x=%d, y=%d", $time, global_x_i, global_y_i); if (~freeze_n & freeze_r) - $display("[INFO][RX] Unfreezing tile t=%0t, x=%d, y=%d", $time, my_x_i, my_y_i); + $display("[INFO][RX] Unfreezing tile t=%0t, x=%d, y=%d", $time, global_x_i, global_y_i); if (dram_enable_r & ~dram_enable_n) - $display("[INFO][RX] Disabling DRAM ctrl t=%0t, x=%d, y=%d", $time, my_x_i, my_y_i); + $display("[INFO][RX] Disabling DRAM ctrl t=%0t, x=%d, y=%d", $time, global_x_i, global_y_i); end diff --git a/v/vanilla_bean/network_tx.v b/v/vanilla_bean/network_tx.v index 3faa011a7..445738f6b 100644 --- a/v/vanilla_bean/network_tx.v +++ b/v/vanilla_bean/network_tx.v @@ -12,13 +12,18 @@ module network_tx , parameter addr_width_p="inv" , parameter x_cord_width_p="inv" , parameter y_cord_width_p="inv" + , parameter pod_x_cord_width_p="inv" + , parameter pod_y_cord_width_p="inv" , parameter epa_byte_addr_width_p="inv" + , parameter num_vcache_rows_p="inv" , parameter vcache_size_p="inv" // vcache capacity in words , parameter vcache_block_size_in_words_p="inv" , parameter vcache_sets_p="inv" - + , parameter num_tiles_x_p="inv" , parameter num_tiles_y_p="inv" + , parameter x_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_x_p) + , parameter y_subcord_width_lp=`BSG_SAFE_CLOG2(num_tiles_y_p) , parameter icache_entries_p="inv" , parameter icache_tag_width_p="inv" @@ -34,8 +39,6 @@ module network_tx , parameter icache_addr_width_lp=`BSG_SAFE_CLOG2(icache_entries_p) , parameter pc_width_lp=(icache_tag_width_p+icache_addr_width_lp) - , parameter epa_word_addr_width_lp=epa_word_addr_width_gp - , parameter 
reg_addr_width_lp=RV32_reg_addr_width_gp , parameter packet_width_lp= @@ -57,12 +60,14 @@ module network_tx , input returned_fifo_full_i , output logic returned_yumi_o - , input [x_cord_width_p-1:0] tgo_x_i - , input [y_cord_width_p-1:0] tgo_y_i + , input [x_subcord_width_lp-1:0] tgo_x_i + , input [y_subcord_width_lp-1:0] tgo_y_i , input dram_enable_i - , input [x_cord_width_p-1:0] my_x_i - , input [y_cord_width_p-1:0] my_y_i + , input [x_subcord_width_lp-1:0] my_x_i + , input [y_subcord_width_lp-1:0] my_y_i + , input [pod_x_cord_width_p-1:0] pod_x_i + , input [pod_y_cord_width_p-1:0] pod_y_i // core side // vanilla core uses valid-credit interface for outgoing requests. @@ -112,9 +117,12 @@ module network_tx ,.y_cord_width_p(y_cord_width_p) ,.num_tiles_x_p(num_tiles_x_p) ,.num_tiles_y_p(num_tiles_y_p) + ,.num_vcache_rows_p(num_vcache_rows_p) ,.vcache_block_size_in_words_p(vcache_block_size_in_words_p) ,.vcache_size_p(vcache_size_p) ,.vcache_sets_p(vcache_sets_p) + ,.pod_x_cord_width_p(pod_x_cord_width_p) + ,.pod_y_cord_width_p(pod_y_cord_width_p) ) eva2npa ( .eva_i(remote_req_i.addr) ,.dram_enable_i(dram_enable_i) @@ -126,6 +134,9 @@ module network_tx ,.epa_o(epa_lo) ,.is_invalid_addr_o(is_invalid_addr_lo) + + ,.pod_x_i(pod_x_i) + ,.pod_y_i(pod_y_i) ); @@ -145,18 +156,33 @@ module network_tx out_packet.x_cord = x_cord_lo; out_packet.addr = epa_lo; - out_packet.reg_id = remote_req_i.reg_id; - out_packet.op_ex = remote_req_i.is_amo_op - ? remote_req_i.amo_type - : remote_req_i.mask; - out_packet.src_y_cord = my_y_i; - out_packet.src_x_cord = my_x_i; - - out_packet.op = remote_req_i.is_amo_op - ? e_remote_amo - : (remote_req_i.write_not_read - ? 
e_remote_store - : e_remote_load); + if (remote_req_i.write_not_read) begin + out_packet.reg_id.store_mask_s.mask = remote_req_i.mask; + out_packet.reg_id.store_mask_s.unused = 1'b0; + end + else begin + out_packet.reg_id = remote_req_i.reg_id; + end + + out_packet.src_y_cord = {pod_y_i, my_y_i}; + out_packet.src_x_cord = {pod_x_i, my_x_i}; + + if (remote_req_i.is_amo_op) begin + case (remote_req_i.amo_type) + e_vanilla_amoswap: out_packet.op_v2 = e_remote_amoswap; + e_vanilla_amoor: out_packet.op_v2 = e_remote_amoor; + e_vanilla_amoadd: out_packet.op_v2 = e_remote_amoadd; + default: out_packet.op_v2 = e_remote_amoswap; // should never happen. + endcase + end + else begin + if (remote_req_i.write_not_read) begin + out_packet.op_v2 = e_remote_store; + end + else begin + out_packet.op_v2 = e_remote_load; + end + end end @@ -188,13 +214,13 @@ module network_tx end else if (returned_pkt_type_i == e_return_float_wb) begin float_remote_load_resp_v_o = returned_v_i; - float_remote_load_resp_force_o = returned_fifo_full_i & returned_v_i; - returned_yumi_o = float_remote_load_resp_yumi_i | (returned_fifo_full_i & returned_v_i); + float_remote_load_resp_force_o = returned_fifo_full_i; + returned_yumi_o = float_remote_load_resp_yumi_i | (returned_fifo_full_i); end else begin int_remote_load_resp_v_o = returned_v_i; - int_remote_load_resp_force_o = returned_fifo_full_i & returned_v_i; - returned_yumi_o = int_remote_load_resp_yumi_i | (returned_fifo_full_i & returned_v_i); + int_remote_load_resp_force_o = returned_fifo_full_i; + returned_yumi_o = int_remote_load_resp_yumi_i | (returned_fifo_full_i); end end @@ -203,8 +229,8 @@ module network_tx always_ff @ (negedge clk_i) begin if (remote_req_v_i & is_invalid_addr_lo) begin - $display("[ERROR][TX] Invalid EVA access. t=%0t, x=%d, y=%d, addr=%h", - $time, my_x_i, my_y_i, remote_req_i.addr); + $display("[ERROR][TX] Invalid EVA access. 
t=%0t, x=%d, y=%d, addr=%h data=%h w=%b", + $time, {pod_x_i, my_x_i}, {pod_y_i, my_y_i}, remote_req_i.addr, remote_req_i.data, remote_req_i.write_not_read); end if (returned_v_i) begin @@ -212,6 +238,11 @@ module network_tx else $error("[ERROR][TX] Credit packet should not be given to vanilla core."); end + // if the return fifo is full, the response has to be taken by the core at that cycle. + if (returned_fifo_full_i) begin + assert(returned_yumi_o) else $error("[ERROR][TX] Return fifo is full, but the response is not taken by the core."); + end + end // synopsys translate_on diff --git a/v/vanilla_bean/scoreboard.v b/v/vanilla_bean/scoreboard.v index 9d7f94a8d..02d228a3e 100644 --- a/v/vanilla_bean/scoreboard.v +++ b/v/vanilla_bean/scoreboard.v @@ -177,12 +177,8 @@ module scoreboard // synopsys translate_off always_ff @ (negedge clk_i) begin if (~reset_i) begin - for (integer i = 0; i < num_clear_port_p; i++) begin - if (score_i & clear_i[i]) begin - assert(score_id_i != clear_id_i[i]) - else $error("score and clear on the same id cannot happen."); - end - end + assert((score_bits & clear_combined) == '0) + else $error("[BSG_ERROR] score and clear on the same id cannot happen."); end end // synopsys translate_on diff --git a/v/vanilla_bean/vanilla_core.v b/v/vanilla_bean/vanilla_core.v index c3a5ab28c..c90ac1e01 100644 --- a/v/vanilla_bean/vanilla_core.v +++ b/v/vanilla_bean/vanilla_core.v @@ -21,9 +21,6 @@ module vanilla_core , parameter max_out_credits_p="inv" - // Enables branch & jalr target-addr stream on stderr - , parameter branch_trace_en_p=0 - // For network input FIFO credit counting // By default, 3 credits are needed, because the round trip to get the credit back takes three cycles. // ID->EXE->FIFO->CREDIT. 
@@ -82,12 +79,17 @@ module vanilla_core , input invalid_eva_access_i + // remote interrupt interface + , input remote_interrupt_set_i + , input remote_interrupt_clear_i + , output logic remote_interrupt_pending_bit_o + // remaining credits , input [credit_counter_width_lp-1:0] out_credits_i // For debugging - , input [x_cord_width_p-1:0] my_x_i - , input [y_cord_width_p-1:0] my_y_i + , input [x_cord_width_p-1:0] global_x_i + , input [y_cord_width_p-1:0] global_y_i ); // pipeline signals @@ -306,6 +308,7 @@ module vanilla_core fcsr_s fcsr_data_li; logic [11:0] fcsr_addr_li; fcsr_s fcsr_data_lo; + logic fcsr_data_v_lo; logic [1:0] fcsr_fflags_v_li; fflags_s [1:0] fcsr_fflags_li; frm_e frm_r; @@ -320,6 +323,7 @@ module vanilla_core ,.data_i(fcsr_data_li) ,.addr_i(fcsr_addr_li) ,.data_o(fcsr_data_lo) + ,.data_v_o(fcsr_data_v_lo) // [0] fpu_int -> MEM // [1] fpu_float, fdiv -> FP_WB ,.fflags_v_i(fcsr_fflags_v_li) @@ -327,6 +331,64 @@ module vanilla_core ,.frm_o(frm_r) ); + + // MCSR + logic mcsr_we_li; + logic [data_width_p-1:0] mcsr_data_li; + logic [data_width_p-1:0] mcsr_data_lo; + + logic mcsr_instr_executed_li; + logic mcsr_interrupt_entered_li; + logic mcsr_mret_called_li; + logic [pc_width_lp-1:0] mcsr_npc_r_li; + + csr_mstatus_s mstatus_r; + csr_interrupt_vector_s mip_r; + csr_interrupt_vector_s mie_r; + logic [pc_width_lp-1:0] mepc_r; + + mcsr #( + .pc_width_p(pc_width_lp) + ) mcsr0 ( + .clk_i(clk_i) + ,.reset_i(reset_i) + + ,.remote_interrupt_set_i(remote_interrupt_set_i) + ,.remote_interrupt_clear_i(remote_interrupt_clear_i) + + ,.we_i (mcsr_we_li) + ,.addr_i (id_r.instruction[31:20]) + ,.funct3_i (id_r.instruction.funct3) + ,.data_i (mcsr_data_li) + ,.rs1_i (id_r.instruction.rs1) + ,.data_o (mcsr_data_lo) + + ,.instr_executed_i(mcsr_instr_executed_li) + ,.interrupt_entered_i(mcsr_interrupt_entered_li) + ,.mret_called_i(mcsr_mret_called_li) + ,.npc_r_i(mcsr_npc_r_li) + + ,.mstatus_r_o(mstatus_r) + ,.mip_r_o(mip_r) + ,.mie_r_o(mie_r) + ,.mepc_r_o(mepc_r) + ); 
+ + // synopsys translate_off + wire [pc_width_lp+2-1:0] mepc_00 = {mepc_r, 2'b00}; + // synopsys translate_on + + assign remote_interrupt_pending_bit_o = mip_r.remote; // make it accessible by remote packet. + + // Interrupt can be taken when mstatus.mie=1 and enable and pending bits are both on for an interrupt source, + // When icache miss is not already in progress (e.g. no icache bubble in EXE, MEM or WB) + wire remote_interrupt_ready = mip_r.remote & mie_r.remote; + wire trace_interrupt_ready = mip_r.trace & mie_r.trace; + wire interrupt_ready = mstatus_r.mie + & (remote_interrupt_ready | trace_interrupt_ready) + & ~(exe_r.icache_miss | mem_r.icache_miss | wb_r.icache_miss); + + // calculate mem address offset // @@ -334,7 +396,7 @@ module vanilla_core | id_r.decode.is_lr_aq_op | id_r.decode.is_amo_op; - wire [data_width_p-1:0] mem_addr_op2 = is_amo_or_lr_op + wire [data_width_p-1:0] mem_addr_op2 = (is_amo_or_lr_op | id_r.decode.is_flwadd_op) ? '0 : (id_r.decode.is_store_op ? `RV32_signext_Simm(id_r.instruction) @@ -487,6 +549,7 @@ module vanilla_core ); + // ALU // logic [data_width_p-1:0] alu_result; @@ -505,44 +568,27 @@ module vanilla_core ,.jump_now_o(alu_jump_now) ); - wire branch_under_predict = alu_jump_now & ~exe_r.instruction[0]; - wire branch_over_predict = ~alu_jump_now & exe_r.instruction[0]; - wire branch_mispredict = exe_r.decode.is_branch_op & (branch_under_predict | branch_over_predict); - wire jalr_mispredict = exe_r.decode.is_jalr_op & (alu_jalr_addr != exe_r.pred_or_jump_addr[2+:pc_width_lp]); - - // Compute branch/jalr target address - logic [pc_width_lp-1:0] exe_pc_target; - always_comb begin - if (exe_r.decode.is_branch_op) begin - exe_pc_target = branch_under_predict - ? 
exe_r.pred_or_jump_addr[2+:pc_width_lp] - : exe_r.pc_plus4[2+:pc_width_lp]; - end - else begin - exe_pc_target = alu_jalr_addr; - end - end // save pc+4 of jalr/jal for predicting jalr branch target - logic [pc_width_lp-1:0] jalr_prediction_r; - - assign jalr_prediction = (exe_r.decode.is_jal_op | exe_r.decode.is_jalr_op) - ? exe_r.pc_plus4[2+:pc_width_lp] - : jalr_prediction_r; + // For risc-v, hints for saving return address for jalr/jal are encoded implicitly in the rd used. + // For jalr/jal, save the pc+4 when rd = x1 or x5. + wire jalr_prediction_write_en = (exe_r.decode.is_jal_op | exe_r.decode.is_jalr_op) + & ((exe_r.instruction.rd == 5'd1) | (exe_r.instruction.rd == 5'd5)); - bsg_dff_reset #( + bsg_dff_reset_en_bypass #( .width_p(pc_width_lp) ) jalr_pred_dff ( .clk_i(clk_i) ,.reset_i(reset_i) - ,.data_i(jalr_prediction) - ,.data_o(jalr_prediction_r) + ,.en_i(jalr_prediction_write_en) + ,.data_i(exe_r.pc_plus4[2+:pc_width_lp]) + ,.data_o(jalr_prediction) ); // alu/csr result mux wire [data_width_p-1:0] alu_or_csr_result = exe_r.decode.is_csr_op - ? {24'b0, exe_r.fcsr_data} + ? exe_r.rs2_val : alu_result; @@ -588,7 +634,6 @@ module vanilla_core .data_width_p(data_width_p) ,.pc_width_p(pc_width_lp) ,.dmem_size_p(dmem_size_p) - ,.branch_trace_en_p(branch_trace_en_p) ) lsu0 ( .clk_i(clk_i) ,.reset_i(reset_i) @@ -599,7 +644,6 @@ module vanilla_core ,.mem_offset_i(exe_r.mem_addr_op2) ,.pc_plus4_i(exe_r.pc_plus4) ,.icache_miss_i(exe_r.icache_miss) - ,.pc_target_i(exe_pc_target) ,.remote_req_o(remote_req_o) ,.remote_req_v_o(lsu_remote_req_v_lo) @@ -616,6 +660,60 @@ module vanilla_core ); + // npc_r ('true next pc') + // this keeps track of what should be the next PC of the instruction that was last in EXE (i.e. latest committed instruction). + // this is updated when a valid instruction moves out of EXE (or FP_EXE) + // For non-control instructions, this is pc+4. + // For control instructions, this is the branch/jump target. 
+ // This is used for setting mepc_r, when the interrupt is taken. + // this is different from pc_n in IF, which could have mispredicted pc. + logic npc_write_en; + logic [pc_width_lp-1:0] npc_n, npc_r; + + bsg_dff_en_bypass #( + .width_p(pc_width_lp) + ) npc_dff ( + .clk_i(clk_i) + ,.en_i(npc_write_en) + ,.data_i(npc_n) + ,.data_o(npc_r) + ); + + + // synopsys translate_off + wire [pc_width_lp+2-1:0] npc_00 = {npc_r, 2'b00}; // for debugging + // synopsys translate_on + + + // In the icache, branch instruction has the direction of the branch encoded in the bit-0 of the instruction. + // 0 = forward branch (always predict 'not taken') + // 1 = backward branch (always predict 'taken') + // 'branch underpredict' means that branch was predicted to be "not taken", but actually needs to be taken. + // 'branch overpredict' means that branch was predicted to be "taken", but actually needs to be not taken. + // In either cases, the frontend should be flushed. + wire branch_under_predict = (alu_jump_now & ~exe_r.instruction[0]); + wire branch_over_predict = (~alu_jump_now & exe_r.instruction[0]); + wire branch_mispredict = (branch_under_predict | branch_over_predict) & exe_r.decode.is_branch_op; + wire jalr_mispredict = exe_r.decode.is_jalr_op & (alu_jalr_addr != exe_r.pred_or_jump_addr[2+:pc_width_lp]); + + always_comb begin + if (exe_r.decode.is_jalr_op) begin + npc_n = alu_jalr_addr; + end + else if (exe_r.decode.is_mret_op) begin + npc_n = mepc_r; + end + else if (exe_r.decode.is_jal_op | (exe_r.decode.is_branch_op & alu_jump_now)) begin + npc_n = exe_r.pred_or_jump_addr[2+:pc_width_lp]; + end + else begin + npc_n = exe_r.pc_plus4[2+:pc_width_lp]; + end + end + + + + ////////////////////////////// // // // FP EXE STAGE // @@ -821,14 +919,14 @@ module vanilla_core reserved_addr_r <= dmem_addr_li; // synopsys translate_off if (debug_p) - $display("[INFO][VCORE] making reservation. 
t=%0t, addr=%x, x=%0d, y=%0d", $time, dmem_addr_li, my_x_i, my_y_i); + $display("[INFO][VCORE] making reservation. t=%0t, addr=%x, x=%0d, y=%0d", $time, dmem_addr_li, global_x_i, global_y_i); // synopsys translate_on end else if (break_reserve) begin reserved_r <= 1'b0; // synopsys translate_off if (debug_p) - $display("[INFO][VCORE] breaking reservation. t=%0t, x=%0d, y=%0d.", $time, my_x_i, my_y_i); + $display("[INFO][VCORE] breaking reservation. t=%0t, x=%0d, y=%0d.", $time, global_x_i, global_y_i); // synopsys translate_on end end @@ -899,6 +997,7 @@ module vanilla_core // ID stall signals logic stall_depend_long_op; logic stall_depend_local_load; + logic stall_depend_local_flwadd; logic stall_depend_imul; logic stall_bypass; logic stall_lr_aq; @@ -922,6 +1021,7 @@ module vanilla_core wire stall_id = stall_depend_long_op | stall_depend_local_load + | stall_depend_local_flwadd | stall_depend_imul | stall_bypass | stall_lr_aq @@ -943,7 +1043,10 @@ module vanilla_core // flush condition - wire flush = (branch_mispredict | jalr_mispredict); + // 1) branch/jalr mispredict + // 2) mret in EXE + // 3) interrupt taken + wire flush = (branch_mispredict | jalr_mispredict) | (exe_r.decode.is_mret_op) | interrupt_ready; wire icache_miss_in_pipe = id_r.icache_miss | exe_r.icache_miss | mem_r.icache_miss | wb_r.icache_miss; // reset edge down detect @@ -956,21 +1059,76 @@ module vanilla_core wire reset_down = reset_r & ~reset_i; + + // Next PC logic always_comb begin - if (reset_down) + if (reset_down) begin pc_n = pc_init_val_i; - else if (wb_r.icache_miss) + end + else if (wb_r.icache_miss) begin pc_n = wb_r.icache_miss_pc[2+:pc_width_lp]; - else if (flush) - pc_n = exe_pc_target; - else if (decode.is_branch_op & instruction[0]) + end + else if (interrupt_ready) begin + if (remote_interrupt_ready) begin + pc_n = `REMOTE_INTERRUPT_JUMP_ADDR; + end + else begin + pc_n = `TRACE_INTERRUPT_JUMP_ADDR; + end + end + else if (exe_r.decode.is_mret_op) begin + pc_n = mepc_r; + end + 
else if (branch_mispredict) begin + pc_n = alu_jump_now + ? exe_r.pred_or_jump_addr[2+:pc_width_lp] + : exe_r.pc_plus4[2+:pc_width_lp]; + end + else if (jalr_mispredict) begin + pc_n = alu_jalr_addr; + end + else if (decode.is_branch_op & instruction[0]) begin pc_n = pred_or_jump_addr; - else if (decode.is_jal_op | decode.is_jalr_op) + end + else if (decode.is_jal_op | decode.is_jalr_op) begin pc_n = pred_or_jump_addr; - else + end + else begin pc_n = pc_plus4; + end end + // debug printing for interrupt and mret + // synopsys translate_off + + always_ff @ (negedge clk_i) begin + if (~reset_i & ~stall_all & interrupt_ready) begin + if (remote_interrupt_ready) begin + $display("[INFO][VCORE] Remote interrupt taken. t=%0t, x=%0d, y=%0d, mepc=%h", + $time, global_x_i, global_y_i, {npc_r, 2'b00}); + end + else begin + $display("[INFO][VCORE] Trace interrupt taken. t=%0t, x=%0d, y=%0d, mepc=%h", + $time, global_x_i, global_y_i, {npc_r, 2'b00}); + end + end + + if (~reset_i & ~stall_all & exe_r.decode.is_mret_op) begin + $display("[INFO][VCORE] mret called. t=%0t, x=%0d, y=%0d, mepc=%h", + $time, global_x_i, global_y_i, {mepc_r, 2'b00}); + end + +/* if (jalr_mispredict) + $display("[INFO][VCORE] jalr_mispredict. 
t=%0t, x=%0d, y=%0d, true=%x pred=%x\n", + $time, global_x_i, global_y_i, + { alu_jalr_addr, 2'b00 }, + { exe_r.pred_or_jump_addr[2+:pc_width_lp], 2'b00 } + ); + */ + end + // synopsys translate_on + + // icache logic wire read_icache = (icache_miss_in_pipe & ~flush) @@ -1020,7 +1178,8 @@ module vanilla_core instruction: '0, decode: '0, fp_decode: '0, - icache_miss: 1'b1 + icache_miss: 1'b1, + valid: 1'b0 }; end else begin @@ -1030,7 +1189,8 @@ module vanilla_core instruction: instruction, decode: decode, fp_decode: fp_decode, - icache_miss: 1'b0 + icache_miss: 1'b0, + valid: 1'b1 }; end end @@ -1050,12 +1210,15 @@ module vanilla_core wire [reg_addr_width_lp-1:0] id_rs3 = id_r.instruction[31:27]; wire [reg_addr_width_lp-1:0] id_rd = id_r.instruction.rd; wire remote_req_in_exe = lsu_remote_req_v_lo; - wire local_load_in_exe = lsu_dmem_v_lo & ~lsu_dmem_w_lo; + wire local_load_in_exe = lsu_dmem_v_lo & ~lsu_dmem_w_lo & ~exe_r.decode.is_flwadd_op; // local lw or flw + wire local_flwadd_in_exe = lsu_dmem_v_lo & ~lsu_dmem_w_lo & exe_r.decode.is_flwadd_op; // local flwadd wire id_rs1_non_zero = id_rs1 != '0; wire id_rs2_non_zero = id_rs2 != '0; wire id_rd_non_zero = id_rd != '0; + // is_flwadd_op does not cause is_load_op to be set high, so the fact that + // is_flwadd_op has exe_r.decode.write_rd high will not trigger int_remote_load_in_exe wire int_remote_load_in_exe = remote_req_in_exe & exe_r.decode.is_load_op & exe_r.decode.write_rd; - wire float_remote_load_in_exe = remote_req_in_exe & exe_r.decode.is_load_op & exe_r.decode.write_frd; + wire float_remote_load_in_exe = remote_req_in_exe & (exe_r.decode.is_load_op | exe_r.decode.is_flwadd_op) & exe_r.decode.write_frd; wire fdiv_fsqrt_in_fp_exe = fp_exe_r.fp_decode.is_fdiv_op | fp_exe_r.fp_decode.is_fsqrt_op; wire remote_credit_pending = (out_credits_i != max_out_credits_p); @@ -1077,6 +1240,11 @@ module vanilla_core |(id_r.decode.read_frs2 & (id_rs2 == exe_r.instruction.rd) & exe_r.decode.write_frd) 
|(id_r.decode.read_frs3 & (id_rs3 == exe_r.instruction.rd) & exe_r.decode.write_frd)); + assign stall_depend_local_flwadd = local_flwadd_in_exe & + ((id_r.decode.read_frs1 & (id_rs1 == exe_r.instruction.rd) & exe_r.decode.write_frd) + |(id_r.decode.read_frs2 & (id_rs2 == exe_r.instruction.rd) & exe_r.decode.write_frd) + |(id_r.decode.read_frs3 & (id_rs3 == exe_r.instruction.rd) & exe_r.decode.write_frd)); + // stall_depend_imul assign stall_depend_imul = exe_r.decode.is_imul_op & @@ -1085,6 +1253,7 @@ module vanilla_core // stall_bypass + // stalling because there is no forward path from FP_EXE, FPU1, MEM to ID for frs. wire stall_bypass_fp_frs = (id_r.decode.read_frs1 & (id_rs1 == fp_exe_r.rd) & fp_exe_r.fp_decode.is_fpu_float_op) |(id_r.decode.read_frs2 & (id_rs2 == fp_exe_r.rd) & fp_exe_r.fp_decode.is_fpu_float_op) @@ -1092,10 +1261,10 @@ module vanilla_core |(id_r.decode.read_frs1 & (id_rs1 == fpu1_rd_r) & fpu1_v_r) |(id_r.decode.read_frs2 & (id_rs2 == fpu1_rd_r) & fpu1_v_r) |(id_r.decode.read_frs3 & (id_rs3 == fpu1_rd_r) & fpu1_v_r) - |(id_r.decode.read_frs1 & (id_rs1 == mem_r.rd_addr) & mem_r.write_frd) - |(id_r.decode.read_frs2 & (id_rs2 == mem_r.rd_addr) & mem_r.write_frd) - |(id_r.decode.read_frs3 & (id_rs3 == mem_r.rd_addr) & mem_r.write_frd); - + |(id_r.decode.read_frs1 & (id_rs1 == mem_r.frd_addr) & mem_r.write_frd) + |(id_r.decode.read_frs2 & (id_rs2 == mem_r.frd_addr) & mem_r.write_frd) + |(id_r.decode.read_frs3 & (id_rs3 == mem_r.frd_addr) & mem_r.write_frd); + wire stall_bypass_fp_rs1 = (id_r.decode.read_rs1 & id_rs1_non_zero) & (((id_rs1 == fp_exe_r.rd) & fp_exe_r.fp_decode.is_fpu_int_op) |((id_rs1 == imul_rd_lo) & imul_v_lo) @@ -1128,7 +1297,8 @@ module vanilla_core |id_r.decode.is_store_op |id_r.decode.is_amo_op |id_r.decode.is_lr_aq_op - |id_r.decode.is_lr_op); + |id_r.decode.is_lr_op + |id_r.decode.is_flwadd_op); // stall_amo_rl // If there is a remote request in EXE, there is a technically remote request pending, even if the credit counter has 
not yet been decremented. @@ -1139,7 +1309,7 @@ module vanilla_core // stall_remote_req logic [lg_fwd_fifo_els_lp-1:0] remote_req_counter_r; wire local_mem_op_restore = (lsu_dmem_v_lo & ~exe_r.decode.is_lr_op & ~exe_r.decode.is_lr_aq_op) & ~stall_all; - wire id_remote_req_op = (id_r.decode.is_load_op | id_r.decode.is_store_op | id_r.decode.is_amo_op | id_r.icache_miss); + wire id_remote_req_op = (id_r.decode.is_load_op | id_r.decode.is_store_op | id_r.decode.is_amo_op | id_r.icache_miss | id_r.decode.is_flwadd_op); wire memory_op_issued = id_remote_req_op & ~flush & ~stall_id & ~stall_all; wire [lg_fwd_fifo_els_lp-1:0] remote_req_available = remote_req_counter_r + @@ -1198,7 +1368,7 @@ module vanilla_core logic [2:0] has_forward_data_rs2; assign has_forward_data_rs1[0] = - ((exe_r.decode.write_rd & (exe_r.instruction.rd == id_rs1)) + ((exe_r.decode.write_rd & ((exe_r.decode.is_flwadd_op ? exe_r.instruction.rs1 : exe_r.instruction.rd) == id_rs1)) |(fp_exe_r.fp_decode.is_fpu_int_op & (fp_exe_r.rd == id_rs1))) & id_rs1_non_zero; assign has_forward_data_rs1[1] = @@ -1219,7 +1389,7 @@ module vanilla_core ); assign has_forward_data_rs2[0] = - ((exe_r.decode.write_rd & (exe_r.instruction.rd == id_rs2)) + ((exe_r.decode.write_rd & ((exe_r.decode.is_flwadd_op ? exe_r.instruction.rs1 : exe_r.instruction.rd) == id_rs2)) |(fp_exe_r.fp_decode.is_fpu_int_op & (fp_exe_r.rd == id_rs2))) & id_rs2_non_zero; assign has_forward_data_rs2[1] = @@ -1253,26 +1423,61 @@ module vanilla_core assign fcsr_addr_li = id_r.instruction[31:20]; + // interrupt / CSR control + assign mcsr_we_li = (id_r.decode.is_csr_op) & ~flush & ~stall_all & ~stall_id; + assign mcsr_data_li = rs1_val_to_exe; + assign mcsr_instr_executed_li = id_r.valid & ~flush & ~stall_all & ~stall_id & mstatus_r.mie; // trace interrupt pending can be set outside interrupt. 
+ assign mcsr_interrupt_entered_li = interrupt_ready & ~stall_all; + assign mcsr_mret_called_li = exe_r.decode.is_mret_op & ~stall_all; + assign mcsr_npc_r_li = npc_r; + + + + // ID -> EXE + // update npc_r, when the pipeline is not stalled, and there is a valid instruction in EXE/FP_EXE; always_comb begin if (stall_all) begin exe_n = exe_r; + npc_write_en = 1'b0; end else begin - if (flush | stall_id | id_r.decode.is_fp_op) begin + npc_write_en = (exe_r.valid & mstatus_r.mie) | exe_r.decode.is_mret_op; + if (flush | stall_id) begin exe_n = '0; end + else if (id_r.decode.is_fp_op) begin + // for fp_op, we still want to keep track of npc_r. + // so we set the valid and pc_plus4. + exe_n = '{ + pc_plus4: id_r.pc_plus4, + valid: id_r.valid, + pred_or_jump_addr: '0, + instruction: '0, + decode: '0, + rs1_val: '0, + rs2_val: '0, + mem_addr_op2: '0, + icache_miss: 1'b0 + }; + end else begin exe_n = '{ pc_plus4: id_r.pc_plus4, + valid: id_r.valid, pred_or_jump_addr: id_r.pred_or_jump_addr, instruction: id_r.instruction, decode: id_r.decode, rs1_val: rs1_val_to_exe, - rs2_val: rs2_val_to_exe, + // rs2_val carries csr load values + // if csr addr matches any of fcsr addr, then fcsr_data_v_lo will be asserted. + rs2_val: (id_r.decode.is_csr_op + ? (fcsr_data_v_lo + ? (data_width_p)'(fcsr_data_lo) + : mcsr_data_lo) + : rs2_val_to_exe), mem_addr_op2: mem_addr_op2, - icache_miss: id_r.icache_miss, - fcsr_data: fcsr_data_lo + icache_miss: id_r.icache_miss }; end end @@ -1300,12 +1505,20 @@ module vanilla_core : frm_e'(id_r.instruction.funct3); always_comb begin + fp_exe_n = fp_exe_r; if (stall_all) begin fp_exe_n = fp_exe_r; end else begin if (flush | stall_id | ~id_r.decode.is_fp_op) begin - fp_exe_n = '0; + // put nop in fp_exe. 
+ // we hold the data inputs steady in the case of a stall, + // or if there is not a floating point operation + // to avoid unnecessarily toggling of the FP unit + fp_exe_n.fp_decode.is_fpu_float_op = 1'b0; + fp_exe_n.fp_decode.is_fpu_int_op = 1'b0; + fp_exe_n.fp_decode.is_fdiv_op = 1'b0; + fp_exe_n.fp_decode.is_fsqrt_op = 1'b0; end else begin fp_exe_n = '{ @@ -1339,12 +1552,13 @@ module vanilla_core if (stall_all) begin mem_n = mem_r; end - else if (exe_r.decode.is_idiv_op | (remote_req_in_exe & ~exe_r.icache_miss)) begin + else if (exe_r.decode.is_idiv_op | (remote_req_in_exe & ~exe_r.icache_miss & ~exe_r.decode.is_flwadd_op)) begin mem_n = '0; end else if (fp_exe_r.fp_decode.is_fpu_int_op) begin fcsr_fflags_v_li[0] = 1'b1; mem_n = '{ + frd_addr: '0, rd_addr: fp_exe_r.rd, exe_result: fpu_int_result_lo, write_rd: 1'b1, @@ -1353,13 +1567,15 @@ module vanilla_core is_hex_op: 1'b0, is_load_unsigned: 1'b0, local_load: 1'b0, + local_flwadd: 1'b0, mem_addr_sent: '0, icache_miss: 1'b0 }; end else begin mem_n = '{ - rd_addr: exe_r.instruction.rd, + rd_addr: (exe_r.decode.is_flwadd_op ? exe_r.instruction.rs1 : exe_r.instruction.rd), + frd_addr: exe_r.instruction.rd, exe_result: alu_or_csr_result, write_rd: exe_r.decode.write_rd, write_frd: exe_r.decode.write_frd, @@ -1367,6 +1583,7 @@ module vanilla_core is_hex_op: exe_r.decode.is_hex_op, is_load_unsigned: exe_r.decode.is_load_unsigned, local_load: local_load_in_exe, + local_flwadd: local_flwadd_in_exe, mem_addr_sent: lsu_mem_addr_sent_lo, icache_miss: exe_r.icache_miss }; @@ -1440,7 +1657,8 @@ module vanilla_core stall_idiv_wb = 1'b0; stall_remote_ld_wb = 1'b0; - if (int_remote_load_resp_v_i & int_remote_load_resp_force_i) begin + // int remote_load_resp and icache response are mutually exclusive events. 
+ if (int_remote_load_resp_force_i) begin wb_n.write_rd = 1'b1; wb_n.rd_addr = int_remote_load_resp_rd_i; wb_n.rf_data = int_remote_load_resp_data_i; @@ -1503,7 +1721,7 @@ module vanilla_core else begin flw_wb_n = '{ valid: mem_r.write_frd, - rd_addr: mem_r.rd_addr, + rd_addr: mem_r.frd_addr, rf_data: local_load_data_r }; end @@ -1532,7 +1750,7 @@ module vanilla_core fcsr_fflags_li[1] = fpu_float_fflags_lo; - if (float_remote_load_resp_v_i & float_remote_load_resp_force_i) begin + if (float_remote_load_resp_force_i) begin select_remote_flw = 1'b1; float_rf_wen = 1'b1; float_rf_waddr = float_remote_load_resp_rd_i;