diff --git a/software/bsg_manycore_lib/bsg_shared_mem.hpp b/software/bsg_manycore_lib/bsg_shared_mem.hpp
index 9854613d0..2b72ce20f 100644
--- a/software/bsg_manycore_lib/bsg_shared_mem.hpp
+++ b/software/bsg_manycore_lib/bsg_shared_mem.hpp
@@ -1,6 +1,8 @@
 #pragma once
+
 extern "C" {
 #include "bsg_manycore.h"
 }
+#include "bsg_tile_group_barrier.hpp"  // C++ header (bsg_barrier is a template): keep outside extern "C"
 #include <cstdlib>
 #include <cmath>
@@ -96,6 +98,49 @@ namespace bsg_manycore {
         }
 
 
+        // Reduce (sum) all elements in tile group shared memory
+        // and store the result in the first element. The reduction runs in
+        // rounds, starting from an offset of 1 and a multiple of 2:
+        // For every index i that is a multiple of 2: A[i] <-- A[i] + A[i+1]
+        // For every index i that is a multiple of 4: A[i] <-- A[i] + A[i+2]
+        // For every index i that is a multiple of 8: A[i] <-- A[i] + A[i+4]
+        // .... Continue until offset reaches or exceeds the array size ....
+        // Example
+        // |1|1|1|1|1|1|1|1|   Offset: 1  - Mult: 2
+        //  |/  |/  |/  |/
+        // |2|1|2|1|2|1|2|1|   Offset: 2  - Mult: 4
+        //  |  /   |  /
+        //  | /    | /
+        //  |/     |/
+        // |4|1|2|1|4|1|2|1|   Offset: 4  - Mult: 8
+        //  |       /
+        //  |      /
+        //  |     /
+        //  |    /
+        //  |   /
+        //  |  /
+        //  | /
+        //  |/
+        // |8|1|2|1|4|1|2|1|
+        void reduce(bsg_barrier<TG_DIM_X, TG_DIM_Y> &barrier) {
+            int offset = 1;  // distance from an element to its partner
+            int mult = 2;    // indices that are multiples of mult accumulate
+
+            while (offset < SIZE) {
+                for (int iter_x = bsg_id; iter_x < SIZE; iter_x += TILES) {
+                    // A partner index at or past SIZE is out of bounds (this
+                    // happens whenever SIZE is not a power of two): skip it.
+                    if (!(iter_x % mult) && (iter_x + offset < SIZE)) {
+                        (*this)[iter_x] += (*this)[iter_x + offset];
+                    }
+                }
+
+                // Sync before the next round reads this round's partial sums
+                barrier.sync();
+                mult <<= 1;
+                offset <<= 1;
+            }
+        }
 
        
     private:
diff --git a/software/spmd/bsg_cuda_lite_runtime/hard_shared/kernel_hard_shared.cpp b/software/spmd/bsg_cuda_lite_runtime/hard_shared/kernel_hard_shared.cpp
deleted file mode 100644
index 59ddf2440..000000000
--- a/software/spmd/bsg_cuda_lite_runtime/hard_shared/kernel_hard_shared.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-// This kernel performs tests hardware tile group shared memory.
-
-#include "bsg_manycore.h"
-#include "bsg_set_tile_x_y.h"
-#include "bsg_tile_group_barrier.hpp"
-#include "bsg_shared_mem.hpp"
-
-using namespace bsg_manycore;
-
-bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
-
-extern "C" int  __attribute__ ((noinline)) kernel_hard_shared() {
-
-
-    TileGroupSharedMem<int, 64, bsg_tiles_X, bsg_tiles_Y, 8> A;
-
-//    if (__bsg_id == 0) {
-//        bsg_print_hexadecimal(A._local_addr);
-//    }
-//
-    if (__bsg_id == 0) {
-        A[0] = 0x32;
-    }
-
-//    bsg_print_hexadecimal(A._local_addr);
-//    bsg_print_hexadecimal(reinterpret_cast<int> (A._addr));
-//    bsg_print_hexadecimal(reinterpret_cast<int> (A[1]));
-//    bsg_print_hexadecimal(reinterpret_cast<int> (A[2]));
-//    bsg_print_hexadecimal(reinterpret_cast<int> (A[3]));
-//    bsg_print_hexadecimal(reinterpret_cast<int> (A[4]));
-
-
-    barrier.sync();
-    return 0;
-}
diff --git a/software/spmd/bsg_cuda_lite_runtime/hard_shared/Makefile b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/Makefile
similarity index 84%
rename from software/spmd/bsg_cuda_lite_runtime/hard_shared/Makefile
rename to software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/Makefile
index 6eaa27ba2..7177d3175 100644
--- a/software/spmd/bsg_cuda_lite_runtime/hard_shared/Makefile
+++ b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/Makefile
@@ -11,16 +11,16 @@
 	bsg_tiles_org_Y ?= 1
 
 # If not configured, Will use default Values
-	bsg_tiles_X ?= 2
-	bsg_tiles_Y ?= 2
+	bsg_tiles_X ?= 4
+	bsg_tiles_Y ?= 4
 
 
 all: main.run
 
 
-KERNEL_NAME ?=kernel_hard_shared
+KERNEL_NAME ?=kernel_hardware_shared_mem_reduce
 
-OBJECT_FILES=main.o kernel_hard_shared.o
+OBJECT_FILES=main.o kernel_hardware_shared_mem_reduce.o
 
 include ../../Makefile.include
 
diff --git a/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.cpp b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.cpp
new file mode 100644
index 000000000..7dfdbb694
--- /dev/null
+++ b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.cpp
@@ -0,0 +1,76 @@
+// * This kernel performs sum reduction on hardware tile group
+//   shared memory. It uses the built-in reduce() method of the
+//   hardware tile group shared memory library.
+// * Tile group dimensions are fixed at 4x4.
+
+// TEMPLATE_TG_DIM_X/Y must be defined before bsg_manycore.h is
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+
+
+#define TEMPLATE_TG_DIM_X 4
+#define TEMPLATE_TG_DIM_Y 4
+#define TEMPLATE_BLOCK_SIZE    1024
+#define TEMPLATE_STRIPE_SIZE   1
+#define bsg_tiles_X TEMPLATE_TG_DIM_X
+#define bsg_tiles_Y TEMPLATE_TG_DIM_Y
+
+#include <bsg_manycore.h>
+#include "kernel_hardware_shared_mem_reduce.hpp"
+#include <bsg_tile_group_barrier.hpp>
+#include "bsg_shared_mem.hpp"
+
+using namespace bsg_manycore;
+
+
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+
+
+template <int TG_DIM_X,
+          int TG_DIM_Y,
+          int BLOCK_SIZE,
+          int STRIPE_SIZE,
+          typename TA>
+    int __attribute__ ((noinline))
+    hardware_shared_mem_reduce(TA *A, TA *sum) {
+        // Stage A in tile-group shared memory, sum-reduce it there and
+        // write the total to *sum. Always returns 0.
+        const int num_tiles = TG_DIM_X * TG_DIM_Y;
+        TileGroupSharedMem<TA, BLOCK_SIZE, TG_DIM_X, TG_DIM_Y, STRIPE_SIZE> A_sh;
+
+        // Tile __bsg_id copies elements __bsg_id, __bsg_id + num_tiles, ...
+        for (int idx = __bsg_id; idx < BLOCK_SIZE; idx += num_tiles) {
+            A_sh[idx] = A[idx];
+        }
+        barrier.sync();
+
+        // reduce() leaves the sum of all elements in element 0
+        A_sh.reduce(barrier);
+        *sum = A_sh[0];
+        barrier.sync();
+
+        return 0;
+    }
+
+
+extern "C" {
+    // Kernel entry point: sum-reduces A through tile-group shared memory
+    // and writes the result to *sum. WIDTH and block_size are not read;
+    // the element count is fixed at compile time by TEMPLATE_BLOCK_SIZE.
+    int  __attribute__ ((noinline)) kernel_hardware_shared_mem_reduce(float *A,
+                                                                      float *sum,
+                                                                      uint32_t WIDTH,
+                                                                      uint32_t block_size) {
+        bsg_cuda_print_stat_kernel_start();
+
+        const int status = hardware_shared_mem_reduce<TEMPLATE_TG_DIM_X,
+                                                      TEMPLATE_TG_DIM_Y,
+                                                      TEMPLATE_BLOCK_SIZE,
+                                                      TEMPLATE_STRIPE_SIZE>(A, sum);
+
+        // Sync on the tile-group barrier before closing the stats region
+        barrier.sync();
+        bsg_cuda_print_stat_kernel_end();
+        return status;
+    }
+}
diff --git a/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.hpp b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.hpp
new file mode 100644
index 000000000..eb12adb12
--- /dev/null
+++ b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.hpp
@@ -0,0 +1,5 @@
+#ifndef KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP
+#define KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP
+#include <cstdint>
+
+#endif // KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP
diff --git a/software/spmd/bsg_cuda_lite_runtime/hard_shared/main.c b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/main.c
similarity index 100%
rename from software/spmd/bsg_cuda_lite_runtime/hard_shared/main.c
rename to software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/main.c
diff --git a/v/vanilla_bean/hash_function_shared.v b/v/vanilla_bean/hash_function_shared.v
index a5351c43b..c94ed64e0 100644
--- a/v/vanilla_bean/hash_function_shared.v
+++ b/v/vanilla_bean/hash_function_shared.v
@@ -34,7 +34,6 @@ module hash_function_shared
 
   always_comb begin
     // Hash bits cannot be larger than the entire address bits
-    // TODO: add an assert
     if (~en_i | (hash_i > max_local_offset_width_gp)) begin
       x_o = '0;
       y_o = '0;
@@ -42,10 +41,12 @@ module hash_function_shared
     end
    
     else begin
+      // X coordinate
       for (integer i = 0; i < tg_dim_x_width_i; i = i + 1) begin
         x_o[i] = shared_eva_i[i+hash_i];
       end
 
+      // Y coordinate
       for (integer i = 0; i < tg_dim_y_width_i; i = i + 1) begin
         y_o[i] = shared_eva_i[i+tg_dim_x_width_i+hash_i];
       end