Added a reduce method to hardware tile group shared memory, and added…

… a hardware tile group shared memory cuda lite regression test
bespoke-silicon-group · Aug 24, 2020 · 41221a7 · 41221a7
1 parent d5a0ac4
commit 41221a7
Show file tree

Hide file tree

Showing 7 changed files with 132 additions and 40 deletions.
diff --git a/software/bsg_manycore_lib/bsg_shared_mem.hpp b/software/bsg_manycore_lib/bsg_shared_mem.hpp
@@ -1,6 +1,8 @@
 #pragma once
+
 extern "C" {
 #include "bsg_manycore.h"
 }
+#include "bsg_tile_group_barrier.hpp"  // C++ header (bsg_barrier template): must stay outside extern "C"
 #include <cstdlib>
 #include <cmath>
@@ -96,6 +98,49 @@ namespace bsg_manycore {
         }
 
 
+        // Reduce (sum) all elements in tile group shared memory and store
+        // the result in the first element. The reduction is performed in
+        // place as a binary tree, starting from an offset of 1 and a
+        // multiple of 2:
+        // For every index i that is a multiple of 2: A[i] <-- A[i] + A[i+1]
+        // For every index i that is a multiple of 4: A[i] <-- A[i] + A[i+2]
+        // For every index i that is a multiple of 8: A[i] <-- A[i] + A[i+4]
+        // .... Continue until offset is no longer smaller than SIZE ....
+        // Example
+        // |1|1|1|1|1|1|1|1|   Offset: 1  - Mult: 2
+        //  |/  |/  |/  |/
+        // |2|1|2|1|2|1|2|1|   Offset: 2  - Mult: 4
+        //  |  /   |  /
+        //  | /    | /
+        //  |/     |/
+        // |4|1|2|1|4|1|2|1|   Offset: 4  - Mult: 8
+        //  |       /
+        //  |      /
+        //  |     /
+        //  |    /
+        //  |   /
+        //  |  /
+        //  | /
+        //  |/
+        // |8|1|2|1|4|1|2|1|
+        //
+        // Parameters:
+        //   barrier - tile group barrier; sync() is called on it once per
+        //             tree level, so presumably every tile of the group
+        //             has to call reduce() together (confirm with callers).
+        //
+        // On return element 0 holds the sum. The other elements are left
+        // holding partial sums (see the example), not their original values.
+        // A partner index at or beyond SIZE is skipped, so SIZE does not
+        // have to be a power of two.
+        void reduce(bsg_barrier<TG_DIM_X, TG_DIM_Y> &barrier) {
+            int offset = 1;
+            int mult = 2;
+
+            while (offset < SIZE) {
+                for (int iter_x = bsg_id; iter_x < SIZE; iter_x += TILES) {
+                    // Only indices that are a multiple of mult accumulate,
+                    // and only when the partner element exists: without
+                    // the bound check a SIZE that is not a power of two
+                    // would read past the end of the array.
+                    if (!(iter_x % mult) && (iter_x + offset < SIZE)) {
+                        (*this)[iter_x] += (*this)[iter_x + offset];
+                    }
+                }
+
+                // Synchronize after every level so the partial sums of this
+                // level are complete before the next level consumes them.
+                barrier.sync();
+
+                mult <<= 1;
+                offset <<= 1;
+            }
+        }
 
 
     private:

diff --git a/software/spmd/bsg_cuda_lite_runtime/hard_shared/kernel_hard_shared.cpp b/software/spmd/bsg_cuda_lite_runtime/hard_shared/kernel_hard_shared.cpp
diff --git a/...sg_cuda_lite_runtime/hard_shared/Makefile → ...ntime/hardware_shared_mem_reduce/Makefile b/...sg_cuda_lite_runtime/hard_shared/Makefile → ...ntime/hardware_shared_mem_reduce/Makefile
@@ -11,16 +11,16 @@
 	bsg_tiles_org_Y ?= 1
 
 # If not configured, Will use default Values
-	bsg_tiles_X ?= 2
-	bsg_tiles_Y ?= 2
+	bsg_tiles_X ?= 4
+	bsg_tiles_Y ?= 4
 
 
 all: main.run
 
 
-KERNEL_NAME ?=kernel_hard_shared
+KERNEL_NAME ?=kernel_hardware_shared_mem_reduce
 
-OBJECT_FILES=main.o kernel_hard_shared.o
+OBJECT_FILES=main.o kernel_hardware_shared_mem_reduce.o
 
 include ../../Makefile.include
 

diff --git a/...md/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.cpp b/...md/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.cpp
@@ -0,0 +1,76 @@
+// * This kernel performs sum reduction on hardware tile group
+//   shared memory. It uses the built-in reduce() method of the
+//   hardware tile group shared memory library.
+// * Tile group dimensions are fixed at 4x4.
+
+// TEMPLATE_TG_DIM_X/Y must be defined before bsg_manycore.h is
+// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
+// legacy reasons, but they are deprecated.
+
+
+#define TEMPLATE_TG_DIM_X 4
+#define TEMPLATE_TG_DIM_Y 4
+#define TEMPLATE_BLOCK_SIZE    1024
+#define TEMPLATE_STRIPE_SIZE   1
+#define bsg_tiles_X TEMPLATE_TG_DIM_X
+#define bsg_tiles_Y TEMPLATE_TG_DIM_Y
+
+#include <bsg_manycore.h>
+#include "kernel_hardware_shared_mem_reduce.hpp"
+#include <bsg_tile_group_barrier.hpp>
+#include "bsg_shared_mem.hpp"
+
+using namespace bsg_manycore;
+
+
+bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;
+
+
+// Sum-reduce the first BLOCK_SIZE elements of A through tile group shared
+// memory and store the total in *sum.
+//   TG_DIM_X, TG_DIM_Y : tile group dimensions; their product is the stride
+//                        each tile uses when walking the block
+//   BLOCK_SIZE         : number of elements copied from A and reduced
+//   STRIPE_SIZE        : forwarded unchanged to TileGroupSharedMem
+//   TA                 : element type of A and sum
+// Every calling tile stores element 0 of the reduced array to *sum.
+// Always returns 0.
+template <int TG_DIM_X,
+          int TG_DIM_Y,
+          int BLOCK_SIZE,
+          int STRIPE_SIZE,
+          typename TA>
+    int __attribute__ ((noinline))
+    hardware_shared_mem_reduce(TA *A, TA *sum) {
+        typedef TileGroupSharedMem<TA, BLOCK_SIZE, TG_DIM_X, TG_DIM_Y, STRIPE_SIZE> shared_block_t;
+        const int num_tiles = TG_DIM_X * TG_DIM_Y;
+
+        // Tile-group shared copy of the input block
+        shared_block_t shared_block;
+
+        // Each tile loads the elements at __bsg_id, __bsg_id + num_tiles, ...
+        int idx = __bsg_id;
+        while (idx < BLOCK_SIZE) {
+            shared_block[idx] = A[idx];
+            idx += num_tiles;
+        }
+
+        // Synchronize the tile group after the load, before reducing
+        barrier.sync();
+
+        shared_block.reduce(barrier);
+
+        *sum = shared_block[0];
+
+        barrier.sync();
+
+        return 0;
+    }
+
+
+extern "C" {
+    // Kernel entry point: runs hardware_shared_mem_reduce on A with the
+    // compile-time TEMPLATE_* configuration, bracketed by the kernel
+    // start/end stat calls, and returns its result code.
+    //   A          : input array of floats
+    //   sum        : destination for the reduced value
+    //   WIDTH      : not used in this function
+    //   block_size : not used in this function; the reduced length is
+    //                TEMPLATE_BLOCK_SIZE
+    int  __attribute__ ((noinline)) kernel_hardware_shared_mem_reduce(float *A,
+                                                                      float *sum,
+                                                                      uint32_t WIDTH,
+                                                                      uint32_t block_size) {
+        bsg_cuda_print_stat_kernel_start();
+
+        const int status =
+            hardware_shared_mem_reduce <TEMPLATE_TG_DIM_X,
+                                        TEMPLATE_TG_DIM_Y,
+                                        TEMPLATE_BLOCK_SIZE,
+                                        TEMPLATE_STRIPE_SIZE> (A, sum);
+
+        // Synchronize the tile group before the end-of-kernel stat call
+        barrier.sync();
+
+        bsg_cuda_print_stat_kernel_end();
+
+        return status;
+    }
+}
diff --git a/...md/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.hpp b/...md/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.hpp
@@ -0,0 +1,5 @@
+// Include guard. The name has no leading double underscore because such
+// identifiers are reserved for the C++ implementation.
+#ifndef KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP
+#define KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP
+#include <cstdint>  // fixed-width integer types (uint32_t) used in the kernel signature
+
+#endif // KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP
diff --git a/.../bsg_cuda_lite_runtime/hard_shared/main.c → ...runtime/hardware_shared_mem_reduce/main.c b/.../bsg_cuda_lite_runtime/hard_shared/main.c → ...runtime/hardware_shared_mem_reduce/main.c
diff --git a/v/vanilla_bean/hash_function_shared.v b/v/vanilla_bean/hash_function_shared.v
@@ -34,18 +34,19 @@ module hash_function_shared
 
   always_comb begin
     // Hash bits cannot be larger than the entire address bits
-    // TODO: add an assert
     if (~en_i | (hash_i > max_local_offset_width_gp)) begin
       x_o = '0;
       y_o = '0;
       addr_o = '0;
     end
 
     else begin
+      // X coordinate
       for (integer i = 0; i < tg_dim_x_width_i; i = i + 1) begin
         x_o[i] = shared_eva_i[i+hash_i];
       end
 
+      // Y coordinate
       for (integer i = 0; i < tg_dim_y_width_i; i = i + 1) begin
         y_o[i] = shared_eva_i[i+tg_dim_x_width_i+hash_i];
       end