diff --git a/software/bsg_manycore_lib/bsg_shared_mem.hpp b/software/bsg_manycore_lib/bsg_shared_mem.hpp index 9854613d0..2b72ce20f 100644 --- a/software/bsg_manycore_lib/bsg_shared_mem.hpp +++ b/software/bsg_manycore_lib/bsg_shared_mem.hpp @@ -1,6 +1,8 @@ #pragma once + extern "C" { #include "bsg_manycore.h" +#include "bsg_tile_group_barrier.hpp" } #include #include @@ -96,6 +98,49 @@ namespace bsg_manycore { } + // Reduce (sum) all elements in tile group shared memory + // and store the result in the first element. We perform reduction in this loop, + // starting from an offset of 1 and a multiple (mult) of 2: + // For every element whose index is a multiple of 2: A[i] <-- A[i] + A[i+1] + // For every element whose index is a multiple of 4: A[i] <-- A[i] + A[i+2] + // For every element whose index is a multiple of 8: A[i] <-- A[i] + A[i+4] + // .... Continue until offset reaches or exceeds the array size .... + // Example + // |1|1|1|1|1|1|1|1| Offset: 1 - Mult: 2 + // |/ |/ |/ |/ + // |2|1|2|1|2|1|2|1| Offset: 2 - Mult: 4 + // | / | / + // | / | / + // |/ |/ + // |4|1|2|1|4|1|2|1| Offset: 4 - Mult: 8 + // | / + // | / + // | / + // | / + // | / + // | / + // | / + // |/ + // |8|1|2|1|4|1|2|1| + void reduce(bsg_barrier &barrier) { + + int offset = 1; + int mult = 2; + + while (offset < SIZE) { + for (int iter_x = bsg_id; iter_x < SIZE; iter_x += TILES) { + if (!(iter_x % mult)){ + (*this)[iter_x] += (*this)[iter_x + offset]; + } + } + + barrier.sync(); + + mult <<= 1; + offset <<= 1; + } + return; + } private: diff --git a/software/spmd/bsg_cuda_lite_runtime/hard_shared/kernel_hard_shared.cpp b/software/spmd/bsg_cuda_lite_runtime/hard_shared/kernel_hard_shared.cpp deleted file mode 100644 index 59ddf2440..000000000 --- a/software/spmd/bsg_cuda_lite_runtime/hard_shared/kernel_hard_shared.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// This kernel performs tests hardware tile group shared memory. 
- -#include "bsg_manycore.h" -#include "bsg_set_tile_x_y.h" -#include "bsg_tile_group_barrier.hpp" -#include "bsg_shared_mem.hpp" - -using namespace bsg_manycore; - -bsg_barrier barrier; - -extern "C" int __attribute__ ((noinline)) kernel_hard_shared() { - - - TileGroupSharedMem A; - -// if (__bsg_id == 0) { -// bsg_print_hexadecimal(A._local_addr); -// } -// - if (__bsg_id == 0) { - A[0] = 0x32; - } - -// bsg_print_hexadecimal(A._local_addr); -// bsg_print_hexadecimal(reinterpret_cast (A._addr)); -// bsg_print_hexadecimal(reinterpret_cast (A[1])); -// bsg_print_hexadecimal(reinterpret_cast (A[2])); -// bsg_print_hexadecimal(reinterpret_cast (A[3])); -// bsg_print_hexadecimal(reinterpret_cast (A[4])); - - - barrier.sync(); - return 0; -} diff --git a/software/spmd/bsg_cuda_lite_runtime/hard_shared/Makefile b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/Makefile similarity index 84% rename from software/spmd/bsg_cuda_lite_runtime/hard_shared/Makefile rename to software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/Makefile index 6eaa27ba2..7177d3175 100644 --- a/software/spmd/bsg_cuda_lite_runtime/hard_shared/Makefile +++ b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/Makefile @@ -11,16 +11,16 @@ bsg_tiles_org_Y ?= 1 # If not configured, Will use default Values - bsg_tiles_X ?= 2 - bsg_tiles_Y ?= 2 + bsg_tiles_X ?= 4 + bsg_tiles_Y ?= 4 all: main.run -KERNEL_NAME ?=kernel_hard_shared +KERNEL_NAME ?=kernel_hardware_shared_mem_reduce -OBJECT_FILES=main.o kernel_hard_shared.o +OBJECT_FILES=main.o kernel_hardware_shared_mem_reduce.o include ../../Makefile.include diff --git a/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.cpp b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.cpp new file mode 100644 index 000000000..7dfdbb694 --- /dev/null +++ 
b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.cpp @@ -0,0 +1,76 @@ +// * This kernel performs sum reduction on hardware tile group +// shared memory. It uses the built-in reduce() method of the +// hardware tile group shared memory library. +// * Tile group dimensions are fixed at 4x4. + +// TEMPLATE_TG_DIM_X/Y must be defined before bsg_manycore.h is +// included. bsg_tiles_X and bsg_tiles_Y must also be defined for +// legacy reasons, but they are deprecated. + + +#define TEMPLATE_TG_DIM_X 4 +#define TEMPLATE_TG_DIM_Y 4 +#define TEMPLATE_BLOCK_SIZE 1024 +#define TEMPLATE_STRIPE_SIZE 1 +#define bsg_tiles_X TEMPLATE_TG_DIM_X +#define bsg_tiles_Y TEMPLATE_TG_DIM_Y + +#include +#include "kernel_hardware_shared_mem_reduce.hpp" +#include +#include "bsg_shared_mem.hpp" + +using namespace bsg_manycore; + + +bsg_barrier barrier; + + +template + int __attribute__ ((noinline)) + hardware_shared_mem_reduce(TA *A, TA *sum) { + + // Declare tile-group shared memory + TileGroupSharedMem A_sh; + + for (int iter_x = __bsg_id; iter_x < BLOCK_SIZE; iter_x += TG_DIM_X * TG_DIM_Y) { + A_sh[iter_x] = A[iter_x]; + } + + barrier.sync(); + + A_sh.reduce(barrier); + + *sum = A_sh[0]; + + barrier.sync(); + + return 0; + } + + +extern "C" { + int __attribute__ ((noinline)) kernel_hardware_shared_mem_reduce(float *A, + float *sum, + uint32_t WIDTH, + uint32_t block_size) { + int rc; + bsg_cuda_print_stat_kernel_start(); + + rc = hardware_shared_mem_reduce (A, + sum); + + barrier.sync(); + + bsg_cuda_print_stat_kernel_end(); + + return rc; + } +} diff --git a/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.hpp b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.hpp new file mode 100644 index 000000000..eb12adb12 --- /dev/null +++ b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/kernel_hardware_shared_mem_reduce.hpp @@ -0,0 +1,5 @@ 
+#ifndef __KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP +#define __KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP +#include + +#endif //__KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP diff --git a/software/spmd/bsg_cuda_lite_runtime/hard_shared/main.c b/software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/main.c similarity index 100% rename from software/spmd/bsg_cuda_lite_runtime/hard_shared/main.c rename to software/spmd/bsg_cuda_lite_runtime/hardware_shared_mem_reduce/main.c diff --git a/v/vanilla_bean/hash_function_shared.v b/v/vanilla_bean/hash_function_shared.v index a5351c43b..c94ed64e0 100644 --- a/v/vanilla_bean/hash_function_shared.v +++ b/v/vanilla_bean/hash_function_shared.v @@ -34,7 +34,6 @@ module hash_function_shared always_comb begin // Hash bits cannot be larger than the entire address bits - // TODO: add an assert if (~en_i | (hash_i > max_local_offset_width_gp)) begin x_o = '0; y_o = '0; @@ -42,10 +41,12 @@ module hash_function_shared end else begin + // X coordinate for (integer i = 0; i < tg_dim_x_width_i; i = i + 1) begin x_o[i] = shared_eva_i[i+hash_i]; end + // Y coordinate for (integer i = 0; i < tg_dim_y_width_i; i = i + 1) begin y_o[i] = shared_eva_i[i+tg_dim_x_width_i+hash_i]; end