Skip to content

Commit

Permalink
Added a reduce method to hardware tile group shared memory, and added…
Browse files Browse the repository at this point in the history
… a hardware tile group shared memory cuda lite regression test
  • Loading branch information
bornaehsani committed Aug 24, 2020
1 parent d5a0ac4 commit 41221a7
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 40 deletions.
45 changes: 45 additions & 0 deletions software/bsg_manycore_lib/bsg_shared_mem.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#pragma once

extern "C" {
#include "bsg_manycore.h"
}
// Included outside the extern "C" block: this header is expected to provide
// the bsg_barrier class template used below, and templates cannot be
// declared with C linkage.
#include "bsg_tile_group_barrier.hpp"
#include <cstdlib>
#include <cmath>
Expand Down Expand Up @@ -96,6 +98,49 @@ namespace bsg_manycore {
}


// Reduce (sum) all elements in tile group shared memory
// and store in first element. We perform reduction in this loop,
// starting from an offset of 1 and a multiple of 2:
// For every element whose index is a multiple of 2: A[i] <-- A[i] + A[i+1]
// For every element whose index is a multiple of 4: A[i] <-- A[i] + A[i+2]
// For every element whose index is a multiple of 8: A[i] <-- A[i] + A[i+4]
// .... Continue until the offset reaches or exceeds the array size ....
// Example
// |1|1|1|1|1|1|1|1| Offset: 1 - Mult: 2
// |/ |/ |/ |/
// |2|1|2|1|2|1|2|1| Offset: 2 - Mult: 4
// | / | /
// | / | /
// |/ |/
// |4|1|2|1|4|1|2|1| Offset: 4 - Mult: 8
// | /
// | /
// | /
// | /
// | /
// | /
// | /
// |/
// |8|1|2|1|4|1|2|1|
// Sum-reduce the elements of this shared memory array into element 0.
//
// Performs a pairwise tree reduction. On each pass, every element whose
// index is a multiple of `mult` adds in the element `offset` slots above
// it; `offset` and `mult` then both double. Passes continue while
// offset < SIZE. Within a pass, this tile visits indices
// bsg_id, bsg_id + TILES, bsg_id + 2 * TILES, ...
//
// After the call only element 0 holds the full sum; other elements are
// left holding intermediate partial sums.
//
// barrier: barrier object whose sync() is called once after every pass.
//          NOTE(review): presumably every tile in the tile group has to
//          call reduce() so that all of them reach each sync() -- confirm
//          against the bsg_barrier documentation.
void reduce(bsg_barrier<TG_DIM_X, TG_DIM_Y> &barrier) {

    int offset = 1;
    int mult = 2;

    while (offset < SIZE) {
        for (int idx = bsg_id; idx < SIZE; idx += TILES) {
            const int partner = idx + offset;
            // The partner index falls outside the array when SIZE is not a
            // power of two; such elements have nothing to absorb this pass.
            if (!(idx % mult) && partner < SIZE) {
                (*this)[idx] += (*this)[partner];
            }
        }

        // The next pass reads partial sums written during this one.
        barrier.sync();

        mult <<= 1;
        offset <<= 1;
    }
}


private:
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@
bsg_tiles_org_Y ?= 1

# If not configured, Will use default Values
bsg_tiles_X ?= 2
bsg_tiles_Y ?= 2
bsg_tiles_X ?= 4
bsg_tiles_Y ?= 4


all: main.run


KERNEL_NAME ?=kernel_hard_shared
KERNEL_NAME ?=kernel_hardware_shared_mem_reduce

OBJECT_FILES=main.o kernel_hard_shared.o
OBJECT_FILES=main.o kernel_hardware_shared_mem_reduce.o

include ../../Makefile.include

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// * This kernel performs sum reduction on hardware tile group
// shared memory. It uses the built-in reduce() method of the
// hardware tile group shared memory library.
// * Tile group dimensions are fixed at 4x4.

// TEMPLATE_TG_DIM_X/Y must be defined before bsg_manycore.h is
// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
// legacy reasons, but they are deprecated.


#define TEMPLATE_TG_DIM_X 4
#define TEMPLATE_TG_DIM_Y 4
#define TEMPLATE_BLOCK_SIZE 1024
#define TEMPLATE_STRIPE_SIZE 1
#define bsg_tiles_X TEMPLATE_TG_DIM_X
#define bsg_tiles_Y TEMPLATE_TG_DIM_Y

#include <bsg_manycore.h>
#include "kernel_hardware_shared_mem_reduce.hpp"
#include <bsg_tile_group_barrier.hpp>
#include "bsg_shared_mem.hpp"

using namespace bsg_manycore;


// Barrier object used by the kernel code in this file. Its template
// arguments are the tile group X and Y dimensions defined above.
bsg_barrier<bsg_tiles_X, bsg_tiles_Y> barrier;


// Copies BLOCK_SIZE elements of A into tile group shared memory, runs the
// shared memory's reduce() on them, and stores element 0 of the result
// through `sum`. Always returns 0.
//
// A:   input array; elements [0, BLOCK_SIZE) are read.
// sum: destination for the reduced value.
template <int TG_DIM_X,
          int TG_DIM_Y,
          int BLOCK_SIZE,
          int STRIPE_SIZE,
          typename TA>
int __attribute__ ((noinline))
hardware_shared_mem_reduce(TA *A, TA *sum) {

    // Tile group shared buffer holding the block to be reduced
    TileGroupSharedMem<TA, BLOCK_SIZE, TG_DIM_X, TG_DIM_Y, STRIPE_SIZE> A_sh;

    // This tile loads indices __bsg_id, __bsg_id + stride, ...
    const int stride = TG_DIM_X * TG_DIM_Y;
    int idx = __bsg_id;
    while (idx < BLOCK_SIZE) {
        A_sh[idx] = A[idx];
        idx += stride;
    }

    // Sync once the load loop is done, before starting the reduction
    barrier.sync();

    A_sh.reduce(barrier);

    // Element 0 carries the reduction result
    sum[0] = A_sh[0];

    barrier.sync();

    return 0;
}


extern "C" {
    // Kernel entry point (C linkage). Runs the templated shared memory
    // reduction with the TEMPLATE_* configuration defined at the top of
    // this file, bracketed by the kernel start/end stat print calls.
    //
    // A:          input array handed to the reduction.
    // sum:        destination for the reduced value.
    // WIDTH:      not read by this function.
    // block_size: not read by this function; the block size used is the
    //             compile-time TEMPLATE_BLOCK_SIZE.
    //
    // Returns the value returned by hardware_shared_mem_reduce().
    int __attribute__ ((noinline)) kernel_hardware_shared_mem_reduce(float *A,
                                                                      float *sum,
                                                                      uint32_t WIDTH,
                                                                      uint32_t block_size) {
        bsg_cuda_print_stat_kernel_start();

        const int status =
            hardware_shared_mem_reduce<TEMPLATE_TG_DIM_X,
                                       TEMPLATE_TG_DIM_Y,
                                       TEMPLATE_BLOCK_SIZE,
                                       TEMPLATE_STRIPE_SIZE>(A, sum);

        // Sync before the kernel end stat is printed
        barrier.sync();

        bsg_cuda_print_stat_kernel_end();

        return status;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#ifndef __KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP
#define __KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP
#include <cstdint>

#endif //__KERNEL_HARDWARE_SHARED_MEM_REDUCE_HPP
3 changes: 2 additions & 1 deletion v/vanilla_bean/hash_function_shared.v
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,19 @@ module hash_function_shared

always_comb begin
// Hash bits cannot be larger than the entire address bits
// TODO: add an assert
if (~en_i | (hash_i > max_local_offset_width_gp)) begin
x_o = '0;
y_o = '0;
addr_o = '0;
end

else begin
// X coordinate
for (integer i = 0; i < tg_dim_x_width_i; i = i + 1) begin
x_o[i] = shared_eva_i[i+hash_i];
end

// Y coordinate
for (integer i = 0; i < tg_dim_y_width_i; i = i + 1) begin
y_o[i] = shared_eva_i[i+tg_dim_x_width_i+hash_i];
end
Expand Down

0 comments on commit 41221a7

Please sign in to comment.