Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add latency estimations to simulator #21

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions src/cgra.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from ctypes import c_int32
import csv
import os.path

from characterization import load_operation_characterization, display_characterization, get_latency_cc
from kernels import *

# CGRA from left to right, top to bottom
Expand All @@ -20,7 +20,8 @@
srcs = ['ZERO', 'SELF', 'RCL', 'RCR', 'RCT', 'RCB', 'R0', 'R1', 'R2', 'R3', 'IMM']
dsts = ['SELF', 'RCL', 'RCR', 'RCT', 'RCB','R0', 'R1', 'R2', 'R3']
regs = dsts[-4:]

operation_latency_mapping = {}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we move the latency-related operations to a separate module latency.py?
It is unclear to me when this code will execute (probably when the module is loaded). I would rather have a latency_load_characterization( filename ) function, so that we could have more than one possible characterization (for example, when you want to test different scenarios).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, I added a function that can eventually be reused for other characterizations

operation_latency_mapping = load_operation_characterization(operation_latency_mapping, "latency_cc")
class INSTR:
def __init__( self,matrix):
self.time = matrix[0][0] # ToDo: Fix how we assign this length
Expand Down Expand Up @@ -71,6 +72,9 @@ def __init__( self, kernel, memory, read_addrs, write_addrs):
self.memory = memory
self.instr2exec = 0
self.cycles = 0
self.total_latency_cc = 0
self.instr_latency_cc = []
self.max_latency_instr = None
JuanSapriza marked this conversation as resolved.
Show resolved Hide resolved
if read_addrs is not None and len(read_addrs) == N_COLS:
self.load_addr = read_addrs
else:
Expand Down Expand Up @@ -110,10 +114,13 @@ def step( self, prs="ROUT" ):
reg = [[ self.cells[r][i].regs[regs[x]] for i in range(N_COLS) ] for x in range(len(regs)) ]
print_out( prs, outs, insts, ops, reg )

get_latency_cc(self)
self.instr2exec += 1
self.cycles += 1
return self.exit



def get_neighbour_address( self, r, c, dir ):
n_r = r
n_c = c
Expand Down Expand Up @@ -181,6 +188,7 @@ def __init__( self, parent, row, col ):
self.regs = {'R0':0, 'R1':0, 'R2':0, 'R3':0 }
self.op = ""
self.instr = ""
self.latency_cc = 0

def get_out( self ):
return self.old_out
Expand Down Expand Up @@ -222,7 +230,7 @@ def run_instr( self, instr):
self.op = instr[0]
except:
self.op = instr

self.latency_cc = int(operation_latency_mapping[self.op])
if self.op in self.ops_arith:
des = instr[1]
val1 = self.fetch_val( instr[2] )
Expand Down Expand Up @@ -375,6 +383,8 @@ def blt( self, val1, val2, branch ):
ops_jump = { 'JUMP' : '' }
ops_exit = { 'EXIT' : '' }



def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs=None):
ker = []
mem = []
Expand Down Expand Up @@ -407,5 +417,5 @@ def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs=
sorted_mem = sorted(mem, key=lambda x: x[0])
with open( kernel + "/"+FILENAME_MEM_O+version+EXT, 'w+') as f:
for row in sorted_mem: csv.writer(f).writerow(row)

display_characterization(cgra)
print("\n\nEND")
66 changes: 66 additions & 0 deletions src/characterization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import copy
import os.path
import csv

OPERATIONS_MEMORY_ACCESS = ["LWD", "LWI", "SWD","SWI"]

def load_operation_characterization(operation_mapping, characterization_type):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why take operation mapping as a parameter?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed ✔️

script_dir = os.path.dirname(os.path.abspath(__file__))
csv_file_path = os.path.join(script_dir, 'operation_characterization.csv')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we leave the characterization file as a parameter?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

✔️


with open(csv_file_path, 'r') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
if not row:
continue
if row[0].startswith('#'):
current_section = row[0].strip('# ')
continue
if current_section == f'operation_{characterization_type}_mapping':
if len(row) == 3:
key_type, value_type = int, float
operation, key, value = row
key = key_type(key)
value = value_type(value)
if operation not in operation_mapping:
operation_mapping[operation] = {}
operation_mapping[operation][key] = value
elif len(row) == 2:
key_type, value_type = int, int
operation, key = row
key = key_type(key)
if operation not in operation_mapping:
operation_mapping[operation] = key
else:
continue
return operation_mapping

def get_latency_cc(self):
from cgra import N_ROWS, N_COLS
self.max_latency_instr = None
mem_latency_cc = 0
for r in range(N_ROWS):
for c in range(N_COLS):
if self.max_latency_instr is None or self.cells[r][c].latency_cc > self.max_latency_instr.latency_cc:
self.max_latency_instr = self.cells[r][c]
if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS:
mem_latency_cc += 1
# A memory access to a memory bank has a 2-cycle overhead,
# plus 1 additional cycle per PE trying to access it.
if mem_latency_cc >= 1:
mem_latency_cc += 1
self.max_latency_instr.latency_cc = max(self.max_latency_instr.latency_cc, mem_latency_cc)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find it a little unfortunate that this logic has to be hardcoded. This is easy now, but when we get to the INTERLEAVED BUS you will not want to change the code to switch from one to the other. What about this other approach:

operation_characterization_N2M.csv

TYPE, OPERATION, LATENCY, OVERHEAD, ADD_SAME_ROW, ADD_SAME_COL, ADD_CGRA, ADD_ADDR_SAME_INTEGER, ADD_ADDR_SAME_MODULO
0, NOP,  1, 0, 0, 0, 0, 0, 0
1, SADD, 1, 0, 0, 0, 0, 0, 0
1, SSUB, 1, 0, 0, 0, 0, 0, 0
2, SMUL, 3, 0, 0, 0, 0, 0, 0
3, LWD,  0, 2, 0, 1, 0, 1, 0

operation_characterization_ONE2M.csv

TYPE, OPERATION, LATENCY, OVERHEAD, ADD_SAME_ROW, ADD_SAME_COL, ADD_CGRA, ADD_ADDR_SAME_INTEGER, ADD_ADDR_SAME_MODULO
0, NOP,  1, 0, 0, 0, 0, 0, 0
1, SADD, 1, 0, 0, 0, 0, 0, 0
1, SSUB, 1, 0, 0, 0, 0, 0, 0
2, SMUL, 3, 0, 0, 0, 0, 0, 0
3, LWD,  0, 2, 0, 1, 1, 0, 0

Here you kind of embed the logic in the CSV and simplify the logic in the code, so if you want to test different architectures you don't need to modify the code. The example above covers the different bus topologies you have already encountered (don't take this as is, as I could be wrong).

The idea is that you encode in the different columns different latency patterns and how they are affected by operations of the same type. For example, ADD_SAME_ROW is the number of cc you add if you encounter another instruction of the same type in the same row. For ADD_ADDR_SAME_INTEGER, you take the target address, perform the integer division by the memory bank size and, if it's the same, add 1 cc of penalty; the same goes for ADD_ADDR_SAME_MODULO with the modulo (%) operation (this will be useful for the interleaved memory banks).

I'm not saying DO IT LIKE THIS ... but something in this spirit, so that you do not have to change the code every time. It's something we will one day also do for the instruction operations, to make the logic independent of the simulator so we can test different architecture changes without modifying the code. BTW, the passion for not modifying the code is because we would like to have e.g. 20 different architectures and test which is the best.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually, given how you implemented the fetch of the values, in the same CSV we could have different implementations:

 #operation_latency_cc_mapping
. . . 

#operation_memory_interleaved_latency_cc_mapping
. . . 

if (self.exit):
if (self.max_latency_instr.latency_cc > 2):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why this tho?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

- The "max" function is used to retain only the longest operation (i.e., between a 3 CC multiplication and a single 2 CC memory operation, keep the SMUL)
- An instruction takes an extra CC if it contains the EXIT operation (= last instruction)

self.max_latency_instr.latency_cc += 1

self.max_latency_instr.instr2exec = self.instr2exec
self.instr_latency_cc.append(copy.copy(self.max_latency_instr))
self.total_latency_cc += self.instr_latency_cc[-1].latency_cc
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's all this?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First line: we need the current CGRA-cycle number as well as that of the instruction being executed (instr2exec)
Second line: self.instr_latency_cc is a list containing the longest operation for each instruction
Third line: With each instruction, we also sum up the total latency (to avoid doing so in the display_characterization function)


def display_characterization(cgra):
print("Longest instructions per cycle:\n")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By "cycle", do you mean "CGRA-cycle" (i.e. the execution of an instruction)?
If an execution has 1256655 CGRA-cycles, are we going to print them all?
Why not make use of the system that is already implemented where the user can choose what to print? For example, I could call

run(kernel_name, version=version, pr=["ROUT","OPS",  "OP_MAX_LAT", "TOTAL_LAT"  ], load_addrs=load_addrs, store_addrs=store_addrs)

instead of forcing the user to see everything all the time.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, I'm going to do this

print("{:<8} {:<25} {:<10}".format("Cycle", "Instruction", "Latency (CC)"))
for index, item in enumerate(cgra.instr_latency_cc):
print("{:<2} {:<6} {:<25} {:<10}".format(index + 1, f'({item.instr2exec})', item.instr, item.latency_cc))
print("\nTotal latency for all instructions:", cgra.total_latency_cc, "CC")
27 changes: 27 additions & 0 deletions src/operation_characterization.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# operation_latency_cc_mapping
NOP,1
EXIT,2
SADD,1
SSUB,1
SLT,1
SRT,1
SRA,1
LAND,1
LOR,1
LXOR,1
LNAND,1
LNOR,1
LXNOR,1
BSFA,1
BZFA,1
BEQ,1
BNE,1
BLT,1
BGE,1
JUMP,1
LWD,2
SWD,2
LWI,2
SWI,2
SMUL,3
FXPMUL,3