From fa4016df3f5400a657855eed48658ce81e241e90 Mon Sep 17 00:00:00 2001 From: maaspa <90349318+maaspa@users.noreply.github.com> Date: Wed, 10 Jul 2024 11:52:30 +0200 Subject: [PATCH 1/9] estimator can simulate latency --- src/cgra.py | 49 +++++++++++++++++++++++++++++- src/operation_characterization.csv | 27 ++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 src/operation_characterization.csv diff --git a/src/cgra.py b/src/cgra.py index 94bd85b..6371191 100644 --- a/src/cgra.py +++ b/src/cgra.py @@ -1,3 +1,4 @@ +import copy import numpy as np from ctypes import c_int32 import csv @@ -21,6 +22,19 @@ dsts = ['SELF', 'RCL', 'RCR', 'RCT', 'RCB','R0', 'R1', 'R2', 'R3'] regs = dsts[-4:] +operation_latency_mapping = {} +script_dir = os.path.dirname(os.path.abspath(__file__)) +csv_file_path = os.path.join(script_dir, 'operation_characterization.csv') + +with open(csv_file_path, 'r') as csvfile: + reader = csv.reader(csvfile) + next(reader) + for row in reader: + operation, key = row + key = int(key) + if operation not in operation_latency_mapping: + operation_latency_mapping[operation] = key + class INSTR: def __init__( self,matrix): self.time = matrix[0][0] # ToDo: Fix how we assign this length @@ -71,6 +85,8 @@ def __init__( self, kernel, memory, read_addrs, write_addrs): self.memory = memory self.instr2exec = 0 self.cycles = 0 + self.instr_time = [] + self.max_instr = None if read_addrs is not None and len(read_addrs) == N_COLS: self.load_addr = read_addrs else: @@ -97,6 +113,8 @@ def step( self, prs="ROUT" ): for c in range(N_COLS): self.cells[r][c].update() instr2exec = self.instr2exec + self.max_instr = None + self.lw_count = [0] * N_COLS if PRINT_OUTS: print("Instr = ", self.cycles, "(",instr2exec,")") for r in range(N_ROWS): for c in range(N_COLS): @@ -104,16 +122,33 @@ def step( self, prs="ROUT" ): b ,e = self.cells[r][c].exec( op ) if b != 0: self.instr2exec = b - 1 #To avoid more logic afterwards if e != 0: self.exit = True + if 
self.max_instr is None or self.cells[r][c].time > self.max_instr.time: + self.max_instr = self.cells[r][c] + if self.cells[r][c].op in ["LWD", "LWI", "SWD","SWI"]: + self.lw_count[c] += 1 outs = [ self.cells[r][i].out for i in range(N_COLS) ] insts = [ self.cells[r][i].instr for i in range(N_COLS) ] ops = [ self.cells[r][i].op for i in range(N_COLS) ] reg = [[ self.cells[r][i].regs[regs[x]] for i in range(N_COLS) ] for x in range(len(regs)) ] print_out( prs, outs, insts, ops, reg ) + self.compute_mem_latency() + self.max_instr.instr2exec = instr2exec + self.instr_time.append(copy.copy(self.max_instr)) self.instr2exec += 1 self.cycles += 1 return self.exit + # A memory access to a memory bank has a 2-cycle overhead, + # plus 1 additional cycle per PE trying to access it. + def compute_mem_latency(self): + lw_time = 0 + if any(count >= 1 for count in self.lw_count): + lw_time = 1 + for k in range (N_COLS): + lw_time += (self.lw_count[k]) + self.max_instr.time = max(self.max_instr.time, lw_time) + def get_neighbour_address( self, r, c, dir ): n_r = r n_c = c @@ -181,6 +216,7 @@ def __init__( self, parent, row, col ): self.regs = {'R0':0, 'R1':0, 'R2':0, 'R3':0 } self.op = "" self.instr = "" + self.time = 0 def get_out( self ): return self.old_out @@ -375,6 +411,17 @@ def blt( self, val1, val2, branch ): ops_jump = { 'JUMP' : '' } ops_exit = { 'EXIT' : '' } +def display_characterization(cgra): + total_time = 0 + print("Longest instructions per cycle:\n") + for index, item in enumerate(cgra.instr_time): + print("Cycle:", index + 1, "( ", item.instr2exec, " )") + print("Instruction:", item.instr) + print("Time:", item.time, "CC\n") + total_time += item.time + print("\nTotal time for all instructions:", total_time, "CC") + + def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs=None): ker = [] mem = [] @@ -407,5 +454,5 @@ def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs= sorted_mem = sorted(mem, key=lambda x: x[0]) 
with open( kernel + "/"+FILENAME_MEM_O+version+EXT, 'w+') as f: for row in sorted_mem: csv.writer(f).writerow(row) - + display_characterization(cgra) print("\n\nEND") diff --git a/src/operation_characterization.csv b/src/operation_characterization.csv new file mode 100644 index 0000000..05ea6d1 --- /dev/null +++ b/src/operation_characterization.csv @@ -0,0 +1,27 @@ +# operation_latency_mapping +NOP,1 +EXIT,2 +SADD,1 +SSUB,1 +SLT,1 +SRT,1 +SRA,1 +LAND,1 +LOR,1 +LXOR,1 +LNAND,1 +LNOR,1 +LXNOR,1 +BSFA,1 +BZFA,1 +BEQ,1 +BNE,1 +BLT,1 +BGE,1 +JUMP,1 +LWD,2 +SWD,2 +LWI,2 +SWI,2 +SMUL,3 +FXPMUL,3 \ No newline at end of file From 8f403b8178ec07010a3047bc1e9493f5d2ee352f Mon Sep 17 00:00:00 2001 From: maaspa <90349318+maaspa@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:14:17 +0200 Subject: [PATCH 2/9] added characterization.py --- src/cgra.py | 55 +++---------------- src/characterization.py | 88 ++++++++++++++++++++++++++++++ src/operation_characterization.csv | 2 +- 3 files changed, 98 insertions(+), 47 deletions(-) create mode 100644 src/characterization.py diff --git a/src/cgra.py b/src/cgra.py index 6371191..9c83609 100644 --- a/src/cgra.py +++ b/src/cgra.py @@ -1,9 +1,8 @@ -import copy import numpy as np from ctypes import c_int32 import csv import os.path - +from characterization import load_operation_characterization, display_characterization, get_latency_cc from kernels import * # CGRA from left to right, top to bottom @@ -21,20 +20,8 @@ srcs = ['ZERO', 'SELF', 'RCL', 'RCR', 'RCT', 'RCB', 'R0', 'R1', 'R2', 'R3', 'IMM'] dsts = ['SELF', 'RCL', 'RCR', 'RCT', 'RCB','R0', 'R1', 'R2', 'R3'] regs = dsts[-4:] - operation_latency_mapping = {} -script_dir = os.path.dirname(os.path.abspath(__file__)) -csv_file_path = os.path.join(script_dir, 'operation_characterization.csv') - -with open(csv_file_path, 'r') as csvfile: - reader = csv.reader(csvfile) - next(reader) - for row in reader: - operation, key = row - key = int(key) - if operation not in 
operation_latency_mapping: - operation_latency_mapping[operation] = key - +operation_latency_mapping = load_operation_characterization(operation_latency_mapping, "latency_cc") class INSTR: def __init__( self,matrix): self.time = matrix[0][0] # ToDo: Fix how we assign this length @@ -85,8 +72,9 @@ def __init__( self, kernel, memory, read_addrs, write_addrs): self.memory = memory self.instr2exec = 0 self.cycles = 0 - self.instr_time = [] - self.max_instr = None + self.total_latency_cc = 0 + self.instr_latency_cc = [] + self.max_latency_instr = None if read_addrs is not None and len(read_addrs) == N_COLS: self.load_addr = read_addrs else: @@ -113,8 +101,6 @@ def step( self, prs="ROUT" ): for c in range(N_COLS): self.cells[r][c].update() instr2exec = self.instr2exec - self.max_instr = None - self.lw_count = [0] * N_COLS if PRINT_OUTS: print("Instr = ", self.cycles, "(",instr2exec,")") for r in range(N_ROWS): for c in range(N_COLS): @@ -122,32 +108,18 @@ def step( self, prs="ROUT" ): b ,e = self.cells[r][c].exec( op ) if b != 0: self.instr2exec = b - 1 #To avoid more logic afterwards if e != 0: self.exit = True - if self.max_instr is None or self.cells[r][c].time > self.max_instr.time: - self.max_instr = self.cells[r][c] - if self.cells[r][c].op in ["LWD", "LWI", "SWD","SWI"]: - self.lw_count[c] += 1 outs = [ self.cells[r][i].out for i in range(N_COLS) ] insts = [ self.cells[r][i].instr for i in range(N_COLS) ] ops = [ self.cells[r][i].op for i in range(N_COLS) ] reg = [[ self.cells[r][i].regs[regs[x]] for i in range(N_COLS) ] for x in range(len(regs)) ] print_out( prs, outs, insts, ops, reg ) - self.compute_mem_latency() - self.max_instr.instr2exec = instr2exec - self.instr_time.append(copy.copy(self.max_instr)) + get_latency_cc(self) self.instr2exec += 1 self.cycles += 1 return self.exit - # A memory access to a memory bank has a 2-cycle overhead, - # plus 1 additional cycle per PE trying to access it. 
- def compute_mem_latency(self): - lw_time = 0 - if any(count >= 1 for count in self.lw_count): - lw_time = 1 - for k in range (N_COLS): - lw_time += (self.lw_count[k]) - self.max_instr.time = max(self.max_instr.time, lw_time) + def get_neighbour_address( self, r, c, dir ): n_r = r @@ -216,7 +188,7 @@ def __init__( self, parent, row, col ): self.regs = {'R0':0, 'R1':0, 'R2':0, 'R3':0 } self.op = "" self.instr = "" - self.time = 0 + self.latency_cc = 0 def get_out( self ): return self.old_out @@ -258,7 +230,7 @@ def run_instr( self, instr): self.op = instr[0] except: self.op = instr - + self.latency_cc = int(operation_latency_mapping[self.op]) if self.op in self.ops_arith: des = instr[1] val1 = self.fetch_val( instr[2] ) @@ -411,15 +383,6 @@ def blt( self, val1, val2, branch ): ops_jump = { 'JUMP' : '' } ops_exit = { 'EXIT' : '' } -def display_characterization(cgra): - total_time = 0 - print("Longest instructions per cycle:\n") - for index, item in enumerate(cgra.instr_time): - print("Cycle:", index + 1, "( ", item.instr2exec, " )") - print("Instruction:", item.instr) - print("Time:", item.time, "CC\n") - total_time += item.time - print("\nTotal time for all instructions:", total_time, "CC") def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs=None): diff --git a/src/characterization.py b/src/characterization.py new file mode 100644 index 0000000..87f96e0 --- /dev/null +++ b/src/characterization.py @@ -0,0 +1,88 @@ +import copy +import os.path +import csv +# from cgra import N_ROWS, N_COLS + +OPERATIONS_MEMORY_ACCESS = ["LWD", "LWI", "SWD","SWI"] + +def load_operation_characterization(operation_mapping, characterization_type): + script_dir = os.path.dirname(os.path.abspath(__file__)) + csv_file_path = os.path.join(script_dir, 'operation_characterization.csv') + + with open(csv_file_path, 'r') as csvfile: + reader = csv.reader(csvfile) + for row in reader: + if not row: + continue + if row[0].startswith('#'): + current_section = 
row[0].strip('# ') + continue + if current_section == f'operation_{characterization_type}_mapping': + if len(row) == 3: + key_type, value_type = int, float + operation, key, value = row + key = key_type(key) + value = value_type(value) + if operation not in operation_mapping: + operation_mapping[operation] = {} + operation_mapping[operation][key] = value + elif len(row) == 2: + key_type, value_type = int, int + operation, key = row + key = key_type(key) + if operation not in operation_mapping: + operation_mapping[operation] = key + else: + continue + return operation_mapping + +def get_latency_cc(self): + from cgra import N_ROWS, N_COLS + self.max_latency_instr = None + mem_latency_cc = 0 + for r in range(N_ROWS): + for c in range(N_COLS): + if self.max_latency_instr is None or self.cells[r][c].latency_cc > self.max_latency_instr.latency_cc: + self.max_latency_instr = self.cells[r][c] + if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: + mem_latency_cc += 1 + # A memory access to a memory bank has a 2-cycle overhead, + # plus 1 additional cycle per PE trying to access it. 
+ if mem_latency_cc >= 1: + mem_latency_cc += 1 + self.max_latency_instr.latency_cc = max(self.max_latency_instr.latency_cc, mem_latency_cc) + if (self.exit): + if (self.max_latency_instr.latency_cc > 2): + self.max_latency_instr.latency_cc += 1 + + self.max_latency_instr.instr2exec = self.instr2exec + self.instr_latency_cc.append(copy.copy(self.max_latency_instr)) + self.total_latency_cc += self.instr_latency_cc[-1].latency_cc + + +# Example usage: +# Assuming cgra is an object with attributes instr_latency_cc and total_latency_cc + +def display_characterization(cgra): + print("Longest instructions per cycle:\n") + print("{:<8} {:<25} {:<10}".format("Cycle", "Instruction", "Latency (CC)")) + for index, item in enumerate(cgra.instr_latency_cc): + print("{:<2} {:<6} {:<25} {:<10}".format(index + 1, f'({item.instr2exec})', item.instr, item.latency_cc)) + print("\nTotal latency for all instructions:", cgra.total_latency_cc, "CC") + + +# def display_characterization(cgra): +# print("Longest instructions per cycle:\n") +# print("{:<8} {:<20} {:<10}".format("Cycle", "Instruction", "Latency")) +# for index, item in enumerate(cgra.instr_latency_cc): +# print("{:<2} ({:<8}) {:<20} {:<10} CC".format(index + 1, item.instr2exec, item.instr, item.latency_cc)) +# print("\nTotal latency for all instructions:", cgra.total_latency_cc, "CC") + + +# Example usage: +# Assuming cgra is an object with attributes instr_latency_cc and total_latency_cc +# display_characterization(cgra) + +# Example usage: +# Assuming cgra is an object with attributes instr_latency_cc and total_latency_cc +# display_characterization(cgra) diff --git a/src/operation_characterization.csv b/src/operation_characterization.csv index 05ea6d1..0bdb3d8 100644 --- a/src/operation_characterization.csv +++ b/src/operation_characterization.csv @@ -1,4 +1,4 @@ -# operation_latency_mapping +# operation_latency_cc_mapping NOP,1 EXIT,2 SADD,1 From 3141ac586f7f1018ed8c25de9160aa43157302cb Mon Sep 17 00:00:00 2001 From: 
maaspa <90349318+maaspa@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:16:05 +0200 Subject: [PATCH 3/9] removed comments --- src/characterization.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/src/characterization.py b/src/characterization.py index 87f96e0..ba4e34a 100644 --- a/src/characterization.py +++ b/src/characterization.py @@ -1,7 +1,6 @@ import copy import os.path import csv -# from cgra import N_ROWS, N_COLS OPERATIONS_MEMORY_ACCESS = ["LWD", "LWI", "SWD","SWI"] @@ -59,30 +58,9 @@ def get_latency_cc(self): self.instr_latency_cc.append(copy.copy(self.max_latency_instr)) self.total_latency_cc += self.instr_latency_cc[-1].latency_cc - -# Example usage: -# Assuming cgra is an object with attributes instr_latency_cc and total_latency_cc - def display_characterization(cgra): print("Longest instructions per cycle:\n") print("{:<8} {:<25} {:<10}".format("Cycle", "Instruction", "Latency (CC)")) for index, item in enumerate(cgra.instr_latency_cc): print("{:<2} {:<6} {:<25} {:<10}".format(index + 1, f'({item.instr2exec})', item.instr, item.latency_cc)) - print("\nTotal latency for all instructions:", cgra.total_latency_cc, "CC") - - -# def display_characterization(cgra): -# print("Longest instructions per cycle:\n") -# print("{:<8} {:<20} {:<10}".format("Cycle", "Instruction", "Latency")) -# for index, item in enumerate(cgra.instr_latency_cc): -# print("{:<2} ({:<8}) {:<20} {:<10} CC".format(index + 1, item.instr2exec, item.instr, item.latency_cc)) -# print("\nTotal latency for all instructions:", cgra.total_latency_cc, "CC") - - -# Example usage: -# Assuming cgra is an object with attributes instr_latency_cc and total_latency_cc -# display_characterization(cgra) - -# Example usage: -# Assuming cgra is an object with attributes instr_latency_cc and total_latency_cc -# display_characterization(cgra) + print("\nTotal latency for all instructions:", cgra.total_latency_cc, "CC") \ No newline at end of file From 
fe3d72d5db4b1e9b42a5195076e857b1c35858d6 Mon Sep 17 00:00:00 2001 From: maaspa <90349318+maaspa@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:52:35 +0200 Subject: [PATCH 4/9] estimator uses bus type to simulate latency --- src/cgra.py | 19 ++- src/characterization.py | 179 +++++++++++++++++++++++++---- src/operation_characterization.csv | 11 +- 3 files changed, 177 insertions(+), 32 deletions(-) diff --git a/src/cgra.py b/src/cgra.py index 9c83609..e901128 100644 --- a/src/cgra.py +++ b/src/cgra.py @@ -1,8 +1,9 @@ +import copy import numpy as np from ctypes import c_int32 import csv import os.path -from characterization import load_operation_characterization, display_characterization, get_latency_cc +from characterization import display_characterization, get_latency_cc from kernels import * # CGRA from left to right, top to bottom @@ -20,8 +21,7 @@ srcs = ['ZERO', 'SELF', 'RCL', 'RCR', 'RCT', 'RCB', 'R0', 'R1', 'R2', 'R3', 'IMM'] dsts = ['SELF', 'RCL', 'RCR', 'RCT', 'RCB','R0', 'R1', 'R2', 'R3'] regs = dsts[-4:] -operation_latency_mapping = {} -operation_latency_mapping = load_operation_characterization(operation_latency_mapping, "latency_cc") +flag_poll_cnt = 0 class INSTR: def __init__( self,matrix): self.time = matrix[0][0] # ToDo: Fix how we assign this length @@ -48,8 +48,8 @@ def print_out( prs, outs, insts, ops, reg ): elif pr == "R1" : pnt = reg[1] elif pr == "R2" : pnt = reg[2] elif pr == "R3" : pnt = reg[3] - - out_string += "[" + if pnt != []: + out_string += "[" for i in range(len(pnt)): out_string += "{{{}:4}}".format(i) if i == (len(pnt) - 1): @@ -83,6 +83,7 @@ def __init__( self, kernel, memory, read_addrs, write_addrs): self.store_addr = write_addrs else: self.store_addr = [0]*N_COLS + self.init_store = copy.copy(self.store_addr) self.exit = False def run( self, pr, limit ): @@ -114,13 +115,12 @@ def step( self, prs="ROUT" ): reg = [[ self.cells[r][i].regs[regs[x]] for i in range(N_COLS) ] for x in range(len(regs)) ] print_out( prs, outs, 
insts, ops, reg ) - get_latency_cc(self) + get_latency_cc(self, prs) self.instr2exec += 1 self.cycles += 1 - return self.exit + return self.exit - def get_neighbour_address( self, r, c, dir ): n_r = r n_c = c @@ -230,7 +230,6 @@ def run_instr( self, instr): self.op = instr[0] except: self.op = instr - self.latency_cc = int(operation_latency_mapping[self.op]) if self.op in self.ops_arith: des = instr[1] val1 = self.fetch_val( instr[2] ) @@ -417,5 +416,5 @@ def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs= sorted_mem = sorted(mem, key=lambda x: x[0]) with open( kernel + "/"+FILENAME_MEM_O+version+EXT, 'w+') as f: for row in sorted_mem: csv.writer(f).writerow(row) - display_characterization(cgra) + display_characterization(cgra, pr) print("\n\nEND") diff --git a/src/characterization.py b/src/characterization.py index ba4e34a..380bfcb 100644 --- a/src/characterization.py +++ b/src/characterization.py @@ -1,13 +1,15 @@ import copy +import math import os.path import csv OPERATIONS_MEMORY_ACCESS = ["LWD", "LWI", "SWD","SWI"] +BUS_TYPES = ["ONE-TO-M", "N-TO-M", "INTERLEAVED"] -def load_operation_characterization(operation_mapping, characterization_type): +def load_operation_characterization(characterization_type): + operation_mapping = {} script_dir = os.path.dirname(os.path.abspath(__file__)) csv_file_path = os.path.join(script_dir, 'operation_characterization.csv') - with open(csv_file_path, 'r') as csvfile: reader = csv.reader(csvfile) for row in reader: @@ -35,32 +37,167 @@ def load_operation_characterization(operation_mapping, characterization_type): continue return operation_mapping -def get_latency_cc(self): - from cgra import N_ROWS, N_COLS +operation_latency_mapping = load_operation_characterization("latency_cc") +bus_type_active_row_coef = load_operation_characterization("active_row_coef") +bus_type_cpu_loop_instrs = load_operation_characterization("cpu_loop_instrs") + +def get_latency_cc(self, prs): + bus_type = next((item for item 
in BUS_TYPES if item in prs), "ONE-TO-M") self.max_latency_instr = None + mem_latency_cc = find_longest_operation(self) + mem_latency_cc = adjust_latency_for_bus(self, mem_latency_cc, bus_type) + if mem_latency_cc > self.max_latency_instr.latency_cc: + self.max_latency_instr.latency_cc = mem_latency_cc + self.max_latency_instr.instr = f'MEM ({self.max_latency_instr.instr})' + if (self.exit): + self.max_latency_instr.latency_cc += 1 + self.max_latency_instr.instr2exec = self.instr2exec + self.instr_latency_cc.append(copy.copy(self.max_latency_instr)) + self.total_latency_cc += self.instr_latency_cc[-1].latency_cc + +def find_longest_operation(self): + from cgra import N_ROWS, N_COLS + self.mem_count = [0] * N_COLS mem_latency_cc = 0 for r in range(N_ROWS): - for c in range(N_COLS): + for c in range(N_COLS): + self.cells[r][c].latency_cc = int(operation_latency_mapping[self.cells[r][c].op]) if self.max_latency_instr is None or self.cells[r][c].latency_cc > self.max_latency_instr.latency_cc: self.max_latency_instr = self.cells[r][c] if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: mem_latency_cc += 1 - # A memory access to a memory bank has a 2-cycle overhead, - # plus 1 additional cycle per PE trying to access it. 
+ self.mem_count[c] += 1 if mem_latency_cc >= 1: mem_latency_cc += 1 - self.max_latency_instr.latency_cc = max(self.max_latency_instr.latency_cc, mem_latency_cc) - if (self.exit): - if (self.max_latency_instr.latency_cc > 2): - self.max_latency_instr.latency_cc += 1 - - self.max_latency_instr.instr2exec = self.instr2exec - self.instr_latency_cc.append(copy.copy(self.max_latency_instr)) - self.total_latency_cc += self.instr_latency_cc[-1].latency_cc + return mem_latency_cc + +def adjust_latency_for_bus(self, mem_latency_cc, bus_type): + from cgra import N_ROWS, flag_poll_cnt + ACTIVE_ROW_COEF = bus_type_active_row_coef[bus_type] + CPU_LOOP_INSTRS = bus_type_cpu_loop_instrs[bus_type] + for i in range (N_ROWS): + if self.mem_count[i] != 0: + mem_latency_cc += ACTIVE_ROW_COEF + if CPU_LOOP_INSTRS != 0: + flag_poll_cnt += mem_latency_cc + if flag_poll_cnt % (CPU_LOOP_INSTRS - 1) == 0: + mem_latency_cc += 1 + if bus_type == "INTERLEAVED": + concurrent_accesses = group_sequential_accesses(self) + mem_latency_cc = find_longest_sequence(concurrent_accesses) + return mem_latency_cc + +def group_sequential_accesses(self): + from cgra import N_ROWS, N_COLS + self.curr_lwd = [0] * 4 + self.curr_swd = [0] * 4 + covered_accesses = [] + # count the number of direct accesses per column + for r in range(N_ROWS): + for c in range(N_COLS): + if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: + if self.cells[r][c].op == "LWD": + self.curr_lwd[c] += 1 + if self.cells[r][c].op == "SWD": + self.curr_swd[c] += 1 + concurrent_accesses = [{} for _ in range(4)] + # reorder memory accesses to group into concurrent executions + # covered_accesses tracks the accesses that have already been visited + for r in range(N_ROWS): + for c in range(N_COLS): + if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS and (r, c) not in covered_accesses: + index_pos = record_bank_access(self, r, c) + covered_accesses.append((r, c)) + if index_pos not in concurrent_accesses[r]: + concurrent_accesses[r][index_pos] 
= [] + concurrent_accesses[r][index_pos].append((r, c) ) + else: + for k in range(N_ROWS): + if self.cells[k][c].op in OPERATIONS_MEMORY_ACCESS and (k, c) not in covered_accesses: + index_pos = record_bank_access(self, k, c) + covered_accesses.append((k, c)) + if index_pos not in concurrent_accesses[r]: + concurrent_accesses[r][index_pos] = [] + concurrent_accesses[r][index_pos].append((k, c)) + break + if not accesses_are_ordered(concurrent_accesses): + for i in range(N_ROWS - 1, 0, -1): + concurrent_accesses[i-1] = rearrange_accesses(concurrent_accesses[i-1], concurrent_accesses[i]) + return concurrent_accesses + +def accesses_are_ordered(concurrent_accesses): + highest_row = [0] * 4 + from cgra import N_ROWS + for i in range (N_ROWS): + for values in concurrent_accesses[i].values(): + for current_access in values: + if highest_row[i] > current_access[0]: + return False + else: + highest_row[i] = current_access[0] + return True + +def find_longest_sequence(concurrent_accesses): + from cgra import N_ROWS + latency = [1] * 4 + for i in range (N_ROWS): + for values in concurrent_accesses[i].values(): + for current_access in values: + # find the position of an access within the conflict, as well as that of the next dependency + access_pos = find_position(concurrent_accesses[i], current_access[1]) + 1 + if i < N_ROWS - 1: + latency[current_access[1]] += access_pos - find_position(concurrent_accesses[i+1],current_access[1]) + else: + latency[current_access[1]] += access_pos + return max(latency) + +def rearrange_accesses(first_list, second_list): + order_pairs = [] + for second_pairs in second_list.values(): + for second_pair in second_pairs: + order_pairs.append(second_pair[1]) + order_pairs_reversed = list(reversed(order_pairs)) + sorted_first_list = {} + for key, pairs in first_list.items(): + sorted_pairs = sorted(pairs, key=lambda x: order_pairs_reversed.index(x[1]) if x[1] in order_pairs_reversed else float('inf')) + sorted_first_list[key] = sorted_pairs + return 
sorted_first_list + +def find_position(conflict_pos, column): + for pairs in conflict_pos.items(): + for pair in pairs[1]: + if pair[1] == column: + return pairs[1].index(pair) + return 0 + +def record_bank_access(self, r, c) : + if self.cells[r][c].op == "LWD": + addr = self.load_addr[self.cells[r][c].col] - (4 * self.curr_lwd[c]) + self.curr_lwd[c] -= 1 + elif self.cells[r][c].op == "LWI": + instr = self.cells[r][c].instr + instr = instr.split() + addr = self.cells[r][c].fetch_val(instr[2]) + elif self.cells[r][c].op == "SWD": + addr = self.store_addr[self.cells[r][c].col] - (4 * self.curr_swd[c]) + index_pos = int(((addr - self.init_store[0]) / 4) % 8) + self.curr_swd[c] -= 1 + return index_pos + elif self.cells[r][c].op == "SWI": + instr = self.cells[r][c].instr + instr = instr.split() + addr = self.cells[r][c].fetch_val(instr[2]) + index_pos = int(((addr - sorted(self.memory)[0][0]) / 4) % 8) + return index_pos -def display_characterization(cgra): - print("Longest instructions per cycle:\n") - print("{:<8} {:<25} {:<10}".format("Cycle", "Instruction", "Latency (CC)")) - for index, item in enumerate(cgra.instr_latency_cc): - print("{:<2} {:<6} {:<25} {:<10}".format(index + 1, f'({item.instr2exec})', item.instr, item.latency_cc)) - print("\nTotal latency for all instructions:", cgra.total_latency_cc, "CC") \ No newline at end of file +def display_characterization(cgra, pr): + if any(item in pr for item in ["OP_MAX_LAT", "ALL_LAT_INFO"]): + print("Longest instructions per cycle:\n") + print("{:<8} {:<25} {:<10}".format("Cycle", "Instruction", "Latency (CC)")) + for index, item in enumerate(cgra.instr_latency_cc): + print("{:<2} {:<6} {:<25} {:<10}".format(index + 1, f'({item.instr2exec})', item.instr, item.latency_cc)) + if any(item in pr for item in ["TOTAL_LAT", "ALL_LAT_INFO"]): + print(f'\nConfiguration time: {len(cgra.instrs)} CC') + print(f'Time between end of configuration and start of first iteration: {math.ceil(14 + (len(cgra.instrs) * 3))} CC') + 
print(f'Total time for all instructions: {cgra.total_latency_cc}') + \ No newline at end of file diff --git a/src/operation_characterization.csv b/src/operation_characterization.csv index 0bdb3d8..c5a77d1 100644 --- a/src/operation_characterization.csv +++ b/src/operation_characterization.csv @@ -24,4 +24,13 @@ SWD,2 LWI,2 SWI,2 SMUL,3 -FXPMUL,3 \ No newline at end of file +FXPMUL,3 + +# operation_active_row_coef_mapping +ONE-TO-M, 1 +N-TO-M, 0 +INTERLEAVED, 0 +# operation_cpu_loop_instrs_mapping +ONE-TO-M, 5 +N-TO-M, 0 +INTERLEAVED, 0 \ No newline at end of file From 22cc30c1f94936c7001132736390a6e7b882a8b1 Mon Sep 17 00:00:00 2001 From: maaspa <90349318+maaspa@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:43:44 +0200 Subject: [PATCH 5/9] divided latency func into steps --- src/cgra.py | 16 ++-- src/characterization.py | 168 +++++++++++++++++++--------------------- 2 files changed, 90 insertions(+), 94 deletions(-) diff --git a/src/cgra.py b/src/cgra.py index e901128..921bc7b 100644 --- a/src/cgra.py +++ b/src/cgra.py @@ -12,7 +12,7 @@ INSTR_SIZE = N_ROWS+1 MAX_COL = N_COLS - 1 MAX_ROW = N_ROWS - 1 - +BUS_TYPES = ["ONE-TO-M", "N-TO-M", "INTERLEAVED"] PRINT_OUTS = 1 MAX_32b = 0xFFFFFFFF @@ -115,7 +115,7 @@ def step( self, prs="ROUT" ): reg = [[ self.cells[r][i].regs[regs[x]] for i in range(N_COLS) ] for x in range(len(regs)) ] print_out( prs, outs, insts, ops, reg ) - get_latency_cc(self, prs) + get_latency_cc(self) self.instr2exec += 1 self.cycles += 1 return self.exit @@ -189,6 +189,7 @@ def __init__( self, parent, row, col ): self.op = "" self.instr = "" self.latency_cc = 0 + self.addr = 0 def get_out( self ): return self.old_out @@ -258,25 +259,27 @@ def run_instr( self, instr): elif self.op in self.ops_lwd: des = instr[1] + self.addr = self.parent.load_addr[self.col] ret = self.parent.load_direct( self.col, 4 ) if des in self.regs: self.regs[des] = ret self.out = ret elif self.op in self.ops_swd: val = self.fetch_val( instr[1] ) + self.addr = 
self.parent.store_addr[self.col] self.parent.store_direct( self.col, val, 4 ) elif self.op in self.ops_lwi: des = instr[1] - addr = self.fetch_val( instr[2] ) - ret = self.parent.load_indirect(addr) + self.addr = self.fetch_val( instr[2] ) + ret = self.parent.load_indirect(self.addr) if des in self.regs: self.regs[des] = ret self.out = ret elif self.op in self.ops_swi: - addr = self.fetch_val( instr[2] ) + self.addr = self.fetch_val( instr[2] ) val = self.fetch_val( instr[1] ) - self.parent.store_indirect( addr, val ) + self.parent.store_indirect( self.addr, val ) pass elif self.op in self.ops_nop: @@ -410,6 +413,7 @@ def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs= # Run the kernel cgra = CGRA(ker, mem, load_addrs, store_addrs) + cgra.bus_type = next((item for item in BUS_TYPES if item in pr), "ONE-TO-M") mem = cgra.run(pr, limit) # Store the output sorted diff --git a/src/characterization.py b/src/characterization.py index 380bfcb..7e09785 100644 --- a/src/characterization.py +++ b/src/characterization.py @@ -41,13 +41,12 @@ def load_operation_characterization(characterization_type): bus_type_active_row_coef = load_operation_characterization("active_row_coef") bus_type_cpu_loop_instrs = load_operation_characterization("cpu_loop_instrs") -def get_latency_cc(self, prs): - bus_type = next((item for item in BUS_TYPES if item in prs), "ONE-TO-M") +def get_latency_cc(self): self.max_latency_instr = None - mem_latency_cc = find_longest_operation(self) - mem_latency_cc = adjust_latency_for_bus(self, mem_latency_cc, bus_type) - if mem_latency_cc > self.max_latency_instr.latency_cc: - self.max_latency_instr.latency_cc = mem_latency_cc + longest_alu_op_latency_cc = get_latency_alu_cc(self) + total_mem_latency_cc = get_latency_mem_cc(self) + self.max_latency_instr.latency_cc = max(longest_alu_op_latency_cc, total_mem_latency_cc) + if total_mem_latency_cc > longest_alu_op_latency_cc: self.max_latency_instr.instr = f'MEM 
({self.max_latency_instr.instr})' if (self.exit): self.max_latency_instr.latency_cc += 1 @@ -55,76 +54,70 @@ def get_latency_cc(self, prs): self.instr_latency_cc.append(copy.copy(self.max_latency_instr)) self.total_latency_cc += self.instr_latency_cc[-1].latency_cc -def find_longest_operation(self): +def get_latency_alu_cc(self): from cgra import N_ROWS, N_COLS - self.mem_count = [0] * N_COLS - mem_latency_cc = 0 for r in range(N_ROWS): for c in range(N_COLS): self.cells[r][c].latency_cc = int(operation_latency_mapping[self.cells[r][c].op]) if self.max_latency_instr is None or self.cells[r][c].latency_cc > self.max_latency_instr.latency_cc: - self.max_latency_instr = self.cells[r][c] - if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: - mem_latency_cc += 1 - self.mem_count[c] += 1 - if mem_latency_cc >= 1: - mem_latency_cc += 1 - return mem_latency_cc - -def adjust_latency_for_bus(self, mem_latency_cc, bus_type): - from cgra import N_ROWS, flag_poll_cnt - ACTIVE_ROW_COEF = bus_type_active_row_coef[bus_type] - CPU_LOOP_INSTRS = bus_type_cpu_loop_instrs[bus_type] - for i in range (N_ROWS): - if self.mem_count[i] != 0: - mem_latency_cc += ACTIVE_ROW_COEF - if CPU_LOOP_INSTRS != 0: - flag_poll_cnt += mem_latency_cc - if flag_poll_cnt % (CPU_LOOP_INSTRS - 1) == 0: - mem_latency_cc += 1 - if bus_type == "INTERLEAVED": - concurrent_accesses = group_sequential_accesses(self) - mem_latency_cc = find_longest_sequence(concurrent_accesses) - return mem_latency_cc + self.max_latency_instr = self.cells[r][c] + return self.max_latency_instr.latency_cc + +def get_latency_mem_cc(self): + record_bank_access(self) + self.concurrent_accesses = group_dma_accesses(self) + dependencies = track_dependencies(self) + latency_cc = compute_latency_cc(self, dependencies) + return latency_cc -def group_sequential_accesses(self): +def record_bank_access(self): from cgra import N_ROWS, N_COLS - self.curr_lwd = [0] * 4 - self.curr_swd = [0] * 4 - covered_accesses = [] - # count the number of 
direct accesses per column for r in range(N_ROWS): for c in range(N_COLS): - if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: - if self.cells[r][c].op == "LWD": - self.curr_lwd[c] += 1 - if self.cells[r][c].op == "SWD": - self.curr_swd[c] += 1 + if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: + self.cells[r][c].bank_index = compute_bank_index(self,r,c) + +def compute_bank_index(self, r, c) : + if self.bus_type == "INTERLEAVED": + if self.cells[r][c].op == "SWD": + index_pos = int(((self.cells[r][c].addr - self.init_store[0]) / 4) % 8) + else: + index_pos = int(((self.cells[r][c].addr - sorted(self.memory)[0][0]) / 4) % 8) + elif self.bus_type == "N-TO-M": + index_pos = 1 + elif self.bus_type == "ONE-TO-M": + index_pos = 1 + return index_pos + +def group_dma_accesses(self): + from cgra import N_ROWS, N_COLS + covered_accesses = [] concurrent_accesses = [{} for _ in range(4)] # reorder memory accesses to group into concurrent executions # covered_accesses tracks the accesses that have already been visited for r in range(N_ROWS): for c in range(N_COLS): if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS and (r, c) not in covered_accesses: - index_pos = record_bank_access(self, r, c) - covered_accesses.append((r, c)) - if index_pos not in concurrent_accesses[r]: - concurrent_accesses[r][index_pos] = [] - concurrent_accesses[r][index_pos].append((r, c) ) + covered_accesses, concurrent_accesses = update_accesses(covered_accesses, concurrent_accesses, r, c, r, self.cells[r][c].bank_index) else: for k in range(N_ROWS): if self.cells[k][c].op in OPERATIONS_MEMORY_ACCESS and (k, c) not in covered_accesses: - index_pos = record_bank_access(self, k, c) - covered_accesses.append((k, c)) - if index_pos not in concurrent_accesses[r]: - concurrent_accesses[r][index_pos] = [] - concurrent_accesses[r][index_pos].append((k, c)) + covered_accesses, concurrent_accesses = update_accesses(covered_accesses, concurrent_accesses, r, c, k, self.cells[k][c].bank_index) break - if not 
accesses_are_ordered(concurrent_accesses): + if self.bus_type != "INTERLEAVED": + concurrent_accesses = [{1: [(0, 0)] * len(covered_accesses)}, {}, {}, {}] + elif not accesses_are_ordered(concurrent_accesses): for i in range(N_ROWS - 1, 0, -1): concurrent_accesses[i-1] = rearrange_accesses(concurrent_accesses[i-1], concurrent_accesses[i]) return concurrent_accesses +def update_accesses(covered_accesses, concurrent_accesses, r, c, k, bank_index): + covered_accesses.append((k, c)) + if bank_index not in concurrent_accesses[r]: + concurrent_accesses[r][bank_index] = [] + concurrent_accesses[r][bank_index].append((k, c)) + return covered_accesses, concurrent_accesses + def accesses_are_ordered(concurrent_accesses): highest_row = [0] * 4 from cgra import N_ROWS @@ -137,20 +130,6 @@ def accesses_are_ordered(concurrent_accesses): highest_row[i] = current_access[0] return True -def find_longest_sequence(concurrent_accesses): - from cgra import N_ROWS - latency = [1] * 4 - for i in range (N_ROWS): - for values in concurrent_accesses[i].values(): - for current_access in values: - # find the position of an access within the conflict, as well as that of the next dependency - access_pos = find_position(concurrent_accesses[i], current_access[1]) + 1 - if i < N_ROWS - 1: - latency[current_access[1]] += access_pos - find_position(concurrent_accesses[i+1],current_access[1]) - else: - latency[current_access[1]] += access_pos - return max(latency) - def rearrange_accesses(first_list, second_list): order_pairs = [] for second_pairs in second_list.values(): @@ -163,6 +142,20 @@ def rearrange_accesses(first_list, second_list): sorted_first_list[key] = sorted_pairs return sorted_first_list +def track_dependencies(self): + from cgra import N_ROWS + latency = [1] * 4 + for i in range (N_ROWS): + for values in self.concurrent_accesses[i].values(): + for current_access in values: + # find the position of an access within the conflict, as well as that of the next dependency + access_pos = 
find_position(self.concurrent_accesses[i], current_access[1]) + 1 + if i < N_ROWS - 1: + latency[current_access[1]] += access_pos - find_position(self.concurrent_accesses[i+1],current_access[1]) + else: + latency[current_access[1]] += access_pos + return latency + def find_position(conflict_pos, column): for pairs in conflict_pos.items(): for pair in pairs[1]: @@ -170,25 +163,25 @@ def find_position(conflict_pos, column): return pairs[1].index(pair) return 0 -def record_bank_access(self, r, c) : - if self.cells[r][c].op == "LWD": - addr = self.load_addr[self.cells[r][c].col] - (4 * self.curr_lwd[c]) - self.curr_lwd[c] -= 1 - elif self.cells[r][c].op == "LWI": - instr = self.cells[r][c].instr - instr = instr.split() - addr = self.cells[r][c].fetch_val(instr[2]) - elif self.cells[r][c].op == "SWD": - addr = self.store_addr[self.cells[r][c].col] - (4 * self.curr_swd[c]) - index_pos = int(((addr - self.init_store[0]) / 4) % 8) - self.curr_swd[c] -= 1 - return index_pos - elif self.cells[r][c].op == "SWI": - instr = self.cells[r][c].instr - instr = instr.split() - addr = self.cells[r][c].fetch_val(instr[2]) - index_pos = int(((addr - sorted(self.memory)[0][0]) / 4) % 8) - return index_pos +def compute_latency_cc(self, dependencies): + from cgra import N_ROWS, N_COLS, flag_poll_cnt + ACTIVE_ROW_COEF = bus_type_active_row_coef[self.bus_type] + CPU_LOOP_INSTRS = bus_type_cpu_loop_instrs[self.bus_type] + mem_count = [0] * N_COLS + latency_cc = max(dependencies) + for r in range(N_ROWS): + for c in range(N_COLS): + if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: + mem_count[c] += 1 + if ACTIVE_ROW_COEF != 0: + for i in range (N_ROWS): + if mem_count[i] != 0: + latency_cc += ACTIVE_ROW_COEF + if CPU_LOOP_INSTRS != 0: + flag_poll_cnt += latency_cc + if flag_poll_cnt % (CPU_LOOP_INSTRS - 1) == 0: + latency_cc += 1 + return latency_cc def display_characterization(cgra, pr): if any(item in pr for item in ["OP_MAX_LAT", "ALL_LAT_INFO"]): @@ -199,5 +192,4 @@ def 
display_characterization(cgra, pr): if any(item in pr for item in ["TOTAL_LAT", "ALL_LAT_INFO"]): print(f'\nConfiguration time: {len(cgra.instrs)} CC') print(f'Time between end of configuration and start of first iteration: {math.ceil(14 + (len(cgra.instrs) * 3))} CC') - print(f'Total time for all instructions: {cgra.total_latency_cc}') - \ No newline at end of file + print(f'Total time for all instructions: {cgra.total_latency_cc}') \ No newline at end of file From b46f6d2b336ac2f9309b13b0d484c50809f55ba8 Mon Sep 17 00:00:00 2001 From: maaspa <90349318+maaspa@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:32:50 +0200 Subject: [PATCH 6/9] refactored latency algo --- src/cgra.py | 15 +-- src/characterization.py | 199 +++++++++++++++++++--------------------- src/kernels.py | 32 +------ src/memory.py | 41 +++++++++ 4 files changed, 144 insertions(+), 143 deletions(-) create mode 100644 src/memory.py diff --git a/src/cgra.py b/src/cgra.py index 921bc7b..2faf4c0 100644 --- a/src/cgra.py +++ b/src/cgra.py @@ -5,6 +5,7 @@ import os.path from characterization import display_characterization, get_latency_cc from kernels import * +from memory import * # CGRA from left to right, top to bottom N_ROWS = 4 @@ -21,7 +22,7 @@ srcs = ['ZERO', 'SELF', 'RCL', 'RCR', 'RCT', 'RCB', 'R0', 'R1', 'R2', 'R3', 'IMM'] dsts = ['SELF', 'RCL', 'RCR', 'RCT', 'RCB','R0', 'R1', 'R2', 'R3'] regs = dsts[-4:] -flag_poll_cnt = 0 + class INSTR: def __init__( self,matrix): self.time = matrix[0][0] # ToDo: Fix how we assign this length @@ -61,7 +62,7 @@ def print_out( prs, outs, insts, ops, reg ): class CGRA: - def __init__( self, kernel, memory, read_addrs, write_addrs): + def __init__( self, kernel, memory, read_addrs, write_addrs, memory_manager): self.cells = [] for r in range(N_ROWS): list = [] @@ -72,6 +73,8 @@ def __init__( self, kernel, memory, read_addrs, write_addrs): self.memory = memory self.instr2exec = 0 self.cycles = 0 + self.N_COLS = N_COLS + self.N_ROWS = N_ROWS self.total_latency_cc 
= 0 self.instr_latency_cc = [] self.max_latency_instr = None @@ -84,6 +87,7 @@ def __init__( self, kernel, memory, read_addrs, write_addrs): else: self.store_addr = [0]*N_COLS self.init_store = copy.copy(self.store_addr) + self.memory_manager = memory_manager self.exit = False def run( self, pr, limit ): @@ -114,7 +118,7 @@ def step( self, prs="ROUT" ): ops = [ self.cells[r][i].op for i in range(N_COLS) ] reg = [[ self.cells[r][i].regs[regs[x]] for i in range(N_COLS) ] for x in range(len(regs)) ] print_out( prs, outs, insts, ops, reg ) - + self.flag_poll_cnt = 0 get_latency_cc(self) self.instr2exec += 1 self.cycles += 1 @@ -387,7 +391,7 @@ def blt( self, val1, val2, branch ): -def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs=None): +def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs=None, memory_manager=MEMORY()): ker = [] mem = [] @@ -412,8 +416,7 @@ def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs= return None # Run the kernel - cgra = CGRA(ker, mem, load_addrs, store_addrs) - cgra.bus_type = next((item for item in BUS_TYPES if item in pr), "ONE-TO-M") + cgra = CGRA(ker, mem, load_addrs, store_addrs, memory_manager) mem = cgra.run(pr, limit) # Store the output sorted diff --git a/src/characterization.py b/src/characterization.py index 7e09785..1d24887 100644 --- a/src/characterization.py +++ b/src/characterization.py @@ -5,11 +5,12 @@ OPERATIONS_MEMORY_ACCESS = ["LWD", "LWI", "SWD","SWI"] BUS_TYPES = ["ONE-TO-M", "N-TO-M", "INTERLEAVED"] +INTERVAL_CST = 14 -def load_operation_characterization(characterization_type): +def load_operation_characterization(characterization_type, mapping_file): operation_mapping = {} script_dir = os.path.dirname(os.path.abspath(__file__)) - csv_file_path = os.path.join(script_dir, 'operation_characterization.csv') + csv_file_path = os.path.join(script_dir, mapping_file) with open(csv_file_path, 'r') as csvfile: reader = csv.reader(csvfile) for 
row in reader: @@ -37,91 +38,82 @@ def load_operation_characterization(characterization_type): continue return operation_mapping -operation_latency_mapping = load_operation_characterization("latency_cc") -bus_type_active_row_coef = load_operation_characterization("active_row_coef") -bus_type_cpu_loop_instrs = load_operation_characterization("cpu_loop_instrs") +operation_latency_mapping = load_operation_characterization("latency_cc", 'operation_characterization.csv') +bus_type_active_row_coef = load_operation_characterization("active_row_coef", 'operation_characterization.csv') +bus_type_cpu_loop_instrs = load_operation_characterization("cpu_loop_instrs", 'operation_characterization.csv') -def get_latency_cc(self): - self.max_latency_instr = None - longest_alu_op_latency_cc = get_latency_alu_cc(self) - total_mem_latency_cc = get_latency_mem_cc(self) - self.max_latency_instr.latency_cc = max(longest_alu_op_latency_cc, total_mem_latency_cc) +# This function takes the maximum latency between the memory operations and the non-memory operations in the instruction +def get_latency_cc(cgra): + cgra.max_latency_instr = None + longest_alu_op_latency_cc = get_latency_alu_cc(cgra) + total_mem_latency_cc = get_latency_mem_cc(cgra) + cgra.max_latency_instr.latency_cc = max(longest_alu_op_latency_cc, total_mem_latency_cc) if total_mem_latency_cc > longest_alu_op_latency_cc: - self.max_latency_instr.instr = f'MEM ({self.max_latency_instr.instr})' - if (self.exit): - self.max_latency_instr.latency_cc += 1 - self.max_latency_instr.instr2exec = self.instr2exec - self.instr_latency_cc.append(copy.copy(self.max_latency_instr)) - self.total_latency_cc += self.instr_latency_cc[-1].latency_cc - -def get_latency_alu_cc(self): - from cgra import N_ROWS, N_COLS - for r in range(N_ROWS): - for c in range(N_COLS): - self.cells[r][c].latency_cc = int(operation_latency_mapping[self.cells[r][c].op]) - if self.max_latency_instr is None or self.cells[r][c].latency_cc > 
self.max_latency_instr.latency_cc: - self.max_latency_instr = self.cells[r][c] - return self.max_latency_instr.latency_cc - -def get_latency_mem_cc(self): - record_bank_access(self) - self.concurrent_accesses = group_dma_accesses(self) - dependencies = track_dependencies(self) - latency_cc = compute_latency_cc(self, dependencies) + cgra.max_latency_instr.instr = f'MEM ({cgra.max_latency_instr.instr})' + if (cgra.exit): + cgra.max_latency_instr.latency_cc += 1 + cgra.max_latency_instr.instr2exec = cgra.instr2exec + cgra.instr_latency_cc.append(copy.copy(cgra.max_latency_instr)) + cgra.total_latency_cc += cgra.instr_latency_cc[-1].latency_cc + +def get_latency_alu_cc(cgra): + for r in range(cgra.N_ROWS): + for c in range(cgra.N_COLS): + cgra.cells[r][c].latency_cc = int(operation_latency_mapping[cgra.cells[r][c].op]) + if cgra.max_latency_instr is None or cgra.cells[r][c].latency_cc > cgra.max_latency_instr.latency_cc: + cgra.max_latency_instr = cgra.cells[r][c] + return cgra.max_latency_instr.latency_cc + +def get_latency_mem_cc(cgra): + record_bank_access(cgra) + cgra.concurrent_accesses = group_dma_accesses(cgra) + dependencies = track_dependencies(cgra) + latency_cc = compute_latency_cc(cgra, dependencies) return latency_cc -def record_bank_access(self): - from cgra import N_ROWS, N_COLS - for r in range(N_ROWS): - for c in range(N_COLS): - if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: - self.cells[r][c].bank_index = compute_bank_index(self,r,c) - -def compute_bank_index(self, r, c) : - if self.bus_type == "INTERLEAVED": - if self.cells[r][c].op == "SWD": - index_pos = int(((self.cells[r][c].addr - self.init_store[0]) / 4) % 8) - else: - index_pos = int(((self.cells[r][c].addr - sorted(self.memory)[0][0]) / 4) % 8) - elif self.bus_type == "N-TO-M": - index_pos = 1 - elif self.bus_type == "ONE-TO-M": - index_pos = 1 +# Record the bank index used for each memory access +def record_bank_access(cgra): + for r in range(cgra.N_ROWS): + for c in 
range(cgra.N_COLS): + if cgra.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: + cgra.cells[r][c].bank_index = compute_bank_index(cgra,r,c) + +def compute_bank_index(cgra, r, c): + base_addr = cgra.init_store[0] if cgra.cells[r][c].op == "SWD" else sorted(cgra.memory)[0][0] + if cgra.memory_manager.bus_type == "INTERLEAVED": + index_pos = int(((cgra.cells[r][c].addr - base_addr) / cgra.memory_manager.spacing) % cgra.memory_manager.n_banks) + else: + index_pos = cgra.cells[r][c].addr / cgra.memory_manager.bank_size return index_pos -def group_dma_accesses(self): - from cgra import N_ROWS, N_COLS - covered_accesses = [] +def group_dma_accesses(cgra): + # For each row, scan the PEs for memory accesses + # If it lands on a column without a memory access, then scan all the rows on that column (= push up/down) + cgra.covered_accesses = [] concurrent_accesses = [{} for _ in range(4)] - # reorder memory accesses to group into concurrent executions - # covered_accesses tracks the accesses that have already been visited - for r in range(N_ROWS): - for c in range(N_COLS): - if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS and (r, c) not in covered_accesses: - covered_accesses, concurrent_accesses = update_accesses(covered_accesses, concurrent_accesses, r, c, r, self.cells[r][c].bank_index) + for r in range(cgra.N_ROWS): + for c in range(cgra.N_COLS): + if cgra.cells[r][c].op in OPERATIONS_MEMORY_ACCESS and (r, c) not in cgra.covered_accesses: + cgra.covered_accesses, concurrent_accesses = mark_access(cgra.covered_accesses, concurrent_accesses, r, c, r, cgra.cells[r][c].bank_index) else: - for k in range(N_ROWS): - if self.cells[k][c].op in OPERATIONS_MEMORY_ACCESS and (k, c) not in covered_accesses: - covered_accesses, concurrent_accesses = update_accesses(covered_accesses, concurrent_accesses, r, c, k, self.cells[k][c].bank_index) + for k in range(cgra.N_ROWS): + if cgra.cells[k][c].op in OPERATIONS_MEMORY_ACCESS and (k, c) not in cgra.covered_accesses: + cgra.covered_accesses, 
concurrent_accesses = mark_access(cgra.covered_accesses, concurrent_accesses, r, c, k, cgra.cells[k][c].bank_index) break - if self.bus_type != "INTERLEAVED": - concurrent_accesses = [{1: [(0, 0)] * len(covered_accesses)}, {}, {}, {}] - elif not accesses_are_ordered(concurrent_accesses): - for i in range(N_ROWS - 1, 0, -1): + if not accesses_are_ordered(cgra, concurrent_accesses): + for i in range(cgra.N_ROWS - 1, 0, -1): concurrent_accesses[i-1] = rearrange_accesses(concurrent_accesses[i-1], concurrent_accesses[i]) return concurrent_accesses -def update_accesses(covered_accesses, concurrent_accesses, r, c, k, bank_index): +def mark_access(covered_accesses, concurrent_accesses, r, c, k, bank_index): + # Record a PE and its bank index into concurrent_accesses covered_accesses.append((k, c)) - if bank_index not in concurrent_accesses[r]: - concurrent_accesses[r][bank_index] = [] - concurrent_accesses[r][bank_index].append((k, c)) + concurrent_accesses[r].setdefault(bank_index, []).append((k, c)) return covered_accesses, concurrent_accesses -def accesses_are_ordered(concurrent_accesses): +def accesses_are_ordered(cgra, concurrent_accesses): highest_row = [0] * 4 - from cgra import N_ROWS - for i in range (N_ROWS): + for i in range (cgra.N_ROWS): for values in concurrent_accesses[i].values(): for current_access in values: if highest_row[i] > current_access[0]: @@ -130,30 +122,24 @@ def accesses_are_ordered(concurrent_accesses): highest_row[i] = current_access[0] return True +# This function arranges the concurrent lists to ensure they match the DMA's behavior def rearrange_accesses(first_list, second_list): - order_pairs = [] - for second_pairs in second_list.values(): - for second_pair in second_pairs: - order_pairs.append(second_pair[1]) - order_pairs_reversed = list(reversed(order_pairs)) - sorted_first_list = {} - for key, pairs in first_list.items(): - sorted_pairs = sorted(pairs, key=lambda x: order_pairs_reversed.index(x[1]) if x[1] in order_pairs_reversed else 
float('inf')) - sorted_first_list[key] = sorted_pairs - return sorted_first_list - -def track_dependencies(self): - from cgra import N_ROWS + order_pairs = [pair[1] for pairs in second_list.values() for pair in pairs][::-1] + return {key: sorted(pairs, key=lambda x: order_pairs.index(x[1]) if x[1] in order_pairs else float('inf')) + for key, pairs in first_list.items()} + +def track_dependencies(cgra): + # Latencies for non-interleaved bus types require the total number of accesses + if cgra.memory_manager.bus_type != "INTERLEAVED": + cgra.concurrent_accesses = [{1: [(0, 0)] * len(cgra.covered_accesses)}, {}, {}, {}] latency = [1] * 4 - for i in range (N_ROWS): - for values in self.concurrent_accesses[i].values(): + for i in range (cgra.N_ROWS): + for values in cgra.concurrent_accesses[i].values(): for current_access in values: # find the position of an access within the conflict, as well as that of the next dependency - access_pos = find_position(self.concurrent_accesses[i], current_access[1]) + 1 - if i < N_ROWS - 1: - latency[current_access[1]] += access_pos - find_position(self.concurrent_accesses[i+1],current_access[1]) - else: - latency[current_access[1]] += access_pos + current_pos = find_position(cgra.concurrent_accesses[i], current_access[1]) + 1 + next_pos = find_position(cgra.concurrent_accesses[i+1], current_access[1]) if i < cgra.N_ROWS - 1 else 0 + latency[current_access[1]] += current_pos - next_pos return latency def find_position(conflict_pos, column): @@ -163,24 +149,24 @@ def find_position(conflict_pos, column): return pairs[1].index(pair) return 0 -def compute_latency_cc(self, dependencies): - from cgra import N_ROWS, N_COLS, flag_poll_cnt - ACTIVE_ROW_COEF = bus_type_active_row_coef[self.bus_type] - CPU_LOOP_INSTRS = bus_type_cpu_loop_instrs[self.bus_type] - mem_count = [0] * N_COLS +def compute_latency_cc(cgra, dependencies): + # Account for additional bus type specific delays + ACTIVE_ROW_COEF = 
bus_type_active_row_coef[cgra.memory_manager.bus_type] + CPU_LOOP_INSTRS = bus_type_cpu_loop_instrs[cgra.memory_manager.bus_type] + mem_count = [0] * cgra.N_COLS latency_cc = max(dependencies) - for r in range(N_ROWS): - for c in range(N_COLS): - if self.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: + for r in range(cgra.N_ROWS): + for c in range(cgra.N_COLS): + if cgra.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: mem_count[c] += 1 - if ACTIVE_ROW_COEF != 0: - for i in range (N_ROWS): + if ACTIVE_ROW_COEF: + for i in range (cgra.N_ROWS): if mem_count[i] != 0: latency_cc += ACTIVE_ROW_COEF - if CPU_LOOP_INSTRS != 0: - flag_poll_cnt += latency_cc - if flag_poll_cnt % (CPU_LOOP_INSTRS - 1) == 0: - latency_cc += 1 + if CPU_LOOP_INSTRS: + cgra.flag_poll_cnt += latency_cc + if cgra.flag_poll_cnt % (CPU_LOOP_INSTRS - 1) == 0: + latency_cc += 1 return latency_cc def display_characterization(cgra, pr): @@ -191,5 +177,6 @@ def display_characterization(cgra, pr): print("{:<2} {:<6} {:<25} {:<10}".format(index + 1, f'({item.instr2exec})', item.instr, item.latency_cc)) if any(item in pr for item in ["TOTAL_LAT", "ALL_LAT_INFO"]): print(f'\nConfiguration time: {len(cgra.instrs)} CC') - print(f'Time between end of configuration and start of first iteration: {math.ceil(14 + (len(cgra.instrs) * 3))} CC') + cgra.interval_latency = math.ceil(INTERVAL_CST + (len(cgra.instrs) * 3)) + print(f'Time between end of configuration and start of first iteration: {cgra.interval_latency} CC') print(f'Total time for all instructions: {cgra.total_latency_cc}') \ No newline at end of file diff --git a/src/kernels.py b/src/kernels.py index 0670d31..d51904f 100644 --- a/src/kernels.py +++ b/src/kernels.py @@ -22,34 +22,4 @@ def kernel_new( name, dim=4 ): csv.writer(f).writerow(["Address", "Data"]) csv.writer(f).writerow(["0", "0"]) - print("Kernel", name, "created successfuly!") - - -def kernel_clear_memory( name, version=""): - import csv - filedir = "./"+name+"/" - with open(filedir + FILENAME_MEM + 
version + EXT,"w+", newline='') as f: - csv.writer(f).writerow(["Address", "Data"]) - - -def kernel_add_memory_region( name, start, vals, version=""): - import csv - mem = [] - region = [] - filedir = "./"+name+"/" - for i in range(len(vals)): - region.append([ start + i*WORD_SIZE,vals[i]]) - - try: - with open(filedir + FILENAME_MEM + version + EXT) as f: - for row in csv.reader(f): mem.append(row) - - for row in region: mem.append(row) - - with open(filedir + FILENAME_MEM + version + EXT,"w", newline='') as f: - for row in mem: csv.writer(f).writerow(row) - except: - print("Could not open memory file") - - - + print("Kernel", name, "created successfuly!") \ No newline at end of file diff --git a/src/memory.py b/src/memory.py new file mode 100644 index 0000000..0975a1e --- /dev/null +++ b/src/memory.py @@ -0,0 +1,41 @@ +EXT = ".csv" +FILENAME_INSTR = "instructions" +FILENAME_MEM = "memory" +FILENAME_INP = "inputs" +FILENAME_OUP = "outputs" +FILENAME_MEM_O = "memory_out" +WORD_SIZE = 4 + +class MEMORY: + def __init__( self,bus_type="ONE-TO-M", spacing=4, n_banks=8, bank_size=32000): + self.bus_type = bus_type + self.spacing = spacing + self.n_banks = n_banks + self.bank_size = bank_size + self.flag_poll_cnt = 0 + +def kernel_clear_memory( name, version=""): + import csv + filedir = "./"+name+"/" + with open(filedir + FILENAME_MEM + version + EXT,"w+", newline='') as f: + csv.writer(f).writerow(["Address", "Data"]) + +def kernel_add_memory_region( name, start, vals, version=""): + import csv + mem = [] + region = [] + filedir = "./"+name+"/" + for i in range(len(vals)): + region.append([ start + i*WORD_SIZE,vals[i]]) + + try: + with open(filedir + FILENAME_MEM + version + EXT) as f: + for row in csv.reader(f): mem.append(row) + + for row in region: mem.append(row) + + with open(filedir + FILENAME_MEM + version + EXT,"w", newline='') as f: + for row in mem: csv.writer(f).writerow(row) + except: + print("Could not open memory file") + From 
770f2da180f198632f994c393111e7f55e7cbe08 Mon Sep 17 00:00:00 2001 From: maaspa <90349318+maaspa@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:05:47 +0200 Subject: [PATCH 7/9] slight refactoring --- src/characterization.py | 89 ++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 46 deletions(-) diff --git a/src/characterization.py b/src/characterization.py index 1d24887..05429bc 100644 --- a/src/characterization.py +++ b/src/characterization.py @@ -7,40 +7,32 @@ BUS_TYPES = ["ONE-TO-M", "N-TO-M", "INTERLEAVED"] INTERVAL_CST = 14 -def load_operation_characterization(characterization_type, mapping_file): +def load_operation_characterization(characterization_type, mapping_file='operation_characterization.csv'): operation_mapping = {} - script_dir = os.path.dirname(os.path.abspath(__file__)) - csv_file_path = os.path.join(script_dir, mapping_file) + csv_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), mapping_file) with open(csv_file_path, 'r') as csvfile: reader = csv.reader(csvfile) + current_section = None for row in reader: - if not row: - continue - if row[0].startswith('#'): - current_section = row[0].strip('# ') - continue + if not row or row[0].startswith('#'): + current_section = row[0].strip('# ') if row else current_section + continue if current_section == f'operation_{characterization_type}_mapping': - if len(row) == 3: - key_type, value_type = int, float - operation, key, value = row - key = key_type(key) - value = value_type(value) + operation, *rest = row + if len(rest) == 1: + key = int(rest[0]) + operation_mapping[operation] = key + elif len(rest) == 2: + key = int(rest[0]) + value = float(rest[1]) if operation not in operation_mapping: operation_mapping[operation] = {} operation_mapping[operation][key] = value - elif len(row) == 2: - key_type, value_type = int, int - operation, key = row - key = key_type(key) - if operation not in operation_mapping: - operation_mapping[operation] = key - else: - continue 
return operation_mapping -operation_latency_mapping = load_operation_characterization("latency_cc", 'operation_characterization.csv') -bus_type_active_row_coef = load_operation_characterization("active_row_coef", 'operation_characterization.csv') -bus_type_cpu_loop_instrs = load_operation_characterization("cpu_loop_instrs", 'operation_characterization.csv') +operation_latency_mapping = load_operation_characterization("latency_cc") +bus_type_active_row_coef = load_operation_characterization("active_row_coef") +bus_type_cpu_loop_instrs = load_operation_characterization("cpu_loop_instrs") # This function takes the maximum latency between the memory operations and the non-memory operations in the instruction def get_latency_cc(cgra): @@ -87,8 +79,8 @@ def compute_bank_index(cgra, r, c): return index_pos def group_dma_accesses(cgra): - # For each row, scan the PEs for memory accesses - # If it lands on a column without a memory access, then scan all the rows on that column (= push up/down) + # For each row, scan the PEs for memory accesses and place them into concurrent_accesses + # If a column has no memory access, then scan all the rows on that column (=push up) cgra.covered_accesses = [] concurrent_accesses = [{} for _ in range(4)] for r in range(cgra.N_ROWS): @@ -101,8 +93,7 @@ def group_dma_accesses(cgra): cgra.covered_accesses, concurrent_accesses = mark_access(cgra.covered_accesses, concurrent_accesses, r, c, k, cgra.cells[k][c].bank_index) break if not accesses_are_ordered(cgra, concurrent_accesses): - for i in range(cgra.N_ROWS - 1, 0, -1): - concurrent_accesses[i-1] = rearrange_accesses(concurrent_accesses[i-1], concurrent_accesses[i]) + concurrent_accesses = rearrange_accesses(cgra, concurrent_accesses) return concurrent_accesses def mark_access(covered_accesses, concurrent_accesses, r, c, k, bank_index): @@ -112,31 +103,37 @@ def mark_access(covered_accesses, concurrent_accesses, r, c, k, bank_index): return covered_accesses, concurrent_accesses def 
accesses_are_ordered(cgra, concurrent_accesses): - highest_row = [0] * 4 - for i in range (cgra.N_ROWS): - for values in concurrent_accesses[i].values(): - for current_access in values: - if highest_row[i] > current_access[0]: - return False - else: - highest_row[i] = current_access[0] - return True + if (cgra.memory_manager.bus_type != "INTERLEAVED"): + return False + else: + highest_row = [0] * 4 + for i in range (cgra.N_ROWS): + for values in concurrent_accesses[i].values(): + for current_access in values: + if highest_row[i] > current_access[0]: + return False + else: + highest_row[i] = current_access[0] + return True # This function arranges the concurrent lists to ensure they match the DMA's behavior -def rearrange_accesses(first_list, second_list): - order_pairs = [pair[1] for pairs in second_list.values() for pair in pairs][::-1] - return {key: sorted(pairs, key=lambda x: order_pairs.index(x[1]) if x[1] in order_pairs else float('inf')) - for key, pairs in first_list.items()} +def rearrange_accesses(cgra, concurrent_accesses): + if cgra.memory_manager.bus_type == "INTERLEAVED": + for i in range(cgra.N_ROWS - 1, 0, -1): + order_pairs = [pair[1] for pairs in concurrent_accesses[i].values() for pair in pairs][::-1] + concurrent_accesses[i-1] = {key: sorted(pairs, key=lambda x: order_pairs.index(x[1]) if x[1] in order_pairs else float('inf')) for key, pairs in concurrent_accesses[i-1].items()} + else: + # Latencies for non-interleaved bus types require the total number of accesses + concurrent_accesses = [{1: [(0, 0)] * len(cgra.covered_accesses)}, {}, {}, {}] + return concurrent_accesses def track_dependencies(cgra): - # Latencies for non-interleaved bus types require the total number of accesses - if cgra.memory_manager.bus_type != "INTERLEAVED": - cgra.concurrent_accesses = [{1: [(0, 0)] * len(cgra.covered_accesses)}, {}, {}, {}] latency = [1] * 4 for i in range (cgra.N_ROWS): for values in cgra.concurrent_accesses[i].values(): for current_access in values: 
- # find the position of an access within the conflict, as well as that of the next dependency + # Compare each access with its next dependency (=subsequent access at same column) + # Record the difference between the access within the conflict, and the subsequent access current_pos = find_position(cgra.concurrent_accesses[i], current_access[1]) + 1 next_pos = find_position(cgra.concurrent_accesses[i+1], current_access[1]) if i < cgra.N_ROWS - 1 else 0 latency[current_access[1]] += current_pos - next_pos @@ -171,7 +168,7 @@ def compute_latency_cc(cgra, dependencies): def display_characterization(cgra, pr): if any(item in pr for item in ["OP_MAX_LAT", "ALL_LAT_INFO"]): - print("Longest instructions per cycle:\n") + print("\nLongest instructions per cycle:\n") print("{:<8} {:<25} {:<10}".format("Cycle", "Instruction", "Latency (CC)")) for index, item in enumerate(cgra.instr_latency_cc): print("{:<2} {:<6} {:<25} {:<10}".format(index + 1, f'({item.instr2exec})', item.instr, item.latency_cc)) From de70c2ca2da163e1cb2705663fa23979be986c40 Mon Sep 17 00:00:00 2001 From: maaspa <90349318+maaspa@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:34:37 +0200 Subject: [PATCH 8/9] renamed var names / fixed cpu estimation logic --- INTRODUCTION.ipynb | 10 +++++----- README.md | 4 ++-- examples/convolution.ipynb | 16 ++++++++-------- examples/pMean.ipynb | 4 ++-- examples/strsearch.ipynb | 4 ++-- src/README.md | 25 +++++++++++++++++++++++++ src/cgra.py | 4 ++-- src/characterization.py | 31 +++++++++++++++++++------------ src/memory.py | 15 +++++++-------- 9 files changed, 72 insertions(+), 41 deletions(-) create mode 100644 src/README.md diff --git a/INTRODUCTION.ipynb b/INTRODUCTION.ipynb index 57e4ca5..74411fa 100644 --- a/INTRODUCTION.ipynb +++ b/INTRODUCTION.ipynb @@ -185,9 +185,9 @@ "kernel_name = \"examples/convolution\"\n", "version = \"_v1\"\n", "# Input data\n", - "kernel_clear_memory(kernel_name, version)\n", - "kernel_add_memory_region(kernel_name, 36464, [1, 2, 3, 
4, 5, 6, 7, 8, 9], version)\n", - "kernel_add_memory_region(kernel_name, 100, [9], version)\n", + "clear_memory(kernel_name, version)\n", + "add_memory_region(kernel_name, 36464, [1, 2, 3, 4, 5, 6, 7, 8, 9], version)\n", + "add_memory_region(kernel_name, 100, [9], version)\n", "# Set load and write addrs\n", "load_addrs = [36464, 36464, 36464, 100]\n", "store_addrs = [0, 0, 0, 4]\n", @@ -233,8 +233,8 @@ "\n", "# To test this kernel we will fill the memory with two random arrays\n", "# Note that the addresses in memory (36464 and 37800) correspond to inputs in the inputs.csv file)\n", - "kernel_clear_memory(kernel_name)\n", - "kernel_add_memory_region(kernel_name, 1000, [50, 123, 36464, 112, 37800, 45])" + "clear_memory(kernel_name)\n", + "add_memory_region(kernel_name, 1000, [50, 123, 36464, 112, 37800, 45])" ] }, { diff --git a/README.md b/README.md index 7e469eb..01036ee 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Each kernel has a folder with its name. All the files needed to run a simulation * (optional) A hand-written assembly file. * (optional) A `memory.csv` file including the indexed values that the kernel can access from memory. - You can either write this file by hand or fill the memory using the `kernel_add_memory_region` function. + You can either write this file by hand or fill the memory using the `add_memory_region` function. * A `instructions.csv` file containing a matrix of operations to be executed by each Processing Element (PE) during each instruction. This file can be automatically generated from a `.out` file or hand-written assembly. @@ -42,7 +42,7 @@ kernel_new("") ### Memory The memory can be easily populated with some patter by calling the function ```python -kernel_add_memory_region( "",
, , []) +add_memory_region( "",
, , []) ``` The `memory.csv` file should always have this format diff --git a/examples/convolution.ipynb b/examples/convolution.ipynb index a6e5303..dee85f2 100644 --- a/examples/convolution.ipynb +++ b/examples/convolution.ipynb @@ -173,9 +173,9 @@ "kernel_name = \"convolution\"\n", "version = \"_v1\"\n", "# Input data\n", - "kernel_clear_memory(kernel_name, version)\n", - "kernel_add_memory_region(kernel_name, 36464, [1, 2, 3, 4, 5, 6, 7, 8, 9], version)\n", - "kernel_add_memory_region(kernel_name, 100, [9], version)\n", + "clear_memory(kernel_name, version)\n", + "add_memory_region(kernel_name, 36464, [1, 2, 3, 4, 5, 6, 7, 8, 9], version)\n", + "add_memory_region(kernel_name, 100, [9], version)\n", "# Set load and write addrs\n", "load_addrs = [36464, 36464, 36464, 100]\n", "store_addrs = [0, 0, 0, 4]\n", @@ -197,7 +197,7 @@ "outputs": [], "source": [ "4, [1, 2, 3, 4, 5, 6, 7, 8, 9], version)\n", - "kernel_add_memory_region(kernel_name, 100, [9], version)\n", + "add_memory_region(kernel_name, 100, [9], version)\n", "# Set load and write addrs\n", "load_addrs = [36464, 36464, 36464, 100]\n", "store_addrs = [0, 0, 0, 4]\n", @@ -375,10 +375,10 @@ "kernel_name = \"convolution\"\n", "version = \"_v2\"\n", "# Input data\n", - "kernel_clear_memory(kernel_name, version)\n", - "kernel_add_memory_region(kernel_name, 1000, [1,1,1], version)\n", - "kernel_add_memory_region(kernel_name, 36464, [1, 2, 3, 4, 5, 6, 7, 8, 9], version)\n", - "kernel_add_memory_region(kernel_name, 100, [9], version)\n", + "clear_memory(kernel_name, version)\n", + "add_memory_region(kernel_name, 1000, [1,1,1], version)\n", + "add_memory_region(kernel_name, 36464, [1, 2, 3, 4, 5, 6, 7, 8, 9], version)\n", + "add_memory_region(kernel_name, 100, [9], version)\n", "# Set load and write addrs\n", "load_addrs = [36468, 36468, 36468, 100]\n", "# Run the example\n", diff --git a/examples/pMean.ipynb b/examples/pMean.ipynb index 74ec2b6..5c746b7 100644 --- a/examples/pMean.ipynb +++ b/examples/pMean.ipynb 
@@ -537,8 +537,8 @@ "\n", "kernel_name = \"pMean\"\n", "# Input data\n", - "kernel_clear_memory(kernel_name)\n", - "kernel_add_memory_region(kernel_name, 1000, [101,110,189,95,65,246,80,120,105,60,70,89,105,95,101,150,99,189,216,253,115,22,120,103,22,120,103,230,88,194,153,231,43,177,150,99,189,216,253,115,22,-1])\n", + "clear_memory(kernel_name)\n", + "add_memory_region(kernel_name, 1000, [101,110,189,95,65,246,80,120,105,60,70,89,105,95,101,150,99,189,216,253,115,22,120,103,22,120,103,230,88,194,153,231,43,177,150,99,189,216,253,115,22,-1])\n", "# Set load and write addrs\n", "load_addrs = [1000, 0, 0, 0]\n", "store_addrs = [4, 0, 0, 0]\n", diff --git a/examples/strsearch.ipynb b/examples/strsearch.ipynb index 9a785bc..5184849 100644 --- a/examples/strsearch.ipynb +++ b/examples/strsearch.ipynb @@ -627,8 +627,8 @@ "\n", "kernel_name = \"strsearch\"\n", "# Input data\n", - "kernel_clear_memory(kernel_name)\n", - "kernel_add_memory_region(kernel_name, 1000, [50, 123, 36464, 112, 37800, 45])\n", + "clear_memory(kernel_name)\n", + "add_memory_region(kernel_name, 1000, [50, 123, 36464, 112, 37800, 45])\n", "# Set load and write addrs\n", "load_addrs = [0, 1000, 1008, 1016]\n", "store_addrs = [0, 4, 0, 0]\n", diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000..e799547 --- /dev/null +++ b/src/README.md @@ -0,0 +1,25 @@ +# Latency +The simulator features an estimation tool that provides latency results for each instruction. + +To do so, an algorithm returns the maximum value between the longest ALU (non-memory) operation and the total latency due to memory accesses. + +### Instruction-level breakdown +* ALU operation latencies are straightforward to compute, as they all take 1 CC to execute, with the exception of multiplications (SMUL and FXPMUL) which take 3 CCs. +* To compute the total latency of the memory operations (LWD, SWD, LWI, and SWI), the simulator must consider the bus type (default: one-to-M). 
+* In the case where the instruction contains an “EXIT” operation, 1 CC is added to the final latency. +### Printing results +* Finally, to select the output information to display, users must pass one or more of the following strings to the `pr` string array: + * `OP_MAX_LAT`: prints the longest operation for each instruction. If the longest operation is a memory access, the resulting output will be “MEM” followed by the operation’s name in parentheses. + * `TOTAL_LAT`: only displays the configuration time, the time between end of configuration and start of first iteration, and the total time for all instructions. + * `ALL_LAT_INFO`: prints all latency information. + +### Parametrization +To specify a different bus type (one-to-M, N-to-M, or interleaved), users must instantiate the `MEMORY` class with the desired bus type and pass that instance to the CGRA’s `run` function like so: + + +```python +memory_manager = MEMORY("INTERLEAVED") +run(kernel_name, pr=["ROUT","OPS", "ALL_LAT_INFO","ALL_PWR_EN_INFO"], load_addrs=load_addrs, store_addrs=store_addrs, limit = 300, memory_manager=memory_manager) +``` + +The estimator can be further parametrized by directly modifying the `operation_characterization.csv` file, from which per-operation latencies and bus type specificities are fetched. 
\ No newline at end of file diff --git a/src/cgra.py b/src/cgra.py index 2faf4c0..ef68212 100644 --- a/src/cgra.py +++ b/src/cgra.py @@ -401,7 +401,7 @@ def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs= # Create an empty memory file if there is not any if not os.path.isfile(kernel + "/"+FILENAME_MEM+version+EXT): - kernel_clear_memory(kernel, version) + clear_memory(kernel, version) # Read the memory file with open( kernel + "/"+FILENAME_MEM+version+EXT, 'r') as f: @@ -424,4 +424,4 @@ def run( kernel, version="", pr="ROUT", limit=100, load_addrs=None, store_addrs= with open( kernel + "/"+FILENAME_MEM_O+version+EXT, 'w+') as f: for row in sorted_mem: csv.writer(f).writerow(row) display_characterization(cgra, pr) - print("\n\nEND") + print("\n\nEND") \ No newline at end of file diff --git a/src/characterization.py b/src/characterization.py index 05429bc..52eec1c 100644 --- a/src/characterization.py +++ b/src/characterization.py @@ -60,7 +60,7 @@ def get_latency_mem_cc(cgra): record_bank_access(cgra) cgra.concurrent_accesses = group_dma_accesses(cgra) dependencies = track_dependencies(cgra) - latency_cc = compute_latency_cc(cgra, dependencies) + latency_cc = get_total_memory_access_cc(cgra, dependencies) return latency_cc # Record the bank index used for each memory access @@ -71,18 +71,21 @@ def record_bank_access(cgra): cgra.cells[r][c].bank_index = compute_bank_index(cgra,r,c) def compute_bank_index(cgra, r, c): - base_addr = cgra.init_store[0] if cgra.cells[r][c].op == "SWD" else sorted(cgra.memory)[0][0] + if (cgra.memory): + base_addr = cgra.init_store[0] if cgra.cells[r][c].op == "SWD" else sorted(cgra.memory)[0][0] if cgra.memory_manager.bus_type == "INTERLEAVED": - index_pos = int(((cgra.cells[r][c].addr - base_addr) / cgra.memory_manager.spacing) % cgra.memory_manager.n_banks) + index_pos = int(((cgra.cells[r][c].addr - base_addr) / cgra.memory_manager.word_size_B) % cgra.memory_manager.banks_n) + elif 
cgra.memory_manager.bus_type == "N-TO-M": + index_pos = cgra.cells[r][c].addr / cgra.memory_manager.bank_size_B else: - index_pos = cgra.cells[r][c].addr / cgra.memory_manager.bank_size + index_pos = 1 return index_pos def group_dma_accesses(cgra): # For each row, scan the PEs for memory accesses and place them into concurrent_accesses # If a column has no memory access, then scan all the rows on that column (=push up) cgra.covered_accesses = [] - concurrent_accesses = [{} for _ in range(4)] + concurrent_accesses = [{} for _ in range(cgra.N_ROWS)] for r in range(cgra.N_ROWS): for c in range(cgra.N_COLS): if cgra.cells[r][c].op in OPERATIONS_MEMORY_ACCESS and (r, c) not in cgra.covered_accesses: @@ -106,7 +109,7 @@ def accesses_are_ordered(cgra, concurrent_accesses): if (cgra.memory_manager.bus_type != "INTERLEAVED"): return False else: - highest_row = [0] * 4 + highest_row = [0] * cgra.N_ROWS for i in range (cgra.N_ROWS): for values in concurrent_accesses[i].values(): for current_access in values: @@ -128,7 +131,8 @@ def rearrange_accesses(cgra, concurrent_accesses): return concurrent_accesses def track_dependencies(cgra): - latency = [1] * 4 + latency = [1] * cgra.N_COLS + # Iterate over each sequence (= flattened row), examining them two-by-two: for i in range (cgra.N_ROWS): for values in cgra.concurrent_accesses[i].values(): for current_access in values: @@ -146,7 +150,7 @@ def find_position(conflict_pos, column): return pairs[1].index(pair) return 0 -def compute_latency_cc(cgra, dependencies): +def get_total_memory_access_cc(cgra, dependencies): # Account for additional bus type specific delays ACTIVE_ROW_COEF = bus_type_active_row_coef[cgra.memory_manager.bus_type] CPU_LOOP_INSTRS = bus_type_cpu_loop_instrs[cgra.memory_manager.bus_type] @@ -155,15 +159,18 @@ def compute_latency_cc(cgra, dependencies): for r in range(cgra.N_ROWS): for c in range(cgra.N_COLS): if cgra.cells[r][c].op in OPERATIONS_MEMORY_ACCESS: - mem_count[c] += 1 + mem_count[r] += 1 if 
ACTIVE_ROW_COEF: for i in range (cgra.N_ROWS): if mem_count[i] != 0: latency_cc += ACTIVE_ROW_COEF + if CPU_LOOP_INSTRS: + cgra.flag_poll_cnt += ACTIVE_ROW_COEF + if cgra.flag_poll_cnt % (CPU_LOOP_INSTRS - 1) == 0: + latency_cc += 1 if CPU_LOOP_INSTRS: - cgra.flag_poll_cnt += latency_cc - if cgra.flag_poll_cnt % (CPU_LOOP_INSTRS - 1) == 0: - latency_cc += 1 + if max(mem_count) == 0: + cgra.flag_poll_cnt += latency_cc return latency_cc def display_characterization(cgra, pr): diff --git a/src/memory.py b/src/memory.py index 0975a1e..e7a487d 100644 --- a/src/memory.py +++ b/src/memory.py @@ -7,20 +7,20 @@ WORD_SIZE = 4 class MEMORY: - def __init__( self,bus_type="ONE-TO-M", spacing=4, n_banks=8, bank_size=32000): + def __init__( self,bus_type="ONE-TO-M", word_size_B=4, banks_n=8, bank_size_B=32000): self.bus_type = bus_type - self.spacing = spacing - self.n_banks = n_banks - self.bank_size = bank_size + self.word_size_B = word_size_B + self.banks_n = banks_n + self.bank_size_B = bank_size_B self.flag_poll_cnt = 0 -def kernel_clear_memory( name, version=""): +def clear_memory( name, version=""): import csv filedir = "./"+name+"/" with open(filedir + FILENAME_MEM + version + EXT,"w+", newline='') as f: csv.writer(f).writerow(["Address", "Data"]) -def kernel_add_memory_region( name, start, vals, version=""): +def add_memory_region( name, start, vals, version=""): import csv mem = [] region = [] @@ -37,5 +37,4 @@ def kernel_add_memory_region( name, start, vals, version=""): with open(filedir + FILENAME_MEM + version + EXT,"w", newline='') as f: for row in mem: csv.writer(f).writerow(row) except: - print("Could not open memory file") - + print("Could not open memory file") \ No newline at end of file From 4b563b8b759c43947dcccbd3841f195e06fe36fe Mon Sep 17 00:00:00 2001 From: Juan Sapriza Date: Fri, 30 Aug 2024 15:44:14 +0200 Subject: [PATCH 9/9] test commit --- src/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/README.md b/src/README.md 
index e799547..23a7b4a 100644 --- a/src/README.md +++ b/src/README.md @@ -4,7 +4,7 @@ The simulator features an estimation tool that provides latency results for each To do so, an algorithm returns the maximum value between the longest ALU (non-memory) operation and the total latency due to memory accesses. ### Instruction-level breakdown -* ALU operation latencies are straightforward to compute, as they all take 1 CC to execute, with the exception of multiplications (SMUL and FXPMUL) which take 3 CCs. +* ALU operation latencies are straightforward to compute, as they all take 1 CC to execute, with the exception of multiplications (SMUL and FXPMUL) which take 3 CCs. * To compute the total latency of the memory operations (LWD, SWD, LWI, and SWI), the simulator must consider the bus type (default: one-to-M). * In the case where the instruction contains an “EXIT” operation, 1 CC is added to the final latency. ### Printing results