From 8b549142bbb869054c6ec3e004367e76ddab3320 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Mon, 3 Apr 2023 18:18:29 +0200
Subject: [PATCH 01/61] added files and started implementation of
 examples/hip/test_vector_add.py examples/hip/vector_add.py and backend/hip.py

---
 examples/hip/test_vector_add.py | 38 +++++++++++++++++++++++++++++
 examples/hip/vector_add.py      | 43 +++++++++++++++++++++++++++++++++
 kernel_tuner/backends/hip.py    | 30 +++++++++++++++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 examples/hip/test_vector_add.py
 create mode 100644 examples/hip/vector_add.py
 create mode 100644 kernel_tuner/backends/hip.py

diff --git a/examples/hip/test_vector_add.py b/examples/hip/test_vector_add.py
new file mode 100644
index 000000000..9e2c69eeb
--- /dev/null
+++ b/examples/hip/test_vector_add.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+"""Minimal example for a HIP Kernel unit test with the Kernel Tuner"""
+
+import numpy
+from kernel_tuner import run_kernel
+import pytest
+
+def test_vector_add():
+    #Check pyhip is installed and if a HIP capable device is present, if not skip the test
+    try:
+        import pyhip as hip
+        hip.hipGetDeviceProperties(0)
+    except (ImportError, Exception):
+        pytest.skip("PyHIP not installed or no HIP device detected")
+
+    kernel_string = """
+    __global__ void vector_add(float *c, float *a, float *b, int n) {
+        int i = blockIdx.x * block_size_x + threadIdx.x;
+        if (i<n) {
+            c[i] = a[i] + b[i];
+        }
+    }
+    """
+
+    size = 10000000
+    problem_size = (size, 1)
+
+    a = numpy.random.randn(size).astype(numpy.float32)
+    b = numpy.random.randn(size).astype(numpy.float32)
+    c = numpy.zeros_like(b)
+    n = numpy.int32(size)
+
+    args = [c, a, b, n]
+    params = {"block_size_x": 512}
+
+    answer = run_kernel("vector_add", kernel_string, problem_size, args, params)
+
+    assert numpy.allclose(answer[0], a+b, atol=1e-8)
\ No newline at end of file
diff --git a/examples/hip/vector_add.py b/examples/hip/vector_add.py
new file mode 100644
index 000000000..d52c21b27
--- /dev/null
+++ b/examples/hip/vector_add.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+"""This is the minimal example from the README"""
+
+import numpy
+from kernel_tuner import tune_kernel
+from kernel_tuner.file_utils import store_output_file, store_metadata_file
+
+def tune():
+
+    kernel_string = """
+    __global__ void vector_add(float *c, float *a, float *b, int n) {
+        int i = blockIdx.x * block_size_x + threadIdx.x;
+        if (i<n) {
+            c[i] = a[i] + b[i];
+        }
+    }
+    """
+
+    size = 10000000
+
+    a = numpy.random.randn(size).astype(numpy.float32)
+    b = numpy.random.randn(size).astype(numpy.float32)
+    c = numpy.zeros_like(b)
+    n = numpy.int32(size)
+
+    args = [c, a, b, n]
+
+    tune_params = dict()
+    tune_params["block_size_x"] = [128+64*i for i in range(15)]
+
+    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params)
+
+    # Store the tuning results in an output file
+    store_output_file("vector_add.json", results, tune_params)
+
+    # Store the metadata of this run
+    store_metadata_file("vector_add-metadata.json")
+
+    return results
+
+
+if __name__ == "__main__":
+    tune()
\ No newline at end of file
diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
new file mode 100644
index 000000000..610a7d7bc
--- /dev/null
+++ b/kernel_tuner/backends/hip.py
@@ -0,0 +1,30 @@
+"""This module contains all HIP specific kernel_tuner functions"""
+
+import numpy as np
+
+from kernel_tuner.backends.backend import GPUBackend
+
+# embedded in try block to be able to generate documentation
+# and run tests without pyhip installed
+try:
+    import pyhip as hip
+except ImportError:
+    hip = None
+
+class HipFunctions(GPUBackend):
+    """Class that groups the HIP functions on maintains state about the device"""
+
+    def __init__(self, device=0, iterations=7, compiler_options=None, observers=None):
+        """instantiate HipFunctions object used for interacting with the HIP device
+
+        Instantiating this object will inspect and store certain device properties at
+        runtime, which are used during compilation and/or execution of kernels by the
+        kernel tuner. It also maintains a reference to the most recently compiled
+        source module for copying data to constant memory before kernel launch.
+
+        :param device: Number of HIP device to use for this context
+        :type device: int
+
+        :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
+        :type iterations: int
+        """
\ No newline at end of file

From 59a979f5e09582f86a3f4d095fe813207988c3b6 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Tue, 4 Apr 2023 10:56:46 +0200
Subject: [PATCH 02/61] added if statement for HIP lang detection in core.py
 class DeviceInterface

---
 kernel_tuner/core.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index accb58cc1..90c9bb4fa 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -19,6 +19,7 @@
 from kernel_tuner.backends.opencl import OpenCLFunctions
 from kernel_tuner.backends.c import CFunctions
 from kernel_tuner.backends.opencl import OpenCLFunctions
+from kernel_tuner.backends.hip import HipFunctions
 import kernel_tuner.util as util
 
 try:
@@ -239,6 +240,8 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No
             dev = OpenCLFunctions(device, platform, compiler_options=compiler_options, iterations=iterations, observers=observers)
         elif lang.upper() in ["C", "FORTRAN"]:
             dev = CFunctions(compiler=compiler, compiler_options=compiler_options, iterations=iterations)
+        elif lang.upper() == "HIP":
+            dev = HipFunctions(device, compiler_options=compiler_options, iterations=iterations, observers=observers)
         else:
             raise ValueError("Sorry, support for languages other than CUDA, OpenCL, or C is not implemented yet")
 

From 303e4d5201ddbc5eb1fe76b872a0080368561aa5 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Tue, 4 Apr 2023 11:52:58 +0200
Subject: [PATCH 03/61] added hip related code to detect_language in util.py

---
 kernel_tuner/util.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index e10f788ea..bc3cecf3e 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -259,7 +259,9 @@ def delete_temp_file(filename):
 
 def detect_language(kernel_string):
     """attempt to detect language from the kernel_string"""
-    if "__global__" in kernel_string:
+    if "__device__" or "__host__" or "__global__ void" in kernel_string:
+        lang = "HIP"
+    elif "__global__" in kernel_string:
         lang = "CUDA"
     elif "__kernel" in kernel_string:
         lang = "OpenCL"

From c3278930f8f4abd6c6f439ce619979c62ea0ec62 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Tue, 4 Apr 2023 13:27:03 +0200
Subject: [PATCH 04/61] modified hip tests to enclude hIP lang argument
 speficication when calling tune_kernel

---
 examples/hip/test_vector_add.py | 2 +-
 examples/hip/vector_add.py      | 2 +-
 kernel_tuner/util.py            | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/hip/test_vector_add.py b/examples/hip/test_vector_add.py
index 9e2c69eeb..6fd21aced 100644
--- a/examples/hip/test_vector_add.py
+++ b/examples/hip/test_vector_add.py
@@ -33,6 +33,6 @@ def test_vector_add():
     args = [c, a, b, n]
     params = {"block_size_x": 512}
 
-    answer = run_kernel("vector_add", kernel_string, problem_size, args, params)
+    answer = run_kernel("vector_add", kernel_string, problem_size, args, params, lang="HIP")
 
     assert numpy.allclose(answer[0], a+b, atol=1e-8)
\ No newline at end of file
diff --git a/examples/hip/vector_add.py b/examples/hip/vector_add.py
index d52c21b27..ed572877b 100644
--- a/examples/hip/vector_add.py
+++ b/examples/hip/vector_add.py
@@ -28,7 +28,7 @@ def tune():
     tune_params = dict()
     tune_params["block_size_x"] = [128+64*i for i in range(15)]
 
-    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params)
+    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP")
 
     # Store the tuning results in an output file
     store_output_file("vector_add.json", results, tune_params)
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index bc3cecf3e..890ef21a5 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -259,8 +259,6 @@ def delete_temp_file(filename):
 
 def detect_language(kernel_string):
     """attempt to detect language from the kernel_string"""
-    if "__device__" or "__host__" or "__global__ void" in kernel_string:
-        lang = "HIP"
     elif "__global__" in kernel_string:
         lang = "CUDA"
     elif "__kernel" in kernel_string:

From 0f66875337fdf5a9626492d21a46752ee8d4924c Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Tue, 4 Apr 2023 16:18:04 +0200
Subject: [PATCH 05/61] added (not tested) compile function to backend/hip.py

---
 kernel_tuner/backends/hip.py | 80 +++++++++++++++++++++++++++++++++++-
 kernel_tuner/util.py         |  2 +-
 2 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 610a7d7bc..6eeb64a81 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -1,15 +1,36 @@
 """This module contains all HIP specific kernel_tuner functions"""
 
 import numpy as np
+import ctypes as C
+from collections import namedtuple
 
 from kernel_tuner.backends.backend import GPUBackend
 
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
 try:
-    import pyhip as hip
+    from pyhip import hip, hiprtc
 except ImportError:
     hip = None
+    hiprtc = None
+
+dtype_map = {
+    "int8": C.c_int8,
+    "int16": C.c_int16,
+    "int32": C.c_int32,
+    "int64": C.c_int64,
+    "uint8": C.c_uint8,
+    "uint16": C.c_uint16,
+    "uint32": C.c_uint32,
+    "uint64": C.c_uint64,
+    "float32": C.c_float,
+    "float64": C.c_double,
+}
+
+# This represents an individual kernel argument.
+# It contains a numpy object (ndarray or number) and a ctypes object with a copy
+# of the argument data. For an ndarray, the ctypes object is a wrapper for the ndarray's data.
+Argument = namedtuple("Argument", ["numpy", "ctypes"])
 
 class HipFunctions(GPUBackend):
     """Class that groups the HIP functions on maintains state about the device"""
@@ -27,4 +48,59 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
 
         :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
         :type iterations: int
-        """
\ No newline at end of file
+        """
+
+    def ready_argument_list(self, arguments):
+        """ready argument list to be passed to the HIP function
+        :param arguments: List of arguments to be passed to the HIP function.
+            The order should match the argument list on the HIP function.
+            Allowed values are np.ndarray, and/or np.int32, np.float32, and so on.
+        :type arguments: list(numpy objects)
+        :returns: A list of arguments that can be passed to the HIP function.
+        :rtype: list(Argument)
+        """
+        ctype_args = [None for _ in arguments]
+
+        for i, arg in enumerate(arguments):
+            if not isinstance(arg, (np.ndarray, np.number)):
+                raise TypeError(
+                    "Argument is not numpy ndarray or numpy scalar %s" % type(arg)
+                )
+            dtype_str = str(arg.dtype)
+            if isinstance(arg, np.ndarray):
+                if dtype_str in dtype_map.keys():
+                    # In numpy <= 1.15, ndarray.ctypes.data_as does not itself keep a reference
+                    # to its underlying array, so we need to store a reference to arg.copy()
+                    # in the Argument object manually to avoid it being deleted.
+                    # (This changed in numpy > 1.15.)
+                    # data_ctypes = data.ctypes.data_as(C.POINTER(dtype_map[dtype_str]))
+                    data_ctypes = arg.ctypes.data_as(C.POINTER(dtype_map[dtype_str]))
+                else:
+                    raise TypeError("unknown dtype for ndarray")
+            elif isinstance(arg, np.generic):
+                data_ctypes = dtype_map[dtype_str](arg)
+            ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes)
+        return ctype_args
+    
+    def compile(self, kernel_instance):
+        """call the HIP compiler to compile the kernel, return the function
+        
+        :param kernel_instance: An object representing the specific instance of the tunable kernel
+            in the parameter space.
+        :type kernel_instance: kernel_tuner.core.KernelInstance
+        
+        :returns: An ctypes function that can be called directly.
+        :rtype: ctypes._FuncPtr
+        """
+
+        kernel_string = kernel_instance.kernel_string
+        kernel_name = kernel_instance.name
+        kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], [])
+        
+        device_properties = hip.hipGetDeviceProperties(0)
+        hiprtc.hiprtcCompileProgram(kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}'])
+        code = hiprtc.hiprtcGetCode(kernel_ptr)
+        module = hip.hipModuleLoadData(code)
+        kernel = hip.hipModuleGetFunction(module, kernel_name)
+        
+        return kernel
\ No newline at end of file
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index 890ef21a5..e10f788ea 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -259,7 +259,7 @@ def delete_temp_file(filename):
 
 def detect_language(kernel_string):
     """attempt to detect language from the kernel_string"""
-    elif "__global__" in kernel_string:
+    if "__global__" in kernel_string:
         lang = "CUDA"
     elif "__kernel" in kernel_string:
         lang = "OpenCL"

From 39700873a83ffc6b5e2f0d40cc624b8f2fa1ecae Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Tue, 4 Apr 2023 16:32:31 +0200
Subject: [PATCH 06/61] added (not tested) start_event and stop_event functions
 to backend/hip.py

---
 kernel_tuner/backends/hip.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 6eeb64a81..5f2a07356 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -50,6 +50,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         :type iterations: int
         """
 
+        # create a stream and events
+        self.stream = hip.hipStreamCreate()
+        self.start = hip.hipEventCreate()
+        self.end = hip.hipEventCreate()
+
     def ready_argument_list(self, arguments):
         """ready argument list to be passed to the HIP function
         :param arguments: List of arguments to be passed to the HIP function.
@@ -103,4 +108,14 @@ def compile(self, kernel_instance):
         module = hip.hipModuleLoadData(code)
         kernel = hip.hipModuleGetFunction(module, kernel_name)
         
-        return kernel
\ No newline at end of file
+        return kernel
+    
+    def start_event(self):
+        """Records the event that marks the start of a measurement"""
+        self.start = hip.hipEventCreate()
+        hip.hipEventRecord(self.start, self.stream)
+
+    def stop_event(self):
+        """Records the event that marks the end of a measurement"""
+        self.end = hip.hipEventCreate()
+        hip.hipEventRecord(self.end, self.stream)

From f92471cc6659f365ff3a8e5b3621326079cae423 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Tue, 4 Apr 2023 17:41:48 +0200
Subject: [PATCH 07/61] added (not tested) kernel_finished function to
 backend/hip.py

---
 kernel_tuner/backends/hip.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 5f2a07356..2786e67fe 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -2,10 +2,17 @@
 
 import numpy as np
 import ctypes as C
+import ctypes.util
 from collections import namedtuple
 
 from kernel_tuner.backends.backend import GPUBackend
 
+try:
+    # Load the HIP runtime library
+    hip_lib = C.cdll.LoadLibrary(C.util.find_library('hip'))
+except ImportError:
+    hip_lib = None
+
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
 try:
@@ -119,3 +126,22 @@ def stop_event(self):
         """Records the event that marks the end of a measurement"""
         self.end = hip.hipEventCreate()
         hip.hipEventRecord(self.end, self.stream)
+
+    def kernel_finished(self):
+        """Returns True if the kernel has finished, False otherwise"""
+        
+        # Define the argument and return types for hipEventQuery()
+        hip_lib.hipEventQuery.argtypes = [C.c_void_p]
+        hip_lib.hipEventQuery.restype = C.c_int
+
+        # Query the status of the event
+        status = hip_lib.hipEventQuery(self.end)
+        if status == hip_lib.hipSuccess:
+            # Kernel has finished
+            return True
+        elif status == hip_lib.hipErrorNotReady:
+            # Kernel is still running
+            return False
+        else:
+            # Error occurred
+            return False

From 3425e2dfd58c85dcfcfc2b6f8c705f0ea4ddc75f Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Wed, 5 Apr 2023 11:01:17 +0200
Subject: [PATCH 08/61] changed how pyhip is imported (via import sys with
 specific path of system)

---
 kernel_tuner/backends/hip.py | 68 +++++++++++++++++++++++++++---------
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 2786e67fe..cb866f638 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -1,7 +1,7 @@
 """This module contains all HIP specific kernel_tuner functions"""
 
 import numpy as np
-import ctypes as C
+import ctypes
 import ctypes.util
 from collections import namedtuple
 
@@ -9,29 +9,33 @@
 
 try:
     # Load the HIP runtime library
-    hip_lib = C.cdll.LoadLibrary(C.util.find_library('hip'))
+    hip_lib = ctypes.cdll.LoadLibrary(ctypes.util.find_library('hip'))
 except ImportError:
+    print("Not able to import hip c lib")
     hip_lib = None
 
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
 try:
+    import sys 
+    sys.path.insert(0, '/home/mli940/PyHIP')
     from pyhip import hip, hiprtc
 except ImportError:
+    print("Not able to import pyhip")
     hip = None
     hiprtc = None
 
 dtype_map = {
-    "int8": C.c_int8,
-    "int16": C.c_int16,
-    "int32": C.c_int32,
-    "int64": C.c_int64,
-    "uint8": C.c_uint8,
-    "uint16": C.c_uint16,
-    "uint32": C.c_uint32,
-    "uint64": C.c_uint64,
-    "float32": C.c_float,
-    "float64": C.c_double,
+    "int8": ctypes.c_int8,
+    "int16": ctypes.c_int16,
+    "int32": ctypes.c_int32,
+    "int64": ctypes.c_int64,
+    "uint8": ctypes.c_uint8,
+    "uint16": ctypes.c_uint16,
+    "uint32": ctypes.c_uint32,
+    "uint64": ctypes.c_uint64,
+    "float32": ctypes.c_float,
+    "float64": ctypes.c_double,
 }
 
 # This represents an individual kernel argument.
@@ -85,8 +89,8 @@ def ready_argument_list(self, arguments):
                     # to its underlying array, so we need to store a reference to arg.copy()
                     # in the Argument object manually to avoid it being deleted.
                     # (This changed in numpy > 1.15.)
-                    # data_ctypes = data.ctypes.data_as(C.POINTER(dtype_map[dtype_str]))
-                    data_ctypes = arg.ctypes.data_as(C.POINTER(dtype_map[dtype_str]))
+                    # data_ctypes = data.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
+                    data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
                 else:
                     raise TypeError("unknown dtype for ndarray")
             elif isinstance(arg, np.generic):
@@ -131,8 +135,8 @@ def kernel_finished(self):
         """Returns True if the kernel has finished, False otherwise"""
         
         # Define the argument and return types for hipEventQuery()
-        hip_lib.hipEventQuery.argtypes = [C.c_void_p]
-        hip_lib.hipEventQuery.restype = C.c_int
+        hip_lib.hipEventQuery.argtypes = [ctypes.c_void_p]
+        hip_lib.hipEventQuery.restype = ctypes.c_int
 
         # Query the status of the event
         status = hip_lib.hipEventQuery(self.end)
@@ -145,3 +149,35 @@ def kernel_finished(self):
         else:
             # Error occurred
             return False
+
+    def synchronize(self):
+        """This method must implement a barrier that halts execution until device has finished its tasks."""
+        pass
+
+    def run_kernel(self, func, gpu_args, threads, grid, stream):
+        """This method must implement the execution of the kernel on the device."""
+        pass
+
+    def memset(self, allocation, value, size):
+        """This method must implement setting the memory to a value on the device."""
+        pass
+
+    def memcpy_dtoh(self, dest, src):
+        """This method must implement a device to host copy."""
+        pass
+
+    def memcpy_htod(self, dest, src):
+        """This method must implement a host to device copy."""
+        pass
+
+    def copy_constant_memory_args(self, cmem_args):
+        """This method must implement the allocation and copy of constant memory to the GPU."""
+        pass
+
+    def copy_shared_memory_args(self, smem_args):
+        """This method must implement the dynamic allocation of shared memory on the GPU."""
+        pass
+
+    def copy_texture_memory_args(self, texmem_args):
+        """This method must implement the allocation and copy of texture memory to the GPU."""
+        pass

From ef476e193e948c007d55e911c79fc51caccc3b9f Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Wed, 5 Apr 2023 16:55:00 +0200
Subject: [PATCH 09/61] added (not tested) to HIPFunctions arguments env,
 max_threads, name

---
 kernel_tuner/backends/hip.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index cb866f638..0e7f4d070 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -61,6 +61,13 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         :type iterations: int
         """
 
+        hipProps = hip.hipDeviceProperties()
+        self.name = hipProps.name
+        self.max_threads = hipProps.maxThreadsPerBlock
+
+        env = dict()
+        self.env = env
+
         # create a stream and events
         self.stream = hip.hipStreamCreate()
         self.start = hip.hipEventCreate()
@@ -181,3 +188,5 @@ def copy_shared_memory_args(self, smem_args):
     def copy_texture_memory_args(self, texmem_args):
         """This method must implement the allocation and copy of texture memory to the GPU."""
         pass
+
+    units = {"time": "ms"}

From e120aca32e9ec9b8a0fee0b2cb607a6a965dfa85 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 6 Apr 2023 12:56:12 +0200
Subject: [PATCH 10/61] self.name of HIPFunctions works

---
 kernel_tuner/backends/hip.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 0e7f4d070..8cdbb9f78 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -60,9 +60,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
         :type iterations: int
         """
-
-        hipProps = hip.hipDeviceProperties()
-        self.name = hipProps.name
+        hip.hipInit(0)
+        
+        hipProps = hip.hipGetDeviceProperties(device)
+        self.name = hipProps._name.decode('utf-8')
         self.max_threads = hipProps.maxThreadsPerBlock
 
         env = dict()

From 5cdf495edc79197261872c19c528bca5fb558ade Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 6 Apr 2023 13:06:59 +0200
Subject: [PATCH 11/61] import of pyhip based on PYHIP_PATH

---
 kernel_tuner/backends/hip.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 8cdbb9f78..12c2806d8 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -4,6 +4,8 @@
 import ctypes
 import ctypes.util
 from collections import namedtuple
+import os
+import sys
 
 from kernel_tuner.backends.backend import GPUBackend
 
@@ -16,12 +18,13 @@
 
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
-try:
-    import sys 
-    sys.path.insert(0, '/home/mli940/PyHIP')
+PYHIP_PATH = os.environ.get('PYHIP_PATH')  # get the PYHIP_PATH environment variable
+try: 
+    if PYHIP_PATH is not None:
+        sys.path.insert(0, PYHIP_PATH)
     from pyhip import hip, hiprtc
 except ImportError:
-    print("Not able to import pyhip")
+    print("Not able to import pyhip, check if PYHIP_PATH is set")
     hip = None
     hiprtc = None
 

From d15060259efb8a3bb73982ec6162bb695794c4b1 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 6 Apr 2023 17:49:16 +0200
Subject: [PATCH 12/61] added test/test_hip_functions.py with
 test_ready_argument_list (not sure of its correctness) and test_compile

---
 test/test_hip_functions.py | 58 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 test/test_hip_functions.py

diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
new file mode 100644
index 000000000..132c24a77
--- /dev/null
+++ b/test/test_hip_functions.py
@@ -0,0 +1,58 @@
+import numpy as np
+import ctypes
+from .context import skip_if_no_pyhip
+
+import pytest
+from kernel_tuner.backends import hip as kt_hip
+from kernel_tuner.core import KernelSource, KernelInstance
+
+try: 
+    from pyhip import hip, hiprtc
+    hip_present = True
+except ImportError:
+    pass
+
+@skip_if_no_pyhip
+def test_ready_argument_list():
+
+    size = 1000
+    a = np.int32(75)
+    b = np.random.randn(size).astype(np.float32)
+    c = np.bool_(True)
+    d = np.zeros_like(b)
+
+    arguments = [d, a, b, c]
+
+    dev = kt_hip.HipFunctions(0)
+    gpu_args = dev.ready_argument_list(arguments)
+
+    assert isinstance(gpu_args[0], ctypes.POINTER(ctypes.c_float))
+    assert isinstance(gpu_args[1], ctypes.c_int32)
+    assert isinstance(gpu_args[2], ctypes.POINTER(ctypes.c_float))
+    assert isinstance(gpu_args[3], ctypes.c_bool)
+
+@skip_if_no_pyhip
+def test_compile():
+
+    kernel_string = """
+    __global__ void vector_add(float *c, float *a, float *b, int n) {
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i<n) {
+            c[i] = a[i] + b[i];
+        }
+    }
+    """
+
+    kernel_name = "vector_add"
+    kernel_sources = KernelSource(kernel_name, kernel_string, "cuda")
+    kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])
+    dev = kt_hip.HipFunctions(0)
+    try:
+        dev.compile(kernel_instance)
+    except Exception as e:
+        pytest.fail("Did not expect any exception:" + str(e))
+
+
+def dummy_func(a, b, block=0, grid=0, stream=None, shared=0, texrefs=None):
+    pass
+

From 44b846f70cceb1c883e861f070292b63ddc5fd38 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 6 Apr 2023 17:49:55 +0200
Subject: [PATCH 13/61] added hip related code for tests

---
 test/context.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/test/context.py b/test/context.py
index 71c0616a5..4842b9c18 100644
--- a/test/context.py
+++ b/test/context.py
@@ -1,6 +1,7 @@
 import sys
 import subprocess
 import shutil
+import os
 
 import pytest
 
@@ -38,6 +39,15 @@
 except Exception:
     cuda_present = False
 
+PYHIP_PATH = os.environ.get('PYHIP_PATH')  # get the PYHIP_PATH environment variable
+try: 
+    if PYHIP_PATH is not None:
+        sys.path.insert(0, PYHIP_PATH)
+    from pyhip import hip, hiprtc
+    pyhip_present = True
+except ImportError:
+    pyhip_present = False
+
 skip_if_no_pycuda = pytest.mark.skipif(not pycuda_present, reason="PyCuda not installed or no CUDA device detected")
 skip_if_no_cupy = pytest.mark.skipif(not cupy_present, reason="CuPy not installed or no CUDA device detected")
 skip_if_no_cuda = pytest.mark.skipif(not cuda_present, reason="NVIDIA CUDA not installed")
@@ -45,6 +55,7 @@
 skip_if_no_gcc = pytest.mark.skipif(not gcc_present, reason="No gcc on PATH")
 skip_if_no_gfortran = pytest.mark.skipif(not gfortran_present, reason="No gfortran on PATH")
 skip_if_no_openmp = pytest.mark.skipif(not openmp_present, reason="No OpenMP found")
+skip_if_no_pyhip = pytest.mark.skipif(not pyhip_present, reason="No PyHIP found")
 
 
 def skip_backend(backend: str):

From 9f691d0fd71254c8a2d0f7b4a90ff6e43daa1d61 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 6 Apr 2023 17:51:37 +0200
Subject: [PATCH 14/61] kernel_tuner/backends/hip.py::compile passes
 test/test_hip_functions.py::test_compile

---
 kernel_tuner/backends/hip.py | 37 ++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 12c2806d8..85e01d17f 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -19,12 +19,10 @@
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
 PYHIP_PATH = os.environ.get('PYHIP_PATH')  # get the PYHIP_PATH environment variable
-try: 
-    if PYHIP_PATH is not None:
-        sys.path.insert(0, PYHIP_PATH)
+try:
     from pyhip import hip, hiprtc
 except ImportError:
-    print("Not able to import pyhip, check if PYHIP_PATH is set")
+    print("Not able to import pyhip, check if PYTHONPATH includes PyHIP")
     hip = None
     hiprtc = None
 
@@ -68,6 +66,8 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         hipProps = hip.hipGetDeviceProperties(device)
         self.name = hipProps._name.decode('utf-8')
         self.max_threads = hipProps.maxThreadsPerBlock
+        self.device = device
+        self.compiler_options = compiler_options
 
         env = dict()
         self.env = env
@@ -89,10 +89,6 @@ def ready_argument_list(self, arguments):
         ctype_args = [None for _ in arguments]
 
         for i, arg in enumerate(arguments):
-            if not isinstance(arg, (np.ndarray, np.number)):
-                raise TypeError(
-                    "Argument is not numpy ndarray or numpy scalar %s" % type(arg)
-                )
             dtype_str = str(arg.dtype)
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():
@@ -104,8 +100,10 @@ def ready_argument_list(self, arguments):
                     data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
                 else:
                     raise TypeError("unknown dtype for ndarray")
+            elif isinstance(arg, np.bool_):
+                data_ctypes = ctypes.c_bool(arg)
             elif isinstance(arg, np.generic):
-                data_ctypes = dtype_map[dtype_str](arg)
+                data_ctypes = dtype_map[dtype_str](arg)          
             ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes)
         return ctype_args
     
@@ -119,13 +117,28 @@ def compile(self, kernel_instance):
         :returns: An ctypes function that can be called directly.
         :rtype: ctypes._FuncPtr
         """
-
         kernel_string = kernel_instance.kernel_string
         kernel_name = kernel_instance.name
+
+        # if filename is known, use that one
+        suffix = kernel_instance.kernel_source.get_user_suffix()
+
+        if suffix is None:
+            # select right suffix based on compiler
+            suffix = ".cc"
+
+        if ".c" in suffix and 'extern "C"' not in kernel_string:
+            kernel_string = 'extern "C" {\n' + kernel_string + "\n}"
+
         kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], [])
         
-        device_properties = hip.hipGetDeviceProperties(0)
-        hiprtc.hiprtcCompileProgram(kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}'])
+        device_properties = hip.hipGetDeviceProperties(self.device)
+        plat = hip.hipGetPlatformName()
+        if plat == "amd":
+            hiprtc.hiprtcCompileProgram(
+                kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}'])
+        else:
+            hiprtc.hiprtcCompileProgram(kernel_ptr, [])
         code = hiprtc.hiprtcGetCode(kernel_ptr)
         module = hip.hipModuleLoadData(code)
         kernel = hip.hipModuleGetFunction(module, kernel_name)

From 52ececf4440bddf7d143fb418ac888d8eafe72f2 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Fri, 7 Apr 2023 14:58:40 +0200
Subject: [PATCH 15/61] ready_argument_list passes partialy due to diffiulty
 with comparing ctypes in test (not sure whats wrong yet)

---
 kernel_tuner/backends/hip.py | 40 +++++++++++++++++++-----------------
 test/test_hip_functions.py   | 10 +++++----
 2 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 85e01d17f..d56480104 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -27,6 +27,7 @@
     hiprtc = None
 
 dtype_map = {
+    "bool": ctypes.c_bool,
     "int8": ctypes.c_int8,
     "int16": ctypes.c_int16,
     "int32": ctypes.c_int32,
@@ -39,11 +40,6 @@
     "float64": ctypes.c_double,
 }
 
-# This represents an individual kernel argument.
-# It contains a numpy object (ndarray or number) and a ctypes object with a copy
-# of the argument data. For an ndarray, the ctypes object is a wrapper for the ndarray's data.
-Argument = namedtuple("Argument", ["numpy", "ctypes"])
-
 class HipFunctions(GPUBackend):
     """Class that groups the HIP functions on maintains state about the device"""
 
@@ -83,29 +79,35 @@ def ready_argument_list(self, arguments):
             The order should match the argument list on the HIP function.
             Allowed values are np.ndarray, and/or np.int32, np.float32, and so on.
         :type arguments: list(numpy objects)
-        :returns: A list of arguments that can be passed to the HIP function.
-        :rtype: list(Argument)
+        :returns: A ctypes structure that can be passed to the HIP function.
+        :rtype: ctypes.Structure
         """
-        ctype_args = [None for _ in arguments]
 
+        ctype_args = []
         for i, arg in enumerate(arguments):
             dtype_str = str(arg.dtype)
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():
-                    # In numpy <= 1.15, ndarray.ctypes.data_as does not itself keep a reference
-                    # to its underlying array, so we need to store a reference to arg.copy()
-                    # in the Argument object manually to avoid it being deleted.
-                    # (This changed in numpy > 1.15.)
-                    # data_ctypes = data.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
                     data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
                 else:
-                    raise TypeError("unknown dtype for ndarray")
-            elif isinstance(arg, np.bool_):
-                data_ctypes = ctypes.c_bool(arg)
+                    raise TypeError("unknown dtype for ndarray")        
             elif isinstance(arg, np.generic):
-                data_ctypes = dtype_map[dtype_str](arg)          
-            ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes)
-        return ctype_args
+                data_ctypes = dtype_map[dtype_str](arg)
+            ctype_args.append(data_ctypes)  
+        
+        # Determine the number of fields in the structure
+        num_fields = len(ctype_args)
+        # Determine the types of the fields in the structure
+        field_types = [type(x) for x in ctype_args]
+        # Define a new ctypes structure with the inferred layout
+        class ArgListStructure(ctypes.Structure):
+            _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)]
+        ctypes_struct = ArgListStructure()
+        # Populate the fields of the structure with values from the list
+        for i, value in enumerate(ctype_args):
+            setattr(ctypes_struct, f'field{i}', value)
+        
+        return ctypes_struct
     
     def compile(self, kernel_instance):
         """call the HIP compiler to compile the kernel, return the function
diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
index 132c24a77..573ea93c5 100644
--- a/test/test_hip_functions.py
+++ b/test/test_hip_functions.py
@@ -26,10 +26,12 @@ def test_ready_argument_list():
     dev = kt_hip.HipFunctions(0)
     gpu_args = dev.ready_argument_list(arguments)
 
-    assert isinstance(gpu_args[0], ctypes.POINTER(ctypes.c_float))
-    assert isinstance(gpu_args[1], ctypes.c_int32)
-    assert isinstance(gpu_args[2], ctypes.POINTER(ctypes.c_float))
-    assert isinstance(gpu_args[3], ctypes.c_bool)
+    assert isinstance(gpu_args, ctypes.Structure)
+
+    assert isinstance(gpu_args.field0, ctypes.POINTER(ctypes.c_float))
+    #assert isinstance(gpu_args.field1, ctypes.c_int32)
+    assert isinstance(gpu_args.field2, ctypes.POINTER(ctypes.c_float))
+    #assert isinstance(gpu_args.field3, ctypes.c_bool)
 
 @skip_if_no_pyhip
 def test_compile():

From d3ae6bad3023f3fd94293f94f150ef263fd5f148 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Fri, 7 Apr 2023 18:34:04 +0200
Subject: [PATCH 16/61] added (not tested) run_kernel, memset, memcpy_dtoh,
 memcpy_htod

---
 examples/hip/vector_add.py   |  3 +-
 kernel_tuner/backends/hip.py | 94 ++++++++++++++++++++++++++++++------
 2 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/examples/hip/vector_add.py b/examples/hip/vector_add.py
index ed572877b..72397b628 100644
--- a/examples/hip/vector_add.py
+++ b/examples/hip/vector_add.py
@@ -4,6 +4,7 @@
 import numpy
 from kernel_tuner import tune_kernel
 from kernel_tuner.file_utils import store_output_file, store_metadata_file
+import logging
 
 def tune():
 
@@ -28,7 +29,7 @@ def tune():
     tune_params = dict()
     tune_params["block_size_x"] = [128+64*i for i in range(15)]
 
-    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP")
+    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP", log=logging.DEBUG)
 
     # Store the tuning results in an output file
     store_output_file("vector_add.json", results, tune_params)
diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index d56480104..394e91636 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -6,6 +6,7 @@
 from collections import namedtuple
 import os
 import sys
+import logging
 
 from kernel_tuner.backends.backend import GPUBackend
 
@@ -57,6 +58,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
         :type iterations: int
         """
+        logging.debug("HipFunction instantiated")
         hip.hipInit(0)
         
         hipProps = hip.hipGetDeviceProperties(device)
@@ -73,6 +75,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.start = hip.hipEventCreate()
         self.end = hip.hipEventCreate()
 
+        self.smem_size = 0
+
+        env["device_name"] = self.name
+
     def ready_argument_list(self, arguments):
         """ready argument list to be passed to the HIP function
         :param arguments: List of arguments to be passed to the HIP function.
@@ -82,7 +88,7 @@ def ready_argument_list(self, arguments):
         :returns: A ctypes structure that can be passed to the HIP function.
         :rtype: ctypes.Structure
         """
-
+        logging.debug("HipFunction ready_argument_list called")
         ctype_args = []
         for i, arg in enumerate(arguments):
             dtype_str = str(arg.dtype)
@@ -95,8 +101,6 @@ def ready_argument_list(self, arguments):
                 data_ctypes = dtype_map[dtype_str](arg)
             ctype_args.append(data_ctypes)  
         
-        # Determine the number of fields in the structure
-        num_fields = len(ctype_args)
         # Determine the types of the fields in the structure
         field_types = [type(x) for x in ctype_args]
         # Define a new ctypes structure with the inferred layout
@@ -119,6 +123,7 @@ def compile(self, kernel_instance):
         :returns: An ctypes function that can be called directly.
         :rtype: ctypes._FuncPtr
         """
+        logging.debug("HipFunction compile called")
         kernel_string = kernel_instance.kernel_string
         kernel_name = kernel_instance.name
 
@@ -136,6 +141,7 @@ def compile(self, kernel_instance):
         
         device_properties = hip.hipGetDeviceProperties(self.device)
         plat = hip.hipGetPlatformName()
+        #Compile based on device
         if plat == "amd":
             hiprtc.hiprtcCompileProgram(
                 kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}'])
@@ -149,16 +155,17 @@ def compile(self, kernel_instance):
     
     def start_event(self):
         """Records the event that marks the start of a measurement"""
-        self.start = hip.hipEventCreate()
+        logging.debug("HipFunction start_event called")
         hip.hipEventRecord(self.start, self.stream)
 
     def stop_event(self):
         """Records the event that marks the end of a measurement"""
-        self.end = hip.hipEventCreate()
+        logging.debug("HipFunction stop_event called")
         hip.hipEventRecord(self.end, self.stream)
 
     def kernel_finished(self):
         """Returns True if the kernel has finished, False otherwise"""
+        logging.debug("HipFunction kernel_finished called")
         
         # Define the argument and return types for hipEventQuery()
         hip_lib.hipEventQuery.argtypes = [ctypes.c_void_p]
@@ -177,35 +184,90 @@ def kernel_finished(self):
             return False
 
     def synchronize(self):
-        """This method must implement a barrier that halts execution until device has finished its tasks."""
+        """Halts execution until device has finished its tasks"""
+        logging.debug("HipFunction synchronize called")
+        hip.hipEventSynchronize(self.end)
         pass
 
     def run_kernel(self, func, gpu_args, threads, grid, stream):
-        """This method must implement the execution of the kernel on the device."""
+        """runs the HIP kernel passed as 'func'
+
+        :param func: A PyHIP kernel compiled for this specific kernel configuration
+        :type func: ctypes pionter
+
+        :param gpu_args: A list of arguments to the kernel, order should match the
+            order in the code. Allowed values are either variables in global memory
+            or single values passed by value.
+        :type gpu_args: ctypes.Structure
+
+        :param threads: A tuple listing the number of threads in each dimension of
+            the thread block
+        :type threads: tuple(int, int, int)
+
+        :param grid: A tuple listing the number of thread blocks in each dimension
+            of the grid
+        :type grid: tuple(int, int)
+        """
+        logging.debug("HipFunction run_kernel called")
+        hip.hipModuleLaunchKernel(func, 
+                                  grid[0], grid[1], grid[2], grid[3], 
+                                  threads[0], threads[1], threads[2],
+                                  self.smem_size,
+                                  stream,
+                                  gpu_args)
         pass
 
     def memset(self, allocation, value, size):
-        """This method must implement setting the memory to a value on the device."""
-        pass
+        """set the memory in allocation to the value in value
+
+        :param allocation: An Argument for some memory allocation unit
+        :type allocation: ctypes ptr
+
+        :param value: The value to set the memory to
+        :type value: a single 8-bit unsigned int
+
+        :param size: The size of to the allocation unit in bytes
+        :type size: int
+        """
+        logging.debug("HipFunction memset called")
+        allocation.contents.value = value
 
     def memcpy_dtoh(self, dest, src):
-        """This method must implement a device to host copy."""
-        pass
+        """perform a device to host memory copy
+
+        :param dest: A numpy array in host memory to store the data
+        :type dest: numpy.ndarray
+
+        :param src: A GPU memory allocation unit
+        :type src: ctypes ptr
+        """
+        logging.debug("HipFunction memcpy_dtoh called")
+        dtype_str = str(src.dtype)
+        hip.hipMemcpy_dtoh(ctypes.byref(dest.ctypes), src, ctypes.sizeof(dtype_map[dtype_str]) * src.size)
 
     def memcpy_htod(self, dest, src):
-        """This method must implement a host to device copy."""
-        pass
+        """perform a host to device memory copy
+
+        :param dest: A GPU memory allocation unit
+        :type dest: ctypes ptr
+
+        :param src: A numpy array in host memory to store the data
+        :type src: numpy.ndarray
+        """
+        logging.debug("HipFunction memcpy_htod called")
+        dtype_str = str(src.dtype)
+        hip.hipMemcpy_htod(dest, ctypes.byref(src.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * src.size)
 
     def copy_constant_memory_args(self, cmem_args):
         """This method must implement the allocation and copy of constant memory to the GPU."""
-        pass
+        logging.debug("HipFunction copy_constant_memory_args called")
 
     def copy_shared_memory_args(self, smem_args):
         """This method must implement the dynamic allocation of shared memory on the GPU."""
-        pass
+        logging.debug("HipFunction copy_shared_memory_args called")
 
     def copy_texture_memory_args(self, texmem_args):
         """This method must implement the allocation and copy of texture memory to the GPU."""
-        pass
+        logging.debug("HipFunction copy_texture_memory_args called")
 
     units = {"time": "ms"}

From cbe9997dd3fa950b6af81591a42db2fbf10e03f1 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Tue, 11 Apr 2023 16:00:59 +0200
Subject: [PATCH 17/61] not able to solve hipGetDeviceProperties issue, for now
 pached manually

---
 kernel_tuner/backends/hip.py | 39 +++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 394e91636..9f6a36d62 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -10,12 +10,26 @@
 
 from kernel_tuner.backends.backend import GPUBackend
 
-try:
-    # Load the HIP runtime library
-    hip_lib = ctypes.cdll.LoadLibrary(ctypes.util.find_library('hip'))
-except ImportError:
-    print("Not able to import hip c lib")
-    hip_lib = None
+_libhip = None
+_hip_platform_name = ''
+
+# Try to find amd hip library, if not found, fallback to nvhip library
+if 'linux' in sys.platform:
+    try:
+        _libhip_libname = 'libamdhip64.so'
+        _libhip = ctypes.cdll.LoadLibrary(_libhip_libname)
+        _hip_platform_name = 'amd'
+    except:
+        try:
+            _libhip_libname = 'libnvhip64.so'
+            _libhip = ctypes.cdll.LoadLibrary(_libhip_libname)
+            _hip_platform_name = 'nvidia'
+        except:
+            raise RuntimeError(
+                'cant find libamdhip64.so or libnvhip64.so. make sure LD_LIBRARY_PATH is set')
+else:
+    # Currently we do not support windows
+    raise RuntimeError('Only linux is supported')
 
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
@@ -59,15 +73,24 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         :type iterations: int
         """
         logging.debug("HipFunction instantiated")
-        hip.hipInit(0)
+        #hip.hipInit(0)
+        #hip.hipSetDevice(device)
         
         hipProps = hip.hipGetDeviceProperties(device)
+        # Print out all the values
+        #for field_name, field_type in hipProps._fields_:
+        #    print(f"{field_name}: {getattr(hipProps, field_name)}")
+
         self.name = hipProps._name.decode('utf-8')
         self.max_threads = hipProps.maxThreadsPerBlock
+        print("self.max_threads: " + str(self.max_threads))
+        self.max_threads = 1024 # PATCH FOR NOW
+
         self.device = device
         self.compiler_options = compiler_options
 
         env = dict()
+        env["device_name"] = self.name
         self.env = env
 
         # create a stream and events
@@ -77,8 +100,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
 
         self.smem_size = 0
 
-        env["device_name"] = self.name
-
     def ready_argument_list(self, arguments):
         """ready argument list to be passed to the HIP function
         :param arguments: List of arguments to be passed to the HIP function.

From 13d93bd1b825887bee80a15170d8af284dedb773 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Tue, 11 Apr 2023 16:46:48 +0200
Subject: [PATCH 18/61] added (not tested) observer code

---
 kernel_tuner/backends/hip.py  | 14 +++++++++++---
 kernel_tuner/observers/hip.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 kernel_tuner/observers/hip.py

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 9f6a36d62..bc9f394bb 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -9,6 +9,7 @@
 import logging
 
 from kernel_tuner.backends.backend import GPUBackend
+from kernel_tuner.observers.hip import HipRuntimeObserver
 
 _libhip = None
 _hip_platform_name = ''
@@ -33,7 +34,6 @@
 
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
-PYHIP_PATH = os.environ.get('PYHIP_PATH')  # get the PYHIP_PATH environment variable
 try:
     from pyhip import hip, hiprtc
 except ImportError:
@@ -100,6 +100,12 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
 
         self.smem_size = 0
 
+        # setup observers
+        self.observers = observers or []
+        self.observers.append(HipRuntimeObserver(self))
+        for obs in self.observers:
+            obs.register_device(self)
+
     def ready_argument_list(self, arguments):
         """ready argument list to be passed to the HIP function
         :param arguments: List of arguments to be passed to the HIP function.
@@ -210,7 +216,7 @@ def synchronize(self):
         hip.hipEventSynchronize(self.end)
         pass
 
-    def run_kernel(self, func, gpu_args, threads, grid, stream):
+    def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         """runs the HIP kernel passed as 'func'
 
         :param func: A PyHIP kernel compiled for this specific kernel configuration
@@ -230,8 +236,10 @@ def run_kernel(self, func, gpu_args, threads, grid, stream):
         :type grid: tuple(int, int)
         """
         logging.debug("HipFunction run_kernel called")
+        if stream is None:
+            stream = self.stream
         hip.hipModuleLaunchKernel(func, 
-                                  grid[0], grid[1], grid[2], grid[3], 
+                                  grid[0], grid[1], grid[2], 
                                   threads[0], threads[1], threads[2],
                                   self.smem_size,
                                   stream,
diff --git a/kernel_tuner/observers/hip.py b/kernel_tuner/observers/hip.py
new file mode 100644
index 000000000..4c7704aa8
--- /dev/null
+++ b/kernel_tuner/observers/hip.py
@@ -0,0 +1,31 @@
+import numpy as np
+
+from kernel_tuner.observers.observer import BenchmarkObserver
+
+try:
+    from pyhip import hip, hiprtc
+except ImportError:
+    print("Not able to import pyhip, check if PYTHONPATH includes PyHIP")
+    hip = None
+    hiprtc = None
+
+
+class HipRuntimeObserver(BenchmarkObserver):
+    """Observer that measures time using CUDA events during benchmarking"""
+
+    def __init__(self, dev):
+        self.dev = dev
+        self.stream = dev.stream
+        self.start = dev.start
+        self.end = dev.end
+        self.times = []
+
+    def after_finish(self):
+        # Time is measured in milliseconds
+        EventElapsedTime = hip.hipEventElapsedTime(self.start, self.end)
+        self.times.append(EventElapsedTime.value)
+
+    def get_results(self):
+        results = {"time": np.average(self.times), "times": self.times.copy()}
+        self.times = []
+        return results

From d9512ac0ebd6dbcebab5e9769b94ccf94fc48a06 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Wed, 12 Apr 2023 10:30:18 +0200
Subject: [PATCH 19/61] added (not tested) observer code

---
 kernel_tuner/backends/hip.py | 27 ++++++++++++++++++---------
 test/test_hip_functions.py   |  6 +++---
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index bc9f394bb..f29b2c69d 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -113,10 +113,11 @@ def ready_argument_list(self, arguments):
             Allowed values are np.ndarray, and/or np.int32, np.float32, and so on.
         :type arguments: list(numpy objects)
         :returns: A ctypes structure that can be passed to the HIP function.
-        :rtype: ctypes.Structure
-        """
+        :rtype: ctypes.Structure"""
+        
         logging.debug("HipFunction ready_argument_list called")
         ctype_args = []
+        data_ctypes = None
         for i, arg in enumerate(arguments):
             dtype_str = str(arg.dtype)
             if isinstance(arg, np.ndarray):
@@ -137,9 +138,11 @@ class ArgListStructure(ctypes.Structure):
         # Populate the fields of the structure with values from the list
         for i, value in enumerate(ctype_args):
             setattr(ctypes_struct, f'field{i}', value)
+            print(f'field{i} = {value} of {type(value)}')
         
         return ctypes_struct
     
+    
     def compile(self, kernel_instance):
         """call the HIP compiler to compile the kernel, return the function
         
@@ -195,15 +198,15 @@ def kernel_finished(self):
         logging.debug("HipFunction kernel_finished called")
         
         # Define the argument and return types for hipEventQuery()
-        hip_lib.hipEventQuery.argtypes = [ctypes.c_void_p]
-        hip_lib.hipEventQuery.restype = ctypes.c_int
+        _libhip.hipEventQuery.argtypes = [ctypes.c_void_p]
+        _libhip.hipEventQuery.restype = ctypes.c_int
 
         # Query the status of the event
-        status = hip_lib.hipEventQuery(self.end)
-        if status == hip_lib.hipSuccess:
+        status = _libhip.hipEventQuery(self.end)
+        if status == _libhip.hipSuccess:
             # Kernel has finished
             return True
-        elif status == hip_lib.hipErrorNotReady:
+        elif status == _libhip.hipErrorNotReady:
             # Kernel is still running
             return False
         else:
@@ -233,18 +236,23 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
 
         :param grid: A tuple listing the number of thread blocks in each dimension
             of the grid
-        :type grid: tuple(int, int)
+        :type grid: tuple(int, int, int)
         """
         logging.debug("HipFunction run_kernel called")
         if stream is None:
             stream = self.stream
+        print(func)
+        print(grid)
+        print(threads)
+        print(self.smem_size)
+        print(stream)
+        print(gpu_args)
         hip.hipModuleLaunchKernel(func, 
                                   grid[0], grid[1], grid[2], 
                                   threads[0], threads[1], threads[2],
                                   self.smem_size,
                                   stream,
                                   gpu_args)
-        pass
 
     def memset(self, allocation, value, size):
         """set the memory in allocation to the value in value
@@ -294,6 +302,7 @@ def copy_constant_memory_args(self, cmem_args):
     def copy_shared_memory_args(self, smem_args):
         """This method must implement the dynamic allocation of shared memory on the GPU."""
         logging.debug("HipFunction copy_shared_memory_args called")
+        self.smem_size = smem_args["size"]
 
     def copy_texture_memory_args(self, texmem_args):
         """This method must implement the allocation and copy of texture memory to the GPU."""
diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
index 573ea93c5..c0030c995 100644
--- a/test/test_hip_functions.py
+++ b/test/test_hip_functions.py
@@ -25,13 +25,13 @@ def test_ready_argument_list():
 
     dev = kt_hip.HipFunctions(0)
     gpu_args = dev.ready_argument_list(arguments)
-
+    
     assert isinstance(gpu_args, ctypes.Structure)
 
     assert isinstance(gpu_args.field0, ctypes.POINTER(ctypes.c_float))
-    #assert isinstance(gpu_args.field1, ctypes.c_int32)
+    assert isinstance(gpu_args.field1, ctypes.c_int32)
     assert isinstance(gpu_args.field2, ctypes.POINTER(ctypes.c_float))
-    #assert isinstance(gpu_args.field3, ctypes.c_bool)
+    assert isinstance(gpu_args.field3, ctypes.c_bool)
 
 @skip_if_no_pyhip
 def test_compile():

From 578dd3061286bad7f3a7359cc03e453fd19cbb23 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Wed, 12 Apr 2023 22:02:40 +0200
Subject: [PATCH 20/61] update

---
 kernel_tuner/backends/hip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index f29b2c69d..013eb3bc6 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -84,7 +84,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.name = hipProps._name.decode('utf-8')
         self.max_threads = hipProps.maxThreadsPerBlock
         print("self.max_threads: " + str(self.max_threads))
-        self.max_threads = 1024 # PATCH FOR NOW
+        #self.max_threads = 1024 # PATCH FOR NOW
 
         self.device = device
         self.compiler_options = compiler_options

From dc0b350c8f7f59e8b5131bb823d8e668f1c73b29 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 13 Apr 2023 10:38:27 +0200
Subject: [PATCH 21/61] debug updates

---
 kernel_tuner/backends/hip.py | 40 ++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 013eb3bc6..ef6a9529f 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -28,10 +28,19 @@
         except:
             raise RuntimeError(
                 'cant find libamdhip64.so or libnvhip64.so. make sure LD_LIBRARY_PATH is set')
+    
+    _libhiprtc_libname = 'libhiprtc.so'
+    _libhiprtc = None
+    try:
+        _libhiprtc = ctypes.cdll.LoadLibrary(_libhiprtc_libname)
+    except:
+        raise OSError('hiprtc library not found')
+    
 else:
     # Currently we do not support windows
     raise RuntimeError('Only linux is supported')
 
+
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
 try:
@@ -83,7 +92,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
 
         self.name = hipProps._name.decode('utf-8')
         self.max_threads = hipProps.maxThreadsPerBlock
-        print("self.max_threads: " + str(self.max_threads))
+        logging.debug("self.max_threads: " + str(self.max_threads))
         #self.max_threads = 1024 # PATCH FOR NOW
 
         self.device = device
@@ -106,6 +115,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         for obs in self.observers:
             obs.register_device(self)
 
+        # define arguments and return value ctypes for hipEventQuery
+        _libhip.hipEventQuery.restype = ctypes.c_int
+        _libhip.hipEventQuery.argtypes = ctypes.c_void_p
+
     def ready_argument_list(self, arguments):
         """ready argument list to be passed to the HIP function
         :param arguments: List of arguments to be passed to the HIP function.
@@ -138,7 +151,7 @@ class ArgListStructure(ctypes.Structure):
         # Populate the fields of the structure with values from the list
         for i, value in enumerate(ctype_args):
             setattr(ctypes_struct, f'field{i}', value)
-            print(f'field{i} = {value} of {type(value)}')
+            #print(f'field{i} = {value} of {type(value)}')
         
         return ctypes_struct
     
@@ -197,27 +210,20 @@ def kernel_finished(self):
         """Returns True if the kernel has finished, False otherwise"""
         logging.debug("HipFunction kernel_finished called")
         
-        # Define the argument and return types for hipEventQuery()
-        _libhip.hipEventQuery.argtypes = [ctypes.c_void_p]
-        _libhip.hipEventQuery.restype = ctypes.c_int
-
         # Query the status of the event
         status = _libhip.hipEventQuery(self.end)
-        if status == _libhip.hipSuccess:
-            # Kernel has finished
+        logging.debug(f'_libhip.hipEventQuery(self.end) = {status}')
+        if status == 34: # 34 = hipErrorNotReady
+            logging.debug("kernel finished")
             return True
-        elif status == _libhip.hipErrorNotReady:
-            # Kernel is still running
-            return False
         else:
-            # Error occurred
+            logging.debug("kernel not finished")
             return False
 
     def synchronize(self):
         """Halts execution until device has finished its tasks"""
         logging.debug("HipFunction synchronize called")
-        hip.hipEventSynchronize(self.end)
-        pass
+        status = hip.hipEventSynchronize(self.end)
 
     def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         """runs the HIP kernel passed as 'func'
@@ -241,12 +247,6 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         logging.debug("HipFunction run_kernel called")
         if stream is None:
             stream = self.stream
-        print(func)
-        print(grid)
-        print(threads)
-        print(self.smem_size)
-        print(stream)
-        print(gpu_args)
         hip.hipModuleLaunchKernel(func, 
                                   grid[0], grid[1], grid[2], 
                                   threads[0], threads[1], threads[2],

From 9732bf79fe14770d63363fecde727861967f2e98 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 13 Apr 2023 15:46:39 +0200
Subject: [PATCH 22/61] added mem calls to ready_argument_list

---
 kernel_tuner/backends/hip.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index ef6a9529f..5b9ead27a 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -36,11 +36,20 @@
     except:
         raise OSError('hiprtc library not found')
     
+    
+    _libhiprtc_libname = 'libhiprtc.so'
+    _libhiprtc = None
+    try:
+        _libhiprtc = ctypes.cdll.LoadLibrary(_libhiprtc_libname)
+    except:
+        raise OSError('hiprtc library not found')
+    
 else:
     # Currently we do not support windows
     raise RuntimeError('Only linux is supported')
 
 
+
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
 try:
@@ -115,10 +124,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         for obs in self.observers:
             obs.register_device(self)
 
-        # define arguments and return value ctypes for hipEventQuery
-        _libhip.hipEventQuery.restype = ctypes.c_int
-        _libhip.hipEventQuery.argtypes = ctypes.c_void_p
-
     def ready_argument_list(self, arguments):
         """ready argument list to be passed to the HIP function
         :param arguments: List of arguments to be passed to the HIP function.
@@ -135,7 +140,10 @@ def ready_argument_list(self, arguments):
             dtype_str = str(arg.dtype)
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():
+                    device_ptr = hip.hipMalloc(arg.nbytes)
                     data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
+                    print(data_ctypes)
+                    hip.hipMemcpy_htod(device_ptr, ctypes.byref(data_ctypes), arg.nbytes)
                 else:
                     raise TypeError("unknown dtype for ndarray")        
             elif isinstance(arg, np.generic):
@@ -147,11 +155,11 @@ def ready_argument_list(self, arguments):
         # Define a new ctypes structure with the inferred layout
         class ArgListStructure(ctypes.Structure):
             _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)]
-        ctypes_struct = ArgListStructure()
+        ctypes_struct = ArgListStructure(*ctype_args)
         # Populate the fields of the structure with values from the list
         for i, value in enumerate(ctype_args):
             setattr(ctypes_struct, f'field{i}', value)
-            #print(f'field{i} = {value} of {type(value)}')
+            print(f'field{i} = {value} of {type(value)}')
         
         return ctypes_struct
     

From 2b3dda6a1b69c29681a4eabc487478476a3ec33b Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 13 Apr 2023 15:52:35 +0200
Subject: [PATCH 23/61] updates

---
 kernel_tuner/backends/hip.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 5b9ead27a..769558377 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -36,20 +36,11 @@
     except:
         raise OSError('hiprtc library not found')
     
-    
-    _libhiprtc_libname = 'libhiprtc.so'
-    _libhiprtc = None
-    try:
-        _libhiprtc = ctypes.cdll.LoadLibrary(_libhiprtc_libname)
-    except:
-        raise OSError('hiprtc library not found')
-    
 else:
     # Currently we do not support windows
     raise RuntimeError('Only linux is supported')
 
 
-
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
 try:
@@ -124,6 +115,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         for obs in self.observers:
             obs.register_device(self)
 
+        # define arguments and return value ctypes for hipEventQuery
+        _libhip.hipEventQuery.restype = ctypes.c_int
+        _libhip.hipEventQuery.argtypes = [ctypes.c_void_p]
+
     def ready_argument_list(self, arguments):
         """ready argument list to be passed to the HIP function
         :param arguments: List of arguments to be passed to the HIP function.
@@ -157,9 +152,9 @@ class ArgListStructure(ctypes.Structure):
             _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)]
         ctypes_struct = ArgListStructure(*ctype_args)
         # Populate the fields of the structure with values from the list
-        for i, value in enumerate(ctype_args):
-            setattr(ctypes_struct, f'field{i}', value)
-            print(f'field{i} = {value} of {type(value)}')
+        #for i, value in enumerate(ctype_args):
+        #    setattr(ctypes_struct, f'field{i}', value)
+            #print(f'field{i} = {value} of {type(value)}')
         
         return ctypes_struct
     

From 5a89c85da44c1e794d6ac12bb478e6bf283bebf6 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 13 Apr 2023 16:49:00 +0200
Subject: [PATCH 24/61] corrected synchronize in backend/hip.py

---
 kernel_tuner/backends/hip.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 769558377..164055682 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -135,10 +135,14 @@ def ready_argument_list(self, arguments):
             dtype_str = str(arg.dtype)
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():
+                    print(f'dtype_stre = {dtype_str}')
+                    print(f'arg.size = {arg.size}')
+                    print(f'arg.nbytes = {arg.nbytes}')
                     device_ptr = hip.hipMalloc(arg.nbytes)
+                    print(f'device_ptr = {device_ptr}')
                     data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
-                    print(data_ctypes)
-                    hip.hipMemcpy_htod(device_ptr, ctypes.byref(data_ctypes), arg.nbytes)
+                    print(f'data_ctypes = {data_ctypes}')
+                    hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes)
                 else:
                     raise TypeError("unknown dtype for ndarray")        
             elif isinstance(arg, np.generic):
@@ -226,7 +230,7 @@ def kernel_finished(self):
     def synchronize(self):
         """Halts execution until device has finished its tasks"""
         logging.debug("HipFunction synchronize called")
-        status = hip.hipEventSynchronize(self.end)
+        status = hip.hipDeviceSynchronize()
 
     def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         """runs the HIP kernel passed as 'func'

From fc183d8a9156d476c15ce07b7bf63d4e5a9f016f Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 13 Apr 2023 16:58:18 +0200
Subject: [PATCH 25/61] clean up

---
 kernel_tuner/backends/hip.py | 44 ++++++++----------------------------
 1 file changed, 10 insertions(+), 34 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 164055682..d529ae020 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -82,18 +82,12 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         :type iterations: int
         """
         logging.debug("HipFunction instantiated")
-        #hip.hipInit(0)
-        #hip.hipSetDevice(device)
         
-        hipProps = hip.hipGetDeviceProperties(device)
-        # Print out all the values
-        #for field_name, field_type in hipProps._fields_:
-        #    print(f"{field_name}: {getattr(hipProps, field_name)}")
+        self.hipProps = hip.hipGetDeviceProperties(device)
 
-        self.name = hipProps._name.decode('utf-8')
-        self.max_threads = hipProps.maxThreadsPerBlock
+        self.name = self.hipProps._name.decode('utf-8')
+        self.max_threads = self.hipProps.maxThreadsPerBlock
         logging.debug("self.max_threads: " + str(self.max_threads))
-        #self.max_threads = 1024 # PATCH FOR NOW
 
         self.device = device
         self.compiler_options = compiler_options
@@ -131,17 +125,12 @@ def ready_argument_list(self, arguments):
         logging.debug("HipFunction ready_argument_list called")
         ctype_args = []
         data_ctypes = None
-        for i, arg in enumerate(arguments):
+        for arg in enumerate(arguments):
             dtype_str = str(arg.dtype)
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():
-                    print(f'dtype_stre = {dtype_str}')
-                    print(f'arg.size = {arg.size}')
-                    print(f'arg.nbytes = {arg.nbytes}')
                     device_ptr = hip.hipMalloc(arg.nbytes)
-                    print(f'device_ptr = {device_ptr}')
                     data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
-                    print(f'data_ctypes = {data_ctypes}')
                     hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes)
                 else:
                     raise TypeError("unknown dtype for ndarray")        
@@ -154,13 +143,8 @@ def ready_argument_list(self, arguments):
         # Define a new ctypes structure with the inferred layout
         class ArgListStructure(ctypes.Structure):
             _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)]
-        ctypes_struct = ArgListStructure(*ctype_args)
-        # Populate the fields of the structure with values from the list
-        #for i, value in enumerate(ctype_args):
-        #    setattr(ctypes_struct, f'field{i}', value)
-            #print(f'field{i} = {value} of {type(value)}')
         
-        return ctypes_struct
+        return ArgListStructure(*ctype_args)
     
     
     def compile(self, kernel_instance):
@@ -177,24 +161,16 @@ def compile(self, kernel_instance):
         kernel_string = kernel_instance.kernel_string
         kernel_name = kernel_instance.name
 
-        # if filename is known, use that one
-        suffix = kernel_instance.kernel_source.get_user_suffix()
-
-        if suffix is None:
-            # select right suffix based on compiler
-            suffix = ".cc"
-
-        if ".c" in suffix and 'extern "C"' not in kernel_string:
+        if 'extern "C"' not in kernel_string:
             kernel_string = 'extern "C" {\n' + kernel_string + "\n}"
 
         kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], [])
         
-        device_properties = hip.hipGetDeviceProperties(self.device)
         plat = hip.hipGetPlatformName()
         #Compile based on device
         if plat == "amd":
             hiprtc.hiprtcCompileProgram(
-                kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}'])
+                kernel_ptr, [f'--offload-arch={self.hipProps.gcnArchName}'])
         else:
             hiprtc.hiprtcCompileProgram(kernel_ptr, [])
         code = hiprtc.hiprtcGetCode(kernel_ptr)
@@ -220,7 +196,7 @@ def kernel_finished(self):
         # Query the status of the event
         status = _libhip.hipEventQuery(self.end)
         logging.debug(f'_libhip.hipEventQuery(self.end) = {status}')
-        if status == 34: # 34 = hipErrorNotReady
+        if status == 34: # 34 = hipErrorNotReady --> still have to look into this
             logging.debug("kernel finished")
             return True
         else:
@@ -230,7 +206,7 @@ def kernel_finished(self):
     def synchronize(self):
         """Halts execution until device has finished its tasks"""
         logging.debug("HipFunction synchronize called")
-        status = hip.hipDeviceSynchronize()
+        hip.hipDeviceSynchronize()
 
     def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         """runs the HIP kernel passed as 'func'
@@ -274,7 +250,7 @@ def memset(self, allocation, value, size):
         :type size: int
         """
         logging.debug("HipFunction memset called")
-        allocation.contents.value = value
+        allocation.contents.value = value # probably wrong, still have to look into this
 
     def memcpy_dtoh(self, dest, src):
         """perform a device to host memory copy

From 063dd151b42c2baea25ec55ee7097b067fdc2551 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 13 Apr 2023 17:13:01 +0200
Subject: [PATCH 26/61] resolved memory error caused by ready_argument_list

---
 kernel_tuner/backends/hip.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index d529ae020..8982b082a 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -64,6 +64,12 @@
     "float64": ctypes.c_double,
 }
 
+# define arguments and return value ctypes for hipEventQuery
+_libhip.hipEventQuery.restype = ctypes.c_int
+_libhip.hipEventQuery.argtypes = [ctypes.c_void_p]
+
+hipSuccess = 0
+
 class HipFunctions(GPUBackend):
     """Class that groups the HIP functions on maintains state about the device"""
 
@@ -109,9 +115,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         for obs in self.observers:
             obs.register_device(self)
 
-        # define arguments and return value ctypes for hipEventQuery
-        _libhip.hipEventQuery.restype = ctypes.c_int
-        _libhip.hipEventQuery.argtypes = [ctypes.c_void_p]
 
     def ready_argument_list(self, arguments):
         """ready argument list to be passed to the HIP function
@@ -132,11 +135,12 @@ def ready_argument_list(self, arguments):
                     device_ptr = hip.hipMalloc(arg.nbytes)
                     data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
                     hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes)
+                    ctype_args.append(device_ptr)
                 else:
                     raise TypeError("unknown dtype for ndarray")        
             elif isinstance(arg, np.generic):
                 data_ctypes = dtype_map[dtype_str](arg)
-            ctype_args.append(data_ctypes)  
+                ctype_args.append(data_ctypes)  
         
         # Determine the types of the fields in the structure
         field_types = [type(x) for x in ctype_args]
@@ -196,7 +200,7 @@ def kernel_finished(self):
         # Query the status of the event
         status = _libhip.hipEventQuery(self.end)
         logging.debug(f'_libhip.hipEventQuery(self.end) = {status}')
-        if status == 34: # 34 = hipErrorNotReady --> still have to look into this
+        if status == hipSuccess:
             logging.debug("kernel finished")
             return True
         else:

From 48db63881898039c4f9f703616e63886e01aaef6 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Thu, 13 Apr 2023 17:45:31 +0200
Subject: [PATCH 27/61] examples/hip/vector_add.py runs successfully on AMD

---
 kernel_tuner/backends/hip.py  | 6 ++----
 kernel_tuner/observers/hip.py | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 8982b082a..fb602d5cb 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -123,7 +123,8 @@ def ready_argument_list(self, arguments):
             Allowed values are np.ndarray, and/or np.int32, np.float32, and so on.
         :type arguments: list(numpy objects)
         :returns: A ctypes structure that can be passed to the HIP function.
-        :rtype: ctypes.Structure"""
+        :rtype: ctypes.Structure
+        """
         
         logging.debug("HipFunction ready_argument_list called")
         ctype_args = []
@@ -199,12 +200,9 @@ def kernel_finished(self):
         
         # Query the status of the event
         status = _libhip.hipEventQuery(self.end)
-        logging.debug(f'_libhip.hipEventQuery(self.end) = {status}')
         if status == hipSuccess:
-            logging.debug("kernel finished")
             return True
         else:
-            logging.debug("kernel not finished")
             return False
 
     def synchronize(self):
diff --git a/kernel_tuner/observers/hip.py b/kernel_tuner/observers/hip.py
index 4c7704aa8..72a3cb4fe 100644
--- a/kernel_tuner/observers/hip.py
+++ b/kernel_tuner/observers/hip.py
@@ -23,7 +23,7 @@ def __init__(self, dev):
     def after_finish(self):
         # Time is measured in milliseconds
         EventElapsedTime = hip.hipEventElapsedTime(self.start, self.end)
-        self.times.append(EventElapsedTime.value)
+        self.times.append(EventElapsedTime)
 
     def get_results(self):
         results = {"time": np.average(self.times), "times": self.times.copy()}

From e003d1c3dee2882b5cf8b6dc16045c675b5b38d0 Mon Sep 17 00:00:00 2001
From: Milo Lurati <mli940@fs0.das6.cs.vu.nl>
Date: Fri, 14 Apr 2023 09:51:26 +0200
Subject: [PATCH 28/61] examples/hip/vector_add.py runs successfully on AMD

---
 kernel_tuner/backends/hip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index fb602d5cb..ac679ab4e 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -129,7 +129,7 @@ def ready_argument_list(self, arguments):
         logging.debug("HipFunction ready_argument_list called")
         ctype_args = []
         data_ctypes = None
-        for arg in enumerate(arguments):
+        for arg in arguments:
             dtype_str = str(arg.dtype)
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():

From f6cc000cc2f62db5d8d66a8d65d0028f4ec5fc4d Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 14 Apr 2023 12:20:58 +0200
Subject: [PATCH 29/61] modified (not tested) memset, copy_constant_memory_args

---
 kernel_tuner/backends/hip.py | 44 ++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index ac679ab4e..8b96fc1df 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -64,9 +64,14 @@
     "float64": ctypes.c_double,
 }
 
-# define arguments and return value ctypes for hipEventQuery
+# define arguments and return value types of HIP functions
 _libhip.hipEventQuery.restype = ctypes.c_int
 _libhip.hipEventQuery.argtypes = [ctypes.c_void_p]
+_libhip.hipModuleGetGlobal.restype = ctypes.c_int
+_libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_void_p, ctypes.c_char_p]
+_libhip.hipMemset.restype = ctypes.c_int
+_libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
+
 
 hipSuccess = 0
 
@@ -108,6 +113,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.end = hip.hipEventCreate()
 
         self.smem_size = 0
+        self.current_module = None
 
         # setup observers
         self.observers = observers or []
@@ -125,12 +131,13 @@ def ready_argument_list(self, arguments):
         :returns: A ctypes structure that can be passed to the HIP function.
         :rtype: ctypes.Structure
         """
-        
         logging.debug("HipFunction ready_argument_list called")
+
         ctype_args = []
         data_ctypes = None
         for arg in arguments:
             dtype_str = str(arg.dtype)
+            # Allocate space on device for array and convert to ctypes
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():
                     device_ptr = hip.hipMalloc(arg.nbytes)
@@ -138,7 +145,8 @@ def ready_argument_list(self, arguments):
                     hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes)
                     ctype_args.append(device_ptr)
                 else:
-                    raise TypeError("unknown dtype for ndarray")        
+                    raise TypeError("unknown dtype for ndarray")  
+            # Convert valid non-array arguments to ctypes      
             elif isinstance(arg, np.generic):
                 data_ctypes = dtype_map[dtype_str](arg)
                 ctype_args.append(data_ctypes)  
@@ -180,6 +188,7 @@ def compile(self, kernel_instance):
             hiprtc.hiprtcCompileProgram(kernel_ptr, [])
         code = hiprtc.hiprtcGetCode(kernel_ptr)
         module = hip.hipModuleLoadData(code)
+        self.current_module = module
         kernel = hip.hipModuleGetFunction(module, kernel_name)
         
         return kernel
@@ -242,7 +251,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
     def memset(self, allocation, value, size):
         """set the memory in allocation to the value in value
 
-        :param allocation: An Argument for some memory allocation unit
+        :param allocation: A GPU memory allocation unit
         :type allocation: ctypes ptr
 
         :param value: The value to set the memory to
@@ -250,9 +259,12 @@ def memset(self, allocation, value, size):
 
         :param size: The size of to the allocation unit in bytes
         :type size: int
+
         """
-        logging.debug("HipFunction memset called")
-        allocation.contents.value = value # probably wrong, still have to look into this
+        ctypes_value = ctypes.c_int(value)
+        ctypes_size = ctypes.c_size_t(size)
+        status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size)
+        hip.hipCheckStatus(status)
 
     def memcpy_dtoh(self, dest, src):
         """perform a device to host memory copy
@@ -281,8 +293,25 @@ def memcpy_htod(self, dest, src):
         hip.hipMemcpy_htod(dest, ctypes.byref(src.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * src.size)
 
     def copy_constant_memory_args(self, cmem_args):
-        """This method must implement the allocation and copy of constant memory to the GPU."""
+        """adds constant memory arguments to the most recently compiled module
+
+        :param cmem_args: A dictionary containing the data to be passed to the
+            device constant memory. The format to be used is as follows: A
+            string key is used to name the constant memory symbol to which the
+            value needs to be copied. Similar to regular arguments, these need
+            to be numpy objects, such as numpy.ndarray or numpy.int32, and so on.
+        :type cmem_args: dict( string: numpy.ndarray, ... )
+        """
         logging.debug("HipFunction copy_constant_memory_args called")
+        logging.debug("current module: " + str(self.current_module))
+
+        for k, v in cmem_args.items():
+            symbol = ctypes.c_void_p
+            size_kernel = ctypes.c_size_t
+            status = _libhip.hipModuleGetGlobal(symbol, size_kernel, self.current_module, str.encode(k))
+            hip.hipCheckStatus(status)
+            dtype_str = str(v.dtype)
+            hip.hipMemcpy_htod(symbol, ctypes.byref(v.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * v.size)
 
     def copy_shared_memory_args(self, smem_args):
         """This method must implement the dynamic allocation of shared memory on the GPU."""
@@ -292,5 +321,6 @@ def copy_shared_memory_args(self, smem_args):
     def copy_texture_memory_args(self, texmem_args):
         """This method must implement the allocation and copy of texture memory to the GPU."""
         logging.debug("HipFunction copy_texture_memory_args called")
+        # NOT SUPPORTED?
 
     units = {"time": "ms"}

From 2505db9de43af9e22ae9faba41c0f3d600d132b7 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 14 Apr 2023 12:30:47 +0200
Subject: [PATCH 30/61] modified (not tested) memset, copy_constant_memory_args

---
 kernel_tuner/backends/hip.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 8b96fc1df..b60cde2a9 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -314,13 +314,13 @@ def copy_constant_memory_args(self, cmem_args):
             hip.hipMemcpy_htod(symbol, ctypes.byref(v.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * v.size)
 
     def copy_shared_memory_args(self, smem_args):
-        """This method must implement the dynamic allocation of shared memory on the GPU."""
+        """add shared memory arguments to the kernel"""
         logging.debug("HipFunction copy_shared_memory_args called")
         self.smem_size = smem_args["size"]
 
     def copy_texture_memory_args(self, texmem_args):
         """This method must implement the allocation and copy of texture memory to the GPU."""
         logging.debug("HipFunction copy_texture_memory_args called")
-        # NOT SUPPORTED?
+        raise NotImplementedError("HIP backend does not support texture memory") # NOT SUPPORTED?
 
     units = {"time": "ms"}

From dd34b4180467d1d94b06164e33832fbeb91b7bc5 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Mon, 17 Apr 2023 11:56:09 +0200
Subject: [PATCH 31/61] modified run_kernel (modifies grpu_args from ctypes
 list to ctypes structure), ready_argument_list (returns ctypes list), and
 memcpy_dtoh; examples/hip/test_vector_add.py passes -> memcpy_dtoh works

---
 examples/hip/test_vector_add.py | 14 +++++++------
 kernel_tuner/backends/hip.py    | 35 ++++++++++++++++++++++-----------
 2 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/examples/hip/test_vector_add.py b/examples/hip/test_vector_add.py
index 6fd21aced..6c342632e 100644
--- a/examples/hip/test_vector_add.py
+++ b/examples/hip/test_vector_add.py
@@ -5,13 +5,15 @@
 from kernel_tuner import run_kernel
 import pytest
 
+#Check pyhip is installed and if a HIP capable device is present, if not skip the test
+try:
+    from pyhip import hip, hiprtc
+except ImportError:
+    pytest.skip("PyHIP not installed or PYTHONPATH does not includes PyHIP")
+    hip = None
+    hiprtc = None
+
 def test_vector_add():
-    #Check pyhip is installed and if a HIP capable device is present, if not skip the test
-    try:
-        import pyhip as hip
-        hip.hipGetDeviceProperties(0)
-    except (ImportError, Exception):
-        pytest.skip("PyHIP not installed or no HIP device detected")
 
     kernel_string = """
     __global__ void vector_add(float *c, float *a, float *b, int n) {
diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index b60cde2a9..f54c01d36 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -128,8 +128,8 @@ def ready_argument_list(self, arguments):
             The order should match the argument list on the HIP function.
             Allowed values are np.ndarray, and/or np.int32, np.float32, and so on.
         :type arguments: list(numpy objects)
-        :returns: A ctypes structure that can be passed to the HIP function.
-        :rtype: ctypes.Structure
+        :returns: List of ctypes arguments to be passed to the HIP function.
+        :rtype: list of ctypes
         """
         logging.debug("HipFunction ready_argument_list called")
 
@@ -151,13 +151,7 @@ def ready_argument_list(self, arguments):
                 data_ctypes = dtype_map[dtype_str](arg)
                 ctype_args.append(data_ctypes)  
         
-        # Determine the types of the fields in the structure
-        field_types = [type(x) for x in ctype_args]
-        # Define a new ctypes structure with the inferred layout
-        class ArgListStructure(ctypes.Structure):
-            _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)]
-        
-        return ArgListStructure(*ctype_args)
+        return ctype_args
     
     
     def compile(self, kernel_instance):
@@ -228,7 +222,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         :param gpu_args: A list of arguments to the kernel, order should match the
             order in the code. Allowed values are either variables in global memory
             or single values passed by value.
-        :type gpu_args: ctypes.Structure
+        :type gpu_args: list of ctypes
 
         :param threads: A tuple listing the number of threads in each dimension of
             the thread block
@@ -241,6 +235,15 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         logging.debug("HipFunction run_kernel called")
         if stream is None:
             stream = self.stream
+
+        # Determine the types of the fields in the structure
+        field_types = [type(x) for x in gpu_args]
+        # Define a new ctypes structure with the inferred layout
+        class ArgListStructure(ctypes.Structure):
+            _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)]
+        
+        gpu_args = ArgListStructure(*gpu_args)
+
         hip.hipModuleLaunchKernel(func, 
                                   grid[0], grid[1], grid[2], 
                                   threads[0], threads[1], threads[2],
@@ -261,6 +264,8 @@ def memset(self, allocation, value, size):
         :type size: int
 
         """
+        logging.debug("HipFunction memset called")
+        print("HipFunction memset called")
         ctypes_value = ctypes.c_int(value)
         ctypes_size = ctypes.c_size_t(size)
         status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size)
@@ -276,8 +281,10 @@ def memcpy_dtoh(self, dest, src):
         :type src: ctypes ptr
         """
         logging.debug("HipFunction memcpy_dtoh called")
-        dtype_str = str(src.dtype)
-        hip.hipMemcpy_dtoh(ctypes.byref(dest.ctypes), src, ctypes.sizeof(dtype_map[dtype_str]) * src.size)
+        print("HipFunction memcpy_dtoh called")
+
+        address = dest.ctypes.data
+        hip.hipMemcpy_dtoh(ctypes.c_void_p(address), src, dest.nbytes)
 
     def memcpy_htod(self, dest, src):
         """perform a host to device memory copy
@@ -289,6 +296,7 @@ def memcpy_htod(self, dest, src):
         :type src: numpy.ndarray
         """
         logging.debug("HipFunction memcpy_htod called")
+        print("HipFunction memcpy_htod called")
         dtype_str = str(src.dtype)
         hip.hipMemcpy_htod(dest, ctypes.byref(src.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * src.size)
 
@@ -303,6 +311,7 @@ def copy_constant_memory_args(self, cmem_args):
         :type cmem_args: dict( string: numpy.ndarray, ... )
         """
         logging.debug("HipFunction copy_constant_memory_args called")
+        print("HipFunction copy_constant_memory_args called")
         logging.debug("current module: " + str(self.current_module))
 
         for k, v in cmem_args.items():
@@ -316,11 +325,13 @@ def copy_constant_memory_args(self, cmem_args):
     def copy_shared_memory_args(self, smem_args):
         """add shared memory arguments to the kernel"""
         logging.debug("HipFunction copy_shared_memory_args called")
+        print("HipFunction copy_shared_memory_args called")
         self.smem_size = smem_args["size"]
 
     def copy_texture_memory_args(self, texmem_args):
         """This method must implement the allocation and copy of texture memory to the GPU."""
         logging.debug("HipFunction copy_texture_memory_args called")
+        print("HipFunction copy_texture_memory_args called")
         raise NotImplementedError("HIP backend does not support texture memory") # NOT SUPPORTED?
 
     units = {"time": "ms"}

From 211889b6e12a818887b86d00ebc94eb2c15ef539 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Mon, 17 Apr 2023 12:17:42 +0200
Subject: [PATCH 32/61] clean up and modified memcpy_htod

---
 kernel_tuner/backends/hip.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index f54c01d36..bd8565c4b 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -281,7 +281,6 @@ def memcpy_dtoh(self, dest, src):
         :type src: ctypes ptr
         """
         logging.debug("HipFunction memcpy_dtoh called")
-        print("HipFunction memcpy_dtoh called")
 
         address = dest.ctypes.data
         hip.hipMemcpy_dtoh(ctypes.c_void_p(address), src, dest.nbytes)
@@ -296,9 +295,9 @@ def memcpy_htod(self, dest, src):
         :type src: numpy.ndarray
         """
         logging.debug("HipFunction memcpy_htod called")
-        print("HipFunction memcpy_htod called")
-        dtype_str = str(src.dtype)
-        hip.hipMemcpy_htod(dest, ctypes.byref(src.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * src.size)
+
+        address = src.ctypes.data
+        hip.hipMemcpy_htod(dest, ctypes.c_void_p(address), src.nbytes)
 
     def copy_constant_memory_args(self, cmem_args):
         """adds constant memory arguments to the most recently compiled module
@@ -325,13 +324,13 @@ def copy_constant_memory_args(self, cmem_args):
     def copy_shared_memory_args(self, smem_args):
         """add shared memory arguments to the kernel"""
         logging.debug("HipFunction copy_shared_memory_args called")
-        print("HipFunction copy_shared_memory_args called")
+
         self.smem_size = smem_args["size"]
 
     def copy_texture_memory_args(self, texmem_args):
         """This method must implement the allocation and copy of texture memory to the GPU."""
         logging.debug("HipFunction copy_texture_memory_args called")
-        print("HipFunction copy_texture_memory_args called")
-        raise NotImplementedError("HIP backend does not support texture memory") # NOT SUPPORTED?
+
+        raise NotImplementedError("HIP backend does not support texture memory")
 
     units = {"time": "ms"}

From 237f39b901c6d1104e6d278ce20da1c80ebd26f3 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Mon, 17 Apr 2023 12:41:39 +0200
Subject: [PATCH 33/61] modified
 test/test_hip_functions.py:test_ready_argument_list

---
 test/test_hip_functions.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
index c0030c995..68e6f22f1 100644
--- a/test/test_hip_functions.py
+++ b/test/test_hip_functions.py
@@ -26,12 +26,10 @@ def test_ready_argument_list():
     dev = kt_hip.HipFunctions(0)
     gpu_args = dev.ready_argument_list(arguments)
     
-    assert isinstance(gpu_args, ctypes.Structure)
-
-    assert isinstance(gpu_args.field0, ctypes.POINTER(ctypes.c_float))
-    assert isinstance(gpu_args.field1, ctypes.c_int32)
-    assert isinstance(gpu_args.field2, ctypes.POINTER(ctypes.c_float))
-    assert isinstance(gpu_args.field3, ctypes.c_bool)
+    assert isinstance(gpu_args[0], ctypes.c_void_p)
+    assert isinstance(gpu_args[1], ctypes.c_int32)
+    assert isinstance(gpu_args[2], ctypes.c_void_p)
+    assert isinstance(gpu_args[3], ctypes.c_bool)
 
 @skip_if_no_pyhip
 def test_compile():

From 5ae62c6f27d5c4f4c2f879bce0bcbd0e510990ee Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Mon, 17 Apr 2023 14:25:11 +0200
Subject: [PATCH 34/61] changed once again how gpu arguments get passed around
 --> ctypes structure

---
 kernel_tuner/backends/hip.py | 29 +++++++++++++++--------------
 test/test_hip_functions.py   |  7 ++-----
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index bd8565c4b..bdb48c053 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -128,8 +128,8 @@ def ready_argument_list(self, arguments):
             The order should match the argument list on the HIP function.
             Allowed values are np.ndarray, and/or np.int32, np.float32, and so on.
         :type arguments: list(numpy objects)
-        :returns: List of ctypes arguments to be passed to the HIP function.
-        :rtype: list of ctypes
+        :returns: Ctypes structure of arguments to be passed to the HIP function.
+        :rtype: ctypes structure
         """
         logging.debug("HipFunction ready_argument_list called")
 
@@ -150,9 +150,17 @@ def ready_argument_list(self, arguments):
             elif isinstance(arg, np.generic):
                 data_ctypes = dtype_map[dtype_str](arg)
                 ctype_args.append(data_ctypes)  
+
+        # Determine the types of the fields in the structure
+        field_types = [type(x) for x in ctype_args]
+        # Define a new ctypes structure with the inferred layout
+        class ArgListStructure(ctypes.Structure):
+            _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)]
+            def __getitem__(self, key):
+                return self._fields_[key]
         
-        return ctype_args
-    
+        return ArgListStructure(*ctype_args)
+            
     
     def compile(self, kernel_instance):
         """call the HIP compiler to compile the kernel, return the function
@@ -219,10 +227,10 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         :param func: A PyHIP kernel compiled for this specific kernel configuration
         :type func: ctypes pionter
 
-        :param gpu_args: A list of arguments to the kernel, order should match the
+        :param gpu_args: A ctypes structure of arguments to the kernel, order should match the
             order in the code. Allowed values are either variables in global memory
             or single values passed by value.
-        :type gpu_args: list of ctypes
+        :type gpu_args: ctypes structure
 
         :param threads: A tuple listing the number of threads in each dimension of
             the thread block
@@ -233,17 +241,10 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         :type grid: tuple(int, int, int)
         """
         logging.debug("HipFunction run_kernel called")
+
         if stream is None:
             stream = self.stream
 
-        # Determine the types of the fields in the structure
-        field_types = [type(x) for x in gpu_args]
-        # Define a new ctypes structure with the inferred layout
-        class ArgListStructure(ctypes.Structure):
-            _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)]
-        
-        gpu_args = ArgListStructure(*gpu_args)
-
         hip.hipModuleLaunchKernel(func, 
                                   grid[0], grid[1], grid[2], 
                                   threads[0], threads[1], threads[2],
diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
index 68e6f22f1..754bf15ef 100644
--- a/test/test_hip_functions.py
+++ b/test/test_hip_functions.py
@@ -25,11 +25,8 @@ def test_ready_argument_list():
 
     dev = kt_hip.HipFunctions(0)
     gpu_args = dev.ready_argument_list(arguments)
-    
-    assert isinstance(gpu_args[0], ctypes.c_void_p)
-    assert isinstance(gpu_args[1], ctypes.c_int32)
-    assert isinstance(gpu_args[2], ctypes.c_void_p)
-    assert isinstance(gpu_args[3], ctypes.c_bool)
+
+    assert(gpu_args, ctypes.Structure)
 
 @skip_if_no_pyhip
 def test_compile():

From 92f55a56be03ac827648aeda185bdba2f7e9a018 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Tue, 18 Apr 2023 16:25:54 +0200
Subject: [PATCH 35/61] added test_memset_and_memcpy_dtoh and test_memcpy_htod
 --> pass

---
 kernel_tuner/backends/hip.py | 14 +++++++-----
 test/test_hip_functions.py   | 44 +++++++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index bdb48c053..4b1ad720c 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -70,7 +70,7 @@
 _libhip.hipModuleGetGlobal.restype = ctypes.c_int
 _libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_void_p, ctypes.c_char_p]
 _libhip.hipMemset.restype = ctypes.c_int
-_libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
+_libhip.hipMemset.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
 
 
 hipSuccess = 0
@@ -266,7 +266,7 @@ def memset(self, allocation, value, size):
 
         """
         logging.debug("HipFunction memset called")
-        print("HipFunction memset called")
+        
         ctypes_value = ctypes.c_int(value)
         ctypes_size = ctypes.c_size_t(size)
         status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size)
@@ -283,8 +283,9 @@ def memcpy_dtoh(self, dest, src):
         """
         logging.debug("HipFunction memcpy_dtoh called")
 
-        address = dest.ctypes.data
-        hip.hipMemcpy_dtoh(ctypes.c_void_p(address), src, dest.nbytes)
+        dtype_str = str(dest.dtype)
+        dest_c = dest.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
+        hip.hipMemcpy_dtoh(dest_c, src, dest.nbytes)
 
     def memcpy_htod(self, dest, src):
         """perform a host to device memory copy
@@ -297,8 +298,9 @@ def memcpy_htod(self, dest, src):
         """
         logging.debug("HipFunction memcpy_htod called")
 
-        address = src.ctypes.data
-        hip.hipMemcpy_htod(dest, ctypes.c_void_p(address), src.nbytes)
+        dtype_str = str(src.dtype)
+        src_c = src.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
+        hip.hipMemcpy_htod(dest, src_c, src.nbytes)
 
     def copy_constant_memory_args(self, cmem_args):
         """adds constant memory arguments to the most recently compiled module
diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
index 754bf15ef..a95cd1cba 100644
--- a/test/test_hip_functions.py
+++ b/test/test_hip_functions.py
@@ -3,6 +3,7 @@
 from .context import skip_if_no_pyhip
 
 import pytest
+import kernel_tuner
 from kernel_tuner.backends import hip as kt_hip
 from kernel_tuner.core import KernelSource, KernelInstance
 
@@ -23,10 +24,24 @@ def test_ready_argument_list():
 
     arguments = [d, a, b, c]
 
+    class ArgListStructure(ctypes.Structure):
+        _fields_ = [("field0", ctypes.POINTER(ctypes.c_float)),
+                    ("field1", ctypes.c_int),
+                    ("field2", ctypes.POINTER(ctypes.c_float)),
+                    ("field3", ctypes.c_bool)]
+        def __getitem__(self, key):
+                return self._fields_[key]
+
     dev = kt_hip.HipFunctions(0)
     gpu_args = dev.ready_argument_list(arguments)
 
-    assert(gpu_args, ctypes.Structure)
+    argListStructure = ArgListStructure(d.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                                        ctypes.c_int(a),
+                                        b.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                                        ctypes.c_bool(c))
+    
+    assert(gpu_args[1] == argListStructure[1])
+    assert(gpu_args[3] == argListStructure[3])
 
 @skip_if_no_pyhip
 def test_compile():
@@ -50,6 +65,33 @@ def test_compile():
         pytest.fail("Did not expect any exception:" + str(e))
 
 
+@skip_if_no_pyhip
+def test_memset_and_memcpy_dtoh():
+    a = [1, 2, 3, 4]
+    x = np.array(a).astype(np.int8)
+    x_d = hip.hipMalloc(x.nbytes)
+
+    Hipfunc = kt_hip.HipFunctions()
+    Hipfunc.memset(x_d, 4, x.nbytes)
+
+    output = np.empty(4, dtype=np.int8)
+    Hipfunc.memcpy_dtoh(output, x_d)
+
+    assert all(output == np.full(4, 4))
+
+@skip_if_no_pyhip
+def test_memcpy_htod():
+    a = [1, 2, 3, 4]
+    x = np.array(a).astype(np.float32)
+    x_d = hip.hipMalloc(x.nbytes)
+    output = np.empty(4, dtype=np.float32)
+
+    Hipfunc = kt_hip.HipFunctions()
+    Hipfunc.memcpy_htod(x_d, x)
+    Hipfunc.memcpy_dtoh(output, x_d)
+
+    assert all(output == x)
+
 def dummy_func(a, b, block=0, grid=0, stream=None, shared=0, texrefs=None):
     pass
 

From f0ab5efda4871d0078bd4835574f8940647ae68f Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Wed, 19 Apr 2023 16:09:25 +0200
Subject: [PATCH 36/61] modified copy_constant_memory_args, implemented
 test_copy_constant_memory_args --> passed

---
 kernel_tuner/backends/hip.py | 23 ++++++++++++------
 test/test_hip_functions.py   | 47 ++++++++++++++++++++++++++++++------
 2 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 4b1ad720c..9e4e7f117 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -68,9 +68,11 @@
 _libhip.hipEventQuery.restype = ctypes.c_int
 _libhip.hipEventQuery.argtypes = [ctypes.c_void_p]
 _libhip.hipModuleGetGlobal.restype = ctypes.c_int
-_libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_void_p, ctypes.c_char_p]
+_libhip.hipModuleGetGlobal.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_char_p]
 _libhip.hipMemset.restype = ctypes.c_int
 _libhip.hipMemset.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
+_libhip.hipMemcpyToSymbol.restype = ctypes.c_int
+_libhip.hipMemcpyToSymbol.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t, ctypes.c_int]
 
 
 hipSuccess = 0
@@ -313,16 +315,23 @@ def copy_constant_memory_args(self, cmem_args):
         :type cmem_args: dict( string: numpy.ndarray, ... )
         """
         logging.debug("HipFunction copy_constant_memory_args called")
-        print("HipFunction copy_constant_memory_args called")
-        logging.debug("current module: " + str(self.current_module))
 
         for k, v in cmem_args.items():
-            symbol = ctypes.c_void_p
-            size_kernel = ctypes.c_size_t
-            status = _libhip.hipModuleGetGlobal(symbol, size_kernel, self.current_module, str.encode(k))
+            #Format arguments, call hipModuleGetGlobal, and check return status
+            symbol_string = ctypes.c_char_p(k.encode('utf-8'))
+            symbol = ctypes.c_void_p()
+            symbol_ptr = ctypes.POINTER(ctypes.c_void_p)(symbol)
+            size_kernel = ctypes.c_size_t(0)
+
+            size_kernel_ptr = ctypes.POINTER(ctypes.c_size_t)(size_kernel)
+            status = _libhip.hipModuleGetGlobal(symbol_ptr, size_kernel_ptr, self.current_module, symbol_string)
             hip.hipCheckStatus(status)
+
+            #Format arguments and call hipMemcpy_htod
             dtype_str = str(v.dtype)
-            hip.hipMemcpy_htod(symbol, ctypes.byref(v.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * v.size)
+            v_c = v.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
+
+            hip.hipMemcpy_htod(symbol_ptr.contents, v_c, v.nbytes)
 
     def copy_shared_memory_args(self, smem_args):
         """add shared memory arguments to the kernel"""
diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
index a95cd1cba..a6a95a8cb 100644
--- a/test/test_hip_functions.py
+++ b/test/test_hip_functions.py
@@ -56,7 +56,7 @@ def test_compile():
     """
 
     kernel_name = "vector_add"
-    kernel_sources = KernelSource(kernel_name, kernel_string, "cuda")
+    kernel_sources = KernelSource(kernel_name, kernel_string, "HIP")
     kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])
     dev = kt_hip.HipFunctions(0)
     try:
@@ -71,11 +71,11 @@ def test_memset_and_memcpy_dtoh():
     x = np.array(a).astype(np.int8)
     x_d = hip.hipMalloc(x.nbytes)
 
-    Hipfunc = kt_hip.HipFunctions()
-    Hipfunc.memset(x_d, 4, x.nbytes)
+    dev = kt_hip.HipFunctions()
+    dev.memset(x_d, 4, x.nbytes)
 
     output = np.empty(4, dtype=np.int8)
-    Hipfunc.memcpy_dtoh(output, x_d)
+    dev.memcpy_dtoh(output, x_d)
 
     assert all(output == np.full(4, 4))
 
@@ -86,12 +86,45 @@ def test_memcpy_htod():
     x_d = hip.hipMalloc(x.nbytes)
     output = np.empty(4, dtype=np.float32)
 
-    Hipfunc = kt_hip.HipFunctions()
-    Hipfunc.memcpy_htod(x_d, x)
-    Hipfunc.memcpy_dtoh(output, x_d)
+    dev = kt_hip.HipFunctions()
+    dev.memcpy_htod(x_d, x)
+    dev.memcpy_dtoh(output, x_d)
 
     assert all(output == x)
 
+@skip_if_no_pyhip
+def test_copy_constant_memory_args():
+    kernel_string = """
+    __constant__ float my_constant_data[100];
+    __global__ void copy_data_kernel(float* output) {
+        int idx = threadIdx.x + blockIdx.x * blockDim.x;
+        if (idx < 100) {
+            output[idx] = my_constant_data[idx];
+        }
+    }
+    """
+
+    kernel_name = "copy_data_kernel"
+    kernel_sources = KernelSource(kernel_name, kernel_string, "HIP")
+    kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])
+    dev = kt_hip.HipFunctions(0)
+    kernel = dev.compile(kernel_instance)
+
+    my_constant_data = np.full(100, 23).astype(np.float32)
+    cmem_args = {'my_constant_data': my_constant_data}
+    dev.copy_constant_memory_args(cmem_args)
+
+    output = np.full(100, 0).astype(np.float32)
+    gpu_args = dev.ready_argument_list([output])
+    
+    threads = (100, 1, 1)
+    grid = (1, 1, 1)
+    dev.run_kernel(kernel, gpu_args, threads, grid)
+
+    dev.memcpy_dtoh(output, gpu_args.field0)
+
+    assert(my_constant_data == output).all()
+
 def dummy_func(a, b, block=0, grid=0, stream=None, shared=0, texrefs=None):
     pass
 

From 685aed3a1186e04ac825e21f5f4341c74eb36f7d Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Wed, 19 Apr 2023 16:29:57 +0200
Subject: [PATCH 37/61] added test_smem_args --> passed

---
 test/test_hip_functions.py | 41 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
index a6a95a8cb..87ab9dfe6 100644
--- a/test/test_hip_functions.py
+++ b/test/test_hip_functions.py
@@ -1,9 +1,11 @@
 import numpy as np
 import ctypes
 from .context import skip_if_no_pyhip
+from collections import OrderedDict
 
 import pytest
 import kernel_tuner
+from kernel_tuner import tune_kernel
 from kernel_tuner.backends import hip as kt_hip
 from kernel_tuner.core import KernelSource, KernelInstance
 
@@ -13,6 +15,29 @@
 except ImportError:
     pass
 
+@pytest.fixture
+def env():
+    kernel_string = """
+    extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) {
+        int i = blockIdx.x * block_size_x + threadIdx.x;
+        if (i<n) {
+            c[i] = a[i] + b[i];
+        }
+    }
+    """
+
+    size = 100
+    a = np.random.randn(size).astype(np.float32)
+    b = np.random.randn(size).astype(np.float32)
+    c = np.zeros_like(b)
+    n = np.int32(size)
+
+    args = [c, a, b, n]
+    tune_params = OrderedDict()
+    tune_params["block_size_x"] = [128 + 64 * i for i in range(15)]
+
+    return ["vector_add", kernel_string, size, args, tune_params]
+
 @skip_if_no_pyhip
 def test_ready_argument_list():
 
@@ -125,6 +150,18 @@ def test_copy_constant_memory_args():
 
     assert(my_constant_data == output).all()
 
-def dummy_func(a, b, block=0, grid=0, stream=None, shared=0, texrefs=None):
-    pass
+@skip_if_no_pyhip
+def test_smem_args(env):
+    result, _ = tune_kernel(*env,
+                            smem_args=dict(size="block_size_x*4"),
+                            verbose=True, lang="HIP")
+    tune_params = env[-1]
+    assert len(result) == len(tune_params["block_size_x"])
+    result, _ = tune_kernel(
+        *env,
+        smem_args=dict(size=lambda p: p['block_size_x'] * 4),
+        verbose=True, lang="HIP")
+    tune_params = env[-1]
+    assert len(result) == len(tune_params["block_size_x"])
+
 

From 2388be45409a24f96ce1d46754335aa1469e9a89 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 21 Apr 2023 17:09:44 +0200
Subject: [PATCH 38/61] clean up and added more comments

---
 kernel_tuner/backends/hip.py | 51 ++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 9e4e7f117..5e1fd4116 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -3,8 +3,6 @@
 import numpy as np
 import ctypes
 import ctypes.util
-from collections import namedtuple
-import os
 import sys
 import logging
 
@@ -29,13 +27,6 @@
             raise RuntimeError(
                 'cant find libamdhip64.so or libnvhip64.so. make sure LD_LIBRARY_PATH is set')
     
-    _libhiprtc_libname = 'libhiprtc.so'
-    _libhiprtc = None
-    try:
-        _libhiprtc = ctypes.cdll.LoadLibrary(_libhiprtc_libname)
-    except:
-        raise OSError('hiprtc library not found')
-    
 else:
     # Currently we do not support windows
     raise RuntimeError('Only linux is supported')
@@ -71,9 +62,6 @@
 _libhip.hipModuleGetGlobal.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_char_p]
 _libhip.hipMemset.restype = ctypes.c_int
 _libhip.hipMemset.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
-_libhip.hipMemcpyToSymbol.restype = ctypes.c_int
-_libhip.hipMemcpyToSymbol.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t, ctypes.c_int]
-
 
 hipSuccess = 0
 
@@ -100,13 +88,15 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
 
         self.name = self.hipProps._name.decode('utf-8')
         self.max_threads = self.hipProps.maxThreadsPerBlock
-        logging.debug("self.max_threads: " + str(self.max_threads))
-
         self.device = device
         self.compiler_options = compiler_options
+        self.iterations = iterations
 
         env = dict()
         env["device_name"] = self.name
+        env["iterations"] = self.iterations
+        env["compiler_options"] = compiler_options
+        env["device_properties"] = self.hipProps
         self.env = env
 
         # create a stream and events
@@ -114,7 +104,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.start = hip.hipEventCreate()
         self.end = hip.hipEventCreate()
 
+        # default dynamically allocated shared memory size, can be overwritten using smem_args
         self.smem_size = 0
+
         self.current_module = None
 
         # setup observers
@@ -126,10 +118,12 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
 
     def ready_argument_list(self, arguments):
         """ready argument list to be passed to the HIP function
+
         :param arguments: List of arguments to be passed to the HIP function.
             The order should match the argument list on the HIP function.
             Allowed values are np.ndarray, and/or np.int32, np.float32, and so on.
         :type arguments: list(numpy objects)
+        
         :returns: Ctypes structure of arguments to be passed to the HIP function.
         :rtype: ctypes structure
         """
@@ -175,21 +169,23 @@ def compile(self, kernel_instance):
         :rtype: ctypes._FuncPtr
         """
         logging.debug("HipFunction compile called")
+
+        #Format and create program
         kernel_string = kernel_instance.kernel_string
         kernel_name = kernel_instance.name
-
         if 'extern "C"' not in kernel_string:
             kernel_string = 'extern "C" {\n' + kernel_string + "\n}"
-
         kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], [])
         
+        #Compile based on device (Not yet tested for non-AMD devices)
         plat = hip.hipGetPlatformName()
-        #Compile based on device
         if plat == "amd":
             hiprtc.hiprtcCompileProgram(
                 kernel_ptr, [f'--offload-arch={self.hipProps.gcnArchName}'])
         else:
             hiprtc.hiprtcCompileProgram(kernel_ptr, [])
+        
+        #Get module and kernel from compiled kernel string
         code = hiprtc.hiprtcGetCode(kernel_ptr)
         module = hip.hipModuleLoadData(code)
         self.current_module = module
@@ -200,11 +196,13 @@ def compile(self, kernel_instance):
     def start_event(self):
         """Records the event that marks the start of a measurement"""
         logging.debug("HipFunction start_event called")
+
         hip.hipEventRecord(self.start, self.stream)
 
     def stop_event(self):
         """Records the event that marks the end of a measurement"""
         logging.debug("HipFunction stop_event called")
+
         hip.hipEventRecord(self.end, self.stream)
 
     def kernel_finished(self):
@@ -221,12 +219,13 @@ def kernel_finished(self):
     def synchronize(self):
         """Halts execution until device has finished its tasks"""
         logging.debug("HipFunction synchronize called")
+
         hip.hipDeviceSynchronize()
 
     def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         """runs the HIP kernel passed as 'func'
 
-        :param func: A PyHIP kernel compiled for this specific kernel configuration
+        :param func: A HIP kernel compiled for this specific kernel configuration
         :type func: ctypes pionter
 
         :param gpu_args: A ctypes structure of arguments to the kernel, order should match the
@@ -269,6 +268,8 @@ def memset(self, allocation, value, size):
         """
         logging.debug("HipFunction memset called")
         
+        # Format arguments to correct type, set the memory and 
+        # check return value of memset (as done in PyHIP with hipCheckStatus)
         ctypes_value = ctypes.c_int(value)
         ctypes_size = ctypes.c_size_t(size)
         status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size)
@@ -285,6 +286,7 @@ def memcpy_dtoh(self, dest, src):
         """
         logging.debug("HipFunction memcpy_dtoh called")
 
+        # Format arguments to correct type and perform memory copy
         dtype_str = str(dest.dtype)
         dest_c = dest.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
         hip.hipMemcpy_dtoh(dest_c, src, dest.nbytes)
@@ -300,6 +302,7 @@ def memcpy_htod(self, dest, src):
         """
         logging.debug("HipFunction memcpy_htod called")
 
+        # Format arguments to correct type and perform memory copy
         dtype_str = str(src.dtype)
         src_c = src.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
         hip.hipMemcpy_htod(dest, src_c, src.nbytes)
@@ -316,21 +319,24 @@ def copy_constant_memory_args(self, cmem_args):
         """
         logging.debug("HipFunction copy_constant_memory_args called")
 
+        # Iterate over dictionary
         for k, v in cmem_args.items():
-            #Format arguments, call hipModuleGetGlobal, and check return status
+            # Format arguments, call hipModuleGetGlobal, 
+            # and check return status (as done in PyHIP with hipCheckStatus)
             symbol_string = ctypes.c_char_p(k.encode('utf-8'))
             symbol = ctypes.c_void_p()
             symbol_ptr = ctypes.POINTER(ctypes.c_void_p)(symbol)
             size_kernel = ctypes.c_size_t(0)
 
+            # Get constant memory symbol and check return value of hipModuleGetGlobal 
+            # (as done in PyHIP with hipCheckStatus)
             size_kernel_ptr = ctypes.POINTER(ctypes.c_size_t)(size_kernel)
             status = _libhip.hipModuleGetGlobal(symbol_ptr, size_kernel_ptr, self.current_module, symbol_string)
             hip.hipCheckStatus(status)
 
-            #Format arguments and call hipMemcpy_htod
+            #Format arguments and perform memory copy
             dtype_str = str(v.dtype)
             v_c = v.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
-
             hip.hipMemcpy_htod(symbol_ptr.contents, v_c, v.nbytes)
 
     def copy_shared_memory_args(self, smem_args):
@@ -340,9 +346,8 @@ def copy_shared_memory_args(self, smem_args):
         self.smem_size = smem_args["size"]
 
     def copy_texture_memory_args(self, texmem_args):
-        """This method must implement the allocation and copy of texture memory to the GPU."""
         logging.debug("HipFunction copy_texture_memory_args called")
 
         raise NotImplementedError("HIP backend does not support texture memory")
 
-    units = {"time": "ms"}
+    units = {"time": "ms", "power": "s,mW", "energy": "J"}

From ca0aed0cb2e6c2834d71f9e59700c4e0a2311ff3 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 21 Apr 2023 17:52:20 +0200
Subject: [PATCH 39/61] added HIP documentation to INSTALL.rst

---
 INSTALL.rst | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/INSTALL.rst b/INSTALL.rst
index cfa6275fb..a2af17843 100644
--- a/INSTALL.rst
+++ b/INSTALL.rst
@@ -111,6 +111,28 @@ Or you could install Kernel Tuner and PyOpenCL together if you haven't done so a
 
 If this fails, please see the PyOpenCL installation guide (https://wiki.tiker.net/PyOpenCL/Installation)
 
+HIP and PyHIP
+-------------
+
+Before we can install PyHIP, you'll need to have the HIP runtime and compiler installed on your system. 
+The HIP compiler is included as part of the ROCm software stack. Here is AMD's installation guide:
+
+* `ROCm Documentation: HIP Installation Guide <https://docs.amd.com/bundle/HIP-Installation-Guide-v5.3/page/Introduction_to_HIP_Installation_Guide.html>`__
+
+After you've installed HIP, you will need to install PyHIP. As of the writing of this documentation, PyHIP 
+is not yet available on PyPI, meaning we will have to install it from GitHub.
+
+Clone the GitHub repo:
+
+.. code-block:: bash
+
+    git clone https://github.com/jatinx/PyHIP
+
+Set the PYTHONPATH:
+
+.. code-block:: bash
+    
+    export PYTHONPATH=/path/to/pyhip:$PYTHONPATH
 
 Installing the git version
 --------------------------

From 5db2bee53166ed7b930f46762190870d492c9cec Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Tue, 25 Apr 2023 14:21:24 +0200
Subject: [PATCH 40/61] not using anymore _libhip, calling implementations from
 pyhip new coverage

---
 kernel_tuner/backends/hip.py | 58 +++---------------------------------
 1 file changed, 4 insertions(+), 54 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 5e1fd4116..c94e09e4b 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -9,29 +9,6 @@
 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.hip import HipRuntimeObserver
 
-_libhip = None
-_hip_platform_name = ''
-
-# Try to find amd hip library, if not found, fallback to nvhip library
-if 'linux' in sys.platform:
-    try:
-        _libhip_libname = 'libamdhip64.so'
-        _libhip = ctypes.cdll.LoadLibrary(_libhip_libname)
-        _hip_platform_name = 'amd'
-    except:
-        try:
-            _libhip_libname = 'libnvhip64.so'
-            _libhip = ctypes.cdll.LoadLibrary(_libhip_libname)
-            _hip_platform_name = 'nvidia'
-        except:
-            raise RuntimeError(
-                'cant find libamdhip64.so or libnvhip64.so. make sure LD_LIBRARY_PATH is set')
-    
-else:
-    # Currently we do not support windows
-    raise RuntimeError('Only linux is supported')
-
-
 # embedded in try block to be able to generate documentation
 # and run tests without pyhip installed
 try:
@@ -55,14 +32,6 @@
     "float64": ctypes.c_double,
 }
 
-# define arguments and return value types of HIP functions
-_libhip.hipEventQuery.restype = ctypes.c_int
-_libhip.hipEventQuery.argtypes = [ctypes.c_void_p]
-_libhip.hipModuleGetGlobal.restype = ctypes.c_int
-_libhip.hipModuleGetGlobal.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_char_p]
-_libhip.hipMemset.restype = ctypes.c_int
-_libhip.hipMemset.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
-
 hipSuccess = 0
 
 class HipFunctions(GPUBackend):
@@ -210,11 +179,7 @@ def kernel_finished(self):
         logging.debug("HipFunction kernel_finished called")
         
         # Query the status of the event
-        status = _libhip.hipEventQuery(self.end)
-        if status == hipSuccess:
-            return True
-        else:
-            return False
+        return hip.hipEventQuery(self.end)
 
     def synchronize(self):
         """Halts execution until device has finished its tasks"""
@@ -268,12 +233,7 @@ def memset(self, allocation, value, size):
         """
         logging.debug("HipFunction memset called")
         
-        # Format arguments to correct type, set the memory and 
-        # check return value of memset (as done in PyHIP with hipCheckStatus)
-        ctypes_value = ctypes.c_int(value)
-        ctypes_size = ctypes.c_size_t(size)
-        status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size)
-        hip.hipCheckStatus(status)
+        hip.hipMemset(allocation, value, size)
 
     def memcpy_dtoh(self, dest, src):
         """perform a device to host memory copy
@@ -321,18 +281,8 @@ def copy_constant_memory_args(self, cmem_args):
 
         # Iterate over dictionary
         for k, v in cmem_args.items():
-            # Format arguments, call hipModuleGetGlobal, 
-            # and check return status (as done in PyHIP with hipCheckStatus)
-            symbol_string = ctypes.c_char_p(k.encode('utf-8'))
-            symbol = ctypes.c_void_p()
-            symbol_ptr = ctypes.POINTER(ctypes.c_void_p)(symbol)
-            size_kernel = ctypes.c_size_t(0)
-
-            # Get constant memory symbol and check return value of hipModuleGetGlobal 
-            # (as done in PyHIP with hipCheckStatus)
-            size_kernel_ptr = ctypes.POINTER(ctypes.c_size_t)(size_kernel)
-            status = _libhip.hipModuleGetGlobal(symbol_ptr, size_kernel_ptr, self.current_module, symbol_string)
-            hip.hipCheckStatus(status)
+            #Get symbol pointer
+            symbol_ptr, _ = hip.hipModuleGetGlobal(self.current_module, k)
 
             #Format arguments and perform memory copy
             dtype_str = str(v.dtype)

From a37e3021867819294cf3a0eb224509a8d0c2ad6c Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Tue, 25 Apr 2023 14:26:32 +0200
Subject: [PATCH 41/61] not using anymore _libhip, calling implementations from
 pyhip new coverage

---
 kernel_tuner/backends/hip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index c94e09e4b..4e114fa23 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -287,7 +287,7 @@ def copy_constant_memory_args(self, cmem_args):
             #Format arguments and perform memory copy
             dtype_str = str(v.dtype)
             v_c = v.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
-            hip.hipMemcpy_htod(symbol_ptr.contents, v_c, v.nbytes)
+            hip.hipMemcpy_htod(symbol_ptr, v_c, v.nbytes)
 
     def copy_shared_memory_args(self, smem_args):
         """add shared memory arguments to the kernel"""

From 0ab790dbf3e801f3859d8ab8750b1de5423da247 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 5 May 2023 15:10:01 +0200
Subject: [PATCH 42/61] quick install HIP

---
 doc/source/index.rst | 9 +++++++--
 setup.py             | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/doc/source/index.rst b/doc/source/index.rst
index 4a66d893b..c8af5e745 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -27,9 +27,14 @@ To tune OpenCL kernels:
 - First, make sure you have an OpenCL compiler for your intended OpenCL platform
 - Then type: ``pip install kernel_tuner[opencl]``
 
-Or both:
+To tune HIP kernels:
 
-- ``pip install kernel_tuner[cuda,opencl]``
+- First, make sure you have an HIP runtime and compiler installed
+- Then type: ``pip install kernel_tuner[hip]``
+
+Or all:
+
+- ``pip install kernel_tuner[cuda,opencl,hip]``
 
 More information about how to install Kernel Tuner and its
 dependencies can be found under :ref:`install`. 
diff --git a/setup.py b/setup.py
index 07793f353..f17cd45ee 100644
--- a/setup.py
+++ b/setup.py
@@ -76,6 +76,7 @@ def readme():
         "cuda": ["pycuda", "nvidia-ml-py", "pynvml>=11.4.1"],
         "opencl": ["pyopencl"],
         "cuda_opencl": ["pycuda", "pyopencl"],
+        "hip": ["pyhip-interface"],
         "tutorial": ["jupyter", "matplotlib", "pandas"],
         "dev": [
             "numpy>=1.13.3",

From fe7bad1cde3dca4dbcdf9c11592f273de3f4f5a5 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 5 May 2023 15:21:26 +0200
Subject: [PATCH 43/61] Installing HIP doc updated

---
 INSTALL.rst | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/INSTALL.rst b/INSTALL.rst
index a2af17843..624041bd7 100644
--- a/INSTALL.rst
+++ b/INSTALL.rst
@@ -119,20 +119,23 @@ The HIP compiler is included as part of the ROCm software stack. Here is AMD's i
 
 * `ROCm Documentation: HIP Installation Guide <https://docs.amd.com/bundle/HIP-Installation-Guide-v5.3/page/Introduction_to_HIP_Installation_Guide.html>`__
 
-After you've installed HIP, you will need to install PyHIP. As of the writing of this documentation, PyHIP 
-is not yet available on PyPI, meaning we will have to install it from GitHub.
+After you've installed HIP, you will need to install PyHIP. Run the following command in your terminal to install:
 
-Clone the GitHub repo:
+.. code-block:: bash
+
+    pip install pyhip-interface
+
+Alternatively, you can install PyHIP from the source code. First, clone the repository from GitHub:
 
 .. code-block:: bash
 
     git clone https://github.com/jatinx/PyHIP
 
-Set the PYTHONPATH:
+Then, navigate to the repository directory and run the following command to install:
 
 .. code-block:: bash
     
-    export PYTHONPATH=/path/to/pyhip:$PYTHONPATH
+    python setup.py install
 
 Installing the git version
 --------------------------

From 47b07eea59f6988ceb3fdb2ba88c8845f0d0f4e7 Mon Sep 17 00:00:00 2001
From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com>
Date: Fri, 5 May 2023 15:25:44 +0200
Subject: [PATCH 44/61] HIP update INSTALL.rst

---
 INSTALL.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/INSTALL.rst b/INSTALL.rst
index 624041bd7..fb57c2830 100644
--- a/INSTALL.rst
+++ b/INSTALL.rst
@@ -153,6 +153,7 @@ You can install Kernel Tuner with several optional dependencies, the full list i
 
 - `cuda`: install pycuda along with kernel_tuner
 - `opencl`: install pycuda along with kernel_tuner
+- `pyhip`: install pyhip along with kernel_tuner
 - `doc`: installs packages required to build the documentation
 - `tutorial`: install packages required to run the guides
 - `dev`: install everything you need to start development on Kernel Tuner

From 6862f14ccc9b20674b092801604c57ba6f29067e Mon Sep 17 00:00:00 2001
From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com>
Date: Fri, 5 May 2023 15:27:22 +0200
Subject: [PATCH 45/61] HIP update INSTALL.rst

---
 INSTALL.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/INSTALL.rst b/INSTALL.rst
index fb57c2830..53fef4926 100644
--- a/INSTALL.rst
+++ b/INSTALL.rst
@@ -153,7 +153,7 @@ You can install Kernel Tuner with several optional dependencies, the full list i
 
 - `cuda`: install pycuda along with kernel_tuner
 - `opencl`: install pycuda along with kernel_tuner
-- `pyhip`: install pyhip along with kernel_tuner
+- `hip`: install pyhip along with kernel_tuner
 - `doc`: installs packages required to build the documentation
 - `tutorial`: install packages required to run the guides
 - `dev`: install everything you need to start development on Kernel Tuner

From eab0306446cc66ed7bf76d4ff40d5e4a7cc7e70d Mon Sep 17 00:00:00 2001
From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com>
Date: Fri, 5 May 2023 15:37:38 +0200
Subject: [PATCH 46/61] HIP update interface.py doc

---
 kernel_tuner/interface.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py
index 9abfa1de9..15fe04ec1 100644
--- a/kernel_tuner/interface.py
+++ b/kernel_tuner/interface.py
@@ -98,7 +98,7 @@ def __deepcopy__(self, _):
     (
         "kernel_source",
         (
-            """The CUDA, OpenCL, or C kernel code.
+            """The CUDA, OpenCL, HIP, or C kernel code.
             It is allowed for the code to be passed as a string, a filename, a function
             that returns a string of code, or a list when the code needs auxilliary files.
 
@@ -124,7 +124,7 @@ def __deepcopy__(self, _):
             """Specifies the language used for GPU kernels. The kernel_tuner
         automatically detects the language, but if it fails, you may specify
         the language using this argument, currently supported: "CUDA", "Cupy",
-        "OpenCL", or "C".""",
+        "OpenCL", "HIP", or "C".""",
             "string",
         ),
     ),

From cab70bc455dfe6394e62baf4358933615c24728b Mon Sep 17 00:00:00 2001
From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com>
Date: Fri, 5 May 2023 15:49:11 +0200
Subject: [PATCH 47/61] added updated architecture diagram PNG

---
 doc/source/architecture.png | Bin 0 -> 33120 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 doc/source/architecture.png

diff --git a/doc/source/architecture.png b/doc/source/architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..bedeefa5b31036ead63a6c97f786e34aaae68de0
GIT binary patch
literal 33120
zcmc$`2UJsQw>BC?EC}jWK?M=rZjmCO(iABw3QAR~bPyv_0wRQ-fU<>+5U>GK140Od
z5-ABi0TED;8bT;S1f+&4DUc9ySMYr2oco_~?tjL;{}^|T;V>a9>#cLnXFl_J6MNUx
zKvY;l7z6@|-no6#90U?%gFphMdv*bTIrRwU0sJT6Z*Fh{RN8f74tTTE{kq9@5U4D1
z-=?b&@P6-;+qV87(EeurKY=#if`=ebB;(G_>z2Vzi=&<XM<_!)UhlGW<Dk{hpR)2(
zQU|Yq@zYHoO>94Z4E*yHZE#PR)ES@p1W=n-lW3`TlU8`ucd--Sl%_uH^tJ$Bh<FBH
zyE1R&ps9&QKGIK{3n60cwCCq$maf1`b<k$r#S8|}2@vSoxy^`M(%UZsK+klxUr61^
zKCt~+%)wbf{)?W|JAuIgSHNwWHH)=0Iuf=zacdR$TI0{u%*;jYh^_i{Uxi;+8y^<2
z5rU3n%^qZG>fGuh;CUCmVt%Ro42t~P>U)gz62tlZoIU^hrvCrE<(Stpf#c)*=utk4
zWTq>oC)1o)ssZxe-S>rA>eMiNHH_&Ej6d&>S;50UCR*Z7>*c{;DzVv4qgdjWoycOe
zV_l#aQl<m=Qsh#(*B#7N=)C3f#@bBF`&<Kwp@f9q6leHQ9h@7gZU*v3#~hGm>a72W
z>XDP0v(pK9v;A0dkxdS->+15~1a#l4wVCdiZUqH}uYNK^=^DN-&EA5|DU#QMhHy8h
zr>E65Yhau}Un&8nxA)}Y^xEoVTij{;*ZTWQy?N^QW`f5R7nO@%f7n7uYP|IAJF+gn
zhp9oX{W;s4+aMjWc`?AyV2isPag^$U=C^$9=%Lu<H}gEbnEfxhjf{*MNNBT|ZpRz3
zIv|l4N27p-;LY__{VF_uz}~^(`?}A4@?3|H)?0miMbPrtlV<(4g)gtqwDgS6+Ffrd
z&<gpL$6_;RE4Ey;88W*2iiE-Sk0C2RuCErRZ*izRab)eCxm?eS1tVQ{cOk%rJNy@6
z&xqsr#>U3ji+x8w%_#LE;_sYir1mp*^+2pYNK;jljZ%~t2~t<Zbt1MlW^#1p^IpGy
zNO%;l+Hk!Om+nweQPF~iO2gM(R|~_A3>HP2;Ewep!WKn#z+86ZU2?}bh48kxuNuc{
zp+(2irjg?|=tBacMrHGl?UI+Qhk0qO>wf(k@*WacjB3FWdv;=o`MiMf5@|qws^m<l
z_t2G4r258ap8WwH`LI!N{V!d_w1s;EZAlPxOkAP2TS&%Gu!T?+(2nL^lr)$_!SHQg
zEfJKy%h2SkR*$?(-57PBNpeGi>Ya++4L+#6(u*|)(oZRQz@$nnw5h3^)_1L{w{lbo
zt^L|ggWuo=`kdO>($SxN%Qm<pmLyjw6iTXjDTN>C1HIkx__3eg#m(vdl&Ptym55C0
z`1kMMTdVi&+m|!a5EiuVOl_fO)}l}-#9VP0mkkX(K|BEkc9HA%`{|6{Anv}~7xV0R
zH!8|R(y`8^iw2X#^vujrI<dANpO@^4Oq9B6T{{N<uE_y4&dvte<j-^Ne5lkPIgEbR
zN<bo!2Y3pZMTSvHV{>7S2iU3HSJ6YN^K_ynVrIs5^06jzYbaoT-oao#f5<MnR-ICE
z#NJ(z+UM74=2-kyO%9heu;KM_D={(FgtoTFPFK=7bJ`1&kTOuWuhOwOj~cV646UKQ
zjZGvb?3J+yKhL;&+`Xhb9C9LZF(>XHxc@xulh0mbrAhF4!AGg;-Gn&hyS1R9O~#NC
zee1b&;^a_mzS6^}Vly&6R9%_jmPYLUHdGC_M?j1`H{hDx?jPIpy~+=O;uGNhU<?8I
zS91PT-!UU!E=#;WFaKUzs<5V_lrhS`H5sL3Y5>tz`~KXXq~2LtSFbtmjkHN#@*3Jo
zJA!o|Qmgis6f5R<REltvvP$Qe#uAW`3qzt8Y34*xv$m#0<kjwgQks5zLZ*}v1L_c9
zksqIc9nd3rYBF$HTdk(Xq}lJ5?rW3_=`qo5MB)*XTxcb@)x4t&LcV}Htq$=C(KJvi
zr5+u|!qXWITdbJ|_qCq&8n=88E`HGW77gVw;f9^e9Uo{RmSpN@A20?P-G8@!Ki_b#
z`|-1XCm%!|GCix$HjPb%VF|<vq}$|U*aO4AERGXvX_Ly4aM-*|7XE}uid}+x$N6-k
zk}%r|W`PswP;FU*JT(S+AK*^6Tf%hjy!(U9{G1wr8c7~#8S$F4m3M51xi?!-_E@kq
zCsY=7L?Le;alG!lrCBF9zU_D}Ymv0n=B?d7l{Y~4bz72nz|o(1W-}9PQeF-lie|o@
z_HZPJWUqB3B8BPD^V~8s=vI|W!1qZ;>{TgNe+_!=b8GLFoW&JVp^>7>^vv5d87I>r
zyl3#d#<A{ZUg?A>M?)<!I8`oL-wsWUmhE1QnW@$E_!KyC9I*1w-BEX}6tKFyP5)Vz
zGU)5fMMn5T-QJP@L&Xt3<;dI7@Kh{;jajE#FAWvf($)YEj+hypn7K@=4qA2sz9Rf}
zDww-di;gqhec+@B(hUh?+Ro~RG0)Rm@Mo*y{b#x$9vS*+C*Jz`eidu{Iq&|cS!I4;
z4j1aVc59-N_c`{k<ed5DHztc3A<d59(wz0E_u1CWg>uwl4ecM;GU<j9H9dc5%dYQd
zZPHq=f%FSPmLJxzh^uMi`1bQa#e#(QWH4Nkx0)F7H~bsZWK*+Rvu&R*0vKJx^&IK=
zqddkZU!^XCye>605j0+`K3J2BUlcXTN_M3)MN#`8>l=CN%Z#v|+I0v+Rf~M=vDN{#
zzf*F<^OFCNPU&CHr|qFX+v|c@mD6J~D9$n6Mq=1&p&P%zvkI%$QGR}`mZQ#pFW-v&
zaNO$5#SxrN8u5gJU7}J&ZLI{G!&*MTeFuz8mySCb*!vM8XRq(skiHOgPii-a)KMi~
z+Bm8yw5vA1xTFNKH(P8cEkFNzeTWME*7cbDq_^kHq6uY{myQn3b1<r11lIK=)H2wE
z=?h;=?_qi1MN~}4Mc!h;vv#QTzdUzdIJ!cyb+>RQ{%kE&DOpTsjfE-G5sPR|jXyu)
z$lUyiNDG7#yU|9VDmlmBkoYwzDd`pUAK%-Bey&JoyZiU|OpXNTBz{<(Xg$Wgw5X4i
z)&58gw>qd?lRdh=tk`+51c_IJy8XGOJvE&J8PM{EHF!}<#gM}l)W7>KE1n}9{{W?2
zKq2Q9lr^l`=WZwn8VhIVz$g9y+z*FGW*jD7I6q)^O{2ANep8DbYke#k!+eNoy#v#2
zXRLZ(!DHJ8tHD+!mO>>FIU|)$){qMC<IUf9gY*rERhO)&W{2tmRZE;O0mdXbM-?!+
zVeu*|<Vuu-L3`hrWtt)+*`}pbXu#v-Djiy?;*`nwg0N64iKc}?;ne;sPH&Az)QwmD
zu<gFH8T3ioCOyDsec4K}VgJ=`GDC3m!K}SzE1a-;xlrp)X1&GvEHekFE-oWjl8Y~>
zkYCI1(t0Pp0ql{L1(vI$Vfgt8#y^Qu)x1Zty}M=p8MW=_ByeIMtY?J9pbdE_<YPHC
zg~f1O10J<khQ%!yIETLT_3xxdd9$TYc@JzitCSeBFEqZvK%nL~p;lf}T#;Y;d-3GI
zDm*#J@LzXk(eF@hkkT{xo&>)%M>Qg5+28U(0lijQ#Crj0ndDkn*G(%n@X5e<EUp*P
zRd5GtmiiB;%xaAK`o(T#X@|WuqtXAlMJ^JZv;!3$f|S_k9#q^4vybcjOgyaZ^kEZh
zu^RqhmBj^VYhix0oE=}Sad1Dm>UM>IkLurmkhz&@7zov4C%bXyZE)8+7X`N*dDB7`
zb6Az2X)We>1e4{8kwDLLT(M@~p3iA^l=*QhGRk;cEMg8ezI33i!z9IiS-ov1C>k{{
z15W5u?f9pNP4J1F#w^C6lEA6c>Xg@-bDUg*#2eJzIyD=qnB}4ZM{(40t?8G{t3fxM
z=N;VZHEFLU(Vli*%8kOp#+j2Zs&>Kjb;Wtn3(>E;GOKGF%Iae`#$%*I+Y+S@kk$#j
z5?NUWO#-=_x2DUp2=|o9GCRMeLzdAGuQ)S?hEC_SB}}Yto&iDNlgyDXg~QBk-ydl?
zZ5*2z;iT*1FtfR_mP1u7IFYB?lt-Wl8R(bkWI3LH+HvoLFJ~S=wW2V!hi~idMzBVM
zfMI5oLB5)x_xXpPvmHKp7<6n`-O&A{5R)>xdDFybU65YSD=(L>z7_|M6oNDv`q@*W
z0H5w$rQYE+KbfezA4a`j>iG64WnY8|ro#E@<obAF>o<CXcEy5GNXvlNccXJ{FSrTO
zw`K~nniJ<9rTr3Z7sTwO#z{W3&e9|9GnwRoqoMUh)1*gNAbqm2Q5)AID8R6ev#27^
zbS^X<30D#qjmo&<|H$Hb-qrN>zhjre^D)olP9eMmMa<J&%&A2$$pr&#Yl>qE7PG8a
zrFZgEW&84&uMtIcGh1EQf}I^o?+A9O_?CQ&M}5kAG+zN&@!~~+NY{_)tyZa&BMOQY
zKlGlGnFpkGPOv33aOc4jBTfeA7fnz(hF2{-@Rhbs34JgW89oxgt%lO9*Ta7_KI=Ie
zVe(QMM^d|k*cehP?1FzqOvZRgKjy77ESe$W?>=+#=Yw5Uicqe_O)VLkP5BbR;lF-<
zO-{e?VKU!(x(Kso6YKj;v`DqqQwN$nNimvOds4S!K3E%b_rn1f7clC=RtU`TRQN=J
z=Y4h;=GojQ?7nxG!c<gLPFILKXAdXKyf0r)FEt~VKeZIbPRUl}5|VQ^Xg3PuUL+aE
zUb4fRwm%ut^w^|PurYoCs|Z#@61}bC?xL`xv0`?jExB5TzolJY9fHLQezwHH?zt2N
z+zV$ve%5}9@(1#fy#%JK&pDQ~a6LKY!)lWC2)>ZS(4!nhafOGqB24;1hqpf3{9QQp
z!~%D+2Fhqw(_NG%E<>mVKQHTSS+(s;Zy|pxdfvm^oY4gU`7lwgHZ@;DB~+D!JUnd{
z!M0fSSnK@jUXI!K0kiKz2w9clA2s}W$X%<7&qP`UjD-!OWwu5?zqon`YhYz}r5JWu
zo^Qbg@-NC}7*LHdUCDxPiI)OBWpAGyG6bn59FpN;Ta6b|wMlr@rgt!E01T6ylxr@%
zc(_ib6#S1;M(yCn-_Eo=Kj`azol5AN{&g9g2spV?wD$dULQ*n1ME!~k=z>VpyV<zD
zn;lQuP*GXY=sp95p_6pYL+GLTo3wX^6}xSnUiJms6Y!mQv$@a{#&^hScSas!2F_HK
z!mE95%-VgJ8Y&Nf^=f|BIDM0pET!zt1dLi7Ip7h|Vnu84C2ZucSC-CO#c4{@-@izR
zb#F6Yu!_z1kvy2iP2O##hE(t9y)xG#fIBJuvL*3jD_wHMT;tosrj3=|Mu#%-2(r~u
z_n8Mc@{Rxq)Oy1T9=+N1h-(}+c&^5#+I42IY;N~IEeF5I!kh8f5)yCK_k(nXPly<@
zc6YDWctE|XPkV9L*^FdYRRR)Sn@{`-|54|`#?P3L!{^VtgTY|Z<+xt2t%S2q*s295
z7@9lIN~+zMB_r9?2yBRi3DPclKtVk5?bX1^$U}g$2U(h1Ey45E-_i3uHC^3pW8qee
z2oY0+LMYpD_TC|@jw(~$$eBoe+-1w=W|Es2_mocMc(%ERm7EPaP^=jLa{Z7;QgD%V
zEPARiQ<$vu(G+rLF!z{WY*q3V5v#SriVVyf|3&J1&(Z>B6f^x~<|@Z<g0eW2d#QD}
zyl1OdbDnqKvteSfmls-mXRUAR<hgC8qTpdk4X5gri<a!E(~VRe;Q>a$ZDTU6Bu>z5
z^i+k0bl&XrGYM<!9*oldpOev{Fi8`A4bdY#c4(JD%x2qlFZp1fTu&WH@?o3N?$Pr-
zIs@>ylpMFi)OgEUv2Ftr$KX%%HdXN|t3PXVazhX!zOSzFDHN3Dr^jcV&Y5QHfyE!-
z#p~c>hTZN?M%SF`w*X~%ZkmJ(Hp!(ClMhpNz9N$YnqhLxKp}qSSdeCxIraG&jBK(O
zcl~ubmT7<~5h-icvIuZHo2BHWT=Qi7Q}PVAvzoih-7<o>{{0_JWM#$?5Xj}c<&^uE
z<zkB@%cTACQ61^mTi;69TVBhn+Q0I+#eQT}(i#`sAahT2zrwn8Y<)tE(;BCa%2hgk
zAynAgY8XpR_SqwQ%qr<AZp9hB3z5G;<#DMOq&xLyo>>|307d{=e<wTJ2GKoKD;Eg?
zW&|)!#z3yHgaJ~l*o%u`=3?!zuwqyqEdR%Myp_i3qAu77B`qc)C7&}X&8sfcLP6zI
z%}Qd>t8{x(c;y)8`b8gH<{r8LsImXgp53(w>dH@x3rj_V3e{agKZ4_a(?gMEmT0yv
z9=L%|7R95~McL1`Cm3`gU$)mb=HFibv1gN(bCFLm;U32ym{o^2zqfw(zY<peXIP{!
z-|U=MYhI_OrcNB@lV0*2!mp_UGR_@OcE5jIBFHxcpd0Xl+W(F>`yX^8b}ScJUR_<l
zqPbs%`5p_Bnk;qu4vnu%@S|K@(rk5XtET<OAAKa+GSkx)T0y%sa&@_f_ic9|0A$4i
zyaEU`dvM3^E@}S#d#)n)pZ-0W*?PcK0e86hE{`=d994EY!V_S8>*``==|(th?lw+*
zwgYy^0k}cLUjA}=YHF`nP8=^X^s?`Tl3l{|jd9=;mqGIo<6oJNqt<qjUb+KI30(hA
z4f0p}zr7_={NLF$>Q4@ki8M23Va)^D;p-pQ#er|V1qRS~`-CoBYiAtL+w;+Re6EqT
zRJR=M7r~xGKbQ?BX+~^Jr8EFB&c=tDcNyT>22ZA@r|;v2v6f!&2~C-O&vYVRht6aM
zy)+xmlay%Ame^4{bg_6tE$0mfNM>TXy}Y~{0Mr$;g)&O{bOHE^u{bb+XIupcn7@NM
z#~Q-IuucuaO)Ia@7`)7BE<3EDZ<%|;@(s0R5@2#KK!6d(|1A{nsNB2!+#vC!81k|c
z@C?Um*#}(mb}617P+1wcoS~VNpQ3%#*Xjp%T_X7bMDVec6>M~iqH6VYQFuKB!}$?)
z;1f@s8XRs<<#T;JfXr*SNT{`}6N>WAHOXbvhHpGw_wLwQZIjj@P_!VDs=h=ifNU9-
z|I!;!=!OR8s}9Kk&AvO+v-^TM)bJl?BHS^2*!cQ<U!S>eU?9Hg<Hv)w{Pbz$cE)qF
zeRhyd4d1c9UD--tfY9Y<J&c5t+O=gTm7VJMJoB_yH)Dut*-r+hE$8UPglhPF-^1(h
zjo^%|G_<O*okz>u+FqXa6|F_+({S$HSqtU5TkDe_b?KSYC9BXA^vqcvjKe(ZdvkVI
zWIn0va5@?v;pgYqfk;yy*$EKchr9%jJ$VX}QrFd$?)wMsVk2^gw)?FYH~T_7=jYeA
z4`zplfBkxTEPtcF_rS}F$&sMdTDl<Rf<vYbwzM>Qd_dWa#D<EGh^M2`Q|M@;%ta}(
z`fxd?Dfr^U%+}tIKo<Dmi#=Y8;RdD^GS=3-DojANW<>6N&trFA@b@hcyUyIJy6J#w
zEj+}9gQVC{!vI*(DizLw%!b*NI_!m!cYb7;JEL0lQ9zVX8c?1@o6PMlwa4I#^6buz
z$`ARl<(sZo-RmIhK*|vk28t_49a!zLU&)HfI)PTS!sJZ|+yuWDSP%w4sSe%~zD91*
zVbA9FQ2d9%-OoF^b$KV)!lf38!VAMVh4XskPDCr|Nm;tE$zbJ0$t|*t^9zEFxWh>3
zjL$2PT7)phf)p#tn@Vvm??~L3+VD6p+g*aDJWR)Q@vW==7Jsi{o;Nte5)5HtKMU#h
z-OrJlP<hFPhVYF`bSri^b74qv^r&<AAB3Zw-Ihu1GwW}jDr?m5-<Ryq@p?D%xWk&W
zyJpIZO3ROPcABsD;64OM(j4dfuj;r_EYO5SVZkSAd*oZe*^e6=q205zD=YA8(ya(&
zP|AxJFIF<4p<P-hyR)WBNsK7c1I(sB6Z$j1B+Y#(EX-7g@vScYsU%m04YvcR-=_Q!
zw5BeiWCqxo7yrU8ste5OaF^T)U9X9CNg68{^3BQaTMFh73@iOk964t0waTr;P!8y8
z7_O6_Hei)}x>t*mf)MrtLu?T{c2+vxI}4heI^sNnL%=sMOTLN>WhEsaJAK`i{moEg
z^Uc+-9>4|wfeHY2%;m51JvPwg^6cfM(Yn)*Enhi{e*;pqQr0}V2$3!jwDf&1toKP-
z^yqv~`N9Sxzdc_u-F+wkDmZk{J6G{4T%Vd%Wpk=jpROgPBD5jCBDrr2sU0THhW<7`
zOTPIX;-_%1q()A*786*qXu^&SO*%X%giZIDjcbT%@Ms*w-*h%74ai;ZhWQEvmP+jy
zLFNwX+XrsO6SRGu19N5agVs70N2=bEJUBkztN+lc8vS?UK4g$^VsA-dN2-c@Xo|A)
z*pc-)##3Q>@y+`PY=~#L8BRb;D7MOHR2X^S#N~tRr@(|Gz=dE=e-TX;3wTj+UWu3e
z2jxN!+Pmx-f^PWwli9YCK}n*nK@9>vWUc8dK<xE<pPoKqo~B&u>V{Ja5Yg5a&YoM4
zF&TmE07^zLZ~`KiuSCja?;3k&{SNB+N_kq+lccp7-rwW=O=BKyis<o>&^_-n+ePpi
zKIW|3XkwZ!>C;YNQ41>;8IuT4Ntj!BR<ipVxxD|v2dSf6EMXc8W%Wlod)gOE2Bd=m
zkZ?{}|AlPf8PR;Pvuu#QA*zYnSV}oC@bWZ`EikHo#=DT!QS{SEw2>i5xZ7T0N-8cc
z_I@`XmKKvmw#kKYuU*u-l<mR3{&<}z9qyy$?=ze{J7c(1WJvEIP=dJGL2kN3OpHqI
zu)d=GggYIl)oD-513MDo+LjeIa`lD`oDW+k{|!6X{w8TIxM)}O?+qlYk^30HUxVMi
z8Gotd8(#C9cDFanHzhMP{*Pl16)#xJOzWdZk6xMq4K=QOwzZtPy88?MfuY}+i=F6^
zZKN~uEK(Hm1bBZBx?RNkPl5hlarf1;tX^tekneI!+|iZ@{-b_8a(55wM(QeW*|${;
zYzK1BAJ(kj(f$2@ithghs&%3boTa*PsYP8b-Qwa-Q0s1>@E7nMC~buSQA2S{5cGbg
z8yQ3IeI7_lkk%cm^PAMaXgyr&SO<*y0vs5FaRyKvi#C4W{SLO$wA=Nvx`hlAAU62%
zo~7hee_uohG^<$-M9Wt?1UR?$Snbjlm(!r)-bH6FjY{_Qz5}A%5hTcE6DTdcmzB@d
z38g>&^7q&8-yMAUagSgA5=o4*MfjprMm*5Tk|Y99(G*1ng*t*<9AvdCX<zv96AHxv
z^)7KuA#O)U$7yajQ2wkwzCyziqy4n`{z?VF3y1Z(uVVu2tgWqYL@xjpxCR@n(~I~a
zAX0t`l3T7{GhOuvTkL;|NXGWbE*70JNI0S=;1W8gUo5mCyrS$-eb8_T(box3%P%Cq
z2s5Yp3oUYdkE~k&Oh5(@Lf#)sJgF@VOS%ih8O?C@_V)lMgAJPNLp}JSxv$a!buqWn
zqC*Jut`5KnvwZg_yyC`Q0(!U~33Y2k&lpa^VhKJ4nB+)p(|ix8-f06|x3_~NabZ=y
z=<IY`cPDjNvWGqYTAf49FYCUdZi*2f{5a6;RjUl@uT+l|IJY}epPgF%YUx{d_ffrH
z$tfw#)iq0B-=1rE@8wk&w7kFBi3o3C{%DFq_yHBkPpqP%B3C$@N~l<7WKtg1$YebV
z3=D*T!Kn2>g_7LXRnodCH3SbU1RIg2M*LPLy^euI;`);Bd)~LuL3E>Hv-*OK^d<v(
zqm>mEA>S<Kazv|^A21tQ9RX4|v-DhGXY7VoBEom=wn^@C{|3Vx?K=F3qg%yoOOV<}
zZKLTFGwz~kC5T)eZ7EJ&HNr(=dlGd{m9%Hesi5PZga7Q8TYGSPk8Z=IV{DzI<GsP~
zlM*(hw#oN09b&rJ%aeKF7J35$D|r$g4z){;wwE#V=9qJe=YDa6)CtqCC60oI%{31j
ziFz3}g(|^1%2ZVeIUdbqIVt*6ih`umOf>F(Po@NwpkS{I!%OxJQ0mP|vBa`*O4t!v
zZ-8?p3JfCs1HjvxtJg~|?X{YJ_}8XZlI>dRA5e*8`EmR=1iMVL1Cr)vo^RCL-{QR)
ztSz>Jz69wD*ceHY69l2u%KAU%2l$tJXg-?DQ66V{c=<RL23R5|U9S+B&xj#j3u)P2
z?_kRmBgdDhsnyQCcTj7&c8M~a2M24W1RdK{*dQ50-IcM&C+>)@&4m!k-8bg76qb<{
z{r>Nno7wF7{vXcYWg;DFo}Q(1j;HmoFv&&HLM0aqe2H9jx^{Y1L}CT*UQhRt7X;Rj
zJMzUAP$t%+e(rvqrrr_3FC{EfiMn(Fm%iC6esXm(Ki56NhuM5!1OL_sMkXu`Rh*-%
zoo5Z}tgUtH#=v!F$Ex##?FgkYo~k|TjQQO^a->O&Z}}u#BWH`VtjBw6rn47zoE)k;
zbQF}v53~2)h1~tP95o8mHxI6$C3cp_+sga@$sV&a9SHZIMBSIsPffo}x{q_%H(Yed
zG!$o{wcNiB?l8iMI0kK`4NOB)FR6vz&KGI?s{_>up76lfjYjaeEInQWYiu9vp4V0~
zo`!>YcIu{N4R>0Ndh3KS7m}CalnHyNantUY0i%(N8X8A+sN1y-nrjmqum(M6ecVYC
zrck8#%Qsx>T^T1pyYfAwx$&R$8I9G#*;aa%V(llw5-yYupR`becXnr)RLN)0Pt3<#
zb>QUV?C?nBvUi<#Lvd(ce(X?np@E3`2<6en1Pg}=^hAOhKguCuQPFB(4}W>7&N5X?
zQfOf%S)u5ckH<%7nvIrp5!)HPwuZG+zO%SnGQV<Eaw_z8ChX##&cpBZ0cJ!d>yo(6
znF*CWHSuvN!jH9)wnor2oujie$s_8MKhdOm@(>%HRn=cvxad_E-fIHlHCzJk+yq76
z@i02CRxF2#N)BxCuz%U7E=fiE*Hn@#z^J}QZWLkKrlu|EL>1gQzbiBrfA(Z|>AqFb
z$$O<SXqYRhpvXUbzNbFY<0cvKl4j(CzN58b7*Kn})*7-WMU$*jA`E{y#toy-_O8?*
zhwk~f=F;46e7ZzV9C|wG$(oKjptA?29r$A=b>7>F=>XHMUS_B~Y#6EXjWMNRHwIN$
zS@|Xq=k%#`Ad7i1&NUv&_e`mQ2u1q2CZ7Gu=iiCn7&MBWv(+d>^DC+(OZ(3$bX+03
zwvRM4G_(@onQ873W}3-3gwu7Ss9RCHtwvb-Whb+kEk!n1LzYk^8;C;z3EF0|;^EWJ
z?}{piCG=!DTV4wXyzk18Y-5#+?xT*lsE4`V;#I$1hVeBC*k~j@qh4sDJ=?MR$?3Y{
zekRTq{_PUj<WeW-<J%)Y=53!`@t-!HQyH)AEvj_hVpE7dhHhM3bKmqJpbR<AuDxjA
zA`#9I#he6kUwV2z$*Q|mavvS4T@%Lgsm_|ye;TfMoDuq22S~em5+zfFdH|k0va3vg
z^7Ezh-C5fExbF$d3efwGj-uS}h#f*T;InMcdf8jPA_u`4PuIHcMHk2RGeyfc-%Ge1
zx&#KvG);A+X!zX4sK1eT%9zW39?{U*vcpwqel^`@6Q?&<;CW{wMAos=D+=}cL%PW7
zFZz&=?1{@yPSK|TvB&8Z@zLym?7tlCRDiD;kG`QFK4<zjH~Y<I#O9>5>w8}aHQrO(
zoO6TYq(=X|ybq*UEb7wuRUjJ=kCxM_L%a7)&TuzX=&Kd4n3?%xEpvKi>m6-_$K5jX
zL$|ths|uB}g2-VpEm<-BnjCjsM3zneFGPvacaKmgH~zuJX?r|X)u|q4l7d~Gbl2d*
zZ>#Ox1A@q!0H%UNRd)2XmYYJ}{Cpe=H4KA<qtI~nWyi_@N-J&`t$&U!9!Ig5^wQKl
zPtX|(f@UESu?jUbvh2W_v7>M*ih0XEP5J_Y4@5+erwyS%q`0$s{Ev>?>+vz28cD}j
zQza2-V<gx#JpV781CdZT#XjYb*~(h8JnEj|J*4iVMbIp%NEspIcTh`>kAq|#fli4U
zYJe@~3i>K9D<W)%F_o5Uv7|5kbP1a#Xa>`IRF;gO1Zy*h$&wwY4LOOn=^5XJX%T<z
zr<@xd$7DK>{x;?4OG3hxlg81Pb`|z{=gw`avWs1rncz$9m(J)2qD^V8BuvFdNc+#0
z<HY{p!uYrOaVn+~X4EOeC5WxgEFRjw@|{U`ButZKqJ%*vaxkD@0#<yY=Mf#v6MPJQ
z^hlqWF`)};l;uM>Nl{QM;%sbIx>msK-N$9s#n0qG$ASvAWHwAIn?j;IZn=enE(r9q
zD|T$w)YLqDrOcE+7Y02Ez|H)_0OT_U;R5RG9!p&zPn6jy;%%;=>`VZqTq8Yu9{6xi
zL+ux*ro8zLW~r+XZ~mdHYfjMOFmB~INJh+xb@4nOT?5|h!gb3RygroCctd9vhECJ|
z0@UWuM+0=+pFM3K`5+jG@Blwmd~nw7K7Sy=`u{y9M)vZ_#b|^({h|o}@q$8pg<E6b
zDHpwMFmE}!O+IGp{R>9_JEfM#P+<eq(qcpK>W6hcmv$@x@FHUXX%ojASS?l?{!!ml
z=_xh4mJ1{Zjk0;aQh#dq?&4=M{{wdJzqIyW3B3P&OQ!$oM9UbEjg<d~z6|KmH5LGN
zXDfk)K$!Ya38R3gbgn)Qd;kjggYP9m81q)M0IXsF&ZWM$q@;u)e)OmYfis6jesTxy
z$~FQT3FxX@CP=c0TWqh{uoNZ7`OdDc=xzW_i}1O{-?t<r%@WtTuI5OO5F;Xf>m#&B
z|LUpOtm)hO&Qf>KINHxg?UIW-7Dnr;p2AJgDgY=RJx0y10f7GM;v={%P;mSHUcZAr
zp}t?$>?9O;^!pRQT5B2zl*nc)x(N`g0H;1-VYd_}2?zGX7?8Dx@wq1bIuO8A%}Qwo
z$R3ZZ$}~Najns|M33md};V2-G3w7_x*xMBHO_>IW+{P;&r{b}d%k=wEp4x+2)Uf4-
z1x=u(?WABfqx=vISB-77-p@~$^_xm}Ol6cWL#l^Cce6{Gpou^M!p1b9O&B%}0(5U5
zrPkKgTpfg#=PMWaP*Y>?g!y`~Eg$XLF>qMRt3QBH<C-;Wrfybpz*qj?$8*l+19pDi
z)6co7`1v@1l#Wu1Ek4>?e{g0th*HAEj)lpKRrQ6IL{T1TTFnq6c<}Yj#VTqvA!w>S
zsfP#%>+)egD2^oiLL$SN5PqCpn704lpWRkfyNpz;nx7X2)E4M>NwY@tVa|OKTsEVJ
z6|NRqxA!yz4Mf*_vqe9(t-P9cgeLV%63;6%B`)$Yn3W0gP&mNJeFaFQZ2$1P#q#sa
zZ*{Xn{$ZusXKfz%no~J%wj3h0zReZ?bW{U%Wog%;NTU16JMVT1S&bhh7Ju*L3u0by
zb3H8A#(}g201P0|VF21*bKhXi(rg350=Yntu*Wdslo_USoUv%MM3W>lz`DhG<U<;$
z(DXWCuxRJms{L6z!pOm|_;EebnWo<e7}td>{GFI!HJTU|Vnwm}QFLnk*HlMKPgQtf
zkl_#ue0HrltIH_{EqT25**!ogAyc*gqFN)`BPAt8uM?y&p2#`}^raPWK|pN8aCE+?
z9aVDJ0~7k{_ruQmmr~6#cg6B+y@<bN-D_9TU*vl@h#aOJ3zi#9xf-<eH=jLjmE?cd
z{E1}6OwZZ<4nQs?u~4yVSqtH7HKFbNE30TCOWwryY9{Vt_BCA`LJn?gGFKC^P4)1d
zQ9a<3W=|-rc5SRrCh9paqZ0+eVYrOv9@61O7#=|3yyW4_`<_w{X3Hs<C<F3AlOTYv
z=^2YG0CbVrK@&4TRUkKLt*8ZNwr0J!`^u9a*S4M9_H#4JF`N(Cqhn^$Df1(Bv|HUM
zT1aq1nlGRU40jg@EH19BFIH_RzDe}1wXgDlct@$dcc)=YHRLB)zbJCOlui*7Y)B!g
z{ucJvRTw*9T?E(#s{luq?f(?9dR(Z7+1>!CV3p{I9giB^$99AY^<n0DK!(?I+tJWA
zw&5p%)tn9#OYzr73U{qlDtcZpqsiL^m0%^LRjp{UwISJ(5`ltEco-ObHXLxLJ?}jC
zVUlw+#qhFb&wJBtElBN(1)tGVdB8MZvzABR89NM#pQyj5s0FRPgRH&hd!&c25Z$jP
z$`~B2G#9<D5Pwjt;N7Bz$ZmiX0<ODJ5(Q*8h-bYGx1Sbz3q`@4Mz8Ld(o`isE73kQ
z^p08L(_c9k;O{!T9T`B^_7!^00!#!+Ul0E%ZKUQ-Ls>z>b<BFvKw~A4PykPil)#KF
z<o=)^h*;NKrE20({s6|CAkLBSH+3lmPo)Oluc-d(PIGK8LkURRCBCQ$KSeiOAHknx
zhZ^5N?Bnp$Vtbzx1<adTClPxNZ!<O56#Zfe%24?Fit92Uhi!?VZ2|HUN~n&rnt&?z
z6q(HUa_w!%;H8nr<piJtEq&$bLyHv8{uMw<X5TuJp#BHmi#KH{<@>v`Q*c_Tmvwz$
zb2EI^BhxUgBde2ADUTb_5)&yJ*I_Vtg%;==9b0l~trD4ap^W*~@cvn3XLCNHO{`(b
z6hR)k#K_z+iOI$-vhj+&-9tDpn^h0TB|qVkw~(;#@A1qRh+I=%*yKmv1XT9w*va2&
z?f$#smh4FD?x`Y)h=k|2v<*rnKz^r+`K1aef|*d_mK9XlnXtg!=#CgOe|xP7`FV04
z0s<WIkg5Z{`q}f=%ROpQ#n_ULKnpSWVQdB8NLJ+_Cz#HelbPF^^T=2ilwCjow9eHi
zNkrkYMfRg3gJVRX<ms!B^hN$YU4RO(DWQP8b$r(|wA~IrkCFo!SjRfN$589ERH7tN
z^b|9NxQSY)4`kCMcjHISDs7m>N^X+yp}<h)Pb}x*l2zrT{&kJYYPy_Y2$q@5c<A1@
zr|o`_9<2?im5*IDA+4`UYX#p8uPBu^_F+3shH-y9A+ZJ&m6W1OrEiupqS_a#?VJ_$
zIuKO`j`dqnrPzVC2x4umT{y|Ko;b<42xySgTa9|ReoX?z<PmDIWDJ+`GdL{o!}hk-
zJ5kw^`MmLx@R!YX;Xd>53q_ISI6-%4jf=)fn>9q#r{D}g&pVMmUhI3KV86s{o51D;
zK^SrI(teI|7P0obdf0Dx5-9<2mz-x2z}o44+}G_lvk91sl1_<5u8&-()5to75t}@&
zsHpfNdD>#ak5M1w;MgF28_*)mU3sDs&sf}D)`DJSOn0VFSn$_6?SyaQ?|Px<KgeDR
z51_h$ZF=ONIkLMPUrGsVwN?|H;z14OG*T*2a7K4)HL(R<GQV>O++BXa4K{fVT2~pO
zue%iiEa}_H#eXw8(L&ZN5@6Uhu3ohlv1P^RZsx34rYbK&ithYyaQ|NJ5_F7qb>R@#
z^<95@^=iX9PgB2vH#{bLT(*b8*<lg%*X+M8nF@<7rI>o@(L5-4#$OUJD52<8uU<>8
z-Lht4%k>-#3?ROu-;6mptL@qhFxfGqIhia}*L<Jqnlej~D>N%S{JCw!-hI@dfoDgR
zmCGXgf`Kdqh%%4be+B*IUhs&9b+wihv!W^{(o00^3ghfuY>Jk)!gOJdCK6ReDy0&M
zYm@6xiB>i<YQ$u1u&{OAyaSP#%6wt7{m^0cN4J3#7r3_V^DGvCO@`Lvth4pYL>_$A
zW0bGIH)Dv_-PBcrGNa1~%}TvkFQCo>$YXe{2%b&29eVPRuUYyb?!9Fq;7+#144}Ju
zoAGc52nWDp3c%uuZFQt5^97&?kaYDN2gp&yc@R|N3jcILL?^B=bV-mBwfg|D%s5->
zfK|83yiV@2efiM_(I$yI{pz!v>KB|~=YCrjzm4B$GCR8ZrfxnZs}tb%cqJr%inxpX
zo=r9l+wAVYwYkxg9GNv|a{(hM7%G}ReC#rix$>(gE$?Y*?_O2O^(G41XsawCp2BDa
zocJvRX_zH2kW-wV96eWUoM<8ACF>S2B)_4Ee~kx2<_=a-;dN&lnI$VlNpQ9HX}LWO
za4uUv*Lw@<wR#D7E@khb$wpOtChe**aByI$v$Io-hibXLe!-CLb`{6_SI5_DmX8Q|
zR{2tuw(QZ3T{d3{fP&W-pcyayW3d<ss7--#M_d%EJmfQ#yjQ|PLmBS{8<8yn`vKFg
zcpd*(>ZDfi!RJqu$C&Cg9VL#hzTz)2q@)Goxqu815E|dhbx>ckn)8mgTq*Kgmmji;
zJgytHSeVD6^0tE3i!sABHr%;gk+pn{uWOoi7?Td?j8LQ7Ku;H-a3W`P`OPzwjk|+=
z3<mP%9@bY%omkGD_kfX)A;-LUZMs{2B-Ox-rz1LroZqG=xrg3+HlyOb_5VEqiM*IU
z-yol=15~#G7QG#?toIV3C+F8!C&xGB!0-F#pfb$whe0kJS#4|0Vt26Em;6~6Nx}e|
zeM#MzWK#wnEb9CmT}ZlGMC<YT=q*81YAsc$SzJ*~n4ZDjny++%Yip~usx9pL1`6N<
zp2BRAegcehgH}Xfm^XCsPl}+9n*db^V1(!CWQmZ{JWcZKJEv4sw#{6KZEeZ&yVmZm
zE+5Ym9mH0{<>FF9j#xNN?Y<1o6iTI)q#v;h>NKPL*}(2_;-MCMuM#>87i&xf3gQ5e
zI3xGPW|yJ3xcH4+t$`Atylb3%DUDL<xNi~!(qHaO*EqeWD%%lP`apb;{lgBac$wfo
zV>*j2sB~%y*kqsYy~L1EkV}sl48udYuPi^7KnmskY+gkG`*y_^JJhE(3vlr{B=?4S
zBf1^lvP7V8WLO=<Pmol>#pl1+w94<Fwt9mF8h!fi9jK9cPi`H^4=5b|?&pQ#ES01Z
zVj`GB?m4{CAnFTv`?jQeTjY~FHNPuTXcWBG(-{Ncs~3{{<RRT<A;~p-3Z7<{XSeqv
zga_=vPcr~R`IGQNQz%;r2;Z5%Yl4P8=EoIs4<UiWEMmL{l%!DY@gK-sOI*9<*SG64
zs*DAqNiH7rfSFMUwKE-^xh3H8Mh;;qSM6yf2hj;!xL;tKq~MrnRIa<Z(u_z0$r<Ka
z-EtPF#o~QC$d#eexa6XcpIHbZKo{-#!2|duaXox#9MV0ZFARblKrh(kLCKnTJe>#2
zVd(;3%6`X~0PZS9!bH*t;jSU0wP?dJMM5MPYQ{)^+#d^RM<aCO!Ty3fZ{L?}1d70o
zw}D9cSDwW@y`~Tz?d`?}QeYEak-d|XQfs);<XjyvVv|V>&0RdMOuakI)%A1$*Z9G@
z^BO2J^Fx3vdZ8}=e1qli<2h)?mRu%zrKP0>DA)s;)p(tyoi`~7MY3CJZ#N9wnrKtB
zKdS5WGGER<!JXEBU_I`%mlu$1jZ5TA4dQN$V7+2=(9?_9XX>TXg(2&V%Ee!k8?@Kb
zHb_QhYlYuJX#UiYX8xegFUq8=;z;6A601=a5Ek{=>ImLT-h2DrquV68!#N_!?#I=k
ztD-wQ!atMPA%pcvwy^*ibSVmSLD(?rX*<Eoa1NzugjD2`5aw=k)zXg^fPPAJk5mNM
zQRN^5GyT;EPHGKSE|f#%?6U^(a{pl83@h!UPmWIrh7!Z=ABBsqc@t`#S{CePC_~JY
z+J061xAo!2oJ%;9X1l<I*4_Dk0PH}(0N~m6A^@?*JwbiUClB)V?E?Uwud^`gEER?E
zwHai@Np@>heR0TO<$#UYt#lUPMJ3r|NeVVMkn)?LS|xa?WA`VPAkj(p%ZRzyRVy|7
zL<c#T%_MXy3^ktSt9#5ipR1E;JzPBHv006&*X|P2okZHDXm-z1pzAB+6C~?lr^Rzv
zOfB)G!k4lE9yfRIBVhGT9H2%&+p;-V*d|lF_&1PWM-yuJCnX|&eLa_>-0Y+t*~w4T
z27q96!?}A|t<0hFW3kJa?#SF+Quu^QVEd+Rv?mm*mfEzrF@w}|Uh4v1OM2ZKZk~Dv
zG`j=oB!P-$de)TNd<y{za(=TM5T)<QK22@=Qla=?04ou_r9llp%>LlsC}x~rD(E+y
zIh=f5E0iuvx1dCRXMBs+g|SuXEj}lT{<yB4f`z;+yl8i?`GrJVkaPIzt?JedS_T(c
z7i~MT*XW+x^MQkWfUf5p-QW#M=jUhV)#A@3rxqApuU+7mEG&vY-$H2HKGHvaKtZk@
z%q~uTHXzqEcz;^|UFyAN6Hw1Bod)`mgP4B1>v+TIq8+uC9UFiZ>dkVp`v#xFKptl&
z47;sph6@cFgq_2m1%iF->n%@Zxl#5ippMpEd@Z~M1{m5FbXBFJS(esbHOWeC4s&sz
z+A(thqUWv0C#W9pS34LQcOIk>NB2~vD-WZ=0_=~;qAvU@V<wuSfxDO7^bI_ZF@!GU
zM?GJ^?B<p#*%2vUu@B@ZmQtL)@2Jw>ThuLPm5GMA#gT8GCj&`V*HL|xa&v0`Q^jl>
z2uSFY{A=q-hk+Y}lo4^aXX_R2-8YEwj_1NxN@>pld^yzlY&~pIy3>p_|A)|`q|oN7
z{jLB#7f8cmsbXe6YYpdRX&SzE)(S$EoGusOt532k&cFR><dOCzDdV<MHr5`KYy42&
zVzSO5dOxW4l#;E+)=-#{S4W>4Il3>LK8(X;o11r7UBg~;?AY8^9WDT|EXTLK@8@^?
zG3^=OweE&?GrF)Q`B4wEP2+gG&tG{y`-AO*a93Os)c`>;Z!fG)W{781xqrSrp$!mb
zse6JSYvHcE9nQLUUb4}iPa$<&vi^mQ;408*AHT(bGt3u`8&U_~UXren1ho!}W}iL{
zA{EOwRQ9<f-7vC!1rey(VF(4EUBx@psUHK%p+X<Qc5dOVJH7=wK3$_*qDH5&?$Z{c
zrStkPlA_x>#7A%UAf8uqcX<6I`&JFy;hZHDx}4?7kaeD+LpYc%yWLQ5n_ltaPIA{>
z#T}#du?Wv0(7^U~0~cHNooTPOO8Rhr`s*cWYl+t%TBAOvtjn(5v-;qhT3sF@c#UF(
zN|6_1b{C|eQmmgBN18i299kZFs`!|G9wts)xL>aX&k~-1RtyFOLhf+{)v5fFH=q{y
z#HzKPs~B~;i@;iX5HwA&F3yu)gkkG2nJvENdNyw04sOUB7NBTJJ$dINbDt&7xr1Dd
z*O9PU8{nm}fU0tHb^bzhbQ!GBpzklZ|Fd|Y+rq!)T7%-;_@>snAWmwA$@1}mIyfga
zYh|=PjEDtlu=FJ-UsqRGH+a^x7pZ0WAd`I@w@?ob2}xxE0;r3A!_!GH%DX-zsE!F!
z)uF?}hOrfbN>}%>Wh;O?aH?7<>a?ql1`pI7F&1{qbD6&ar9S!u2C6&Qn|FwbQi20T
zD`?3k>1n2bdr#8jB0oU;!Z)k!+JXse=aG4A%9Sm<l&h_mUV_8;eEn|@`&Q7%uJ<QE
z_!pQhhKdG?JY^-3x9gdu@SpN!_o8lbzuf^2NX8k)#kbnT{*8Wf9PyUjl{a0oOVA4A
z%1+HdxSI&LTuqu<{=#wXd(!oQudx8Y%DQ#WCcuH*1@~H9D;cjn?sA=QA7@~t@Vz07
zrJr=S*F+F3Vg4+ftit{fLbHXbz070uhAV!lVx0a00Lp9j+Y(h5!L#hyLtVDpYA1fe
z@c&LhIj+s;K?d(KPQqOJ4q{y-L{IR^hHU#i@gSG-f76HnoX#hp*2}*F{s%hL|MiGB
zsMZAtNGw2eF+r>XrsB|4mwPtNZZ|tLM#py=Xl*vx;bdaHw9FqfivOQE?&U}zf<y=X
zZ5IR(+Y^#5s+VVgZbz=-N}j<_oLKkat5z-0rVPGZHNIgDkVf?aN3j>~msl7J3;+pG
z^9-Mh2Xd3<(4gMGP8?9VLmTc_%>)kGgBt%zv!yBMXngolKbI$bFgZE-CGXet%uE;{
zm6G2YIWFA8TpT&7BYicn7+_l)Rzkkl19guE3f7qgNIF7+97HKPusiR>fz&)G#Lcej
z3h2Fcjh0h-z4S-YW#5Y>fqC<U=hECC;Z?vPxoEz4OK&gFwC|Udo!s{6!9+=CJO1g9
z3DQGAxQbNw49qkBw0a@ej)H+zO4SBz3;7&@fo}1GC@(W;Y%k2LDb7#_U?L7~MF5JE
zWj@?rGre~=H~rwGo#zISxZf?8FNA5{S@<5j+`zh_3;v!Mqwx}2Qjw@%Vl;CC7<G@;
zh{SAByKfSpH=zS8fAb7q@oH{t$rszknZ5nmg>Z&%fG4WJmaPzJzl`nKt~_JKD%$Zk
zfJ|TPBK!oJ$&n5Nc&h@TY|OU-l*C%aE>H*Ns+UB146!U*LP#Rn>#^|~la}2L7X3(e
zoRP<y#O#-nkEVaxDi2{p?mj7St$_gA5{~8K-Vu1G8SU|;UC_p_sR!S*5Nl1lrM-<0
zFXsl^TfS}fjq~knm=Au;8@@c?F$5NE5p|QacuE_76k@`B?GakqLYqoVjaIudqgsk|
zawPj5G^sjFJO}hS+ke2$Q)TYcryK8m;BdmRH)ucArG*9Wi$6=;+7ezYFL3z95FUA(
z-|>e)fJ4+J(y#s=2hKvb^mOvYEUdI{KvEQ!<D2p}P01;Lc^5Da0MqRujRbClw2s`x
zEyl&gjdS`hmbrbbtgI}{s3rwY^$!^oLdq8J&QR(Wg4em}^X6?gH`UqD6~`g<>7N6*
z&R=N@XI42MmTIg6i`96wGY<1%9Oulgs&~O!R8)dvI%FrPKQ}=2tj39B?ip3@t%1d|
zziWH$NqPE?_Qo)H_4DBFY3bd&#E<+;_M$3$S@jOKJ%s{&ksa*S6miR>-M!#Od+qkj
zM*gz6eN{hUb>w(=^T5@UK50v5Mv{Z8Q?8!Gixw`2|Il23oPTRixt?s}E{)TsOfbql
zp!J)G+;aaeA507JOj)|e{KTp6%3<ds%s`P)Lj#Ma&CmQ=L*hKh>~Fqnxn;(|82aV>
zYX*~;ifhd~_KL<u3FV3q2&`x3o=*ARS%og0yESW|E5oJ!QgFwK#Y<LeN>^p9+l!R;
z7kp+WPLIpoub7)(I8z3Z+t=d_+r&SvEAn~tIM@dk{%Yu<WKmw1lt-Nz%2QhH3gt*}
z7+m+Al#2asXpO(pIj9nQBT11G|BJN~bRVtyA5(R;ZyqjHII|Aes-)fK(r%*$KIx&S
z!Qm6_yg3p&M;Z7Hfbmup-nORGZ7m}`y`zh?T?e5^5f?Qn_i(zn17yR{)bfzQi?h3p
z$8BfdXVoS1mEeR8K1?Mp>AcZYY)5zP)n-B3lqsJY5{Lk~xn&VPT<j^iE&GhZ)R|dy
z6b&1gbm6{-<S^u~J~DwfmvK3)qsTD{G!-~50O}Fr_f_5{Zd;4%?(VMPGzo5|XTgMS
z7A^r?u<HqFS-EWUQ>Upgmg8(Xz!mAx$;WJ^@cRQNvkl)u69lK_t$Z)`z(@Ezt+U-*
zw?iWp?q;)XSX-~!#5_pcFK9Aq65Dup;s<tK%ny$+VVRBCbSEidw<naYzYn{_w$Aj|
z7o`=R)+R0lr(QDsZ8Zw5)7tEd9eE0qKH(k{iMm_w7mOW3gZU*VR#UXy4BmrJaI>wv
z8~8uBz_v;&_8$#N(?k_Pjp!r0^5sleuLna5Dr>*i4whr`%Nj4A#QK1!0X)@)iwBMP
z&5J%QB|AD3<NG!5(>4L6-vNOj&!Mr)+$_GIDLpoF2&V9wB~WLA?Hj@|m_p^5PQ5dQ
zX;{zQ|1?Z@*a((Nei=f<Sk65JW}!5!qN!ieQIYiX{bz4v3rT&{V4@1eZ+YI76mZpl
z<gz@NzYw`v6yqVyS+OwT6);dp2E1{tM`~eXSO%Q0qr7Ib@Fk31^xQK^c;0b*JGTJB
z_cVU_E8ab*giWv-;r)DTc7wQI;a<P!b4|btTp|Sc--Y@H1dGpZH)?KWo?+o`OPFqb
zCb;_svGluog5l6F46|3ZTg@V{OHo-lhOmPT6fMt=occ#i+Xn9@2~doYcdE`jgJV?t
zv;e(ruH{`+Afc(GIp#nN;kkO!Np<j=)m%=pTN9O^m=O1;#{epb2g27Z4LwFK&uXTu
zqKd8nCksTm)sP*%no58mE<AxO>NDw)z697*uWBLm>uN`z=!YXkHK$DyaFmCc?Ukcw
zb6lc+NNFly<ODBsF#KN=5Z|f7Ztp2tRoZ7v4jwD9ulnSkn@xTGkM8F9fv@J;Li;@L
zD~s*%2=$`V_*$nX=eM4miJ2TIt_^gvV+<%IuY0eZ2*~VsmsBb-f~>6>r!at;T)M<H
zqs6FN+F>($d?I~FLZ43B&=-V+FBbK0{o35J>X}VB3!LEizDEXXk^8lbIc$2=tIq5!
zE>Tr?<oB9gaYyleY<hA@xA1@kd97^@FnIv8wAvX#WN<-v8z?~hqn2lv-02z!>VXq_
zGci;7iIkj$(NTQHI_!H|knZInRcYW<M#m{yh6O8gn(4G!P5K3ic+7oJv#24Fl^%GE
zb2~Evxfwp8&7Po^WYJV1hCW&O-jW?ZnWLvTmfkSf4D0o0PFYutHO?u2K4WbrO@}l2
zfOb&X?bwWA=C-^3Pd;eG?t|}HxE$9wRMv0Ny{mbx0erRqkd^N17W9(1r{)BOpRAc9
z-@dO~um`C|Fx->gSC7-uRlc_fQoEKMisbP7iN5J<`Kb`fs5Hg}`?TfFmA4ogTafTW
z2AHOtWrTzo9isZv@G;|jHS-0Pm@j=Wda4DdXKYpIJEWd)gZyi2CSnU6ApKRgOU(k>
zwc&5F=>6E1?cDZ*kM@t9^zttSl=s^k@V9pPAEzAhQ{d-{K=FnD0o%w4s;vMx1up%l
z|8ipJH`Dq*La47&afAu~Gr$4W7JgI?xFa7{Q19H~edePZA0qLO5cPuIDzGG{tF&R4
zkN#+B%Up9|zmmx|;(z?VJ3ghqghs-JyN#&Te!0gkU6BbHt$XRVgMZ+qXD_g|M2?0t
zI^H2{X6h`1=KBk`6&V+M-!VsrD%`{Vuh!l>s;RB*7X=aBB7y}$LBO)<Rir4@iqdN+
z2}NZ`YCuZp35Z+VN>!S45E39rksex56cnTr0tAc@dML)wA#i5g-}{|A&bjA~JMOrQ
z{KbG(X4ahRd46pI9&;JX+y>Rgs)%-|O*HY?=n%C%&TDjN%s)9Z?o5ez`;oP#;5FQ)
za?m0BfJ}pWtNVfCTcHPPH~<`K9Q4~gG~LlwoYoE^R&*r*;Jge<ytwdpoSYV;0ia4;
z0r)jbe%vk^wkWI>TvZFINK>CH^BypugCw}ZnErbJQUug>>Tvz6a%A4T_e~U+zXUOd
z*6dD_?@qsvKE*gP%)@G2KSaF6MO%&pBAa1Q=z<n{H!yENCgit9gLd<Jy*;0))SHt_
z8k~^P?1j~!pdfrL5Ttwu(*9sjccsl4R#1k3CnILbl2wX&3c6|lj*P$UMCYfI=RC4i
z!HwFgtG_s8zX8M?j|V)z{LkAg3~?rSG^$fD5v8n*9Z(&&&H|^3=Bn8;utx~769JkT
z3>fs`#3MtqEYAOj{)c8mw`yxb38<uC5x0|$)3C2RJ@UI1BAzZ?p7Xw0Z7Jy9W9z;j
zEJzYbHx<x%UP8r8sPn6qcC|JSG*a73ecDUif{Ku3x8lTH(e*(^qmHU!#q2XAVkL8u
z<itmY)QC3Kch;;-E}huVL}YKPdeJ=4_z5g9Z(})GUro8j4&;W}NFai6s`K&jL?Yw2
z<3j%zDBjL!7r2C>nQNqpN}O?lQgAZ6Wx;HroJHX`H^?O;Z`}%yG#nrbY95kt={G=u
z&yx)-m$%enqJgeP)k%3&L}Ji+w#p3l2d=~^yQGYgD3VY<%2$u~nfaDF<;56&?=+#C
z+g2HLCcU=nE_I<Tci)I;@6fo6oR54pQ#8FWRspxnyan$Rn(<GqV9r70VAT(Qm(we*
zj)hj=78e)W)?LwnGaY-AJ%ce;^9QAVynSqymUv6mdDc_S?_$k)2TrYK8ZXF;(!?O^
zi;NeiBmh*?W_HMAU<~hDaHih4r)fb0ZntJv?RiM@RM7+6!xF=iq>Rseg}((cRw@W2
z&Y{@VTnq|LmaN2Wx!VpIKh<(L?wrG{L9Y?5a~)GMEygqjn{LFOTB!6LJ$KFL&s78^
zYLwQyvM^Kv`y9WPF2nVj={15D*p`#NZq(-l(avf1nbx1%@ymA$O2z`QwJby}PD?1@
z3>m(#c(<s*L)ZfI!5iwL#_M<KI-_D~V0&|m3H+oA+H0yMA#Or<k%r-=tUHv#8(O^c
zXdDc%@`+FH#%<vS*{4Ocg88;P{!UJs5fb$Sz+Tm6mGKjN#6iwQu2l%p@vOeELXv|X
zZs{Og^>iWZNP29xx>12sVT)GtjU4z3rFv+9kwERh&n`8}-_7q4r{U=Ia^KkWyLoC~
zz>M3=u!_f0QqelerEYcT=S4yYR-6fyyz*h2%f%NcfJ{T7&T2Hv2#>`!`)|q49^=pD
zHg=5VdfJ5B{8dKc`u`fKe~<&;^aHDW!-bNX^dmK>-vJltzXMUI6RORGL(vOw*z9ir
zQ0&xVL|J$z&Z0n_6ZN{oZbZ|ZV*B;RZswkKuZx&@u*j)t*;{Wq{+=n^c6|v~I-Mak
zx6U4s<nW?Q7x%3|D~kDB0pu)p7o3dj*NCvKKam3u!X3+5>g(dz=TaqGl{k*4o3?f1
zu>-mIf4@Jd+DzgjoZP{lezSeqkPLqz4P$?f?oQ!bOkLv0%BX&7l8B|8S8u-)MzqM?
zz<6M~dR#tXG!D51j}k%kDP}LDfdu-3R*h2vx!LY#mS9+~(*ZwW)<iqg#!2uHN!~IQ
z*Ml?@r3x({wMFQvNZ*Y&?OKN?px~+ZbYdcHXOuV4z702Wa>i}uOM3i*`Fo4~!|9#+
zNugUp+<Tf82;x@1<w%SKH|acBh(^#>;blUPf(^w>5tp`&#h4&Op%PAr8Vjg@mD9by
zyhb1{>LFE2OG04!I_C36-8r2J>SPtJv|2YU78SKyCd6BRWWiwGmz>hTL*7I;J%1k;
zF3)-;NJ;mZz}?<eN)N$PGX?{+8M)k1>JTH#BS72Q)nKgR*EpVnIAEg_6Pv&}jqfjH
z6=Qn}gx!kEB}w0LmJTY4lg8~9YLEHYY>4(iEhQH<vXO;wx8um+Nv~P7`}~;OC&M(`
z<nG#THQL#Y{1Kh*-|CS#+o*sV`n8~qn;@ayc)+MRD%FkkJ9o&C+Cv`-*Q+CfiQjtF
zZK|0pVv;hm;Y>Wn26!HI@$1$sM(Xdu@jX5vg(P<Aoo*Xbugr+%ibK)mL=b>-(-Gt~
z`8nO0oV`_yOCP-qY4Mzm<n$H9_HS7_(`KHs!IBl4)YP3jhNgjfT}GQ|JsbMzarHIh
z!k%u|zLh?=$xUl#+@CfjC2_wRH%LdcDm~d<&k~kf|LV!#-yrNCL-p0?I}oAOv2=f1
zM5R*eXb-)bBxnYq26<PbzAp8MtFh_+XVxAmYG6vi#n|8!+sKDDsc&5$VHUZ*`(mhB
zqU}nb#UU@3M=nEt-f!>5Ik&4J%rkr5Bqz<eHxAYh8y!QqFViE=!8WdP#7qcHUCnL2
z8UvA&BvzPTcZ+d^8qn4S)$**ymWYV{nSci>$WZlVcmBZOHLRn<$XYIsmTqEwZVoDE
zy7+?0&N3)Ft0Puz+#39x`g!9voL-_ig0l!;_hM@eI*F>RT$71_(`uzD<ZL5PHtt}s
z=KQV|lCKq}ZyCf`tMYc5R(-X5ct!O1KER+I=~5=_c%@ly8D@u#Rni=+bEo6y==)0Q
znDQk=X9rx~g7y#t{Vzm$a;AU%EbH<uj~7@M&!>hmyVn|<(FKAg1PRJN83%MJL%Qjd
zXT(AJe4u9#4)P-<1-c<F`_rmmYlqY$b=)fb{Cei_KGK!!Lk7jaz)HIk8V%pkaHuiM
z@tS{ER-=Gt+9Ekx6r5S}?>p201P|Tko<g@KY?B@A?6_&A=v45IUkrpZ3N`li5;bP<
z6Rr1mEXKiu^)eSWuOY<Zz|U+C0T5)fR8;ca+Ld%rdu*Q_`-#%|M>5hQ%h@gM9L&lS
z<x3%r4*|JJHAvh82;aaKlEMSU3hYxLh&T&@>DaBRjH7e>-LT(KrQgj3&oF&k2%#d#
z)JM^7U0vNQ$u`Mz>*OU!oHfLu*tqzlPE&={Bg~w%O8?a?!J*2NnkZG=>HAb%HtuDa
zTf?5#Hl+G+N3-<{48;lp^fQpSQ*%j9S((0YG1oI$HSI^(!o6?j#2K4n$DF4wqh!JI
zz=w(00xfy~MD3($Toz>&VzHJchpJh?uvimF&>b~SABfS?Www~Sg|alO4{x^JEKOFW
z%Qow5RiH+Ba95OvI+AN9^yp$*^E<6@+)DMi27Gy4T^$^@)+D&cSXry}HQxEq7Ng;L
zfZIBqdXTN<>VjQPnZ~l4<|pUB6N3y?N}Z@%=IMN07H8#Sga|8;fo=GfaVO-(bqjRX
z-q}*|j-k%_!3KAUSFv$Q(;b7>!UlfU<0qLF=KbnG#9`H%$9%`cIX_t2MH!YT(;l@{
zyOUmCNT0W{k*e=!OOK1VY}SIEw@_jC7zvJa94O>O>)7vxmR8C22>_B|*PM*oSFv-o
zC2k41t7~_}EL_bEk#)2ozEWT>w|Mk#BE-qE<b>Gh0~-}>Nqz!*po<g|a1nCEh{4O+
z`|sEb1VY$}tId}s#eP^VOkLpw`5%x@#SU5Mkrc^R+uA-p6AalZE$Nh3XPvoglM)?|
z%0*$_k~<1zPWexLc{M4N_>~#iPOpvWOPlh&@gx<!lkhsrC}OiNLEJD~&-hgM8G!vl
zob^{x@nzB(+c#hZCxtmDI=Hd=k%y#mE6uezf8l0aGn0#jIG_S%r(4tmkR=Bl#ca7Y
ztBnx)-%C0Ql4%>yR>b8jGGuP_JsywlQ#~=oF@Sl;Ck!-{P~^p4>lbC5TfzKS09QYm
z_X7q|wz$DO5BVvZ)7@oFdsr6G-xZF-ZBoMaQFCFSsF|q#(}ywB)$8RHj}Q{5Irelc
zp1s!y-t!sY)rQ5!2e(<E=Ug$L=9(<_TEMcM3q(yd8hl~l(9Y^Mlx)0-Sy2^kw|M=D
zKDh)=QH5-zYnx=aKW&+6uAP9!4pGz|VT5Gi)hCpV#*)La@BNSYaolYkyq7ZfC{~oz
z)8NKbQ6P9O7-y=V9=i6~E2UnXoHQz))yENOLNEe;K{mFUw9V^|>?uv!dYB(ZHjlOM
z71bWNX%<=ejYZ||10;IHn+0su+=;vJ4=xv0+zWpKe_28*pVII0sB@FIiTX5T^+Y78
zob>23-+gy-=+2?_z-L-BKgpCmRqiOVY`J$+1?c7H>s{X>Ei{g)mc%|AvQ=Y$h;CRs
zGwcT^W0<9gexKFhNLD0388YuB+pk1_;5B4pD<^(BybZjD(%L0zHKe~e#~?R?Q^8|A
zc_j}3X;RvVM=Ga6OE^N{b^{ALNK$dET~VrN=?}Xn<yi@}s$9y3|Ei_mEgc|!EO*`Y
zJIr(L4>fh=MPNnze#W|5$ivRv>3l8vZMr!BkK&R;<M)4do)|y)k5_)xi2S%>Z;W6`
zL2_RD!3Aae(iyh!4|BB`ir6ta7_vqc`V*G)>6YV@F~x{c>_YNf@wWd+%pb+6_aVDo
zpf?oUHZu5|R+=IubGkDrZEFJR#aWdV5hQbLqfBkE2Aa-Q6mWv3`#kJmYplB`%!f9c
z!T@OAm2(^>oo1%;*<pJ@bLsWuX|LnM_`0zDy?+&<nTqr$%}I8}!TzwPW9Q4KNoMP^
zMJmI;EpN?zUF@7mIQj3`?ONO`cgm+sH!hnWc2*ZXeJJFggT=d{_m@P)$`O!wduIwc
zI5{RAszb=IbIn!>bcqZ5haw6gPyg0nBQqpFup01k3vun<S~ewlWJw+_zm7+=g=g})
zDEbdpy%5>lr_PFL<;^AlVxreeU#!QwV0vV|*wN~(0)7P%xU75KO&yY(f)%5{+l!Wb
zcvx9hqY`_OH`ABQ^P=i6=cOwJuM^(A*I{+~XOg-T=ZZg6Viywmgd8VnL=^w`!=e8Q
z)iQC_GFSN^r{fGL`@d2Mf3H{`Yg#rs0#ituqiqrEqBwf(uFF6lE$oavwpc@DGMA^@
zME|<RU%qo_KYz*Jl>pu07#hxYV>|v6%r~}S`BS2EVXJ+2bIVSS)ZM6-=6ZYQ3G6KA
z6aGKidPYd#ywJgvK#^v;mg9nFCgE$m`xk|p@g9&FXC#3nmB^Z=;MOEzv7C#e?P#6a
z5&67Bn|@dB#PJL}32<%FJQY}X!~JlQgUPG2TfRBi4By6W4AN5FIiI+pw0P8M$#di#
z26eU{(qDYvVbVN+GQyaFuS{$#l%`0sAz%^qs+sxCwdu!y+oq)X&y$V}Yc;&Gs5uwn
z{xr5kE5xx{Cx0Qt41yLu+@NQ<A%5&XCq!{Nuh(CJSSZL4Nwl8Tv8q0I;_WB?3-$|b
z4k6AfwHphG8MMrwL)#&{rC~DjaWHaR(dl5$XdI72!OYEslc;g-a^3k|8duB#OUKcr
zq$Cd@+;4j7P$gHiHqgaO`I$5RPV(rmMdy1p{;A|k{?t#RFuAfW|LI|wlfVWn_?y&I
z)CZ#*g=>Zt6-9gj)5D8vk?$(bk8~g;FL_w5Whfx68xWQAd{b_FpmUdO4(inJwpd+k
z!jg{tVW#8B96!)iSItyHy2&d`^#OT8PP*V;kk3*A;FJ0G5m=O+RYm5GgV?x#;~wDM
zpEvhuTc`wtKr(k{)CfmVT{5T3XOo@0WRDBaZXSAZ;Bf;xRWv(Zm5fQ}tiLfqGTZ5o
z-Dd%%zpXgBZY3CYap4z4BC#5+C@>^LTH_#8qkA0;=~hI)g)Ge=^XR6L$;pR4yLBur
z5jN0(HpY9qbpyW@Qu7W_I8*+Cs-@dV7j%t$4fl>FNDom*p+T7N<<II%7RP!@!j?mr
z*)RvU-N8C^#+c9KQa|{6BwhI-^%L3Mypu8eYUuL!=Tu!<(rcku<hGoGtgPkUA&Dci
zH#O`v1^(1t&Nw`M?^HPbzo-vUl4#6<%BwDe`}pv#xt*&&tQBjksw|^lVWS$d+XB)k
zK%;bd#;W6-i)M+VFqbsqd(n@<$~BzOoIz;iZL-%+SL6;bgsInR!zl%x-7ZZQRkC4r
zYa>b!2rYNoMQ1bI(SD@D!_G*kQ_iGy=};7M^f`RC#khYPGZi_T$UO9}GgCA{clIg5
zB7rhgH?W=$e*IY9?`h4~sQt2!L5f>dT2lClmaEjp&sA=y8Q?h*CzmVv(7R$Rt&w>M
zyWuXi@o@dVt_<9$NWwjG!fT<dD~Q>M@t?GQ{P|IFg*^h7>&L0z$K{|#Gx!H^r+NTe
ze{D$QwvKO^jBAC4&(ys$&gN62HNB6k8x&uV0aY0&KQ(99yYBCr9tj`q=ICb?Oh8km
zc-;NVd(YOmg!9C?OE@ImZmz$^L!cY10RBKg9{$8;-hTXt&5agdSNf&$D(&t<h>l(M
zl9AxHW2Qr;?fR%*Oc}{;bW=5^Dfd<xX;3lb%Jc7SKiaE<W`}wPG@1291MXVYJnwJR
z%2XQ()>x!v9EI#CZ7_3yB^#eZ^4<1Ee235!(tsMQfP^=#ZbH-;c%69fjo8gp7pQ+D
z&K)e`VV#!4Xu;Of1C&4<$YM@)@}W0;EoZ(kBh&Y4mV(A`kupX^->&sW_c5E4V<ewJ
zYXP;!qFGlbH`z|LPle!R1`R8Tio~>8%#y@v(UY^q^xgdha~jFN!NXpqNl?eanN~a-
zncf>}?$heKXlF{`A+O_nFYGlhX)}%$6jd_e`vFUKz57NzoPCXroBPsv`llk1A}-mR
z-pVoAvC0nuT@NVou&_{Q_SUXT_*d0;(HTk!X;{z4&Z&KtSew+ou?_D?a%Ia!5&88;
z&j)3AR5HV4B9waY-bS8Gk89AwoQYW_j&Aj;WQem?dT-;u7vduLL*<&l?qg)e(N-;i
z+YH9!esQ#KTF%xnhNdJ;2Nru^#ntFkxOWR+{O$k+Gl!W<pSb_hC?s@LzvsgYD0-OR
zd{=;ua-wqnFn<$zrfvg-KCZKnRPLo`J-A4Pf0J`qHnB{QgR@!2^sPf*+uv;0xrwwj
z>Ff*c{4HI>cKQG2z&2JWqTAGu-JFnXrFmWWAJ5fBoN>!zGj@{X<iTxk`RIVOpCFSr
zH|sEdjWPQ{XMd}w$Ouqr8X)_t<@#dK7?2nKlyx1OPc_TdLo;TK=u&cPP1xEAfWz{l
zdYs>);=NhrAIPYtd?`<Pzb1Qh*?iZBhXo0?Yi~^@^a67fa?XIfG|D2<0HAznD`Umj
z)kl64VZt&fIkHUeO=H?%5#nKDXK#9rje*PJ-q4I6+(2xQ%6m8_Gy#Nu2?(+{=&43Y
zN*MLBcdN$c6qy7>t8eP?pdbHZWw~kT&cTA@-=Vqi&TFio75A{sA0K8?j{2K-0)vqq
z$YoMZG+rjxDv--b{>AD7fUtZXgQC@HVHS5P=KlomR;Q_v;L-Olr-!`G%-&4!4L#TM
z>_OJy*W1lYdxGmh=ZmHX^+M{=z-Z~u&zfYL{SDRCB`cj)M2l?OZq6!mU@zMcgNm!!
z+s-t`;MQuF4Bvml<Hpl&tUp(x<Jx50PS~Y}ETw0D8?(+lO0Q2sG)%?1l2PLD_AdjW
z`5wD9oki*bX&F={&)JWPJx?o^Y9{vDB1TFX=W8=?^M4{m{^|1S@8Mu;>HfV0bp8cv
ztBF=>D(aWC0H-#ALyKo%Dew+ovUznz1?qmPucJ{wJo5re-}@v1cS**kyuVSs{UINi
z&gRH(<`mxgp0wrk3c4>%;%UZFL^s`AT_7?@BoPsiKEH4-J8^6i(vF!AI<HF5(*f10
z22c>UTz|LPt<($g!I-`M<^q-#O-P4W(fq1rxh6}kTx|2G&I8twd`c~2T8pI_oCSgT
zoI>!OTexNc2$|?z{`v#>GXCE4clVLEM1+iNAl7X&of2P@lgGAg<6-{Ym~PJZTPkv2
z*V-oRhGQ8y-Fx>?j<cKji6Bb+8nRy`Oymr~-l)HRxGo>GubH<%%FuZugB6gcCAgjZ
zbuIC1j6v<~)Wr&OdA%ebTOcoLOM~ik0R-Z6N5qFgA8p`B>a~_3<C#C18<gjJV4mG_
z-1VQfM6DFPy7Jpf=2a^`8&T7=XxHnlmwB}RQT7NvUPG6TzyHL3+3t&Q11mBNy_}IS
zhzA3sDu#cbsmU6(LA_Hx<`hxjkzA!=k3eRh<R?41-W+GNBvWgP^`I%01Fi?y?#}|~
z>CC-^evS9%ud~1E`Qp$ViVpw~_avd@9bYGQBk`4Sl24Ikr+yWy0`Tg1Oe0$2`rXPf
zC*98(5bZ5e8Dl<!e<aJ@E&KHH%3OyBxwz&xZH#^ACts_{d5BAQSKTc&SRDG>U`a;2
znT_E5_I^zJGJb{X`@tadD68n4o9*T-@|B;fMmsFq{iKcO{}})tb?odD6&sSO(Y+#H
zk-`t^WnDq>FyjQq^|Db$NB7gQUyGVQ+b*!y@^h@2=xQD;(a{JyWRvO!@y-N0P_PU;
zk@<_`{qHNxWgWTrEoSUHExx90*J#Sz(e$3YJs0Kh6vJJ==`EL#1&fU=R`D{*<FBf=
z@5nD6SY1zhCt;H1aRfc;nR4Jf>@E_DokKe8Y5gF(YIHxVKwX4Wc6@Jio}IFbSD%g0
zkEdlEMK__!X+|CR-c`-s_nyHbUvg%j99X*1^K{H7E#vU}3Pz4cgP+Loy2~|cJjiqM
z=V47*Wx1w-z>ZDmjD2>}D^@Y%a!*>!`XwBIB)VD-S(4$Vc@Vlr>FI-;-fO$>;H#a<
zPG>_nCElLAj(iQXA}`g0I9i)n&J#7f)>c*@12mrbKFHJ&@QYr^IQo#`Ztf2McN`<k
zm&%qjc_*g<Rmc%TnbQ2p7(l-uYVlTOWNNc!k~=mm*On!aYG{*_;8Z;1LwJBD4?Oo}
zcL*K3#*5{3Jy8EQs1R7$X{?|Bn{2`VFop3?1gRW~BQ73C%!hvRV-s{lukvC~kq-ND
zvS0!l(EO{tD#|vt<6^U%*bdkGtir<zLSY>=oM+<f|LO1stI_C=$;r1_<30Y{-xdG2
zOvL{sDDlA4-{FwIuk`;*kWv5kTJZn$dABNRXplqpc7x~?*g#F$2rv05lY2rle8LWV
zuBS|~G;xnc4{ZbLQ_JCljvXGQj@U_KmTL|79Pj2h)6EG!en<X)P{{qJY5x~ndiV*J
zbvqcGJC~QOr4VRLFRg-z#C5P;h28F9pi=`*t|m}DfB*3GaMT6x-U82^;>`yCr3EI7
zk_=-DUwRDmcBa8V?-sJbPw`|WEIqauiYuOXLbi9}!2`z9p0U!SqQP56@ut`J#`z{-
z{AT^DS;Yb!!~@O&8#!XZI9t0(thYIM%?+3WQB4F8BAVWY0=8-Fs0}z$UoeX#-2fNM
ze+!sZihj;dv_8+Jx!p+YC(C;|p1Q(3N`%t!C+zRRj9*5NX=m1;#xcyC{%jynUYED8
zx8y!6n&AZo^o^d5faPTA{ZKSoJ2A_;Zxg3q_QPI{-De@zT37810L=MkRF2#e=oQPT
z^^$VLrN?63ckTo&1Bn-KP6Y$mb?QPpMzXJcu=3=2TIh>7$bR$+7dESjG=57*lv#$`
z0p_xk7jzsFaM^6(%AI4Vou%4b$2(M+))1iKdw6dYq(YU@efY-qf!wGGP@$)9u^B2&
z3nf|af+WWpzYeODV}a)6)nv(?^MojereH`YRA65RWJO*k@<oFcxv;jP;^eX$&f;c?
zTQ(&0K<Vv$V6qWn*n-3kj6Sfo4=ZQMvq^hbz~_zglwKi|53_v%-x%~>0I(@R0(OD6
zDYq}w36RXkQr$88Y~mD_;`=)g9<?58tQ$CXdzuCa;}DR{w4AAqd?`IezQ^eu2;P<!
zMv#T1Ck*0cm2Z0r4X5U=pjg1i3LrCu28#M5#5VP(wBWKJkIHnHbd>l)xf>8$j@;~Q
z%5caeGH)nKs3$7BFRtCh3?_qY>onWe{t|P5jf(kt=_=sFc|K^@v{$A4Xb1OM{Z4&)
zkGTUjqM2UiZY!C0*XCN)a-YCU`xd0JjV*D8ulu!HVrIOdIKymWPXASHEze(8i8ZcO
z_Rg#l?cpEcdIAEIg{uxK%4=LkNi)D-#1BmK^8~@wPjhB(31^>ZW50ve5;*E4TW4Vd
z*1Dx%ub4=i{Sq^*C%ZFVl8>4eJYWm=>98yQ&`_Tk3q)UEL$!))DC=8jHmXE7LrSzr
zFwO1xt!mXIh3KcK@jqJr`>?>E9|aC8xgmB#8gCtuB}1pTLQn>0b!8rygR57KmFL^j
zKn{#SCD{gZ{cPim=i6N6<zfwfLelli+KbHt8u@Tb9OEFuzM%iy{u^xJBa_%sS92ke
z;M-r+rv-?Gj^phI2WJx7YIP3~y#~(baHS}V(#$(+y{lr;_MKc<%e)I&1w%%y2<_3s
zNC#`&Jku|&pvG|_agOZ6)UIVh0>Vt9(dYX**b**Ysx>J<IC{;Z=Ol*QnT9rW3TwhY
zz?V4^<Rif*48T(L0Bqy)Ilj6YUEU1W7Bpeh(#;3-+(wtmGk4BXqZn+D`g@*%G>yNJ
z_I7r%kska78>)M(-uBOu<-p4ghm<ot&lzsx*#cX&y+HN!QM1H=*_K3NP-SIh`6!JT
zO>F-kIu2v=Au%a=yG6fV6}gsJK>wOtX=J@Xx4tXDI?V+I<ER=Kf_9I`?$xw{<Rr2`
z!o+yQZ?v7hEyY~#z}A);K)sAe{-3UXSsHL9lgM~c+4zkYRVSr7C8gh+N8I0swE<u5
zx4$$+jSBXubdX`ECU$?@m<rN93LcBJ_)XZAS6%IvgpLaH)+%4^E|h7V8(1C-6kKj+
zc-Wh#e)mpcaU?BQh1P?F4i9aYv`?n9YY(gL?cj{r-Si>7xb6ZodO+5%!o7y8P0@rN
zWRWIfw`pI8i1Cch_q=V1^v-c~4#UI}^FgESes?fQLlGb4B&_{IzpF>Syht?Ch}N#4
z5rmWPPm604Qnne;1ie?W<x*a~uJ>+-%XiSHKdtcnTz~aJ6oIsr<30I$OP%_;{q~Nj
zedD*Koaut8$hfBuvZ7SWDwHpF57MsAPc9)gR-A^bO2NOjD&kQ51py0_PIUvg{^<&a
z&tTP9;@D4bECsjZNSjn9+q^&#>x$?d-L@sI#$%s^4<TUso`Kf+Xy5~FdO)2`)exQ3
zEM__;qivxNW^){kLo@aJ`~*n~)*jkVvF+5`<g13>$IGZVCfxqq@!53@s%&fi4ehFA
zrkOy`CFHW?nRRO6fFi~bip}KJ){1IZOCyMiWZz25-glsqy)ITa(^bBO&1x7GR4MI^
z&+pr0l)9fQd0rnFOsnz~?=EQH^40LS@8AAL;=y&w$L{684kW327f+qM-qndV+y<K4
zz86I+yX-Z{=vh~Znl80F5-H3%vcn&@2f)yz6jjD82(HtMmA)~<8}-I|S~UeM7)r^y
zp9rEySD7oJ4Pm0K*fqni$3!C^tXwfznf&1~7JDnx55L%WPDM@)C~SlHWk0tEn;X*b
z@kf2ugorh@*Rr(PtW&lT?#+7|=~&sNit|0#_p9CY707D$3?w?!JZMU<VCgJ&Fy@n;
zb&V*}=w2r8Ti=!*37$sk3enZZaJE86VAAzdsnO8<7-K7AGgObW)qkl%UjEev#kRoJ
zFmw;c*b!etg~q~UXEX@Wn{D^p!jML^D~=k4@)}p^Up+L&vJD}!^kRLFrTmx_sdeg4
z9c@_qV5o2EmG|==Sr8gTmv#HIgX4FR#~7Q$%kP-sJ_&W%&x7P=X2kH6cur6Xkz6HQ
zgn?xD`GYq*-(n%lhp=$%$RrVv$Zg-fl)e<tdhowg`UaUB(eU>P1D21m6W-g7&u5C5
z*c>gjILj<z#8<oEGd-RC>X>KSrQJpmgFXn<kyVB{JKVN_PM+f2)tvmH&eM%-9-JsJ
zb8aWAXKK*v7wV_m`zkDUuhO?SK?EP$*e5m9-K)qNYnm`g_9vs)t2L~Eaxa~#q*sF>
zGRKN}eh1*Pl<Y2-&B{(j!Jb!<Vr&g!LDOEghD)p9_uP^2p4+YHaA?mV!B<Tp%VXcO
z)JIimrO70fSm^M$D{{lkGrRb+KB1&l6Www?GDx217gw^n>vKm557lo}mwEfKkMFPf
zOnpgSg~^?h>Eg!(q-Cq?#J-=lEl3V(s%PNR<#~M^9)<$bB<}^U@UMIY8gFYA3a9;0
z1w-{~8nu33Dyur1D>VzC>nyoetL8$Z1+%h8`zVvuA~qKU$!O8%UwI+yXa#Df`9mgt
zXlZ*xz3Er)Q`h4PP$Rq-Zxv8bI>WNfwgpZO)h5Kz2578x0qrF;e513Eb{)9eO|-$*
zJcH4tud<Rw+fC`@qU{g9HaeNKq0cnfdNTWq6q)Uqz~6TJB2oy5(F5jww~*@ZEMFX0
z*y(F9!ikbfv_hskf>18^1icJ7yuY%~B=nMaCs})XWK$0!fFevyO-;bzGFJ2ppc+5&
z;uat(a2;{6Sg=ct56xvAz%NSK=w@{(zl*Sk@1m~Teyq1&Sk9~9hG(J%RyWz7j&}Q+
zx!yJ=dKv7hKd5{la<^&Sa6d)(l$=TGMFRrSI=J*(ostU3&7Kf$2d|d&1AeH0x!&CG
z;GJ>C8TkD2xAxX_&<-Fv!BtpA0X}3n?uTE1{{1y)_XDUfU+(vEYVQZIoaKZd#-q8A
z?UEhkj$avXBQA0B17>a-yM+k_NJwtB^mTK=OUcGBr{@#fY_2Cjb;kcPO#}R%<LO(f
z24K`}GjvS%Sea0{11k=MFnhI!d!rBhH2Hq8FqLn2$3ei#M<Wixxh9dpD95dl^}u<%
zpFm$^85(R*_H3Px=EC&LtSogy9foX(2aW{R<IBs-O?NkvJwLhc(*i8k8ZwkMtlpNL
zf@xSuJU2EL`=|csI<Ntsc6!%<qz&mtVCa66l5--Kq0S%%;Ww7FBl?Nuq0&Q$DF;6<
zn@QKVskInKC3c#&tq6(?(K!yOeDm$=4&(IU>JKeANQmA?<)iP?{~cub^-!ssFJsLz
zE}w#blD$&etR$^{)%p>XmQ!VK=~H&{T8x16A-}wq*lHz4z3qg-jRkCHjTf#|kF6%|
z^x|-rI!0jo-D*bxXZ^LpD)H`(P0fNB>eJ>y<VUxcSgv@>i)G4f{R;2hUnS>-i&vxH
z_S>YZ2B2OtzJTX-CVxGzRH)u6klv+}^d}8#2AL1&D36UBS5|^%KS=Z>BYlsf32mfL
zWBBwaM6~Cp`G8j)YGZ^jovHoE_e|<Tn87M45-RapNw>KkgL~j20gHI_BA4J)WS%<q
z0dbSD#wb$1Z0?t$0(o6E5I{dXewV)bpGiD==hgs@Zzw=IRqm(=bES30qS`pn-KO2_
zY^M6Wy}V0oaUU(R)N{rysJ>U`DMF>@8|WKuQ7EfK{7KT7sp-mp4aKNk0vq?;T^Pi{
z#5GQV)B0=iCiHIde$)Uud)6|%MCczbRarzQCqiG~v%q;NVax5jekTRvS#w%}Yf|Y*
z6@Geq?X%UWj)SKT^g=F3m3Mq;;^pNjWPgj#4{YEzeW&Bs1@9-f$t{j*SR^{cyC@)3
zs{-l?w+}@Royhh@r%wt^<CnVZSG1VgDp@s`zWWQqn5qq|p`mhD6HZBazo<L;Z8H|5
ze<zV%Y@U4G_3Xl~o;mX2aOH*M)XIj2z0#88q6S|<&lelUg$;2X`RiSZv7(CqfU*E_
zK#JLh%v~q&rkfDeCrTa&-xcPCKk^j_&2yUa>X<3zMc3I&SU&6b)GZw7I$|>LDOJ1g
zF~I>jFpYCR5#HD5<v?&w`m{}HLKD0Nk&~ZZC^nZN^965kO<K6VFyXblqSrduZE-f~
zn@kr&C1&x_&CtC5(mORCA%;KFy_!dC{54BrXakvp+YhfSiS~$knWzQ&PC9o3Xf`a;
z3G3wP_dPYx!8=Fip3nnzQ7!voze^uKk(vU>pNnbF?xJHhygftG6We6sa_^p>gdRh<
z*+dr$gAI_dPYb`jIeAgo15(zjw=y4VENyXmr^^csJQn%`#Tp>*=yHCrF!JK&tW3#h
z^YZp))RS?}UjdG_uyK@S+kQ@D&~Eq*Zib!5DN|PmV1oLNr5t`@>)YBaE@|<0#aB5~
z<&A`DsZQ*})};3}wA-aqAC-3T({6RyVPHvGZe~{u-Hws|$HJj||GUM9r~P~w6E3dR
zklNSlb^P%dF>S5hS>QK&R%LJ_G=o2gOZOf2m82G9%gl*%F<abrA^}TMb!iwJsZuR9
z2{5NcbybBm4kE9&%aqoy2&G45zmJ;_-1_nTF!L8kW@!dO##!^wEVm{<p3NF(b3gfZ
z+5-92=8{*K(6%^ns6}#<i4$_qs!@}-Bzoaw@AZryob!2C;nXuXEBR#bhWSmbsCnr9
zGCTaq=8XU~3-m?<2qcjHVHGd`8$mM|QC}!cHkPD(%+sPyDI6S<VlRaf%y12HyD>aY
zItTWtf-_(Ny8O-ey=;!Tu0nHs$lk_{`$J=jad{AGoY2x0eS%76nUDKf8~8V^By&ls
ze8=XTW}!4*vXgs<@u?r~sQ>!m4)&||0mXC4>pK+Bv(raan)|css>TLFsX@|47TWzO
zbp<wp1-tLGnx`0f;z+q)15g#Z`c}o@_N3gBsL7KukfIS^i?-vEXHf=pSNEd~u&V$?
zbshg}rJjG@X0~vOQfjS>+2tQD9@tl3_I3)E!nv&gMHfg5-9M+@-q{~euj{5I=V{to
zLQ9W8oDQui8ad}^!rZUNmr|k%Pyl~qIL2WT`z{@Udc~|f-P_c?*XSq#)lo!XzUG?<
zHVf2+y?sXC(WpODVY`3**^2esjT7#hZ$nZlwA-oE-+A3=XL@|`fnIr{jw^o|#<+W=
zjcr^@6Bko!G|U8ho#djH_DRYMyX!nVn~=ULP@KCspWI+5_T1SKE|`H6o^5XUE)EZE
z%%yZWpL7}jw=Qcp^O?`YeBP@+uR2zCOWNK9Gc8ES9+BJ&(J<DbT5#JI_`<b=<B*_W
zoo_xX{<~luS1FmjDIaH}JD-4FS!gE&y<kAIFrxb8K@;Y3QjJS+jFaoVjPL(o9?<LE
z$Cd5u8__kP1Unc-eCPg{tI+eWvDOL3?d_FS&7QMm8oH#{gLU2ojJ8h{ptr7|CB>as
z@T0G{Q3~eNCfobA=0~=mL!aB*%AW7bB_cJvnZij!&dvU}?UOeL$G!b?sNdpK)X!SH
zFdxyQjez|~7M{atDf%v-Pr@@zja;(8FhCX-^EU71AxY0#9OHjCJ)^7+<-v|pfJ1cp
zzcu^P#)QIwHgqf9Z`+7=aWd|GSHZG+O!^qT8x-zK_2PvFOc51Plz_8cNB1e(q<ch{
z4A^+9Z&2lYb(Xfhd;@=sI-<`9bXI-#1Htk(qbrQ-5LuLYivF#R11S23y9>d8CU+Ma
zj2j*|fK5(shB`E&RUNbW-9__fu6(W1YW+*iZoyC8LH)4#F8)+Nhx^><{r!T;6MdHl
zx26kXG{PJ{Q$ANU?}c=c=QR6*H&=Y^8@uQmQuMuh%&cAQ7a9$@6VJC03h7q%(iF9M
zU^6Co#}q+zJN0CW1S4^2tEbY7f^{WpP442$?mp<Hsuk_rfFuvzT+Yv^wZBH`Fm#9Z
zfBG}yaI1D~!a8hRo*FDaOyAf}?J|NWKrkcFvvJ<QruMhr13o|FqIzGQ^M%YIK8RV3
z+h`FgFKGQ~jUr)Clex;g?q-me|Bxnz_g+-%<xQ@s3M_T&O4CNmkM-ZwF(JGIXimGT
zh}oMzLBkID>_A*gDxLGil{wF?o$-c@=25+3l%z}g@o$-BzIWPhx@OPhI<NZnye4lK
zzHHWZ*T%gqWn9S8ZMTcd9|^{E>p|cf<dbnuj9z>%Z)~fgoJE}6oDoOS+Pg}-rK4Jp
z9(FDtUVfnQF!-<QWo6Y2+qfHZmrrXvuU7CFM>5OdFVPavND&N!R(h9b(!1flEC)*1
zQn;$_^p{dv$1@}trRALu@*VWgR!8EXb6JUoaO>_`rGd0$<;zEAKa|H&^O;t)opqT<
zlZV=FG)Z2<8D?<`tFrPzTpB$rW#4pg!t5^}ng3W<e%)dG&qE&u!U0U}1{0*Z-wX&H
zR4??pdL+R~FxIj3mF&3Ir_J+n^O0uf?(USFoB}n13D?-{>}>79t+yu)L@g24)Mv`g
zFWxjglk!A}viXk=b8h37a~nMEn=QV>slmH^vW1qWXG7tGegW2*Ual4pe`Q&oVMs*Y
zg!E<e>OYG`aE67E?r%18F4S`8!G|}7iW2(<_93)r&HmS(XFhvZ1w9P4g`g<<$kclF
zI#0n(KWqDp^@ZTv34I3Q9Hv2^mb7e;u{Vpo?2pnzNgzwj&X}-ls7y=u9grN{Cf|El
z_YNi^SmbWcv}8gVE+v?!N69?c*l3dEH*YHf`D&4qd-6{si3N9UE8Jgde^hZN$<H?c
zd1pCy_N&1dj+nd&!S<m$kQdCrwCob!kzJ7FnYcbrbdv5WgVEj^zi94v2-&NbT~gti
zk;|+VJcwVVd>Z&CB2<o~a3v`=;k$`QrbRHuI>GleTMgJ)2OD7@zP5|iH@f|#L9pb#
zVd9?ktt%`a<0ggGVhLeXUw4|0D#>1wl)h%!W)u^72EJEyj&l4(i2-g^JS#iPdX2#m
zE<XzLLqWWl!o1b&z%&q>ab=|EH*r<0HCcnCdX*SY=%>y><OfIh1qU5PhgG!qcw(;b
z88#|l4J<zBlc_kQo3JHD66Yz!(&DcG)KO@k=GoM2?VbhkILq}(i+IS?pzy6*9?mT1
z+ap;A0W)!htZXHMst((npw>!;n`EDUJy8m=N`bMJ_6LPsY4Kc?UV?meAFr}D&bVq8
z8eF^(dDAV8&t716n}kOLPv=2!?>93ZDAONT5542g$I+s2aOu41D~|YRhY=;&Q=n7i
zD2|3N{+wt9>0ThVpluh7wL4#iOc&9Jbgs#*qu^<>eDP57IJfDn&@YQ^nA%cT5w0&O
zZ}jlqq1%lJ{Jh=@I~(G}oRjpFlD_KLfgm`JZ&;><6oB(73UJ>xw2{xVuHjC1c+#Mh
z9-QK`*q3(~oC!H|n9Y6_>=N{R)CBMM6T~M~_fOm41Vh2C_^1VUV$I^qswkB3f86=w
zzjJ^DT-FdO>Ep8Ir9dIEa>X3iT*i%!2!(e~;@*HD!s2{=wUg#z5U7A-NRF(pb#l?c
zaJ_;yv!p#1O$gqgrMGq3!EU80fek#LdWH%Hn{w|jddpF=!Z&FVga18L5%9#pixpve
z4#y;HJHNCEUN`43>NwMussmU2jN3@Jdt}&(67L>m|0yR$BkDs_2zzT~LeymVvQO$w
zkPw4u7+MHD_oqzYkP$4KXXr>v()z+Rrsb%R<6)o0)UYOvCLSRQ<-V!RE|izBr@w+6
zfAN_FNUa4B4QyE+{TixVvmr`DZ3bsq^h~!Wo=DJP&YR?}NKvu#iPr{0WlyDbM!VMW
zjv_DJpiKzHn9nCgZXqxD9=@L@Rbr_UfO(c|g{!ol+Dh&QTZq8VxR&*YjfeNf+&6Me
z+KbX(T`NVxutSi69a;p|r26YcxEhL`?Mv#s<89K*o|w6Zz;XB2)cSnY63uM!aMGyc
zE$e{bCO5saU*%T<y4C)eW#^tf06zcW*9<vj5JfPhx46tAk|Iz(vbEG*0{7u!zGjBm
z58h*RB*}a}&Z5iSoU<Gky=0jLYex@@EIR*h2V(ur{}WO7-=W6;=a}LDtEG?IV&SnN
zwfT{b2k-iozZq<izU<&*|5`o!NvXT=@0d%F)o|<I2)zGS=glMS?6ZINV{$ZQ^cu1*
OR_~7S?b4g~BmM{GDR^=K

literal 0
HcmV?d00001


From 1f7d25607c2ca4001331a32aa6b178cdaf51c800 Mon Sep 17 00:00:00 2001
From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com>
Date: Fri, 5 May 2023 15:52:18 +0200
Subject: [PATCH 48/61] HIP update design.rst

---
 doc/source/design.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/design.rst b/doc/source/design.rst
index 614ac608d..39b13c3f1 100644
--- a/doc/source/design.rst
+++ b/doc/source/design.rst
@@ -12,7 +12,7 @@ The Kernel Tuner is designed to be extensible and support
 different search and execution strategies. The current architecture of 
 the Kernel Tuner can be seen as:
 
-.. image:: architecture_0.4.3.png
+.. image:: architecture.png
    :width: 500pt
 
 At the top we have the kernel code and the Python script that tunes it, 
@@ -48,7 +48,7 @@ building blocks for implementing runners.
 The observers are explained in :ref:`observers`.
 
 At the bottom, the backends are shown. 
-PyCUDA, CuPy, cuda-python and PyOpenCL are for tuning either CUDA or OpenCL kernels.
+PyCUDA, CuPy, cuda-python, PyOpenCL and PyHIP are for tuning either CUDA, OpenCL, or HIP kernels.
 The C 
 Functions implementation can actually call any compiler, typically NVCC 
 or GCC is used. There is limited support for tuning Fortran kernels. 

From d71b11b14414b680ee671b4ffcb5c22f27b833ea Mon Sep 17 00:00:00 2001
From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com>
Date: Fri, 5 May 2023 16:01:59 +0200
Subject: [PATCH 49/61] HIP update core.py DeviceInterface doc

---
 kernel_tuner/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 90c9bb4fa..0c494179f 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -213,7 +213,7 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No
         :type device: int
 
         :param lang: Specifies the language used for GPU kernels.
-            Currently supported: "CUDA", "OpenCL", or "C"
+            Currently supported: "CUDA", "OpenCL", "HIP" or "C"
         :type lang: string
 
         :param compiler_options: The compiler options to use when compiling kernels for this device.

From ee0c4d52a06f693cdd554ce467a789632478da2e Mon Sep 17 00:00:00 2001
From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com>
Date: Fri, 5 May 2023 16:03:17 +0200
Subject: [PATCH 50/61] HIP update design.rst

---
 doc/source/design.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/source/design.rst b/doc/source/design.rst
index 39b13c3f1..4ca515e26 100644
--- a/doc/source/design.rst
+++ b/doc/source/design.rst
@@ -128,6 +128,12 @@ kernel_tuner.backends.c.CFunctions
     :special-members: __init__
     :members:
 
+kernel_tuner.backends.hip.HipFunctions
+~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: kernel_tuner.backends.hip.HipFunctions
+    :special-members: __init__
+    :members:
+
 
 Util Functions
 --------------

From 1220cf5a898e5818b2e38c0b65e080181e1db097 Mon Sep 17 00:00:00 2001
From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com>
Date: Fri, 5 May 2023 16:12:46 +0200
Subject: [PATCH 51/61] HIP update README.rst

---
 README.rst | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 08e641061..2f8e8bcc0 100644
--- a/README.rst
+++ b/README.rst
@@ -28,9 +28,14 @@ To tune OpenCL kernels:
 - First, make sure you have an OpenCL compiler for your intended OpenCL platform
 - Then type: ``pip install kernel_tuner[opencl]``
 
-Or both:
+To tune HIP kernels:
 
-- ``pip install kernel_tuner[cuda,opencl]``
+- First, make sure you have an HIP runtime and compiler installed
+- Then type: ``pip install kernel_tuner[hip]``
+
+Or all:
+
+- ``pip install kernel_tuner[cuda,opencl,hip]``
 
 More information about how to install Kernel Tuner and its
 dependencies can be found in the `installation guide 

From f8b58a100cb8e85a4d6f67a141e3ea4514a3db8e Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Tue, 9 May 2023 13:44:46 +0200
Subject: [PATCH 52/61] updates for running benchmarks with HIP on AMD

---
 kernel_tuner/backends/hip.py | 39 ++++++++++-------
 kernel_tuner/core.py         | 82 ++++++++++++++++++------------------
 test/test_c_functions.py     |  2 +-
 test/test_hip_functions.py   |  2 +-
 4 files changed, 67 insertions(+), 58 deletions(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 4e114fa23..bbb0f1083 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -58,7 +58,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.name = self.hipProps._name.decode('utf-8')
         self.max_threads = self.hipProps.maxThreadsPerBlock
         self.device = device
-        self.compiler_options = compiler_options
+        self.compiler_options = compiler_options or []
         self.iterations = iterations
 
         env = dict()
@@ -122,7 +122,7 @@ def ready_argument_list(self, arguments):
         class ArgListStructure(ctypes.Structure):
             _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)]
             def __getitem__(self, key):
-                return self._fields_[key]
+                return getattr(self, self._fields_[key][0])
         
         return ArgListStructure(*ctype_args)
             
@@ -146,19 +146,28 @@ def compile(self, kernel_instance):
             kernel_string = 'extern "C" {\n' + kernel_string + "\n}"
         kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], [])
         
-        #Compile based on device (Not yet tested for non-AMD devices)
-        plat = hip.hipGetPlatformName()
-        if plat == "amd":
-            hiprtc.hiprtcCompileProgram(
-                kernel_ptr, [f'--offload-arch={self.hipProps.gcnArchName}'])
-        else:
-            hiprtc.hiprtcCompileProgram(kernel_ptr, [])
-        
-        #Get module and kernel from compiled kernel string
-        code = hiprtc.hiprtcGetCode(kernel_ptr)
-        module = hip.hipModuleLoadData(code)
-        self.current_module = module
-        kernel = hip.hipModuleGetFunction(module, kernel_name)
+        try:
+            #Compile based on device (Not yet tested for non-AMD devices)
+            plat = hip.hipGetPlatformName()
+            if plat == "amd":
+                options_list = [f'--offload-arch={self.hipProps.gcnArchName}']
+                options_list.extend(self.compiler_options)
+                hiprtc.hiprtcCompileProgram(kernel_ptr, options_list)
+            else:
+                options_list = []
+                options_list.extend(self.compiler_options)
+                hiprtc.hiprtcCompileProgram(kernel_ptr, options_list)
+            
+            #Get module and kernel from compiled kernel string
+            code = hiprtc.hiprtcGetCode(kernel_ptr)
+            module = hip.hipModuleLoadData(code)
+            self.current_module = module
+            kernel = hip.hipModuleGetFunction(module, kernel_name)
+
+        except Exception as e:
+            log = hiprtc.hiprtcGetProgramLog(kernel_ptr)
+            print(log)
+            raise e
         
         return kernel
     
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 0c494179f..1eeeb0d8d 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -425,48 +425,48 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
 
         instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose)
         if isinstance(instance, util.ErrorConfig):
-            return instance
-
-        try:
-            # compile the kernel
-            start_compilation = time.perf_counter()
-            func = self.compile_kernel(instance, verbose)
-            if not func:
-                result[to.objective] = util.CompilationFailedConfig()
-            else:
-                # add shared memory arguments to compiled module
-                if kernel_options.smem_args is not None:
-                    self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params))
-                # add constant memory arguments to compiled module
-                if kernel_options.cmem_args is not None:
-                    self.dev.copy_constant_memory_args(kernel_options.cmem_args)
-                # add texture memory arguments to compiled module
-                if kernel_options.texmem_args is not None:
-                    self.dev.copy_texture_memory_args(kernel_options.texmem_args)
-
-            # stop compilation stopwatch and convert to miliseconds
-            last_compilation_time = 1000 * (time.perf_counter() - start_compilation)
-
-            # test kernel for correctness
-            if func and (to.answer or to.verify):
-                start_verification = time.perf_counter()
-                self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose)
-                last_verification_time = 1000 * (time.perf_counter() - start_verification)
-
-            # benchmark
-            if func:
-                start_benchmark = time.perf_counter()
-                result.update(self.benchmark(func, gpu_args, instance, verbose, to.objective))
-                last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
-
-        except Exception as e:
-            # dump kernel sources to temp file
-            temp_filenames = instance.prepare_temp_files_for_error_msg()
-            print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames))
-            raise e
+            result[to.objective] = util.InvalidConfig()
+        else:
+            try:
+                # compile the kernel
+                start_compilation = time.perf_counter()
+                func = self.compile_kernel(instance, verbose)
+                if not func:
+                    result[to.objective] = util.CompilationFailedConfig()
+                else:
+                    # add shared memory arguments to compiled module
+                    if kernel_options.smem_args is not None:
+                        self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params))
+                    # add constant memory arguments to compiled module
+                    if kernel_options.cmem_args is not None:
+                        self.dev.copy_constant_memory_args(kernel_options.cmem_args)
+                    # add texture memory arguments to compiled module
+                    if kernel_options.texmem_args is not None:
+                        self.dev.copy_texture_memory_args(kernel_options.texmem_args)
+
+                # stop compilation stopwatch and convert to miliseconds
+                last_compilation_time = 1000 * (time.perf_counter() - start_compilation)
+
+                # test kernel for correctness
+                if func and (to.answer or to.verify):
+                    start_verification = time.perf_counter()
+                    self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose)
+                    last_verification_time = 1000 * (time.perf_counter() - start_verification)
+
+                # benchmark
+                if func:
+                    start_benchmark = time.perf_counter()
+                    result.update(self.benchmark(func, gpu_args, instance, verbose, to.objective))
+                    last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
+
+            except Exception as e:
+                # dump kernel sources to temp file
+                temp_filenames = instance.prepare_temp_files_for_error_msg()
+                print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames))
+                raise e
 
-        #clean up any temporary files, if no error occured
-        instance.delete_temp_files()
+            #clean up any temporary files, if no error occured
+            instance.delete_temp_files()
 
         result['compile_time'] = last_compilation_time or 0
         result['verification_time'] = last_verification_time or 0
diff --git a/test/test_c_functions.py b/test/test_c_functions.py
index 6fb0c2b6d..224b66524 100644
--- a/test/test_c_functions.py
+++ b/test/test_c_functions.py
@@ -143,7 +143,7 @@ def test_compile(npct, subprocess):
     print(npct.mock_calls)
     print(f)
 
-    assert len(subprocess.mock_calls) == 6
+    assert len(subprocess.mock_calls) == 8
     assert npct.load_library.called == 1
 
     args, _ = npct.load_library.call_args_list[0]
diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py
index 87ab9dfe6..ce3eb0642 100644
--- a/test/test_hip_functions.py
+++ b/test/test_hip_functions.py
@@ -55,7 +55,7 @@ class ArgListStructure(ctypes.Structure):
                     ("field2", ctypes.POINTER(ctypes.c_float)),
                     ("field3", ctypes.c_bool)]
         def __getitem__(self, key):
-                return self._fields_[key]
+                return getattr(self, self._fields_[key][0])
 
     dev = kt_hip.HipFunctions(0)
     gpu_args = dev.ready_argument_list(arguments)

From 6c44bdadfd15091fb8e8bd72ae5ac256642c1b76 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Tue, 9 May 2023 14:11:27 +0200
Subject: [PATCH 53/61] update to test_compile

---
 test/test_c_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_c_functions.py b/test/test_c_functions.py
index 224b66524..6fb0c2b6d 100644
--- a/test/test_c_functions.py
+++ b/test/test_c_functions.py
@@ -143,7 +143,7 @@ def test_compile(npct, subprocess):
     print(npct.mock_calls)
     print(f)
 
-    assert len(subprocess.mock_calls) == 8
+    assert len(subprocess.mock_calls) == 6
     assert npct.load_library.called == 1
 
     args, _ = npct.load_library.call_args_list[0]

From c8b919848efa48d44bb37df8f4a6b90cf43bc0f5 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Tue, 9 May 2023 17:19:26 +0200
Subject: [PATCH 54/61] handeling loop unroll for HIP

---
 kernel_tuner/util.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index fd7ee3c37..0e3dee4d6 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -618,8 +618,8 @@ def prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, blo
         v = str(v)
         v = v.replace("\n", "\\\n")
 
-        if "loop_unroll_factor" in k and lang == "CUDA":
-            # this handles the special case that in CUDA
+        if "loop_unroll_factor" in k and lang in ("CUDA", "HIP"):
+            # this handles the special case that in CUDA/HIP
             # pragma unroll loop_unroll_factor, loop_unroll_factor should be a constant integer expression
             # in OpenCL this isn't the case and we can just insert "#define loop_unroll_factor N"
             # using 0 to disable specifying a loop unrolling factor for this loop

From 3f89fc3c79ffefa39f0f335655582dbe4a8d88b7 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Thu, 11 May 2023 08:58:19 +0200
Subject: [PATCH 55/61] added vector add in simulation mode to examples

---
 examples/hip/vector_add_simulation_mode.py | 49 ++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 examples/hip/vector_add_simulation_mode.py

diff --git a/examples/hip/vector_add_simulation_mode.py b/examples/hip/vector_add_simulation_mode.py
new file mode 100644
index 000000000..379a707de
--- /dev/null
+++ b/examples/hip/vector_add_simulation_mode.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+"""This is the minimal example from the README"""
+
+import numpy
+from kernel_tuner import tune_kernel
+from kernel_tuner.file_utils import store_output_file, store_metadata_file
+import logging
+from collections import OrderedDict
+import os
+
+def tune():
+
+    kernel_string = """
+    __global__ void vector_add(float *c, float *a, float *b, int n) {
+        int i = blockIdx.x * block_size_x + threadIdx.x;
+        if (i<n) {
+            c[i] = a[i] + b[i];
+        }
+    }
+    """
+
+    size = 10000000
+
+    a = numpy.random.randn(size).astype(numpy.float32)
+    b = numpy.random.randn(size).astype(numpy.float32)
+    c = numpy.zeros_like(b)
+    n = numpy.int32(size)
+
+    args = [c, a, b, n]
+
+    tune_params = OrderedDict()
+    tune_params["block_size_x"] = [128+64*i for i in range(15)]
+
+    filename = "vector_add.json"
+    if os.path.isfile(filename):
+        results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, 
+                                strategy="simulated_annealing",
+                                lang="HIP", simulation_mode=True, cache="vector_add.json")
+
+        # Store the tuning results in an output file
+        store_output_file("vector_add_simulated_annealing.json", results, tune_params)
+
+        return results
+    else:
+        print(f"{filename} does not exist in the directory, run vector_add.py first.")
+
+
+if __name__ == "__main__":
+    tune()
\ No newline at end of file

From ee8140b728ef697db0bcb1658f2ceb6e806026b6 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Thu, 11 May 2023 09:15:56 +0200
Subject: [PATCH 56/61] updates hip vector add examples

---
 examples/hip/vector_add.py                 | 6 ++----
 examples/hip/vector_add_simulation_mode.py | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/examples/hip/vector_add.py b/examples/hip/vector_add.py
index 72397b628..5b2470923 100644
--- a/examples/hip/vector_add.py
+++ b/examples/hip/vector_add.py
@@ -29,10 +29,8 @@ def tune():
     tune_params = dict()
     tune_params["block_size_x"] = [128+64*i for i in range(15)]
 
-    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP", log=logging.DEBUG)
-
-    # Store the tuning results in an output file
-    store_output_file("vector_add.json", results, tune_params)
+    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP", 
+                               cache="vector_add_cache.json", log=logging.DEBUG)
 
     # Store the metadata of this run
     store_metadata_file("vector_add-metadata.json")
diff --git a/examples/hip/vector_add_simulation_mode.py b/examples/hip/vector_add_simulation_mode.py
index 379a707de..e03822141 100644
--- a/examples/hip/vector_add_simulation_mode.py
+++ b/examples/hip/vector_add_simulation_mode.py
@@ -35,7 +35,7 @@ def tune():
     if os.path.isfile(filename):
         results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, 
                                 strategy="simulated_annealing",
-                                lang="HIP", simulation_mode=True, cache="vector_add.json")
+                                lang="HIP", simulation_mode=True, cache="vector_add_cache.json")
 
         # Store the tuning results in an output file
         store_output_file("vector_add_simulated_annealing.json", results, tune_params)

From c77e1be5304d670b24a3190e46859cacb3b1d3bf Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 30 Jun 2023 12:48:04 +0200
Subject: [PATCH 57/61] changed tune_params dictionary type

---
 examples/hip/vector_add.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/hip/vector_add.py b/examples/hip/vector_add.py
index 5b2470923..7e2810711 100644
--- a/examples/hip/vector_add.py
+++ b/examples/hip/vector_add.py
@@ -5,6 +5,7 @@
 from kernel_tuner import tune_kernel
 from kernel_tuner.file_utils import store_output_file, store_metadata_file
 import logging
+from collections import OrderedDict
 
 def tune():
 
@@ -26,7 +27,7 @@ def tune():
 
     args = [c, a, b, n]
 
-    tune_params = dict()
+    tune_params = OrderedDict()
     tune_params["block_size_x"] = [128+64*i for i in range(15)]
 
     results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP", 

From 83d9993f1a983692c2b903af87ebe93c7d2838d5 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 30 Jun 2023 12:50:48 +0200
Subject: [PATCH 58/61] specified cache argument for simulation

---
 examples/hip/vector_add_simulation_mode.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/examples/hip/vector_add_simulation_mode.py b/examples/hip/vector_add_simulation_mode.py
index e03822141..a3368bca8 100644
--- a/examples/hip/vector_add_simulation_mode.py
+++ b/examples/hip/vector_add_simulation_mode.py
@@ -31,16 +31,12 @@ def tune():
     tune_params = OrderedDict()
     tune_params["block_size_x"] = [128+64*i for i in range(15)]
 
-    filename = "vector_add.json"
+    filename = "vector_add_cache.json"
     if os.path.isfile(filename):
         results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, 
-                                strategy="simulated_annealing",
+                                strategy="random_sample",  strategy_options=dict(max_fevals=10),
                                 lang="HIP", simulation_mode=True, cache="vector_add_cache.json")
 
-        # Store the tuning results in an output file
-        store_output_file("vector_add_simulated_annealing.json", results, tune_params)
-
-        return results
     else:
         print(f"{filename} does not exist in the directory, run vector_add.py first.")
 

From 4ce2f834db2c31d08676b6902779fb03ab439d76 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 30 Jun 2023 12:52:08 +0200
Subject: [PATCH 59/61] deleted device_properties env, was causing error type

---
 kernel_tuner/backends/hip.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index bbb0f1083..682f5e6a9 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -65,7 +65,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         env["device_name"] = self.name
         env["iterations"] = self.iterations
         env["compiler_options"] = compiler_options
-        env["device_properties"] = self.hipProps
         self.env = env
 
         # create a stream and events

From ee7d11afc0890b6a287a40cde19c71800d4fb539 Mon Sep 17 00:00:00 2001
From: MiloLurati <lurati.milo@gmail.com>
Date: Fri, 30 Jun 2023 12:53:13 +0200
Subject: [PATCH 60/61] fixed bug related to runtimes in simulation mode

---
 kernel_tuner/file_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel_tuner/file_utils.py b/kernel_tuner/file_utils.py
index 321d0db80..13ae223bc 100644
--- a/kernel_tuner/file_utils.py
+++ b/kernel_tuner/file_utils.py
@@ -105,6 +105,8 @@ def store_output_file(output_filename: str, results, tune_params, objective="tim
         timings["validation"] = result["verification_time"]
         if "times" in result:
             timings["runtimes"] = result["times"]
+        else:
+            timings["runtimes"] = []
         out["times"] = timings
 
         # encode the validity of the configuration

From 0d968077c44aab7a5e20825b1f75998055415723 Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven <b.vanwerkhoven@esciencecenter.nl>
Date: Fri, 8 Sep 2023 11:52:12 +0200
Subject: [PATCH 61/61] make store_metadata_file a little more robust

---
 kernel_tuner/file_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel_tuner/file_utils.py b/kernel_tuner/file_utils.py
index 13ae223bc..0d5024187 100644
--- a/kernel_tuner/file_utils.py
+++ b/kernel_tuner/file_utils.py
@@ -227,7 +227,7 @@ def store_metadata_file(metadata_filename: str):
             hardware_description_string = "[" + hardware_description_string + "]"
         metadata["operating_system"] = os_string
     except:
-        hardware_description_string = "[error retrieving hardware description]"
+        hardware_description_string = '["error retrieving hardware description"]'
         metadata["operating_system"] = "unidentified OS"
     metadata["hardware"] = dict(hardware_description=json.loads(hardware_description_string))
 
@@ -235,14 +235,14 @@ def store_metadata_file(metadata_filename: str):
     device_query = {}
     try:
         device_query["nvidia-smi"] = get_device_query("nvidia")
-    except FileNotFoundError:
-        # ignore if nvidia-smi is not found
+    except Exception:
+        # ignore if nvidia-smi is not found, or parse error occurs
         pass
 
     try:
         device_query["rocm-smi"] = get_device_query("amd")
-    except FileNotFoundError:
-        # ignore if rocm-smi is not found
+    except Exception:
+        # ignore if rocm-smi is not found, or parse error occurs
         pass
 
     metadata["environment"] = dict(device_query=device_query, requirements=get_dependencies())