From 8b549142bbb869054c6ec3e004367e76ddab3320 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 3 Apr 2023 18:18:29 +0200 Subject: [PATCH 01/61] added files and started implementation of examples/hip/test_vector_add.py examples/hip/vector_add.py and backend/hip.py --- examples/hip/test_vector_add.py | 38 +++++++++++++++++++++++++++++ examples/hip/vector_add.py | 43 +++++++++++++++++++++++++++++++++ kernel_tuner/backends/hip.py | 30 +++++++++++++++++++++++ 3 files changed, 111 insertions(+) create mode 100644 examples/hip/test_vector_add.py create mode 100644 examples/hip/vector_add.py create mode 100644 kernel_tuner/backends/hip.py diff --git a/examples/hip/test_vector_add.py b/examples/hip/test_vector_add.py new file mode 100644 index 000000000..9e2c69eeb --- /dev/null +++ b/examples/hip/test_vector_add.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +"""Minimal example for a HIP Kernel unit test with the Kernel Tuner""" + +import numpy +from kernel_tuner import run_kernel +import pytest + +def test_vector_add(): + #Check pyhip is installed and if a HIP capable device is present, if not skip the test + try: + import pyhip as hip + hip.hipGetDeviceProperties(0) + except (ImportError, Exception): + pytest.skip("PyHIP not installed or no HIP device detected") + + kernel_string = """ + __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * block_size_x + threadIdx.x; + if (i Date: Tue, 4 Apr 2023 10:56:46 +0200 Subject: [PATCH 02/61] added if statement for HIP lang detection in core.py class DeviceInterface --- kernel_tuner/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index accb58cc1..90c9bb4fa 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -19,6 +19,7 @@ from kernel_tuner.backends.opencl import OpenCLFunctions from kernel_tuner.backends.c import CFunctions from kernel_tuner.backends.opencl import OpenCLFunctions +from kernel_tuner.backends.hip import HipFunctions import kernel_tuner.util as util try: @@ -239,6 +240,8 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No dev = OpenCLFunctions(device, platform, compiler_options=compiler_options, iterations=iterations, observers=observers) elif lang.upper() in ["C", "FORTRAN"]: dev = CFunctions(compiler=compiler, compiler_options=compiler_options, iterations=iterations) + elif lang.upper() == "HIP": + dev = HipFunctions(device, compiler_options=compiler_options, iterations=iterations, observers=observers) else: raise ValueError("Sorry, support for languages other than CUDA, OpenCL, or C is not implemented yet") From 303e4d5201ddbc5eb1fe76b872a0080368561aa5 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 4 Apr 2023 11:52:58 +0200 Subject: [PATCH 03/61] added hip related code to detect_language in util.py --- kernel_tuner/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index e10f788ea..bc3cecf3e 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -259,7 +259,9 @@ def delete_temp_file(filename): def detect_language(kernel_string): """attempt to detect language from the kernel_string""" - if "__global__" in kernel_string: + if "__device__" or "__host__" or "__global__ void" in kernel_string: + lang = "HIP" + elif "__global__" in kernel_string: lang = "CUDA" elif "__kernel" in kernel_string: lang = "OpenCL" From c3278930f8f4abd6c6f439ce619979c62ea0ec62 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 4 Apr 2023 13:27:03 +0200 Subject: [PATCH 04/61] modified hip tests to enclude hIP lang argument speficication when calling tune_kernel --- examples/hip/test_vector_add.py | 2 +- examples/hip/vector_add.py | 2 +- kernel_tuner/util.py | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/hip/test_vector_add.py b/examples/hip/test_vector_add.py index 9e2c69eeb..6fd21aced 100644 --- a/examples/hip/test_vector_add.py +++ b/examples/hip/test_vector_add.py @@ -33,6 +33,6 @@ def test_vector_add(): args = [c, a, b, n] params = {"block_size_x": 512} - answer = run_kernel("vector_add", kernel_string, problem_size, args, params) + answer = run_kernel("vector_add", kernel_string, problem_size, args, params, lang="HIP") assert numpy.allclose(answer[0], a+b, atol=1e-8) \ No newline at end of file diff --git a/examples/hip/vector_add.py b/examples/hip/vector_add.py index d52c21b27..ed572877b 100644 --- a/examples/hip/vector_add.py +++ b/examples/hip/vector_add.py @@ -28,7 +28,7 @@ def tune(): tune_params = dict() tune_params["block_size_x"] = [128+64*i for i in range(15)] - results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params) + results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP") # Store the tuning results in an output file store_output_file("vector_add.json", results, tune_params) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index bc3cecf3e..890ef21a5 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -259,8 +259,6 @@ def delete_temp_file(filename): def detect_language(kernel_string): """attempt to detect language from the kernel_string""" - if "__device__" or "__host__" or "__global__ void" in kernel_string: - lang = "HIP" elif "__global__" in kernel_string: lang = "CUDA" elif "__kernel" in kernel_string: From 0f66875337fdf5a9626492d21a46752ee8d4924c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 4 Apr 2023 16:18:04 +0200 Subject: [PATCH 05/61] added (not tested) compile function to backend/hip.py --- kernel_tuner/backends/hip.py | 80 +++++++++++++++++++++++++++++++++++- kernel_tuner/util.py | 2 +- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 610a7d7bc..6eeb64a81 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -1,15 +1,36 @@ """This module contains all HIP specific kernel_tuner functions""" import numpy as np +import ctypes as C +from collections import namedtuple from kernel_tuner.backends.backend import GPUBackend # embedded in try block to be able to generate documentation # and run tests without pyhip installed try: - import pyhip as hip + from pyhip import hip, hiprtc except ImportError: hip = None + hiprtc = None + +dtype_map = { + "int8": C.c_int8, + "int16": C.c_int16, + "int32": C.c_int32, + "int64": C.c_int64, + "uint8": C.c_uint8, + "uint16": C.c_uint16, + "uint32": C.c_uint32, + "uint64": C.c_uint64, + "float32": C.c_float, + "float64": C.c_double, +} + +# This represents an individual kernel argument. +# It contains a numpy object (ndarray or number) and a ctypes object with a copy +# of the argument data. For an ndarray, the ctypes object is a wrapper for the ndarray's data. +Argument = namedtuple("Argument", ["numpy", "ctypes"]) class HipFunctions(GPUBackend): """Class that groups the HIP functions on maintains state about the device""" @@ -27,4 +48,59 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. :type iterations: int - """ \ No newline at end of file + """ + + def ready_argument_list(self, arguments): + """ready argument list to be passed to the HIP function + :param arguments: List of arguments to be passed to the HIP function. + The order should match the argument list on the HIP function. + Allowed values are np.ndarray, and/or np.int32, np.float32, and so on. + :type arguments: list(numpy objects) + :returns: A list of arguments that can be passed to the HIP function. + :rtype: list(Argument) + """ + ctype_args = [None for _ in arguments] + + for i, arg in enumerate(arguments): + if not isinstance(arg, (np.ndarray, np.number)): + raise TypeError( + "Argument is not numpy ndarray or numpy scalar %s" % type(arg) + ) + dtype_str = str(arg.dtype) + if isinstance(arg, np.ndarray): + if dtype_str in dtype_map.keys(): + # In numpy <= 1.15, ndarray.ctypes.data_as does not itself keep a reference + # to its underlying array, so we need to store a reference to arg.copy() + # in the Argument object manually to avoid it being deleted. + # (This changed in numpy > 1.15.) + # data_ctypes = data.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) + data_ctypes = arg.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) + else: + raise TypeError("unknown dtype for ndarray") + elif isinstance(arg, np.generic): + data_ctypes = dtype_map[dtype_str](arg) + ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes) + return ctype_args + + def compile(self, kernel_instance): + """call the HIP compiler to compile the kernel, return the function + + :param kernel_instance: An object representing the specific instance of the tunable kernel + in the parameter space. + :type kernel_instance: kernel_tuner.core.KernelInstance + + :returns: An ctypes function that can be called directly. + :rtype: ctypes._FuncPtr + """ + + kernel_string = kernel_instance.kernel_string + kernel_name = kernel_instance.name + kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], []) + + device_properties = hip.hipGetDeviceProperties(0) + hiprtc.hiprtcCompileProgram(kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}']) + code = hiprtc.hiprtcGetCode(kernel_ptr) + module = hip.hipModuleLoadData(code) + kernel = hip.hipModuleGetFunction(module, kernel_name) + + return kernel \ No newline at end of file diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 890ef21a5..e10f788ea 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -259,7 +259,7 @@ def delete_temp_file(filename): def detect_language(kernel_string): """attempt to detect language from the kernel_string""" - elif "__global__" in kernel_string: + if "__global__" in kernel_string: lang = "CUDA" elif "__kernel" in kernel_string: lang = "OpenCL" From 39700873a83ffc6b5e2f0d40cc624b8f2fa1ecae Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 4 Apr 2023 16:32:31 +0200 Subject: [PATCH 06/61] added (not tested) start_event and stop_event functions to backend/hip.py --- kernel_tuner/backends/hip.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 6eeb64a81..5f2a07356 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -50,6 +50,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None :type iterations: int """ + # create a stream and events + self.stream = hip.hipStreamCreate() + self.start = hip.hipEventCreate() + self.end = hip.hipEventCreate() + def ready_argument_list(self, arguments): """ready argument list to be passed to the HIP function :param arguments: List of arguments to be passed to the HIP function. @@ -103,4 +108,14 @@ def compile(self, kernel_instance): module = hip.hipModuleLoadData(code) kernel = hip.hipModuleGetFunction(module, kernel_name) - return kernel \ No newline at end of file + return kernel + + def start_event(self): + """Records the event that marks the start of a measurement""" + self.start = hip.hipEventCreate() + hip.hipEventRecord(self.start, self.stream) + + def stop_event(self): + """Records the event that marks the end of a measurement""" + self.end = hip.hipEventCreate() + hip.hipEventRecord(self.end, self.stream) From f92471cc6659f365ff3a8e5b3621326079cae423 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 4 Apr 2023 17:41:48 +0200 Subject: [PATCH 07/61] added (not tested) kernel_finished function to backend/hip.py --- kernel_tuner/backends/hip.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 5f2a07356..2786e67fe 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -2,10 +2,17 @@ import numpy as np import ctypes as C +import ctypes.util from collections import namedtuple from kernel_tuner.backends.backend import GPUBackend +try: + # Load the HIP runtime library + hip_lib = C.cdll.LoadLibrary(C.util.find_library('hip')) +except ImportError: + hip_lib = None + # embedded in try block to be able to generate documentation # and run tests without pyhip installed try: @@ -119,3 +126,22 @@ def stop_event(self): """Records the event that marks the end of a measurement""" self.end = hip.hipEventCreate() hip.hipEventRecord(self.end, self.stream) + + def kernel_finished(self): + """Returns True if the kernel has finished, False otherwise""" + + # Define the argument and return types for hipEventQuery() + hip_lib.hipEventQuery.argtypes = [C.c_void_p] + hip_lib.hipEventQuery.restype = C.c_int + + # Query the status of the event + status = hip_lib.hipEventQuery(self.end) + if status == hip_lib.hipSuccess: + # Kernel has finished + return True + elif status == hip_lib.hipErrorNotReady: + # Kernel is still running + return False + else: + # Error occurred + return False From 3425e2dfd58c85dcfcfc2b6f8c705f0ea4ddc75f Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 5 Apr 2023 11:01:17 +0200 Subject: [PATCH 08/61] changed how pyhip is imported (via import sys with specific path of system) --- kernel_tuner/backends/hip.py | 68 +++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 2786e67fe..cb866f638 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -1,7 +1,7 @@ """This module contains all HIP specific kernel_tuner functions""" import numpy as np -import ctypes as C +import ctypes import ctypes.util from collections import namedtuple @@ -9,29 +9,33 @@ try: # Load the HIP runtime library - hip_lib = C.cdll.LoadLibrary(C.util.find_library('hip')) + hip_lib = ctypes.cdll.LoadLibrary(ctypes.util.find_library('hip')) except ImportError: + print("Not able to import hip c lib") hip_lib = None # embedded in try block to be able to generate documentation # and run tests without pyhip installed try: + import sys + sys.path.insert(0, '/home/mli940/PyHIP') from pyhip import hip, hiprtc except ImportError: + print("Not able to import pyhip") hip = None hiprtc = None dtype_map = { - "int8": C.c_int8, - "int16": C.c_int16, - "int32": C.c_int32, - "int64": C.c_int64, - "uint8": C.c_uint8, - "uint16": C.c_uint16, - "uint32": C.c_uint32, - "uint64": C.c_uint64, - "float32": C.c_float, - "float64": C.c_double, + "int8": ctypes.c_int8, + "int16": ctypes.c_int16, + "int32": ctypes.c_int32, + "int64": ctypes.c_int64, + "uint8": ctypes.c_uint8, + "uint16": ctypes.c_uint16, + "uint32": ctypes.c_uint32, + "uint64": ctypes.c_uint64, + "float32": ctypes.c_float, + "float64": ctypes.c_double, } # This represents an individual kernel argument. @@ -85,8 +89,8 @@ def ready_argument_list(self, arguments): # to its underlying array, so we need to store a reference to arg.copy() # in the Argument object manually to avoid it being deleted. # (This changed in numpy > 1.15.) - # data_ctypes = data.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) - data_ctypes = arg.ctypes.data_as(C.POINTER(dtype_map[dtype_str])) + # data_ctypes = data.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) + data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) else: raise TypeError("unknown dtype for ndarray") elif isinstance(arg, np.generic): @@ -131,8 +135,8 @@ def kernel_finished(self): """Returns True if the kernel has finished, False otherwise""" # Define the argument and return types for hipEventQuery() - hip_lib.hipEventQuery.argtypes = [C.c_void_p] - hip_lib.hipEventQuery.restype = C.c_int + hip_lib.hipEventQuery.argtypes = [ctypes.c_void_p] + hip_lib.hipEventQuery.restype = ctypes.c_int # Query the status of the event status = hip_lib.hipEventQuery(self.end) @@ -145,3 +149,35 @@ def kernel_finished(self): else: # Error occurred return False + + def synchronize(self): + """This method must implement a barrier that halts execution until device has finished its tasks.""" + pass + + def run_kernel(self, func, gpu_args, threads, grid, stream): + """This method must implement the execution of the kernel on the device.""" + pass + + def memset(self, allocation, value, size): + """This method must implement setting the memory to a value on the device.""" + pass + + def memcpy_dtoh(self, dest, src): + """This method must implement a device to host copy.""" + pass + + def memcpy_htod(self, dest, src): + """This method must implement a host to device copy.""" + pass + + def copy_constant_memory_args(self, cmem_args): + """This method must implement the allocation and copy of constant memory to the GPU.""" + pass + + def copy_shared_memory_args(self, smem_args): + """This method must implement the dynamic allocation of shared memory on the GPU.""" + pass + + def copy_texture_memory_args(self, texmem_args): + """This method must implement the allocation and copy of texture memory to the GPU.""" + pass From ef476e193e948c007d55e911c79fc51caccc3b9f Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 5 Apr 2023 16:55:00 +0200 Subject: [PATCH 09/61] added (not tested) to HIPFunctions arguments env, max_threads, name --- kernel_tuner/backends/hip.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index cb866f638..0e7f4d070 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -61,6 +61,13 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None :type iterations: int """ + hipProps = hip.hipDeviceProperties() + self.name = hipProps.name + self.max_threads = hipProps.maxThreadsPerBlock + + env = dict() + self.env = env + # create a stream and events self.stream = hip.hipStreamCreate() self.start = hip.hipEventCreate() @@ -181,3 +188,5 @@ def copy_shared_memory_args(self, smem_args): def copy_texture_memory_args(self, texmem_args): """This method must implement the allocation and copy of texture memory to the GPU.""" pass + + units = {"time": "ms"} From e120aca32e9ec9b8a0fee0b2cb607a6a965dfa85 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 6 Apr 2023 12:56:12 +0200 Subject: [PATCH 10/61] self.name of HIPFunctions works --- kernel_tuner/backends/hip.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 0e7f4d070..8cdbb9f78 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -60,9 +60,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. :type iterations: int """ - - hipProps = hip.hipDeviceProperties() - self.name = hipProps.name + hip.hipInit(0) + + hipProps = hip.hipGetDeviceProperties(device) + self.name = hipProps._name.decode('utf-8') self.max_threads = hipProps.maxThreadsPerBlock env = dict() From 5cdf495edc79197261872c19c528bca5fb558ade Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 6 Apr 2023 13:06:59 +0200 Subject: [PATCH 11/61] import of pyhip based on PYHIP_PATH --- kernel_tuner/backends/hip.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 8cdbb9f78..12c2806d8 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -4,6 +4,8 @@ import ctypes import ctypes.util from collections import namedtuple +import os +import sys from kernel_tuner.backends.backend import GPUBackend @@ -16,12 +18,13 @@ # embedded in try block to be able to generate documentation # and run tests without pyhip installed -try: - import sys - sys.path.insert(0, '/home/mli940/PyHIP') +PYHIP_PATH = os.environ.get('PYHIP_PATH') # get the PYHIP_PATH environment variable +try: + if PYHIP_PATH is not None: + sys.path.insert(0, PYHIP_PATH) from pyhip import hip, hiprtc except ImportError: - print("Not able to import pyhip") + print("Not able to import pyhip, check if PYHIP_PATH is set") hip = None hiprtc = None From d15060259efb8a3bb73982ec6162bb695794c4b1 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 6 Apr 2023 17:49:16 +0200 Subject: [PATCH 12/61] added test/test_hip_functions.py with test_ready_argument_list (not sure of its correctness) and test_compile --- test/test_hip_functions.py | 58 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 test/test_hip_functions.py diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py new file mode 100644 index 000000000..132c24a77 --- /dev/null +++ b/test/test_hip_functions.py @@ -0,0 +1,58 @@ +import numpy as np +import ctypes +from .context import skip_if_no_pyhip + +import pytest +from kernel_tuner.backends import hip as kt_hip +from kernel_tuner.core import KernelSource, KernelInstance + +try: + from pyhip import hip, hiprtc + hip_present = True +except ImportError: + pass + +@skip_if_no_pyhip +def test_ready_argument_list(): + + size = 1000 + a = np.int32(75) + b = np.random.randn(size).astype(np.float32) + c = np.bool_(True) + d = np.zeros_like(b) + + arguments = [d, a, b, c] + + dev = kt_hip.HipFunctions(0) + gpu_args = dev.ready_argument_list(arguments) + + assert isinstance(gpu_args[0], ctypes.POINTER(ctypes.c_float)) + assert isinstance(gpu_args[1], ctypes.c_int32) + assert isinstance(gpu_args[2], ctypes.POINTER(ctypes.c_float)) + assert isinstance(gpu_args[3], ctypes.c_bool) + +@skip_if_no_pyhip +def test_compile(): + + kernel_string = """ + __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i Date: Thu, 6 Apr 2023 17:49:55 +0200 Subject: [PATCH 13/61] added hip related code for tests --- test/context.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/context.py b/test/context.py index 71c0616a5..4842b9c18 100644 --- a/test/context.py +++ b/test/context.py @@ -1,6 +1,7 @@ import sys import subprocess import shutil +import os import pytest @@ -38,6 +39,15 @@ except Exception: cuda_present = False +PYHIP_PATH = os.environ.get('PYHIP_PATH') # get the PYHIP_PATH environment variable +try: + if PYHIP_PATH is not None: + sys.path.insert(0, PYHIP_PATH) + from pyhip import hip, hiprtc + pyhip_present = True +except ImportError: + pyhip_present = False + skip_if_no_pycuda = pytest.mark.skipif(not pycuda_present, reason="PyCuda not installed or no CUDA device detected") skip_if_no_cupy = pytest.mark.skipif(not cupy_present, reason="CuPy not installed or no CUDA device detected") skip_if_no_cuda = pytest.mark.skipif(not cuda_present, reason="NVIDIA CUDA not installed") @@ -45,6 +55,7 @@ skip_if_no_gcc = pytest.mark.skipif(not gcc_present, reason="No gcc on PATH") skip_if_no_gfortran = pytest.mark.skipif(not gfortran_present, reason="No gfortran on PATH") skip_if_no_openmp = pytest.mark.skipif(not openmp_present, reason="No OpenMP found") +skip_if_no_pyhip = pytest.mark.skipif(not pyhip_present, reason="No PyHIP found") def skip_backend(backend: str): From 9f691d0fd71254c8a2d0f7b4a90ff6e43daa1d61 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 6 Apr 2023 17:51:37 +0200 Subject: [PATCH 14/61] kernel_tuner/backends/hip.py::compile passes test/test_hip_functions.py::test_compile --- kernel_tuner/backends/hip.py | 37 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 12c2806d8..85e01d17f 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -19,12 +19,10 @@ # embedded in try block to be able to generate documentation # and run tests without pyhip installed PYHIP_PATH = os.environ.get('PYHIP_PATH') # get the PYHIP_PATH environment variable -try: - if PYHIP_PATH is not None: - sys.path.insert(0, PYHIP_PATH) +try: from pyhip import hip, hiprtc except ImportError: - print("Not able to import pyhip, check if PYHIP_PATH is set") + print("Not able to import pyhip, check if PYTHONPATH includes PyHIP") hip = None hiprtc = None @@ -68,6 +66,8 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None hipProps = hip.hipGetDeviceProperties(device) self.name = hipProps._name.decode('utf-8') self.max_threads = hipProps.maxThreadsPerBlock + self.device = device + self.compiler_options = compiler_options env = dict() self.env = env @@ -89,10 +89,6 @@ def ready_argument_list(self, arguments): ctype_args = [None for _ in arguments] for i, arg in enumerate(arguments): - if not isinstance(arg, (np.ndarray, np.number)): - raise TypeError( - "Argument is not numpy ndarray or numpy scalar %s" % type(arg) - ) dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): @@ -104,8 +100,10 @@ def ready_argument_list(self, arguments): data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) else: raise TypeError("unknown dtype for ndarray") + elif isinstance(arg, np.bool_): + data_ctypes = ctypes.c_bool(arg) elif isinstance(arg, np.generic): - data_ctypes = dtype_map[dtype_str](arg) + data_ctypes = dtype_map[dtype_str](arg) ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes) return ctype_args @@ -119,13 +117,28 @@ def compile(self, kernel_instance): :returns: An ctypes function that can be called directly. :rtype: ctypes._FuncPtr """ - kernel_string = kernel_instance.kernel_string kernel_name = kernel_instance.name + + # if filename is known, use that one + suffix = kernel_instance.kernel_source.get_user_suffix() + + if suffix is None: + # select right suffix based on compiler + suffix = ".cc" + + if ".c" in suffix and 'extern "C"' not in kernel_string: + kernel_string = 'extern "C" {\n' + kernel_string + "\n}" + kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], []) - device_properties = hip.hipGetDeviceProperties(0) - hiprtc.hiprtcCompileProgram(kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}']) + device_properties = hip.hipGetDeviceProperties(self.device) + plat = hip.hipGetPlatformName() + if plat == "amd": + hiprtc.hiprtcCompileProgram( + kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}']) + else: + hiprtc.hiprtcCompileProgram(kernel_ptr, []) code = hiprtc.hiprtcGetCode(kernel_ptr) module = hip.hipModuleLoadData(code) kernel = hip.hipModuleGetFunction(module, kernel_name) From 52ececf4440bddf7d143fb418ac888d8eafe72f2 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Apr 2023 14:58:40 +0200 Subject: [PATCH 15/61] ready_argument_list passes partialy due to diffiulty with comparing ctypes in test (not sure whats wrong yet) --- kernel_tuner/backends/hip.py | 40 +++++++++++++++++++----------------- test/test_hip_functions.py | 10 +++++---- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 85e01d17f..d56480104 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -27,6 +27,7 @@ hiprtc = None dtype_map = { + "bool": ctypes.c_bool, "int8": ctypes.c_int8, "int16": ctypes.c_int16, "int32": ctypes.c_int32, @@ -39,11 +40,6 @@ "float64": ctypes.c_double, } -# This represents an individual kernel argument. -# It contains a numpy object (ndarray or number) and a ctypes object with a copy -# of the argument data. For an ndarray, the ctypes object is a wrapper for the ndarray's data. -Argument = namedtuple("Argument", ["numpy", "ctypes"]) - class HipFunctions(GPUBackend): """Class that groups the HIP functions on maintains state about the device""" @@ -83,29 +79,35 @@ def ready_argument_list(self, arguments): The order should match the argument list on the HIP function. Allowed values are np.ndarray, and/or np.int32, np.float32, and so on. :type arguments: list(numpy objects) - :returns: A list of arguments that can be passed to the HIP function. - :rtype: list(Argument) + :returns: A ctypes structure that can be passed to the HIP function. + :rtype: ctypes.Structure """ - ctype_args = [None for _ in arguments] + ctype_args = [] for i, arg in enumerate(arguments): dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): - # In numpy <= 1.15, ndarray.ctypes.data_as does not itself keep a reference - # to its underlying array, so we need to store a reference to arg.copy() - # in the Argument object manually to avoid it being deleted. - # (This changed in numpy > 1.15.) - # data_ctypes = data.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) else: - raise TypeError("unknown dtype for ndarray") - elif isinstance(arg, np.bool_): - data_ctypes = ctypes.c_bool(arg) + raise TypeError("unknown dtype for ndarray") elif isinstance(arg, np.generic): - data_ctypes = dtype_map[dtype_str](arg) - ctype_args[i] = Argument(numpy=arg, ctypes=data_ctypes) - return ctype_args + data_ctypes = dtype_map[dtype_str](arg) + ctype_args.append(data_ctypes) + + # Determine the number of fields in the structure + num_fields = len(ctype_args) + # Determine the types of the fields in the structure + field_types = [type(x) for x in ctype_args] + # Define a new ctypes structure with the inferred layout + class ArgListStructure(ctypes.Structure): + _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] + ctypes_struct = ArgListStructure() + # Populate the fields of the structure with values from the list + for i, value in enumerate(ctype_args): + setattr(ctypes_struct, f'field{i}', value) + + return ctypes_struct def compile(self, kernel_instance): """call the HIP compiler to compile the kernel, return the function diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py index 132c24a77..573ea93c5 100644 --- a/test/test_hip_functions.py +++ b/test/test_hip_functions.py @@ -26,10 +26,12 @@ def test_ready_argument_list(): dev = kt_hip.HipFunctions(0) gpu_args = dev.ready_argument_list(arguments) - assert isinstance(gpu_args[0], ctypes.POINTER(ctypes.c_float)) - assert isinstance(gpu_args[1], ctypes.c_int32) - assert isinstance(gpu_args[2], ctypes.POINTER(ctypes.c_float)) - assert isinstance(gpu_args[3], ctypes.c_bool) + assert isinstance(gpu_args, ctypes.Structure) + + assert isinstance(gpu_args.field0, ctypes.POINTER(ctypes.c_float)) + #assert isinstance(gpu_args.field1, ctypes.c_int32) + assert isinstance(gpu_args.field2, ctypes.POINTER(ctypes.c_float)) + #assert isinstance(gpu_args.field3, ctypes.c_bool) @skip_if_no_pyhip def test_compile(): From d3ae6bad3023f3fd94293f94f150ef263fd5f148 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Apr 2023 18:34:04 +0200 Subject: [PATCH 16/61] added (not tested) run_kernel, memset, memcpy_dtoh, memcpy_htod --- examples/hip/vector_add.py | 3 +- kernel_tuner/backends/hip.py | 94 ++++++++++++++++++++++++++++++------ 2 files changed, 80 insertions(+), 17 deletions(-) diff --git a/examples/hip/vector_add.py b/examples/hip/vector_add.py index ed572877b..72397b628 100644 --- a/examples/hip/vector_add.py +++ b/examples/hip/vector_add.py @@ -4,6 +4,7 @@ import numpy from kernel_tuner import tune_kernel from kernel_tuner.file_utils import store_output_file, store_metadata_file +import logging def tune(): @@ -28,7 +29,7 @@ def tune(): tune_params = dict() tune_params["block_size_x"] = [128+64*i for i in range(15)] - results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP") + results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP", log=logging.DEBUG) # Store the tuning results in an output file store_output_file("vector_add.json", results, tune_params) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index d56480104..394e91636 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -6,6 +6,7 @@ from collections import namedtuple import os import sys +import logging from kernel_tuner.backends.backend import GPUBackend @@ -57,6 +58,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. :type iterations: int """ + logging.debug("HipFunction instantiated") hip.hipInit(0) hipProps = hip.hipGetDeviceProperties(device) @@ -73,6 +75,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.start = hip.hipEventCreate() self.end = hip.hipEventCreate() + self.smem_size = 0 + + env["device_name"] = self.name + def ready_argument_list(self, arguments): """ready argument list to be passed to the HIP function :param arguments: List of arguments to be passed to the HIP function. @@ -82,7 +88,7 @@ def ready_argument_list(self, arguments): :returns: A ctypes structure that can be passed to the HIP function. :rtype: ctypes.Structure """ - + logging.debug("HipFunction ready_argument_list called") ctype_args = [] for i, arg in enumerate(arguments): dtype_str = str(arg.dtype) @@ -95,8 +101,6 @@ def ready_argument_list(self, arguments): data_ctypes = dtype_map[dtype_str](arg) ctype_args.append(data_ctypes) - # Determine the number of fields in the structure - num_fields = len(ctype_args) # Determine the types of the fields in the structure field_types = [type(x) for x in ctype_args] # Define a new ctypes structure with the inferred layout @@ -119,6 +123,7 @@ def compile(self, kernel_instance): :returns: An ctypes function that can be called directly. :rtype: ctypes._FuncPtr """ + logging.debug("HipFunction compile called") kernel_string = kernel_instance.kernel_string kernel_name = kernel_instance.name @@ -136,6 +141,7 @@ def compile(self, kernel_instance): device_properties = hip.hipGetDeviceProperties(self.device) plat = hip.hipGetPlatformName() + #Compile based on device if plat == "amd": hiprtc.hiprtcCompileProgram( kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}']) @@ -149,16 +155,17 @@ def compile(self, kernel_instance): def start_event(self): """Records the event that marks the start of a measurement""" - self.start = hip.hipEventCreate() + logging.debug("HipFunction start_event called") hip.hipEventRecord(self.start, self.stream) def stop_event(self): """Records the event that marks the end of a measurement""" - self.end = hip.hipEventCreate() + logging.debug("HipFunction stop_event called") hip.hipEventRecord(self.end, self.stream) def kernel_finished(self): """Returns True if the kernel has finished, False otherwise""" + logging.debug("HipFunction kernel_finished called") # Define the argument and return types for hipEventQuery() hip_lib.hipEventQuery.argtypes = [ctypes.c_void_p] @@ -177,35 +184,90 @@ def kernel_finished(self): return False def synchronize(self): - """This method must implement a barrier that halts execution until device has finished its tasks.""" + """Halts execution until device has finished its tasks""" + logging.debug("HipFunction synchronize called") + hip.hipEventSynchronize(self.end) pass def run_kernel(self, func, gpu_args, threads, grid, stream): - """This method must implement the execution of the kernel on the device.""" + """runs the HIP kernel passed as 'func' + + :param func: A PyHIP kernel compiled for this specific kernel configuration + :type func: ctypes pionter + + :param gpu_args: A list of arguments to the kernel, order should match the + order in the code. Allowed values are either variables in global memory + or single values passed by value. + :type gpu_args: ctypes.Structure + + :param threads: A tuple listing the number of threads in each dimension of + the thread block + :type threads: tuple(int, int, int) + + :param grid: A tuple listing the number of thread blocks in each dimension + of the grid + :type grid: tuple(int, int) + """ + logging.debug("HipFunction run_kernel called") + hip.hipModuleLaunchKernel(func, + grid[0], grid[1], grid[2], grid[3], + threads[0], threads[1], threads[2], + self.smem_size, + stream, + gpu_args) pass def memset(self, allocation, value, size): - """This method must implement setting the memory to a value on the device.""" - pass + """set the memory in allocation to the value in value + + :param allocation: An Argument for some memory allocation unit + :type allocation: ctypes ptr + + :param value: The value to set the memory to + :type value: a single 8-bit unsigned int + + :param size: The size of to the allocation unit in bytes + :type size: int + """ + logging.debug("HipFunction memset called") + allocation.contents.value = value def memcpy_dtoh(self, dest, src): - """This method must implement a device to host copy.""" - pass + """perform a device to host memory copy + + :param dest: A numpy array in host memory to store the data + :type dest: numpy.ndarray + + :param src: A GPU memory allocation unit + :type src: ctypes ptr + """ + logging.debug("HipFunction memcpy_dtoh called") + dtype_str = str(src.dtype) + hip.hipMemcpy_dtoh(ctypes.byref(dest.ctypes), src, ctypes.sizeof(dtype_map[dtype_str]) * src.size) def memcpy_htod(self, dest, src): - """This method must implement a host to device copy.""" - pass + """perform a host to device memory copy + + :param dest: A GPU memory allocation unit + :type dest: ctypes ptr + + :param src: A numpy array in host memory to store the data + :type src: numpy.ndarray + """ + logging.debug("HipFunction memcpy_htod called") + dtype_str = str(src.dtype) + hip.hipMemcpy_htod(dest, ctypes.byref(src.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * src.size) def copy_constant_memory_args(self, cmem_args): """This method must implement the allocation and copy of constant memory to the GPU.""" - pass + logging.debug("HipFunction copy_constant_memory_args called") def copy_shared_memory_args(self, smem_args): """This method must implement the dynamic allocation of shared memory on the GPU.""" - pass + logging.debug("HipFunction copy_shared_memory_args called") def copy_texture_memory_args(self, texmem_args): """This method must implement the allocation and copy of texture memory to the GPU.""" - pass + logging.debug("HipFunction copy_texture_memory_args called") units = {"time": "ms"} From cbe9997dd3fa950b6af81591a42db2fbf10e03f1 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 11 Apr 2023 16:00:59 +0200 Subject: [PATCH 17/61] not able to solve hipGetDeviceProperties issue, for now pached manually --- kernel_tuner/backends/hip.py | 39 +++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 394e91636..9f6a36d62 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -10,12 +10,26 @@ from kernel_tuner.backends.backend import GPUBackend -try: - # Load the HIP runtime library - hip_lib = ctypes.cdll.LoadLibrary(ctypes.util.find_library('hip')) -except ImportError: - print("Not able to import hip c lib") - hip_lib = None +_libhip = None +_hip_platform_name = '' + +# Try to find amd hip library, if not found, fallback to nvhip library +if 'linux' in sys.platform: + try: + _libhip_libname = 'libamdhip64.so' + _libhip = ctypes.cdll.LoadLibrary(_libhip_libname) + _hip_platform_name = 'amd' + except: + try: + _libhip_libname = 'libnvhip64.so' + _libhip = ctypes.cdll.LoadLibrary(_libhip_libname) + _hip_platform_name = 'nvidia' + except: + raise RuntimeError( + 'cant find libamdhip64.so or libnvhip64.so. make sure LD_LIBRARY_PATH is set') +else: + # Currently we do not support windows + raise RuntimeError('Only linux is supported') # embedded in try block to be able to generate documentation # and run tests without pyhip installed @@ -59,15 +73,24 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None :type iterations: int """ logging.debug("HipFunction instantiated") - hip.hipInit(0) + #hip.hipInit(0) + #hip.hipSetDevice(device) hipProps = hip.hipGetDeviceProperties(device) + # Print out all the values + #for field_name, field_type in hipProps._fields_: + # print(f"{field_name}: {getattr(hipProps, field_name)}") + self.name = hipProps._name.decode('utf-8') self.max_threads = hipProps.maxThreadsPerBlock + print("self.max_threads: " + str(self.max_threads)) + self.max_threads = 1024 # PATCH FOR NOW + self.device = device self.compiler_options = compiler_options env = dict() + env["device_name"] = self.name self.env = env # create a stream and events @@ -77,8 +100,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.smem_size = 0 - env["device_name"] = self.name - def ready_argument_list(self, arguments): """ready argument list to be passed to the HIP function :param arguments: List of arguments to be passed to the HIP function. From 13d93bd1b825887bee80a15170d8af284dedb773 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 11 Apr 2023 16:46:48 +0200 Subject: [PATCH 18/61] added (not tested) observer code --- kernel_tuner/backends/hip.py | 14 +++++++++++--- kernel_tuner/observers/hip.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 kernel_tuner/observers/hip.py diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 9f6a36d62..bc9f394bb 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -9,6 +9,7 @@ import logging from kernel_tuner.backends.backend import GPUBackend +from kernel_tuner.observers.hip import HipRuntimeObserver _libhip = None _hip_platform_name = '' @@ -33,7 +34,6 @@ # embedded in try block to be able to generate documentation # and run tests without pyhip installed -PYHIP_PATH = os.environ.get('PYHIP_PATH') # get the PYHIP_PATH environment variable try: from pyhip import hip, hiprtc except ImportError: @@ -100,6 +100,12 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.smem_size = 0 + # setup observers + self.observers = observers or [] + self.observers.append(HipRuntimeObserver(self)) + for obs in self.observers: + obs.register_device(self) + def ready_argument_list(self, arguments): """ready argument list to be passed to the HIP function :param arguments: List of arguments to be passed to the HIP function. @@ -210,7 +216,7 @@ def synchronize(self): hip.hipEventSynchronize(self.end) pass - def run_kernel(self, func, gpu_args, threads, grid, stream): + def run_kernel(self, func, gpu_args, threads, grid, stream=None): """runs the HIP kernel passed as 'func' :param func: A PyHIP kernel compiled for this specific kernel configuration @@ -230,8 +236,10 @@ def run_kernel(self, func, gpu_args, threads, grid, stream): :type grid: tuple(int, int) """ logging.debug("HipFunction run_kernel called") + if stream is None: + stream = self.stream hip.hipModuleLaunchKernel(func, - grid[0], grid[1], grid[2], grid[3], + grid[0], grid[1], grid[2], threads[0], threads[1], threads[2], self.smem_size, stream, diff --git a/kernel_tuner/observers/hip.py b/kernel_tuner/observers/hip.py new file mode 100644 index 000000000..4c7704aa8 --- /dev/null +++ b/kernel_tuner/observers/hip.py @@ -0,0 +1,31 @@ +import numpy as np + +from kernel_tuner.observers.observer import BenchmarkObserver + +try: + from pyhip import hip, hiprtc +except ImportError: + print("Not able to import pyhip, check if PYTHONPATH includes PyHIP") + hip = None + hiprtc = None + + +class HipRuntimeObserver(BenchmarkObserver): + """Observer that measures time using CUDA events during benchmarking""" + + def __init__(self, dev): + self.dev = dev + self.stream = dev.stream + self.start = dev.start + self.end = dev.end + self.times = [] + + def after_finish(self): + # Time is measured in milliseconds + EventElapsedTime = hip.hipEventElapsedTime(self.start, self.end) + self.times.append(EventElapsedTime.value) + + def get_results(self): + results = {"time": np.average(self.times), "times": self.times.copy()} + self.times = [] + return results From d9512ac0ebd6dbcebab5e9769b94ccf94fc48a06 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 12 Apr 2023 10:30:18 +0200 Subject: [PATCH 19/61] added (not tested) observer code --- kernel_tuner/backends/hip.py | 27 ++++++++++++++++++--------- test/test_hip_functions.py | 6 +++--- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index bc9f394bb..f29b2c69d 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -113,10 +113,11 @@ def ready_argument_list(self, arguments): Allowed values are np.ndarray, and/or np.int32, np.float32, and so on. :type arguments: list(numpy objects) :returns: A ctypes structure that can be passed to the HIP function. - :rtype: ctypes.Structure - """ + :rtype: ctypes.Structure""" + logging.debug("HipFunction ready_argument_list called") ctype_args = [] + data_ctypes = None for i, arg in enumerate(arguments): dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): @@ -137,9 +138,11 @@ class ArgListStructure(ctypes.Structure): # Populate the fields of the structure with values from the list for i, value in enumerate(ctype_args): setattr(ctypes_struct, f'field{i}', value) + print(f'field{i} = {value} of {type(value)}') return ctypes_struct + def compile(self, kernel_instance): """call the HIP compiler to compile the kernel, return the function @@ -195,15 +198,15 @@ def kernel_finished(self): logging.debug("HipFunction kernel_finished called") # Define the argument and return types for hipEventQuery() - hip_lib.hipEventQuery.argtypes = [ctypes.c_void_p] - hip_lib.hipEventQuery.restype = ctypes.c_int + _libhip.hipEventQuery.argtypes = [ctypes.c_void_p] + _libhip.hipEventQuery.restype = ctypes.c_int # Query the status of the event - status = hip_lib.hipEventQuery(self.end) - if status == hip_lib.hipSuccess: + status = _libhip.hipEventQuery(self.end) + if status == _libhip.hipSuccess: # Kernel has finished return True - elif status == hip_lib.hipErrorNotReady: + elif status == _libhip.hipErrorNotReady: # Kernel is still running return False else: @@ -233,18 +236,23 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): :param grid: A tuple listing the number of thread blocks in each dimension of the grid - :type grid: tuple(int, int) + :type grid: tuple(int, int, int) """ logging.debug("HipFunction run_kernel called") if stream is None: stream = self.stream + print(func) + print(grid) + print(threads) + print(self.smem_size) + print(stream) + print(gpu_args) hip.hipModuleLaunchKernel(func, grid[0], grid[1], grid[2], threads[0], threads[1], threads[2], self.smem_size, stream, gpu_args) - pass def memset(self, allocation, value, size): """set the memory in allocation to the value in value @@ -294,6 +302,7 @@ def copy_constant_memory_args(self, cmem_args): def copy_shared_memory_args(self, smem_args): """This method must implement the dynamic allocation of shared memory on the GPU.""" logging.debug("HipFunction copy_shared_memory_args called") + self.smem_size = smem_args["size"] def copy_texture_memory_args(self, texmem_args): """This method must implement the allocation and copy of texture memory to the GPU.""" diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py index 573ea93c5..c0030c995 100644 --- a/test/test_hip_functions.py +++ b/test/test_hip_functions.py @@ -25,13 +25,13 @@ def test_ready_argument_list(): dev = kt_hip.HipFunctions(0) gpu_args = dev.ready_argument_list(arguments) - + assert isinstance(gpu_args, ctypes.Structure) assert isinstance(gpu_args.field0, ctypes.POINTER(ctypes.c_float)) - #assert isinstance(gpu_args.field1, ctypes.c_int32) + assert isinstance(gpu_args.field1, ctypes.c_int32) assert isinstance(gpu_args.field2, ctypes.POINTER(ctypes.c_float)) - #assert isinstance(gpu_args.field3, ctypes.c_bool) + assert isinstance(gpu_args.field3, ctypes.c_bool) @skip_if_no_pyhip def test_compile(): From 578dd3061286bad7f3a7359cc03e453fd19cbb23 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 12 Apr 2023 22:02:40 +0200 Subject: [PATCH 20/61] update --- kernel_tuner/backends/hip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index f29b2c69d..013eb3bc6 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -84,7 +84,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.name = hipProps._name.decode('utf-8') self.max_threads = hipProps.maxThreadsPerBlock print("self.max_threads: " + str(self.max_threads)) - self.max_threads = 1024 # PATCH FOR NOW + #self.max_threads = 1024 # PATCH FOR NOW self.device = device self.compiler_options = compiler_options From dc0b350c8f7f59e8b5131bb823d8e668f1c73b29 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 13 Apr 2023 10:38:27 +0200 Subject: [PATCH 21/61] debug updates --- kernel_tuner/backends/hip.py | 40 ++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 013eb3bc6..ef6a9529f 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -28,10 +28,19 @@ except: raise RuntimeError( 'cant find libamdhip64.so or libnvhip64.so. make sure LD_LIBRARY_PATH is set') + + _libhiprtc_libname = 'libhiprtc.so' + _libhiprtc = None + try: + _libhiprtc = ctypes.cdll.LoadLibrary(_libhiprtc_libname) + except: + raise OSError('hiprtc library not found') + else: # Currently we do not support windows raise RuntimeError('Only linux is supported') + # embedded in try block to be able to generate documentation # and run tests without pyhip installed try: @@ -83,7 +92,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.name = hipProps._name.decode('utf-8') self.max_threads = hipProps.maxThreadsPerBlock - print("self.max_threads: " + str(self.max_threads)) + logging.debug("self.max_threads: " + str(self.max_threads)) #self.max_threads = 1024 # PATCH FOR NOW self.device = device @@ -106,6 +115,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None for obs in self.observers: obs.register_device(self) + # define arguments and return value ctypes for hipEventQuery + _libhip.hipEventQuery.restype = ctypes.c_int + _libhip.hipEventQuery.argtypes = ctypes.c_void_p + def ready_argument_list(self, arguments): """ready argument list to be passed to the HIP function :param arguments: List of arguments to be passed to the HIP function. @@ -138,7 +151,7 @@ class ArgListStructure(ctypes.Structure): # Populate the fields of the structure with values from the list for i, value in enumerate(ctype_args): setattr(ctypes_struct, f'field{i}', value) - print(f'field{i} = {value} of {type(value)}') + #print(f'field{i} = {value} of {type(value)}') return ctypes_struct @@ -197,27 +210,20 @@ def kernel_finished(self): """Returns True if the kernel has finished, False otherwise""" logging.debug("HipFunction kernel_finished called") - # Define the argument and return types for hipEventQuery() - _libhip.hipEventQuery.argtypes = [ctypes.c_void_p] - _libhip.hipEventQuery.restype = ctypes.c_int - # Query the status of the event status = _libhip.hipEventQuery(self.end) - if status == _libhip.hipSuccess: - # Kernel has finished + logging.debug(f'_libhip.hipEventQuery(self.end) = {status}') + if status == 34: # 34 = hipErrorNotReady + logging.debug("kernel finished") return True - elif status == _libhip.hipErrorNotReady: - # Kernel is still running - return False else: - # Error occurred + logging.debug("kernel not finished") return False def synchronize(self): """Halts execution until device has finished its tasks""" logging.debug("HipFunction synchronize called") - hip.hipEventSynchronize(self.end) - pass + status = hip.hipEventSynchronize(self.end) def run_kernel(self, func, gpu_args, threads, grid, stream=None): """runs the HIP kernel passed as 'func' @@ -241,12 +247,6 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): logging.debug("HipFunction run_kernel called") if stream is None: stream = self.stream - print(func) - print(grid) - print(threads) - print(self.smem_size) - print(stream) - print(gpu_args) hip.hipModuleLaunchKernel(func, grid[0], grid[1], grid[2], threads[0], threads[1], threads[2], From 9732bf79fe14770d63363fecde727861967f2e98 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 13 Apr 2023 15:46:39 +0200 Subject: [PATCH 22/61] added mem calls to ready_argument_list --- kernel_tuner/backends/hip.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index ef6a9529f..5b9ead27a 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -36,11 +36,20 @@ except: raise OSError('hiprtc library not found') + + _libhiprtc_libname = 'libhiprtc.so' + _libhiprtc = None + try: + _libhiprtc = ctypes.cdll.LoadLibrary(_libhiprtc_libname) + except: + raise OSError('hiprtc library not found') + else: # Currently we do not support windows raise RuntimeError('Only linux is supported') + # embedded in try block to be able to generate documentation # and run tests without pyhip installed try: @@ -115,10 +124,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None for obs in self.observers: obs.register_device(self) - # define arguments and return value ctypes for hipEventQuery - _libhip.hipEventQuery.restype = ctypes.c_int - _libhip.hipEventQuery.argtypes = ctypes.c_void_p - def ready_argument_list(self, arguments): """ready argument list to be passed to the HIP function :param arguments: List of arguments to be passed to the HIP function. @@ -135,7 +140,10 @@ def ready_argument_list(self, arguments): dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): + device_ptr = hip.hipMalloc(arg.nbytes) data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) + print(data_ctypes) + hip.hipMemcpy_htod(device_ptr, ctypes.byref(data_ctypes), arg.nbytes) else: raise TypeError("unknown dtype for ndarray") elif isinstance(arg, np.generic): @@ -147,11 +155,11 @@ def ready_argument_list(self, arguments): # Define a new ctypes structure with the inferred layout class ArgListStructure(ctypes.Structure): _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] - ctypes_struct = ArgListStructure() + ctypes_struct = ArgListStructure(*ctype_args) # Populate the fields of the structure with values from the list for i, value in enumerate(ctype_args): setattr(ctypes_struct, f'field{i}', value) - #print(f'field{i} = {value} of {type(value)}') + print(f'field{i} = {value} of {type(value)}') return ctypes_struct From 2b3dda6a1b69c29681a4eabc487478476a3ec33b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 13 Apr 2023 15:52:35 +0200 Subject: [PATCH 23/61] updates --- kernel_tuner/backends/hip.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 5b9ead27a..769558377 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -36,20 +36,11 @@ except: raise OSError('hiprtc library not found') - - _libhiprtc_libname = 'libhiprtc.so' - _libhiprtc = None - try: - _libhiprtc = ctypes.cdll.LoadLibrary(_libhiprtc_libname) - except: - raise OSError('hiprtc library not found') - else: # Currently we do not support windows raise RuntimeError('Only linux is supported') - # embedded in try block to be able to generate documentation # and run tests without pyhip installed try: @@ -124,6 +115,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None for obs in self.observers: obs.register_device(self) + # define arguments and return value ctypes for hipEventQuery + _libhip.hipEventQuery.restype = ctypes.c_int + _libhip.hipEventQuery.argtypes = [ctypes.c_void_p] + def ready_argument_list(self, arguments): """ready argument list to be passed to the HIP function :param arguments: List of arguments to be passed to the HIP function. @@ -157,9 +152,9 @@ class ArgListStructure(ctypes.Structure): _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] ctypes_struct = ArgListStructure(*ctype_args) # Populate the fields of the structure with values from the list - for i, value in enumerate(ctype_args): - setattr(ctypes_struct, f'field{i}', value) - print(f'field{i} = {value} of {type(value)}') + #for i, value in enumerate(ctype_args): + # setattr(ctypes_struct, f'field{i}', value) + #print(f'field{i} = {value} of {type(value)}') return ctypes_struct From 5a89c85da44c1e794d6ac12bb478e6bf283bebf6 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 13 Apr 2023 16:49:00 +0200 Subject: [PATCH 24/61] corrected synchronize in backend/hip.py --- kernel_tuner/backends/hip.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 769558377..164055682 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -135,10 +135,14 @@ def ready_argument_list(self, arguments): dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): + print(f'dtype_stre = {dtype_str}') + print(f'arg.size = {arg.size}') + print(f'arg.nbytes = {arg.nbytes}') device_ptr = hip.hipMalloc(arg.nbytes) + print(f'device_ptr = {device_ptr}') data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) - print(data_ctypes) - hip.hipMemcpy_htod(device_ptr, ctypes.byref(data_ctypes), arg.nbytes) + print(f'data_ctypes = {data_ctypes}') + hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes) else: raise TypeError("unknown dtype for ndarray") elif isinstance(arg, np.generic): @@ -226,7 +230,7 @@ def kernel_finished(self): def synchronize(self): """Halts execution until device has finished its tasks""" logging.debug("HipFunction synchronize called") - status = hip.hipEventSynchronize(self.end) + status = hip.hipDeviceSynchronize() def run_kernel(self, func, gpu_args, threads, grid, stream=None): """runs the HIP kernel passed as 'func' From fc183d8a9156d476c15ce07b7bf63d4e5a9f016f Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 13 Apr 2023 16:58:18 +0200 Subject: [PATCH 25/61] clean up --- kernel_tuner/backends/hip.py | 44 ++++++++---------------------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 164055682..d529ae020 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -82,18 +82,12 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None :type iterations: int """ logging.debug("HipFunction instantiated") - #hip.hipInit(0) - #hip.hipSetDevice(device) - hipProps = hip.hipGetDeviceProperties(device) - # Print out all the values - #for field_name, field_type in hipProps._fields_: - # print(f"{field_name}: {getattr(hipProps, field_name)}") + self.hipProps = hip.hipGetDeviceProperties(device) - self.name = hipProps._name.decode('utf-8') - self.max_threads = hipProps.maxThreadsPerBlock + self.name = self.hipProps._name.decode('utf-8') + self.max_threads = self.hipProps.maxThreadsPerBlock logging.debug("self.max_threads: " + str(self.max_threads)) - #self.max_threads = 1024 # PATCH FOR NOW self.device = device self.compiler_options = compiler_options @@ -131,17 +125,12 @@ def ready_argument_list(self, arguments): logging.debug("HipFunction ready_argument_list called") ctype_args = [] data_ctypes = None - for i, arg in enumerate(arguments): + for arg in enumerate(arguments): dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): - print(f'dtype_stre = {dtype_str}') - print(f'arg.size = {arg.size}') - print(f'arg.nbytes = {arg.nbytes}') device_ptr = hip.hipMalloc(arg.nbytes) - print(f'device_ptr = {device_ptr}') data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) - print(f'data_ctypes = {data_ctypes}') hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes) else: raise TypeError("unknown dtype for ndarray") @@ -154,13 +143,8 @@ def ready_argument_list(self, arguments): # Define a new ctypes structure with the inferred layout class ArgListStructure(ctypes.Structure): _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] - ctypes_struct = ArgListStructure(*ctype_args) - # Populate the fields of the structure with values from the list - #for i, value in enumerate(ctype_args): - # setattr(ctypes_struct, f'field{i}', value) - #print(f'field{i} = {value} of {type(value)}') - return ctypes_struct + return ArgListStructure(*ctype_args) def compile(self, kernel_instance): @@ -177,24 +161,16 @@ def compile(self, kernel_instance): kernel_string = kernel_instance.kernel_string kernel_name = kernel_instance.name - # if filename is known, use that one - suffix = kernel_instance.kernel_source.get_user_suffix() - - if suffix is None: - # select right suffix based on compiler - suffix = ".cc" - - if ".c" in suffix and 'extern "C"' not in kernel_string: + if 'extern "C"' not in kernel_string: kernel_string = 'extern "C" {\n' + kernel_string + "\n}" kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], []) - device_properties = hip.hipGetDeviceProperties(self.device) plat = hip.hipGetPlatformName() #Compile based on device if plat == "amd": hiprtc.hiprtcCompileProgram( - kernel_ptr, [f'--offload-arch={device_properties.gcnArchName}']) + kernel_ptr, [f'--offload-arch={self.hipProps.gcnArchName}']) else: hiprtc.hiprtcCompileProgram(kernel_ptr, []) code = hiprtc.hiprtcGetCode(kernel_ptr) @@ -220,7 +196,7 @@ def kernel_finished(self): # Query the status of the event status = _libhip.hipEventQuery(self.end) logging.debug(f'_libhip.hipEventQuery(self.end) = {status}') - if status == 34: # 34 = hipErrorNotReady + if status == 34: # 34 = hipErrorNotReady --> still have to look into this logging.debug("kernel finished") return True else: @@ -230,7 +206,7 @@ def kernel_finished(self): def synchronize(self): """Halts execution until device has finished its tasks""" logging.debug("HipFunction synchronize called") - status = hip.hipDeviceSynchronize() + hip.hipDeviceSynchronize() def run_kernel(self, func, gpu_args, threads, grid, stream=None): """runs the HIP kernel passed as 'func' @@ -274,7 +250,7 @@ def memset(self, allocation, value, size): :type size: int """ logging.debug("HipFunction memset called") - allocation.contents.value = value + allocation.contents.value = value # probably wrong, still have to look into this def memcpy_dtoh(self, dest, src): """perform a device to host memory copy From 063dd151b42c2baea25ec55ee7097b067fdc2551 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 13 Apr 2023 17:13:01 +0200 Subject: [PATCH 26/61] resolved memory error caused by ready_argument_list --- kernel_tuner/backends/hip.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index d529ae020..8982b082a 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -64,6 +64,12 @@ "float64": ctypes.c_double, } +# define arguments and return value ctypes for hipEventQuery +_libhip.hipEventQuery.restype = ctypes.c_int +_libhip.hipEventQuery.argtypes = [ctypes.c_void_p] + +hipSuccess = 0 + class HipFunctions(GPUBackend): """Class that groups the HIP functions on maintains state about the device""" @@ -109,9 +115,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None for obs in self.observers: obs.register_device(self) - # define arguments and return value ctypes for hipEventQuery - _libhip.hipEventQuery.restype = ctypes.c_int - _libhip.hipEventQuery.argtypes = [ctypes.c_void_p] def ready_argument_list(self, arguments): """ready argument list to be passed to the HIP function @@ -132,11 +135,12 @@ def ready_argument_list(self, arguments): device_ptr = hip.hipMalloc(arg.nbytes) data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes) + ctype_args.append(device_ptr) else: raise TypeError("unknown dtype for ndarray") elif isinstance(arg, np.generic): data_ctypes = dtype_map[dtype_str](arg) - ctype_args.append(data_ctypes) + ctype_args.append(data_ctypes) # Determine the types of the fields in the structure field_types = [type(x) for x in ctype_args] @@ -196,7 +200,7 @@ def kernel_finished(self): # Query the status of the event status = _libhip.hipEventQuery(self.end) logging.debug(f'_libhip.hipEventQuery(self.end) = {status}') - if status == 34: # 34 = hipErrorNotReady --> still have to look into this + if status == hipSuccess: logging.debug("kernel finished") return True else: From 48db63881898039c4f9f703616e63886e01aaef6 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 13 Apr 2023 17:45:31 +0200 Subject: [PATCH 27/61] examples/hip/vector_add.py runs successfully on AMD --- kernel_tuner/backends/hip.py | 6 ++---- kernel_tuner/observers/hip.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 8982b082a..fb602d5cb 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -123,7 +123,8 @@ def ready_argument_list(self, arguments): Allowed values are np.ndarray, and/or np.int32, np.float32, and so on. :type arguments: list(numpy objects) :returns: A ctypes structure that can be passed to the HIP function. - :rtype: ctypes.Structure""" + :rtype: ctypes.Structure + """ logging.debug("HipFunction ready_argument_list called") ctype_args = [] @@ -199,12 +200,9 @@ def kernel_finished(self): # Query the status of the event status = _libhip.hipEventQuery(self.end) - logging.debug(f'_libhip.hipEventQuery(self.end) = {status}') if status == hipSuccess: - logging.debug("kernel finished") return True else: - logging.debug("kernel not finished") return False def synchronize(self): diff --git a/kernel_tuner/observers/hip.py b/kernel_tuner/observers/hip.py index 4c7704aa8..72a3cb4fe 100644 --- a/kernel_tuner/observers/hip.py +++ b/kernel_tuner/observers/hip.py @@ -23,7 +23,7 @@ def __init__(self, dev): def after_finish(self): # Time is measured in milliseconds EventElapsedTime = hip.hipEventElapsedTime(self.start, self.end) - self.times.append(EventElapsedTime.value) + self.times.append(EventElapsedTime) def get_results(self): results = {"time": np.average(self.times), "times": self.times.copy()} From e003d1c3dee2882b5cf8b6dc16045c675b5b38d0 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 14 Apr 2023 09:51:26 +0200 Subject: [PATCH 28/61] examples/hip/vector_add.py runs successfully on AMD --- kernel_tuner/backends/hip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index fb602d5cb..ac679ab4e 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -129,7 +129,7 @@ def ready_argument_list(self, arguments): logging.debug("HipFunction ready_argument_list called") ctype_args = [] data_ctypes = None - for arg in enumerate(arguments): + for arg in arguments: dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): From f6cc000cc2f62db5d8d66a8d65d0028f4ec5fc4d Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Fri, 14 Apr 2023 12:20:58 +0200 Subject: [PATCH 29/61] modified (not tested) memset, copy_constant_memory_args --- kernel_tuner/backends/hip.py | 44 ++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index ac679ab4e..8b96fc1df 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -64,9 +64,14 @@ "float64": ctypes.c_double, } -# define arguments and return value ctypes for hipEventQuery +# define arguments and return value types of HIP functions _libhip.hipEventQuery.restype = ctypes.c_int _libhip.hipEventQuery.argtypes = [ctypes.c_void_p] +_libhip.hipModuleGetGlobal.restype = ctypes.c_int +_libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_void_p, ctypes.c_char_p] +_libhip.hipMemset.restype = ctypes.c_int +_libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t] + hipSuccess = 0 @@ -108,6 +113,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.end = hip.hipEventCreate() self.smem_size = 0 + self.current_module = None # setup observers self.observers = observers or [] @@ -125,12 +131,13 @@ def ready_argument_list(self, arguments): :returns: A ctypes structure that can be passed to the HIP function. :rtype: ctypes.Structure """ - logging.debug("HipFunction ready_argument_list called") + ctype_args = [] data_ctypes = None for arg in arguments: dtype_str = str(arg.dtype) + # Allocate space on device for array and convert to ctypes if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): device_ptr = hip.hipMalloc(arg.nbytes) @@ -138,7 +145,8 @@ def ready_argument_list(self, arguments): hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes) ctype_args.append(device_ptr) else: - raise TypeError("unknown dtype for ndarray") + raise TypeError("unknown dtype for ndarray") + # Convert valid non-array arguments to ctypes elif isinstance(arg, np.generic): data_ctypes = dtype_map[dtype_str](arg) ctype_args.append(data_ctypes) @@ -180,6 +188,7 @@ def compile(self, kernel_instance): hiprtc.hiprtcCompileProgram(kernel_ptr, []) code = hiprtc.hiprtcGetCode(kernel_ptr) module = hip.hipModuleLoadData(code) + self.current_module = module kernel = hip.hipModuleGetFunction(module, kernel_name) return kernel @@ -242,7 +251,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): def memset(self, allocation, value, size): """set the memory in allocation to the value in value - :param allocation: An Argument for some memory allocation unit + :param allocation: A GPU memory allocation unit :type allocation: ctypes ptr :param value: The value to set the memory to @@ -250,9 +259,12 @@ def memset(self, allocation, value, size): :param size: The size of to the allocation unit in bytes :type size: int + """ - logging.debug("HipFunction memset called") - allocation.contents.value = value # probably wrong, still have to look into this + ctypes_value = ctypes.c_int(value) + ctypes_size = ctypes.c_size_t(size) + status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size) + hip.hipCheckStatus(status) def memcpy_dtoh(self, dest, src): """perform a device to host memory copy @@ -281,8 +293,25 @@ def memcpy_htod(self, dest, src): hip.hipMemcpy_htod(dest, ctypes.byref(src.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * src.size) def copy_constant_memory_args(self, cmem_args): - """This method must implement the allocation and copy of constant memory to the GPU.""" + """adds constant memory arguments to the most recently compiled module + + :param cmem_args: A dictionary containing the data to be passed to the + device constant memory. The format to be used is as follows: A + string key is used to name the constant memory symbol to which the + value needs to be copied. Similar to regular arguments, these need + to be numpy objects, such as numpy.ndarray or numpy.int32, and so on. + :type cmem_args: dict( string: numpy.ndarray, ... ) + """ logging.debug("HipFunction copy_constant_memory_args called") + logging.debug("current module: " + str(self.current_module)) + + for k, v in cmem_args.items(): + symbol = ctypes.c_void_p + size_kernel = ctypes.c_size_t + status = _libhip.hipModuleGetGlobal(symbol, size_kernel, self.current_module, str.encode(k)) + hip.hipCheckStatus(status) + dtype_str = str(v.dtype) + hip.hipMemcpy_htod(symbol, ctypes.byref(v.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * v.size) def copy_shared_memory_args(self, smem_args): """This method must implement the dynamic allocation of shared memory on the GPU.""" @@ -292,5 +321,6 @@ def copy_shared_memory_args(self, smem_args): def copy_texture_memory_args(self, texmem_args): """This method must implement the allocation and copy of texture memory to the GPU.""" logging.debug("HipFunction copy_texture_memory_args called") + # NOT SUPPORTED? units = {"time": "ms"} From 2505db9de43af9e22ae9faba41c0f3d600d132b7 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Fri, 14 Apr 2023 12:30:47 +0200 Subject: [PATCH 30/61] modified (not tested) memset, copy_constant_memory_args --- kernel_tuner/backends/hip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 8b96fc1df..b60cde2a9 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -314,13 +314,13 @@ def copy_constant_memory_args(self, cmem_args): hip.hipMemcpy_htod(symbol, ctypes.byref(v.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * v.size) def copy_shared_memory_args(self, smem_args): - """This method must implement the dynamic allocation of shared memory on the GPU.""" + """add shared memory arguments to the kernel""" logging.debug("HipFunction copy_shared_memory_args called") self.smem_size = smem_args["size"] def copy_texture_memory_args(self, texmem_args): """This method must implement the allocation and copy of texture memory to the GPU.""" logging.debug("HipFunction copy_texture_memory_args called") - # NOT SUPPORTED? + raise NotImplementedError("HIP backend does not support texture memory") # NOT SUPPORTED? units = {"time": "ms"} From dd34b4180467d1d94b06164e33832fbeb91b7bc5 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Mon, 17 Apr 2023 11:56:09 +0200 Subject: [PATCH 31/61] modified run_kernel (modifies grpu_args from ctypes list to ctypes structure), ready_argument_list (returns ctypes list), and memcpy_dtoh; examples/hip/test_vector_add.py passes -> memcpy_dtoh works --- examples/hip/test_vector_add.py | 14 +++++++------ kernel_tuner/backends/hip.py | 35 ++++++++++++++++++++++----------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/examples/hip/test_vector_add.py b/examples/hip/test_vector_add.py index 6fd21aced..6c342632e 100644 --- a/examples/hip/test_vector_add.py +++ b/examples/hip/test_vector_add.py @@ -5,13 +5,15 @@ from kernel_tuner import run_kernel import pytest +#Check pyhip is installed and if a HIP capable device is present, if not skip the test +try: + from pyhip import hip, hiprtc +except ImportError: + pytest.skip("PyHIP not installed or PYTHONPATH does not includes PyHIP") + hip = None + hiprtc = None + def test_vector_add(): - #Check pyhip is installed and if a HIP capable device is present, if not skip the test - try: - import pyhip as hip - hip.hipGetDeviceProperties(0) - except (ImportError, Exception): - pytest.skip("PyHIP not installed or no HIP device detected") kernel_string = """ __global__ void vector_add(float *c, float *a, float *b, int n) { diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index b60cde2a9..f54c01d36 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -128,8 +128,8 @@ def ready_argument_list(self, arguments): The order should match the argument list on the HIP function. Allowed values are np.ndarray, and/or np.int32, np.float32, and so on. :type arguments: list(numpy objects) - :returns: A ctypes structure that can be passed to the HIP function. - :rtype: ctypes.Structure + :returns: List of ctypes arguments to be passed to the HIP function. + :rtype: list of ctypes """ logging.debug("HipFunction ready_argument_list called") @@ -151,13 +151,7 @@ def ready_argument_list(self, arguments): data_ctypes = dtype_map[dtype_str](arg) ctype_args.append(data_ctypes) - # Determine the types of the fields in the structure - field_types = [type(x) for x in ctype_args] - # Define a new ctypes structure with the inferred layout - class ArgListStructure(ctypes.Structure): - _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] - - return ArgListStructure(*ctype_args) + return ctype_args def compile(self, kernel_instance): @@ -228,7 +222,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): :param gpu_args: A list of arguments to the kernel, order should match the order in the code. Allowed values are either variables in global memory or single values passed by value. - :type gpu_args: ctypes.Structure + :type gpu_args: list of ctypes :param threads: A tuple listing the number of threads in each dimension of the thread block @@ -241,6 +235,15 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): logging.debug("HipFunction run_kernel called") if stream is None: stream = self.stream + + # Determine the types of the fields in the structure + field_types = [type(x) for x in gpu_args] + # Define a new ctypes structure with the inferred layout + class ArgListStructure(ctypes.Structure): + _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] + + gpu_args = ArgListStructure(*gpu_args) + hip.hipModuleLaunchKernel(func, grid[0], grid[1], grid[2], threads[0], threads[1], threads[2], @@ -261,6 +264,8 @@ def memset(self, allocation, value, size): :type size: int """ + logging.debug("HipFunction memset called") + print("HipFunction memset called") ctypes_value = ctypes.c_int(value) ctypes_size = ctypes.c_size_t(size) status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size) @@ -276,8 +281,10 @@ def memcpy_dtoh(self, dest, src): :type src: ctypes ptr """ logging.debug("HipFunction memcpy_dtoh called") - dtype_str = str(src.dtype) - hip.hipMemcpy_dtoh(ctypes.byref(dest.ctypes), src, ctypes.sizeof(dtype_map[dtype_str]) * src.size) + print("HipFunction memcpy_dtoh called") + + address = dest.ctypes.data + hip.hipMemcpy_dtoh(ctypes.c_void_p(address), src, dest.nbytes) def memcpy_htod(self, dest, src): """perform a host to device memory copy @@ -289,6 +296,7 @@ def memcpy_htod(self, dest, src): :type src: numpy.ndarray """ logging.debug("HipFunction memcpy_htod called") + print("HipFunction memcpy_htod called") dtype_str = str(src.dtype) hip.hipMemcpy_htod(dest, ctypes.byref(src.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * src.size) @@ -303,6 +311,7 @@ def copy_constant_memory_args(self, cmem_args): :type cmem_args: dict( string: numpy.ndarray, ... ) """ logging.debug("HipFunction copy_constant_memory_args called") + print("HipFunction copy_constant_memory_args called") logging.debug("current module: " + str(self.current_module)) for k, v in cmem_args.items(): @@ -316,11 +325,13 @@ def copy_constant_memory_args(self, cmem_args): def copy_shared_memory_args(self, smem_args): """add shared memory arguments to the kernel""" logging.debug("HipFunction copy_shared_memory_args called") + print("HipFunction copy_shared_memory_args called") self.smem_size = smem_args["size"] def copy_texture_memory_args(self, texmem_args): """This method must implement the allocation and copy of texture memory to the GPU.""" logging.debug("HipFunction copy_texture_memory_args called") + print("HipFunction copy_texture_memory_args called") raise NotImplementedError("HIP backend does not support texture memory") # NOT SUPPORTED? units = {"time": "ms"} From 211889b6e12a818887b86d00ebc94eb2c15ef539 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Mon, 17 Apr 2023 12:17:42 +0200 Subject: [PATCH 32/61] clean up and modified memcpy_htod --- kernel_tuner/backends/hip.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index f54c01d36..bd8565c4b 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -281,7 +281,6 @@ def memcpy_dtoh(self, dest, src): :type src: ctypes ptr """ logging.debug("HipFunction memcpy_dtoh called") - print("HipFunction memcpy_dtoh called") address = dest.ctypes.data hip.hipMemcpy_dtoh(ctypes.c_void_p(address), src, dest.nbytes) @@ -296,9 +295,9 @@ def memcpy_htod(self, dest, src): :type src: numpy.ndarray """ logging.debug("HipFunction memcpy_htod called") - print("HipFunction memcpy_htod called") - dtype_str = str(src.dtype) - hip.hipMemcpy_htod(dest, ctypes.byref(src.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * src.size) + + address = src.ctypes.data + hip.hipMemcpy_htod(dest, ctypes.c_void_p(address), src.nbytes) def copy_constant_memory_args(self, cmem_args): """adds constant memory arguments to the most recently compiled module @@ -325,13 +324,13 @@ def copy_constant_memory_args(self, cmem_args): def copy_shared_memory_args(self, smem_args): """add shared memory arguments to the kernel""" logging.debug("HipFunction copy_shared_memory_args called") - print("HipFunction copy_shared_memory_args called") + self.smem_size = smem_args["size"] def copy_texture_memory_args(self, texmem_args): """This method must implement the allocation and copy of texture memory to the GPU.""" logging.debug("HipFunction copy_texture_memory_args called") - print("HipFunction copy_texture_memory_args called") - raise NotImplementedError("HIP backend does not support texture memory") # NOT SUPPORTED? + + raise NotImplementedError("HIP backend does not support texture memory") units = {"time": "ms"} From 237f39b901c6d1104e6d278ce20da1c80ebd26f3 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Mon, 17 Apr 2023 12:41:39 +0200 Subject: [PATCH 33/61] modified test/test_hip_functions.py:test_ready_argument_list --- test/test_hip_functions.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py index c0030c995..68e6f22f1 100644 --- a/test/test_hip_functions.py +++ b/test/test_hip_functions.py @@ -26,12 +26,10 @@ def test_ready_argument_list(): dev = kt_hip.HipFunctions(0) gpu_args = dev.ready_argument_list(arguments) - assert isinstance(gpu_args, ctypes.Structure) - - assert isinstance(gpu_args.field0, ctypes.POINTER(ctypes.c_float)) - assert isinstance(gpu_args.field1, ctypes.c_int32) - assert isinstance(gpu_args.field2, ctypes.POINTER(ctypes.c_float)) - assert isinstance(gpu_args.field3, ctypes.c_bool) + assert isinstance(gpu_args[0], ctypes.c_void_p) + assert isinstance(gpu_args[1], ctypes.c_int32) + assert isinstance(gpu_args[2], ctypes.c_void_p) + assert isinstance(gpu_args[3], ctypes.c_bool) @skip_if_no_pyhip def test_compile(): From 5ae62c6f27d5c4f4c2f879bce0bcbd0e510990ee Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Mon, 17 Apr 2023 14:25:11 +0200 Subject: [PATCH 34/61] changed once again how gpu arguments get passed around --> ctypes structure --- kernel_tuner/backends/hip.py | 29 +++++++++++++++-------------- test/test_hip_functions.py | 7 ++----- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index bd8565c4b..bdb48c053 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -128,8 +128,8 @@ def ready_argument_list(self, arguments): The order should match the argument list on the HIP function. Allowed values are np.ndarray, and/or np.int32, np.float32, and so on. :type arguments: list(numpy objects) - :returns: List of ctypes arguments to be passed to the HIP function. - :rtype: list of ctypes + :returns: Ctypes structure of arguments to be passed to the HIP function. + :rtype: ctypes structure """ logging.debug("HipFunction ready_argument_list called") @@ -150,9 +150,17 @@ def ready_argument_list(self, arguments): elif isinstance(arg, np.generic): data_ctypes = dtype_map[dtype_str](arg) ctype_args.append(data_ctypes) + + # Determine the types of the fields in the structure + field_types = [type(x) for x in ctype_args] + # Define a new ctypes structure with the inferred layout + class ArgListStructure(ctypes.Structure): + _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] + def __getitem__(self, key): + return self._fields_[key] - return ctype_args - + return ArgListStructure(*ctype_args) + def compile(self, kernel_instance): """call the HIP compiler to compile the kernel, return the function @@ -219,10 +227,10 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): :param func: A PyHIP kernel compiled for this specific kernel configuration :type func: ctypes pionter - :param gpu_args: A list of arguments to the kernel, order should match the + :param gpu_args: A ctypes structure of arguments to the kernel, order should match the order in the code. Allowed values are either variables in global memory or single values passed by value. - :type gpu_args: list of ctypes + :type gpu_args: ctypes structure :param threads: A tuple listing the number of threads in each dimension of the thread block @@ -233,17 +241,10 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): :type grid: tuple(int, int, int) """ logging.debug("HipFunction run_kernel called") + if stream is None: stream = self.stream - # Determine the types of the fields in the structure - field_types = [type(x) for x in gpu_args] - # Define a new ctypes structure with the inferred layout - class ArgListStructure(ctypes.Structure): - _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] - - gpu_args = ArgListStructure(*gpu_args) - hip.hipModuleLaunchKernel(func, grid[0], grid[1], grid[2], threads[0], threads[1], threads[2], diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py index 68e6f22f1..754bf15ef 100644 --- a/test/test_hip_functions.py +++ b/test/test_hip_functions.py @@ -25,11 +25,8 @@ def test_ready_argument_list(): dev = kt_hip.HipFunctions(0) gpu_args = dev.ready_argument_list(arguments) - - assert isinstance(gpu_args[0], ctypes.c_void_p) - assert isinstance(gpu_args[1], ctypes.c_int32) - assert isinstance(gpu_args[2], ctypes.c_void_p) - assert isinstance(gpu_args[3], ctypes.c_bool) + + assert(gpu_args, ctypes.Structure) @skip_if_no_pyhip def test_compile(): From 92f55a56be03ac827648aeda185bdba2f7e9a018 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Tue, 18 Apr 2023 16:25:54 +0200 Subject: [PATCH 35/61] added test_memset_and_memcpy_dtoh and test_memcpy_htod --> pass --- kernel_tuner/backends/hip.py | 14 +++++++----- test/test_hip_functions.py | 44 +++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index bdb48c053..4b1ad720c 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -70,7 +70,7 @@ _libhip.hipModuleGetGlobal.restype = ctypes.c_int _libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_void_p, ctypes.c_char_p] _libhip.hipMemset.restype = ctypes.c_int -_libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t] +_libhip.hipMemset.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t] hipSuccess = 0 @@ -266,7 +266,7 @@ def memset(self, allocation, value, size): """ logging.debug("HipFunction memset called") - print("HipFunction memset called") + ctypes_value = ctypes.c_int(value) ctypes_size = ctypes.c_size_t(size) status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size) @@ -283,8 +283,9 @@ def memcpy_dtoh(self, dest, src): """ logging.debug("HipFunction memcpy_dtoh called") - address = dest.ctypes.data - hip.hipMemcpy_dtoh(ctypes.c_void_p(address), src, dest.nbytes) + dtype_str = str(dest.dtype) + dest_c = dest.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) + hip.hipMemcpy_dtoh(dest_c, src, dest.nbytes) def memcpy_htod(self, dest, src): """perform a host to device memory copy @@ -297,8 +298,9 @@ def memcpy_htod(self, dest, src): """ logging.debug("HipFunction memcpy_htod called") - address = src.ctypes.data - hip.hipMemcpy_htod(dest, ctypes.c_void_p(address), src.nbytes) + dtype_str = str(src.dtype) + src_c = src.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) + hip.hipMemcpy_htod(dest, src_c, src.nbytes) def copy_constant_memory_args(self, cmem_args): """adds constant memory arguments to the most recently compiled module diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py index 754bf15ef..a95cd1cba 100644 --- a/test/test_hip_functions.py +++ b/test/test_hip_functions.py @@ -3,6 +3,7 @@ from .context import skip_if_no_pyhip import pytest +import kernel_tuner from kernel_tuner.backends import hip as kt_hip from kernel_tuner.core import KernelSource, KernelInstance @@ -23,10 +24,24 @@ def test_ready_argument_list(): arguments = [d, a, b, c] + class ArgListStructure(ctypes.Structure): + _fields_ = [("field0", ctypes.POINTER(ctypes.c_float)), + ("field1", ctypes.c_int), + ("field2", ctypes.POINTER(ctypes.c_float)), + ("field3", ctypes.c_bool)] + def __getitem__(self, key): + return self._fields_[key] + dev = kt_hip.HipFunctions(0) gpu_args = dev.ready_argument_list(arguments) - assert(gpu_args, ctypes.Structure) + argListStructure = ArgListStructure(d.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + ctypes.c_int(a), + b.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + ctypes.c_bool(c)) + + assert(gpu_args[1] == argListStructure[1]) + assert(gpu_args[3] == argListStructure[3]) @skip_if_no_pyhip def test_compile(): @@ -50,6 +65,33 @@ def test_compile(): pytest.fail("Did not expect any exception:" + str(e)) +@skip_if_no_pyhip +def test_memset_and_memcpy_dtoh(): + a = [1, 2, 3, 4] + x = np.array(a).astype(np.int8) + x_d = hip.hipMalloc(x.nbytes) + + Hipfunc = kt_hip.HipFunctions() + Hipfunc.memset(x_d, 4, x.nbytes) + + output = np.empty(4, dtype=np.int8) + Hipfunc.memcpy_dtoh(output, x_d) + + assert all(output == np.full(4, 4)) + +@skip_if_no_pyhip +def test_memcpy_htod(): + a = [1, 2, 3, 4] + x = np.array(a).astype(np.float32) + x_d = hip.hipMalloc(x.nbytes) + output = np.empty(4, dtype=np.float32) + + Hipfunc = kt_hip.HipFunctions() + Hipfunc.memcpy_htod(x_d, x) + Hipfunc.memcpy_dtoh(output, x_d) + + assert all(output == x) + def dummy_func(a, b, block=0, grid=0, stream=None, shared=0, texrefs=None): pass From f0ab5efda4871d0078bd4835574f8940647ae68f Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Wed, 19 Apr 2023 16:09:25 +0200 Subject: [PATCH 36/61] modified copy_constant_memory_args, implemented test_copy_constant_memory_args --> passed --- kernel_tuner/backends/hip.py | 23 ++++++++++++------ test/test_hip_functions.py | 47 ++++++++++++++++++++++++++++++------ 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 4b1ad720c..9e4e7f117 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -68,9 +68,11 @@ _libhip.hipEventQuery.restype = ctypes.c_int _libhip.hipEventQuery.argtypes = [ctypes.c_void_p] _libhip.hipModuleGetGlobal.restype = ctypes.c_int -_libhip.hipModuleGetGlobal.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_void_p, ctypes.c_char_p] +_libhip.hipModuleGetGlobal.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_char_p] _libhip.hipMemset.restype = ctypes.c_int _libhip.hipMemset.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t] +_libhip.hipMemcpyToSymbol.restype = ctypes.c_int +_libhip.hipMemcpyToSymbol.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t, ctypes.c_int] hipSuccess = 0 @@ -313,16 +315,23 @@ def copy_constant_memory_args(self, cmem_args): :type cmem_args: dict( string: numpy.ndarray, ... ) """ logging.debug("HipFunction copy_constant_memory_args called") - print("HipFunction copy_constant_memory_args called") - logging.debug("current module: " + str(self.current_module)) for k, v in cmem_args.items(): - symbol = ctypes.c_void_p - size_kernel = ctypes.c_size_t - status = _libhip.hipModuleGetGlobal(symbol, size_kernel, self.current_module, str.encode(k)) + #Format arguments, call hipModuleGetGlobal, and check return status + symbol_string = ctypes.c_char_p(k.encode('utf-8')) + symbol = ctypes.c_void_p() + symbol_ptr = ctypes.POINTER(ctypes.c_void_p)(symbol) + size_kernel = ctypes.c_size_t(0) + + size_kernel_ptr = ctypes.POINTER(ctypes.c_size_t)(size_kernel) + status = _libhip.hipModuleGetGlobal(symbol_ptr, size_kernel_ptr, self.current_module, symbol_string) hip.hipCheckStatus(status) + + #Format arguments and call hipMemcpy_htod dtype_str = str(v.dtype) - hip.hipMemcpy_htod(symbol, ctypes.byref(v.ctypes), ctypes.sizeof(dtype_map[dtype_str]) * v.size) + v_c = v.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) + + hip.hipMemcpy_htod(symbol_ptr.contents, v_c, v.nbytes) def copy_shared_memory_args(self, smem_args): """add shared memory arguments to the kernel""" diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py index a95cd1cba..a6a95a8cb 100644 --- a/test/test_hip_functions.py +++ b/test/test_hip_functions.py @@ -56,7 +56,7 @@ def test_compile(): """ kernel_name = "vector_add" - kernel_sources = KernelSource(kernel_name, kernel_string, "cuda") + kernel_sources = KernelSource(kernel_name, kernel_string, "HIP") kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) dev = kt_hip.HipFunctions(0) try: @@ -71,11 +71,11 @@ def test_memset_and_memcpy_dtoh(): x = np.array(a).astype(np.int8) x_d = hip.hipMalloc(x.nbytes) - Hipfunc = kt_hip.HipFunctions() - Hipfunc.memset(x_d, 4, x.nbytes) + dev = kt_hip.HipFunctions() + dev.memset(x_d, 4, x.nbytes) output = np.empty(4, dtype=np.int8) - Hipfunc.memcpy_dtoh(output, x_d) + dev.memcpy_dtoh(output, x_d) assert all(output == np.full(4, 4)) @@ -86,12 +86,45 @@ def test_memcpy_htod(): x_d = hip.hipMalloc(x.nbytes) output = np.empty(4, dtype=np.float32) - Hipfunc = kt_hip.HipFunctions() - Hipfunc.memcpy_htod(x_d, x) - Hipfunc.memcpy_dtoh(output, x_d) + dev = kt_hip.HipFunctions() + dev.memcpy_htod(x_d, x) + dev.memcpy_dtoh(output, x_d) assert all(output == x) +@skip_if_no_pyhip +def test_copy_constant_memory_args(): + kernel_string = """ + __constant__ float my_constant_data[100]; + __global__ void copy_data_kernel(float* output) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < 100) { + output[idx] = my_constant_data[idx]; + } + } + """ + + kernel_name = "copy_data_kernel" + kernel_sources = KernelSource(kernel_name, kernel_string, "HIP") + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) + dev = kt_hip.HipFunctions(0) + kernel = dev.compile(kernel_instance) + + my_constant_data = np.full(100, 23).astype(np.float32) + cmem_args = {'my_constant_data': my_constant_data} + dev.copy_constant_memory_args(cmem_args) + + output = np.full(100, 0).astype(np.float32) + gpu_args = dev.ready_argument_list([output]) + + threads = (100, 1, 1) + grid = (1, 1, 1) + dev.run_kernel(kernel, gpu_args, threads, grid) + + dev.memcpy_dtoh(output, gpu_args.field0) + + assert(my_constant_data == output).all() + def dummy_func(a, b, block=0, grid=0, stream=None, shared=0, texrefs=None): pass From 685aed3a1186e04ac825e21f5f4341c74eb36f7d Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Wed, 19 Apr 2023 16:29:57 +0200 Subject: [PATCH 37/61] added test_smem_args --> passed --- test/test_hip_functions.py | 41 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py index a6a95a8cb..87ab9dfe6 100644 --- a/test/test_hip_functions.py +++ b/test/test_hip_functions.py @@ -1,9 +1,11 @@ import numpy as np import ctypes from .context import skip_if_no_pyhip +from collections import OrderedDict import pytest import kernel_tuner +from kernel_tuner import tune_kernel from kernel_tuner.backends import hip as kt_hip from kernel_tuner.core import KernelSource, KernelInstance @@ -13,6 +15,29 @@ except ImportError: pass +@pytest.fixture +def env(): + kernel_string = """ + extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * block_size_x + threadIdx.x; + if (i Date: Fri, 21 Apr 2023 17:09:44 +0200 Subject: [PATCH 38/61] clean up and added more comments --- kernel_tuner/backends/hip.py | 51 ++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 9e4e7f117..5e1fd4116 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -3,8 +3,6 @@ import numpy as np import ctypes import ctypes.util -from collections import namedtuple -import os import sys import logging @@ -29,13 +27,6 @@ raise RuntimeError( 'cant find libamdhip64.so or libnvhip64.so. make sure LD_LIBRARY_PATH is set') - _libhiprtc_libname = 'libhiprtc.so' - _libhiprtc = None - try: - _libhiprtc = ctypes.cdll.LoadLibrary(_libhiprtc_libname) - except: - raise OSError('hiprtc library not found') - else: # Currently we do not support windows raise RuntimeError('Only linux is supported') @@ -71,9 +62,6 @@ _libhip.hipModuleGetGlobal.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_char_p] _libhip.hipMemset.restype = ctypes.c_int _libhip.hipMemset.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t] -_libhip.hipMemcpyToSymbol.restype = ctypes.c_int -_libhip.hipMemcpyToSymbol.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t, ctypes.c_int] - hipSuccess = 0 @@ -100,13 +88,15 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.name = self.hipProps._name.decode('utf-8') self.max_threads = self.hipProps.maxThreadsPerBlock - logging.debug("self.max_threads: " + str(self.max_threads)) - self.device = device self.compiler_options = compiler_options + self.iterations = iterations env = dict() env["device_name"] = self.name + env["iterations"] = self.iterations + env["compiler_options"] = compiler_options + env["device_properties"] = self.hipProps self.env = env # create a stream and events @@ -114,7 +104,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.start = hip.hipEventCreate() self.end = hip.hipEventCreate() + # default dynamically allocated shared memory size, can be overwritten using smem_args self.smem_size = 0 + self.current_module = None # setup observers @@ -126,10 +118,12 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None def ready_argument_list(self, arguments): """ready argument list to be passed to the HIP function + :param arguments: List of arguments to be passed to the HIP function. The order should match the argument list on the HIP function. Allowed values are np.ndarray, and/or np.int32, np.float32, and so on. :type arguments: list(numpy objects) + :returns: Ctypes structure of arguments to be passed to the HIP function. :rtype: ctypes structure """ @@ -175,21 +169,23 @@ def compile(self, kernel_instance): :rtype: ctypes._FuncPtr """ logging.debug("HipFunction compile called") + + #Format and create program kernel_string = kernel_instance.kernel_string kernel_name = kernel_instance.name - if 'extern "C"' not in kernel_string: kernel_string = 'extern "C" {\n' + kernel_string + "\n}" - kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], []) + #Compile based on device (Not yet tested for non-AMD devices) plat = hip.hipGetPlatformName() - #Compile based on device if plat == "amd": hiprtc.hiprtcCompileProgram( kernel_ptr, [f'--offload-arch={self.hipProps.gcnArchName}']) else: hiprtc.hiprtcCompileProgram(kernel_ptr, []) + + #Get module and kernel from compiled kernel string code = hiprtc.hiprtcGetCode(kernel_ptr) module = hip.hipModuleLoadData(code) self.current_module = module @@ -200,11 +196,13 @@ def compile(self, kernel_instance): def start_event(self): """Records the event that marks the start of a measurement""" logging.debug("HipFunction start_event called") + hip.hipEventRecord(self.start, self.stream) def stop_event(self): """Records the event that marks the end of a measurement""" logging.debug("HipFunction stop_event called") + hip.hipEventRecord(self.end, self.stream) def kernel_finished(self): @@ -221,12 +219,13 @@ def kernel_finished(self): def synchronize(self): """Halts execution until device has finished its tasks""" logging.debug("HipFunction synchronize called") + hip.hipDeviceSynchronize() def run_kernel(self, func, gpu_args, threads, grid, stream=None): """runs the HIP kernel passed as 'func' - :param func: A PyHIP kernel compiled for this specific kernel configuration + :param func: A HIP kernel compiled for this specific kernel configuration :type func: ctypes pionter :param gpu_args: A ctypes structure of arguments to the kernel, order should match the @@ -269,6 +268,8 @@ def memset(self, allocation, value, size): """ logging.debug("HipFunction memset called") + # Format arguments to correct type, set the memory and + # check return value of memset (as done in PyHIP with hipCheckStatus) ctypes_value = ctypes.c_int(value) ctypes_size = ctypes.c_size_t(size) status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size) @@ -285,6 +286,7 @@ def memcpy_dtoh(self, dest, src): """ logging.debug("HipFunction memcpy_dtoh called") + # Format arguments to correct type and perform memory copy dtype_str = str(dest.dtype) dest_c = dest.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) hip.hipMemcpy_dtoh(dest_c, src, dest.nbytes) @@ -300,6 +302,7 @@ def memcpy_htod(self, dest, src): """ logging.debug("HipFunction memcpy_htod called") + # Format arguments to correct type and perform memory copy dtype_str = str(src.dtype) src_c = src.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) hip.hipMemcpy_htod(dest, src_c, src.nbytes) @@ -316,21 +319,24 @@ def copy_constant_memory_args(self, cmem_args): """ logging.debug("HipFunction copy_constant_memory_args called") + # Iterate over dictionary for k, v in cmem_args.items(): - #Format arguments, call hipModuleGetGlobal, and check return status + # Format arguments, call hipModuleGetGlobal, + # and check return status (as done in PyHIP with hipCheckStatus) symbol_string = ctypes.c_char_p(k.encode('utf-8')) symbol = ctypes.c_void_p() symbol_ptr = ctypes.POINTER(ctypes.c_void_p)(symbol) size_kernel = ctypes.c_size_t(0) + # Get constant memory symbol and check return value of hipModuleGetGlobal + # (as done in PyHIP with hipCheckStatus) size_kernel_ptr = ctypes.POINTER(ctypes.c_size_t)(size_kernel) status = _libhip.hipModuleGetGlobal(symbol_ptr, size_kernel_ptr, self.current_module, symbol_string) hip.hipCheckStatus(status) - #Format arguments and call hipMemcpy_htod + #Format arguments and perform memory copy dtype_str = str(v.dtype) v_c = v.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) - hip.hipMemcpy_htod(symbol_ptr.contents, v_c, v.nbytes) def copy_shared_memory_args(self, smem_args): @@ -340,9 +346,8 @@ def copy_shared_memory_args(self, smem_args): self.smem_size = smem_args["size"] def copy_texture_memory_args(self, texmem_args): - """This method must implement the allocation and copy of texture memory to the GPU.""" logging.debug("HipFunction copy_texture_memory_args called") raise NotImplementedError("HIP backend does not support texture memory") - units = {"time": "ms"} + units = {"time": "ms", "power": "s,mW", "energy": "J"} From ca0aed0cb2e6c2834d71f9e59700c4e0a2311ff3 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Fri, 21 Apr 2023 17:52:20 +0200 Subject: [PATCH 39/61] added HIP documentation to INSTALL.rst --- INSTALL.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/INSTALL.rst b/INSTALL.rst index cfa6275fb..a2af17843 100644 --- a/INSTALL.rst +++ b/INSTALL.rst @@ -111,6 +111,28 @@ Or you could install Kernel Tuner and PyOpenCL together if you haven't done so a If this fails, please see the PyOpenCL installation guide (https://wiki.tiker.net/PyOpenCL/Installation) +HIP and PyHIP +------------- + +Before we can install PyHIP, you'll need to have the HIP runtime and compiler installed on your system. +The HIP compiler is included as part of the ROCm software stack. Here is AMD's installation guide: + +* `ROCm Documentation: HIP Installation Guide `__ + +After you've installed HIP, you will need to install PyHIP. As of the writing of this documentation, PyHIP +is not yet available on PyPI, meaning we will have to install it from GitHub. + +Clone the GitHub repo: + +.. code-block:: bash + + git clone https://github.com/jatinx/PyHIP + +Set the PYTHONPATH: + +.. code-block:: bash + + export PYTHONPATH=/path/to/pyhip:$PYTHONPATH Installing the git version -------------------------- From 5db2bee53166ed7b930f46762190870d492c9cec Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Tue, 25 Apr 2023 14:21:24 +0200 Subject: [PATCH 40/61] not using anymore _libhip, calling implementations from pyhip new coverage --- kernel_tuner/backends/hip.py | 58 +++--------------------------------- 1 file changed, 4 insertions(+), 54 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 5e1fd4116..c94e09e4b 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -9,29 +9,6 @@ from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.hip import HipRuntimeObserver -_libhip = None -_hip_platform_name = '' - -# Try to find amd hip library, if not found, fallback to nvhip library -if 'linux' in sys.platform: - try: - _libhip_libname = 'libamdhip64.so' - _libhip = ctypes.cdll.LoadLibrary(_libhip_libname) - _hip_platform_name = 'amd' - except: - try: - _libhip_libname = 'libnvhip64.so' - _libhip = ctypes.cdll.LoadLibrary(_libhip_libname) - _hip_platform_name = 'nvidia' - except: - raise RuntimeError( - 'cant find libamdhip64.so or libnvhip64.so. make sure LD_LIBRARY_PATH is set') - -else: - # Currently we do not support windows - raise RuntimeError('Only linux is supported') - - # embedded in try block to be able to generate documentation # and run tests without pyhip installed try: @@ -55,14 +32,6 @@ "float64": ctypes.c_double, } -# define arguments and return value types of HIP functions -_libhip.hipEventQuery.restype = ctypes.c_int -_libhip.hipEventQuery.argtypes = [ctypes.c_void_p] -_libhip.hipModuleGetGlobal.restype = ctypes.c_int -_libhip.hipModuleGetGlobal.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t), ctypes.c_void_p, ctypes.c_char_p] -_libhip.hipMemset.restype = ctypes.c_int -_libhip.hipMemset.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t] - hipSuccess = 0 class HipFunctions(GPUBackend): @@ -210,11 +179,7 @@ def kernel_finished(self): logging.debug("HipFunction kernel_finished called") # Query the status of the event - status = _libhip.hipEventQuery(self.end) - if status == hipSuccess: - return True - else: - return False + return hip.hipEventQuery(self.end) def synchronize(self): """Halts execution until device has finished its tasks""" @@ -268,12 +233,7 @@ def memset(self, allocation, value, size): """ logging.debug("HipFunction memset called") - # Format arguments to correct type, set the memory and - # check return value of memset (as done in PyHIP with hipCheckStatus) - ctypes_value = ctypes.c_int(value) - ctypes_size = ctypes.c_size_t(size) - status = _libhip.hipMemset(allocation, ctypes_value, ctypes_size) - hip.hipCheckStatus(status) + hip.hipMemset(allocation, value, size) def memcpy_dtoh(self, dest, src): """perform a device to host memory copy @@ -321,18 +281,8 @@ def copy_constant_memory_args(self, cmem_args): # Iterate over dictionary for k, v in cmem_args.items(): - # Format arguments, call hipModuleGetGlobal, - # and check return status (as done in PyHIP with hipCheckStatus) - symbol_string = ctypes.c_char_p(k.encode('utf-8')) - symbol = ctypes.c_void_p() - symbol_ptr = ctypes.POINTER(ctypes.c_void_p)(symbol) - size_kernel = ctypes.c_size_t(0) - - # Get constant memory symbol and check return value of hipModuleGetGlobal - # (as done in PyHIP with hipCheckStatus) - size_kernel_ptr = ctypes.POINTER(ctypes.c_size_t)(size_kernel) - status = _libhip.hipModuleGetGlobal(symbol_ptr, size_kernel_ptr, self.current_module, symbol_string) - hip.hipCheckStatus(status) + #Get symbol pointer + symbol_ptr, _ = hip.hipModuleGetGlobal(self.current_module, k) #Format arguments and perform memory copy dtype_str = str(v.dtype) From a37e3021867819294cf3a0eb224509a8d0c2ad6c Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Tue, 25 Apr 2023 14:26:32 +0200 Subject: [PATCH 41/61] not using anymore _libhip, calling implementations from pyhip new coverage --- kernel_tuner/backends/hip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index c94e09e4b..4e114fa23 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -287,7 +287,7 @@ def copy_constant_memory_args(self, cmem_args): #Format arguments and perform memory copy dtype_str = str(v.dtype) v_c = v.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) - hip.hipMemcpy_htod(symbol_ptr.contents, v_c, v.nbytes) + hip.hipMemcpy_htod(symbol_ptr, v_c, v.nbytes) def copy_shared_memory_args(self, smem_args): """add shared memory arguments to the kernel""" From 0ab790dbf3e801f3859d8ab8750b1de5423da247 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Fri, 5 May 2023 15:10:01 +0200 Subject: [PATCH 42/61] quick install HIP --- doc/source/index.rst | 9 +++++++-- setup.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/source/index.rst b/doc/source/index.rst index 4a66d893b..c8af5e745 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -27,9 +27,14 @@ To tune OpenCL kernels: - First, make sure you have an OpenCL compiler for your intended OpenCL platform - Then type: ``pip install kernel_tuner[opencl]`` -Or both: +To tune HIP kernels: -- ``pip install kernel_tuner[cuda,opencl]`` +- First, make sure you have an HIP runtime and compiler installed +- Then type: ``pip install kernel_tuner[hip]`` + +Or all: + +- ``pip install kernel_tuner[cuda,opencl,hip]`` More information about how to install Kernel Tuner and its dependencies can be found under :ref:`install`. diff --git a/setup.py b/setup.py index 07793f353..f17cd45ee 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ def readme(): "cuda": ["pycuda", "nvidia-ml-py", "pynvml>=11.4.1"], "opencl": ["pyopencl"], "cuda_opencl": ["pycuda", "pyopencl"], + "hip": ["pyhip-interface"], "tutorial": ["jupyter", "matplotlib", "pandas"], "dev": [ "numpy>=1.13.3", From fe7bad1cde3dca4dbcdf9c11592f273de3f4f5a5 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Fri, 5 May 2023 15:21:26 +0200 Subject: [PATCH 43/61] Installing HIP doc updated --- INSTALL.rst | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/INSTALL.rst b/INSTALL.rst index a2af17843..624041bd7 100644 --- a/INSTALL.rst +++ b/INSTALL.rst @@ -119,20 +119,23 @@ The HIP compiler is included as part of the ROCm software stack. Here is AMD's i * `ROCm Documentation: HIP Installation Guide `__ -After you've installed HIP, you will need to install PyHIP. As of the writing of this documentation, PyHIP -is not yet available on PyPI, meaning we will have to install it from GitHub. +After you've installed HIP, you will need to install PyHIP. Run the following command in your terminal to install: -Clone the GitHub repo: +.. code-block:: bash + + pip install pyhip-interface + +Alternatively, you can install PyHIP from the source code. First, clone the repository from GitHub: .. code-block:: bash git clone https://github.com/jatinx/PyHIP -Set the PYTHONPATH: +Then, navigate to the repository directory and run the following command to install: .. code-block:: bash - export PYTHONPATH=/path/to/pyhip:$PYTHONPATH + python setup.py install Installing the git version -------------------------- From 47b07eea59f6988ceb3fdb2ba88c8845f0d0f4e7 Mon Sep 17 00:00:00 2001 From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com> Date: Fri, 5 May 2023 15:25:44 +0200 Subject: [PATCH 44/61] HIP update INSTALL.rst --- INSTALL.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/INSTALL.rst b/INSTALL.rst index 624041bd7..fb57c2830 100644 --- a/INSTALL.rst +++ b/INSTALL.rst @@ -153,6 +153,7 @@ You can install Kernel Tuner with several optional dependencies, the full list i - `cuda`: install pycuda along with kernel_tuner - `opencl`: install pycuda along with kernel_tuner +- `pyhip`: install pyhip along with kernel_tuner - `doc`: installs packages required to build the documentation - `tutorial`: install packages required to run the guides - `dev`: install everything you need to start development on Kernel Tuner From 6862f14ccc9b20674b092801604c57ba6f29067e Mon Sep 17 00:00:00 2001 From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com> Date: Fri, 5 May 2023 15:27:22 +0200 Subject: [PATCH 45/61] HIP update INSTALL.rst --- INSTALL.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.rst b/INSTALL.rst index fb57c2830..53fef4926 100644 --- a/INSTALL.rst +++ b/INSTALL.rst @@ -153,7 +153,7 @@ You can install Kernel Tuner with several optional dependencies, the full list i - `cuda`: install pycuda along with kernel_tuner - `opencl`: install pycuda along with kernel_tuner -- `pyhip`: install pyhip along with kernel_tuner +- `hip`: install pyhip along with kernel_tuner - `doc`: installs packages required to build the documentation - `tutorial`: install packages required to run the guides - `dev`: install everything you need to start development on Kernel Tuner From eab0306446cc66ed7bf76d4ff40d5e4a7cc7e70d Mon Sep 17 00:00:00 2001 From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com> Date: Fri, 5 May 2023 15:37:38 +0200 Subject: [PATCH 46/61] HIP update interface.py doc --- kernel_tuner/interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 9abfa1de9..15fe04ec1 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -98,7 +98,7 @@ def __deepcopy__(self, _): ( "kernel_source", ( - """The CUDA, OpenCL, or C kernel code. + """The CUDA, OpenCL, HIP, or C kernel code. It is allowed for the code to be passed as a string, a filename, a function that returns a string of code, or a list when the code needs auxilliary files. @@ -124,7 +124,7 @@ def __deepcopy__(self, _): """Specifies the language used for GPU kernels. The kernel_tuner automatically detects the language, but if it fails, you may specify the language using this argument, currently supported: "CUDA", "Cupy", - "OpenCL", or "C".""", + "OpenCL", "HIP", or "C".""", "string", ), ), From cab70bc455dfe6394e62baf4358933615c24728b Mon Sep 17 00:00:00 2001 From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com> Date: Fri, 5 May 2023 15:49:11 +0200 Subject: [PATCH 47/61] added updated architecture diagram PNG --- doc/source/architecture.png | Bin 0 -> 33120 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 doc/source/architecture.png diff --git a/doc/source/architecture.png b/doc/source/architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..bedeefa5b31036ead63a6c97f786e34aaae68de0 GIT binary patch literal 33120 zcmc$`2UJsQw>BC?EC}jWK?M=rZjmCO(iABw3QAR~bPyv_0wRQ-fU<>+5U>GK140Od z5-ABi0TED;8bT;S1f+&4DUc9ySMYr2oco_~?tjL;{}^|T;V>a9>#cLnXFl_J6MNUx zKvY;l7z6@|-no6#90U?%gFphMdv*bTIrRwU0sJT6Z*Fh{RN8f74tTTE{kq9@5U4D1 z-=?b&@P6-;+qV87(EeurKY=#if`=ebB;(G_>z2Vzi=&94Z4E*yHZE#PR)ES@p1W=n-lW3`TlU8`ucd--Sl%_uH^tJ$BhfGuh;CUCmVt%Ro42t~P>U)gz62tlZoIU^hrvCrE<(Stpf#c)*=utk4 zWTq>oC)1o)ssZxe-S>rA>eMiNHH_&Ej6d&>S;50UCR*Z7>*c{;DzVv4qgdjWoycOe zV_l#aQla72W z>XDP0v(pK9v;A0dkxdS->+15~1a#l4wVCdiZUqH}uYNK^=^DN-&EA5|DU#QMhHy8h zr>E65Yhau}Un&8nxA)}Y^xEoVTij{;*ZTWQy?N^QW`f5R7nO@%f7n7uYP|IAJF+gn zhp9oX{W;s4+aMjWc`?AyV2isPag^$U=C^$9=%LuU3ji+x8w%_#LE;_sYir1mp*^+2pYNK;jljZ%~t2~tHP2;Ewep!WKn#z+86ZU2?}bh48kxuNuc{ zp+(2irjg?|=tBacMrHGl?UI+Qhk0qO>wf(k@*WacjB3FWdv;=o`MiMf5@|qws^mYa++4L+#6(u*|)(oZRQz@$nnw5h3^)_1L{w{lbo zt^L|ggWuo=`kdO>($SxN%QmC1HIkx__3eg#m(vdl&Ptym55C0 z`1kMMTdVi&+m|!a5EiuVOl_fO)}l}-#9VP0mkkX(K|BEkc9HA%`{|6{Anv}~7xV0R zH!8|R(y`8^iw2X#^vujrI7S2iU3HSJ6YN^K_ynVrIs5^06jzYbaoT-oao#f5XHxc@xulh0mbrAhF4!AGg;-Gn&hyS1R9O~#NC zee1b&;^a_mzS6^}Vly&6R9%_jmPYLUHdGC_M?j1`H{hDx?jPIpy~+=O;uGNhUtz`~KXXq~2LtSFbtmjkHN#@*3Jo zJA!o|Qmgis6f5R{TgNe+_!=b8GLFoW&JVp^>7>^vv5d87I>r zyl3#d#M?)uwl4ecM;GU605j0+`K3J2BUlcXTN_M3)MN#`8>l=CN%Z#v|+I0v+Rf~M=vDN{# zzf*F<^OFCNPU&CHr|qFX+v|c@mD6J~D9$n6Mq=1&p&P%zvkI%$QGR}`mZQ#pFW-v& zaNO$5#SxrN8u5gJU7}J&ZLI{G!&*MTeFuz8mySCb*!vM8XRq(skiHOgPii-a)KMi~ z+Bm8yw5vA1xTFNKH(P8cEkFNzeTWME*7cbDq_^kHq6uY{myQn3b1RQ{%kE&DOpTsjfE-G5sPR|jXyu) z$lUyiNDG7#yU|9VDmlmBkoYwzDd`pUAK%-Bey&JoyZiU|OpXNTBz{<(Xg$Wgw5X4i z)&58gw>qd?lRdh=tk`+51c_IJy8XGOJvE&J8PM{EHF!}<#gM}l)W7>KE1n}9{{W?2 zKq2Q9lr^l`=WZwn8VhIVz$g9y+z*FGW*jD7I6q)^O{2ANep8DbYke#k!+eNoy#v#2 zXRLZ(!DHJ8tHD+!mO>>FIU|)$){qMC zkYCI1(t0Pp0ql{L1(vI$Vfgt8#y^Qu)x1Zty}M=p8MW=_ByeIMtY?J9pbdE_)%M>Qg5+28U(0lijQ#Crj0ndDkn*G(%n@X5eUM>IkLurmkhz&@7zov4C%bXyZE)8+7X`N*dDB7` zb6Az2X)We>1e4{8kwDLLT(M@~p3iA^l=*QhGRk;cEMg8ezI33i!z9IiS-ov1C>k{{ z15W5u?f9pNP4J1F#w^C6lEA6c>Xg@-bDUg*#2eJzIyD=qnB}4ZM{(40t?8G{t3fxM z=N;VZHEFLU(Vli*%8kOp#+j2Zs&>Kjb;Wtn3(>E;GOKGF%Iae`#$%*I+Y+S@kk$#j z5?NUWO#-=_x2DUp2=|o9GCRMeLzdAGuQ)S?hEC_SB}}Yto&iDNlgyDXg~QBk-ydl? zZ5*2z;iT*1FtfR_mP1u7IFYB?lt-Wl8R(bkWI3LH+HvoLFJ~S=wW2V!hi~idMzBVM zfMI5oLB5)x_xXpPvmHKp7<6n`-O&A{5R)>xdDFybU65YSD=(L>z7_|M6oNDv`q@*W z0H5w$rQYE+KbfezA4a`j>iG64WnY8|ro#E@ozhjre^D)olP9eMmMaqE7PG8a zrFZgEW&84&uMtIcGh1EQf}I^o?+A9O_?CQ&M}5kAG+zN&@!~~+NY{_)tyZa&BMOQY zKlGlGnFpkGPOv33aOc4jBTfeA7fnz(hF2{-@Rhbs34JgW89oxgt%lO9*Ta7_KI=Ie zVe(QMM^d|k*cehP?1FzqOvZRgKjy77ESe$W?>=+#=Yw5Uicqe_O)VLkP5BbR;lF-< zO-{e?VKU!(x(Kso6YKj;v`DqqQwN$nNimvOds4S!K3E%b_rn1f7clC=RtU`TRQN=J z=Y4h;=GojQ?7nxG!cZS(4!nhafOGqB24;1hqpf3{9QQp z!~%D+2Fhqw(_NG%E<>mVKQHTSS+(s;Zy|pxdfvm^oY4gU`7lwgHZ@;DB~+D!JUnd{ z!M0fSSnK@jUXI!K0kiKz2w9clA2s}W$X%<7&qP`UjD-!OWwu5?zqon`YhYz}r5JWu zo^Qbg@-NC}7*LHdUCDxPiI)OBWpAGyG6bn59FpN;Ta6b|wMlr@rgt!E01T6ylxr@% zc(_ib6#S1;M(yCn-_Eo=Kj`azol5AN{&g9g2spV?wD$dULQ*n1ME!~k=z>VpyVB6f^x~<|@Za$ZDTU6Bu>z5 z^i+k0bl&XrGYMylpMFi)OgEUv2Ftr$KX%%HdXN|t3PXVazhX!zOSzFDHN3Dr^jcV&Y5QHfyE!- z#p~c>hTZN?M%SF`w*X~%ZkmJ(Hp!(ClMhpNz9N$YnqhLxKp}qSSdeCxIraG&jBK(O zcl~ubmT7<~5h-icvIuZHo2BHWT=Qi7Q}PVAvzoih-7{{0_JWM#$?5Xj}c<&^uE z>|307d{=et8{x(c;y)8`b8gH<{r8LsImXgp53(w>dH@x3rj_V3e{agKZ4_a(?gMEmT0yv z9=L%|7R95~McL1`Cm3`gU$)mb=HFibv1gN(bCFLm;U32ym{o^2zqfw(zY`yX^8b}ScJUR_*``==|(th?lw+* zwgYy^0k}cLUjA}=YHF`nP8=^X^s?`Tl3l{|jd9=;mqGIo<6oJNqtQ4@ki8M23Va)^D;p-pQ#er|V1qRS~`-CoBYiAtL+w;+Re6EqT zRJR=M7r~xGKbQ?BX+~^Jr8EFB&c=tDcNyT>22ZA@r|;v2v6f!&2~C-O&vYVRht6aM zy)+xmlay%Ame^4{bg_6tE$0mfNM>TXy}Y~{0Mr$;g)&O{bOHE^u{bb+XIupcn7@NM z#~Q-IuucuaO)Ia@7`)7BE<3EDZ<%|;@(s0R5@2#KK!6d(|1A{nsNB2!+#vC!81k|c z@C?Um*#}(mb}617P+1wcoS~VNpQ3%#*Xjp%T_X7bMDVec6>M~iqH6VYQFuKB!}$?) z;1f@s8XRs<<#T;JfXr*SNT{`}6N>WAHOXbvhHpGw_wLwQZIjj@P_!VDs=h=ifNU9- z|I!;!=!OR8s}9Kk&AvO+v-^TM)bJl?BHS^2*!cQeU?9Hgu+FqXa6|F_+({S$HSqtU5TkDe_b?KSYC9BXA^vqcvjKe(ZdvkVI zWIn0va5@?v;pgYqfk;yy*$EKchr9%jJ$VX}QrFd$?)wMsVk2^gw)?FYH~T_7=jYeA z4`zplfBkxTEPtcF_rS}F$&sMdTDlQd_dWa#D6N&trFA@b@hcyUyIJy6J#w zEj+}9gQVC{!vI*(DizLw%!b*NI_!m!cYb7;JEL0lQ9zVX8c?1@o6PMlwa4I#^6buz z$`ARl<(sZo-RmIhK*|vk28t_49a!zLU&)HfI)PTS!sJZ|+yuWDSP%w4sSe%~zD91* zVbA9FQ2d9%-OoF^b$KV)!lf38!VAMVh4XskPDCr|Nm;tE$zbJ0$t|*t^9zEFxWh>3 zjL$2PT7)phf)p#tn@Vvm??~L3+VD6p+g*aDJWR)Q@vW==7Jsi{o;Nte5)5HtKMU#h z-OrJlPt*mf)MrtLu?T{c2+vxI}4heI^sNnL%=sMOTLN>WhEsaJAK`i{moEg z^Uc+-9>4|wfeHY2%;m51JvPwg^6cfM(Yn)*Enhi{e*;pqQr0}V2$3!jwDf&1toKP- z^yqv~`N9Sxzdc_u-F+wkDmZk{J6G{4T%Vd%Wpk=jpROgPBD5jCBDrr2sU0THhW<7` zOTPIX;-_%1q()A*786*qXu^&SO*%X%giZIDjcbT%@Ms*w-*h%74ai;ZhWQEvmP+jy zLFNwX+XrsO6SRGu19N5agVs70N2=bEJUBkztN+lc8vS?UK4g$^VsA-dN2-c@Xo|A) z*pc-)##3Q>@y+`PY=~#L8BRb;D7MOHR2X^S#N~tRr@(|Gz=dE=e-TX;3wTj+UWu3e z2jxN!+Pmx-f^PWwli9YCK}n*nK@9>vWUc8dKV{Ja5Yg5a&YoM4 zF&TmE07^zLZ~`KiuSCja?;3k&{SNB+N_kq+lccp7-rwW=O=BKyis&^_-n+ePpi zKIW|3XkwZ!>C;YNQ41>;8IuT4Ntj!BRi5xZ7T0N-8cc z_I@`XmKKvmw#kKYuU*u-l@E7nMC~buSQA2S{5cGbg z8yQ3IeI7_lkk%cm^PAMaXgyr&SO<*y0vs5FaRyKvi#C4W{SLO$wA=Nvx`hlAAU62% zo~7hee_uohG^<$-M9Wt?1UR?$Snbjlm(!r)-bH6FjY{_Qz5}A%5hTcE6DTdcmzB@d z38g>&^7q&8-yMAUagSgA5=o4*MfjprMm*5Tk|Y99(G*1ng*t*<9AvdCXg_7LXRnodCH3SbU1RIg2M*LPLy^euI;`);Bd)~LuL3E>Hv-*OK^dmEA>SF(Po@NwpkS{I!%OxJQ0mP|vBa`*O4t!v zZ-8?p3JfCs1HjvxtJg~|?X{YJ_}8XZlI>dRA5e*8`EmR=1iMVL1Cr)vo^RCL-{QR) ztSz>Jz69wD*ceHY69l2u%KAU%2l$tJXg-?DQ66V{c=)@&4m!k-8bg76qb<{ z{r>Nno7wF7{vXcYWg;DFo}Q(1j;HmoFv&&HLM0aqe2H9jx^{Y1L}CT*UQhRt7X;Rj zJMzUAP$t%+e(rvqrrr_3FC{EfiMn(Fm%iC6esXm(Ki56NhuM5!1OL_sMkXu`Rh*-% zoo5Z}tgUtH#=v!F$Ex##?FgkYo~k|TjQQO^a->O&Z}}u#BWH`VtjBw6rn47zoE)k; zbQF}v53~2)h1~tP95o8mHxI6$C3cp_+sga@$sV&a9SHZIMBSIsPffo}x{q_%H(Yed zG!$o{wcNiB?l8iMI0kK`4NOB)FR6vz&KGI?s{_>up76lfjYjaeEInQWYiu9vp4V0~ zo`!>YcIu{N4R>0Ndh3KS7m}CalnHyNantUY0i%(N8X8A+sN1y-nrjmqum(M6ecVYC zrck8#%Qsx>T^T1pyYfAwx$&R$8I9G#*;aa%V(llw5-yYupR`becXnr)RLN)0Pt3<# zb>QUV?C?nBvUi<#Lvd(ce(X?np@E3`2<6en1Pg}=^hAOhKguCuQPFB(4}W>7&N5X? zQfOf%S)u5ckH<%7nvIrp5!)HPwuZG+zO%SnGQVyo(6 znF*CWHSuvN!jH9)wnor2oujie$s_8MKhdOm@(>%HRn=cvxad_E-fIHlHCzJk+yq76 z@i02CRxF2#N)BxCuz%U7E=fiE*Hn@#z^J}QZWLkKrlu|EL>1gQzbiBrfA(Z|>AqFb z$$O7>F=>XHMUS_B~Y#6EXjWMNRHwIN$ zS@|Xq=k%#`Ad7i1&NUv&_e`mQ2u1q2CZ7Gu=iiCn7&MBWv(+d>^DC+(OZ(3$bX+03 zwvRM4G_(@onQ873W}3-3gwu7Ss9RCHtwvb-Whb+kEk!n1LzYk^8;C;z3EF0|;^EWJ z?}{piCG=!DTV4wXyzk18Y-5#+?xT*lsE4`V;#I$1hVeBC*k~j@qh4sDJ=?MR$?3Y{ zekRTq{_PUj5>w8}aHQrO( zoO6TYq(=X|ybq*UEb7wuRUjJ=kCxM_L%a7)&TuzX=&Kd4n3?%xEpvKi>m6-_$K5jX zL$|ths|uB}g2-VpEm<-BnjCjsM3zneFGPvacaKmgH~zuJX?r|X)u|q4l7d~Gbl2d* zZ>#Ox1A@q!0H%UNRd)2XmYYJ}{Cpe=H4KAM*ih0XEP5J_Y4@5+erwyS%q`0$s{Ev>?>+vz28cD}j zQza2-VkAq|#fli4U zYJe@~3i>K9D`IRF;gO1Zy*h$&wwY4LOOn=^5XJX%T{x;?4OG3hxlg81Pb`|z{=gw`avWs1rncz$9m(J)2qD^V8BuvFdNc+#0 zNl{QM;%sbIx>msK-N$9s#n0qG$ASvAWHwAIn?j;IZn=enE(r9q zD|T$w)YLqDrOcE+7Y02Ez|H)_0OT_U;R5RG9!p&zPn6jy;%%;=>`VZqTq8Yu9{6xi zL+ux*ro8zLW~r+XZ~mdHYfjMOFmB~INJh+xb@4nOT?5|h!gb3RygroCctd9vhECJ| z0@UWuM+0=+pFM3K`5+jG@Blwmd~nw7K7Sy=`u{y9M)vZ_#b|^({h|o}@q$8pg9_JEfM#P+W6hcmv$@x@FHUXX%ojASS?l?{!!ml z=_xh4mJ1{Zjk0;aQh#dq?&4=M{{wdJzqIyW3B3P&OQ!$oM9UbEjgm#&B z|LUpOtm)hO&Qf>KINHxg?UIW-7Dnr;p2AJgDgY=RJx0y10f7GM;v={%P;mSHUcZAr zp}t?$>?9O;^!pRQT5B2zl*nc)x(N`g0H;1-VYd_}2?zGX7?8Dx@wq1bIuO8A%}Qwo z$R3ZZ$}~Najns|M33md};V2-G3w7_x*xMBHO_>IW+{P;&r{b}d%k=wEp4x+2)Uf4- z1x=u(?WABfqx=vISB-77-p@~$^_xm}Ol6cWL#l^Cce6{Gpou^M!p1b9O&B%}0(5U5 zrPkKgTpfg#=PMWaP*Y>?g!y`~Eg$XLF>qMRt3QBH?e{g0th*HAEj)lpKRrQ6IL{T1TTFnq6c<}Yj#VTqvA!w>S zsfP#%>+)egD2^oiLL$SN5PqCpn704lpWRkfyNpz;nx7X2)E4M>NwY@tVa|OKTsEVJ z6|NRqxA!yz4Mf*_vqe9(t-P9cgeLV%63;6%B`)$Yn3W0gP&mNJeFaFQZ2$1P#q#sa zZ*{Xn{$ZusXKfz%no~J%wj3h0zReZ?bW{U%Wog%;NTU16JMVT1S&bhh7Ju*L3u0by zb3H8A#(}g201P0|VF21*bKhXi(rg350=Yntu*Wdslo_USoUv%MM3W>lz`DhG{GFI!HJTU|Vnwm}QFLnk*HlMKPgQtf zkl_#ue0HrltIH_{EqT25**!ogAyc*gqFN)`BPAt8uM?y&p2#`}^raPWK|pN8aCE+? z9aVDJ0~7k{_ruQmmr~6#cg6B+y@!CV3p{I9giB^$99AY^z>bv`Q*c_Tmvwz$ zb2EI^BhxUgBde2ADUTb_5)&yJ*I_Vtg%;==9b0l~trD4ap^W*~@cvn3XLCNHO{`(b z6hR)k#K_z+iOI$-vhj+&-9tDpn^h0TB|qVkw~(;#@A1qRh+I=%*yKmv1XT9w*va2& z?f$#smh4FD?x`Y)h=k|2v<*rnKz^r+`K1aef|*d_mK9XlnXtg!=#CgOe|xP7`FV04 z0sN^X+yp}53q_ISI6-%4jf=)fn>9q#r{D}g&pVMmUhI3KV86s{o51D; zK^SrI(teI|7P0obdf0Dx5-9<2mz-x2z}o44+}G_lvk91sl1_<5u8&-()5to75t}@& zsHpfNdD>#ak5M1w;MgF28_*)mU3sDs&sf}D)`DJSOn0VFSn$_6?SyaQ?|PxO++BXa4K{fVT2~pO zue%iiEa}_H#eXw8(L&ZN5@6Uhu3ohlv1P^RZsx34rYbK&ithYyaQ|NJ5_F7qb>R@# z^<95@^=iX9PgB2vH#{bLT(*b8*o@(L5-4#$OUJD52<8uU<>8 z-Lht4%k>-#3?ROu-;6mptL@qhFxfGqIhia}*LN%S{JCw!-hI@dfoDgR zmCGXgf`Kdqh%%4be+B*IUhs&9b+wihv!W^{(o00^3ghfuY>Jk)!gOJdCK6ReDy0&M zYm@6xiB>i_$A zW0bGIH)Dv_-PBcrGNa1~%}TvkFQCo>$YXe{2%b&29eVPRuUYyb?!9Fq;7+#144}Ju zoAGc52nWDp3c%uuZFQt5^97&?kaYDN2gp&yc@R|N3jcILL?^B=bV-mBwfg|D%s5-> zfK|83yiV@2efiM_(I$yI{pz!v>KB|~=YCrjzm4B$GCR8ZrfxnZs}tb%cqJr%inxpX zo=r9l+wAVYwYkxg9GNv|a{(hM7%G}ReC#rix$>(gE$?Y*?_O2O^(G41XsawCp2BDa zocJvRX_zH2kW-wV96eWUoM<8ACF>S2B)_4Ee~kx2<_=a-;dN&lnI$VlNpQ9HX}LWO za4uUv*Lw@ZDfi!RJqu$C&Cg9VL#hzTz)2q@)Goxqu815E|dhbx>ckn)8mgTq*Kgmmji; zJgytHSeVD6^0tE3i!sABHr%;gk+pn{uWOoi7?Td?j8LQ7Ku;H-a3W`P`OPzwjk|+= z3FF9j#xNN?Y<1o6iTI)q#v;h>NKPL*}(2_;-MCMuM#>87i&xf3gQ5e zI3xGPW|yJ3xcH4+t$`Atylb3%DUDL*j2sB~%y*kqsYy~L1EkV}sl48udYuPi^7KnmskY+gkG`*y_^JJhE(3vlr{B=?4S zBf1^lvP7V8WLO=#pl1+w94#2 zVd(;3%6`X~0PZS9!bH*t;jSU0wP?dJMM5MPYQ{)^+#d^RM&0RdMOuakI)%A1$*Z9G@ z^BO2J^Fx3vdZ8}=e1qli<2h)?mRu%zrKP0>DA)s;)p(tyoi`~7MY3CJZ#N9wnrKtB zKdS5WGGERTXg(2&V%Ee!k8?@Kb zHb_QhYlYuJX#UiYX8xegFUq8=;z;6A601=a5Ek{=>ImLT-h2DrquV68!#N_!?#I=k ztD-wQ!atMPA%pcvwy^*ibSVmSLD(?rX*heR0TO<$#UYt#lUPMJ3r|NeVVMkn)?LS|xa?WA`VPAkj(p%ZRzyRVy|7 zL-0Y+t*~w4T z27q96!?}A|t<0hFW3kJa?#SF+Quu^QVEd+Rv?mm*mfEzrF@w}|Uh4v1OM2ZKZk~Dv zG`j=oB!P-$de)TNdLlsC}x~rD(E+y zIh=f5E0iuvx1dCRXMBs+g|SuXEj}lT{v+TIq8+uC9UFiZ>dkVp`v#xFKptl& z47;sph6@cFgq_2m1%iF->n%@Zxl#5ippMpEd@Z~M1{m5FbXBFJS(esbHOWeC4s&sz z+A(thqUWv0C#W9pS34LQcOIk>NB2~vD-WZ=0_=~;qAvU@VThuLPm5GMA#gT8GCj&`V*HL|xa&v0`Q^jl> z2uSFY{A=q-hk+Y}lo4^aXX_R2-8YEwj_1NxN@>pld^yzlY&~pIy3>p_|A)|`q|oN7 z{jLB#7f8cmsbXe6YYpdRX&SzE)(S$EoGusOt532k&cFR>4Il3>LK8(X;o11r7UBg~;?AY8^9WDT|EXTLK@8@^? zG3^=OweE&?GrF)Q`B4wEP2+gG&tG{y`-AO*a93Os)c`>;Z!fG)W{781xqrSrp$!mb zse6JSYvHcE9nQLUUb4}iPa$<&vi^mQ;408*AHT(bGt3u`8&U_~UXren1ho!}W}iL{ zA{EOwRQ9#7A%UAf8uqcX<6I`&JFy;hZHDx}4?7kaeD+LpYc%yWLQ5n_ltaPIA{> z#T}#du?Wv0(7^U~0~cHNooTPOO8Rhr`s*cWYl+t%TBAOvtjn(5v-;qhT3sF@c#UF( zN|6_1b{C|eQmmgBN18i299kZFs`!|G9wts)xL>aX&k~-1RtyFOLhf+{)v5fFH=q{y z#HzKPs~B~;i@;iX5HwA&F3yu)gkkG2nJvENdNyw04sOUB7NBTJJ$dINbDt&7xr1Dd z*O9PU8{nm}fU0tHb^bzhbQ!GBpzklZ|Fd|Y+rq!)T7%-;_@>snAWmwA$@1}mIyfga zYh|=PjEDtlu=FJ-UsqRGH+a^x7pZ0WAd`I@w@?ob2}xxE0;r3A!_!GH%DX-zsE!F! z)uF?}hOrfbN>}%>Wh;O?aH?7<>a?ql1`pI7F&1{qbD6&ar9S!u2C6&Qn|FwbQi20T zD`?3k>1n2bdr#8jB0oU;!Z)k!+JXse=aG4A%9SmXrsB|4mwPtNZZ|tLM#py=Xl*vx;bdaHw9FqfivOQE?&U}zf~msl7J3;+pG z^9-Mh2Xd3<(4gMGP8?9VLmTc_%>)kGgBt%zv!yBMXngolKbI$bFgZE-CGXet%uE;{ zm6G2YIWFA8TpT&7BYicn7+_l)Rzkkl19guE3f7qgNIF7+97HKPusiR>fz&)G#Lcej z3h2Fcjh0h-z4S-YW#5Y>fqCZ&%fG4WJmaPzJzl`nKt~_JKD%$Zk zfJ|TPBK!oJ$&n5Nc&h@TY|OU-l*C%aE>H*Ns+UB146!U*LP#Rn>#^|~la}2L7X3(e zoRPe)fJ4+J(y#s=2hKvb^mOvYEUdI{KvEQ!7N6* z&R=N@XI42MmTIg6i`96wGY<1%9Oulgs&~O!R8)dvI%FrPKQ}=2tj39B?ip3@t%1d| zziWH$NqPE?_Qo)H_4DBFY3bd&#E<+;_M$3$S@jOKJ%s{&ksa*S6miR>-M!#Od+qkj zM*gz6eN{hUb>w(=^T5@UK50v5Mv{Z8Q?8!Gixw`2|Il23oPTRixt?s}E{)TsOfbql zp!J)G+;aaeA507JOj)|e{KTp6%3~Fqnxn;(|82aV> zYX*~;ifhd~_KL^p9+l!R; z7kp+WPLIpoub7)(I8z3Z+t=d_+r&SvEAn~tIM@dk{%YuAcZYY)5zP)n-B3lqsJY5{Lk~xn&VPTUpgmg8(Xz!mAx$;WJ^@cRQNvkl)u69lK_t$Z)`z(@Ezt+U-* zw?iWp?q;)XSX-~!#5_pcFK9Aq65Dup;swv z8~8uBz_v;&_8$#N(?k_Pjp!r0^5sleuLna5Dr>*i4whr`%Nj4A#QK1!0X)@)iwBMP z&5J%QB|AD34L6-vNOj&!Mr)+$_GIDLpoF2&V9wB~WLA?Hj@|m_p^5PQ5dQ zX;{zQ|1?Z@*a((Nei=fNDw)z697*uWBLm>uN`z=!YXkHK$DyaFmCc?Ukcw zb6lc+NNFly6>r!at;T)M($d?I~FLZ43B&=-V+FBbK0{o35J>X}VB3!LEizDEXXk^8lbIc$2=tIq5! zE>Tr?VXq_ zGci;7iIkj$(NTQHI_!H|knZInRcYWgSC7-uRlc_fQoEKMisbP7iN5J<`Kb`fs5Hg}`?TfFmA4ogTafTW z2AHOtWrTzo9isZv@G;|jHS-0Pm@j=Wda4DdXKYpIJEWd)gZyi2CSnU6ApKRgOU(k> zwc&5F=>6E1?cDZ*kM@t9^zttSl=s^k@V9pPAEzAhQ{d-{K=FnD0o%w4s;vMx1up%l z|8ipJH`Dq*La47&afAu~Gr$4W7JgI?xFa7{Q19H~edePZA0qLO5cPuIDzGG{tF&R4 zkN#+B%Up9|zmmx|;(z?VJ3ghqghs-JyN#&Te!0gkU6BbHt$XRVgMZ+qXD_g|M2?0t zI^H2{X6h`1=KBk`6&V+M-!VsrD%`{Vuh!l>s;RB*7X=aBB7y}$LBO)!S45E39rksex56cnTr0tAc@dML)wA#i5g-}{|A&bjA~JMOrQ z{KbG(X4ahRd46pI9&;JX+y>Rgs)%-|O*HY?=n%C%&TDjN%s)9Z?o5ez`;oP#;5FQ) za?m0BfJ}pWtNVfCTcHPPH~<`K9Q4~gG~LlwoYoE^R&*r*;JgeTvZFINK>CH^BypugCw}ZnErbJQUug>>Tvz6a%A4T_e~U+zXUOd z*6dD_?@qsvKE*gP%)@G2KSaF6MO%&pBAa1Q=zfs`#3MtqEYAOj{)c8mw`yxb38nQNqpN}O?lQgAZ6Wx;HroJHX`H^?O;Z`}%yG#nrbY95kt={G=u z&yx)-m$%enqJgeP)k%3&L}Ji+w#p3l2d=~^yQGYgD3VY<%2$u~nfaDF<;56&?=+#C z+g2HLCcU=nE_IRseg}((cRw@W2 z&Y{@VTnq|LmaN2Wx!VpIKh<(L?wrG{L9Y?5a~)GMEygqjn{LFOTB!6LJ$KFL&s78^ zYLwQyvM^Kv`y9WPF2nVj={15D*p`#NZq(-l(avf1nbx1%@ymA$O2z`QwJby}PD?1@ z3>m(#c(iWZNP29xx>12sVT)GtjU4z3rFv+9kwERh&n`8}-_7q4r{U=Ia^KkWyLoC~ zz>M3=u!_f0QqelerEYcT=S4yYR-6fyyz*h2%f%NcfJ{T7&T2Hv2#>`!`)|q49^=pD zHg=5VdfJ5B{8dKc`u`fKe~<&;^aHDW!-bNX^dmK>-vJltzXMUI6RORGL(vOw*z9ir zQ0&xVL|J$z&Z0n_6ZN{oZbZ|ZV*B;RZswkKuZx&@u*j)t*;{Wq{+=n^c6|v~I-Mak zx6U4sg(dz=TaqGl{k*4o3?f1 zu>-mIf4@Jd+DzgjoZP{lezSeqkPLqz4P$?f?oQ!bOkLv0%BX&7l8B|8S8u-)MzqM? zz<6M~dR#tXG!D51j}k%kDP}LDfdu-3R*h2vx!LY#mS9+~(*ZwW)i}uOM3i*`Fo4~!|9#+ zNugUp+;blUPf(^w>5tp`&#h4&Op%PAr8Vjg@mD9by zyhb1{>LFE2OG04!I_C36-8r2J>SPtJv|2YU78SKyCd6BRWWiwGmz>hTL*7I;J%1k; zF3)-;NJ;mZz}?JTH#BS72Q)nKgR*EpVnIAEg_6Pv&}jqfjH z6=Qn}gx!kEB}w0LmJTY4lg8~9YLEHYY>4(iEhQHWn26!HI@$1$sM(Xdu@jX5vg(P-(-Gt~ z`8nO0oV`_yOCP-qY4Mzm5Ik&4J%rkr5Bqz$WZlVcmBZOHLRn<$XYIsmTqEwZVoDE zy7+?0&N3)Ft0Puz+#39x`g!9voL-_ig0l!;_hM@eI*F>RT$71_(`uzDhmyVn|<(FKAg1PRJN83%MJL%Qjd zXT(AJe4u9#4)P-<1-cp201P|Tko5hQ%h@gM9L&lS zDaBRjH7e>-LT(KrQgj3&oF&k2%#d# z)JM^7U0vNQ$u`Mz>*OU!oHfLu*tqzlPE&={Bg~w%O8?a?!J*2NnkZG=>HAb%HtuDa zTf?5#Hl+G+N3-<{48;lp^fQpSQ*%j9S((0YG1oI$HSI^(!o6?j#2K4n$DF4wqh!JI zz=w(00xfy~MD3($Toz>&VzHJchpJh?uvimF&>b~SABfS?Www~Sg|alO4{x^JEKOFW z%Qow5RiH+Ba95OvI+AN9^yp$*^E<6@+)DMi27Gy4T^$^@)+D&cSXry}HQxEq7Ng;L zfZIBqdXTN<>VjQPnZ~l4<|pUB6N3y?N}Z@%=IMN07H8#Sga|8;fo=GfaVO-(bqjRX z-q}*|j-k%_!3KAUSFv$Q(;b7>!UlfU<0qLF=KbnG#9`H%$9%`cIX_t2MH!YT(;l@{ zyOUmCNT0W{k*e=!OOK1VY}SIEw@_jC7zvJa94O>O>)7vxmR8C22>_B|*PM*oSFv-o zC2k41t7~_}EL_bEk#)2ozEWT>w|Mk#BE-qEGh0~-}>Nqz!*po~#iPOpvWOPlh&@gxyR>b8jGGuP_JsywlQ#~=oF@Sl;Ck!-{P~^p4>lbC5TfzKS09QYm z_X7q|wz$DO5BVvZ)7@oFdsr6G-xZF-ZBoMaQFCFSsF|q#(}ywB)$8RHj}Q{5Irelc zp1s!y-t!sY)rQ5!2e(?uv!dYB(ZHjlOM z71bWNX%<=ejYZ||10;IHn+0su+=;vJ4=xv0+zWpKe_28*pVII0sB@FIiTX5T^+Y78 zob>23-+gy-=+2?_z-L-BKgpCmRqiOVY`J$+1?c7H>s{X>Ei{g)mc%|AvQ=Y$h;CRs zGwcT^W0<9gexKFhNLD0388YuB+pk1_;5B4pD<^(BybZjD(%L0zHKe~e#~?R?Q^8|A zc_j}3X;RvVM=Ga6OE^N{b^{ALNK$dET~VrN=?}Xnfh=MPNnze#W|5$ivRv>3l8vZMr!BkK&R;Z;W6` zL2_RD!3Aae(iyh!4|BB`ir6ta7_vqc`V*G)>6YV@F~x{c>_YNf@wWd+%pb+6_aVDo zpf?oUHZu5|R+=IubGkDrZEFJR#aWdV5hQbLqfBkE2Aa-Q6mWv3`#kJmYplB`%!f9c z!T@OAm2(^>oo1%;*bcqZ5haw6gPyg0nBQqpFup01k3vunlr_PFL<;^AlVxreeU#!QwV0vV|*wN~(0)7P%xU75KO&yY(f)%5{+l!Wb zcvx9hqY`_OH`ABQ^P=i6=cOwJuM^(A*I{+~XOg-T=ZZg6Viywmgd8VnL=^w`!=e8Q z)iQC_GFSN^r{fGL`@d2Mf3H{`Yg#rs0#ituqiqrEqBwf(uFF6lE$oavwpc@DGMA^@ zME|pu07#hxYV>|v6%r~}S`BS2EVXJ+2bIVSS)ZM6-=6ZYQ3G6KA z6aGKidPYd#ywJgvK#^v;mg9nFCgE$m`xk|p@g9&FXC#3nmB^Z=;MOEzv7C#e?P#6a z5&67Bn|@dB#PJL}32<%FJQY}X!~JlQgUPG2TfRBi4By6W4AN5FIiI+pw0P8M$#di# z26eU{(qDYvVbVN+GQyaFuS{$#l%`0sAz%^qs+sxCwdu!y+oq)X&y$V}Yc;&Gs5uwn z{xr5kE5xx{Cx0Qt41yLu+@NQZt6-9gj)5D8vk?$(bk8~g;FL_w5Whfx68xWQAd{b_FpmUdO4(inJwpd+k z!jg{tVW#8B96!)iSItyHy2&d`^#OT8PP*V;kk3*A;FJ0G5m=O+RYm5GgV?x#;~wDM zpEvhuTc`wtKr(k{)CfmVT{5T3XOo@0WRDBaZXSAZ;Bf;xRWv(Zm5fQ}tiLfqGTZ5o z-Dd%%zpXgBZY3CYap4z4BC#5+C@>^LTH_#8qkA0;=~hI)g)Ge=^XR6L$;pR4yLBur z5jN0(HpY9qbpyW@Qu7W_I8*+Cs-@dV7j%t$4fl>FNDom*p+T7N<b!2rYNoMQ1bI(SD@D!_G*kQ_iGy=};7M^f`RC#khYPGZi_T$UO9}GgCA{clIg5 zB7rhgH?W=$e*IY9?`h4~sQt2!L5f>dT2lClmaEjp&sA=y8Q?h*CzmVv(7R$Rt&w>M zyWuXi@o@dVt_<9$NWwjG!fTM@t?GQ{P|IFg*^h7>&L0z$K{|#Gx!H^r+NTe ze{D$QwvKO^jBAC4&(ys$&gN62HNB6k8x&uV0aY0&KQ(99yYBCr9tj`q=ICb?Oh8km zc-;NVd(YOmg!9C?OE@ImZmz$^L!cY10RBKg9{$8;-hTXt&5agdSNf&$D(&tl(M zl9AxHW2Qr;?fR%*Oc}{;bW=5^Dfdx!v9EI#CZ7_3yB^#eZ^4<1Ee235!(tsMQfP^=#ZbH-;c%69fjo8gp7pQ+D z&K)e`VV#!4Xu;Of1C&4<$YM@)@}W0;EoZ(kBh&Y4mV(A`kupX^->&sW_c5E4V8%#y@v(UY^q^xgdha~jFN!NXpqNl?eanN~a- zncf>}?$heKXlF{`A+O_nFYGlhX)}%$6jd_e`vFUKz57NzoPCXroBPsv`llk1A}-mR z-pVoAvC0nuT@NVou&_{Q_SUXT_*d0;(HTk!X;{z4&Z&KtSew+ou?_D?a%Ia!5&88; z&j)3AR5HV4B9waY-bS8Gk89AwoQYW_j&Aj;WQem?dT-;u7vduLL*<&l?qg)e(N-;i z+YH9!esQ#KTF%xnhNdJ;2Nru^#ntFkxOWR+{O$k+Gl!WFf*c{4HI>cKQG2z&2JWqTAGu-JFnXrFmWWAJ5fBoN>!zGj@{X);=NhrAIPYtd?`t8eP?pdbHZWw~kT&cTA@-=Vqi&TFio75A{sA0K8?j{2K-0)vqq z$YoMZG+rjxDv--b{>AD7fUtZXgQC@HVHS5P=KlomR;Q_v;L-Olr-!`G%-&4!4L#TM z>_OJy*W1lYdxGmh=ZmHX^+M{=z-Z~u&zfYL{SDRCB`cj)M2l?OZq6!mU@zMcgNm!! z+s-t`;MQuF4BvmlHfV0bp8cv ztBF=>D(aWC0H-#ALyKo%Dew+ovUznz1?qmPucJ{wJo5re-}@v1cS**kyuVSs{UINi z&gRH(<`mxgp0wrk3c4>%;%UZFL^s`AT_7?@BoPsiKEH4-J8^6i(vF!AIUTz|LPt<($g!I-`M<^q-#O-P4W(fq1rxh6}kTx|2G&I8twd`c~2T8pI_oCSgT zoI>!OTexNc2$|?z{`v#>GXCE4clVLEM1+iNAl7X&of2P@lgGAg<6-{Ym~PJZTPkv2 z*V-oRhGQ8y-Fx>?jGubH<%%FuZugB6gcCAgjZ zbuIC1j6v<~)Wr&OdA%ebTOcoLOM~ik0R-Z6N5qFgA8p`B>a~_3CC-^evS9%ud~1E`Qp$ViVpw~_avd@9bYGQBk`4Sl24Ikr+yWy0`Tg1Oe0$2`rXPf zC*98(5bZ5e8DlU`a;2 znT_E5_I^zJGJb{X`@tadD68n4o9*T-@|B;fMmsFq{iKcO{}})tb?odD6&sSO(Y+#H zk-`t^WnDq>FyjQq^|Db$NB7gQUyGVQ+b*!y@^h@2=xQD;(a{JyWRvO!@y-N0P_PU; zk@<_`{qHNxWgWTrEoSUHExx90*J#Sz(e$3YJs0Kh6vJJ==`EL#1&fU=R`D{*@E_DokKe8Y5gF(YIHxVKwX4Wc6@Jio}IFbSD%g0 zkEdlEMK__!X+|CR-c`-s_nyHbUvg%j99X*1^K{H7E#vU}3Pz4cgP+Loy2~|cJjiqM z=V47*Wx1w-z>ZDmjD2>}D^@Y%a!*>!`XwBIB)VD-S(4$Vc@Vlr>FI-;-fO$>;H#a< zPG>_nCElLAj(iQXA}`g0I9i)n&J#7f)>c*@12mrbKFHJ&@QYr^IQo#`Ztf2McN`=oM+`yCr3EI7 zk_=-DUwRDmcBa8V?-sJbPw`|WEIqauiYuOXLbi9}!2`z9p0U!SqQP56@ut`J#`z{- z{AT^DS;Yb!!~@O&8#!XZI9t0(thYIM%?+3WQB4F8BAVWY0=8-Fs0}z$UoeX#-2fNM ze+!sZihj;dv_8+Jx!p+YC(C;|p1Q(3N`%t!C+zRRj9*5NX=m1;#xcyC{%jynUYED8 zx8y!6n&AZo^o^d5faPTA{ZKSoJ2A_;Zxg3q_QPI{-De@zT37810L=MkRF2#e=oQPT z^^$VLrN?63ckTo&1Bn-KP6Y$mb?QPpMzXJcu=3=2TIh>7$bR$+7dESjG=57*lv#$` z0p_xk7jzsFaM^6(%AI4Vou%4b$2(M+))1iKdw6dYq(YU@efY-qf!wGGP@$)9u^B2& z3nf|af+WWpzYeODV}a)6)nv(?^MojereH`YRA65RWJO*k@0I(@R0(OD6 zDYq}w36RXkQr$88Y~mD_;`=)g9l)xf>8$j@;~Q z%5caeGH)nKs3$7BFRtCh3?_qY>onWe{t|P5jf(kt=_=sFc|K^@v{$A4Xb1OM{Z4&) zkGTUjqM2UiZY!C0*XCN)a-YCU`xd0JjV*D8ulu!HVrIOdIKymWPXASHEze(8i8ZcO z_Rg#l?cpEcdIAEIg{uxK%4=LkNi)D-#1BmK^8~@wPjhB(31^>ZW50ve5;*E4TW4Vd z*1Dx%ub4=i{Sq^*C%ZFVl8>4eJYWm=>98yQ&`_Tk3q)UEL$!))DC=8jHmXE7LrSzr zFwO1xt!mXIh3KcK@jqJr`>?>E9|aC8xgmB#8gCtuB}1pTLQn>0b!8rygR57KmFL^j zKn{#SCD{gZ{cPim=i6N6Vt9(dYX**b**Ysx>JyNJ z_I7r%kska78>)M(-uBOu<-p4ghmF-kIu2v=Au%a=yG6fV6}gsJK>wOtX=J@Xx4tXDI?V+I7xb6ZodO+5%!o7y8P0@rN zWRWIfw`pI8i1Cch_q=V1^v-c~4#UI}^FgESes?fQLlGb4B&_{IzpF>Syht?Ch}N#4 z5rmWPPm604Qnne;1ie?W+-%XiSHKdtcnTz~aJ6oIsr<30I$OP%_;{q~Nj zedD*Koaut8$hfBuvZ7SWDwHpF57MsAPc9)gR-A^bO2NOjD&kQ51py0_PIUvg{^<&a z&tTP9;@D4bECsjZNSjn9+q^&#>x$?d-L@sI#$%s^4$IGZVCfxqq@!53@s%&fi4ehFA zrkOy`CFHW?nRRO6fFi~bip}KJ){1IZOCyMiWZz25-glsqy)ITa(^bBO&1x7GR4MI^ z&+pr0l)9fQd0rnFOsnz~?=EQH^40LS@8AAL;=y&w$L{684kW327f+qM-qndV+yP1D21m6W-g7&u5C5 z*c>gjILjX>KSrQJpmgFXn zGRKN}eh1*PlvKm557lo}mwEfKkMFPf zOnpgSg~^?h>Eg!(q-Cq?#J-=lEl3V(s%PNR<#~M^9)<$bB<}^U@UMIY8gFYA3a9;0 z1w-{~8nu33Dyur1D>VzC>nyoetL8$Z1+%h8`zVvuA~qKU$!O8%UwI+yXa#Df`9mgt zXlZ*xz3Er)Q`h4PP$Rq-Zxv8bI>WNfwgpZO)h5Kz2578x0qrF;e513Eb{)9eO|-$* zJcH4tud18^1icJ7yuY%~B=nMaCs})XWK$0!fFevyO-;bzGFJ2ppc+5& z;uat(a2;{6Sg=ct56xvAz%NSK=w@{(zl*Sk@1m~Teyq1&Sk9~9hG(J%RyWz7j&}Q+ zx!yJ=dKv7hKd5{la<^&Sa6d)(l$=TGMFRrSI=J*(ostU3&7Kf$2d|d&1AeH0x!&CG z;GJ>C8TkD2xAxX_&<-Fv!BtpA0X}3n?uTE1{{1y)_XDUfU+(vEYVQZIoaKZd#-q8A z?UEhkj$avXBQA0B17>a-yM+k_NJwtB^mTK=OUcGBr{@#fY_2Cjb;kcPO#}R%JKeANQmA?<)iP?{~cub^-!ssFJsLz zE}w#blD$&etR$^{)%p>XmQ!VK=~H&{T8x16A-}wq*lHz4z3qg-jRkCHjTf#|kF6%| z^x|-rI!0jo-D*bxXZ^LpD)H`(P0fNB>eJ>yi)G4f{R;2hUnS>-i&vxH z_S>YZ2B2OtzJTX-CVxGzRH)u6klv+}^d}8#2AL1&D36UBS5|^%KS=Z>BYlsf32mfL zWBBwaM6~Cp`G8j)YGZ^jovHoE_e|KkgL~j20gHI_BA4J)WS%U`DMF>@8|WKuQ7EfK{7KT7sp-mp4aKNk0vq?;T^Pi{ z#5GQV)B0=iCiHIde$)Uud)6|%MCczbRarzQCqiG~v%q;NVax5jekTRvS#w%}Yf|Y* z6@Geq?X%UWj)SKT^g=F3m3Mq;;^pNjWPgj#4{YEzeW&Bs1@9-f$t{j*SR^{cyC@)3 zs{-l?w+}@Royhh@r%wt^X<3zMc3I&SU&6b)GZw7I$|>LDOJ1g zF~I>jFpYCR5#HD5pNnbF?xJHhygftG6We6sa_^p>gdRh< z*+dr$gAI_dPYb`jIeAgo15(zjw=y4VENyXmr^^csJQn%`#Tp>*=yHCrF!JK&tW3#h z^YZp))RS?}UjdG_uyK@S+kQ@D&~Eq*Zib!5DN|PmV1oLNr5t`@>)YBaE@|<0#aB5~ z<&A`DsZQ*})};3}wA-aqAC-3T({6RyVPHvGZe~{u-Hws|$HJj||GUM9r~P~w6E3dR zklNSlb^P%dF>S5hS>QK&R%LJ_G=o2gOZOf2m82G9%gl*%FaY zItTWtf-_(Ny8O-ey=;!Tu0nHs$lk_{`$J=jad{AGoY2x0eS%76nUDKf8~8V^By&ls ze8=XTW}!4*vXgs<@u?r~sQ>!m4)&||0mXC4>pK+Bv(raan)|css>TLFsX@|47TWzO zbp+2tQD9@tl3_I3)E!nv&gMHfg5-9M+@-q{~euj{5I=V{to zLQ9W8oDQui8ad}^!rZUNmr|k%Pyl~qIL2WT`z{@Udc~|f-P_c?*XSq#)lo!XzUG?< zHVf2+y?sXC(WpODVY`3**^2esjT7#hZ$nZlwA-oE-+A3=XL@|`fnIr{jw^o|#<+W= zjcr^@6Bko!G|U8ho#djH_DRYMyX!nVn~=ULP@KCspWI+5_T1SKE|`H6o^5XUE)EZE z%%yZWpL7}jw=Qcp^O?`YeBP@+uR2zCOWNK9Gc8ES9+BJ&(Jas z@T0G{Q3~eNCfobA=0~=mL!aB*%AW7bB_cJvnZij!&dvU}?UOeL$G!b?sNdpK)X!SH zFdxyQjez|~7M{atDf%v-Pr@@zja;(8FhCX-^EU71AxY0#9OHjCJ)^7+<-v|pfJ1cp zzcu^P#)QIwHgqf9Z`+7=aWd|GSHZG+O!^qT8x-zK_2PvFOc51Plz_8cNB1e(qd8CU+Ma zj2j*|fK5(shB`E&RUNbW-9__fu6(W1YW+*iZoyC8LH)4#F8)+Nhx^><{r!T;6MdHl zx26kXG{PJ{Q$ANU?}c=c=QR6*H&=Y^8@uQmQuMuh%&cAQ7a9$@6VJC03h7q%(iF9M zU^6Co#}q+zJN0CW1S4^2tEbY7f^{WpP442$?mp_A*gDxLGil{wF?o$-c@=25+3l%z}g@o$-BzIWPhx@OPhIp|cf%Z)~fgoJE}6oDoOS+Pg}-rK4Jp z9(FDtUVfnQF!-5OdFVPavND&N!R(h9b(!1flEC)*1 zQn;$_^p{dv$1@}trRALu@*VWgR!8EXb6JUoaO>_`rGd0$<;zEAKa|H&^O;t)opqT< zlZV=FG)Z2<8D?<`tFrPzTpB$rW#4pg!t5^}ng3W}>79t+yu)L@g24)Mv`g zFWxjglk!A}viXk=b8h37a~nMEn=QV>slmH^vW1qWXG7tGegW2*Ual4pe`Q&oVMs*Y zg!EOYG`aE67E?r%18F4S`8!G|}7iW2(<_93)r&HmS(XFhvZ1w9P4g`g<<$kclF zI#0n(KWqDp^@ZTv34I3Q9Hv2^mb7e;u{Vpo?2pnzNgzwj&X}-ls7y=u9grN{Cf|El z_YNi^SmbWcv}8gVE+v?!N69?c*l3dEH*YHf`D&4qd-6{si3N9UE8Jgde^hZN$Z&CB2GleTMgJ)2OD7@zP5|iH@f|#L9pb# zVd9?ktt%`a<0ggGVhLeXUw4|0D#>1wl)h%!W)u^72EJEyj&l4(i2-g^JS#iPdX2#m zEab=|EH*r<0HCcnCdX*SY=%>y>!;n`EDUJy8m=N`bMJ_6LPsY4Kc?UV?meAFr}D&bVq8 z8eF^(dDAV8&t716n}kOLPv=2!?>93ZDAONT5542g$I+s2aOu41D~|YRhY=;&Q=n7i zD2|3N{+wt9>0ThVpluh7wL4#iOc&9Jbgs#*qu^<>eDP57IJfDn&@YQ^nA%cT5w0&O zZ}jlqq1%lJ{Jh=@I~(G}oRjpFlD_KLfgm`JZ&;><6oB(73UJ>xw2{xVuHjC1c+#Mh z9-QK`*q3(~oC!H|n9Y6_>=N{R)CBMM6T~M~_fOm41Vh2C_^1VUV$I^qswkB3f86=w zzjJ^DT-FdO>Ep8Ir9dIEa>X3iT*i%!2!(e~;@*HD!s2{=wUg#z5U7A-NRF(pb#l?c zaJ_;yv!p#1O$gqgrMGq3!EU80fek#LdWH%Hn{w|jddpF=!Z&FVga18L5%9#pixpve z4#y;HJHNCEUN`43>NwMussmU2jN3@Jdt}&(67L>m|0yR$BkDs_2zzT~LeymVvQO$w zkPw4u7+MHD_oqzYkP$4KXXr>v()z+Rrsb%R<6)o0)UYOvCLSRQ<-V!RE|izBr@w+6 zfAN_FNUa4B4QyE+{TixVvmr`DZ3bsq^h~!Wo=DJP&YR?}NKvu#iPr{0WlyDbM!VMW zjv_DJpiKzHn9nCgZXqxD9=@L@Rbr_UfO(c|g{!ol+Dh&QTZq8VxR&*YjfeNf+&6Me z+KbX(T`NVxutSi69a;p|r26YcxEhL`?Mv#s<89K*o|w6Zz;XB2)cSnY63uM!aMGyc zE$e{bCO5saU*%T Date: Fri, 5 May 2023 15:52:18 +0200 Subject: [PATCH 48/61] HIP update design.rst --- doc/source/design.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/design.rst b/doc/source/design.rst index 614ac608d..39b13c3f1 100644 --- a/doc/source/design.rst +++ b/doc/source/design.rst @@ -12,7 +12,7 @@ The Kernel Tuner is designed to be extensible and support different search and execution strategies. The current architecture of the Kernel Tuner can be seen as: -.. image:: architecture_0.4.3.png +.. image:: architecture.png :width: 500pt At the top we have the kernel code and the Python script that tunes it, @@ -48,7 +48,7 @@ building blocks for implementing runners. The observers are explained in :ref:`observers`. At the bottom, the backends are shown. -PyCUDA, CuPy, cuda-python and PyOpenCL are for tuning either CUDA or OpenCL kernels. +PyCUDA, CuPy, cuda-python, PyOpenCL and PyHIP are for tuning either CUDA, OpenCL, or HIP kernels. The C Functions implementation can actually call any compiler, typically NVCC or GCC is used. There is limited support for tuning Fortran kernels. From d71b11b14414b680ee671b4ffcb5c22f27b833ea Mon Sep 17 00:00:00 2001 From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com> Date: Fri, 5 May 2023 16:01:59 +0200 Subject: [PATCH 49/61] HIP update core.py DeviceInterface doc --- kernel_tuner/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 90c9bb4fa..0c494179f 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -213,7 +213,7 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No :type device: int :param lang: Specifies the language used for GPU kernels. - Currently supported: "CUDA", "OpenCL", or "C" + Currently supported: "CUDA", "OpenCL", "HIP" or "C" :type lang: string :param compiler_options: The compiler options to use when compiling kernels for this device. From ee0c4d52a06f693cdd554ce467a789632478da2e Mon Sep 17 00:00:00 2001 From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com> Date: Fri, 5 May 2023 16:03:17 +0200 Subject: [PATCH 50/61] HIP update design.rst --- doc/source/design.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/source/design.rst b/doc/source/design.rst index 39b13c3f1..4ca515e26 100644 --- a/doc/source/design.rst +++ b/doc/source/design.rst @@ -128,6 +128,12 @@ kernel_tuner.backends.c.CFunctions :special-members: __init__ :members: +kernel_tuner.backends.hip.HipFunctions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: kernel_tuner.backends.hip.HipFunctions + :special-members: __init__ + :members: + Util Functions -------------- From 1220cf5a898e5818b2e38c0b65e080181e1db097 Mon Sep 17 00:00:00 2001 From: Milo Lurati <70884255+MiloLurati@users.noreply.github.com> Date: Fri, 5 May 2023 16:12:46 +0200 Subject: [PATCH 51/61] HIP update README.rst --- README.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 08e641061..2f8e8bcc0 100644 --- a/README.rst +++ b/README.rst @@ -28,9 +28,14 @@ To tune OpenCL kernels: - First, make sure you have an OpenCL compiler for your intended OpenCL platform - Then type: ``pip install kernel_tuner[opencl]`` -Or both: +To tune HIP kernels: -- ``pip install kernel_tuner[cuda,opencl]`` +- First, make sure you have an HIP runtime and compiler installed +- Then type: ``pip install kernel_tuner[hip]`` + +Or all: + +- ``pip install kernel_tuner[cuda,opencl,hip]`` More information about how to install Kernel Tuner and its dependencies can be found in the `installation guide From f8b58a100cb8e85a4d6f67a141e3ea4514a3db8e Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Tue, 9 May 2023 13:44:46 +0200 Subject: [PATCH 52/61] updates for running benchmarks with HIP on AMD --- kernel_tuner/backends/hip.py | 39 ++++++++++------- kernel_tuner/core.py | 82 ++++++++++++++++++------------------ test/test_c_functions.py | 2 +- test/test_hip_functions.py | 2 +- 4 files changed, 67 insertions(+), 58 deletions(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 4e114fa23..bbb0f1083 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -58,7 +58,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.name = self.hipProps._name.decode('utf-8') self.max_threads = self.hipProps.maxThreadsPerBlock self.device = device - self.compiler_options = compiler_options + self.compiler_options = compiler_options or [] self.iterations = iterations env = dict() @@ -122,7 +122,7 @@ def ready_argument_list(self, arguments): class ArgListStructure(ctypes.Structure): _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] def __getitem__(self, key): - return self._fields_[key] + return getattr(self, self._fields_[key][0]) return ArgListStructure(*ctype_args) @@ -146,19 +146,28 @@ def compile(self, kernel_instance): kernel_string = 'extern "C" {\n' + kernel_string + "\n}" kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], []) - #Compile based on device (Not yet tested for non-AMD devices) - plat = hip.hipGetPlatformName() - if plat == "amd": - hiprtc.hiprtcCompileProgram( - kernel_ptr, [f'--offload-arch={self.hipProps.gcnArchName}']) - else: - hiprtc.hiprtcCompileProgram(kernel_ptr, []) - - #Get module and kernel from compiled kernel string - code = hiprtc.hiprtcGetCode(kernel_ptr) - module = hip.hipModuleLoadData(code) - self.current_module = module - kernel = hip.hipModuleGetFunction(module, kernel_name) + try: + #Compile based on device (Not yet tested for non-AMD devices) + plat = hip.hipGetPlatformName() + if plat == "amd": + options_list = [f'--offload-arch={self.hipProps.gcnArchName}'] + options_list.extend(self.compiler_options) + hiprtc.hiprtcCompileProgram(kernel_ptr, options_list) + else: + options_list = [] + options_list.extend(self.compiler_options) + hiprtc.hiprtcCompileProgram(kernel_ptr, options_list) + + #Get module and kernel from compiled kernel string + code = hiprtc.hiprtcGetCode(kernel_ptr) + module = hip.hipModuleLoadData(code) + self.current_module = module + kernel = hip.hipModuleGetFunction(module, kernel_name) + + except Exception as e: + log = hiprtc.hiprtcGetProgramLog(kernel_ptr) + print(log) + raise e return kernel diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 0c494179f..1eeeb0d8d 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -425,48 +425,48 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): - return instance - - try: - # compile the kernel - start_compilation = time.perf_counter() - func = self.compile_kernel(instance, verbose) - if not func: - result[to.objective] = util.CompilationFailedConfig() - else: - # add shared memory arguments to compiled module - if kernel_options.smem_args is not None: - self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params)) - # add constant memory arguments to compiled module - if kernel_options.cmem_args is not None: - self.dev.copy_constant_memory_args(kernel_options.cmem_args) - # add texture memory arguments to compiled module - if kernel_options.texmem_args is not None: - self.dev.copy_texture_memory_args(kernel_options.texmem_args) - - # stop compilation stopwatch and convert to miliseconds - last_compilation_time = 1000 * (time.perf_counter() - start_compilation) - - # test kernel for correctness - if func and (to.answer or to.verify): - start_verification = time.perf_counter() - self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) - last_verification_time = 1000 * (time.perf_counter() - start_verification) - - # benchmark - if func: - start_benchmark = time.perf_counter() - result.update(self.benchmark(func, gpu_args, instance, verbose, to.objective)) - last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark) - - except Exception as e: - # dump kernel sources to temp file - temp_filenames = instance.prepare_temp_files_for_error_msg() - print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames)) - raise e + result[to.objective] = util.InvalidConfig() + else: + try: + # compile the kernel + start_compilation = time.perf_counter() + func = self.compile_kernel(instance, verbose) + if not func: + result[to.objective] = util.CompilationFailedConfig() + else: + # add shared memory arguments to compiled module + if kernel_options.smem_args is not None: + self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params)) + # add constant memory arguments to compiled module + if kernel_options.cmem_args is not None: + self.dev.copy_constant_memory_args(kernel_options.cmem_args) + # add texture memory arguments to compiled module + if kernel_options.texmem_args is not None: + self.dev.copy_texture_memory_args(kernel_options.texmem_args) + + # stop compilation stopwatch and convert to miliseconds + last_compilation_time = 1000 * (time.perf_counter() - start_compilation) + + # test kernel for correctness + if func and (to.answer or to.verify): + start_verification = time.perf_counter() + self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) + last_verification_time = 1000 * (time.perf_counter() - start_verification) + + # benchmark + if func: + start_benchmark = time.perf_counter() + result.update(self.benchmark(func, gpu_args, instance, verbose, to.objective)) + last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark) + + except Exception as e: + # dump kernel sources to temp file + temp_filenames = instance.prepare_temp_files_for_error_msg() + print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames)) + raise e - #clean up any temporary files, if no error occured - instance.delete_temp_files() + #clean up any temporary files, if no error occured + instance.delete_temp_files() result['compile_time'] = last_compilation_time or 0 result['verification_time'] = last_verification_time or 0 diff --git a/test/test_c_functions.py b/test/test_c_functions.py index 6fb0c2b6d..224b66524 100644 --- a/test/test_c_functions.py +++ b/test/test_c_functions.py @@ -143,7 +143,7 @@ def test_compile(npct, subprocess): print(npct.mock_calls) print(f) - assert len(subprocess.mock_calls) == 6 + assert len(subprocess.mock_calls) == 8 assert npct.load_library.called == 1 args, _ = npct.load_library.call_args_list[0] diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py index 87ab9dfe6..ce3eb0642 100644 --- a/test/test_hip_functions.py +++ b/test/test_hip_functions.py @@ -55,7 +55,7 @@ class ArgListStructure(ctypes.Structure): ("field2", ctypes.POINTER(ctypes.c_float)), ("field3", ctypes.c_bool)] def __getitem__(self, key): - return self._fields_[key] + return getattr(self, self._fields_[key][0]) dev = kt_hip.HipFunctions(0) gpu_args = dev.ready_argument_list(arguments) From 6c44bdadfd15091fb8e8bd72ae5ac256642c1b76 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Tue, 9 May 2023 14:11:27 +0200 Subject: [PATCH 53/61] update to test_compile --- test/test_c_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_c_functions.py b/test/test_c_functions.py index 224b66524..6fb0c2b6d 100644 --- a/test/test_c_functions.py +++ b/test/test_c_functions.py @@ -143,7 +143,7 @@ def test_compile(npct, subprocess): print(npct.mock_calls) print(f) - assert len(subprocess.mock_calls) == 8 + assert len(subprocess.mock_calls) == 6 assert npct.load_library.called == 1 args, _ = npct.load_library.call_args_list[0] From c8b919848efa48d44bb37df8f4a6b90cf43bc0f5 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Tue, 9 May 2023 17:19:26 +0200 Subject: [PATCH 54/61] handeling loop unroll for HIP --- kernel_tuner/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index fd7ee3c37..0e3dee4d6 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -618,8 +618,8 @@ def prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, blo v = str(v) v = v.replace("\n", "\\\n") - if "loop_unroll_factor" in k and lang == "CUDA": - # this handles the special case that in CUDA + if "loop_unroll_factor" in k and lang in ("CUDA", "HIP"): + # this handles the special case that in CUDA/HIP # pragma unroll loop_unroll_factor, loop_unroll_factor should be a constant integer expression # in OpenCL this isn't the case and we can just insert "#define loop_unroll_factor N" # using 0 to disable specifying a loop unrolling factor for this loop From 3f89fc3c79ffefa39f0f335655582dbe4a8d88b7 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Thu, 11 May 2023 08:58:19 +0200 Subject: [PATCH 55/61] added vector add in simulation mode to examples --- examples/hip/vector_add_simulation_mode.py | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 examples/hip/vector_add_simulation_mode.py diff --git a/examples/hip/vector_add_simulation_mode.py b/examples/hip/vector_add_simulation_mode.py new file mode 100644 index 000000000..379a707de --- /dev/null +++ b/examples/hip/vector_add_simulation_mode.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +"""This is the minimal example from the README""" + +import numpy +from kernel_tuner import tune_kernel +from kernel_tuner.file_utils import store_output_file, store_metadata_file +import logging +from collections import OrderedDict +import os + +def tune(): + + kernel_string = """ + __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * block_size_x + threadIdx.x; + if (i Date: Thu, 11 May 2023 09:15:56 +0200 Subject: [PATCH 56/61] updates hip vector add examples --- examples/hip/vector_add.py | 6 ++---- examples/hip/vector_add_simulation_mode.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/hip/vector_add.py b/examples/hip/vector_add.py index 72397b628..5b2470923 100644 --- a/examples/hip/vector_add.py +++ b/examples/hip/vector_add.py @@ -29,10 +29,8 @@ def tune(): tune_params = dict() tune_params["block_size_x"] = [128+64*i for i in range(15)] - results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP", log=logging.DEBUG) - - # Store the tuning results in an output file - store_output_file("vector_add.json", results, tune_params) + results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP", + cache="vector_add_cache.json", log=logging.DEBUG) # Store the metadata of this run store_metadata_file("vector_add-metadata.json") diff --git a/examples/hip/vector_add_simulation_mode.py b/examples/hip/vector_add_simulation_mode.py index 379a707de..e03822141 100644 --- a/examples/hip/vector_add_simulation_mode.py +++ b/examples/hip/vector_add_simulation_mode.py @@ -35,7 +35,7 @@ def tune(): if os.path.isfile(filename): results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, strategy="simulated_annealing", - lang="HIP", simulation_mode=True, cache="vector_add.json") + lang="HIP", simulation_mode=True, cache="vector_add_cache.json") # Store the tuning results in an output file store_output_file("vector_add_simulated_annealing.json", results, tune_params) From c77e1be5304d670b24a3190e46859cacb3b1d3bf Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Fri, 30 Jun 2023 12:48:04 +0200 Subject: [PATCH 57/61] changed tune_params dictionary type --- examples/hip/vector_add.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/hip/vector_add.py b/examples/hip/vector_add.py index 5b2470923..7e2810711 100644 --- a/examples/hip/vector_add.py +++ b/examples/hip/vector_add.py @@ -5,6 +5,7 @@ from kernel_tuner import tune_kernel from kernel_tuner.file_utils import store_output_file, store_metadata_file import logging +from collections import OrderedDict def tune(): @@ -26,7 +27,7 @@ def tune(): args = [c, a, b, n] - tune_params = dict() + tune_params = OrderedDict() tune_params["block_size_x"] = [128+64*i for i in range(15)] results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, lang="HIP", From 83d9993f1a983692c2b903af87ebe93c7d2838d5 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Fri, 30 Jun 2023 12:50:48 +0200 Subject: [PATCH 58/61] specified cache argument for simulation --- examples/hip/vector_add_simulation_mode.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/hip/vector_add_simulation_mode.py b/examples/hip/vector_add_simulation_mode.py index e03822141..a3368bca8 100644 --- a/examples/hip/vector_add_simulation_mode.py +++ b/examples/hip/vector_add_simulation_mode.py @@ -31,16 +31,12 @@ def tune(): tune_params = OrderedDict() tune_params["block_size_x"] = [128+64*i for i in range(15)] - filename = "vector_add.json" + filename = "vector_add_cache.json" if os.path.isfile(filename): results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, - strategy="simulated_annealing", + strategy="random_sample", strategy_options=dict(max_fevals=10), lang="HIP", simulation_mode=True, cache="vector_add_cache.json") - # Store the tuning results in an output file - store_output_file("vector_add_simulated_annealing.json", results, tune_params) - - return results else: print(f"{filename} does not exist in the directory, run vector_add.py first.") From 4ce2f834db2c31d08676b6902779fb03ab439d76 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Fri, 30 Jun 2023 12:52:08 +0200 Subject: [PATCH 59/61] deleted device_properties env, was causing error type --- kernel_tuner/backends/hip.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index bbb0f1083..682f5e6a9 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -65,7 +65,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None env["device_name"] = self.name env["iterations"] = self.iterations env["compiler_options"] = compiler_options - env["device_properties"] = self.hipProps self.env = env # create a stream and events From ee7d11afc0890b6a287a40cde19c71800d4fb539 Mon Sep 17 00:00:00 2001 From: MiloLurati Date: Fri, 30 Jun 2023 12:53:13 +0200 Subject: [PATCH 60/61] fixed bug related to runtimes in simulation mode --- kernel_tuner/file_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel_tuner/file_utils.py b/kernel_tuner/file_utils.py index 321d0db80..13ae223bc 100644 --- a/kernel_tuner/file_utils.py +++ b/kernel_tuner/file_utils.py @@ -105,6 +105,8 @@ def store_output_file(output_filename: str, results, tune_params, objective="tim timings["validation"] = result["verification_time"] if "times" in result: timings["runtimes"] = result["times"] + else: + timings["runtimes"] = [] out["times"] = timings # encode the validity of the configuration From 0d968077c44aab7a5e20825b1f75998055415723 Mon Sep 17 00:00:00 2001 From: Ben van Werkhoven Date: Fri, 8 Sep 2023 11:52:12 +0200 Subject: [PATCH 61/61] make store_metadata_file a little more robust --- kernel_tuner/file_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/file_utils.py b/kernel_tuner/file_utils.py index 13ae223bc..0d5024187 100644 --- a/kernel_tuner/file_utils.py +++ b/kernel_tuner/file_utils.py @@ -227,7 +227,7 @@ def store_metadata_file(metadata_filename: str): hardware_description_string = "[" + hardware_description_string + "]" metadata["operating_system"] = os_string except: - hardware_description_string = "[error retrieving hardware description]" + hardware_description_string = '["error retrieving hardware description"]' metadata["operating_system"] = "unidentified OS" metadata["hardware"] = dict(hardware_description=json.loads(hardware_description_string)) @@ -235,14 +235,14 @@ def store_metadata_file(metadata_filename: str): device_query = {} try: device_query["nvidia-smi"] = get_device_query("nvidia") - except FileNotFoundError: - # ignore if nvidia-smi is not found + except Exception: + # ignore if nvidia-smi is not found, or parse error occurs pass try: device_query["rocm-smi"] = get_device_query("amd") - except FileNotFoundError: - # ignore if rocm-smi is not found + except Exception: + # ignore if rocm-smi is not found, or parse error occurs pass metadata["environment"] = dict(device_query=device_query, requirements=get_dependencies())