diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index 451bd963d..a1e13ff03 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -1,15 +1,11 @@ -"""This module contains all Cupy specific kernel_tuner functions""" +"""This module contains all Cupy specific kernel_tuner functions.""" from __future__ import print_function - -import logging -import time import numpy as np from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.cupy import CupyRuntimeObserver - # embedded in try block to be able to generate documentation # and run tests without cupy installed try: @@ -19,10 +15,10 @@ class CupyFunctions(GPUBackend): - """Class that groups the Cupy functions on maintains state about the device""" + """Class that groups the Cupy functions on maintains state about the device.""" def __init__(self, device=0, iterations=7, compiler_options=None, observers=None): - """instantiate CupyFunctions object used for interacting with the CUDA device + """Instantiate CupyFunctions object used for interacting with the CUDA device. Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -39,8 +35,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.texrefs = [] if not cp: raise ImportError( - "Error: cupy not installed, please install e.g. " - + "using 'pip install cupy', please check https://github.com/cupy/cupy." + "cupy not installed, install using 'pip install cupy', or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#cuda-and-pycuda." 
) # select device @@ -88,7 +83,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.name = env["device_name"] def ready_argument_list(self, arguments): - """ready argument list to be passed to the kernel, allocates gpu mem + """Ready argument list to be passed to the kernel, allocates gpu mem. :param arguments: List of arguments to be passed to the kernel. The order should match the argument list on the CUDA kernel. @@ -111,7 +106,7 @@ def ready_argument_list(self, arguments): return gpu_args def compile(self, kernel_instance): - """call the CUDA compiler to compile the kernel, return the device function + """Call the CUDA compiler to compile the kernel, return the device function. :param kernel_name: The name of the kernel to be compiled, used to lookup the function after compilation. @@ -140,23 +135,23 @@ def compile(self, kernel_instance): return self.func def start_event(self): - """Records the event that marks the start of a measurement""" + """Records the event that marks the start of a measurement.""" self.start.record(stream=self.stream) def stop_event(self): - """Records the event that marks the end of a measurement""" + """Records the event that marks the end of a measurement.""" self.end.record(stream=self.stream) def kernel_finished(self): - """Returns True if the kernel has finished, False otherwise""" + """Returns True if the kernel has finished, False otherwise.""" return self.end.done def synchronize(self): - """Halts execution until device has finished its tasks""" + """Halts execution until device has finished its tasks.""" self.dev.synchronize() def copy_constant_memory_args(self, cmem_args): - """adds constant memory arguments to the most recently compiled module + """Adds constant memory arguments to the most recently compiled module. :param cmem_args: A dictionary containing the data to be passed to the device constant memory. 
The format to be used is as follows: A @@ -171,11 +166,11 @@ def copy_constant_memory_args(self, cmem_args): constant_mem[:] = cp.asarray(v) def copy_shared_memory_args(self, smem_args): - """add shared memory arguments to the kernel""" + """Add shared memory arguments to the kernel.""" self.smem_size = smem_args["size"] def copy_texture_memory_args(self, texmem_args): - """adds texture memory arguments to the most recently compiled module + """Adds texture memory arguments to the most recently compiled module. :param texmem_args: A dictionary containing the data to be passed to the device texture memory. See tune_kernel(). @@ -184,7 +179,7 @@ def copy_texture_memory_args(self, texmem_args): raise NotImplementedError("CuPy backend does not support texture memory") def run_kernel(self, func, gpu_args, threads, grid, stream=None): - """runs the CUDA kernel passed as 'func' + """Runs the CUDA kernel passed as 'func'. :param func: A cupy kernel compiled for this specific kernel configuration :type func: cupy.RawKernel @@ -205,7 +200,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): func(grid, threads, gpu_args, stream=stream, shared_mem=self.smem_size) def memset(self, allocation, value, size): - """set the memory in allocation to the value in value + """Set the memory in allocation to the value in value. :param allocation: A GPU memory allocation unit :type allocation: cupy.ndarray @@ -220,7 +215,7 @@ def memset(self, allocation, value, size): allocation[:] = value def memcpy_dtoh(self, dest, src): - """perform a device to host memory copy + """Perform a device to host memory copy. :param dest: A numpy array in host memory to store the data :type dest: numpy.ndarray @@ -237,7 +232,7 @@ def memcpy_dtoh(self, dest, src): raise ValueError("dest type not supported") def memcpy_htod(self, dest, src): - """perform a host to device memory copy + """Perform a host to device memory copy. 
:param dest: A GPU memory allocation unit :type dest: cupy.ndarray diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 682f5e6a9..d3c6e1d4a 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -1,20 +1,17 @@ -"""This module contains all HIP specific kernel_tuner functions""" +"""This module contains all HIP specific kernel_tuner functions.""" -import numpy as np import ctypes import ctypes.util -import sys import logging +import numpy as np + from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.hip import HipRuntimeObserver -# embedded in try block to be able to generate documentation -# and run tests without pyhip installed try: from pyhip import hip, hiprtc except ImportError: - print("Not able to import pyhip, check if PYTHONPATH includes PyHIP") hip = None hiprtc = None @@ -35,10 +32,10 @@ hipSuccess = 0 class HipFunctions(GPUBackend): - """Class that groups the HIP functions on maintains state about the device""" + """Class that groups the HIP functions on maintains state about the device.""" def __init__(self, device=0, iterations=7, compiler_options=None, observers=None): - """instantiate HipFunctions object used for interacting with the HIP device + """Instantiate HipFunctions object used for interacting with the HIP device. Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -51,8 +48,13 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. 
:type iterations: int """ + if not hip or not hiprtc: + raise ImportError("Unable to import PyHIP, make sure PYTHONPATH includes PyHIP, or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#hip-and-pyhip.") + + # embedded in try block to be able to generate documentation + # and run tests without pyhip installed logging.debug("HipFunction instantiated") - + self.hipProps = hip.hipGetDeviceProperties(device) self.name = self.hipProps._name.decode('utf-8') @@ -85,13 +87,13 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None def ready_argument_list(self, arguments): - """ready argument list to be passed to the HIP function + """Ready argument list to be passed to the HIP function. :param arguments: List of arguments to be passed to the HIP function. The order should match the argument list on the HIP function. Allowed values are np.ndarray, and/or np.int32, np.float32, and so on. :type arguments: list(numpy objects) - + :returns: Ctypes structure of arguments to be passed to the HIP function. 
:rtype: ctypes structure """ @@ -109,11 +111,11 @@ def ready_argument_list(self, arguments): hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes) ctype_args.append(device_ptr) else: - raise TypeError("unknown dtype for ndarray") - # Convert valid non-array arguments to ctypes + raise TypeError("unknown dtype for ndarray") + # Convert valid non-array arguments to ctypes elif isinstance(arg, np.generic): data_ctypes = dtype_map[dtype_str](arg) - ctype_args.append(data_ctypes) + ctype_args.append(data_ctypes) # Determine the types of the fields in the structure field_types = [type(x) for x in ctype_args] @@ -122,17 +124,17 @@ class ArgListStructure(ctypes.Structure): _fields_ = [(f'field{i}', t) for i, t in enumerate(field_types)] def __getitem__(self, key): return getattr(self, self._fields_[key][0]) - + return ArgListStructure(*ctype_args) - - + + def compile(self, kernel_instance): - """call the HIP compiler to compile the kernel, return the function - + """Call the HIP compiler to compile the kernel, return the function. + :param kernel_instance: An object representing the specific instance of the tunable kernel in the parameter space. :type kernel_instance: kernel_tuner.core.KernelInstance - + :returns: An ctypes function that can be called directly. 
:rtype: ctypes._FuncPtr """ @@ -144,7 +146,7 @@ def compile(self, kernel_instance): if 'extern "C"' not in kernel_string: kernel_string = 'extern "C" {\n' + kernel_string + "\n}" kernel_ptr = hiprtc.hiprtcCreateProgram(kernel_string, kernel_name, [], []) - + try: #Compile based on device (Not yet tested for non-AMD devices) plat = hip.hipGetPlatformName() @@ -156,7 +158,7 @@ def compile(self, kernel_instance): options_list = [] options_list.extend(self.compiler_options) hiprtc.hiprtcCompileProgram(kernel_ptr, options_list) - + #Get module and kernel from compiled kernel string code = hiprtc.hiprtcGetCode(kernel_ptr) module = hip.hipModuleLoadData(code) @@ -167,36 +169,36 @@ def compile(self, kernel_instance): log = hiprtc.hiprtcGetProgramLog(kernel_ptr) print(log) raise e - + return kernel - + def start_event(self): - """Records the event that marks the start of a measurement""" + """Records the event that marks the start of a measurement.""" logging.debug("HipFunction start_event called") hip.hipEventRecord(self.start, self.stream) def stop_event(self): - """Records the event that marks the end of a measurement""" + """Records the event that marks the end of a measurement.""" logging.debug("HipFunction stop_event called") hip.hipEventRecord(self.end, self.stream) def kernel_finished(self): - """Returns True if the kernel has finished, False otherwise""" + """Returns True if the kernel has finished, False otherwise.""" logging.debug("HipFunction kernel_finished called") - + # Query the status of the event return hip.hipEventQuery(self.end) def synchronize(self): - """Halts execution until device has finished its tasks""" + """Halts execution until device has finished its tasks.""" logging.debug("HipFunction synchronize called") hip.hipDeviceSynchronize() def run_kernel(self, func, gpu_args, threads, grid, stream=None): - """runs the HIP kernel passed as 'func' + """Runs the HIP kernel passed as 'func'. 
:param func: A HIP kernel compiled for this specific kernel configuration :type func: ctypes pionter @@ -219,15 +221,15 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): if stream is None: stream = self.stream - hip.hipModuleLaunchKernel(func, - grid[0], grid[1], grid[2], + hip.hipModuleLaunchKernel(func, + grid[0], grid[1], grid[2], threads[0], threads[1], threads[2], self.smem_size, stream, gpu_args) def memset(self, allocation, value, size): - """set the memory in allocation to the value in value + """Set the memory in allocation to the value in value. :param allocation: A GPU memory allocation unit :type allocation: ctypes ptr @@ -240,11 +242,11 @@ def memset(self, allocation, value, size): """ logging.debug("HipFunction memset called") - + hip.hipMemset(allocation, value, size) def memcpy_dtoh(self, dest, src): - """perform a device to host memory copy + """Perform a device to host memory copy. :param dest: A numpy array in host memory to store the data :type dest: numpy.ndarray @@ -260,7 +262,7 @@ def memcpy_dtoh(self, dest, src): hip.hipMemcpy_dtoh(dest_c, src, dest.nbytes) def memcpy_htod(self, dest, src): - """perform a host to device memory copy + """Perform a host to device memory copy. :param dest: A GPU memory allocation unit :type dest: ctypes ptr @@ -276,7 +278,7 @@ def memcpy_htod(self, dest, src): hip.hipMemcpy_htod(dest, src_c, src.nbytes) def copy_constant_memory_args(self, cmem_args): - """adds constant memory arguments to the most recently compiled module + """Adds constant memory arguments to the most recently compiled module. :param cmem_args: A dictionary containing the data to be passed to the device constant memory. 
The format to be used is as follows: A @@ -298,12 +300,13 @@ def copy_constant_memory_args(self, cmem_args): hip.hipMemcpy_htod(symbol_ptr, v_c, v.nbytes) def copy_shared_memory_args(self, smem_args): - """add shared memory arguments to the kernel""" + """Add shared memory arguments to the kernel.""" logging.debug("HipFunction copy_shared_memory_args called") self.smem_size = smem_args["size"] def copy_texture_memory_args(self, texmem_args): + """Copy texture memory arguments. Not yet implemented.""" logging.debug("HipFunction copy_texture_memory_args called") raise NotImplementedError("HIP backend does not support texture memory") diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 32aa8efe5..c6fb73d5e 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -1,4 +1,4 @@ -"""This module contains all NVIDIA cuda-python specific kernel_tuner functions""" +"""This module contains all NVIDIA cuda-python specific kernel_tuner functions.""" import numpy as np from kernel_tuner.backends.backend import GPUBackend @@ -14,10 +14,10 @@ class CudaFunctions(GPUBackend): - """Class that groups the Cuda functions on maintains state about the device""" + """Class that groups the Cuda functions on maintains state about the device.""" def __init__(self, device=0, iterations=7, compiler_options=None, observers=None): - """instantiate CudaFunctions object used for interacting with the CUDA device + """Instantiate CudaFunctions object used for interacting with the CUDA device. Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -38,8 +38,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.texrefs = [] if not cuda: raise ImportError( - "Error: cuda-python not installed, please install e.g. " - + "using 'pip install cuda-python', please check https://github.com/NVIDIA/cuda-python." 
+ "cuda-python not installed, install using 'pip install cuda-python', or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#cuda-and-pycuda." ) # initialize and select device @@ -113,7 +112,7 @@ def __del__(self): cuda_error_check(err) def ready_argument_list(self, arguments): - """ready argument list to be passed to the kernel, allocates gpu mem + """Ready argument list to be passed to the kernel, allocates gpu mem. :param arguments: List of arguments to be passed to the kernel. The order should match the argument list on the CUDA kernel. @@ -138,7 +137,7 @@ def ready_argument_list(self, arguments): return gpu_args def compile(self, kernel_instance): - """call the CUDA compiler to compile the kernel, return the device function + """Call the CUDA compiler to compile the kernel, return the device function. :param kernel_name: The name of the kernel to be compiled, used to lookup the function after compilation. @@ -203,17 +202,17 @@ def compile(self, kernel_instance): return self.func def start_event(self): - """Records the event that marks the start of a measurement""" + """Records the event that marks the start of a measurement.""" err = cudart.cudaEventRecord(self.start, self.stream) cuda_error_check(err) def stop_event(self): - """Records the event that marks the end of a measurement""" + """Records the event that marks the end of a measurement.""" err = cudart.cudaEventRecord(self.end, self.stream) cuda_error_check(err) def kernel_finished(self): - """Returns True if the kernel has finished, False otherwise""" + """Returns True if the kernel has finished, False otherwise.""" err = cudart.cudaEventQuery(self.end) if err[0] == cudart.cudaError_t.cudaSuccess: return True @@ -222,12 +221,12 @@ def kernel_finished(self): @staticmethod def synchronize(): - """Halts execution until device has finished its tasks""" + """Halts execution until device has finished its tasks.""" err = cudart.cudaDeviceSynchronize() cuda_error_check(err) def 
copy_constant_memory_args(self, cmem_args): - """adds constant memory arguments to the most recently compiled module + """Adds constant memory arguments to the most recently compiled module. :param cmem_args: A dictionary containing the data to be passed to the device constant memory. The format to be used is as follows: A @@ -243,11 +242,11 @@ def copy_constant_memory_args(self, cmem_args): cuda_error_check(err) def copy_shared_memory_args(self, smem_args): - """add shared memory arguments to the kernel""" + """Add shared memory arguments to the kernel.""" self.smem_size = smem_args["size"] def copy_texture_memory_args(self, texmem_args): - """adds texture memory arguments to the most recently compiled module + """Adds texture memory arguments to the most recently compiled module. :param texmem_args: A dictionary containing the data to be passed to the device texture memory. See tune_kernel(). @@ -256,7 +255,7 @@ def copy_texture_memory_args(self, texmem_args): raise NotImplementedError("NVIDIA CUDA backend does not support texture memory") def run_kernel(self, func, gpu_args, threads, grid, stream=None): - """runs the CUDA kernel passed as 'func' + """Runs the CUDA kernel passed as 'func'. :param func: A CUDA kernel compiled for this specific kernel configuration :type func: cuda.CUfunction @@ -298,7 +297,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): @staticmethod def memset(allocation, value, size): - """set the memory in allocation to the value in value + """Set the memory in allocation to the value in value. :param allocation: A GPU memory allocation unit :type allocation: cupy.ndarray @@ -315,7 +314,7 @@ def memset(allocation, value, size): @staticmethod def memcpy_dtoh(dest, src): - """perform a device to host memory copy + """Perform a device to host memory copy. 
:param dest: A numpy array in host memory to store the data :type dest: numpy.ndarray @@ -328,7 +327,7 @@ def memcpy_dtoh(dest, src): @staticmethod def memcpy_htod(dest, src): - """perform a host to device memory copy + """Perform a host to device memory copy. :param dest: A GPU memory allocation unit :type dest: cuda.CUdeviceptr diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py index eaf37a469..af3be1c00 100644 --- a/kernel_tuner/backends/opencl.py +++ b/kernel_tuner/backends/opencl.py @@ -1,6 +1,6 @@ -"""This module contains all OpenCL specific kernel_tuner functions""" +"""This module contains all OpenCL specific kernel_tuner functions.""" from __future__ import print_function -import time + import numpy as np from kernel_tuner.backends.backend import GPUBackend @@ -14,12 +14,12 @@ class OpenCLFunctions(GPUBackend): - """Class that groups the OpenCL functions on maintains some state about the device""" + """Class that groups the OpenCL functions on maintains some state about the device.""" def __init__( self, device=0, platform=0, iterations=7, compiler_options=None, observers=None ): - """Creates OpenCL device context and reads device properties + """Creates OpenCL device context and reads device properties. :param device: The ID of the OpenCL device to use for benchmarking :type device: int @@ -29,7 +29,7 @@ def __init__( """ if not cl: raise ImportError( - "Error: pyopencl not installed, please install e.g. using 'pip install pyopencl'." + "pyopencl not installed, install using 'pip install pyopencl', or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#opencl-and-pyopencl." ) self.iterations = iterations @@ -69,7 +69,7 @@ def __init__( self.name = dev.name def ready_argument_list(self, arguments): - """ready argument list to be passed to the kernel, allocates gpu mem + """Ready argument list to be passed to the kernel, allocates gpu mem. :param arguments: List of arguments to be passed to the kernel. 
The order should match the argument list on the OpenCL kernel. @@ -96,7 +96,7 @@ def ready_argument_list(self, arguments): return gpu_args def compile(self, kernel_instance): - """call the OpenCL compiler to compile the kernel, return the device function + """Call the OpenCL compiler to compile the kernel, return the device function. :param kernel_name: The name of the kernel to be compiled, used to lookup the function after compilation. @@ -115,27 +115,29 @@ def compile(self, kernel_instance): return func def start_event(self): - """Records the event that marks the start of a measurement + """Records the event that marks the start of a measurement. - In OpenCL the event is created when the kernel is launched""" + In OpenCL the event is created when the kernel is launched + """ pass def stop_event(self): - """Records the event that marks the end of a measurement + """Records the event that marks the end of a measurement. - In OpenCL the event is created when the kernel is launched""" + In OpenCL the event is created when the kernel is launched + """ pass def kernel_finished(self): - """Returns True if the kernel has finished, False otherwise""" + """Returns True if the kernel has finished, False otherwise.""" return self.event.get_info(cl.event_info.COMMAND_EXECUTION_STATUS) == 0 def synchronize(self): - """Halts execution until device has finished its tasks""" + """Halts execution until device has finished its tasks.""" self.queue.finish() def run_kernel(self, func, gpu_args, threads, grid): - """runs the OpenCL kernel passed as 'func' + """Runs the OpenCL kernel passed as 'func'. :param func: An OpenCL Kernel :type func: pyopencl.Kernel @@ -158,7 +160,7 @@ def run_kernel(self, func, gpu_args, threads, grid): self.event = func(self.queue, global_size, local_size, *gpu_args) def memset(self, buffer, value, size): - """set the memory in allocation to the value in value + """Set the memory in allocation to the value in value. 
:param allocation: An OpenCL Buffer to fill :type allocation: pyopencl.Buffer @@ -178,7 +180,7 @@ def memset(self, buffer, value, size): cl.enqueue_copy(self.queue, buffer, src) def memcpy_dtoh(self, dest, src): - """perform a device to host memory copy + """Perform a device to host memory copy. :param dest: A numpy array in host memory to store the data :type dest: numpy.ndarray @@ -190,7 +192,7 @@ def memcpy_dtoh(self, dest, src): cl.enqueue_copy(self.queue, dest, src) def memcpy_htod(self, dest, src): - """perform a host to device memory copy + """Perform a host to device memory copy. :param dest: An OpenCL Buffer to copy data from :type dest: pyopencl.Buffer diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 694a63885..981ebe0bc 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -1,14 +1,13 @@ -"""This module contains all CUDA specific kernel_tuner functions""" +"""This module contains all CUDA specific kernel_tuner functions.""" from __future__ import print_function import logging -import time + import numpy as np from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.pycuda import PyCudaRuntimeObserver -from kernel_tuner.observers.nvml import nvml -from kernel_tuner.util import TorchPlaceHolder, SkippableFailure +from kernel_tuner.util import SkippableFailure, TorchPlaceHolder # embedded in try block to be able to generate documentation # and run tests without pycuda installed @@ -41,7 +40,7 @@ def __init__(self): class Holder(drv.PointerHolderBase): - """class to interoperate torch device memory allocations with PyCUDA""" + """class to interoperate torch device memory allocations with PyCUDA.""" def __init__(self, tensor): super(Holder, self).__init__() @@ -53,10 +52,10 @@ def get_pointer(self): class PyCudaFunctions(GPUBackend): - """Class that groups the CUDA functions on maintains state about the device""" + """Class that groups the CUDA functions on maintains 
state about the device.""" def __init__(self, device=0, iterations=7, compiler_options=None, observers=None): - """instantiate PyCudaFunctions object used for interacting with the CUDA device + """Instantiate PyCudaFunctions object used for interacting with the CUDA device. Instantiating this object will inspect and store certain device properties at runtime, which are used during compilation and/or execution of kernels by the @@ -74,7 +73,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None # if not PyCuda available, check if mocking before raising exception if not pycuda_available and isinstance(drv, PyCudaPlaceHolder): raise ImportError( - "Error: pycuda not installed, please install e.g. using 'pip install pycuda'." + "pycuda not installed, install using 'pip install pycuda', or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#cuda-and-pycuda." ) drv.init() @@ -154,7 +153,7 @@ def __del__(self): gpu_mem.free() def ready_argument_list(self, arguments): - """ready argument list to be passed to the kernel, allocates gpu mem + """Ready argument list to be passed to the kernel, allocates gpu mem. :param arguments: List of arguments to be passed to the kernel. The order should match the argument list on the CUDA kernel. @@ -186,7 +185,7 @@ def ready_argument_list(self, arguments): return gpu_args def compile(self, kernel_instance): - """call the CUDA compiler to compile the kernel, return the device function + """Call the CUDA compiler to compile the kernel, return the device function. :param kernel_name: The name of the kernel to be compiled, used to lookup the function after compilation. 
@@ -226,23 +225,23 @@ def compile(self, kernel_instance): raise e def start_event(self): - """Records the event that marks the start of a measurement""" + """Records the event that marks the start of a measurement.""" self.start.record(stream=self.stream) def stop_event(self): - """Records the event that marks the end of a measurement""" + """Records the event that marks the end of a measurement.""" self.end.record(stream=self.stream) def kernel_finished(self): - """Returns True if the kernel has finished, False otherwise""" + """Returns True if the kernel has finished, False otherwise.""" return self.end.query() def synchronize(self): - """Halts execution until device has finished its tasks""" + """Halts execution until device has finished its tasks.""" self.context.synchronize() def copy_constant_memory_args(self, cmem_args): - """adds constant memory arguments to the most recently compiled module + """Adds constant memory arguments to the most recently compiled module. :param cmem_args: A dictionary containing the data to be passed to the device constant memory. The format to be used is as follows: A @@ -263,17 +262,16 @@ def copy_constant_memory_args(self, cmem_args): drv.memcpy_htod(symbol, v) def copy_shared_memory_args(self, smem_args): - """add shared memory arguments to the kernel""" + """Add shared memory arguments to the kernel.""" self.smem_size = smem_args["size"] def copy_texture_memory_args(self, texmem_args): - """adds texture memory arguments to the most recently compiled module + """Adds texture memory arguments to the most recently compiled module. :param texmem_args: A dictionary containing the data to be passed to the device texture memory. See tune_kernel(). 
:type texmem_args: dict """ - filter_mode_map = { "point": drv.filter_mode.POINT, "linear": drv.filter_mode.LINEAR, @@ -326,7 +324,7 @@ def copy_texture_memory_args(self, texmem_args): tex.set_flags(tex.get_flags() | drv.TRSF_NORMALIZED_COORDINATES) def run_kernel(self, func, gpu_args, threads, grid, stream=None): - """runs the CUDA kernel passed as 'func' + """Runs the CUDA kernel passed as 'func'. :param func: A PyCuda kernel compiled for this specific kernel configuration :type func: pycuda.driver.Function @@ -356,7 +354,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): ) def memset(self, allocation, value, size): - """set the memory in allocation to the value in value + """Set the memory in allocation to the value in value. :param allocation: A GPU memory allocation unit :type allocation: pycuda.driver.DeviceAllocation @@ -371,7 +369,7 @@ def memset(self, allocation, value, size): drv.memset_d8(allocation, value, size) def memcpy_dtoh(self, dest, src): - """perform a device to host memory copy + """Perform a device to host memory copy. :param dest: A numpy array in host memory to store the data :type dest: numpy.ndarray @@ -385,7 +383,7 @@ def memcpy_dtoh(self, dest, src): dest[:] = src def memcpy_htod(self, dest, src): - """perform a host to device memory copy + """Perform a host to device memory copy. 
:param dest: A GPU memory allocation unit :type dest: pycuda.driver.DeviceAllocation diff --git a/kernel_tuner/observers/hip.py b/kernel_tuner/observers/hip.py index 72a3cb4fe..f789462e3 100644 --- a/kernel_tuner/observers/hip.py +++ b/kernel_tuner/observers/hip.py @@ -5,15 +5,17 @@ try: from pyhip import hip, hiprtc except ImportError: - print("Not able to import pyhip, check if PYTHONPATH includes PyHIP") hip = None hiprtc = None class HipRuntimeObserver(BenchmarkObserver): - """Observer that measures time using CUDA events during benchmarking""" + """Observer that measures time using HIP events during benchmarking.""" def __init__(self, dev): + if not hip or not hiprtc: + raise ImportError("Unable to import PyHIP, make sure PYTHONPATH includes PyHIP, or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#hip-and-pyhip.") + self.dev = dev self.stream = dev.stream self.start = dev.start