Add ruff linter #201

Merged · 11 commits · Nov 27, 2024
12 changes: 12 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,12 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.4
hooks:
- id: ruff
args: [--fix, --show-fixes]
- id: ruff-format

default_language_version:
python: python3
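
For context, the two hooks above correspond to the mechanical rewrites seen in the rest of this diff: double quotes instead of single quotes, f-strings instead of str.format, and explicit parentheses around single-element tuple unpacking. A minimal, self-contained Python sketch of that before/after style (illustrative only, not code from this repository):

# Before the hooks, this might read:
#     s = 'benchmark'; msg = 'got: {}'.format(s); first, = (42,)
# After ruff format plus ruff's autofixes, the equivalent code looks like this:

def describe(value):
    # f-string instead of str.format, double quotes instead of single quotes
    return f"got: {value}"


# single-element tuple unpacking gains explicit parentheses
(first,) = (42,)
print(describe(first))

Running pre-commit run --all-files (or ruff check --fix and ruff format directly) applies the same rewrites across the repository.
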
6 changes: 3 additions & 3 deletions cuda_bindings/benchmarks/kernels.py
@@ -5,9 +5,9 @@
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
kernel_string = '''\
kernel_string = """\
#define ITEM_PARAM(x, T) T x
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP2(x, T) REP1(x##0, T) REP1(x##1, T)
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
@@ -160,4 +160,4 @@
// Do not touch param to prevent compiler from copying
// the whole structure from const bank to lmem.
}
'''
"""
45 changes: 27 additions & 18 deletions cuda_bindings/benchmarks/perf_test_utils.py
@@ -5,27 +5,30 @@
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
import numpy as np
import pytest

from cuda import cuda, cudart, nvrtc
import numpy as np


def ASSERT_DRV(err):
if isinstance(err, cuda.CUresult):
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
raise RuntimeError(f"Cuda Error: {err}")
elif isinstance(err, cudart.cudaError_t):
if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError('Cudart Error: {}'.format(err))
raise RuntimeError(f"Cudart Error: {err}")
elif isinstance(err, nvrtc.nvrtcResult):
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('Nvrtc Error: {}'.format(err))
raise RuntimeError(f"Nvrtc Error: {err}")
else:
raise RuntimeError('Unknown error type: {}'.format(err))
raise RuntimeError(f"Unknown error type: {err}")


@pytest.fixture
def init_cuda():
# Initialize
err, = cuda.cuInit(0)
(err,) = cuda.cuInit(0)
ASSERT_DRV(err)
err, device = cuda.cuDeviceGet(0)
ASSERT_DRV(err)
@@ -38,31 +41,37 @@ def init_cuda():

yield device, ctx, stream

err, = cuda.cuStreamDestroy(stream)
(err,) = cuda.cuStreamDestroy(stream)
ASSERT_DRV(err)
err, = cuda.cuCtxDestroy(ctx)
(err,) = cuda.cuCtxDestroy(ctx)
ASSERT_DRV(err)


@pytest.fixture
def load_module():
module = None

def _load_module(kernel_string, device):
nonlocal module
# Get module
err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)
err, major = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device
)
ASSERT_DRV(err)
err, minor = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)
err, minor = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device
)
ASSERT_DRV(err)

err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b'kernelString.cu', 0, [], [])
err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"kernelString.cu", 0, [], [])
ASSERT_DRV(err)
opts = [b'--fmad=false', bytes('--gpu-architecture=sm_' + str(major) + str(minor), 'ascii')]
err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
opts = [b"--fmad=false", bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii")]
(err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts)

err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog)
ASSERT_DRV(err_log)
log = b' ' * logSize
err_log, = nvrtc.nvrtcGetProgramLog(prog, log)
log = b" " * logSize
(err_log,) = nvrtc.nvrtcGetProgramLog(prog, log)
ASSERT_DRV(err_log)
result = log.decode()
if len(result) > 1:
@@ -71,8 +80,8 @@ def _load_module(kernel_string, device):
ASSERT_DRV(err)
err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog)
ASSERT_DRV(err)
cubin = b' ' * cubinSize
err, = nvrtc.nvrtcGetCUBIN(prog, cubin)
cubin = b" " * cubinSize
(err,) = nvrtc.nvrtcGetCUBIN(prog, cubin)
ASSERT_DRV(err)
cubin = np.char.array(cubin)
err, module = cuda.cuModuleLoadData(cubin)
Expand All @@ -82,5 +91,5 @@ def _load_module(kernel_string, device):

yield _load_module

err, = cuda.cuModuleUnload(module)
(err,) = cuda.cuModuleUnload(module)
ASSERT_DRV(err)
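
The init_cuda and load_module fixtures above are what the driver-API benchmarks build on. As a rough, hypothetical sketch of how a test could consume them (not part of this diff; it assumes pytest-benchmark is installed, that the fixtures are importable into the test module, and that empty_kernel comes from kernel_string in kernels.py):

import pytest

from cuda import cuda

from .kernels import kernel_string
from .perf_test_utils import ASSERT_DRV, init_cuda, load_module  # noqa: F401  # fixtures must be in the test module's namespace


@pytest.mark.benchmark(group="driver")
def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module):
    device, ctx, stream = init_cuda
    module = load_module(kernel_string, device)
    err, kernel = cuda.cuModuleGetFunction(module, b"empty_kernel")
    ASSERT_DRV(err)

    def launch():
        # 1x1x1 grid and block, no shared memory, no kernel parameters, no extra options
        (err,) = cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, 0, 0)
        ASSERT_DRV(err)

    benchmark(launch)
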
35 changes: 24 additions & 11 deletions cuda_bindings/benchmarks/test_cupy.py
@@ -5,42 +5,47 @@
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
import pytest
import ctypes

import pytest

# Always skip since cupy does not support CTK 12.x yet
skip_tests = True
if not skip_tests:
try:
import cupy

skip_tests = False
except ImportError:
skip_tests = True

from .kernels import kernel_string


def launch(kernel, args=()):
kernel((1,), (1,), args)


# Measure launch latency with no parameters
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_empty_kernel(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('empty_kernel')
kernel = module.get_function("empty_kernel")

stream = cupy.cuda.stream.Stream(non_blocking=True)

with stream:
benchmark(launch, kernel)
stream.synchronize()


# Measure launch latency with a single parameter
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel')
kernel = module.get_function("small_kernel")
cupy.cuda.set_allocator()
arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float))

@@ -50,12 +55,13 @@ def test_launch_latency_small_kernel(benchmark):
benchmark(launch, kernel, (arg,))
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_args(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_args')
kernel = module.get_function("small_kernel_512_args")
cupy.cuda.set_allocator()

args = []
@@ -69,12 +75,13 @@ def test_launch_latency_small_kernel_512_args(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_bools(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_bools')
kernel = module.get_function("small_kernel_512_bools")
cupy.cuda.set_allocator()

args = [True] * 512
@@ -86,12 +93,13 @@ def test_launch_latency_small_kernel_512_bools(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_doubles(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_doubles')
kernel = module.get_function("small_kernel_512_doubles")
cupy.cuda.set_allocator()

args = [1.2345] * 512
@@ -103,12 +111,13 @@ def test_launch_latency_small_kernel_512_doubles(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_ints(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_ints')
kernel = module.get_function("small_kernel_512_ints")
cupy.cuda.set_allocator()

args = [123] * 512
@@ -120,12 +129,13 @@ def test_launch_latency_small_kernel_512_ints(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_bytes(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_chars')
kernel = module.get_function("small_kernel_512_chars")
cupy.cuda.set_allocator()

args = [127] * 512
@@ -137,12 +147,13 @@ def test_launch_latency_small_kernel_512_bytes(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_longlongs(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_longlongs')
kernel = module.get_function("small_kernel_512_longlongs")
cupy.cuda.set_allocator()

args = [9223372036854775806] * 512
@@ -154,12 +165,13 @@ def test_launch_latency_small_kernel_512_longlongs(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_256_args(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_256_args')
kernel = module.get_function("small_kernel_256_args")
cupy.cuda.set_allocator()

args = []
@@ -173,12 +185,13 @@ def test_launch_latency_small_kernel_256_args(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_16_args(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_16_args')
kernel = module.get_function("small_kernel_16_args")
cupy.cuda.set_allocator()

args = []