Add ruff linter #201

Merged · 11 commits · Nov 27, 2024
12 changes: 12 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,12 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.4
hooks:
- id: ruff
args: [--fix, --show-fixes]
- id: ruff-format

default_language_version:
python: python3
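
For context, the two hooks above correspond to the mechanical rewrites seen in the rest of this diff: double quotes instead of single quotes, f-strings instead of str.format, and explicit parentheses around single-element tuple unpacking. A minimal, self-contained Python sketch of that before/after style (illustrative only, not code from this repository):

# Before the hooks, this might read:
#     s = 'benchmark'; msg = 'got: {}'.format(s); first, = (42,)
# After ruff format plus ruff's autofixes, the equivalent code looks like this:

def describe(value):
    # f-string instead of str.format, double quotes instead of single quotes
    return f"got: {value}"


# single-element tuple unpacking gains explicit parentheses
(first,) = (42,)
print(describe(first))

Running pre-commit run --all-files (or ruff check --fix and ruff format directly) applies the same rewrites across the repository.
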
6 changes: 3 additions & 3 deletions cuda_bindings/benchmarks/kernels.py
@@ -5,9 +5,9 @@
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
kernel_string = '''\
kernel_string = """\
#define ITEM_PARAM(x, T) T x
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP2(x, T) REP1(x##0, T) REP1(x##1, T)
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
@@ -160,4 +160,4 @@
// Do not touch param to prevent compiler from copying
// the whole structure from const bank to lmem.
}
'''
"""
45 changes: 27 additions & 18 deletions cuda_bindings/benchmarks/perf_test_utils.py
@@ -5,27 +5,30 @@
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
import numpy as np
import pytest

from cuda import cuda, cudart, nvrtc
import numpy as np


def ASSERT_DRV(err):
if isinstance(err, cuda.CUresult):
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
raise RuntimeError(f"Cuda Error: {err}")
elif isinstance(err, cudart.cudaError_t):
if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError('Cudart Error: {}'.format(err))
raise RuntimeError(f"Cudart Error: {err}")
elif isinstance(err, nvrtc.nvrtcResult):
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('Nvrtc Error: {}'.format(err))
raise RuntimeError(f"Nvrtc Error: {err}")
else:
raise RuntimeError('Unknown error type: {}'.format(err))
raise RuntimeError(f"Unknown error type: {err}")


@pytest.fixture
def init_cuda():
# Initialize
err, = cuda.cuInit(0)
(err,) = cuda.cuInit(0)
ASSERT_DRV(err)
err, device = cuda.cuDeviceGet(0)
ASSERT_DRV(err)
@@ -38,31 +41,37 @@ def init_cuda():

yield device, ctx, stream

err, = cuda.cuStreamDestroy(stream)
(err,) = cuda.cuStreamDestroy(stream)
ASSERT_DRV(err)
err, = cuda.cuCtxDestroy(ctx)
(err,) = cuda.cuCtxDestroy(ctx)
ASSERT_DRV(err)


@pytest.fixture
def load_module():
module = None

def _load_module(kernel_string, device):
nonlocal module
# Get module
err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)
err, major = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device
)
ASSERT_DRV(err)
err, minor = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)
err, minor = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device
)
ASSERT_DRV(err)

err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b'kernelString.cu', 0, [], [])
err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"kernelString.cu", 0, [], [])
ASSERT_DRV(err)
opts = [b'--fmad=false', bytes('--gpu-architecture=sm_' + str(major) + str(minor), 'ascii')]
err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
opts = [b"--fmad=false", bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii")]
(err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts)

err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog)
ASSERT_DRV(err_log)
log = b' ' * logSize
err_log, = nvrtc.nvrtcGetProgramLog(prog, log)
log = b" " * logSize
(err_log,) = nvrtc.nvrtcGetProgramLog(prog, log)
ASSERT_DRV(err_log)
result = log.decode()
if len(result) > 1:
@@ -71,8 +80,8 @@ def _load_module(kernel_string, device):
ASSERT_DRV(err)
err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog)
ASSERT_DRV(err)
cubin = b' ' * cubinSize
err, = nvrtc.nvrtcGetCUBIN(prog, cubin)
cubin = b" " * cubinSize
(err,) = nvrtc.nvrtcGetCUBIN(prog, cubin)
ASSERT_DRV(err)
cubin = np.char.array(cubin)
err, module = cuda.cuModuleLoadData(cubin)
Expand All @@ -82,5 +91,5 @@ def _load_module(kernel_string, device):

yield _load_module

err, = cuda.cuModuleUnload(module)
(err,) = cuda.cuModuleUnload(module)
ASSERT_DRV(err)
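
The init_cuda and load_module fixtures above are what the driver-API benchmarks build on. As a rough, hypothetical sketch of how a test could consume them (not part of this diff; it assumes pytest-benchmark is installed, that the fixtures are importable into the test module, and that empty_kernel comes from kernel_string in kernels.py):

import pytest

from cuda import cuda

from .kernels import kernel_string
from .perf_test_utils import ASSERT_DRV, init_cuda, load_module  # noqa: F401  # fixtures must be in the test module's namespace


@pytest.mark.benchmark(group="driver")
def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module):
    device, ctx, stream = init_cuda
    module = load_module(kernel_string, device)
    err, kernel = cuda.cuModuleGetFunction(module, b"empty_kernel")
    ASSERT_DRV(err)

    def launch():
        # 1x1x1 grid and block, no shared memory, no kernel parameters, no extra options
        (err,) = cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, 0, 0)
        ASSERT_DRV(err)

    benchmark(launch)
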
35 changes: 24 additions & 11 deletions cuda_bindings/benchmarks/test_cupy.py
@@ -5,42 +5,47 @@
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
import pytest
import ctypes

import pytest

# Always skip since cupy does not support CTK 12.x yet
skip_tests = True
if not skip_tests:
try:
import cupy

skip_tests = False
except ImportError:
skip_tests = True

from .kernels import kernel_string


def launch(kernel, args=()):
kernel((1,), (1,), args)


# Measure launch latency with no parameters
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_empty_kernel(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('empty_kernel')
kernel = module.get_function("empty_kernel")

stream = cupy.cuda.stream.Stream(non_blocking=True)

with stream:
benchmark(launch, kernel)
stream.synchronize()


# Measure launch latency with a single parameter
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel')
kernel = module.get_function("small_kernel")
cupy.cuda.set_allocator()
arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float))

@@ -50,12 +55,13 @@ def test_launch_latency_small_kernel(benchmark):
benchmark(launch, kernel, (arg,))
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_args(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_args')
kernel = module.get_function("small_kernel_512_args")
cupy.cuda.set_allocator()

args = []
@@ -69,12 +75,13 @@ def test_launch_latency_small_kernel_512_args(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_bools(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_bools')
kernel = module.get_function("small_kernel_512_bools")
cupy.cuda.set_allocator()

args = [True] * 512
@@ -86,12 +93,13 @@ def test_launch_latency_small_kernel_512_bools(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_doubles(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_doubles')
kernel = module.get_function("small_kernel_512_doubles")
cupy.cuda.set_allocator()

args = [1.2345] * 512
@@ -103,12 +111,13 @@ def test_launch_latency_small_kernel_512_doubles(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_ints(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_ints')
kernel = module.get_function("small_kernel_512_ints")
cupy.cuda.set_allocator()

args = [123] * 512
@@ -120,12 +129,13 @@ def test_launch_latency_small_kernel_512_ints(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_bytes(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_chars')
kernel = module.get_function("small_kernel_512_chars")
cupy.cuda.set_allocator()

args = [127] * 512
@@ -137,12 +147,13 @@ def test_launch_latency_small_kernel_512_bytes(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_longlongs(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_longlongs')
kernel = module.get_function("small_kernel_512_longlongs")
cupy.cuda.set_allocator()

args = [9223372036854775806] * 512
@@ -154,12 +165,13 @@ def test_launch_latency_small_kernel_512_longlongs(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_256_args(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_256_args')
kernel = module.get_function("small_kernel_256_args")
cupy.cuda.set_allocator()

args = []
@@ -173,12 +185,13 @@ def test_launch_latency_small_kernel_256_args(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_16_args(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_16_args')
kernel = module.get_function("small_kernel_16_args")
cupy.cuda.set_allocator()

args = []