re-run ruff linter and formatter on updated main
ksimpson-work committed Nov 19, 2024 · 1 parent e36241a · commit 5e950f7
Showing 64 changed files with 3,561 additions and 2,427 deletions.
12 changes: 12 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,12 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.6.4
    hooks:
      - id: ruff
        args: [--fix, --show-fixes]
      - id: ruff-format

default_language_version:
  python: python3
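With these hooks installed, ruff lints (with autofixes) and reformats the tree on every commit. As a minimal sketch, the same checks can be driven from Python (sketch only, not part of the commit; assumes the pre-commit tool is installed):

import subprocess

# Run all configured hooks (ruff + ruff-format) across the repository;
# equivalent to `pre-commit run --all-files` on the command line.
subprocess.run(["pre-commit", "run", "--all-files"], check=True)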
44 changes: 27 additions & 17 deletions continuous_integration/scripts/render-template.py
@@ -3,9 +3,9 @@
import argparse
import json
from jinja2 import Environment, FileSystemLoader
import os
import re


# TODO: make this work for arbitrary context, i.e. implement replace_using_context()
def replace_placeholder(source_str, variable_name, variable_value):
# Escaping any regex special characters in variable_name
@@ -14,39 +14,49 @@ def replace_placeholder(source_str, variable_name, variable_value):
# Using regular expression to replace ${variable_name} with actual variable_value
# \s* means any amount of whitespace (including none)
# pattern = rf'\$\{{\s*\{{\s*{variable_name_escaped}\s*\}}\s*\}}'
pattern = rf'<<\s*{variable_name_escaped}\s*>>'
pattern = rf"<<\s*{variable_name_escaped}\s*>>"
return re.sub(pattern, variable_value.strip(), source_str)
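As an illustration (not part of the commit), the function condenses to the following; the pattern tolerates whitespace inside the delimiters, and the replacement value is stripped:

import re

def replace_placeholder(source_str, variable_name, variable_value):
    # Match << name >> with any interior whitespace, regex metacharacters escaped
    pattern = rf"<<\s*{re.escape(variable_name)}\s*>>"
    return re.sub(pattern, variable_value.strip(), source_str)

print(replace_placeholder("image: << tag >>", "tag", " v1.2.3 "))  # image: v1.2.3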


# Setup command-line argument parsing
parser = argparse.ArgumentParser(description='Render a Jinja2 template using a JSON context.')
parser.add_argument('template_file', type=str, help='Path to the Jinja2 template file (with .j2 extension).')
parser.add_argument('json_file', type=str, help='Path to the JSON file to use as the rendering context.')
parser.add_argument('output_file', type=str, help='Path to the output file.')
parser = argparse.ArgumentParser(
description="Render a Jinja2 template using a JSON context."
)
parser.add_argument(
"template_file",
type=str,
help="Path to the Jinja2 template file (with .j2 extension).",
)
parser.add_argument(
"json_file", type=str, help="Path to the JSON file to use as the rendering context."
)
parser.add_argument("output_file", type=str, help="Path to the output file.")

args = parser.parse_args()
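As a quick illustration of the CLI surface (these file names are hypothetical), parse_args also accepts an explicit argument list, which is convenient for testing:

# parser.parse_args(["matrix.j2", "matrix.json", "out.yaml"])
# -> Namespace(template_file="matrix.j2", json_file="matrix.json",
#              output_file="out.yaml")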

# Load JSON file as the rendering context
with open(args.json_file, 'r') as file:
with open(args.json_file, "r") as file:
context = json.load(file)

# Setup Jinja2 environment and load the template
env = Environment(
loader=FileSystemLoader(searchpath='./'),
variable_start_string='<<',
variable_end_string='>>',
block_start_string='<%',
block_end_string='%>',
comment_start_string='<#',
comment_end_string='#>')
env.filters['replace_placeholder'] = replace_placeholder
loader=FileSystemLoader(searchpath="./"),
variable_start_string="<<",
variable_end_string=">>",
block_start_string="<%",
block_end_string="%>",
comment_start_string="<#",
comment_end_string="#>",
)
env.filters["replace_placeholder"] = replace_placeholder

template = env.get_template(args.template_file)

# Render the template with the context
rendered_content = template.render(context)
# print(rendered_content)

with open(args.output_file, 'w') as file:
with open(args.output_file, "w") as file:
file.write(rendered_content)

print(f'Template rendered successfully. Output saved to {args.output_file}')
print(f"Template rendered successfully. Output saved to {args.output_file}")
6 changes: 3 additions & 3 deletions cuda_bindings/benchmarks/kernels.py
@@ -5,9 +5,9 @@
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
kernel_string = '''\
kernel_string = """\
#define ITEM_PARAM(x, T) T x
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP2(x, T) REP1(x##0, T) REP1(x##1, T)
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
@@ -160,4 +160,4 @@
// Do not touch param to prevent compiler from copying
// the whole structure from const bank to lmem.
}
'''
"""
45 changes: 27 additions & 18 deletions cuda_bindings/benchmarks/perf_test_utils.py
@@ -5,27 +5,30 @@
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
import numpy as np
import pytest

from cuda import cuda, cudart, nvrtc
import numpy as np


def ASSERT_DRV(err):
if isinstance(err, cuda.CUresult):
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
raise RuntimeError(f"Cuda Error: {err}")
elif isinstance(err, cudart.cudaError_t):
if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError('Cudart Error: {}'.format(err))
raise RuntimeError(f"Cudart Error: {err}")
elif isinstance(err, nvrtc.nvrtcResult):
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('Nvrtc Error: {}'.format(err))
raise RuntimeError(f"Nvrtc Error: {err}")
else:
raise RuntimeError('Unknown error type: {}'.format(err))
raise RuntimeError(f"Unknown error type: {err}")


@pytest.fixture
def init_cuda():
# Initialize
err, = cuda.cuInit(0)
(err,) = cuda.cuInit(0)
ASSERT_DRV(err)
err, device = cuda.cuDeviceGet(0)
ASSERT_DRV(err)
@@ -38,31 +41,37 @@ def init_cuda():

yield device, ctx, stream

err, = cuda.cuStreamDestroy(stream)
(err,) = cuda.cuStreamDestroy(stream)
ASSERT_DRV(err)
err, = cuda.cuCtxDestroy(ctx)
(err,) = cuda.cuCtxDestroy(ctx)
ASSERT_DRV(err)


@pytest.fixture
def load_module():
module = None

def _load_module(kernel_string, device):
nonlocal module
# Get module
err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)
err, major = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device
)
ASSERT_DRV(err)
err, minor = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)
err, minor = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device
)
ASSERT_DRV(err)

err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b'kernelString.cu', 0, [], [])
err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"kernelString.cu", 0, [], [])
ASSERT_DRV(err)
opts = [b'--fmad=false', bytes('--gpu-architecture=sm_' + str(major) + str(minor), 'ascii')]
err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
opts = [b"--fmad=false", bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii")]
(err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts)

err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog)
ASSERT_DRV(err_log)
log = b' ' * logSize
err_log, = nvrtc.nvrtcGetProgramLog(prog, log)
log = b" " * logSize
(err_log,) = nvrtc.nvrtcGetProgramLog(prog, log)
ASSERT_DRV(err_log)
result = log.decode()
if len(result) > 1:
@@ -71,8 +80,8 @@ def _load_module(kernel_string, device):
ASSERT_DRV(err)
err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog)
ASSERT_DRV(err)
cubin = b' ' * cubinSize
err, = nvrtc.nvrtcGetCUBIN(prog, cubin)
cubin = b" " * cubinSize
(err,) = nvrtc.nvrtcGetCUBIN(prog, cubin)
ASSERT_DRV(err)
cubin = np.char.array(cubin)
err, module = cuda.cuModuleLoadData(cubin)
@@ -82,5 +91,5 @@

yield _load_module

err, = cuda.cuModuleUnload(module)
(err,) = cuda.cuModuleUnload(module)
ASSERT_DRV(err)
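Both fixtures above use pytest's yield style: code before the yield is setup, the yielded value is handed to the test, and code after the yield runs as teardown. A stripped-down sketch of the pattern (the resource is a stand-in, illustration only):

import pytest

@pytest.fixture
def resource():
    handle = {"ready": True}  # setup (stands in for cuInit/cuCtxCreate)
    yield handle              # the test body runs here
    handle.clear()            # teardown (stands in for cuCtxDestroy)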
35 changes: 24 additions & 11 deletions cuda_bindings/benchmarks/test_cupy.py
@@ -5,42 +5,47 @@
# this software. Any use, reproduction, disclosure, or distribution of
# this software and related documentation outside the terms of the EULA
# is strictly prohibited.
import pytest
import ctypes

import pytest

# Always skip since cupy does not support CTK 12.x yet
skip_tests = True
if not skip_tests:
try:
import cupy

skip_tests = False
except ImportError:
skip_tests = True

from .kernels import kernel_string


def launch(kernel, args=()):
kernel((1,), (1,), args)
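The helper forwards to cupy's raw-kernel calling convention, kernel(grid, block, args). A self-contained sketch of the pattern these benchmarks rely on (it assumes a working cupy install, which is exactly what skip_tests guards against):

import cupy

mod = cupy.RawModule(code=r"""
extern "C" __global__ void empty_kernel() {}
""")
k = mod.get_function("empty_kernel")
k((1,), (1,), ())  # grid dims, block dims, kernel args
cupy.cuda.Stream.null.synchronize()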


# Measure launch latency with no parameters
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_empty_kernel(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('empty_kernel')
kernel = module.get_function("empty_kernel")

stream = cupy.cuda.stream.Stream(non_blocking=True)

with stream:
benchmark(launch, kernel)
stream.synchronize()
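Each test hands the launch callable to pytest-benchmark's benchmark fixture, which invokes it repeatedly and records timing statistics. Its calling convention in isolation (illustration only):

# benchmark(fn, *args) runs fn(*args) many times and returns fn's result.
def test_sum(benchmark):
    result = benchmark(sum, [1, 2, 3])
    assert result == 6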


# Measure launch latency with a single parameter
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel')
kernel = module.get_function("small_kernel")
cupy.cuda.set_allocator()
arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float))

@@ -50,12 +55,13 @@ def test_launch_latency_small_kernel(benchmark):
benchmark(launch, kernel, (arg,))
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_args(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_args')
kernel = module.get_function("small_kernel_512_args")
cupy.cuda.set_allocator()

args = []
@@ -69,12 +75,13 @@ def test_launch_latency_small_kernel_512_args(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_bools(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_bools')
kernel = module.get_function("small_kernel_512_bools")
cupy.cuda.set_allocator()

args = [True] * 512
@@ -86,12 +93,13 @@ def test_launch_latency_small_kernel_512_bools(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_doubles(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_doubles')
kernel = module.get_function("small_kernel_512_doubles")
cupy.cuda.set_allocator()

args = [1.2345] * 512
@@ -103,12 +111,13 @@ def test_launch_latency_small_kernel_512_doubles(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_ints(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_ints')
kernel = module.get_function("small_kernel_512_ints")
cupy.cuda.set_allocator()

args = [123] * 512
@@ -120,12 +129,13 @@ def test_launch_latency_small_kernel_512_ints(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_bytes(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_chars')
kernel = module.get_function("small_kernel_512_chars")
cupy.cuda.set_allocator()

args = [127] * 512
@@ -137,12 +147,13 @@ def test_launch_latency_small_kernel_512_bytes(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_512_longlongs(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_512_longlongs')
kernel = module.get_function("small_kernel_512_longlongs")
cupy.cuda.set_allocator()

args = [9223372036854775806] * 512
@@ -154,12 +165,13 @@ def test_launch_latency_small_kernel_512_longlongs(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_256_args(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_256_args')
kernel = module.get_function("small_kernel_256_args")
cupy.cuda.set_allocator()

args = []
@@ -173,12 +185,13 @@ def test_launch_latency_small_kernel_256_args(benchmark):
benchmark(launch, kernel, args)
stream.synchronize()


# Measure launch latency with many parameters using builtin parameter packing
@pytest.mark.skipif(skip_tests, reason="cupy is not installed")
@pytest.mark.benchmark(group="cupy")
def test_launch_latency_small_kernel_16_args(benchmark):
module = cupy.RawModule(code=kernel_string)
kernel = module.get_function('small_kernel_16_args')
kernel = module.get_function("small_kernel_16_args")
cupy.cuda.set_allocator()

args = []
(Diff truncated: the remaining 59 changed files are not shown.)
