diff --git a/tritonbench/components/ncu/__init__.py b/tritonbench/components/ncu/__init__.py index f3b6a8c..2d64aa6 100644 --- a/tritonbench/components/ncu/__init__.py +++ b/tritonbench/components/ncu/__init__.py @@ -65,7 +65,7 @@ def do_bench_in_task( x.grad = None # we clear the L2 cache before run cache.zero_() - with cuda_profiler_range(use_cuda_profiler_range), torch.cuda.nvtx.range( - range_name - ): + with cuda_profiler_range(use_cuda_profiler_range): + nvtx_range_id = torch.cuda.nvtx.range_start(range_name) fn() + torch.cuda.nvtx.range_end(nvtx_range_id) diff --git a/tritonbench/utils/triton_op.py b/tritonbench/utils/triton_op.py index 78fef3b..b0bc502 100644 --- a/tritonbench/utils/triton_op.py +++ b/tritonbench/utils/triton_op.py @@ -1489,7 +1489,8 @@ def service_exists(service_name): "ncu", "--nvtx", "--nvtx-include", - f"{_RANGE_NAME}/", + # No trailing slash: the benchmark is wrapped with nvtx range_start/range_end, and ncu matches start/end ranges by bare name (a trailing "/" selects a push/pop range). + f"{_RANGE_NAME}", "--target-processes", "all", "--import-source",