diff --git a/tritonbench/components/ncu/__init__.py b/tritonbench/components/ncu/__init__.py
index f3b6a8c..2d64aa6 100644
--- a/tritonbench/components/ncu/__init__.py
+++ b/tritonbench/components/ncu/__init__.py
@@ -65,7 +65,7 @@ def do_bench_in_task(
             x.grad = None
     # we clear the L2 cache before run
     cache.zero_()
-    with cuda_profiler_range(use_cuda_profiler_range), torch.cuda.nvtx.range(
-        range_name
-    ):
+    with cuda_profiler_range(use_cuda_profiler_range):
+        nvtx_range_id = torch.cuda.nvtx.range_start(range_name)
         fn()
+        torch.cuda.nvtx.range_end(nvtx_range_id)
diff --git a/tritonbench/utils/triton_op.py b/tritonbench/utils/triton_op.py
index 78fef3b..b0bc502 100644
--- a/tritonbench/utils/triton_op.py
+++ b/tritonbench/utils/triton_op.py
@@ -1489,7 +1489,8 @@ def service_exists(service_name):
             "ncu",
             "--nvtx",
             "--nvtx-include",
-            f"{_RANGE_NAME}/",
+            # The filter targets a range_start/range_end range, so the name has no trailing "/".
+            f"{_RANGE_NAME}",
             "--target-processes",
             "all",
             "--import-source",