diff --git a/tritonbench/components/ncu/nsys_analyzer.py b/tritonbench/components/ncu/nsys_analyzer.py
index dc4364e1..2f50c111 100644
--- a/tritonbench/components/ncu/nsys_analyzer.py
+++ b/tritonbench/components/ncu/nsys_analyzer.py
@@ -80,7 +80,9 @@ def read_nsys_report(
     # Define mapping of metrics to their values. The keys must be in nsys_bench_metrics.
     metrics_map = {
-        "nsys_kernel_durations": kernel_duration,
+        # Because tritonbench takes the median of numerical values, we need to convert
+        # the list of floats to a list of strings.
+        "nsys_kernel_durations": [str(duration) for duration in kernel_duration],
         "nsys_kernel_names": kernel_names,
         "nsys_gpu_kernel_sum": sum_kernel_duration,
         "nsys_nvtx_range_duration": nvtx_range_duration,
diff --git a/tritonbench/utils/triton_op.py b/tritonbench/utils/triton_op.py
index 2184cda0..a03048a5 100644
--- a/tritonbench/utils/triton_op.py
+++ b/tritonbench/utils/triton_op.py
@@ -67,7 +67,12 @@ class BenchmarkOperatorBackend:
 REGISTERED_METRICS: Dict[str, List[str]] = {}
 REGISTERED_X_VALS: Dict[str, str] = {}
 BASELINE_BENCHMARKS: Dict[str, str] = {}
-BASELINE_SKIP_METRICS = {"speedup", "accuracy", "mem_footprint_compression_ratio"}
+BASELINE_SKIP_METRICS = {
+    "speedup",
+    "accuracy",
+    "mem_footprint_compression_ratio",
+    "nsys_gpu_speedup",
+}
 X_ONLY_METRICS = set(["hw_roofline"])
 PRECISION_DTYPE_MAPPING = {
     "fp32": torch.float32,
@@ -1094,7 +1099,17 @@ def _init_extra_metrics() -> Dict[str, Any]:
                 )
                 for metric_name, metric_value in nsys_analyzer_results.items():
                     metrics.extra_metrics[metric_name] = metric_value
-
+            if "nsys_gpu_speedup" in self.required_metrics:
+                metrics.nsys_gpu_speedup = (
+                    self.baseline_metrics.nsys_gpu_kernel_sum
+                    / metrics.nsys_gpu_kernel_sum
+                    if (
+                        self.baseline_metrics
+                        and self.baseline_metrics.nsys_gpu_kernel_sum
+                    )
+                    and metrics.nsys_gpu_kernel_sum
+                    else None
+                )
             if "kineto_trace" in self.required_metrics:
                 metrics.kineto_trace = self.kineto_trace(input_id, fn)
             if "best_config" in self.required_metrics:
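
Reviewer note (not part of the patch): a minimal standalone sketch of the guard logic the second hunk adds, assuming hypothetical names (`nsys_gpu_speedup`, `baseline_kernel_sum`, `candidate_kernel_sum`) that do not exist in the repo. It shows the same idea as the conditional expression in `triton_op.py`: the metric is the baseline-to-candidate ratio of summed GPU kernel durations, and it degrades to `None` when either sum is missing or zero rather than dividing by zero.

```python
from typing import Optional


def nsys_gpu_speedup(
    baseline_kernel_sum: Optional[float],
    candidate_kernel_sum: Optional[float],
) -> Optional[float]:
    """Ratio of baseline to candidate summed GPU kernel time from nsys.

    Mirrors the guard in the diff above: if either sum is unavailable
    (no baseline run, metric not collected) or zero, report None.
    """
    if not baseline_kernel_sum or not candidate_kernel_sum:
        return None
    return baseline_kernel_sum / candidate_kernel_sum


# Example: baseline kernels took 12.0 ms total, candidate took 8.0 ms -> 1.5x
assert nsys_gpu_speedup(12.0, 8.0) == 1.5
assert nsys_gpu_speedup(None, 8.0) is None
```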