diff --git a/benchmarks/triton_kernels_benchmark/benchmark_testing.py b/benchmarks/triton_kernels_benchmark/benchmark_testing.py index b85580dcbf..e840d4769c 100644 --- a/benchmarks/triton_kernels_benchmark/benchmark_testing.py +++ b/benchmarks/triton_kernels_benchmark/benchmark_testing.py @@ -37,7 +37,7 @@ def _summarize_statistics(times, quantiles, return_mode): def do_bench_ipex(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quantiles=None, return_mode="mean", device="xpu", - sync_submitting=True, kernel_name=None): # pylint: disable=unused-argument + sync_submitting=True): """ Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with the 20-th and 80-th performance percentile. @@ -108,7 +108,7 @@ def extract_kernels(funcs): def do_bench_elapsed_time(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quantiles=None, return_mode="mean", - device="xpu", kernel_name=None): # pylint: disable=unused-argument + device="xpu"): """ Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with the 20-th and 80-th performance percentile. @@ -160,7 +160,7 @@ def do_bench_elapsed_time(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quan def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quantiles=None, - return_mode="mean", device="xpu", sync_submitting=True, kernel_name=None): # pylint: disable=unused-argument + return_mode="mean", device="xpu", sync_submitting=True): """ Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with the 20-th and 80-th performance percentile. diff --git a/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py b/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py index dc073d0e5c..8604824cda 100644 --- a/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py @@ -256,8 +256,7 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, provider): ), attn_mask=None, dropout_p=0.0, is_causal=CAUSAL, scale=sm_scale).to(torch.float32) atol = 1e-1 if N_CTX == 16384 else 1e-2 benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=atol, rtol=1e-3, err_msg='triton to torch') - _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles, - kernel_name='_attn_fwd') + _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles) elif provider == 'xetla': module_name = f'flash_attn_causal_{CAUSAL}'.lower() @@ -272,8 +271,7 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, provider): l = torch.empty((size_ml, ), device='xpu', dtype=torch.float) xetla_fn = lambda: func(q, k, v, out, dropout_mask, bias, m, l, Z, H, D_HEAD, N_CTX, N_CTX, sm_scale) - _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10, quantiles=quantiles, - kernel_name='gpu::xetla::fmha::FmhaForwardKernel<') + _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10, quantiles=quantiles) else: raise NotImplementedError(f'Unsupported provider {provider}') diff --git a/benchmarks/triton_kernels_benchmark/fused_softmax.py b/benchmarks/triton_kernels_benchmark/fused_softmax.py index 3f17ac4a55..b12ed819f7 100644 --- a/benchmarks/triton_kernels_benchmark/fused_softmax.py +++ b/benchmarks/triton_kernels_benchmark/fused_softmax.py @@ -131,8 +131,7 @@ def benchmark(M, N, provider): triton_fn = lambda: softmax(x, out) torch_fn = lambda: torch.softmax(x, axis=-1) benchmark_suit.assert_close(triton_fn(), torch_fn(), err_msg="triton to torch") - _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, quantiles=quantiles, n_warmup=10, n_repeat=10, - kernel_name="softmax_kernel") + _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, quantiles=quantiles, n_warmup=10, n_repeat=10) elif provider == "torch-jit": _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(lambda: naive_softmax(x), quantiles=quantiles, @@ -145,17 +144,7 @@ def benchmark(M, N, provider): xetla_fn = lambda: func(x, out, 0) torch_fn = lambda: torch.softmax(x, axis=-1) # benchmark_suit.assert_close(xetla_fn(), torch_fn(), err_msg="xetla to torch") - kernels_name = { - "softmax_shape_4096_256": "mat1_4096x256_bf16_cfg0", - "softmax_shape_4096_1024": "mat1_4096x1024_bf16_cfg0", - "softmax_shape_4096_2048": "mat1_4096x2048_bf16_cfg0", - "softmax_shape_4096_4096": "mat1_4096x4096_bf16_cfg0", - "softmax_shape_4096_8192": "mat1_4096x8k_bf16_cfg0", - "softmax_shape_4096_16384": "mat1_4096x16k_bf16_cfg0", - "softmax_shape_4096_32768": "mat1_4096x32k_bf16_cfg0", - } - _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, quantiles=quantiles, n_warmup=10, n_repeat=10, - kernel_name=kernels_name[name]) + _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, quantiles=quantiles, n_warmup=10, n_repeat=10) else: raise NotImplementedError(f"Unsupported provider {provider}") diff --git a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py index 7e0b339dc6..c58313263c 100644 --- a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py @@ -284,7 +284,7 @@ def benchmark(B, M, N, K, provider): # Legacy profiler shows ~6000TFLOPS GeoMean for onednn measurements, so use more reliable method do_bench = do_bench_elapsed_time _, min_ms, max_ms, mean_ms, cv = do_bench(lambda: torch.matmul(torch_a, torch_b), n_warmup=10, n_repeat=10, - quantiles=quantiles, kernel_name='gemm_kernel') + quantiles=quantiles) elif provider == 'triton': assert len(a.shape) == len(b.shape), 'Incompatible sizes' if len(a.shape) == 3: @@ -297,8 +297,7 @@ def benchmark(B, M, N, K, provider): rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3 benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch') _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, - quantiles=quantiles, - kernel_name='matmul_kernel_with_block_pointers') + quantiles=quantiles) elif provider == 'xetla': if B == 1: c = torch.empty((M, N), device='xpu', dtype=torch.float32) @@ -317,37 +316,9 @@ def benchmark(B, M, N, K, provider): xetla_fn = lambda: func(a, b, c, acc, cnt) torch_fn = lambda: torch.matmul(a, b).to(torch.float32) - kernels_name = { - 'gemm_shape_1_1024_1024_1024': 'Test_1x1024x1024x1024_row_row', - 'gemm_shape_1_2048_2048_2048': 'Test_1x2048x2048x2048_row_row', - 'gemm_shape_1_4096_4096_4096': 'Test_1x4096x4096x4096_row_row', - 'gemm_shape_1_8192_8192_8192': 'Test_1x8192x8192x8192_row_row', - 'gemm_shape_1_1_5120_13824': 'Test_1x1x5120x13824_row_row', - 'gemm_shape_1_4_4096_12288': 'Test_1x4x4096x12288_row_row', - 'gemm_shape_1_512_8192_8192': 'Test_1x512x8192x8192_row_row', - 'gemm_shape_1_512_8192_32768': 'Test_1x512x8192x32768_row_row', - 'gemm_shape_1_512_32768_8192': 'Test_1x512x32768x8192_row_row', - 'gemm_shape_1_1024_16384_8192': 'Test_1x1024x16384x8192_row_row', - 'gemm_shape_1_1024_28672_8192': 'Test_1x1024x28672x8192_row_row', - 'gemm_shape_1_3072_4096_3072': 'Test_1x3072x4096x3072_row_row', - 'gemm_shape_1_4096_16384_8192': 'Test_1x4096x16384x8192_row_row', - 'gemm_shape_1_8192_16384_1024': 'Test_1x8192x16384x1024_row_row', - 'gemm_shape_1_8192_16384_4096': 'Test_1x8192x16384x4096_row_row', - 'gemm_shape_1_16384_1024_8192': 'Test_1x16384x1024x8192_row_row', - 'gemm_shape_1_16384_4096_8192': 'Test_1x16384x4096x8192_row_row', - 'gemm_shape_1_16384_8192_1024': 'Test_1x16384x8192x1024_row_row', - 'gemm_shape_1_16384_8192_4096': 'Test_1x16384x8192x4096_row_row', - 'gemm_shape_4_32768_128_4096': 'Test_4x32768x128x4096_row_row', - 'gemm_shape_4_32768_4096_128': 'Test_4x32768x4096x128_row_row', - 'gemm_shape_32_4096_4096_128': 'Test_32x4096x4096x128_row_row', - 'gemm_shape_4096_8_128_16384': 'Test_4096x8x128x16384_row_row', - 'gemm_shape_4096_8_16384_128': 'Test_4096x8x16384x128_row_row', - 'gemm_streamk_shape_3072_4096_3072': 'stream_k_gemm_run', - } - # benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch') _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10, - quantiles=quantiles, kernel_name=kernels_name[name]) + quantiles=quantiles) else: raise NotImplementedError(f'Unsupported provider {provider}') diff --git a/benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py index 307100dcfe..cefbd5abc9 100644 --- a/benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py @@ -266,17 +266,15 @@ def benchmark(B, M, N, K, provider): assert len(a.shape) == len(b.shape), 'Incompatible sizes' if len(a.shape) == 3: c = torch.empty((B, M, N), device='xpu', dtype=torch.float32) - kernel_name = 'matmul_kernel_with_block_pointers_batched' else: assert len(a.shape) == 2, 'Expecting shape of length 2' c = torch.empty((M, N), device='xpu', dtype=torch.float32) - kernel_name = 'matmul_kernel_with_block_pointers' triton_fn = lambda: matmul(a, b, d, c) torch_fn = lambda: torch.matmul(a, b).to(torch.float32) + d rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3 benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch') _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, - quantiles=quantiles, kernel_name=kernel_name) + quantiles=quantiles) else: raise NotImplementedError(f'Unsupported provider {provider}') diff --git a/benchmarks/triton_kernels_benchmark/gemm_postop_gelu_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_postop_gelu_benchmark.py index 85bb594ade..68cec3931e 100644 --- a/benchmarks/triton_kernels_benchmark/gemm_postop_gelu_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/gemm_postop_gelu_benchmark.py @@ -268,17 +268,15 @@ def benchmark(B, M, N, K, provider): assert len(a.shape) == len(b.shape), 'Incompatible sizes' if len(a.shape) == 3: c = torch.empty((B, M, N), device='xpu', dtype=torch.float32) - kernel_name = 'matmul_kernel_with_block_pointers_batched' else: assert len(a.shape) == 2, 'Expecting shape of length 2' c = torch.empty((M, N), device='xpu', dtype=torch.float32) - kernel_name = 'matmul_kernel_with_block_pointers' triton_fn = lambda: matmul(a, b, c) torch_fn = lambda: torch.nn.functional.gelu(torch.matmul(a, b).to(torch.float32)) rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3 benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch') _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, - quantiles=quantiles, kernel_name=kernel_name) + quantiles=quantiles) else: raise NotImplementedError(f'Unsupported provider {provider}') diff --git a/benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py index 30ed124d44..dd5b57c84f 100644 --- a/benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py @@ -256,17 +256,15 @@ def benchmark(B, M, N, K, provider): assert len(a.shape) == len(b.shape), 'Incompatible sizes' if len(a.shape) == 3: c = torch.empty((B, M, N), device='xpu', dtype=torch.float32) - kernel_name = 'matmul_kernel_with_block_pointers_batched' else: assert len(a.shape) == 2, 'Expecting shape of length 2' c = torch.empty((M, N), device='xpu', dtype=torch.float32) - kernel_name = 'matmul_kernel_with_block_pointers' triton_fn = lambda: matmul(a, b, c) torch_fn = lambda: torch.matmul(torch.exp(a), b).to(torch.float32) rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3 benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch') _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, - quantiles=quantiles, kernel_name=kernel_name) + quantiles=quantiles) else: raise NotImplementedError(f'Unsupported provider {provider}') diff --git a/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py index 4aa1910591..4eb4c2b3e8 100644 --- a/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py @@ -156,8 +156,7 @@ def benchmark(M, N, K, provider): torch_fn = lambda: torch.matmul(a, b).to(torch.float32) rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3 benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch') - _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles, - kernel_name='_kernel') + _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles) else: raise NotImplementedError(f'Unsupported provider {provider}') diff --git a/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py index a495dca749..6969506e65 100644 --- a/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py @@ -280,8 +280,7 @@ def benchmark(M, N, K, provider): torch_fn = lambda: torch.matmul(a, b).to(torch.float32) benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=1e-2, err_msg='triton to torch') _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, - quantiles=quantiles, - kernel_name=['first_wave', 'full_tiles']) + quantiles=quantiles) elif provider == 'xetla': c = torch.empty((M, N), device='xpu', dtype=torch.float32) acc = torch.empty((M, N), device='xpu', dtype=torch.float32) @@ -294,7 +293,7 @@ def benchmark(M, N, K, provider): # benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch') _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10, - quantiles=quantiles, kernel_name='stream_k_gemm_run') + quantiles=quantiles) else: raise NotImplementedError(f'Unsupported provider {provider}') diff --git a/benchmarks/triton_kernels_benchmark/prefix_sums.py b/benchmarks/triton_kernels_benchmark/prefix_sums.py index 8f17fb9e9f..bb3d2069f0 100644 --- a/benchmarks/triton_kernels_benchmark/prefix_sums.py +++ b/benchmarks/triton_kernels_benchmark/prefix_sums.py @@ -44,8 +44,7 @@ def benchmark(M, N, AXIS, provider): if provider == 'triton': triton_fn = lambda: scan_kernel[(1, )](x, BLOCK_SIZE_M=M, BLOCK_SIZE_N=N, AXIS=AXIS) - _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, quantiles=quantiles, - kernel_name='scan_kernel') + _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, quantiles=quantiles) else: raise NotImplementedError(f'Unsupported provider {provider}')