From 5d939abffab479391d03e8a2b60ee5bda0c2e17a Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Tue, 17 Dec 2024 21:22:46 +0000 Subject: [PATCH] Remove autotune one_matrix_per_load_for_bt Signed-off-by: Whitney Tsang --- .github/workflows/triton-benchmarks.yml | 113 ------------------ .../flash_attention_fwd_benchmark.py | 2 +- .../MatchTargetSize.cpp | 8 +- 3 files changed, 5 insertions(+), 118 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 659eedee72..9752b0200d 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -138,105 +138,6 @@ jobs: cd benchmarks python setup.py install - - name: Run Triton Softmax kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} - run: | - cd benchmarks/triton_kernels_benchmark - python fused_softmax.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - - - name: Run Triton GEMM kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_benchmark.py --reports $REPORTS - mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv - - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - - - name: Run Triton GEMM kernel benchmark - advanced path - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_advanced') }} - run: | - cd benchmarks/triton_kernels_benchmark - # Advanced path: - TRITON_INTEL_ADVANCED_PATH=1 \ - IGC_VISAOptions=" -enableBCR -nolocalra" \ - IGC_DisableLoopUnroll=1 \ - python gemm_benchmark.py --reports $REPORTS - mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv - - source ../../scripts/capture-hw-details.sh - TAG="${TAG}-adv" - python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton GEMM (A@B^t) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} - run: | - cd benchmarks/triton_kernels_benchmark - TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS - mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv - source ../../scripts/capture-hw-details.sh - - python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG - - - name: Run Triton GEMM (A^t@B) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} - run: | - cd benchmarks/triton_kernels_benchmark - TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS - mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv - source ../../scripts/capture-hw-details.sh - - python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG - - - name: Run Triton GEMM (stream-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_streamk_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-xetla-report.csv --benchmark gemm-streamk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - - - name: Run Triton GEMM (split-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_splitk_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-xetla-report.csv --benchmark gemm-splitk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - - - name: Run Triton GEMM + PreOp (exp) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_preop_exp_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_postop_gelu_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark.py') }} - run: | - cd benchmarks/triton_kernels_benchmark - python gemm_postop_addmatrix_benchmark.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - name: Run Triton FA kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }} run: | @@ -259,20 +160,6 @@ jobs: source ../../scripts/capture-hw-details.sh python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - name: Run Prefix Sums kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} - run: | - cd benchmarks/triton_kernels_benchmark - python prefix_sums.py --reports $REPORTS - source ../../scripts/capture-hw-details.sh - python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - - - name: Run micro benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} - run: | - cd benchmarks/micro_benchmarks - python run_benchmarks.py --reports $REPORTS - - name: Save pip cache if: ${{ steps.pip-cache.outputs.status == 'miss' }} uses: ./.github/actions/save diff --git a/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py b/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py index efb4987cb5..5e2a47b2bc 100644 --- a/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py @@ -157,7 +157,7 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out, # configs = [ - triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large', 'one_matrix_per_load_for_bt': True}, num_stages=s, num_warps=w) \ + triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large'}, num_stages=s, num_warps=w) \ for BM in [128, 256] \ for BN in [32, 64] \ for s in [3, 4] \ diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/MatchTargetSize.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/MatchTargetSize.cpp index 1e92cd777b..19014e4023 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/MatchTargetSize.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/MatchTargetSize.cpp @@ -609,10 +609,10 @@ void MatchTargetSizePass::initNativeOperationSizes(Workload workload) { nativeSizes.setDotShape(32, {8, 16, 8}); nativeSizes.setBlockMemShape(8, {16, 64, 32, 32}); - if (workload == Workload::Attention) - nativeSizes.setBlockMemShape(16, {32, 32, 32, 16}); - else - nativeSizes.setBlockMemShape(16, {32, 32, 32, 32}); + //if (workload == Workload::Attention) + // nativeSizes.setBlockMemShape(16, {32, 32, 32, 16}); + //else + nativeSizes.setBlockMemShape(16, {32, 32, 32, 32}); nativeSizes.setBlockMemShape(32, {8, 8, 8, 16});