Skip to content

Commit

Permalink
Remove autotune one_matrix_per_load_for_bt
Browse files Browse the repository at this point in the history
Signed-off-by: Whitney Tsang <[email protected]>
  • Loading branch information
whitneywhtsang committed Dec 17, 2024
1 parent 571b5b8 commit 5d939ab
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 118 deletions.
113 changes: 0 additions & 113 deletions .github/workflows/triton-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,105 +138,6 @@ jobs:
cd benchmarks
python setup.py install
- name: Run Triton Softmax kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
run: |
cd benchmarks/triton_kernels_benchmark
python fused_softmax.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton GEMM kernel benchmark - advanced path
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_advanced') }}
run: |
cd benchmarks/triton_kernels_benchmark
# Advanced path:
TRITON_INTEL_ADVANCED_PATH=1 \
IGC_VISAOptions=" -enableBCR -nolocalra" \
IGC_DisableLoopUnroll=1 \
python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv
source ../../scripts/capture-hw-details.sh
TAG="${TAG}-adv"
python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM (A@B^t) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
run: |
cd benchmarks/triton_kernels_benchmark
TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
- name: Run Triton GEMM (A^t@B) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
run: |
cd benchmarks/triton_kernels_benchmark
TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG
- name: Run Triton GEMM (stream-k) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_streamk_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-xetla-report.csv --benchmark gemm-streamk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton GEMM (split-k) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_splitk_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-xetla-report.csv --benchmark gemm-splitk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
- name: Run Triton GEMM + PreOp (exp) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_preop_exp_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_postop_gelu_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark.py') }}
run: |
cd benchmarks/triton_kernels_benchmark
python gemm_postop_addmatrix_benchmark.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Triton FA kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }}
run: |
Expand All @@ -259,20 +160,6 @@ jobs:
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run Prefix Sums kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
run: |
cd benchmarks/triton_kernels_benchmark
python prefix_sums.py --reports $REPORTS
source ../../scripts/capture-hw-details.sh
python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
- name: Run micro benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
run: |
cd benchmarks/micro_benchmarks
python run_benchmarks.py --reports $REPORTS
- name: Save pip cache
if: ${{ steps.pip-cache.outputs.status == 'miss' }}
uses: ./.github/actions/save
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out, #


configs = [
triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large', 'one_matrix_per_load_for_bt': True}, num_stages=s, num_warps=w) \
triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large'}, num_stages=s, num_warps=w) \
for BM in [128, 256] \
for BN in [32, 64] \
for s in [3, 4] \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -609,10 +609,10 @@ void MatchTargetSizePass::initNativeOperationSizes(Workload workload) {
nativeSizes.setDotShape(32, {8, 16, 8});

nativeSizes.setBlockMemShape(8, {16, 64, 32, 32});
if (workload == Workload::Attention)
nativeSizes.setBlockMemShape(16, {32, 32, 32, 16});
else
nativeSizes.setBlockMemShape(16, {32, 32, 32, 32});
//if (workload == Workload::Attention)
// nativeSizes.setBlockMemShape(16, {32, 32, 32, 16});
//else
nativeSizes.setBlockMemShape(16, {32, 32, 32, 32});

nativeSizes.setBlockMemShape(32, {8, 8, 8, 16});

Expand Down

0 comments on commit 5d939ab

Please sign in to comment.