# .github/workflows/triton-benchmarks.yml

# Display name of the workflow.
name: 'Triton benchmarks'

# Triggers: a nightly scheduled run and a manual dispatch with overrides.
on:
  schedule:
    # Minute 5 of hour 23, every day (GitHub evaluates cron in UTC).
    - cron: '5 23 * * *'
  workflow_dispatch:
    inputs:
      runner_label:
        type: string
        default: ''
        description: 'Runner label, keep empty for default'
      tag:
        type: string
        default: 'test'
        description: 'Tag for benchmark results'
      install_ipex:
        type: boolean
        default: true
        description: 'Install Intel PyTorch Extension'

# The workflow token gets read-only access to all scopes.
permissions: read-all

# Variables visible to every step of every job in this workflow.
env:
  PYTHON_VERSION: '3.10'
  # '1' on scheduled runs and on manual runs with install_ipex enabled,
  # '0' otherwise (mirrors the condition used by the IPEX setup steps).
  USE_IPEX: ${{ (github.event_name == 'schedule' || inputs.install_ipex) && '1' || '0' }}

jobs:
  build:
    name: Triton benchmarks
    # Runner label from the manual dispatch input; 'max1550' when it is empty
    # (which is always the case for scheduled runs).
    runs-on:
      - "${{ inputs.runner_label || 'max1550' }}"
    # 720 minutes = 12 hours.
    timeout-minutes: 720
    defaults:
      run:
        # Every `run` step first sources the oneAPI environment script (its
        # output is discarded) and then sources the step script ({0}).
        # The folded scalar below joins into a single command line.
        shell: >-
          bash -noprofile --norc -eo pipefail -c
          "source /home/runner/intel/oneapi/setvars.sh > /dev/null; source {0}"
    steps:
      # Checkout comes first: later steps reference actions by local path
      # (./.github/actions/...), which only exist once the repository is present.
      - uses: actions/checkout@v4
        name: Checkout repository

      - id: pip-cache
        name: Load pip cache
        uses: ./.github/actions/load
        with:
          # The cache is keyed per commit id, just to minimize network traffic.
          # NOTE(review): both values are passed as literal strings; the shell
          # variables are presumably expanded inside the load action - confirm.
          key: 'pip-$PYTHON_VERSION-$GITHUB_SHA'
          path: '$HOME/.cache/pip'

      - uses: actions/setup-python@v5
        name: Install Python
        with:
          # Version comes from the workflow-level env block.
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Python build dependencies
        run: pip install wheel cmake

      # Exactly one of the two PyTorch setup steps runs: the "with IPEX" variant
      # on scheduled runs or when install_ipex is enabled, the other otherwise.
      - name: Setup PyTorch with IPEX
        if: github.event_name == 'schedule' || inputs.install_ipex
        uses: ./.github/actions/setup-pytorch
        with:
          repository: 'Stonepia/pytorch'

      - name: Setup PyTorch without IPEX
        # Logical negation of the condition above (De Morgan form).
        if: github.event_name != 'schedule' && !inputs.install_ipex
        uses: ./.github/actions/setup-pytorch
        with:
          repository: 'pytorch/pytorch'

      # Same condition as "Setup PyTorch with IPEX".
      - name: Setup IPEX
        if: github.event_name == 'schedule' || inputs.install_ipex
        uses: ./.github/actions/setup-ipex

      # The build command is handed to the local setup-triton action.
      - uses: ./.github/actions/setup-triton
        name: Build Triton wheels
        with:
          command: 'DEBUG=1 python setup.py bdist_wheel'

      # Installs whatever wheel(s) are found under python/dist.
      - name: Install Triton
        run: pip install python/dist/*.whl

      - name: Install benchmark dependencies
        run: pip install matplotlib pandas tabulate

      - name: Create reports dir
        run: |
          # Create ./reports and publish its absolute path as $REPORTS to all
          # subsequent steps through the GITHUB_ENV file.
          mkdir reports
          printf 'REPORTS=%s\n' "$PWD/reports" >> $GITHUB_ENV

      # The benchmark steps below gate on this step's outcome via its id.
      - id: install
        name: Install benchmarks
        run: |
          cd benchmarks
          python setup.py install

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton Softmax kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python fused_softmax.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          # One report per compiler (triton, xetla), both built from the same CSV.
          python ../../scripts/build_report.py \
            "$REPORTS/softmax-performance.csv" "$REPORTS/softmax-triton-report.csv" \
            --benchmark softmax --compiler triton --param_cols "N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py \
            "$REPORTS/softmax-performance.csv" "$REPORTS/softmax-xetla-report.csv" \
            --benchmark softmax --compiler xetla --param_cols "N" \
            --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton GEMM kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          # One report per compiler (triton, xetla), both built from the same CSV.
          python ../../scripts/build_report.py \
            "$REPORTS/matmul-performance.csv" "$REPORTS/gemm-triton-report.csv" \
            --benchmark gemm --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py \
            "$REPORTS/matmul-performance.csv" "$REPORTS/gemm-xetla-report.csv" \
            --benchmark gemm --compiler xetla --param_cols "B,M,K,N" \
            --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton GEMM kernel benchmark - default path
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          # Default path: the variable prefixes below apply to this single
          # benchmark invocation only.
          TRITON_INTEL_ADVANCED_PATH=0 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra" \
          IGC_DisableLoopUnroll=1 \
          python gemm_benchmark.py --reports "$REPORTS"

          # The '-dflt' suffix distinguishes these results from the plain GEMM run.
          TAG="${BENCHMARK_TAG}-dflt"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            "$REPORTS/matmul-performance.csv" "$REPORTS/gemm-triton-default-report.csv" \
            --benchmark gemm --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton GEMM kernel benchmark - advanced path
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          # Advanced path: the variable prefixes below apply to this single
          # benchmark invocation only.
          TRITON_INTEL_ADVANCED_PATH=1 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra" \
          IGC_DisableLoopUnroll=1 \
          python gemm_benchmark.py --reports "$REPORTS"

          # The '-adv' suffix distinguishes these results from the plain GEMM run.
          TAG="${BENCHMARK_TAG}-adv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            "$REPORTS/matmul-performance.csv" "$REPORTS/gemm-triton-advanced-report.csv" \
            --benchmark gemm --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton GEMM + PreOp (exp) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_preop_exp_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py \
            "$REPORTS/matmul-performance.csv" "$REPORTS/gemm-preop-exp-triton-report.csv" \
            --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_postop_gelu_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py \
            "$REPORTS/matmul-performance.csv" "$REPORTS/gemm-postop-gelu-triton-report.csv" \
            --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_postop_addmatrix_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py \
            "$REPORTS/matmul-performance.csv" "$REPORTS/gemm-postop-addmatrix-triton-report.csv" \
            --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton FA kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python flash_attention_fwd_benchmark.py --reports "$REPORTS"

          TAG="$BENCHMARK_TAG"
          source ../../scripts/capture-hw-details.sh
          # One report per compiler (triton, xetla), both built from the same CSV.
          python ../../scripts/build_report.py \
            "$REPORTS/attn-performance.csv" "$REPORTS/attn-triton-report.csv" \
            --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py \
            "$REPORTS/attn-performance.csv" "$REPORTS/attn-xetla-report.csv" \
            --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD" \
            --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton FA kernel benchmark - default path
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          # Default path: the variable prefixes below apply to this single
          # benchmark invocation only.
          TRITON_INTEL_ADVANCED_PATH=0 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra -printregusage -DPASTokenReduction -enableHalfLSC -abiver 2" \
          IGC_DisableLoopUnroll=1 \
          python flash_attention_fwd_benchmark.py --reports "$REPORTS"

          # The '-dflt' suffix distinguishes these results from the plain FA run.
          TAG="${BENCHMARK_TAG}-dflt"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            "$REPORTS/attn-performance.csv" "$REPORTS/attn-triton-default-report.csv" \
            --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Triton FA kernel benchmark - advanced path
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          # Advanced path: the variable prefixes below apply to this single
          # benchmark invocation only.
          TRITON_INTEL_ADVANCED_PATH=1 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra -printregusage -DPASTokenReduction -enableHalfLSC -abiver 2" \
          IGC_DisableLoopUnroll=1 \
          python flash_attention_fwd_benchmark.py --reports "$REPORTS"

          # The '-adv' suffix distinguishes these results from the plain FA run.
          TAG="${BENCHMARK_TAG}-adv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            "$REPORTS/attn-performance.csv" "$REPORTS/attn-triton-advanced-report.csv" \
            --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      # Runs whenever the benchmarks were installed and the run was not
      # cancelled, even if an earlier benchmark step failed.
      - name: Run Prefix Sums kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        env:
          # The dispatch input is passed through the environment rather than
          # expanded into the script text, so its content is never parsed as
          # shell code. Falls back to 'ci' when no tag input exists (schedule).
          BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python prefix_sums.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py \
            "$REPORTS/prefix-sums.csv" "$REPORTS/prefix_sums-triton-report.csv" \
            --benchmark prefix_sums --compiler triton --param_cols "N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      # Gated like the kernel benchmarks: needs a successful install and a
      # non-cancelled run, but not success of the preceding benchmark steps.
      - name: Run micro benchmark
        if: steps.install.outcome == 'success' && !cancelled()
        run: |
          cd benchmarks/micro_benchmarks
          python run_benchmarks.py --reports ${REPORTS}

      # Only write the cache back when the earlier load step reported a miss;
      # path and dest are taken from that step's outputs.
      - name: Save pip cache
        if: steps.pip-cache.outputs.status == 'miss'
        uses: ./.github/actions/save
        with:
          dest: ${{ steps.pip-cache.outputs.dest }}
          path: ${{ steps.pip-cache.outputs.path }}

      # Publishes the reports directory as the 'benchmark-reports' artifact,
      # including when some benchmark steps failed.
      - name: Upload benchmark reports
        if: steps.install.outcome == 'success' && !cancelled()
        uses: actions/upload-artifact@v4
        with:
          path: reports
          name: benchmark-reports