# Triton benchmarks #183
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Triton benchmarks workflow.
#
# Runs three independent benchmark jobs (softmax + micro benchmarks, flash
# attention, GEMM). Each job installs Python, PyTorch and IPEX, builds or
# restores Triton wheels, runs its benchmarks and uploads the contents of the
# `reports` directory as an artifact.
name: Triton benchmarks

on:
  # Manual runs may target a specific runner.
  workflow_dispatch:
    inputs:
      runner_label:
        description: Runner label, keep empty for default
        type: string
        default: ""
  # Nightly run at 23:05 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: "5 23 * * *"

permissions: read-all

env:
  # Quoted so YAML does not read the version as the float 3.1.
  PYTHON_VERSION: "3.10"

jobs:
  # Softmax kernel benchmark plus the micro benchmarks.
  build:
    name: Triton benchmarks
    runs-on:
      # Falls back to the 'max1550' label when no runner_label input is given
      # (always the case for scheduled runs).
      - ${{ inputs.runner_label || 'max1550' }}
    timeout-minutes: 720
    defaults:
      run:
        # Every `run` step sources the oneAPI environment script first.
        shell: bash -noprofile --norc -eo pipefail -c "source /home/runner/intel/oneapi/setvars.sh > /dev/null; source {0}"
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Load pip cache
        id: pip-cache
        uses: ./.github/actions/load
        with:
          # NOTE(review): path and key are shell-style variables, not `${{ }}`
          # expressions; presumably the load action expands them in its own
          # shell. Confirm against .github/actions/load.
          path: $HOME/.cache/pip
          # pip cache per commit id just to minimize network traffic
          key: pip-$PYTHON_VERSION-$GITHUB_SHA

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Python build dependencies
        run: |
          pip install wheel

      - name: Setup PyTorch
        uses: ./.github/actions/setup-pytorch

      - name: Setup IPEX
        uses: ./.github/actions/setup-ipex

      - name: Generate Triton cache key
        id: triton-key
        run: |
          # sha256sum prints "<hash>  -"; keep only the hash field.
          COMPOSITE_KEY=$(echo $PYTHON_VERSION $GITHUB_SHA | sha256sum - | cut -d' ' -f1)
          echo "key=triton-$COMPOSITE_KEY" >> $GITHUB_OUTPUT

      - name: Load Triton wheels from a cache
        id: triton-cache
        uses: ./.github/actions/load
        with:
          path: python/dist
          key: ${{ steps.triton-key.outputs.key }}

      # Only build when the cache lookup above reported a miss.
      - name: Build Triton wheels
        if: ${{ steps.triton-cache.outputs.status == 'miss' }}
        uses: ./.github/actions/setup-triton
        with:
          command: DEBUG=1 python setup.py bdist_wheel

      - name: Install Triton
        run: |
          pip install python/dist/*.whl

      # Store freshly built wheels using the path/dest reported by the load step.
      - name: Save Triton wheels to a cache
        if: ${{ steps.triton-cache.outputs.status == 'miss' }}
        uses: ./.github/actions/save
        with:
          path: ${{ steps.triton-cache.outputs.path }}
          dest: ${{ steps.triton-cache.outputs.dest }}

      - name: Install benchmark dependencies
        run: |
          pip install matplotlib pandas tabulate

      - name: Create reports dir
        run: |
          # -p: do not fail if the directory is already present on the runner.
          mkdir -p reports
          # Export the absolute path so later steps can use $REPORTS after `cd`.
          echo "REPORTS=$PWD/reports" >> $GITHUB_ENV

      - name: Install benchmarks
        run: |
          cd benchmarks
          python setup.py install

      - name: Run triton softmax kernel benchmark
        run: |
          cd benchmarks/triton_kernels_benchmark
          python fused_softmax.py --reports $REPORTS
          # Sourced (not executed) so anything it exports stays visible to the
          # report builder below.
          source ../../scripts/capture-hw-details.sh
          # One report per compiler, both derived from the same raw CSV.
          python ../../scripts/build_report.py \
            $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv \
            --benchmark softmax --compiler triton --param_cols "N" \
            --tflops_col Triton-TFlops-max --hbm_col "Triton-GB/s-max"
          python ../../scripts/build_report.py \
            $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv \
            --benchmark softmax --compiler xetla --param_cols "N" \
            --tflops_col XeTLA-TFlops-max --hbm_col "XeTLA-GB/s-max"

      - name: Run micro benchmark
        run: |
          cd benchmarks/micro_benchmarks
          python run_benchmarks.py --reports $REPORTS

      - name: Save pip cache
        if: ${{ steps.pip-cache.outputs.status == 'miss' }}
        uses: ./.github/actions/save
        with:
          path: ${{ steps.pip-cache.outputs.path }}
          dest: ${{ steps.pip-cache.outputs.dest }}

      - name: Upload benchmark reports
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-reports
          path: reports

  # Flash attention benchmarks, built from the `perf_attn` ref.
  benchmark-attention:
    name: Benchmark flash attention
    runs-on:
      - ${{ inputs.runner_label || 'max1550' }}
    timeout-minutes: 720
    defaults:
      run:
        # Every `run` step sources the oneAPI environment script first.
        shell: bash -noprofile --norc -eo pipefail -c "source /home/runner/intel/oneapi/setvars.sh > /dev/null; source {0}"
    steps:
      # The workspace root holds the `perf_attn` sources that get benchmarked.
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: 'perf_attn'

      # Second checkout of the default ref into ./llvm-target; this job takes
      # its setup-triton action from there.
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          path: llvm-target

      - name: Load pip cache
        id: pip-cache
        uses: ./.github/actions/load
        with:
          # NOTE(review): path and key are shell-style variables, not `${{ }}`
          # expressions; presumably the load action expands them in its own
          # shell. Confirm against .github/actions/load.
          path: $HOME/.cache/pip
          # pip cache per commit id just to minimize network traffic
          key: pip-$PYTHON_VERSION-$GITHUB_SHA

      # Provides the libigc .deb package installed further down.
      - name: Load artifacts cache
        id: artifacts-cache
        uses: ./.github/actions/load
        with:
          path: artifacts
          key: artifacts

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Python build dependencies
        run: |
          pip install wheel

      - name: Setup PyTorch
        uses: ./.github/actions/setup-pytorch

      - name: Setup IPEX
        uses: ./.github/actions/setup-ipex

      - name: Generate Triton cache key
        id: triton-key
        run: |
          # This job builds from the `perf_attn` checkout, while $GITHUB_SHA names
          # the commit that triggered the workflow. Key on the commit actually
          # checked out (plus a branch tag) so the key differs from the one the
          # other jobs compute for their wheels.
          # NOTE(review): assumes the load/save actions share one key namespace
          # across jobs; confirm against .github/actions/load.
          SOURCE_SHA=$(git rev-parse HEAD)
          # sha256sum prints "<hash>  -"; keep only the hash field.
          COMPOSITE_KEY=$(echo $PYTHON_VERSION perf_attn $SOURCE_SHA | sha256sum - | cut -d' ' -f1)
          echo "key=triton-$COMPOSITE_KEY" >> $GITHUB_OUTPUT

      - name: Load Triton wheels from a cache
        id: triton-cache
        uses: ./.github/actions/load
        with:
          path: python/dist
          key: ${{ steps.triton-key.outputs.key }}

      # Only build when the cache lookup above reported a miss.
      - name: Build Triton wheels
        if: ${{ steps.triton-cache.outputs.status == 'miss' }}
        uses: ./llvm-target/.github/actions/setup-triton
        with:
          command: DEBUG=1 python setup.py bdist_wheel

      - name: Install Triton
        run: |
          pip install python/dist/*.whl

      # Store freshly built wheels using the path/dest reported by the load step.
      - name: Save Triton wheels to a cache
        if: ${{ steps.triton-cache.outputs.status == 'miss' }}
        uses: ./.github/actions/save
        with:
          path: ${{ steps.triton-cache.outputs.path }}
          dest: ${{ steps.triton-cache.outputs.dest }}

      - name: Install benchmark dependencies
        run: |
          pip install matplotlib pandas tabulate

      # Installs the package system-wide (the GEMM job only extracts it).
      - name: Install a custom libigc from artifacts
        run: |
          sudo dpkg -i artifacts/libigc1_1.0.24994.16243-igc+releaseinternal1_amd64.deb

      - name: Create reports dir
        run: |
          # -p: do not fail if the directory is already present on the runner.
          mkdir -p reports
          # Export the absolute path so later steps can use $REPORTS after `cd`.
          echo "REPORTS=$PWD/reports" >> $GITHUB_ENV

      - name: Run flash attention benchmarks
        run: |
          cd python/tutorials
          bash run_all.sh
          # Keep the raw summary as-is in the reports.
          cp summary.csv $REPORTS/attention-summary.csv
          # This will fix csv file issues, fixing ", "->"," and " " -> "," that exist in raw output
          sed -E 's/, /,/g;s/ /,/g' summary.csv > attention-results.csv
          # Sourced (not executed) so anything it exports stays visible to the
          # report builder below.
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            attention-results.csv $REPORTS/attention-triton-report.csv \
            --benchmark flash_attention --compiler triton \
            --tflops_col max_tflops --param_cols "Z,H,N_CTX,D_HEAD"

      - name: Save pip cache
        if: ${{ steps.pip-cache.outputs.status == 'miss' }}
        uses: ./.github/actions/save
        with:
          path: ${{ steps.pip-cache.outputs.path }}
          dest: ${{ steps.pip-cache.outputs.dest }}

      - name: Upload benchmark reports
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-attention-reports
          path: reports

  # GEMM kernel benchmark, run against a locally extracted libigc.
  benchmark-gemm:
    name: GEMM benchmarks
    runs-on:
      - ${{ inputs.runner_label || 'max1550' }}
    timeout-minutes: 720
    defaults:
      run:
        # Every `run` step sources the oneAPI environment script first.
        shell: bash -noprofile --norc -eo pipefail -c "source /home/runner/intel/oneapi/setvars.sh > /dev/null; source {0}"
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Second checkout into ./llvm-target; this job takes its setup-triton
      # action from there.
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          path: llvm-target

      - name: Load pip cache
        id: pip-cache
        uses: ./.github/actions/load
        with:
          # NOTE(review): path and key are shell-style variables, not `${{ }}`
          # expressions; presumably the load action expands them in its own
          # shell. Confirm against .github/actions/load.
          path: $HOME/.cache/pip
          # pip cache per commit id just to minimize network traffic
          key: pip-$PYTHON_VERSION-$GITHUB_SHA

      # Provides the libigc .deb package extracted further down.
      - name: Load artifacts cache
        id: artifacts-cache
        uses: ./.github/actions/load
        with:
          path: artifacts
          key: artifacts

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Python build dependencies
        run: |
          pip install wheel

      - name: Setup PyTorch
        uses: ./.github/actions/setup-pytorch

      - name: Setup IPEX
        uses: ./.github/actions/setup-ipex

      - name: Generate Triton cache key
        id: triton-key
        run: |
          # sha256sum prints "<hash>  -"; keep only the hash field.
          COMPOSITE_KEY=$(echo $PYTHON_VERSION $GITHUB_SHA | sha256sum - | cut -d' ' -f1)
          echo "key=triton-$COMPOSITE_KEY" >> $GITHUB_OUTPUT

      - name: Load Triton wheels from a cache
        id: triton-cache
        uses: ./.github/actions/load
        with:
          path: python/dist
          key: ${{ steps.triton-key.outputs.key }}

      # Only build when the cache lookup above reported a miss.
      - name: Build Triton wheels
        if: ${{ steps.triton-cache.outputs.status == 'miss' }}
        uses: ./llvm-target/.github/actions/setup-triton
        with:
          command: DEBUG=1 python setup.py bdist_wheel

      - name: Install Triton
        run: |
          pip install python/dist/*.whl

      # Store freshly built wheels using the path/dest reported by the load step.
      - name: Save Triton wheels to a cache
        if: ${{ steps.triton-cache.outputs.status == 'miss' }}
        uses: ./.github/actions/save
        with:
          path: ${{ steps.triton-cache.outputs.path }}
          dest: ${{ steps.triton-cache.outputs.dest }}

      - name: Install benchmark dependencies
        run: |
          pip install matplotlib pandas tabulate

      # Extracts the package into a workspace directory instead of installing
      # it; the benchmark step adds that directory to LD_LIBRARY_PATH.
      - name: Install a custom libigc from artifacts
        run: |
          # -p: do not fail if the directory is already present on the runner.
          mkdir -p libigc1_1.0.24994.16243
          sudo dpkg -X artifacts/libigc1_1.0.24994.16243-igc+releaseinternal1_amd64.deb libigc1_1.0.24994.16243

      - name: Create reports dir
        run: |
          # -p: do not fail if the directory is already present on the runner.
          mkdir -p reports
          # Export the absolute path so later steps can use $REPORTS after `cd`.
          echo "REPORTS=$PWD/reports" >> $GITHUB_ENV

      - name: Install benchmarks
        run: |
          cd benchmarks
          python setup.py install

      - name: Run triton gemm kernel benchmark
        run: |
          # Must be set before `cd`: $PWD is still the workspace root that
          # contains the extraction directory.
          export LD_LIBRARY_PATH=$PWD/libigc1_1.0.24994.16243:$LD_LIBRARY_PATH
          cd benchmarks/triton_kernels_benchmark
          # Start from an empty Triton cache directory.
          rm -rf ./tt_cache
          # The variables below apply to this single python invocation only.
          TRITON_CACHE_DIR=./tt_cache \
            TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
            TRITON_INTEL_ADVANCED_PATH=1 \
            IGC_VISAOptions=" -TotalGRFNum 256 -enableBCR -nolocalra -printregusage -DPASTokenReduction -enableHalfLSC -abiver 2" \
            IGC_ForcePrefetchToL1Cache=1 \
            IGC_VATemp=1 \
            UR_L0_IN_ORDER_BARRIER_BY_SIGNAL=0 \
            IGC_DisableLoopUnroll=1 \
            NEO_CACHE_PERSISTENT=0 \
            python gemm_benchmark.py --reports $REPORTS
          # Sourced (not executed) so anything it exports stays visible to the
          # report builder below.
          source ../../scripts/capture-hw-details.sh
          # One report per compiler, both derived from the same raw CSV.
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance.csv $REPORTS/gemm-triton-report.csv \
            --benchmark gemm --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops-max --hbm_col "Triton-GB/s-max"
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance.csv $REPORTS/gemm-xetla-report.csv \
            --benchmark gemm --compiler xetla --param_cols "B,M,K,N" \
            --tflops_col XeTLA-TFlops-max --hbm_col "XeTLA-GB/s-max"

      - name: Save pip cache
        if: ${{ steps.pip-cache.outputs.status == 'miss' }}
        uses: ./.github/actions/save
        with:
          path: ${{ steps.pip-cache.outputs.path }}
          dest: ${{ steps.pip-cache.outputs.dest }}

      - name: Upload benchmark reports
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-gemm-reports
          path: reports