# triton-benchmarks.yml
name: Triton benchmarks

# Triggers:
#   * workflow_dispatch - manual run with the inputs declared below.
#   * schedule          - nightly run; GitHub evaluates cron expressions in UTC,
#                         so this fires every day at 23:05 UTC.
# Scheduled runs carry no `inputs`, so expressions elsewhere in this file use
# fallbacks such as `inputs.tag || 'ci'`.
on:
  workflow_dispatch:
    inputs:
      runner_label:
        description: Runner label, keep empty for default
        type: string
        default: ""
      tag:
        description: Tag for benchmark results
        type: string
        default: "test"
      install_ipex:
        description: Install Intel PyTorch Extension
        type: boolean
        default: true
  schedule:
    - cron: "5 23 * * *"
# GITHUB_TOKEN gets read-only access to every permission scope.
permissions: read-all

env:
  # Quoted so YAML does not parse the version as the float 3.1.
  PYTHON_VERSION: "3.10"
  # '1' for scheduled runs, or when the `install_ipex` input is true;
  # '0' otherwise. Not referenced by the steps visible in this file -
  # presumably read by the local actions or scripts (TODO confirm).
  USE_IPEX: ${{ github.event_name == 'schedule' && '1' || inputs.install_ipex && '1' || '0' }}
jobs:
  # Single job: sets up PyTorch (optionally with IPEX), builds and installs
  # Triton, runs each benchmark in sequence and uploads the `reports` directory.
  build:
    name: Triton benchmarks
    runs-on:
      # Falls back to the 'max1550' runner label when no label is supplied.
      - ${{ inputs.runner_label || 'max1550' }}
    # 720 minutes = 12 hours.
    timeout-minutes: 720
    env:
      # Tag attached to benchmark reports: the dispatch input, or 'ci' when the
      # input is empty or absent (e.g. scheduled runs). It is passed through the
      # environment rather than spliced into `run:` scripts with `${{ }}`, so a
      # tag containing spaces or shell metacharacters cannot alter the script.
      BENCHMARK_TAG: ${{ inputs.tag || 'ci' }}
    defaults:
      run:
        # Every `run:` step sources the oneAPI environment before the step script.
        shell: bash -noprofile --norc -eo pipefail -c "source /home/runner/intel/oneapi/setvars.sh > /dev/null; source {0}"
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Load pip cache
        id: pip-cache
        uses: ./.github/actions/load
        with:
          # NOTE(review): `$HOME`, `$PYTHON_VERSION` and `$GITHUB_SHA` are passed
          # as literal text; presumably the local action expands them in a shell
          # - confirm against .github/actions/load.
          path: $HOME/.cache/pip
          # pip cache per commit id just to minimize network traffic
          key: pip-$PYTHON_VERSION-$GITHUB_SHA

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Python build dependencies
        run: |
          pip install wheel cmake

      # Exactly one of the two "Setup PyTorch" steps runs: their conditions are
      # negations of each other.
      - name: Setup PyTorch with IPEX
        if: ${{ github.event_name == 'schedule' || inputs.install_ipex }}
        uses: ./.github/actions/setup-pytorch
        with:
          repository: Stonepia/pytorch

      - name: Setup PyTorch without IPEX
        if: ${{ !(github.event_name == 'schedule' || inputs.install_ipex) }}
        uses: ./.github/actions/setup-pytorch
        with:
          repository: pytorch/pytorch

      - name: Setup IPEX
        if: ${{ github.event_name == 'schedule' || inputs.install_ipex }}
        uses: ./.github/actions/setup-ipex

      - name: Build Triton wheels
        uses: ./.github/actions/setup-triton
        with:
          command: DEBUG=1 python setup.py bdist_wheel

      - name: Install Triton
        run: |
          pip install python/dist/*.whl

      - name: Install benchmark dependencies
        run: |
          pip install matplotlib pandas tabulate

      - name: Create reports dir
        run: |
          mkdir reports
          # Export the absolute path so later steps can use $REPORTS after `cd`.
          echo "REPORTS=$PWD/reports" >> "$GITHUB_ENV"

      - name: Install benchmarks
        id: install
        run: |
          cd benchmarks
          python setup.py install

      # Each benchmark step below is gated on the install step having succeeded
      # and the run not being cancelled. `!cancelled()` replaces the implicit
      # `success()` check, so a failure in one benchmark does not skip the rest.
      - name: Run Triton Softmax kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python fused_softmax.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      - name: Run Triton GEMM kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      - name: Run Triton GEMM kernel benchmark - default path
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          # Default path:
          TRITON_INTEL_ADVANCED_PATH=0 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra" \
          IGC_DisableLoopUnroll=1 \
          python gemm_benchmark.py --reports $REPORTS
          TAG="${BENCHMARK_TAG}-dflt"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton GEMM kernel benchmark - advanced path
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          # Advanced path:
          TRITON_INTEL_ADVANCED_PATH=1 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra" \
          IGC_DisableLoopUnroll=1 \
          python gemm_benchmark.py --reports $REPORTS
          TAG="${BENCHMARK_TAG}-adv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton GEMM + PreOp (exp) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_preop_exp_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_postop_gelu_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_postop_addmatrix_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton FA kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python flash_attention_fwd_benchmark.py --reports $REPORTS
          TAG="$BENCHMARK_TAG"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      - name: Run Triton FA kernel benchmark - default path
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          TRITON_INTEL_ADVANCED_PATH=0 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra -printregusage -DPASTokenReduction -enableHalfLSC -abiver 2" \
          IGC_DisableLoopUnroll=1 \
          python flash_attention_fwd_benchmark.py --reports $REPORTS
          TAG="${BENCHMARK_TAG}-dflt"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton FA kernel benchmark - advanced path
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          TRITON_INTEL_ADVANCED_PATH=1 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra -printregusage -DPASTokenReduction -enableHalfLSC -abiver 2" \
          IGC_DisableLoopUnroll=1 \
          python flash_attention_fwd_benchmark.py --reports $REPORTS
          TAG="${BENCHMARK_TAG}-adv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Prefix Sums kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python prefix_sums.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          TAG="$BENCHMARK_TAG"
          python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run micro benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/micro_benchmarks
          python run_benchmarks.py --reports $REPORTS

      # Only runs when the "Load pip cache" step reported a miss. It has no
      # `!cancelled()` override, so it is skipped if any earlier step failed.
      - name: Save pip cache
        if: ${{ steps.pip-cache.outputs.status == 'miss' }}
        uses: ./.github/actions/save
        with:
          path: ${{ steps.pip-cache.outputs.path }}
          dest: ${{ steps.pip-cache.outputs.dest }}

      # Uploads whatever reports were produced, even if some benchmarks failed.
      - name: Upload benchmark reports
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-reports
          path: reports