-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add CI workflow for testing NVIDIA GPU accelerator detection
- Loading branch information
Showing
12 changed files
with
166 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions | ||
# Workflow that tests accelerator (NVIDIA GPU) detection, using fake nvidia-smi commands. | ||
name: Tests for accelerator detection (NVIDIA GPU) | ||
on: | ||
push: | ||
pull_request: | ||
permissions: | ||
contents: read # to fetch code (actions/checkout) | ||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
strategy: | ||
matrix: | ||
# each value other than 'none' selects tests/archdetect/nvidia-smi/<value>.sh as the fake | ||
# nvidia-smi command; tests/archdetect/nvidia-smi/<value>.output holds the expected result | ||
fake_nvidia_smi_script: | ||
- none # no nvidia-smi command | ||
- no_devices # nvidia-smi command works, but no GPUs available | ||
- 1xa100 # cc80, supported with (at least) zen2 CPU | ||
- 2xa100 # cc80, supported with (at least) zen2 CPU | ||
- 4xa100 # cc80, supported with (at least) zen2 CPU | ||
- cc01 # non-existing GPU | ||
# let the remaining matrix entries run even if one of them fails | ||
fail-fast: false | ||
steps: | ||
- name: checkout | ||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 | ||
|
||
# we deliberately do not use the eessi/github-action-eessi action, | ||
# because we want to control when the EESSI environment is initialized | ||
- name: Mount EESSI CernVM-FS repository | ||
uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0 | ||
with: | ||
cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb | ||
cvmfs_http_proxy: DIRECT | ||
cvmfs_repositories: software.eessi.io | ||
|
||
- name: test accelerator detection | ||
run: | | ||
# Compares the 'accelpath' result of eessi_archdetect.sh with the expected .output file | ||
# for the selected fake nvidia-smi script, then sources init/bash and checks its output | ||
# for the messages that are expected for that scenario. | ||
export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2' | ||
# put fake nvidia-smi command in place (unless we don't want to) | ||
if [[ "${{matrix.fake_nvidia_smi_script}}" != "none" ]]; then | ||
tmpdir=$(mktemp -d) | ||
ln -s "$PWD/tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.sh" "$tmpdir/nvidia-smi" | ||
export PATH="$tmpdir:$PATH" | ||
fi | ||
# first run with debugging enabled, just to show the output | ||
./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?" | ||
# verify output (or exit code if non-zero) | ||
out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?") | ||
if [[ "$out" == "$( cat ./tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.output )" ]]; then | ||
echo "Test for '${{matrix.fake_nvidia_smi_script}}' PASSED: '$out'" | ||
# by default the 'errexit' option is enabled (set -e), | ||
# which causes trouble when 'eessi_archdetect.sh accelpath' | ||
# fails to detect an accelerator and produces a non-zero exit code, | ||
# so we have to unset it using 'set +e' | ||
echo "set flags: $-" | ||
set +e | ||
echo "set flags after unsetting errexit option: $-" | ||
# with errexit disabled, a failed check has to call 'exit 1' in the current shell | ||
# (brace group '{ ...; }'); 'exit' inside a '( ... )' subshell only ends the subshell | ||
# and would let the step carry on and succeed | ||
# run full EESSI init script, which picks up on the accelerator (if available) | ||
echo | ||
. init/bash 2>&1 | tee init.out | ||
echo "-----------------------------------------------------------------------------" | ||
if [[ "${{matrix.fake_nvidia_smi_script}}" == "none" ]] || [[ "${{matrix.fake_nvidia_smi_script}}" == "no_devices" ]]; then | ||
pattern="archdetect could not detect any accelerators" | ||
echo ">>> checking for pattern '${pattern}' in init output..." | ||
grep "${pattern}" init.out || { echo "FAILED 1"; exit 1; } | ||
pattern="archdetect found supported accelerator" | ||
echo ">>> checking for lack of pattern '${pattern}' in init output..." | ||
match=$(grep "${pattern}" init.out || true) | ||
test "x${match}" = "x" || { echo "unexpected match found for '${pattern}' in init output"; exit 1; } | ||
pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH" | ||
echo ">>> checking for lack of pattern '${pattern}' in init output..." | ||
match=$(grep "${pattern}" init.out || true) | ||
test "x${match}" = "x" || { echo "unexpected match found for '${pattern}' in init output"; exit 1; } | ||
elif [[ "${{matrix.fake_nvidia_smi_script}}" == "cc01" ]]; then | ||
pattern="No matching path found in x86_64/amd/zen2 for accelerator detected by archdetect (accel/nvidia/cc01)" | ||
echo ">>> checking for pattern '${pattern}' in init output..." | ||
grep "${pattern}" init.out || { echo "FAILED 1"; exit 1; } | ||
pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH" | ||
echo ">>> checking for lack of pattern '${pattern}' in init output..." | ||
match=$(grep "${pattern}" init.out || true) | ||
test "x${match}" = "x" || { echo "unexpected match found for '${pattern}' in init output"; exit 1; } | ||
else | ||
echo ">>> checking for 'accel/nvidia/cc80' in init output..." | ||
grep "archdetect found supported accelerator for CPU target x86_64/amd/zen2: accel/nvidia/cc80" init.out || { echo "FAILED 2"; exit 1; } | ||
grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || { echo "FAILED 3"; exit 1; } | ||
fi | ||
echo ">>> checking last line of init output..." | ||
tail -1 init.out | grep "Environment set up to use EESSI (2023.06), have fun!" || { echo "FAILED, full init output:"; cat init.out; exit 1; } | ||
echo "All checks on init output PASSED" | ||
else | ||
echo "Test for '${{matrix.fake_nvidia_smi_script}}' FAILED: '$out'" >&2 | ||
exit 1 | ||
fi | ||
- name: test accelerator detection under $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE + $EESSI_ACCELERATOR_TARGET_OVERRIDE | ||
run: | | ||
# Sets all three override variables and checks that init/bash reports zen2 as the | ||
# software subdirectory while the accelerator modules are taken from zen3/accel/nvidia/cc80. | ||
export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2' | ||
export EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen3' | ||
export EESSI_ACCELERATOR_TARGET_OVERRIDE='accel/nvidia/cc80' | ||
# first run with debugging enabled, just to show the output | ||
./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?" | ||
# verify output (or exit code if non-zero) | ||
# NOTE(review): 'out' is captured here but never compared or printed later in this step | ||
out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?") | ||
echo | ||
. init/bash 2>&1 | tee init.out | ||
echo "-----------------------------------------------------------------------------" | ||
echo ">>> checking for 'accel/nvidia/cc80' in init output..." | ||
# NOTE(review): the '(echo ... && exit 1)' subshells below only abort this step while the | ||
# shell's 'errexit' option is enabled; this step never runs 'set +e' - confirm runner default | ||
grep "archdetect found supported accelerator for CPU target x86_64/amd/zen3: accel/nvidia/cc80" init.out || (echo "FAILED 1" && exit 1) | ||
grep "Using x86_64/amd/zen2 as software subdirectory" init.out || (echo "FAILED 2" && exit 1) | ||
grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/modules/all to \$MODULEPATH" init.out || (echo "FAILED 3" && exit 1) | ||
grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || (echo "FAILED 4" && exit 1) | ||
echo "All checks on init output PASSED" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
accel/nvidia/cc80 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/bin/bash | ||
# Prints the single row an NVIDIA A100 system reports for: | ||
# nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader | ||
printf '%s\n' "NVIDIA A100-SXM4-80GB, 1, 545.23.08, 8.0" | ||
exit 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
accel/nvidia/cc80 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#!/bin/bash | ||
# output from NVIDIA A100 system, | ||
# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader | ||
echo "NVIDIA A100-SXM4-80GB, 2, 545.23.08, 8.0" | ||
echo "NVIDIA A100-SXM4-80GB, 2, 545.23.08, 8.0" | ||
exit 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
accel/nvidia/cc80 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash | ||
# output from NVIDIA A100 system, | ||
# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader | ||
echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0" | ||
echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0" | ||
echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0" | ||
echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0" | ||
exit 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
accel/nvidia/cc01 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#!/bin/bash | ||
# Made-up row for an NVIDIA GPU that does not exist (compute capability 0.1), | ||
# used to test how an unknown GPU model is handled; | ||
# same format as: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader | ||
printf '%s\n' "NVIDIA does-not-exist, 1, 000.00.00, 0.1" | ||
exit 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
non-zero exit code: 3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/bash | ||
# Reports that no devices were found and exits with status 6. | ||
printf '%s\n' "No devices were found" | ||
exit 6 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
non-zero exit code: 2 |