Skip to content

Commit

Permalink
add CI workflow for testing NVIDIA GPU accelerator detection
Browse files Browse the repository at this point in the history
  • Loading branch information
boegel committed Oct 6, 2024
1 parent d9bca16 commit 24f0620
Show file tree
Hide file tree
Showing 12 changed files with 166 additions and 0 deletions.
132 changes: 132 additions & 0 deletions .github/workflows/tests_archdetect_nvidia_gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions
#
# Tests NVIDIA GPU accelerator detection in init/eessi_archdetect.sh and init/bash.
# No real GPU is required: a fake 'nvidia-smi' command is put in $PATH,
# using the scripts in tests/archdetect/nvidia-smi/.
name: Tests for accelerator detection (NVIDIA GPU)
on:
  push:
  pull_request:
permissions:
  contents: read  # to fetch code (actions/checkout)
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # for each entry <name>:
        # - tests/archdetect/nvidia-smi/<name>.sh is used as fake nvidia-smi command (except for 'none');
        # - tests/archdetect/nvidia-smi/<name>.output holds the expected output of 'eessi_archdetect.sh accelpath';
        fake_nvidia_smi_script:
          - none  # no nvidia-smi command
          - no_devices  # nvidia-smi command works, but no GPUs available
          - 1xa100  # cc80, supported with (at least) zen2 CPU
          - 2xa100  # cc80, supported with (at least) zen2 CPU
          - 4xa100  # cc80, supported with (at least) zen2 CPU
          - cc01  # non-existing GPU
      fail-fast: false
    steps:
      - name: checkout
        uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938  # v4.2.0

      # we deliberately do not use the eessi/github-action-eessi action,
      # because we want to control when the EESSI environment is initialized
      - name: Mount EESSI CernVM-FS repository
        uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c  # v4.0
        with:
          cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
          cvmfs_http_proxy: DIRECT
          cvmfs_repositories: software.eessi.io

      - name: test accelerator detection
        run: |
          export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2'
          # put fake nvidia-smi command in place (unless we don't want to)
          if [[ "${{matrix.fake_nvidia_smi_script}}" != "none" ]]; then
            tmpdir=$(mktemp -d)
            ln -s "$PWD/tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.sh" "$tmpdir/nvidia-smi"
            export PATH="$tmpdir:$PATH"
          fi
          # first run with debugging enabled, just to show the output
          ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?"
          # verify output (or exit code if non-zero)
          out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?")
          if [[ $out == "$( cat ./tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.output )" ]]; then
            echo "Test for '${{matrix.fake_nvidia_smi_script}}' PASSED: '$out'"
            # by default the 'errexit' option is enabled (set -e),
            # which causes trouble when 'eessi_archdetect.sh accelpath'
            # fails to detect an accelerator and produces a non-zero exit code,
            # so we have to unset it using 'set +e'
            echo "set flags: $-"
            set +e
            echo "set flags after unsetting errexit option: $-"
            # NOTE: since errexit is disabled from here on, every check below must make
            # the script exit explicitly via '{ ...; exit 1; }';
            # 'exit 1' in a '( ... )' subshell would only terminate that subshell,
            # and the step would incorrectly be reported as successful
            # run full EESSI init script, which pick up on the accelerator (if available)
            echo
            . init/bash 2>&1 | tee init.out
            echo "-----------------------------------------------------------------------------"
            if [[ "${{matrix.fake_nvidia_smi_script}}" == "none" ]] || [[ "${{matrix.fake_nvidia_smi_script}}" == "no_devices" ]]; then
              # no accelerator available: init script should say so, and not touch $MODULEPATH for accelerators
              pattern="archdetect could not detect any accelerators"
              echo ">>> checking for pattern '${pattern}' in init output..."
              grep "${pattern}" init.out || { echo "FAILED 1"; exit 1; }
              pattern="archdetect found supported accelerator"
              echo ">>> checking for lack of pattern '${pattern}' in init output..."
              match=$(grep "${pattern}" init.out || true)
              test "x${match}" = "x" || { echo "unexpected match found for '${pattern}' in init output"; exit 1; }
              pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH"
              echo ">>> checking for lack of pattern '${pattern}' in init output..."
              match=$(grep "${pattern}" init.out || true)
              test "x${match}" = "x" || { echo "unexpected match found for '${pattern}' in init output"; exit 1; }
            elif [[ "${{matrix.fake_nvidia_smi_script}}" == "cc01" ]]; then
              # accelerator detected, but no matching path for it: $MODULEPATH must not be extended for accelerators
              pattern="No matching path found in x86_64/amd/zen2 for accelerator detected by archdetect (accel/nvidia/cc01)"
              echo ">>> checking for pattern '${pattern}' in init output..."
              grep "${pattern}" init.out || { echo "FAILED 1"; exit 1; }
              pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH"
              echo ">>> checking for lack of pattern '${pattern}' in init output..."
              match=$(grep "${pattern}" init.out || true)
              test "x${match}" = "x" || { echo "unexpected match found for '${pattern}' in init output"; exit 1; }
            else
              # supported accelerator (cc80): should be reported, and accelerator modules path should be prepended
              echo ">>> checking for 'accel/nvidia/cc80' in init output..."
              grep "archdetect found supported accelerator for CPU target x86_64/amd/zen2: accel/nvidia/cc80" init.out || { echo "FAILED 2"; exit 1; }
              grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || { echo "FAILED 3"; exit 1; }
            fi
            echo ">>> checking last line of init output..."
            tail -n 1 init.out | grep "Environment set up to use EESSI (2023.06), have fun!" || { echo "FAILED, full init output:"; cat init.out; exit 1; }
            echo "All checks on init output PASSED"
          else
            echo "Test for '${{matrix.fake_nvidia_smi_script}}' FAILED: '$out'" >&2
            exit 1
          fi

      - name: test accelerator detection under $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE + $EESSI_ACCELERATOR_TARGET_OVERRIDE
        run: |
          export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2'
          export EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen3'
          export EESSI_ACCELERATOR_TARGET_OVERRIDE='accel/nvidia/cc80'
          # first run with debugging enabled, just to show the output
          ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?"
          # capture regular (non-debug) output (or exit code if non-zero), and show it
          out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?")
          echo "output of 'eessi_archdetect.sh accelpath': '${out}'"
          echo
          . init/bash 2>&1 | tee init.out
          echo "-----------------------------------------------------------------------------"
          # use '{ ...; exit 1; }' so a failing check always terminates the script itself,
          # regardless of whether the errexit option is enabled
          echo ">>> checking for 'accel/nvidia/cc80' in init output..."
          grep "archdetect found supported accelerator for CPU target x86_64/amd/zen3: accel/nvidia/cc80" init.out || { echo "FAILED 1"; exit 1; }
          grep "Using x86_64/amd/zen2 as software subdirectory" init.out || { echo "FAILED 2"; exit 1; }
          grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/modules/all to \$MODULEPATH" init.out || { echo "FAILED 3"; exit 1; }
          grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || { echo "FAILED 4"; exit 1; }
          echo "All checks on init output PASSED"
1 change: 1 addition & 0 deletions tests/archdetect/nvidia-smi/1xa100.output
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
accel/nvidia/cc80
5 changes: 5 additions & 0 deletions tests/archdetect/nvidia-smi/1xa100.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Stand-in for nvidia-smi on a system with an NVIDIA A100 GPU.
# The line printed below was produced by:
#   nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
gpu_info="NVIDIA A100-SXM4-80GB, 1, 545.23.08, 8.0"
printf '%s\n' "${gpu_info}"
exit 0
1 change: 1 addition & 0 deletions tests/archdetect/nvidia-smi/2xa100.output
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
accel/nvidia/cc80
6 changes: 6 additions & 0 deletions tests/archdetect/nvidia-smi/2xa100.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Stand-in for nvidia-smi on an NVIDIA A100 system; the same line is printed twice.
# Output was produced by:
#   nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
gpu_info="NVIDIA A100-SXM4-80GB, 2, 545.23.08, 8.0"
for gpu_idx in 1 2; do
    printf '%s\n' "${gpu_info}"
done
exit 0
1 change: 1 addition & 0 deletions tests/archdetect/nvidia-smi/4xa100.output
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
accel/nvidia/cc80
8 changes: 8 additions & 0 deletions tests/archdetect/nvidia-smi/4xa100.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Stand-in for nvidia-smi on an NVIDIA A100 system; the same line is printed four times.
# Output was produced by:
#   nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
gpu_info="NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0"
for gpu_idx in 1 2 3 4; do
    printf '%s\n' "${gpu_info}"
done
exit 0
1 change: 1 addition & 0 deletions tests/archdetect/nvidia-smi/cc01.output
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
accel/nvidia/cc01
6 changes: 6 additions & 0 deletions tests/archdetect/nvidia-smi/cc01.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Stand-in for nvidia-smi reporting a GPU model that does not exist,
# used to test how an unknown GPU model is handled.
# Output is (supposedly) what this command would produce:
#   nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
gpu_info="NVIDIA does-not-exist, 1, 000.00.00, 0.1"
printf '%s\n' "${gpu_info}"
exit 0
1 change: 1 addition & 0 deletions tests/archdetect/nvidia-smi/no_devices.output
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
non-zero exit code: 3
3 changes: 3 additions & 0 deletions tests/archdetect/nvidia-smi/no_devices.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Stand-in for nvidia-smi on a system where the command works but no GPUs are available:
# prints a single message and exits with code 6.
message="No devices were found"
printf '%s\n' "${message}"
exit 6
1 change: 1 addition & 0 deletions tests/archdetect/nvidia-smi/none.output
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
non-zero exit code: 2

0 comments on commit 24f0620

Please sign in to comment.