diff --git a/.github/workflows/tests_archdetect_nvidia_gpu.yml b/.github/workflows/tests_archdetect_nvidia_gpu.yml
new file mode 100644
index 0000000000..d8b5adeb88
--- /dev/null
+++ b/.github/workflows/tests_archdetect_nvidia_gpu.yml
@@ -0,0 +1,135 @@
+# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions
+name: Tests for accelerator detection (NVIDIA GPU)
+on:
+  push:
+  pull_request:
+permissions:
+  contents: read # to fetch code (actions/checkout)
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        fake_nvidia_smi_script:
+          - none  # no nvidia-smi command
+          - no_devices  # nvidia-smi command works, but no GPUs available
+          - 1xa100  # cc80, supported with (at least) zen2 CPU
+          - 2xa100  # cc80, supported with (at least) zen2 CPU
+          - 4xa100  # cc80, supported with (at least) zen2 CPU
+          - cc01  # non-existing GPU
+      fail-fast: false
+    steps:
+        - name: checkout
+          uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+
+        # we deliberately do not use the eessi/github-action-eessi action,
+        # because we want to control when the EESSI environment is initialized
+        - name: Mount EESSI CernVM-FS repository
+          uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0
+          with:
+              cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
+              cvmfs_http_proxy: DIRECT
+              cvmfs_repositories: software.eessi.io
+
+        - name: test accelerator detection
+          run: |
+              export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2'
+
+              # put fake nvidia-smi command in place (unless we don't want to)
+              if [[ "${{matrix.fake_nvidia_smi_script}}" != "none" ]]; then
+                  tmpdir=$(mktemp -d)
+                  ln -s "$PWD/tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.sh" "$tmpdir/nvidia-smi"
+                  export PATH="$tmpdir:$PATH"
+              fi
+
+              # first run with debugging enabled, just to show the output
+              ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?"
+
+              # verify output (or exit code if non-zero)
+              out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?")
+
+              if [[ $out == "$( cat ./tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.output )" ]]; then
+
+                  echo "Test for '${{matrix.fake_nvidia_smi_script}}' PASSED: '$out'"
+
+                  # by default the 'errexit' option is enabled (set -e),
+                  # which causes trouble when 'eessi_archdetect.sh accelpath'
+                  # fails to detect an accelerator and produces a non-zero exit code,
+                  # so we have to unset it using 'set +e'
+                  echo "set flags: $-"
+                  set +e
+                  echo "set flags after unsetting errexit option: $-"
+                  # NOTE: with errexit disabled, a failing check no longer aborts this step on its own;
+                  # every check below therefore uses a '{ ...; exit 1; }' brace group, which runs in the
+                  # current shell ('exit' inside a '( ... )' subshell would only leave that subshell)
+
+                  # run full EESSI init script, which picks up on the accelerator (if available)
+                  echo
+                  . init/bash 2>&1 | tee init.out
+                  echo "-----------------------------------------------------------------------------"
+
+                  if [[ "${{matrix.fake_nvidia_smi_script}}" == "none" ]] || [[ "${{matrix.fake_nvidia_smi_script}}" == "no_devices" ]]; then
+
+                      pattern="archdetect could not detect any accelerators"
+                      echo ">>> checking for pattern '${pattern}' in init output..."
+                      grep "${pattern}" init.out || { echo "FAILED 1"; exit 1; }
+
+                      pattern="archdetect found supported accelerator"
+                      echo ">>> checking for lack of pattern '${pattern}' in init output..."
+                      match=$(grep "${pattern}" init.out || true)
+                      test "x${match}" = "x" || { echo "unexpected match found for '${pattern}' in init output"; exit 1; }
+
+                      pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH"
+                      echo ">>> checking for lack of pattern '${pattern}' in init output..."
+                      match=$(grep "${pattern}" init.out || true)
+                      test "x${match}" = "x" || { echo "unexpected match found for '${pattern}' in init output"; exit 1; }
+
+                  elif [[ "${{matrix.fake_nvidia_smi_script}}" == "cc01" ]]; then
+
+                      pattern="No matching path found in x86_64/amd/zen2 for accelerator detected by archdetect (accel/nvidia/cc01)"
+                      echo ">>> checking for pattern '${pattern}' in init output..."
+                      grep "${pattern}" init.out || { echo "FAILED 1"; exit 1; }
+
+                      pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH"
+                      echo ">>> checking for lack of pattern '${pattern}' in init output..."
+                      match=$(grep "${pattern}" init.out || true)
+                      test "x${match}" = "x" || { echo "unexpected match found for '${pattern}' in init output"; exit 1; }
+
+                  else
+                      echo ">>> checking for 'accel/nvidia/cc80' in init output..."
+                      grep "archdetect found supported accelerator for CPU target x86_64/amd/zen2: accel/nvidia/cc80" init.out || { echo "FAILED 2"; exit 1; }
+                      grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || { echo "FAILED 3"; exit 1; }
+                  fi
+
+                  echo ">>> checking last line of init output..."
+                  tail -1 init.out | grep "Environment set up to use EESSI (2023.06), have fun!" || { echo "FAILED, full init output:"; cat init.out; exit 1; }
+
+                  echo "All checks on init output PASSED"
+              else
+                  echo "Test for '${{matrix.fake_nvidia_smi_script}}' FAILED: '$out'" >&2
+                  exit 1
+              fi
+
+        - name: test accelerator detection under $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE + $EESSI_ACCELERATOR_TARGET_OVERRIDE
+          run: |
+              export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2'
+              export EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen3'
+              export EESSI_ACCELERATOR_TARGET_OVERRIDE='accel/nvidia/cc80'
+
+              # first run with debugging enabled, just to show the output
+              ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?"
+
+              # verify output (or exit code if non-zero)
+              out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?")
+
+              echo
+              . init/bash 2>&1 | tee init.out
+              echo "-----------------------------------------------------------------------------"
+
+              echo ">>> checking for 'accel/nvidia/cc80' in init output..."
+              grep "archdetect found supported accelerator for CPU target x86_64/amd/zen3: accel/nvidia/cc80" init.out || { echo "FAILED 1"; exit 1; }
+              grep "Using x86_64/amd/zen2 as software subdirectory" init.out || { echo "FAILED 2"; exit 1; }
+              grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/modules/all to \$MODULEPATH" init.out || { echo "FAILED 3"; exit 1; }
+              grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || { echo "FAILED 4"; exit 1; }
+
+              echo "All checks on init output PASSED"
diff --git a/tests/archdetect/nvidia-smi/1xa100.output b/tests/archdetect/nvidia-smi/1xa100.output
new file mode 100644
index 0000000000..5eb3aaff18
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/1xa100.output
@@ -0,0 +1 @@
+accel/nvidia/cc80
diff --git a/tests/archdetect/nvidia-smi/1xa100.sh b/tests/archdetect/nvidia-smi/1xa100.sh
new file mode 100755
index 0000000000..ead191418b
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/1xa100.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# output from NVIDIA A100 system,
+# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
+echo "NVIDIA A100-SXM4-80GB, 1, 545.23.08, 8.0"
+exit 0
diff --git a/tests/archdetect/nvidia-smi/2xa100.output b/tests/archdetect/nvidia-smi/2xa100.output
new file mode 100644
index 0000000000..5eb3aaff18
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/2xa100.output
@@ -0,0 +1 @@
+accel/nvidia/cc80
diff --git a/tests/archdetect/nvidia-smi/2xa100.sh b/tests/archdetect/nvidia-smi/2xa100.sh
new file mode 100755
index 0000000000..5539607fbe
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/2xa100.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# output from NVIDIA A100 system,
+# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
+echo "NVIDIA A100-SXM4-80GB, 2, 545.23.08, 8.0"
+echo "NVIDIA A100-SXM4-80GB, 2, 545.23.08, 8.0"
+exit 0
diff --git a/tests/archdetect/nvidia-smi/4xa100.output b/tests/archdetect/nvidia-smi/4xa100.output
new file mode 100644
index 0000000000..5eb3aaff18
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/4xa100.output
@@ -0,0 +1 @@
+accel/nvidia/cc80
diff --git a/tests/archdetect/nvidia-smi/4xa100.sh b/tests/archdetect/nvidia-smi/4xa100.sh
new file mode 100755
index 0000000000..45458ea7bd
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/4xa100.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# output from NVIDIA A100 system,
+# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
+echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0"
+echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0"
+echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0"
+echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0"
+exit 0
diff --git a/tests/archdetect/nvidia-smi/cc01.output b/tests/archdetect/nvidia-smi/cc01.output
new file mode 100644
index 0000000000..9cbf66a131
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/cc01.output
@@ -0,0 +1 @@
+accel/nvidia/cc01
diff --git a/tests/archdetect/nvidia-smi/cc01.sh b/tests/archdetect/nvidia-smi/cc01.sh
new file mode 100755
index 0000000000..81011a1d16
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/cc01.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# output from non-existing NVIDIA GPU system,
+# to test handling of unknown GPU model
+# (supposedly) produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
+echo "NVIDIA does-not-exist, 1, 000.00.00, 0.1"
+exit 0
diff --git a/tests/archdetect/nvidia-smi/no_devices.output b/tests/archdetect/nvidia-smi/no_devices.output
new file mode 100644
index 0000000000..b251bfc837
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/no_devices.output
@@ -0,0 +1 @@
+non-zero exit code: 3
diff --git a/tests/archdetect/nvidia-smi/no_devices.sh b/tests/archdetect/nvidia-smi/no_devices.sh
new file mode 100755
index 0000000000..0bc26dcddc
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/no_devices.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+echo "No devices were found"
+exit 6
diff --git a/tests/archdetect/nvidia-smi/none.output b/tests/archdetect/nvidia-smi/none.output
new file mode 100644
index 0000000000..e287574cc3
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/none.output
@@ -0,0 +1 @@
+non-zero exit code: 2