diff --git a/.github/workflows/tests_archdetect_nvidia_gpu.yml b/.github/workflows/tests_archdetect_nvidia_gpu.yml
new file mode 100644
index 0000000000..8ad5f4fb36
--- /dev/null
+++ b/.github/workflows/tests_archdetect_nvidia_gpu.yml
@@ -0,0 +1,126 @@
+# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions
+name: Tests for accelerator detection (NVIDIA GPU)
+on:
+  push:
+  pull_request:
+permissions:
+  contents: read # to fetch code (actions/checkout)
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        fake_nvidia_smi_script:
+          - none  # no nvidia-smi command
+          - no_devices  # nvidia-smi command works, but no GPUs available
+          - 1xa100  # cc80, supported with (at least) zen2 CPU
+          - 2xa100  # cc80, supported with (at least) zen2 CPU
+          - 4xa100  # cc80, supported with (at least) zen2 CPU
+          - cc01  # non-existing GPU
+      fail-fast: false
+    steps:
+      - name: checkout
+        uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+
+      # we deliberately do not use the eessi/github-action-eessi action,
+      # because we want to control when the EESSI environment is initialized
+      - name: Mount EESSI CernVM-FS repository
+        uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0
+        with:
+          cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
+          cvmfs_http_proxy: DIRECT
+          cvmfs_repositories: software.eessi.io
+
+      - name: test accelerator detection
+        run: |
+          export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2'
+
+          # put fake nvidia-smi command in place (unless we don't want to)
+          if [[ "${{matrix.fake_nvidia_smi_script}}" != "none" ]]; then
+              tmpdir=$(mktemp -d)
+              ln -s "$PWD/tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.sh" "$tmpdir/nvidia-smi"
+              export PATH="$tmpdir:$PATH"
+          fi
+
+          # first run with debugging enabled, just to show the output
+          ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?"
+
+          # verify output (or exit code if non-zero)
+          out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?")
+
+          if [[ $out == "$( cat ./tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.output )" ]]; then
+
+              echo "Test for '${{matrix.fake_nvidia_smi_script}}' PASSED: '$out'"
+
+              # run full EESSI init script, which picks up on the accelerator (if available)
+              echo
+              . init/bash 2>&1 | tee init.out
+              echo "-----------------------------------------------------------------------------"
+
+              if [[ "${{matrix.fake_nvidia_smi_script}}" == "none" ]] || [[ "${{matrix.fake_nvidia_smi_script}}" == "no_devices" ]]; then
+
+                  # a failing check makes the (...) subshell exit with a non-zero code, which is the result of
+                  # the '||' list and thus fails this step (assuming the default 'bash -e' shell for run steps)
+                  pattern="archdetect could not detect any accelerators"
+                  echo ">>> checking for pattern '${pattern}' in init output..."
+                  grep "${pattern}" init.out || (echo "FAILED 1" && exit 1)
+
+                  pattern="archdetect found supported accelerator"
+                  echo ">>> checking for lack of pattern '${pattern}' in init output..."
+                  match=$(grep "${pattern}" init.out || true)
+                  test "x${match}" = "x" || (echo "unexpected match found for '${pattern}' in init output" && exit 1)
+
+                  pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH"
+                  echo ">>> checking for lack of pattern '${pattern}' in init output..."
+                  match=$(grep "${pattern}" init.out || true)
+                  test "x${match}" = "x" || (echo "unexpected match found for '${pattern}' in init output" && exit 1)
+
+              elif [[ "${{matrix.fake_nvidia_smi_script}}" == "cc01" ]]; then
+
+                  pattern="No matching path found in x86_64/amd/zen2 for accelerator detected by archdetect (accel/nvidia/cc01)"
+                  echo ">>> checking for pattern '${pattern}' in init output..."
+                  grep "${pattern}" init.out || (echo "FAILED 1" && exit 1)
+
+                  pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH"
+                  echo ">>> checking for lack of pattern '${pattern}' in init output..."
+                  match=$(grep "${pattern}" init.out || true)
+                  test "x${match}" = "x" || (echo "unexpected match found for '${pattern}' in init output" && exit 1)
+
+              else
+                  echo ">>> checking for 'accel/nvidia/cc80' in init output..."
+                  grep "archdetect found supported accelerator for CPU target x86_64/amd/zen2: accel/nvidia/cc80" init.out || (echo "FAILED 2" && exit 1)
+                  grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || (echo "FAILED 3" && exit 1)
+              fi
+
+              echo ">>> checking last line of init output..."
+              tail -1 init.out | grep "Environment set up to use EESSI (2023.06), have fun!" || (echo "FAILED, full init output:" && cat init.out && exit 1)
+
+              echo "All checks on init output PASSED"
+          else
+              echo "Test for '${{matrix.fake_nvidia_smi_script}}' FAILED: '$out'" >&2
+              exit 1
+          fi
+
+      - name: test accelerator detection under $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE + $EESSI_ACCELERATOR_TARGET_OVERRIDE
+        run: |
+          export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2'
+          export EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen3'
+          export EESSI_ACCELERATOR_TARGET_OVERRIDE='accel/nvidia/cc80'
+
+          # first run with debugging enabled, just to show the output
+          ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?"
+
+          # verify output (or exit code if non-zero)
+          out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?")
+
+          echo
+          . init/bash 2>&1 | tee init.out
+          echo "-----------------------------------------------------------------------------"
+
+          echo ">>> checking for 'accel/nvidia/cc80' in init output..."
+          grep "archdetect found supported accelerator for CPU target x86_64/amd/zen3: accel/nvidia/cc80" init.out || (echo "FAILED 1" && exit 1)
+          grep "Using x86_64/amd/zen2 as software subdirectory" init.out || (echo "FAILED 2" && exit 1)
+          grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/modules/all to \$MODULEPATH" init.out || (echo "FAILED 3" && exit 1)
+          grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || (echo "FAILED 4" && exit 1)
+
+          echo "All checks on init output PASSED"
diff --git a/init/bash b/init/bash
index 4ad09f6a1b..928ac6efdf 100644
--- a/init/bash
+++ b/init/bash
@@ -29,6 +29,11 @@ if [ $? -eq 0 ]; then
   show_msg "Prepending site path $EESSI_SITE_MODULEPATH to \$MODULEPATH..."
   module use $EESSI_SITE_MODULEPATH
 
+  if [ -n "${EESSI_MODULEPATH_ACCEL}" ]; then
+    show_msg "Prepending $EESSI_MODULEPATH_ACCEL to \$MODULEPATH..."
+    module use "$EESSI_MODULEPATH_ACCEL"
+  fi
+
   #show_msg ""
   #show_msg "*** Known problems in the ${EESSI_VERSION} software stack ***"
   #show_msg ""
diff --git a/init/eessi_archdetect.sh b/init/eessi_archdetect.sh
index ad6dce6f9a..2b1534ce62 100755
--- a/init/eessi_archdetect.sh
+++ b/init/eessi_archdetect.sh
@@ -17,7 +17,7 @@ else
     exit 1
 fi
 
-VERSION="1.1.0"
+VERSION="1.2.0"
 
 # default log level: only emit warnings or errors
 LOG_LEVEL="WARN"
@@ -150,8 +150,51 @@ cpupath(){
     fi
 }
 
+# Determine the accelerator target, printed to stdout as 'accel/nvidia/cc<NN>':
+# - $EESSI_ACCELERATOR_TARGET_OVERRIDE is used when it is set (it must match accel/nvidia/cc[0-9][0-9]);
+# - otherwise the compute capability reported by nvidia-smi for the first listed GPU is used.
+# Exits with code 2 if the nvidia-smi command is not found, and with code 3 if the nvidia-smi query fails.
+accelpath() {
+    # If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it
+    log "DEBUG" "accelpath: Override variable set as '$EESSI_ACCELERATOR_TARGET_OVERRIDE' "
+    if [ -n "${EESSI_ACCELERATOR_TARGET_OVERRIDE}" ]; then
+        if [[ "$EESSI_ACCELERATOR_TARGET_OVERRIDE" =~ ^accel/nvidia/cc[0-9][0-9]$ ]]; then
+            echo "${EESSI_ACCELERATOR_TARGET_OVERRIDE}"
+            return 0
+        else
+            log "ERROR" "Value of \$EESSI_ACCELERATOR_TARGET_OVERRIDE should match 'accel/nvidia/cc[0-9][0-9]', but it does not: '$EESSI_ACCELERATOR_TARGET_OVERRIDE'"
+        fi
+        # only reached for an invalid override (if logging the error did not already abort): signal failure
+        return 1
+    fi
+
+    # check for NVIDIA GPUs via nvidia-smi command
+    nvidia_smi=$(command -v nvidia-smi)
+    if [[ $? -eq 0 ]]; then
+        log "DEBUG" "accelpath: nvidia-smi command found @ ${nvidia_smi}"
+        nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX)
+        # redirect stdout first and stderr second, so both end up in the file rather than in the result on stdout
+        nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader > "${nvidia_smi_out}" 2>&1
+        if [[ $? -eq 0 ]]; then
+            nvidia_smi_info=$(head -1 "${nvidia_smi_out}")
+            cuda_cc=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g')
+            log "DEBUG" "accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"
+            res="accel/nvidia/cc${cuda_cc}"
+            log "DEBUG" "accelpath: result: ${res}"
+            echo "${res}"
+            rm -f "${nvidia_smi_out}"
+        else
+            log "DEBUG" "accelpath: nvidia-smi command failed, see output in $nvidia_smi_out"
+            exit 3
+        fi
+    else
+        log "DEBUG" "accelpath: nvidia-smi command not found"
+        exit 2
+    fi
+}
+
 # Parse command line arguments
-USAGE="Usage: eessi_archdetect.sh [-h][-d][-a] "
+USAGE="Usage: eessi_archdetect.sh [-h][-d][-a] (cpupath|accelpath)"
 
 while getopts 'hdva' OPTION; do
     case "$OPTION" in
@@ -168,5 +211,6 @@ ARGUMENT=${1:-none}
 
 case "$ARGUMENT" in
     "cpupath") cpupath; exit;;
-    *) echo "$USAGE"; log "ERROR" "Missing argument (possible actions: 'cpupath')";;
+    "accelpath") accelpath; exit;;
+    *) echo "$USAGE"; log "ERROR" "Missing argument (possible actions: 'cpupath', 'accelpath')";;
 esac
diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables
index 8c10b1fca8..ab4894fb16 100644
--- a/init/eessi_environment_variables
+++ b/init/eessi_environment_variables
@@ -38,6 +38,45 @@ if [ -d
$EESSI_PREFIX ]; then
         break
       fi
     done
+
+    # we need to make sure that errexit shell option (set -e) is not enabled,
+    # since archdetect will produce non-zero exit code if no accelerator was found
+    if [[ "$-" =~ e ]]; then
+        errexit_shell_option_set='yes'
+        set +e
+    else
+        errexit_shell_option_set='no'
+    fi
+
+    # to be able to grab exit code of archdetect trying to detect accelerators,
+    # we can not run it via $(...), so we redirect its stdout to a temporary file (stderr is passed on to stdout)
+    tmpout=$(mktemp)
+    "${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh" accelpath 2>&1 > "$tmpout"
+    accelpath_exit_code=$?
+
+    if [[ "$errexit_shell_option_set" == "yes" ]]; then
+        set -e
+    fi
+
+    if [[ $accelpath_exit_code -eq 0 ]]; then
+        export EESSI_ACCEL_SUBDIR=$(tail -1 "$tmpout" && rm -f "$tmpout")
+        if [ -z "${EESSI_ACCEL_SUBDIR}" ]; then
+            error "accelerator detection with archdetect worked, but no result was returned?!"
+        else
+            # allow specifying different parent directory for accel/* subdirectory via $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE
+            EESSI_ACCEL_SOFTWARE_SUBDIR=${EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE:-$EESSI_SOFTWARE_SUBDIR}
+            # path to where accel/* subdirectory is located
+            EESSI_ACCEL_SOFTWARE_PATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_ACCEL_SOFTWARE_SUBDIR}
+            if [ -d "$EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR}" ]; then
+                show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCEL_SUBDIR}"
+            else
+                show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCEL_SUBDIR})"
+            fi
+        fi
+    else
+        show_msg "archdetect could not detect any accelerators"
+        rm -f "$tmpout"
+    fi
   elif [ "$EESSI_USE_ARCHSPEC" == "1" ]; then
     # note: eessi_software_subdir_for_host.py will pick up value from $EESSI_SOFTWARE_SUBDIR_OVERRIDE if it's defined!
     export EESSI_EPREFIX_PYTHON=$EESSI_EPREFIX/usr/bin/python3
@@ -106,6 +145,12 @@ if [ -d $EESSI_PREFIX ]; then
       false
     fi
 
+    # only look for accelerator modules when an accelerator subdirectory was actually determined
+    if [ -n "${EESSI_ACCEL_SUBDIR}" ] && [ -d "${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR}" ]; then
+      export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR}
+      show_msg "Using ${EESSI_MODULEPATH_ACCEL} as additional directory (for accelerators) to be added to MODULEPATH."
+    fi
+
     # Fix wrong path for RHEL >=8 libcurl
     # This is required here because we ship curl in our compat layer. If we only provided
     # curl as a module file we could instead do this via a `modluafooter` in an EasyBuild
diff --git a/tests/archdetect/nvidia-smi/1xa100.output b/tests/archdetect/nvidia-smi/1xa100.output
new file mode 100644
index 0000000000..5eb3aaff18
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/1xa100.output
@@ -0,0 +1 @@
+accel/nvidia/cc80
diff --git a/tests/archdetect/nvidia-smi/1xa100.sh b/tests/archdetect/nvidia-smi/1xa100.sh
new file mode 100755
index 0000000000..ead191418b
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/1xa100.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# output from NVIDIA A100 system,
+# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
+echo "NVIDIA A100-SXM4-80GB, 1, 545.23.08, 8.0"
+exit 0
diff --git a/tests/archdetect/nvidia-smi/2xa100.output b/tests/archdetect/nvidia-smi/2xa100.output
new file mode 100644
index 0000000000..5eb3aaff18
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/2xa100.output
@@ -0,0 +1 @@
+accel/nvidia/cc80
diff --git a/tests/archdetect/nvidia-smi/2xa100.sh b/tests/archdetect/nvidia-smi/2xa100.sh
new file mode 100755
index 0000000000..5539607fbe
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/2xa100.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# output from NVIDIA A100 system,
+# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
+echo "NVIDIA A100-SXM4-80GB, 2, 545.23.08, 8.0"
+echo "NVIDIA A100-SXM4-80GB, 2, 545.23.08, 8.0"
+exit 0
diff --git a/tests/archdetect/nvidia-smi/4xa100.output b/tests/archdetect/nvidia-smi/4xa100.output
new file mode 100644
index 0000000000..5eb3aaff18
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/4xa100.output
@@ -0,0 +1 @@
+accel/nvidia/cc80
diff --git a/tests/archdetect/nvidia-smi/4xa100.sh b/tests/archdetect/nvidia-smi/4xa100.sh
new file mode 100755
index 0000000000..45458ea7bd
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/4xa100.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# output from NVIDIA A100 system,
+# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
+echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0"
+echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0"
+echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0"
+echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0"
+exit 0
diff --git a/tests/archdetect/nvidia-smi/cc01.output b/tests/archdetect/nvidia-smi/cc01.output
new file mode 100644
index 0000000000..9cbf66a131
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/cc01.output
@@ -0,0 +1 @@
+accel/nvidia/cc01
diff --git a/tests/archdetect/nvidia-smi/cc01.sh b/tests/archdetect/nvidia-smi/cc01.sh
new file mode 100755
index 0000000000..81011a1d16
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/cc01.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# output from non-existing NVIDIA GPU system,
+# to test handling of unknown GPU model
+# (supposedly) produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader
+echo "NVIDIA does-not-exist, 1, 000.00.00, 0.1"
+exit 0
diff --git a/tests/archdetect/nvidia-smi/no_devices.output b/tests/archdetect/nvidia-smi/no_devices.output
new file mode 100644
index 0000000000..b251bfc837
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/no_devices.output
@@ -0,0 +1 @@
+non-zero exit code: 3
diff --git a/tests/archdetect/nvidia-smi/no_devices.sh b/tests/archdetect/nvidia-smi/no_devices.sh
new file mode 100755
index 0000000000..0bc26dcddc
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/no_devices.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+echo "No devices were found"
+exit 6
diff --git a/tests/archdetect/nvidia-smi/none.output b/tests/archdetect/nvidia-smi/none.output
new file mode 100644
index 0000000000..e287574cc3
--- /dev/null
+++ b/tests/archdetect/nvidia-smi/none.output
@@ -0,0 +1 @@
+non-zero exit code: 2