From 67bed257d099da38a303a175024c44cbba3a7650 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Mon, 30 Sep 2024 18:28:16 +0200 Subject: [PATCH 1/8] enhance archdetect to support detection of NVIDIA GPUs + using that in EESSI init script --- init/bash | 5 +++++ init/eessi_archdetect.sh | 22 ++++++++++++++++++++-- init/eessi_environment_variables | 14 ++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/init/bash b/init/bash index 4ad09f6a1b..928ac6efdf 100644 --- a/init/bash +++ b/init/bash @@ -29,6 +29,11 @@ if [ $? -eq 0 ]; then show_msg "Prepending site path $EESSI_SITE_MODULEPATH to \$MODULEPATH..." module use $EESSI_SITE_MODULEPATH + if [ ! -z ${EESSI_MODULEPATH_ACCEL} ]; then + show_msg "Prepending $EESSI_MODULEPATH_ACCEL to \$MODULEPATH..." + module use $EESSI_MODULEPATH_ACCEL + fi + #show_msg "" #show_msg "*** Known problems in the ${EESSI_VERSION} software stack ***" #show_msg "" diff --git a/init/eessi_archdetect.sh b/init/eessi_archdetect.sh index ad6dce6f9a..76de7cace9 100755 --- a/init/eessi_archdetect.sh +++ b/init/eessi_archdetect.sh @@ -150,8 +150,25 @@ cpupath(){ fi } +accelpath() { + # If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it + log "DEBUG" "accelpath: Override variable set as '$EESSI_ACCELERATOR_TARGET_OVERRIDE' " + [ $EESSI_ACCELERATOR_TARGET_OVERRIDE ] && echo ${EESSI_ACCELERATOR_TARGET_OVERRIDE} && exit + + # check for NVIDIA GPUs via nvidia-smi command + nvidia_smi=$(command -v nvidia-smi) + if [[ $? 
-eq 0 ]]; then + log "DEBUG" "accelpath: nvidia-smi command found @ ${nvidia_smi}" + gpu_info=$(nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader | head -1) + cuda_cc=$(echo $gpu_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g') + echo "accel/nvidia/cc${cuda_cc}" + else + log "DEBUG" "accelpath: nvidia-smi command not found" + fi +} + # Parse command line arguments -USAGE="Usage: eessi_archdetect.sh [-h][-d][-a] " +USAGE="Usage: eessi_archdetect.sh [-h][-d][-a] " while getopts 'hdva' OPTION; do case "$OPTION" in @@ -168,5 +185,6 @@ ARGUMENT=${1:-none} case "$ARGUMENT" in "cpupath") cpupath; exit;; - *) echo "$USAGE"; log "ERROR" "Missing argument (possible actions: 'cpupath')";; + "accelpath") accelpath; exit;; + *) echo "$USAGE"; log "ERROR" "Missing argument (possible actions: 'cpupath', 'accelpath')";; esac diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 8c10b1fca8..aad5fe5003 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -38,6 +38,14 @@ if [ -d $EESSI_PREFIX ]; then break fi done + export EESSI_ACCEL_PATH=$(${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh accelpath) + if [ -z ${EESSI_ACCEL_PATH} ]; then + show_msg "archdetect could not find any accelerators" + else + if [ -d ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR}/${EESSI_ACCEL_PATH} ]; then + show_msg "archdetect found accelerator: ${EESSI_ACCEL_PATH}" + fi + fi elif [ "$EESSI_USE_ARCHSPEC" == "1" ]; then # note: eessi_software_subdir_for_host.py will pick up value from $EESSI_SOFTWARE_SUBDIR_OVERRIDE if it's defined! 
export EESSI_EPREFIX_PYTHON=$EESSI_EPREFIX/usr/bin/python3 @@ -106,6 +114,12 @@ if [ -d $EESSI_PREFIX ]; then false fi + EESSI_MODULEPATH_ACCEL=${EESSI_SOFTWARE_PATH}/${EESSI_ACCEL_PATH}/${EESSI_MODULE_SUBDIR} + if [ -d ${EESSI_MODULEPATH_ACCEL} ]; then + export EESSI_MODULEPATH_ACCEL=${EESSI_MODULEPATH_ACCEL} + show_msg "Using ${EESSI_MODULEPATH_ACCEL} as additional directory (for accelerators) to be added to MODULEPATH." + fi + # Fix wrong path for RHEL >=8 libcurl # This is required here because we ship curl in our compat layer. If we only provided # curl as a module file we could instead do this via a `modluafooter` in an EasyBuild From 65710351ec20e715c074c13132851169b4a1419f Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 3 Oct 2024 11:11:35 +0200 Subject: [PATCH 2/8] introduce $EESSI_GPU_SOFTWARE_PATH which can be overridden via $EESSI_GPU_SOFTWARE_SUBDIR_OVERRIDE Co-authored-by: ocaisa --- init/eessi_environment_variables | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index aad5fe5003..aa066b924c 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -42,7 +42,9 @@ if [ -d $EESSI_PREFIX ]; then if [ -z ${EESSI_ACCEL_PATH} ]; then show_msg "archdetect could not find any accelerators" else - if [ -d ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR}/${EESSI_ACCEL_PATH} ]; then + EESSI_GPU_SOFTWARE_SUBDIR=${$EESSI_GPU_SOFTWARE_SUBDIR_OVERRIDE:-${EESSI_SOFTWARE_SUBDIR}} + EESSI_GPU_SOFTWARE_PATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_GPU_SOFTWARE_SUBDIR} + if [ -d ${EESSI_GPU_SOFTWARE_PATH}/${EESSI_ACCEL_PATH} ]; then show_msg "archdetect found accelerator: ${EESSI_ACCEL_PATH}" fi fi @@ -114,7 +116,7 @@ if [ -d $EESSI_PREFIX ]; then false fi - EESSI_MODULEPATH_ACCEL=${EESSI_SOFTWARE_PATH}/${EESSI_ACCEL_PATH}/${EESSI_MODULE_SUBDIR} + 
EESSI_MODULEPATH_ACCEL=${EESSI_GPU_SOFTWARE_PATH}/${EESSI_ACCEL_PATH}/${EESSI_MODULE_SUBDIR} if [ -d ${EESSI_MODULEPATH_ACCEL} ]; then export EESSI_MODULEPATH_ACCEL=${EESSI_MODULEPATH_ACCEL} show_msg "Using ${EESSI_MODULEPATH_ACCEL} as additional directory (for accelerators) to be added to MODULEPATH." From 228eaa29372ae5f46b9b7d56ab505a86a5a01025 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 3 Oct 2024 19:29:54 +0200 Subject: [PATCH 3/8] use non-zero exit code in archdetect if nvidia-smi was not found or failed to run + take that into account in EESSI init script + allow overriding software subdirectory for accel/* via $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE --- init/eessi_archdetect.sh | 19 +++++++++++++++---- init/eessi_environment_variables | 32 +++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/init/eessi_archdetect.sh b/init/eessi_archdetect.sh index 76de7cace9..be3154a62f 100755 --- a/init/eessi_archdetect.sh +++ b/init/eessi_archdetect.sh @@ -17,7 +17,7 @@ else exit 1 fi -VERSION="1.1.0" +VERSION="1.2.0" # default log level: only emit warnings or errors LOG_LEVEL="WARN" @@ -159,11 +159,22 @@ accelpath() { nvidia_smi=$(command -v nvidia-smi) if [[ $? -eq 0 ]]; then log "DEBUG" "accelpath: nvidia-smi command found @ ${nvidia_smi}" - gpu_info=$(nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader | head -1) - cuda_cc=$(echo $gpu_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g') - echo "accel/nvidia/cc${cuda_cc}" + nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX) + nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2>&1 > $nvidia_smi_out + if [[ $? 
-eq 0 ]]; then + nvidia_smi_info=$(head -1 $nvidia_smi_out) + cuda_cc=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g') + log "DEBUG" "accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'" + res="accel/nvidia/cc${cuda_cc}" + log "DEBUG" "accelpath: result: ${res}" + echo $res + rm -f $nvidia_smi_out + else + log "ERROR" "accelpath: nvidia-smi command failed, see output in $nvidia_smi_out" + fi else log "DEBUG" "accelpath: nvidia-smi command not found" + exit 2 fi } diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index aa066b924c..2003949369 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -38,15 +38,29 @@ if [ -d $EESSI_PREFIX ]; then break fi done - export EESSI_ACCEL_PATH=$(${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh accelpath) - if [ -z ${EESSI_ACCEL_PATH} ]; then - show_msg "archdetect could not find any accelerators" - else - EESSI_GPU_SOFTWARE_SUBDIR=${$EESSI_GPU_SOFTWARE_SUBDIR_OVERRIDE:-${EESSI_SOFTWARE_SUBDIR}} - EESSI_GPU_SOFTWARE_PATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_GPU_SOFTWARE_SUBDIR} - if [ -d ${EESSI_GPU_SOFTWARE_PATH}/${EESSI_ACCEL_PATH} ]; then - show_msg "archdetect found accelerator: ${EESSI_ACCEL_PATH}" + # to be able to grab exit code of archdetect trying to detect accelerators, + # we can not run it via $(...), so we have to redirect the output to a temporary file + tmpout=$(mktemp) + ${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh accelpath 2>&1 > $tmpout + ec=$? + if [[ $ec -eq 0 ]]; then + export EESSI_ACCEL_SUBDIR=$(tail -1 $tmpout && rm -f $tmpout) + if [ -z ${EESSI_ACCEL_SUBDIR} ]; then + error "accelerator detection with archdetect worked, but no result was returned?!" 
+ else + # allow specifying different parent directory for accel/* subdirectory via $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE + EESSI_ACCEL_SOFTWARE_SUBDIR=${EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE:-$EESSI_SOFTWARE_SUBDIR} + # path to where accel/* subdirectory is located + EESSI_ACCEL_SOFTWARE_PATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_ACCEL_SOFTWARE_SUBDIR} + if [ -d $EESSI_ACCEL_SOFTWARE_PATH/${EESSI_ACCEL_SUBDIR} ]; then + show_msg "archdetect found supported accelerator for CPU target ${EESSI_ACCEL_SOFTWARE_SUBDIR}: ${EESSI_ACCEL_SUBDIR}" + else + show_msg "No matching path found in ${EESSI_ACCEL_SOFTWARE_SUBDIR} for accelerator detected by archdetect (${EESSI_ACCEL_SUBDIR})" + fi fi + else + show_msg "archdetect could not detect any accelerators" + rm -f $tmpout fi elif [ "$EESSI_USE_ARCHSPEC" == "1" ]; then # note: eessi_software_subdir_for_host.py will pick up value from $EESSI_SOFTWARE_SUBDIR_OVERRIDE if it's defined! @@ -116,7 +130,7 @@ if [ -d $EESSI_PREFIX ]; then false fi - EESSI_MODULEPATH_ACCEL=${EESSI_GPU_SOFTWARE_PATH}/${EESSI_ACCEL_PATH}/${EESSI_MODULE_SUBDIR} + EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR} if [ -d ${EESSI_MODULEPATH_ACCEL} ]; then export EESSI_MODULEPATH_ACCEL=${EESSI_MODULEPATH_ACCEL} show_msg "Using ${EESSI_MODULEPATH_ACCEL} as additional directory (for accelerators) to be added to MODULEPATH." 
From f823375c315e389f9c4af00e1fe77ecaed2d5a8f Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 3 Oct 2024 19:35:58 +0200 Subject: [PATCH 4/8] sanity check value of $EESSI_ACCELERATOR_TARGET_OVERRIDE, if it is set, must be 'accel/nvidia/cc[0-9][0-9]' --- init/eessi_archdetect.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/init/eessi_archdetect.sh b/init/eessi_archdetect.sh index be3154a62f..87d47364f7 100755 --- a/init/eessi_archdetect.sh +++ b/init/eessi_archdetect.sh @@ -153,7 +153,15 @@ cpupath(){ accelpath() { # If EESSI_ACCELERATOR_TARGET_OVERRIDE is set, use it log "DEBUG" "accelpath: Override variable set as '$EESSI_ACCELERATOR_TARGET_OVERRIDE' " - [ $EESSI_ACCELERATOR_TARGET_OVERRIDE ] && echo ${EESSI_ACCELERATOR_TARGET_OVERRIDE} && exit + if [ ! -z $EESSI_ACCELERATOR_TARGET_OVERRIDE ]; then + if [[ "$EESSI_ACCELERATOR_TARGET_OVERRIDE" =~ ^accel/nvidia/cc[0-9][0-9]$ ]]; then + echo ${EESSI_ACCELERATOR_TARGET_OVERRIDE} + return 0 + else + log "ERROR" "Value of \$EESSI_ACCELERATOR_TARGET_OVERRIDE should match 'accel/nvidia/cc[0-9[0-9]', but it does not: '$EESSI_ACCELERATOR_TARGET_OVERRIDE'" + fi + return 0 + fi # check for NVIDIA GPUs via nvidia-smi command nvidia_smi=$(command -v nvidia-smi) From b04cc298e0e593a85e838eb25696dc1ab8962bf9 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 3 Oct 2024 19:54:59 +0200 Subject: [PATCH 5/8] take into account that nvidia-smi command may be available, but fail with 'No devices were found' if no GPUs are available in Slurm job --- init/eessi_archdetect.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/init/eessi_archdetect.sh b/init/eessi_archdetect.sh index 87d47364f7..2b1534ce62 100755 --- a/init/eessi_archdetect.sh +++ b/init/eessi_archdetect.sh @@ -178,7 +178,8 @@ accelpath() { echo $res rm -f $nvidia_smi_out else - log "ERROR" "accelpath: nvidia-smi command failed, see output in $nvidia_smi_out" + log "DEBUG" "accelpath: nvidia-smi command failed, see 
output in $nvidia_smi_out" + exit 3 fi else log "DEBUG" "accelpath: nvidia-smi command not found" From d9bca16a1a0965b6ed2f25ae4b7d07f35168f614 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 3 Oct 2024 19:56:59 +0200 Subject: [PATCH 6/8] only set $EESSI_MODULEPATH_ACCEL if directory exists --- init/eessi_environment_variables | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 2003949369..fb1b731ee1 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -130,9 +130,8 @@ if [ -d $EESSI_PREFIX ]; then false fi - EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR} - if [ -d ${EESSI_MODULEPATH_ACCEL} ]; then - export EESSI_MODULEPATH_ACCEL=${EESSI_MODULEPATH_ACCEL} + if [ -d ${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR} ]; then + export EESSI_MODULEPATH_ACCEL=${EESSI_ACCEL_SOFTWARE_PATH}/${EESSI_ACCEL_SUBDIR}/${EESSI_MODULE_SUBDIR} show_msg "Using ${EESSI_MODULEPATH_ACCEL} as additional directory (for accelerators) to be added to MODULEPATH." 
fi From 24f0620e2b0df2af156df907fcd32a873ded5a35 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Sun, 6 Oct 2024 13:45:58 +0200 Subject: [PATCH 7/8] add CI workflow for testing NVIDIA GPU accelerator detection --- .../workflows/tests_archdetect_nvidia_gpu.yml | 132 ++++++++++++++++++ tests/archdetect/nvidia-smi/1xa100.output | 1 + tests/archdetect/nvidia-smi/1xa100.sh | 5 + tests/archdetect/nvidia-smi/2xa100.output | 1 + tests/archdetect/nvidia-smi/2xa100.sh | 6 + tests/archdetect/nvidia-smi/4xa100.output | 1 + tests/archdetect/nvidia-smi/4xa100.sh | 8 ++ tests/archdetect/nvidia-smi/cc01.output | 1 + tests/archdetect/nvidia-smi/cc01.sh | 6 + tests/archdetect/nvidia-smi/no_devices.output | 1 + tests/archdetect/nvidia-smi/no_devices.sh | 3 + tests/archdetect/nvidia-smi/none.output | 1 + 12 files changed, 166 insertions(+) create mode 100644 .github/workflows/tests_archdetect_nvidia_gpu.yml create mode 100644 tests/archdetect/nvidia-smi/1xa100.output create mode 100755 tests/archdetect/nvidia-smi/1xa100.sh create mode 100644 tests/archdetect/nvidia-smi/2xa100.output create mode 100755 tests/archdetect/nvidia-smi/2xa100.sh create mode 100644 tests/archdetect/nvidia-smi/4xa100.output create mode 100755 tests/archdetect/nvidia-smi/4xa100.sh create mode 100644 tests/archdetect/nvidia-smi/cc01.output create mode 100755 tests/archdetect/nvidia-smi/cc01.sh create mode 100644 tests/archdetect/nvidia-smi/no_devices.output create mode 100755 tests/archdetect/nvidia-smi/no_devices.sh create mode 100644 tests/archdetect/nvidia-smi/none.output diff --git a/.github/workflows/tests_archdetect_nvidia_gpu.yml b/.github/workflows/tests_archdetect_nvidia_gpu.yml new file mode 100644 index 0000000000..d8b5adeb88 --- /dev/null +++ b/.github/workflows/tests_archdetect_nvidia_gpu.yml @@ -0,0 +1,132 @@ +# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions +name: Tests for accelerator detection (NVIDIA GPU) +on: + push: + pull_request: +permissions: + 
contents: read # to fetch code (actions/checkout) +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + fake_nvidia_smi_script: + - none # no nvidia-smi command + - no_devices # nvidia-smi command works, but no GPUs available + - 1xa100 # cc80, supported with (at least) zen2 CPU + - 2xa100 # cc80, supported with (at least) zen2 CPU + - 4xa100 # cc80, supported with (at least) zen2 CPU + - cc01 # non-existing GPU + fail-fast: false + steps: + - name: checkout + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + + # we deliberately do not use the eessi/github-action-eessi action, + # because we want to control when the EESSI environment is initialized + - name: Mount EESSI CernVM-FS repository + uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0 + with: + cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb + cvmfs_http_proxy: DIRECT + cvmfs_repositories: software.eessi.io + + - name: test accelerator detection + run: | + export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2' + + # put fake nvidia-smi command in place (unless we don't want to) + if [[ "${{matrix.fake_nvidia_smi_script}}" != "none" ]]; then + tmpdir=$(mktemp -d) + ln -s $PWD/tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.sh $tmpdir/nvidia-smi + export PATH=$tmpdir:$PATH + fi + + # first run with debugging enabled, just to show the output + ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?" 
+ + # verify output (or exit code if non-zero) + out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?") + + if [[ $out == "$( cat ./tests/archdetect/nvidia-smi/${{matrix.fake_nvidia_smi_script}}.output )" ]]; then + + echo "Test for '${{matrix.fake_nvidia_smi_script}}' PASSED: '$out'" + + # by default the 'errexit' option is enabled (set -e), + # which causes trouble when 'eessi_archdetect.sh accelpath' + # fails to detect an accelerator and produces a non-zero exit code, + # so we have to unset it using 'set +e' + echo "set flags: $-" + set +e + echo "set flags after unsetting errexit option: $-" + + # run full EESSI init script, which pick up on the accelerator (if available) + echo + . init/bash 2>&1 | tee init.out + echo "-----------------------------------------------------------------------------" + + if [[ "${{matrix.fake_nvidia_smi_script}}" == "none" ]] || [[ "${{matrix.fake_nvidia_smi_script}}" == "no_devices" ]]; then + + pattern="archdetect could not detect any accelerators" + echo ">>> checking for pattern '${pattern}' in init output..." + grep "${pattern}" init.out || (echo "FAILED 1" || exit 1) + + pattern="archdetect found supported accelerator" + echo ">>> checking for lack of pattern '${pattern}' in init output..." + match=$(grep "${pattern}" init.out || true) + test "x${match}" = "x" || (echo "unexpected match found for '${pattern}' in init output" && exit 1) + + pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH" + echo ">>> checking for lack of pattern '${pattern}' in init output..." 
+ match=$(grep "${pattern}" init.out || true) + test "x${match}" = "x" || (echo "unexpected match found for '${pattern}' in init output" && exit 1) + + elif [[ "${{matrix.fake_nvidia_smi_script}}" == "cc01" ]]; then + + pattern="No matching path found in x86_64/amd/zen2 for accelerator detected by archdetect (accel/nvidia/cc01)" + echo ">>> checking for pattern '${pattern}' in init output..." + grep "${pattern}" init.out || (echo "FAILED 1" || exit 1) + + pattern="Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/.*/accel/.*/modules/all to \$MODULEPATH" + echo ">>> checking for lack of pattern '${pattern}' in init output..." + match=$(grep "${pattern}" init.out || true) + test "x${match}" = "x" || (echo "unexpected match found for '${pattern}' in init output" && exit 1) + + else + echo ">>> checking for 'accel/nvidia/cc80' in init output..." + grep "archdetect found supported accelerator for CPU target x86_64/amd/zen2: accel/nvidia/cc80" init.out || (echo "FAILED 2" && exit 1) + grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux x86_64/amd/zen2/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || (echo "FAILED 3" && exit 1) + fi + + echo ">>> checking last line of init output..." + tail -1 init.out | grep "Environment set up to use EESSI (2023.06), have fun!" || (echo "FAILED, full init utput:" && cat init.out && exit 1) + + echo "All checks on init output PASSED" + else + echo "Test for '${{matrix.fake_nvidia_smi_script}}' FAILED: '$out'" >&2 + exit 1 + fi + + - name: test accelerator detection under $EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE + $EESSI_ACCELERATOR_TARGET_OVERRIDE + run: | + export EESSI_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen2' + export EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE='x86_64/amd/zen3' + export EESSI_ACCELERATOR_TARGET_OVERRIDE='accel/nvidia/cc80' + + # first run with debugging enabled, just to show the output + ./init/eessi_archdetect.sh -d accelpath || echo "non-zero exit code: $?" 
+ + # verify output (or exit code if non-zero) + out=$(./init/eessi_archdetect.sh accelpath || echo "non-zero exit code: $?") + + echo + . init/bash 2>&1 | tee init.out + echo "-----------------------------------------------------------------------------" + + echo ">>> checking for 'accel/nvidia/cc80' in init output..." + grep "archdetect found supported accelerator for CPU target x86_64/amd/zen3: accel/nvidia/cc80" init.out || (echo "FAILED 1" && exit 1) + grep "Using x86_64/amd/zen2 as software subdirectory" init.out || (echo "FAILED 2" && exit 1) + grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/modules/all to \$MODULEPATH" init.out || (echo "FAILED 3" && exit 1) + grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || (echo "FAILED 4" && exit 1) + + echo "All checks on init output PASSED" diff --git a/tests/archdetect/nvidia-smi/1xa100.output b/tests/archdetect/nvidia-smi/1xa100.output new file mode 100644 index 0000000000..5eb3aaff18 --- /dev/null +++ b/tests/archdetect/nvidia-smi/1xa100.output @@ -0,0 +1 @@ +accel/nvidia/cc80 diff --git a/tests/archdetect/nvidia-smi/1xa100.sh b/tests/archdetect/nvidia-smi/1xa100.sh new file mode 100755 index 0000000000..ead191418b --- /dev/null +++ b/tests/archdetect/nvidia-smi/1xa100.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# output from NVIDIA A100 system, +# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader +echo "NVIDIA A100-SXM4-80GB, 1, 545.23.08, 8.0" +exit 0 diff --git a/tests/archdetect/nvidia-smi/2xa100.output b/tests/archdetect/nvidia-smi/2xa100.output new file mode 100644 index 0000000000..5eb3aaff18 --- /dev/null +++ b/tests/archdetect/nvidia-smi/2xa100.output @@ -0,0 +1 @@ +accel/nvidia/cc80 diff --git a/tests/archdetect/nvidia-smi/2xa100.sh b/tests/archdetect/nvidia-smi/2xa100.sh new file mode 100755 index 0000000000..5539607fbe --- 
/dev/null +++ b/tests/archdetect/nvidia-smi/2xa100.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# output from NVIDIA A100 system, +# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader +echo "NVIDIA A100-SXM4-80GB, 2, 545.23.08, 8.0" +echo "NVIDIA A100-SXM4-80GB, 2, 545.23.08, 8.0" +exit 0 diff --git a/tests/archdetect/nvidia-smi/4xa100.output b/tests/archdetect/nvidia-smi/4xa100.output new file mode 100644 index 0000000000..5eb3aaff18 --- /dev/null +++ b/tests/archdetect/nvidia-smi/4xa100.output @@ -0,0 +1 @@ +accel/nvidia/cc80 diff --git a/tests/archdetect/nvidia-smi/4xa100.sh b/tests/archdetect/nvidia-smi/4xa100.sh new file mode 100755 index 0000000000..45458ea7bd --- /dev/null +++ b/tests/archdetect/nvidia-smi/4xa100.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# output from NVIDIA A100 system, +# produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader +echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0" +echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0" +echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0" +echo "NVIDIA A100-SXM4-80GB, 4, 545.23.08, 8.0" +exit 0 diff --git a/tests/archdetect/nvidia-smi/cc01.output b/tests/archdetect/nvidia-smi/cc01.output new file mode 100644 index 0000000000..9cbf66a131 --- /dev/null +++ b/tests/archdetect/nvidia-smi/cc01.output @@ -0,0 +1 @@ +accel/nvidia/cc01 diff --git a/tests/archdetect/nvidia-smi/cc01.sh b/tests/archdetect/nvidia-smi/cc01.sh new file mode 100755 index 0000000000..81011a1d16 --- /dev/null +++ b/tests/archdetect/nvidia-smi/cc01.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# output from non-existing NVIDIA GPU system, +# to test handling of unknown GPU model +# (supposedly) produced by: nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader +echo "NVIDIA does-not-exist, 1, 000.00.00, 0.1" +exit 0 diff --git a/tests/archdetect/nvidia-smi/no_devices.output b/tests/archdetect/nvidia-smi/no_devices.output new file mode 100644 index 
0000000000..b251bfc837 --- /dev/null +++ b/tests/archdetect/nvidia-smi/no_devices.output @@ -0,0 +1 @@ +non-zero exit code: 3 diff --git a/tests/archdetect/nvidia-smi/no_devices.sh b/tests/archdetect/nvidia-smi/no_devices.sh new file mode 100755 index 0000000000..0bc26dcddc --- /dev/null +++ b/tests/archdetect/nvidia-smi/no_devices.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo "No devices were found" +exit 6 diff --git a/tests/archdetect/nvidia-smi/none.output b/tests/archdetect/nvidia-smi/none.output new file mode 100644 index 0000000000..e287574cc3 --- /dev/null +++ b/tests/archdetect/nvidia-smi/none.output @@ -0,0 +1 @@ +non-zero exit code: 2 From 3bf3b12e8536e04eaa4027ceaaf68782f41b3733 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Sun, 6 Oct 2024 15:15:24 +0200 Subject: [PATCH 8/8] temporarily disable errexit shell option when calling out to archdetect to detect accelerator --- .../workflows/tests_archdetect_nvidia_gpu.yml | 10 +--------- init/eessi_environment_variables | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.github/workflows/tests_archdetect_nvidia_gpu.yml b/.github/workflows/tests_archdetect_nvidia_gpu.yml index d8b5adeb88..8ad5f4fb36 100644 --- a/.github/workflows/tests_archdetect_nvidia_gpu.yml +++ b/.github/workflows/tests_archdetect_nvidia_gpu.yml @@ -52,14 +52,6 @@ jobs: echo "Test for '${{matrix.fake_nvidia_smi_script}}' PASSED: '$out'" - # by default the 'errexit' option is enabled (set -e), - # which causes trouble when 'eessi_archdetect.sh accelpath' - # fails to detect an accelerator and produces a non-zero exit code, - # so we have to unset it using 'set +e' - echo "set flags: $-" - set +e - echo "set flags after unsetting errexit option: $-" - # run full EESSI init script, which pick up on the accelerator (if available) echo . init/bash 2>&1 | tee init.out @@ -95,7 +87,7 @@ jobs: else echo ">>> checking for 'accel/nvidia/cc80' in init output..." 
grep "archdetect found supported accelerator for CPU target x86_64/amd/zen2: accel/nvidia/cc80" init.out || (echo "FAILED 2" && exit 1) - grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux x86_64/amd/zen2/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || (echo "FAILED 3" && exit 1) + grep "Prepending /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen2/accel/nvidia/cc80/modules/all to \$MODULEPATH" init.out || (echo "FAILED 3" && exit 1) fi echo ">>> checking last line of init output..." diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index fb1b731ee1..ab4894fb16 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -38,12 +38,27 @@ if [ -d $EESSI_PREFIX ]; then break fi done + + # we need to make sure that errexit shell option (set -e) is not enabled, + # since archdetect will produce non-zero exit code if no accelerator was found + if [[ "$-" =~ e ]]; then + errexit_shell_option_set='yes' + set +e + else + errexit_shell_option_set='no' + fi + # to be able to grab exit code of archdetect trying to detect accelerators, # we can not run it via $(...), so we have to redirect the output to a temporary file tmpout=$(mktemp) ${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh accelpath 2>&1 > $tmpout - ec=$? - if [[ $ec -eq 0 ]]; then + accelpath_exit_code=$? + + if [[ "$errexit_shell_option_set" == "yes" ]]; then + set -e + fi + + if [[ $accelpath_exit_code -eq 0 ]]; then export EESSI_ACCEL_SUBDIR=$(tail -1 $tmpout && rm -f $tmpout) if [ -z ${EESSI_ACCEL_SUBDIR} ]; then error "accelerator detection with archdetect worked, but no result was returned?!"