From 2b30e1f2cca0a60b1dbf9ee0680aa48ba3f260da Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 15 May 2024 17:09:18 +0200 Subject: [PATCH 1/6] Allow overriding the Lmod GPU driver check --- EESSI-install-software.sh | 3 +++ create_lmodsitepackage.py | 9 ++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 8a5789c2b2..40f0ed8c3d 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -243,6 +243,9 @@ fi # if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software) # ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +# Don't run the Lmod GPU driver check when doing builds (may not have a GPU, and it's not relevant for vanilla builds anyway) +export EESSI_OVERRIDE_GPU_CHECK=1 + # use PR patch file to determine in which easystack files stuff was added changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing') if [ -z "${changed_easystacks}" ]; then diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 47aa20e51e..816302f7fc 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -131,8 +131,9 @@ end -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker, -- otherwise, refuse to load the requested module and print error message - local haveGpu = mt:haveProperty(simpleName,"arch","gpu") - if haveGpu then + local checkGpu = mt:haveProperty(simpleName,"arch","gpu") + local overrideGpuCheck = os.getenv("EESSI_OVERRIDE_GPU_CHECK") + if checkGpu and (overrideGpuCheck == nil) then local arch = os.getenv("EESSI_CPU_FAMILY") or "" local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. 
arch .. "/latest/libcuda.so" @@ -141,7 +142,9 @@ if not (cudaDriverExists or singularityCudaExists) then local advice = "which relies on the CUDA runtime environment and driver libraries. " advice = advice .. "In order to be able to use the module, you will need " - advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n" + advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. You can " + advice = advice .. "override this check by setting the environment variable EESSI_OVERRIDE_GPU_CHECK but " + advice = advice .. "the loaded application will not be able to execute on your system.\\n" advice = advice .. refer_to_docs LmodError("\\nYou requested to load ", simpleName, " ", advice) else From 7f328675deec19f88969eee91945f44617f68ab6 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 23 May 2024 15:51:36 +0200 Subject: [PATCH 2/6] Use EESSI_OVERRIDE_GPU_CHECK everywhere --- eessi_container.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/eessi_container.sh b/eessi_container.sh index 7d00d1400c..a95a2c87c9 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -464,10 +464,9 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda" [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}" if [[ "${NVIDIA_MODE}" == "install" ]] ; then - # We need to "trick" our LMOD_RC file to allow us to load CUDA modules even without a CUDA driver - # (this works because we build within a container and the LMOD_RC recognises that) - touch ${EESSI_TMPDIR}/libcuda.so - export SINGULARITY_CONTAINLIBS="${EESSI_TMPDIR}/libcuda.so" + # No GPU so we need to "trick" Lmod to allow us to load CUDA modules even without a CUDA driver + # (this variable means EESSI_OVERRIDE_GPU_CHECK=1 will be set inside the container) + export SINGULARITYENV_EESSI_OVERRIDE_GPU_CHECK=1 fi fi fi From 
df4ac260e209e9ccc2578562845ceb2a328dbbb0 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 23 May 2024 16:05:27 +0200 Subject: [PATCH 3/6] Restrict overriding the GPU driver check to when we know it is valid --- EESSI-install-software.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 40f0ed8c3d..8a5789c2b2 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -243,9 +243,6 @@ fi # if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software) # ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh -# Don't run the Lmod GPU driver check when doing builds (may not have a GPU, and it's not relevant for vanilla builds anyway) -export EESSI_OVERRIDE_GPU_CHECK=1 - # use PR patch file to determine in which easystack files stuff was added changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing') if [ -z "${changed_easystacks}" ]; then From bee4b001322a0d22c79642b5b1427f2c397a9f3b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 24 May 2024 10:47:45 +0200 Subject: [PATCH 4/6] Make sure EESSI_OVERRIDE_GPU_CHECK is still set when in prefix shell --- run_in_compat_layer_env.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/run_in_compat_layer_env.sh b/run_in_compat_layer_env.sh index f57c4d0749..b8e9cf979b 100755 --- a/run_in_compat_layer_env.sh +++ b/run_in_compat_layer_env.sh @@ -26,6 +26,9 @@ fi if [ ! -z ${EESSI_VERSION_OVERRIDE} ]; then INPUT="export EESSI_VERSION_OVERRIDE=${EESSI_VERSION_OVERRIDE}; ${INPUT}" fi +if [ ! -z ${EESSI_OVERRIDE_GPU_CHECK} ]; then + INPUT="export EESSI_OVERRIDE_GPU_CHECK=${EESSI_OVERRIDE_GPU_CHECK}; ${INPUT}" +fi if [ ! 
-z ${http_proxy} ]; then INPUT="export http_proxy=${http_proxy}; ${INPUT}" fi From f788ca3ab94ab384ee2e4a98e5b76e2a9317102f Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 24 May 2024 11:18:04 +0200 Subject: [PATCH 5/6] Only install NVIDIA drivers if nvidia-smi command exists --- EESSI-install-software.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 8a5789c2b2..7d358e205a 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -17,6 +17,11 @@ display_help() { echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)" } +# Function to check if a command exists +function command_exists() { + command -v "$1" >/dev/null 2>&1 +} + function copy_build_log() { # copy specified build log to specified directory, with some context added build_log=${1} @@ -238,10 +243,11 @@ else echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" fi -# Install drivers in host_injections -# TODO: this is commented out for now, because the script assumes that nvidia-smi is available and works; -# if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software) -# ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +# Install NVIDIA drivers in host_injections (if they exist) +if command_exists "nvidia-smi"; then + echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." 
+ ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +fi # use PR patch file to determine in which easystack files stuff was added changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing') From b15fc3d58c270ad3e28727f30e92c78744cbeee9 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Mon, 27 May 2024 10:54:00 +0200 Subject: [PATCH 6/6] Update bot build script to support whether GPU is available or not --- bot/build.sh | 10 +++++++++- scripts/utils.sh | 5 +++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/bot/build.sh b/bot/build.sh index dcc61c19d4..c9a362fdca 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -223,7 +223,15 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR} BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support -BUILD_STEP_ARGS+=("--nvidia" "all") +if command_exists "nvidia-smi"; then + echo "Command 'nvidia-smi' found, using available GPU" + BUILD_STEP_ARGS+=("--nvidia" "all") +else + echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" + BUILD_STEP_ARGS+=("--nvidia" "install") +fi +# Retain location for host injections so we don't reinstall CUDA +# (Always need to run the driver installation as available driver may change) if [[ ! -z ${SHARED_FS_PATH} ]]; then BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections") fi diff --git a/scripts/utils.sh b/scripts/utils.sh index b2be3f6221..962decd20e 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -78,6 +78,11 @@ function create_directory_structure() { return $return_code } +# Function to check if a command exists +function command_exists() { + command -v "$1" >/dev/null 2>&1 +} + function get_path_for_tool { tool_name=$1 tool_envvar_name=$2