Skip to content

Commit

Permalink
Merge pull request #579 from ocaisa/override_lmod_gpu_check
Browse files Browse the repository at this point in the history
Allow overriding the Lmod GPU driver check
  • Loading branch information
trz42 authored May 27, 2024
2 parents c635420 + b15fc3d commit 1ac2c30
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 12 deletions.
14 changes: 10 additions & 4 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ display_help() {
echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)"
}

# Function to check if a command exists
function command_exists() {
command -v "$1" >/dev/null 2>&1
}

function copy_build_log() {
# copy specified build log to specified directory, with some context added
build_log=${1}
Expand Down Expand Up @@ -238,10 +243,11 @@ else
echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi

# Install drivers in host_injections
# TODO: this is commented out for now, because the script assumes that nvidia-smi is available and works;
# if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software)
# ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
# Install NVIDIA drivers in host_injections (if they exist)
if command_exists "nvidia-smi"; then
echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
fi

# use PR patch file to determine in which easystack files stuff was added
changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing')
Expand Down
10 changes: 9 additions & 1 deletion bot/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,15 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR}
BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
# add options required to handle NVIDIA support
BUILD_STEP_ARGS+=("--nvidia" "all")
if command_exists "nvidia-smi"; then
echo "Command 'nvidia-smi' found, using available GPU"
BUILD_STEP_ARGS+=("--nvidia" "all")
else
echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
BUILD_STEP_ARGS+=("--nvidia" "install")
fi
# Retain location for host injections so we don't reinstall CUDA
# (Always need to run the driver installation as available driver may change)
if [[ ! -z ${SHARED_FS_PATH} ]]; then
BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections")
fi
Expand Down
9 changes: 6 additions & 3 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,9 @@
end
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
-- otherwise, refuse to load the requested module and print error message
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
if haveGpu then
local checkGpu = mt:haveProperty(simpleName,"arch","gpu")
local overrideGpuCheck = os.getenv("EESSI_OVERRIDE_GPU_CHECK")
if checkGpu and (overrideGpuCheck == nil) then
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
Expand All @@ -141,7 +142,9 @@
if not (cudaDriverExists or singularityCudaExists) then
local advice = "which relies on the CUDA runtime environment and driver libraries. "
advice = advice .. "In order to be able to use the module, you will need "
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n"
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. You can "
advice = advice .. "override this check by setting the environment variable EESSI_OVERRIDE_GPU_CHECK but "
advice = advice .. "the loaded application will not be able to execute on your system.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
else
Expand Down
7 changes: 3 additions & 4 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -464,10 +464,9 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda"
[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
if [[ "${NVIDIA_MODE}" == "install" ]] ; then
# We need to "trick" our LMOD_RC file to allow us to load CUDA modules even without a CUDA driver
# (this works because we build within a container and the LMOD_RC recognises that)
touch ${EESSI_TMPDIR}/libcuda.so
export SINGULARITY_CONTAINLIBS="${EESSI_TMPDIR}/libcuda.so"
# No GPU so we need to "trick" Lmod to allow us to load CUDA modules even without a CUDA driver
# (this variable means EESSI_OVERRIDE_GPU_CHECK=1 will be set inside the container)
export SINGULARITYENV_EESSI_OVERRIDE_GPU_CHECK=1
fi
fi
fi
Expand Down
3 changes: 3 additions & 0 deletions run_in_compat_layer_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ fi
if [ ! -z ${EESSI_VERSION_OVERRIDE} ]; then
INPUT="export EESSI_VERSION_OVERRIDE=${EESSI_VERSION_OVERRIDE}; ${INPUT}"
fi
if [ ! -z ${EESSI_OVERRIDE_GPU_CHECK} ]; then
INPUT="export EESSI_OVERRIDE_GPU_CHECK=${EESSI_OVERRIDE_GPU_CHECK}; ${INPUT}"
fi
if [ ! -z ${http_proxy} ]; then
INPUT="export http_proxy=${http_proxy}; ${INPUT}"
fi
Expand Down
5 changes: 5 additions & 0 deletions scripts/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ function create_directory_structure() {
return $return_code
}

# Function to check if a command exists
function command_exists() {
command -v "$1" >/dev/null 2>&1
}

function get_path_for_tool {
tool_name=$1
tool_envvar_name=$2
Expand Down

0 comments on commit 1ac2c30

Please sign in to comment.