Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow overriding the Lmod GPU driver check #579

Merged
merged 7 commits into from
May 27, 2024
14 changes: 10 additions & 4 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ display_help() {
echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)"
}

# Check whether a command can be resolved by the current shell.
# Arguments: $1 - name of the command to look up
# Returns:   exit status of 'command -v' (0 if found, non-zero otherwise); prints nothing
function command_exists() {
    local cmd_name="$1"
    command -v "${cmd_name}" > /dev/null 2>&1
}

function copy_build_log() {
# copy specified build log to specified directory, with some context added
build_log=${1}
Expand Down Expand Up @@ -238,10 +243,11 @@ else
echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi

# Install drivers in host_injections
# TODO: this is commented out for now, because the script assumes that nvidia-smi is available and works;
# if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software)
# ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
# Install NVIDIA drivers in host_injections (only attempted when the nvidia-smi command is found)
if command_exists "nvidia-smi"; then
    echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
    # path is quoted so that an EESSI_PREFIX containing whitespace or glob characters is not word-split
    "${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh"
fi

# use PR patch file to determine in which easystack files stuff was added
changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing')
Expand Down
10 changes: 9 additions & 1 deletion bot/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,15 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR}
BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
# add options required to handle NVIDIA support
BUILD_STEP_ARGS+=("--nvidia" "all")
# pick the value passed via --nvidia to the build step, depending on whether nvidia-smi is found
if ! command_exists "nvidia-smi"; then
    # no nvidia-smi: fall back to the "install" mode
    echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
    BUILD_STEP_ARGS+=("--nvidia" "install")
else
    # nvidia-smi is present: use the "all" mode
    echo "Command 'nvidia-smi' found, using available GPU"
    BUILD_STEP_ARGS+=("--nvidia" "all")
fi
# When SHARED_FS_PATH is set, point host injections at ${SHARED_FS_PATH}/host-injections
# so that CUDA does not get reinstalled
# (the driver installation always needs to run, since the available driver may change)
if [[ -n ${SHARED_FS_PATH} ]]; then
    BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections")
fi
Expand Down
9 changes: 6 additions & 3 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,9 @@
end
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
-- otherwise, refuse to load the requested module and print error message
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
if haveGpu then
local checkGpu = mt:haveProperty(simpleName,"arch","gpu")
local overrideGpuCheck = os.getenv("EESSI_OVERRIDE_GPU_CHECK")
if checkGpu and (overrideGpuCheck == nil) then
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
Expand All @@ -141,7 +142,9 @@
if not (cudaDriverExists or singularityCudaExists) then
local advice = "which relies on the CUDA runtime environment and driver libraries. "
advice = advice .. "In order to be able to use the module, you will need "
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n"
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. You can "
advice = advice .. "override this check by setting the environment variable EESSI_OVERRIDE_GPU_CHECK but "
advice = advice .. "the loaded application will not be able to execute on your system.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
else
Expand Down
7 changes: 3 additions & 4 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -464,10 +464,9 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda"
[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
if [[ "${NVIDIA_MODE}" == "install" ]] ; then
# We need to "trick" our LMOD_RC file to allow us to load CUDA modules even without a CUDA driver
# (this works because we build within a container and the LMOD_RC recognises that)
touch ${EESSI_TMPDIR}/libcuda.so
export SINGULARITY_CONTAINLIBS="${EESSI_TMPDIR}/libcuda.so"
# No GPU so we need to "trick" Lmod to allow us to load CUDA modules even without a CUDA driver
# (this variable means EESSI_OVERRIDE_GPU_CHECK=1 will be set inside the container)
export SINGULARITYENV_EESSI_OVERRIDE_GPU_CHECK=1
ocaisa marked this conversation as resolved.
Show resolved Hide resolved
fi
fi
fi
Expand Down
3 changes: 3 additions & 0 deletions run_in_compat_layer_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ fi
if [ ! -z ${EESSI_VERSION_OVERRIDE} ]; then
INPUT="export EESSI_VERSION_OVERRIDE=${EESSI_VERSION_OVERRIDE}; ${INPUT}"
fi
# If EESSI_OVERRIDE_GPU_CHECK is set to a non-empty value, prepend an export of it to the
# command string held in INPUT.
# The expansion is quoted: unquoted inside single-bracket [ ], a value containing whitespace
# would be word-split and make the test fail with "too many arguments".
if [ -n "${EESSI_OVERRIDE_GPU_CHECK}" ]; then
    INPUT="export EESSI_OVERRIDE_GPU_CHECK=${EESSI_OVERRIDE_GPU_CHECK}; ${INPUT}"
fi
if [ ! -z ${http_proxy} ]; then
INPUT="export http_proxy=${http_proxy}; ${INPUT}"
fi
Expand Down
5 changes: 5 additions & 0 deletions scripts/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ function create_directory_structure() {
return $return_code
}

# Tell the caller, through the exit status only, whether a command is available.
# Arguments: $1 - name of the command to look up
# Returns:   0 when 'command -v' resolves the name, non-zero otherwise; all output is discarded
function command_exists() {
    local wanted_cmd="$1"
    command -v "${wanted_cmd}" > /dev/null 2>&1
}

function get_path_for_tool {
tool_name=$1
tool_envvar_name=$2
Expand Down