Skip to content

Commit

Permalink
Merge pull request EESSI#368 from ocaisa/host_injections_cuda
Browse files Browse the repository at this point in the history
Build CUDA under `host_injections` and make EESSI aware of host CUDA drivers
  • Loading branch information
casparvl authored Dec 1, 2023
2 parents 2a66987 + 480d356 commit 4e59a22
Show file tree
Hide file tree
Showing 4 changed files with 462 additions and 11 deletions.
72 changes: 66 additions & 6 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
# -. initial settings & exit codes
TOPDIR=$(dirname $(realpath $0))

source ${TOPDIR}/scripts/utils.sh
source ${TOPDIR}/scripts/cfg_files.sh
source "${TOPDIR}"/scripts/utils.sh
source "${TOPDIR}"/scripts/cfg_files.sh

# exit codes: bitwise shift codes to allow for combination of exit codes
# ANY_ERROR_EXITCODE is sourced from ${TOPDIR}/scripts/utils.sh
Expand All @@ -46,6 +46,7 @@ SAVE_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 8))
HTTP_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 9))
HTTPS_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 10))
RUN_SCRIPT_MISSING_EXITCODE=$((${ANY_ERROR_EXITCODE} << 11))
NVIDIA_MODE_UNKNOWN_EXITCODE=$((${ANY_ERROR_EXITCODE} << 12))

# CernVM-FS settings
CVMFS_VAR_LIB="var-lib-cvmfs"
Expand All @@ -72,12 +73,17 @@ display_help() {
echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
echo " -c | --container IMG - image file or URL defining the container to use"
echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
echo " -h | --help - display this usage information [default: false]"
echo " -g | --storage DIR - directory space on host machine (used for"
echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
echo " -h | --help - display this usage information [default: false]"
echo " -i | --host-injections - directory to link to for host_injections "
echo " [default: /..storage../opt-eessi]"
echo " -l | --list-repos - list available repository identifiers [default: false]"
echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or"
echo " MODE==run (run a script or command) [default: shell]"
echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs,"
echo " MODE==install for a CUDA installation, MODE==run to"
echo " attach a GPU, MODE==all for both [default: false]"
echo " -r | --repository CFG - configuration file or identifier defining the"
echo " repository to use [default: EESSI-pilot via"
echo " default container, see --container]"
Expand Down Expand Up @@ -111,6 +117,8 @@ VERBOSE=0
STORAGE=
LIST_REPOS=0
MODE="shell"
SETUP_NVIDIA=0
ADDITIONAL_SINGULARITY_FLAGS=
REPOSITORY="EESSI-pilot"
RESUME=
SAVE=
Expand Down Expand Up @@ -141,6 +149,10 @@ while [[ $# -gt 0 ]]; do
display_help
exit 0
;;
-i|--host-injections)
USER_HOST_INJECTIONS="$2"
shift 2
;;
-l|--list-repos)
LIST_REPOS=1
shift 1
Expand All @@ -149,6 +161,11 @@ while [[ $# -gt 0 ]]; do
MODE="$2"
shift 2
;;
-n|--nvidia)
SETUP_NVIDIA=1
NVIDIA_MODE="$2"
shift 2
;;
-r|--repository)
REPOSITORY="$2"
shift 2
Expand Down Expand Up @@ -224,6 +241,13 @@ if [[ "${MODE}" != "shell" && "${MODE}" != "run" ]]; then
fatal_error "unknown execution mode '${MODE}'" "${MODE_UNKNOWN_EXITCODE}"
fi

# Also validate the NVIDIA GPU mode (if present)
if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
if [[ "${NVIDIA_MODE}" != "run" && "${NVIDIA_MODE}" != "install" && "${NVIDIA_MODE}" != "all" ]]; then
fatal_error "unknown NVIDIA mode '${NVIDIA_MODE}'" "${NVIDIA_MODE_UNKNOWN_EXITCODE}"
fi
fi

# TODO (arg -r|--repository) check if repository is known
# REPOSITORY_ERROR_EXITCODE
if [[ ! -z "${REPOSITORY}" && "${REPOSITORY}" != "EESSI-pilot" && ! -r ${EESSI_REPOS_CFG_FILE} ]]; then
Expand Down Expand Up @@ -294,6 +318,7 @@ else
echo "Using ${EESSI_HOST_STORAGE} as tmp directory (to resume session add '--resume ${EESSI_HOST_STORAGE}')."
fi


# if ${RESUME} is a file (assume a tgz), unpack it into ${EESSI_HOST_STORAGE}
if [[ ! -z ${RESUME} && -f ${RESUME} ]]; then
tar xf ${RESUME} -C ${EESSI_HOST_STORAGE}
Expand All @@ -310,12 +335,25 @@ fi
# |-overlay-work
# |-home
# |-repos_cfg
# |-opt-eessi (unless otherwise specificed for host_injections)

# tmp dir for EESSI
EESSI_TMPDIR=${EESSI_HOST_STORAGE}
mkdir -p ${EESSI_TMPDIR}
[[ ${VERBOSE} -eq 1 ]] && echo "EESSI_TMPDIR=${EESSI_TMPDIR}"

# Set host_injections directory and ensure it is a writable directory (if user provided)
if [ -z ${USER_HOST_INJECTIONS+x} ]; then
# Not set, so use our default
HOST_INJECTIONS=${EESSI_TMPDIR}/opt-eessi
mkdir -p $HOST_INJECTIONS
else
# Make sure the host_injections directory specified exists and is a folder
mkdir -p ${USER_HOST_INJECTIONS} || fatal_error "host_injections directory ${USER_HOST_INJECTIONS} is either not a directory or cannot be created"
HOST_INJECTIONS=${USER_HOST_INJECTIONS}
fi
[[ ${VERBOSE} -eq 1 ]] && echo "HOST_INJECTIONS=${HOST_INJECTIONS}"

# configure Singularity: if SINGULARITY_CACHEDIR is already defined, use that
# a global SINGULARITY_CACHEDIR would ensure that we don't consume
# storage space again and again for the container & also speed-up
Expand Down Expand Up @@ -394,12 +432,34 @@ fi
[[ ${VERBOSE} -eq 1 ]] && echo "SINGULARITY_HOME=${SINGULARITY_HOME}"

# define paths to add to SINGULARITY_BIND (added later when all BIND mounts are defined)
BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs"
BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs,${HOST_INJECTIONS}:/opt/eessi"
# provide a '/tmp' inside the container
BIND_PATHS="${BIND_PATHS},${EESSI_TMPDIR}:${TMP_IN_CONTAINER}"

[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"

# Configure anything we need for NVIDIA GPUs and CUDA installation
if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
if [[ "${NVIDIA_MODE}" == "run" || "${NVIDIA_MODE}" == "all" ]]; then
# Give singularity the appropriate flag
ADDITIONAL_SINGULARITY_FLAGS="--nv ${ADDITIONAL_SINGULARITY_FLAGS}"
[[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_SINGULARITY_FLAGS=${ADDITIONAL_SINGULARITY_FLAGS}"
fi
if [[ "${NVIDIA_MODE}" == "install" || "${NVIDIA_MODE}" == "all" ]]; then
# Add additional bind mounts to allow CUDA to install within a container
# (Experience tells us that these are necessary, but we don't know _why_
# as the CUDA installer is a black box. The suspicion is that the CUDA
# installer gets confused by the permissions on these directories when
# inside a container)
EESSI_VAR_LOG=${EESSI_TMPDIR}/var-log
EESSI_USR_LOCAL_CUDA=${EESSI_TMPDIR}/usr-local-cuda
mkdir -p ${EESSI_VAR_LOG}
mkdir -p ${EESSI_USR_LOCAL_CUDA}
BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda"
[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
fi
fi

# set up repository config (always create directory repos_cfg and populate it with info when
# arg -r|--repository is used)
mkdir -p ${EESSI_TMPDIR}/repos_cfg
Expand Down Expand Up @@ -558,8 +618,8 @@ if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then
fi

echo "Launching container with command (next line):"
echo "singularity ${RUN_QUIET} ${MODE} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
singularity ${RUN_QUIET} ${MODE} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
exit_code=$?

# 6. save tmp if requested (arg -s|--save)
Expand Down
209 changes: 209 additions & 0 deletions gpu_support/nvidia/install_cuda_host_injections.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#!/usr/bin/env bash

# This script can be used to install CUDA under the `.../host_injections` directory.
# This provides the parts of the CUDA installation that cannot be redistributed as
# part of EESSI due to license limitations. While GPU-based software from EESSI will
# _run_ without these, installation of additional CUDA software requires the CUDA
# installation(s) under `host_injections` to be present.
#
# The `host_injections` directory is a variant symlink that by default points to
# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see
# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the
# installation to be successful, this directory needs to be writeable by the user
# executing this script.

# Initialise our bash functions
TOPDIR=$(dirname $(realpath $BASH_SOURCE))
source "$TOPDIR"/../../scripts/utils.sh

# Function to display help message
show_help() {
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --help Display this help message"
echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install"
echo " CUDA, see the EULA at"
echo " https://docs.nvidia.com/cuda/eula/index.html"
echo " -c, --cuda-version CUDA_VERSION Specify a version o CUDA to install (must"
echo " have a corresponding easyconfig in the"
echo " EasyBuild release)"
echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary"
echo " storage during the CUDA install"
echo " (must have >10GB available)"
}

# Initialize variables
install_cuda_version=""
eula_accepted=0

# Parse command-line options
while [[ $# -gt 0 ]]; do
case "$1" in
--help)
show_help
exit 0
;;
-c|--cuda-version)
if [ -n "$2" ]; then
install_cuda_version="$2"
shift 2
else
echo "Error: Argument required for $1"
show_help
exit 1
fi
;;
--accept-cuda-eula)
eula_accepted=1
shift 1
;;
-t|--temp-dir)
if [ -n "$2" ]; then
CUDA_TEMP_DIR="$2"
shift 2
else
echo "Error: Argument required for $1"
show_help
exit 1
fi
;;
*)
show_help
fatal_error "Error: Unknown option: $1"
;;
esac
done

# Make sure EESSI is initialised
check_eessi_initialised

# Make sure the CUDA version supplied is a semantic version
is_semantic_version() {
local version=$1
local regex='^[0-9]+\.[0-9]+\.[0-9]+$'

if [[ $version =~ $regex ]]; then
return 0 # Return success (0) if it's a semantic version
else
return 1 # Return failure (1) if it's not a semantic version
fi
}
if ! is_semantic_version "$install_cuda_version"; then
show_help
error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n"
error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n"
error="${error}version to provide is probably one of those available under\n"
error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n"
fatal_error "${error}"
fi

# Make sure they have accepted the CUDA EULA
if [ "$eula_accepted" -ne 1 ]; then
show_help
error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n"
fatal_error "${error}"
fi

# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup)
cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections}

# Only install CUDA if specified version is not found.
# (existence of easybuild subdir implies a successful install)
if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then
echo_green "CUDA software found! No need to install CUDA again."
else
# We need to be able write to the installation space so let's make sure we can
if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then
fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA"
fi

# we need a directory we can use for temporary storage
if [[ -z "${CUDA_TEMP_DIR}" ]]; then
tmpdir=$(mktemp -d)
else
tmpdir="${CUDA_TEMP_DIR}"/temp
if ! mkdir "$tmpdir" ; then
fatal_error "Could not create directory ${tmpdir}"
fi
fi

required_space_in_tmpdir=50000
# Let's see if we have sources and build locations defined if not, we use the temporary space
if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then
export EASYBUILD_BUILDPATH=${tmpdir}/build
required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
fi
if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then
export EASYBUILD_SOURCEPATH=${tmpdir}/sources
required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
fi

# The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB),
# need to do a space check before we proceed
avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}')
if (( avail_space < 5000000 )); then
fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..."
fi
avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}')
if (( avail_space < required_space_in_tmpdir )); then
error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n"
error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check."
error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH "
error="${error}to reduce this requirement. Exiting now..."
fatal_error "${error}"
fi

if ! command -v "eb" &>/dev/null; then
echo_yellow "Attempting to load an EasyBuild module to do actual install"
module load EasyBuild
# There are some scenarios where this may fail
if [ $? -ne 0 ]; then
error="'eb' command not found in your environment and\n"
error="${error} module load EasyBuild\n"
error="${error}failed for some reason.\n"
error="${error}Please re-run this script with the 'eb' command available."
fatal_error "${error}"
fi
fi

cuda_easyconfig="CUDA-${install_cuda_version}.eb"

# Check the easyconfig file is available in the release
# (eb search always returns 0, so we need a grep to ensure a usable exit code)
eb --search ^${cuda_easyconfig}|grep CUDA > /dev/null 2>&1
# Check the exit code
if [ $? -ne 0 ]; then
eb_version=$(eb --version)
available_cuda_easyconfigs=$(eb --search ^CUDA-*.eb|grep CUDA)

error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n"
error="${error} ${eb_version}\n"
error="${error}You either need to give a different version of CUDA to install _or_ \n"
error="${error}use a different version of EasyBuild for the installation.\n"
error="${error}\nThe versions of available with the current eb command are:\n"
error="${error}${available_cuda_easyconfigs}"
fatal_error "${error}"
fi

# We need the --rebuild option, as the CUDA module may or may not be on the
# `MODULEPATH` yet. Even if it is, we still want to redo this installation
# since it will provide the symlinked targets for the parts of the CUDA
# installation in the `.../versions/...` prefix
# We install the module in our `tmpdir` since we do not need the modulefile,
# we only care about providing the targets for the symlinks.
extra_args="--rebuild --installpath-modules=${tmpdir}"

# We don't want hooks used in this install, we need a vanilla CUDA installation
touch "$tmpdir"/none.py
# shellcheck disable=SC2086 # Intended splitting of extra_args
eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}"
ret=$?
if [ $ret -ne 0 ]; then
fatal_error "CUDA installation failed, please check EasyBuild logs..."
else
echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!"
fi
# clean up tmpdir
rm -rf "${tmpdir}"
fi
Loading

0 comments on commit 4e59a22

Please sign in to comment.