From fcc7ddbab8a57d419736d8a0857943569fdb587c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Dec 2023 18:36:31 +0100 Subject: [PATCH 01/31] Also recreated lmodrc when it was changed in a PR --- EESSI-install-software.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index f6087b3cfe..8170897726 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -229,7 +229,8 @@ done echo ">> Creating/updating Lmod cache..." export LMOD_RC="${EASYBUILD_INSTALLPATH}/.lmod/lmodrc.lua" -if [ ! -f $LMOD_RC ]; then +lmodrc_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc.py$' > /dev/null; echo $?) +if [ ! -f $LMOD_RC ] || [ ${lmodrc_changed} == '0' ]; then python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH} check_exit_code $? "$LMOD_RC created" "Failed to create $LMOD_RC" fi From 2b09d1c474e399cd1fa6634e18a5234a2e16ac7d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Dec 2023 18:41:43 +0100 Subject: [PATCH 02/31] Modified lmodrc to add CUDA support. It now checks if you load the CUDA module if a full CUDA SDK was also installed in host_injections (otherwise you have dead links to the non-redistributable parts of the CUDA SDK). Furthermore, for GPU enabled modules, it checks if the drivers have been linked in in the host_injections directory. It also checks if they are new enough to be used with the CUDA version that was used as a dependency for the GPU-enabled module you are trying to load. If any of these checks is not true, it prints an error message with advice on how to proceed. --- create_lmodrc.py | 76 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/create_lmodrc.py b/create_lmodrc.py index ae65153a20..adf221ecba 100755 --- a/create_lmodrc.py +++ b/create_lmodrc.py @@ -17,6 +17,81 @@ } """ +GPU_LMOD_RC ="""require("strict") +local hook = require("Hook") +local open = io.open + +local function read_file(path) + local file = open(path, "rb") -- r read mode and b binary mode + if not file then return nil end + local content = file:read "*a" -- *a or *all reads the whole file + file:close() + return content +end + +local function cuda_enabled_load_hook(t) + local frameStk = require("FrameStk"):singleton() + local mt = frameStk:mt() + local simpleName = string.match(t.modFullName, "(.-)/") + -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections. + -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse + -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI + if simpleName == 'CUDA' then + -- get the full host_injections path + local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + -- build final path where the CUDA software should be installed + local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" + local cudaDirExists = isDir(cudaEasyBuildDir) + if not cudaDirExists then + local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " + advice = advice .. "due to licencing. In order to be able to use the CUDA module, please follow the " + advice = advice .. 
"instructions available under https://www.eessi.io/docs/gpu/ \\n" + LmodError("\\nYou requested to load ", simpleName, " ", advice) + end + end + -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker, + -- otherwise, refuse to load the requested module and print error message + local haveGpu = mt:haveProperty(simpleName,"arch","gpu") + if haveGpu then + local arch = os.getenv("EESSI_CPU_FAMILY") or "" + local cudaVersionFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" + local cudaDriverFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" + local cudaDriverExists = isFile(cudaDriverFile) + local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") + if not (cudaDriverExists or singularityCudaExists) then + local advice = "which relies on the CUDA runtime environment and driver libraries. " + advice = advice .. "In order to be able to use the module, please follow the instructions " + advice = advice .. "available under https://www.eessi.io/docs/gpu/ \\n" + LmodError("\\nYou requested to load ", simpleName, " ", advice) + else + -- CUDA driver exists, now we check its version to see if an update is needed + if cudaDriverExists then + local cudaVersion = read_file(cudaVersionFile) + local cudaVersion_req = os.getenv("EESSICUDAVERSION") + -- driver CUDA versions don't give a patch version for CUDA + local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)") + local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)") + local driver_libs_need_update = false + if major < major_req then + driver_libs_need_update = true + elseif major == major_req then + if minor < minor_req then + driver_libs_need_update = true + end + end + if driver_libs_need_update == true then + local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " + advice = advice .. "Please update your CUDA driver libraries and then follow the instructions " + advice = advice .. "under https://www.eessi.io/docs/gpu/ to let EESSI know about the update.\\n" + LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) + end + end + end + end +end + +hook.register("load", cuda_enabled_load_hook) +""" def error(msg): sys.stderr.write("ERROR: %s\n" % msg) @@ -36,6 +111,7 @@ def error(msg): 'dot_lmod': DOT_LMOD, 'prefix': prefix, } +lmodrc_txt += '\n' + GPU_LMOD_RC try: os.makedirs(os.path.dirname(lmodrc_path), exist_ok=True) with open(lmodrc_path, 'w') as fp: From 62e70ba9474638b78edeafbb3886e432b7e88e7c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Dec 2023 18:44:50 +0100 Subject: [PATCH 03/31] Adapt created_lmodrc.py for the new domain --- create_lmodrc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/create_lmodrc.py b/create_lmodrc.py index adf221ecba..9c007c15e6 100755 --- a/create_lmodrc.py +++ b/create_lmodrc.py @@ -54,8 +54,8 @@ local haveGpu = mt:haveProperty(simpleName,"arch","gpu") if haveGpu then local arch = os.getenv("EESSI_CPU_FAMILY") or "" - local cudaVersionFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" - local cudaDriverFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" + local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. 
"/latest/cuda_version.txt" + local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" local cudaDriverExists = isFile(cudaDriverFile) local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") if not (cudaDriverExists or singularityCudaExists) then From 045c099b3c4eee0a742a5859a32ba2142e21cc9c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Dec 2023 18:52:52 +0100 Subject: [PATCH 04/31] Add post_sanitycheck hook for CUDA in order to only ship the files we are allowed to redistribute. It will create symlinks to the host_injections directory for the rest of the files that we are not allowed to redistribute. Additionally, create a hook to inject the GPU lmod property when creating module files for modules that have CUDA as a dependency --- eb_hooks.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/eb_hooks.py b/eb_hooks.py index 6fe92c7f7b..78580c14b9 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -68,6 +68,9 @@ def parse_hook(ec, *args, **kwargs): if ec.name in PARSE_HOOKS: PARSE_HOOKS[ec.name](ec, eprefix) + # inject the GPU property (if required) + ec = inject_gpu_property(ec) + def post_ready_hook(self, *args, **kwargs): """ @@ -247,6 +250,12 @@ def pre_configure_hook(self, *args, **kwargs): PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) +def post_sanitycheck_hook(self, *args, **kwargs): + """Main post-sanity-check hook: trigger custom functions based on software name.""" + if self.name in POST_SANITYCHECK_HOOKS: + POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs) + + def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs): """ Pre-configure hook for OpenBLAS: add DYNAMIC_ARCH=1 to build/test/install options when using --optarch=GENERIC @@ -393,6 +402,81 @@ def pre_single_extension_isoband(ext, *args, **kwargs): ext.cfg['preinstallopts'] = "sed -i 's/SIGSTKSZ/32768/g' src/testthat/vendor/catch.h && " +def post_sanitycheck_cuda(self, *args, **kwargs): + """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections.""" + print_msg("Replacing CUDA stuff we cannot ship with symlinks...") + # read CUDA EULA + eula_path = os.path.join(self.installdir, "EULA.txt") + tmp_buffer = [] + with open(eula_path) as infile: + copy = False + for line in infile: + if line.strip() == "2.6. Attachment A": + copy = True + continue + elif line.strip() == "2.7. Attachment B": + copy = False + continue + elif copy: + tmp_buffer.append(line) + # create whitelist without file extensions, they're not really needed and they only complicate things + whitelist = ['EULA', 'README'] + file_extensions = [".so", ".a", ".h", ".bc"] + for tmp in tmp_buffer: + for word in tmp.split(): + if any(ext in word for ext in file_extensions): + whitelist.append(word.split(".")[0]) + whitelist = list(set(whitelist)) + # Do some quick checks for things we should or shouldn't have in the list + if "nvcc" in whitelist: + raise EasyBuildError("Found 'nvcc' in whitelist: %s" % whitelist) + if "libcudart" not in whitelist: + raise EasyBuildError("Did not find 'libcudart' in whitelist: %s" % whitelist) + # iterate over all files in the CUDA path + for root, dirs, files in os.walk(self.installdir): + for filename in files: + # we only really care about real files, i.e. 
not symlinks + if not os.path.islink(os.path.join(root, filename)): + # check if the current file is part of the whitelist + basename = filename.split(".")[0] + if basename not in whitelist: + # if it is not in the whitelist, delete the file and create a symlink to host_injections + source = os.path.join(root, filename) + target = source.replace("versions", "host_injections") + # Make sure source and target are not the same + if source == target: + raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you are" + "using this hook for an EESSI installation?") + os.remove(source) + # Using os.symlink requires the existence of the target directory, so we use os.system + system_command="ln -s '%s' '%s'" % (target, source) + if os.system(system_command) != 0: + raise EasyBuildError("Failed to create symbolic link: %s" % system_command) + + +def inject_gpu_property(ec): + ec_dict = ec.asdict() + # Check if CUDA is in the dependencies, if so add the GPU Lmod tag + if ("CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])]): + ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version") + key = "modluafooter" + value = 'add_property("arch","gpu")' + cuda_version = 0 + for dep in iter(ec_dict["dependencies"]): + # Make CUDA a build dependency only (rpathing saves us from link errors) + if "CUDA" in dep[0]: + cuda_version = dep[1] + ec_dict["dependencies"].remove(dep) + ec_dict["builddependencies"].append(dep) if dep not in ec_dict["builddependencies"] else ec_dict["builddependencies"] + value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) + if key in ec_dict: + if not value in ec_dict[key]: + ec[key] = "\n".join([ec_dict[key], value]) + else: + ec[key] = value + return ec + + PARSE_HOOKS = { 'CGAL': parse_hook_cgal_toolchainopts_precise, 'fontconfig': parse_hook_fontconfig_add_fonts, @@ -424,3 +508,7 @@ def pre_single_extension_isoband(ext, *args, **kwargs): 'isoband': pre_single_extension_isoband, 'testthat': pre_single_extension_testthat, } + +POST_SANITYCHECK_HOOKS = { + 'CUDA': post_sanitycheck_cuda, +} From 4a4c6e768d1b469baf4476a32f9721f789ce341a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Dec 2023 18:53:37 +0100 Subject: [PATCH 05/31] Add (the redistributable part of) CUDA to the softare stack --- .../2023.06/eessi-2023.06-eb-4.8.2-system.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml index f02b9f2802..86d6931820 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml @@ -5,3 +5,7 @@ easyconfigs: - Nextflow-23.10.0.eb: options: from-pr: 19172 + - CUDA-12.1.1.eb: + options: + include-easyblocks-from-pr: 3045 + accept-eula-for: CUDA From 0346b22291671d5f03343b11e9a919f13f3ab9c5 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Dec 2023 18:55:15 +0100 Subject: [PATCH 06/31] Add CUDA-Samples to the build list --- .../2023.06/eessi-2023.06-eb-4.8.2-2023a.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml index 567db44e42..0537c448e5 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml +++ 
b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml @@ -35,3 +35,9 @@ easyconfigs: - Boost-1.82.0-GCC-12.3.0.eb - netCDF-4.9.2-gompi-2023a.eb - FFmpeg-6.0-GCCcore-12.3.0.eb + - CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb + # use easyconfig that only install subset of CUDA samples, + # to circumvent problem with nvcc linking to glibc of host OS; + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19189 + options: + from-pr: 19189 From 5905e727f146480d6472d2f3a1d0ca6ab53105ee Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 20 Dec 2023 01:07:41 +0100 Subject: [PATCH 07/31] Tweak GPU support implementation --- EESSI-install-software.sh | 11 ++++------- .../2023.06/eessi-2023.06-eb-4.8.2-2023a.yml | 2 +- install_scripts.sh | 16 ++++++++-------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 5e9bd5d472..95fb03e9b7 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -189,19 +189,16 @@ pr_diff=$(ls [0-9]*.diff | head -1) # install any additional required scripts # order is important: these are needed to install a full CUDA SDK in host_injections -install_scripts_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^install_scripts.sh$' > /dev/null; echo $?) -if [ ${install_scripts_changed} == '0' ]; then - # for now, this just reinstalls all scripts. Note the most elegant, but works - ${TOPDIR}/install_scripts.sh --prefix ${EESSI_CVMFS_REPO} -fi +# for now, this just reinstalls all scripts. Note the most elegant, but works +${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} # Install full CUDA SDK in host_injections # Hardcode this for now, see if it works # TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install -${EESSI_CVMFS_REPO}/gpu_support/nvidia/install_cuda_host_injections.sh 12.1.1 +${EESSI_PREFIX}/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula # Install drivers in host_injections -${EESSI_CVMFS_REPO}/gpu_support/nvidia/link_nvidia_host_libraries.sh +${EESSI_PREFIX}/gpu_support/nvidia/link_nvidia_host_libraries.sh # use PR patch file to determine in which easystack files stuff was added for easystack_file in $(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing'); do diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml index 0537c448e5..87ccd69e99 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml @@ -35,7 +35,7 @@ easyconfigs: - Boost-1.82.0-GCC-12.3.0.eb - netCDF-4.9.2-gompi-2023a.eb - FFmpeg-6.0-GCCcore-12.3.0.eb - - CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb + - CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb: # use easyconfig that only install subset of CUDA samples, # to circumvent problem with nvcc linking to glibc of host OS; # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19189 diff --git a/install_scripts.sh b/install_scripts.sh index 209d953c88..8fb27826c6 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -13,7 +13,7 @@ POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do case $1 in - -o|--prefix) + -p|--prefix) INSTALL_PREFIX="$2" shift 2 ;; @@ -38,25 +38,25 @@ set -- "${POSITIONAL_ARGS[@]}" TOPDIR=$(dirname $(realpath $0)) # 
Subdirs for generic scripts -SCRIPTS_DIR_SOURCE=${TOPDIR}/scripts/ # Source dir -SCRIPTS_DIR_TARGET=${INSTALL_PREFIX}/scripts/ # Target dir +SCRIPTS_DIR_SOURCE=${TOPDIR}/scripts # Source dir +SCRIPTS_DIR_TARGET=${INSTALL_PREFIX}/scripts # Target dir # Create target dir mkdir -p ${SCRIPTS_DIR_TARGET} # Copy scripts into this prefix for file in utils.sh; do - cp ${SCRIPTS_DIR_SOURCE}/${file} ${SCRIPTS_DIR_TARGET}/${file} + cp -u ${SCRIPTS_DIR_SOURCE}/${file} ${SCRIPTS_DIR_TARGET}/${file} done # Subdirs for GPU support -NVIDIA_GPU_SUPPORT_DIR_SOURCE=${TOPDIR}/gpu_support/nvidia/ # Source dir -NVIDIA_GPU_SUPPORT_DIR_TARGET=${INSTALL_PREFIX}/gpu_support/nvidia/ # Target dir +NVIDIA_GPU_SUPPORT_DIR_SOURCE=${TOPDIR}/gpu_support/nvidia # Source dir +NVIDIA_GPU_SUPPORT_DIR_TARGET=${INSTALL_PREFIX}/gpu_support/nvidia # Target dir # Create target dir mkdir -p ${NVIDIA_GPU_SUPPORT_DIR_TARGET} # Copy files from this directory into the prefix # To be on the safe side, we dont do recursive copies, but we are explicitely copying each individual file we want to add -for file in install_cuda_host_injections.sh link_nvidia_host_injections.sh; do - cp ${NVIDIA_GPU_SUPPORT_DIR_SOURCE}/${file} ${NVIDIA_GPU_SUPPORT_DIR_TARGET}/${file} +for file in install_cuda_host_injections.sh link_nvidia_host_libraries.sh; do + cp -u ${NVIDIA_GPU_SUPPORT_DIR_SOURCE}/${file} ${NVIDIA_GPU_SUPPORT_DIR_TARGET}/${file} done From 73618a00ee06889314d6efc0e362882b39f34161 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 20 Dec 2023 01:22:21 +0100 Subject: [PATCH 08/31] Add missing quotes on errors --- gpu_support/nvidia/link_nvidia_host_libraries.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh index 6458be7fae..cb7420a0e9 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -36,7 +36,7 @@ if [ ${#found_paths[@]} -gt 0 ]; then host_ldconfig=${found_paths[0]} else error="$command_name not found in PATH or only found in paths starting with $exclude_prefix." - fatal_error $error + fatal_error "$error" fi # Make sure EESSI is initialised (doesn't matter what version) @@ -52,7 +52,7 @@ if $nvidia_smi_command > /dev/null; then host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') else error="Failed to successfully execute\n $nvidia_smi_command\n" - fatal_error $error + fatal_error "$error" fi # Let's make sure the driver libraries are not already in place @@ -71,7 +71,7 @@ if [ -e "$host_injection_driver_version_file" ]; then rm $host_injection_driver_dir/* if [ $? -ne 0 ]; then error="Unable to remove files under '$host_injection_driver_dir'." - fatal_error $error + fatal_error "$error" fi fi fi From 32925fe17e7576847be87ef3f1786ee374a2356a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 20 Dec 2023 11:12:27 +0100 Subject: [PATCH 09/31] Error messages now refer to the scripts that need to be run to install the CUDA SDK or link the CUDA drivers --- create_lmodrc.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/create_lmodrc.py b/create_lmodrc.py index 9c007c15e6..6a72d8dc62 100755 --- a/create_lmodrc.py +++ b/create_lmodrc.py @@ -43,9 +43,11 @@ local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. 
"/easybuild" local cudaDirExists = isDir(cudaEasyBuildDir) if not cudaDirExists then + local cvmfsRepo = os.getenv('EESSI_CVMFS_REPO') local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " - advice = advice .. "due to licencing. In order to be able to use the CUDA module, please follow the " - advice = advice .. "instructions available under https://www.eessi.io/docs/gpu/ \\n" + advice = advice .. "due to licencing. Please install a full copy of the CUDA SDK using the script " + advice = advice .. cvmfsRepo .. "/gpu_support/nvidia/install_cuda_host_injections.sh.\\n" + advice = advice .. "More information, see https://www.eessi.io/docs/gpu/.\\n" LmodError("\\nYou requested to load ", simpleName, " ", advice) end end @@ -60,8 +62,10 @@ local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") if not (cudaDriverExists or singularityCudaExists) then local advice = "which relies on the CUDA runtime environment and driver libraries. " - advice = advice .. "In order to be able to use the module, please follow the instructions " - advice = advice .. "available under https://www.eessi.io/docs/gpu/ \\n" + advice = advice .. "In order to be able to use the module, please run the script " + advice = advice .. cvmfsRepo .. "/gpu_support/nvidia/link_nvidia_host_libraries.sh " + advice = advice .. "to make sure EESSI can find the drivers from on your host system.\\n" + advice = advice .. "More information, see https://www.eessi.io/docs/gpu/.\\n" LmodError("\\nYou requested to load ", simpleName, " ", advice) else -- CUDA driver exists, now we check its version to see if an update is needed @@ -81,8 +85,10 @@ end if driver_libs_need_update == true then local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " - advice = advice .. "Please update your CUDA driver libraries and then follow the instructions " - advice = advice .. "under https://www.eessi.io/docs/gpu/ to let EESSI know about the update.\\n" + advice = advice .. "Please update your CUDA driver libraries and rerun the script" + advice = advice .. cvmfsRepo .. "/gpu_support/nvidia/install_cuda_host_injections.sh " + advice = advice .. "to let EESSI know about the update.\\n" + advice = advice .. 
"More information, see https://www.eessi.io/docs/gpu/.\\n" LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) end end From a33a0cd204a13be3321336caa18c5b3faa5b29f0 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 13:50:46 +0100 Subject: [PATCH 10/31] make install_scripts a bit more verbose --- install_scripts.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/install_scripts.sh b/install_scripts.sh index 8fb27826c6..d53dbb6a9c 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -45,8 +45,9 @@ SCRIPTS_DIR_TARGET=${INSTALL_PREFIX}/scripts # Target dir mkdir -p ${SCRIPTS_DIR_TARGET} # Copy scripts into this prefix +echo "copying scripts from ${SCRIPTS_DIR_SOURCE} to ${SCRIPTS_DIR_TARGET}" for file in utils.sh; do - cp -u ${SCRIPTS_DIR_SOURCE}/${file} ${SCRIPTS_DIR_TARGET}/${file} + cp -v -u ${SCRIPTS_DIR_SOURCE}/${file} ${SCRIPTS_DIR_TARGET}/${file} done # Subdirs for GPU support NVIDIA_GPU_SUPPORT_DIR_SOURCE=${TOPDIR}/gpu_support/nvidia # Source dir @@ -57,6 +58,7 @@ mkdir -p ${NVIDIA_GPU_SUPPORT_DIR_TARGET} # Copy files from this directory into the prefix # To be on the safe side, we dont do recursive copies, but we are explicitely copying each individual file we want to add +echo "copying scripts from ${NVIDIA_GPU_SUPPORT_DIR_SOURCE} to ${NVIDIA_GPU_SUPPORT_DIR_TARGET}" for file in install_cuda_host_injections.sh link_nvidia_host_libraries.sh; do - cp -u ${NVIDIA_GPU_SUPPORT_DIR_SOURCE}/${file} ${NVIDIA_GPU_SUPPORT_DIR_TARGET}/${file} + cp -v -u ${NVIDIA_GPU_SUPPORT_DIR_SOURCE}/${file} ${NVIDIA_GPU_SUPPORT_DIR_TARGET}/${file} done From c7b380d7b3d7580c935824fb3f3c4c80200b8301 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 14:07:29 +0100 Subject: [PATCH 11/31] use separate easystack file for CUDA + control order in which easystack are processed (where needed) --- easystacks/software.eessi.io/2023.06/README.md | 7 +++++++ ....2-system.yml => eessi-2023.06-eb-4.8.2-001-system.yml} | 4 ---- .../2023.06/eessi-2023.06-eb-4.8.2-010-CUDA.yml | 5 +++++ 3 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 easystacks/software.eessi.io/2023.06/README.md rename easystacks/software.eessi.io/2023.06/{eessi-2023.06-eb-4.8.2-system.yml => eessi-2023.06-eb-4.8.2-001-system.yml} (56%) create mode 100644 easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-010-CUDA.yml diff --git a/easystacks/software.eessi.io/2023.06/README.md b/easystacks/software.eessi.io/2023.06/README.md new file mode 100644 index 0000000000..733ebf9475 --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/README.md @@ -0,0 +1,7 @@ +File naming matters, since it determines the order in which easystack files are processed. + +Software installed with system toolchain should be installed first, +this includes EasyBuild itself, see `eessi-2023.06-eb-4.8.2-001-system.yml` . + +CUDA installations must be done before CUDA is required as dependency for something +built with a non-system toolchain, see `eessi-2023.06-eb-4.8.2-010-CUDA.yml` . 
diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-001-system.yml similarity index 56% rename from easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml rename to easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-001-system.yml index 86d6931820..f02b9f2802 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-system.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-001-system.yml @@ -5,7 +5,3 @@ easyconfigs: - Nextflow-23.10.0.eb: options: from-pr: 19172 - - CUDA-12.1.1.eb: - options: - include-easyblocks-from-pr: 3045 - accept-eula-for: CUDA diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-010-CUDA.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-010-CUDA.yml new file mode 100644 index 0000000000..dda274b8db --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-010-CUDA.yml @@ -0,0 +1,5 @@ +easyconfigs: + - CUDA-12.1.1.eb: + options: + include-easyblocks-from-pr: 3045 + accept-eula-for: CUDA From f506566c982818bf0ee985c70d7dbe0d24084035 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 14:10:12 +0100 Subject: [PATCH 12/31] copy EasyBuild log file in case CUDA installation failed in install_cuda_host_injections.sh --- gpu_support/nvidia/install_cuda_host_injections.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/gpu_support/nvidia/install_cuda_host_injections.sh index f02f0da02e..62996fa924 100755 --- a/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/gpu_support/nvidia/install_cuda_host_injections.sh @@ -200,7 +200,9 @@ else eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" ret=$? if [ $ret -ne 0 ]; then - fatal_error "CUDA installation failed, please check EasyBuild logs..." + eb_last_log=$(unset EB_VERBOSE; eb --last-log) + cp -a ${eb_last_log} . + fatal_error "CUDA installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." else echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" fi From e3ddaccfc04cca311d40ef69eb55028ccf257dcc Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 14:29:00 +0100 Subject: [PATCH 13/31] add additional optional options required for handling NVIDIA support to start build container --- bot/build.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bot/build.sh b/bot/build.sh index 4af217628e..66f93d523e 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -176,6 +176,11 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR} declare -a BUILD_STEP_ARGS=() BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") +# add options required to handle NVIDIA support +BUILD_STEP_ARGS+=("--nvidia" "all") +if [[ ! 
-z ${SHARED_FS_PATH} ]]; then + BUILD_STEP_ARGS+=("--host-injections ${SHARED_FS_PATH}/host-injections") +fi # prepare arguments to install_software_layer.sh (specific to build step) declare -a INSTALL_SCRIPT_ARGS=() From 16ddf7f8a0d29cfe47c9948933bb2a10b5ee17c0 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 14:41:53 +0100 Subject: [PATCH 14/31] fix typo when passing --host-injections to container script --- bot/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/build.sh b/bot/build.sh index 66f93d523e..1622e757e2 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -179,7 +179,7 @@ BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support BUILD_STEP_ARGS+=("--nvidia" "all") if [[ ! -z ${SHARED_FS_PATH} ]]; then - BUILD_STEP_ARGS+=("--host-injections ${SHARED_FS_PATH}/host-injections") + BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections") fi # prepare arguments to install_software_layer.sh (specific to build step) From 35d6084a25ea76b75982e9bbac0154236d860d6a Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 14:50:04 +0100 Subject: [PATCH 15/31] correctly pass --nv to singularity command --- eessi_container.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/eessi_container.sh b/eessi_container.sh index 81d7be81ad..268f94975e 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -118,7 +118,6 @@ STORAGE= LIST_REPOS=0 MODE="shell" SETUP_NVIDIA=0 -ADDITIONAL_SINGULARITY_FLAGS= REPOSITORY="EESSI" RESUME= SAVE= @@ -437,12 +436,14 @@ BIND_PATHS="${BIND_PATHS},${EESSI_TMPDIR}:${TMP_IN_CONTAINER}" [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}" +declare -a ADDITIONAL_CONTAINER_OPTIONS=() + # Configure anything we need for NVIDIA GPUs and CUDA installation if [[ ${SETUP_NVIDIA} -eq 1 ]]; then if [[ "${NVIDIA_MODE}" == "run" || "${NVIDIA_MODE}" == "all" ]]; then # Give singularity the appropriate flag - ADDITIONAL_SINGULARITY_FLAGS="--nv ${ADDITIONAL_SINGULARITY_FLAGS}" - [[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_SINGULARITY_FLAGS=${ADDITIONAL_SINGULARITY_FLAGS}" + ADDITIONAL_CONTAINER_OPTIONS+=(--nv) + [[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_CONTAINER_OPTIONS=${ADDITIONAL_CONTAINER_OPTIONS[@]}" fi if [[ "${NVIDIA_MODE}" == "install" || "${NVIDIA_MODE}" == "all" ]]; then # Add additional bind mounts to allow CUDA to install within a container @@ -621,8 +622,8 @@ if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then fi echo "Launching container with command (next line):" -echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@" -singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@" +echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_CONTAINER_OPTIONS[@]} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@" +singularity ${RUN_QUIET} ${MODE} "${ADDITIONAL_CONTAINER_OPTIONS[@]}" "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@" exit_code=$? # 6. 
save tmp if requested (arg -s|--save) From fd976675c798b1c79205460eec225469fbf28473 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 14:50:55 +0100 Subject: [PATCH 16/31] use quotes when adding --nv --- eessi_container.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_container.sh b/eessi_container.sh index 268f94975e..d6e9558202 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -442,7 +442,7 @@ declare -a ADDITIONAL_CONTAINER_OPTIONS=() if [[ ${SETUP_NVIDIA} -eq 1 ]]; then if [[ "${NVIDIA_MODE}" == "run" || "${NVIDIA_MODE}" == "all" ]]; then # Give singularity the appropriate flag - ADDITIONAL_CONTAINER_OPTIONS+=(--nv) + ADDITIONAL_CONTAINER_OPTIONS+=("--nv") [[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_CONTAINER_OPTIONS=${ADDITIONAL_CONTAINER_OPTIONS[@]}" fi if [[ "${NVIDIA_MODE}" == "install" || "${NVIDIA_MODE}" == "all" ]]; then From 19171465240f52b3ef0f7eba17e4467031e23a58 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 15:22:20 +0100 Subject: [PATCH 17/31] comment out running of link_nvidia_host_libraries.sh script, since it requires working nvidia-smi command --- EESSI-install-software.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 95fb03e9b7..4e3f0acee5 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -198,7 +198,9 @@ ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} ${EESSI_PREFIX}/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula # Install drivers in host_injections -${EESSI_PREFIX}/gpu_support/nvidia/link_nvidia_host_libraries.sh +# TODO: this is commented out for now, because the script assumes that nvidia-smi is available and works; +# if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software) +# ${EESSI_PREFIX}/gpu_support/nvidia/link_nvidia_host_libraries.sh # use PR patch file to determine in which easystack files stuff was added for easystack_file in $(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing'); do From f80f0fc0f70026e4be1fa980e13a991102503737 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 20:15:58 +0100 Subject: [PATCH 18/31] clean up post_sanitycheck_cuda hook and inject_gpu_property function used in parse_hook --- eb_hooks.py | 149 +++++++++++++++++++++++++++++----------------------- 1 file changed, 83 insertions(+), 66 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 78580c14b9..27cc873fa1 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -7,7 +7,7 @@ from easybuild.framework.easyconfig.constants import EASYCONFIG_CONSTANTS from easybuild.tools.build_log import EasyBuildError, print_msg from easybuild.tools.config import build_option, update_build_option -from easybuild.tools.filetools import apply_regex_substitutions, copy_file, which +from easybuild.tools.filetools import apply_regex_substitutions, copy_file, remove_file, symlink, which from easybuild.tools.run import run_cmd from easybuild.tools.systemtools import AARCH64, POWER, X86_64, get_cpu_architecture, get_cpu_features from easybuild.tools.toolchain.compiler import OPTARCH_GENERIC @@ -250,12 +250,6 @@ def pre_configure_hook(self, *args, **kwargs): PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) -def post_sanitycheck_hook(self, *args, **kwargs): - """Main post-sanity-check hook: trigger custom functions based on software 
name.""" - if self.name in POST_SANITYCHECK_HOOKS: - POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs) - - def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs): """ Pre-configure hook for OpenBLAS: add DYNAMIC_ARCH=1 to build/test/install options when using --optarch=GENERIC @@ -402,76 +396,99 @@ def pre_single_extension_isoband(ext, *args, **kwargs): ext.cfg['preinstallopts'] = "sed -i 's/SIGSTKSZ/32768/g' src/testthat/vendor/catch.h && " +def post_sanitycheck_hook(self, *args, **kwargs): + """Main post-sanity-check hook: trigger custom functions based on software name.""" + if self.name in POST_SANITYCHECK_HOOKS: + POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs) + + def post_sanitycheck_cuda(self, *args, **kwargs): - """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections.""" - print_msg("Replacing CUDA stuff we cannot ship with symlinks...") - # read CUDA EULA - eula_path = os.path.join(self.installdir, "EULA.txt") - tmp_buffer = [] - with open(eula_path) as infile: - copy = False - for line in infile: - if line.strip() == "2.6. Attachment A": - copy = True - continue - elif line.strip() == "2.7. Attachment B": - copy = False - continue - elif copy: - tmp_buffer.append(line) - # create whitelist without file extensions, they're not really needed and they only complicate things - whitelist = ['EULA', 'README'] - file_extensions = [".so", ".a", ".h", ".bc"] - for tmp in tmp_buffer: - for word in tmp.split(): - if any(ext in word for ext in file_extensions): - whitelist.append(word.split(".")[0]) - whitelist = list(set(whitelist)) - # Do some quick checks for things we should or shouldn't have in the list - if "nvcc" in whitelist: - raise EasyBuildError("Found 'nvcc' in whitelist: %s" % whitelist) - if "libcudart" not in whitelist: - raise EasyBuildError("Did not find 'libcudart' in whitelist: %s" % whitelist) - # iterate over all files in the CUDA path - for root, dirs, files in os.walk(self.installdir): - for filename in files: - # we only really care about real files, i.e. not symlinks - if not os.path.islink(os.path.join(root, filename)): - # check if the current file is part of the whitelist - basename = filename.split(".")[0] - if basename not in whitelist: - # if it is not in the whitelist, delete the file and create a symlink to host_injections - source = os.path.join(root, filename) - target = source.replace("versions", "host_injections") - # Make sure source and target are not the same - if source == target: - raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you are" - "using this hook for an EESSI installation?") - os.remove(source) - # Using os.symlink requires the existence of the target directory, so we use os.system - system_command="ln -s '%s' '%s'" % (target, source) - if os.system(system_command) != 0: - raise EasyBuildError("Failed to create symbolic link: %s" % system_command) + """ + Remove files from CUDA installation that we are not allowed to ship, + and replace them with a symlink to a corresponding installation under host_injections. 
+ """ + if self.name == 'CUDA': + print_msg("Replacing files in CUDA installation that we can not ship with symlinks to host_injections...") + + # read CUDA EULA, construct allowlist based on section 2.6 that specifies list of files that can be shipped + eula_path = os.path.join(self.installdir, 'EULA.txt') + relevant_eula_lines = [] + with open(eula_path) as infile: + copy = False + for line in infile: + if line.strip() == "2.6. Attachment A": + copy = True + continue + elif line.strip() == "2.7. Attachment B": + copy = False + continue + elif copy: + relevant_eula_lines.append(line) + + # create list without file extensions, they're not really needed and they only complicate things + allowlist = ['EULA', 'README'] + file_extensions = ['.so', '.a', '.h', '.bc'] + for line in relevant_eula_lines: + for word in line.split(): + if any(ext in word for ext in file_extensions): + allowlist.append(os.path.splitext(word)[0]) + allowlist = sorted(set(allowlist)) + self.log.info("Allowlist for files in CUDA installation that can be redistributed: " + ', '.join(allowlist)) + + # Do some quick sanity checks for things we should or shouldn't have in the list + if 'nvcc' in allowlist: + raise EasyBuildError("Found 'nvcc' in allowlist: %s" % allowlist) + if 'libcudart' not in allowlist: + raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist) + + # iterate over all files in the CUDA installation directory + for dir_path, _, files in os.walk(self.installdir): + for filename in files: + full_path = os.path.join(dir_path, filename) + # we only really care about real files, i.e. not symlinks + if not os.path.islink(full_path): + # check if the current file is part of the allowlist + basename = os.path.splitext(filename)[0] + if basename in allowlist: + self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) + else: + self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s", + basename, full_path) + # if it is not in the allowlist, delete the file and create a symlink to host_injections + host_inj_path = full_path.replace('versions', 'host_injections') + # make sure source and target of symlink are not the same + if full_path == host_inj_path: + raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " + "are using this hook for an EESSI installation?", + full_path, host_inj_path) + remove_file(full_path) + symlink(host_inj_path, full_path) + else: + raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!") def inject_gpu_property(ec): + """ + Add 'gpu' property, via modluafooter easyconfig parameter + """ ec_dict = ec.asdict() - # Check if CUDA is in the dependencies, if so add the GPU Lmod tag - if ("CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])]): - ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version") - key = "modluafooter" + # Check if CUDA is in the dependencies, if so add the 'gpu' Lmod property + if ('CUDA' in [dep[0] for dep in iter(ec_dict['dependencies'])]): + ec.log.info("Injecting gpu as Lmod arch property and envvar with CUDA version") + key = 'modluafooter' value = 'add_property("arch","gpu")' cuda_version = 0 - for dep in iter(ec_dict["dependencies"]): + for dep in iter(ec_dict['dependencies']): # Make CUDA a build dependency only (rpathing saves us from link errors) - if "CUDA" in dep[0]: + if 'CUDA' in dep[0]: cuda_version = dep[1] - ec_dict["dependencies"].remove(dep) - ec_dict["builddependencies"].append(dep) if 
dep not in ec_dict["builddependencies"] else ec_dict["builddependencies"] - value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) + ec_dict['dependencies'].remove(dep) + if dep not in ec_dict['builddependencies']: + ec_dict['builddependencies'].append(dep) + value = '\n'.join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) if key in ec_dict: if not value in ec_dict[key]: - ec[key] = "\n".join([ec_dict[key], value]) + ec[key] = '\n'.join([ec_dict[key], value]) else: ec[key] = value return ec From 2d378421b697306642467d9019b748a1e67f39bc Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 21:22:42 +0100 Subject: [PATCH 19/31] remove empty line in eessi-2023.06-eb-4.8.2-2023a.yml --- .../software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml index 91efae766f..87ccd69e99 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml @@ -35,7 +35,6 @@ easyconfigs: - Boost-1.82.0-GCC-12.3.0.eb - netCDF-4.9.2-gompi-2023a.eb - FFmpeg-6.0-GCCcore-12.3.0.eb - - CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb: # use easyconfig that only install subset of CUDA samples, # to circumvent problem with nvcc linking to glibc of host OS; From f007c4061c3104a29b1053e877c61a32a244c30f Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Dec 2023 23:24:15 +0100 Subject: [PATCH 20/31] use easyconfigs PR 19451 for installing CUDA-Samples v12.1 --- .../2023.06/eessi-2023.06-eb-4.8.2-2023a.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml index 87ccd69e99..596b9ea21f 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.8.2-2023a.yml @@ -37,7 +37,9 @@ easyconfigs: - FFmpeg-6.0-GCCcore-12.3.0.eb - CUDA-Samples-12.1-GCC-12.3.0-CUDA-12.1.1.eb: # use easyconfig that only install subset of CUDA samples, - # to circumvent problem with nvcc linking to glibc of host OS; - # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19189 + # to circumvent problem with nvcc linking to glibc of host OS, + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19189; + # and where additional samples are excluded because they fail to build on aarch64, + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19451; options: - from-pr: 19189 + from-pr: 19451 From 70fa0f9ac3cd54a582640dc7bdadcd31c210fffc Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 21 Dec 2023 00:34:38 +0100 Subject: [PATCH 21/31] Ship the scripts, and keep them in a single location --- EESSI-install-software.sh | 4 ++-- create_lmodrc.py | 22 +++++++++---------- create_tarball.sh | 4 ++++ install_scripts.sh | 4 ++-- .../nvidia/install_cuda_host_injections.sh | 2 +- .../nvidia/link_nvidia_host_libraries.sh | 2 +- 6 files changed, 20 insertions(+), 18 deletions(-) rename {gpu_support => scripts/gpu_support}/nvidia/install_cuda_host_injections.sh (99%) rename {gpu_support => scripts/gpu_support}/nvidia/link_nvidia_host_libraries.sh (99%) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 4e3f0acee5..b61ca7a579 100755 
--- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -195,12 +195,12 @@ ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} # Install full CUDA SDK in host_injections # Hardcode this for now, see if it works # TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install -${EESSI_PREFIX}/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula +${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula # Install drivers in host_injections # TODO: this is commented out for now, because the script assumes that nvidia-smi is available and works; # if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software) -# ${EESSI_PREFIX}/gpu_support/nvidia/link_nvidia_host_libraries.sh +# ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh # use PR patch file to determine in which easystack files stuff was added for easystack_file in $(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing'); do diff --git a/create_lmodrc.py b/create_lmodrc.py index 6a72d8dc62..80635d78cc 100755 --- a/create_lmodrc.py +++ b/create_lmodrc.py @@ -36,6 +36,7 @@ -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections. -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI + local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n" if simpleName == 'CUDA' then -- get the full host_injections path local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') @@ -43,11 +44,10 @@ local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" local cudaDirExists = isDir(cudaEasyBuildDir) if not cudaDirExists then - local cvmfsRepo = os.getenv('EESSI_CVMFS_REPO') local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " - advice = advice .. "due to licencing. Please install a full copy of the CUDA SDK using the script " - advice = advice .. cvmfsRepo .. "/gpu_support/nvidia/install_cuda_host_injections.sh.\\n" - advice = advice .. "More information, see https://www.eessi.io/docs/gpu/.\\n" + advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI " + advice = advice .. "can find it.\\n" + advice = advice .. refer_to_docs LmodError("\\nYou requested to load ", simpleName, " ", advice) end end @@ -62,10 +62,9 @@ local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") if not (cudaDriverExists or singularityCudaExists) then local advice = "which relies on the CUDA runtime environment and driver libraries. " - advice = advice .. "In order to be able to use the module, please run the script " - advice = advice .. cvmfsRepo .. "/gpu_support/nvidia/link_nvidia_host_libraries.sh " - advice = advice .. "to make sure EESSI can find the drivers from on your host system.\\n" - advice = advice .. "More information, see https://www.eessi.io/docs/gpu/.\\n" + advice = advice .. "In order to be able to use the module, you will need " + advice = advice .. "to make sure EESSI can find the driver libraries on your host system.\\n" + advice = advice .. 
refer_to_docs LmodError("\\nYou requested to load ", simpleName, " ", advice) else -- CUDA driver exists, now we check its version to see if an update is needed @@ -85,10 +84,9 @@ end if driver_libs_need_update == true then local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". " - advice = advice .. "Please update your CUDA driver libraries and rerun the script" - advice = advice .. cvmfsRepo .. "/gpu_support/nvidia/install_cuda_host_injections.sh " - advice = advice .. "to let EESSI know about the update.\\n" - advice = advice .. "More information, see https://www.eessi.io/docs/gpu/.\\n" + advice = advice .. "Please update your CUDA driver libraries and then " + advice = advice .. "let EESSI know about the update.\\n" + advice = advice .. refer_to_docs LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice) end end diff --git a/create_tarball.sh b/create_tarball.sh index 8510caebf1..09ce94c835 100755 --- a/create_tarball.sh +++ b/create_tarball.sh @@ -40,6 +40,10 @@ echo ">> Collecting list of files/directories to include in tarball via ${PWD}.. files_list=${tmpdir}/files.list.txt module_files_list=${tmpdir}/module_files.list.txt +if [ -d ${eessi_version}/scripts ]; then + # include scripts we wish to ship along with EESSI, + find ${eessi_version}/scripts -type f | grep -v '/\.wh\.' >> ${files_list} +fi if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod ]; then # include Lmod cache and configuration file (lmodrc.lua), # skip whiteout files and backup copies of Lmod cache (spiderT.old.*) diff --git a/install_scripts.sh b/install_scripts.sh index d53dbb6a9c..224400db1c 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -50,8 +50,8 @@ for file in utils.sh; do cp -v -u ${SCRIPTS_DIR_SOURCE}/${file} ${SCRIPTS_DIR_TARGET}/${file} done # Subdirs for GPU support -NVIDIA_GPU_SUPPORT_DIR_SOURCE=${TOPDIR}/gpu_support/nvidia # Source dir -NVIDIA_GPU_SUPPORT_DIR_TARGET=${INSTALL_PREFIX}/gpu_support/nvidia # Target dir +NVIDIA_GPU_SUPPORT_DIR_SOURCE=${SCRIPTS_DIR_SOURCE}/gpu_support/nvidia # Source dir +NVIDIA_GPU_SUPPORT_DIR_TARGET=${SCRIPTS_DIR_TARGET}/gpu_support/nvidia # Target dir # Create target dir mkdir -p ${NVIDIA_GPU_SUPPORT_DIR_TARGET} diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh similarity index 99% rename from gpu_support/nvidia/install_cuda_host_injections.sh rename to scripts/gpu_support/nvidia/install_cuda_host_injections.sh index 62996fa924..a9310d817a 100755 --- a/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/scripts/gpu_support/nvidia/install_cuda_host_injections.sh @@ -14,7 +14,7 @@ # Initialise our bash functions TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -source "$TOPDIR"/../../scripts/utils.sh +source "$TOPDIR"/../../utils.sh # Function to display help message show_help() { diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh similarity index 99% rename from gpu_support/nvidia/link_nvidia_host_libraries.sh rename to scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index cb7420a0e9..e6ff110797 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -5,7 +5,7 @@ # Initialise our bash functions TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -source "$TOPDIR"/../../scripts/utils.sh +source "$TOPDIR"/../../utils.sh # We rely on ldconfig to give us the location of the libraries on 
the host command_name="ldconfig" From db0c14136f7d4e244e42e374f64357eea0f63789 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 21 Dec 2023 09:27:28 +0000 Subject: [PATCH 22/31] Update create_lmodrc.py Co-authored-by: Kenneth Hoste --- create_lmodrc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_lmodrc.py b/create_lmodrc.py index 80635d78cc..0e738a530e 100755 --- a/create_lmodrc.py +++ b/create_lmodrc.py @@ -63,7 +63,7 @@ if not (cudaDriverExists or singularityCudaExists) then local advice = "which relies on the CUDA runtime environment and driver libraries. " advice = advice .. "In order to be able to use the module, you will need " - advice = advice .. "to make sure EESSI can find the driver libraries on your host system.\\n" + advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n" advice = advice .. refer_to_docs LmodError("\\nYou requested to load ", simpleName, " ", advice) else From 293b1075e5619748f043019a5e856e0b52046d0d Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 21 Dec 2023 09:28:08 +0000 Subject: [PATCH 23/31] Update create_tarball.sh Co-authored-by: Kenneth Hoste --- create_tarball.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/create_tarball.sh b/create_tarball.sh index 09ce94c835..4d3ad37311 100755 --- a/create_tarball.sh +++ b/create_tarball.sh @@ -40,10 +40,12 @@ echo ">> Collecting list of files/directories to include in tarball via ${PWD}.. files_list=${tmpdir}/files.list.txt module_files_list=${tmpdir}/module_files.list.txt +# include scripts that were copied by install_scripts.sh if [ -d ${eessi_version}/scripts ]; then # include scripts we wish to ship along with EESSI, find ${eessi_version}/scripts -type f | grep -v '/\.wh\.' >> ${files_list} fi + if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod ]; then # include Lmod cache and configuration file (lmodrc.lua), # skip whiteout files and backup copies of Lmod cache (spiderT.old.*) From 73476b2b20c551ab7e6f0947f544f5f44d3f2537 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 21 Dec 2023 12:16:38 +0100 Subject: [PATCH 24/31] Only copy scripts if the contents differ --- install_scripts.sh | 21 +- .../nvidia/install_cuda_host_injections.sh | 211 ++++++++++++++++++ .../nvidia/link_nvidia_host_libraries.sh | 136 +++++++++++ temp/scripts/utils.sh | 144 ++++++++++++ 4 files changed, 510 insertions(+), 2 deletions(-) create mode 100755 temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh create mode 100755 temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh create mode 100644 temp/scripts/utils.sh diff --git a/install_scripts.sh b/install_scripts.sh index 224400db1c..588248e8d2 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -8,6 +8,23 @@ display_help() { echo " -h | --help - display this usage information" } +compare_and_copy() { + if [ "$#" -ne 2 ]; then + echo "Usage of function: compare_and_copy " + return 1 + fi + + source_file="$1" + destination_file="$2" + + if [ ! -f "$destination_file" ] || ! diff -q "$source_file" "$destination_file" ; then + cp "$source_file" "$destination_file" + echo "File $1 copied to $2." + else + echo "Files $1 and $2 are identical. No copy needed." 
+ fi +} + POSITIONAL_ARGS=() @@ -47,7 +64,7 @@ mkdir -p ${SCRIPTS_DIR_TARGET} # Copy scripts into this prefix echo "copying scripts from ${SCRIPTS_DIR_SOURCE} to ${SCRIPTS_DIR_TARGET}" for file in utils.sh; do - cp -v -u ${SCRIPTS_DIR_SOURCE}/${file} ${SCRIPTS_DIR_TARGET}/${file} + compare_and_copy ${SCRIPTS_DIR_SOURCE}/${file} ${SCRIPTS_DIR_TARGET}/${file} done # Subdirs for GPU support NVIDIA_GPU_SUPPORT_DIR_SOURCE=${SCRIPTS_DIR_SOURCE}/gpu_support/nvidia # Source dir @@ -60,5 +77,5 @@ mkdir -p ${NVIDIA_GPU_SUPPORT_DIR_TARGET} # To be on the safe side, we dont do recursive copies, but we are explicitely copying each individual file we want to add echo "copying scripts from ${NVIDIA_GPU_SUPPORT_DIR_SOURCE} to ${NVIDIA_GPU_SUPPORT_DIR_TARGET}" for file in install_cuda_host_injections.sh link_nvidia_host_libraries.sh; do - cp -v -u ${NVIDIA_GPU_SUPPORT_DIR_SOURCE}/${file} ${NVIDIA_GPU_SUPPORT_DIR_TARGET}/${file} + compare_and_copy ${NVIDIA_GPU_SUPPORT_DIR_SOURCE}/${file} ${NVIDIA_GPU_SUPPORT_DIR_TARGET}/${file} done diff --git a/temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh new file mode 100755 index 0000000000..a9310d817a --- /dev/null +++ b/temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh @@ -0,0 +1,211 @@ +#!/usr/bin/env bash + +# This script can be used to install CUDA under the `.../host_injections` directory. +# This provides the parts of the CUDA installation that cannot be redistributed as +# part of EESSI due to license limitations. While GPU-based software from EESSI will +# _run_ without these, installation of additional CUDA software requires the CUDA +# installation(s) under `host_injections` to be present. +# +# The `host_injections` directory is a variant symlink that by default points to +# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see +# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the +# installation to be successful, this directory needs to be writeable by the user +# executing this script. 
+ +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../utils.sh + +# Function to display help message +show_help() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --help Display this help message" + echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" + echo " CUDA, see the EULA at" + echo " https://docs.nvidia.com/cuda/eula/index.html" + echo " -c, --cuda-version CUDA_VERSION Specify a version o CUDA to install (must" + echo " have a corresponding easyconfig in the" + echo " EasyBuild release)" + echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" + echo " storage during the CUDA install" + echo " (must have >10GB available)" +} + +# Initialize variables +install_cuda_version="" +eula_accepted=0 + +# Parse command-line options +while [[ $# -gt 0 ]]; do + case "$1" in + --help) + show_help + exit 0 + ;; + -c|--cuda-version) + if [ -n "$2" ]; then + install_cuda_version="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + --accept-cuda-eula) + eula_accepted=1 + shift 1 + ;; + -t|--temp-dir) + if [ -n "$2" ]; then + CUDA_TEMP_DIR="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + *) + show_help + fatal_error "Error: Unknown option: $1" + ;; + esac +done + +# Make sure EESSI is initialised +check_eessi_initialised + +# Make sure the CUDA version supplied is a semantic version +is_semantic_version() { + local version=$1 + local regex='^[0-9]+\.[0-9]+\.[0-9]+$' + + if [[ $version =~ $regex ]]; then + return 0 # Return success (0) if it's a semantic version + else + return 1 # Return failure (1) if it's not a semantic version + fi +} +if ! is_semantic_version "$install_cuda_version"; then + show_help + error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" + error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" + error="${error}version to provide is probably one of those available under\n" + error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n" + fatal_error "${error}" +fi + +# Make sure they have accepted the CUDA EULA +if [ "$eula_accepted" -ne 1 ]; then + show_help + error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" + fatal_error "${error}" +fi + +# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` +# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) +cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} + +# Only install CUDA if specified version is not found. +# (existence of easybuild subdir implies a successful install) +if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then + echo_green "CUDA software found! No need to install CUDA again." +else + # We need to be able write to the installation space so let's make sure we can + if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then + fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" + fi + + # we need a directory we can use for temporary storage + if [[ -z "${CUDA_TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) + else + tmpdir="${CUDA_TEMP_DIR}"/temp + if ! 
mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi + fi + + required_space_in_tmpdir=50000 + # Let's see if we have sources and build locations defined if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), + # need to do a space check before we proceed + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < 5000000 )); then + fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." + fi + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check." + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " + error="${error}to reduce this requirement. Exiting now..." + fatal_error "${error}" + fi + + if ! command -v "eb" &>/dev/null; then + echo_yellow "Attempting to load an EasyBuild module to do actual install" + module load EasyBuild + # There are some scenarios where this may fail + if [ $? -ne 0 ]; then + error="'eb' command not found in your environment and\n" + error="${error} module load EasyBuild\n" + error="${error}failed for some reason.\n" + error="${error}Please re-run this script with the 'eb' command available." + fatal_error "${error}" + fi + fi + + cuda_easyconfig="CUDA-${install_cuda_version}.eb" + + # Check the easyconfig file is available in the release + # (eb search always returns 0, so we need a grep to ensure a usable exit code) + eb --search ^${cuda_easyconfig}|grep CUDA > /dev/null 2>&1 + # Check the exit code + if [ $? -ne 0 ]; then + eb_version=$(eb --version) + available_cuda_easyconfigs=$(eb --search ^CUDA-*.eb|grep CUDA) + + error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n" + error="${error} ${eb_version}\n" + error="${error}You either need to give a different version of CUDA to install _or_ \n" + error="${error}use a different version of EasyBuild for the installation.\n" + error="${error}\nThe versions of available with the current eb command are:\n" + error="${error}${available_cuda_easyconfigs}" + fatal_error "${error}" + fi + + # We need the --rebuild option, as the CUDA module may or may not be on the + # `MODULEPATH` yet. Even if it is, we still want to redo this installation + # since it will provide the symlinked targets for the parts of the CUDA + # installation in the `.../versions/...` prefix + # We install the module in our `tmpdir` since we do not need the modulefile, + # we only care about providing the targets for the symlinks. 
+ extra_args="--rebuild --installpath-modules=${tmpdir}" + + # We don't want hooks used in this install, we need a vanilla CUDA installation + touch "$tmpdir"/none.py + # shellcheck disable=SC2086 # Intended splitting of extra_args + eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" + ret=$? + if [ $ret -ne 0 ]; then + eb_last_log=$(unset EB_VERBOSE; eb --last-log) + cp -a ${eb_last_log} . + fatal_error "CUDA installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." + else + echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" + fi + # clean up tmpdir + rm -rf "${tmpdir}" +fi diff --git a/temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh new file mode 100755 index 0000000000..e6ff110797 --- /dev/null +++ b/temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# This script links host libraries related to GPU drivers to a location where +# they can be found by the EESSI linker + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../utils.sh + +# We rely on ldconfig to give us the location of the libraries on the host +command_name="ldconfig" +# We cannot use a version of ldconfig that's being shipped under CVMFS +exclude_prefix="/cvmfs" + +found_paths=() +# Always attempt to use /sbin/ldconfig +if [ -x "/sbin/$command_name" ]; then + found_paths+=("/sbin/$command_name") +fi +IFS=':' read -ra path_dirs <<< "$PATH" +for dir in "${path_dirs[@]}"; do + if [ "$dir" = "/sbin" ]; then + continue # we've already checked for $command_name in /sbin, don't need to do it twice + fi + if [[ ! "$dir" =~ ^$exclude_prefix ]]; then + if [ -x "$dir/$command_name" ]; then + found_paths+=("$dir/$command_name") + fi + fi +done + +if [ ${#found_paths[@]} -gt 0 ]; then + echo "Found $command_name in the following locations:" + printf -- "- %s\n" "${found_paths[@]}" + echo "Using first version" + host_ldconfig=${found_paths[0]} +else + error="$command_name not found in PATH or only found in paths starting with $exclude_prefix." + fatal_error "$error" +fi + +# Make sure EESSI is initialised (doesn't matter what version) +check_eessi_initialised + +# Find the CUDA version of the host CUDA drivers +# (making sure that this can still work inside prefix environment inside a container) +export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH +nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" +if $nvidia_smi_command > /dev/null; then + host_driver_version=$($nvidia_smi_command | tail -n1) + # If the first worked, this should work too + host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') +else + error="Failed to successfully execute\n $nvidia_smi_command\n" + fatal_error "$error" +fi + +# Let's make sure the driver libraries are not already in place +link_drivers=1 + +host_injections_nvidia_dir="${EESSI_CVMFS_REPO}/host_injections/nvidia/${EESSI_CPU_FAMILY}" +host_injection_driver_dir="${host_injections_nvidia_dir}/host" +host_injection_driver_version_file="$host_injection_driver_dir/driver_version.txt" +if [ -e "$host_injection_driver_version_file" ]; then + if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then + echo_green "The host CUDA driver libraries have already been linked!" 
+ link_drivers=0 + else + # There's something there but it is out of date + echo_yellow "Cleaning out outdated symlinks" + rm $host_injection_driver_dir/* + if [ $? -ne 0 ]; then + error="Unable to remove files under '$host_injection_driver_dir'." + fatal_error "$error" + fi + fi +fi + +drivers_linked=0 +if [ "$link_drivers" -eq 1 ]; then + if ! create_directory_structure "${host_injection_driver_dir}" ; then + fatal_error "No write permissions to directory ${host_injection_driver_dir}" + fi + cd ${host_injection_driver_dir} + # Need a small temporary space to hold a couple of files + temp_dir=$(mktemp -d) + + # Gather libraries on the host (_must_ be host ldconfig) + $host_ldconfig -p | awk '{print $NF}' > "$temp_dir"/libs.txt + # Allow for the fact that we may be in a container so the CUDA libs might be in there + ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null + + # Leverage singularity to find the full list of libraries we should be linking to + echo_yellow "Downloading latest version of nvliblist.conf from Apptainer" + curl -o "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf + + # Make symlinks to all the interesting libraries + grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} + + # Inject driver and CUDA versions into dir + echo $host_driver_version > driver_version.txt + echo $host_cuda_version > cuda_version.txt + drivers_linked=1 + + # Remove the temporary directory when done + rm -r "$temp_dir" +fi + +# Make latest symlink for NVIDIA drivers +cd $host_injections_nvidia_dir +symlink="latest" +if [ -L "$symlink" ]; then + # Unless the drivers have been installed, leave the symlink alone + if [ "$drivers_linked" -eq 1 ]; then + ln -sf host latest + fi +else + # No link exists yet + ln -s host latest +fi + +# Make sure the libraries can be found by the EESSI linker +host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} +if [ -L "$host_injection_linker_dir/lib" ]; then + target_path=$(readlink -f "$host_injection_linker_dir/lib") + if [ "$target_path" != "$$host_injections_nvidia_dir/latest" ]; then + cd $host_injection_linker_dir + ln -sf $host_injections_nvidia_dir/latest lib + fi +else + create_directory_structure $host_injection_linker_dir + cd $host_injection_linker_dir + ln -s $host_injections_nvidia_dir/latest lib +fi + +echo_green "Host NVIDIA gpu drivers linked successfully for EESSI" diff --git a/temp/scripts/utils.sh b/temp/scripts/utils.sh new file mode 100644 index 0000000000..b2be3f6221 --- /dev/null +++ b/temp/scripts/utils.sh @@ -0,0 +1,144 @@ +function echo_green() { + echo -e "\e[32m$1\e[0m" +} + +function echo_red() { + echo -e "\e[31m$1\e[0m" +} + +function echo_yellow() { + echo -e "\e[33m$1\e[0m" +} + +ANY_ERROR_EXITCODE=1 +function fatal_error() { + echo_red "ERROR: $1" >&2 + if [[ $# -gt 1 ]]; then + exit "$2" + else + exit "${ANY_ERROR_EXITCODE}" + fi +} + +function check_exit_code { + ec=$1 + ok_msg=$2 + fail_msg=$3 + + if [[ $ec -eq 0 ]]; then + echo_green "${ok_msg}" + else + fatal_error "${fail_msg}" + fi +} + +function check_eessi_initialised() { + if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + fatal_error "EESSI has not been initialised!" + else + return 0 + fi +} + +function check_in_prefix_shell() { + # Make sure EPREFIX is defined + if [[ -z "${EPREFIX}" ]]; then + fatal_error "This script cannot be used without having first defined EPREFIX" + fi + if [[ ! 
${SHELL} = ${EPREFIX}/bin/bash ]]; then + fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" + fi +} + +function create_directory_structure() { + # Ensure we are given a single path argument + if [ $# -ne 1 ]; then + echo_red "Function requires a single (relative or absolute) path argument" >&2 + return $ANY_ERROR_EXITCODE + fi + dir_structure="$1" + + # Attempt to create the directory structure + error_message=$(mkdir -p "$dir_structure" 2>&1) + return_code=$? + # If it fails be explicit about the error + if [ ${return_code} -ne 0 ]; then + real_dir=$(realpath -m "$dir_structure") + echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 + else + # If we're creating it, our use case is that we want to be able to write there + # (this is a check in case the directory already existed) + if [ ! -w "${dir_structure}" ]; then + real_dir=$(realpath -m "$dir_structure") + echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" + return_code=$ANY_ERROR_EXITCODE + fi + fi + + return $return_code +} + +function get_path_for_tool { + tool_name=$1 + tool_envvar_name=$2 + + which_out=$(which "${tool_name}" 2>&1) + exit_code=$? + if [[ ${exit_code} -eq 0 ]]; then + echo "INFO: found tool ${tool_name} in PATH (${which_out})" >&2 + echo "${which_out}" + return 0 + fi + if [[ -z "${tool_envvar_name}" ]]; then + msg="no env var holding the full path to tool '${tool_name}' provided" + echo "${msg}" >&2 + return 1 + else + tool_envvar_value=${!tool_envvar_name} + if [[ -x "${tool_envvar_value}" ]]; then + msg="INFO: found tool ${tool_envvar_value} via env var ${tool_envvar_name}" + echo "${msg}" >&2 + echo "${tool_envvar_value}" + return 0 + else + msg="ERROR: tool '${tool_name}' not in PATH\n" + msg+="ERROR: tool '${tool_envvar_value}' via '${tool_envvar_name}' not in PATH" + echo "${msg}" >&2 + echo "" + return 2 + fi + fi +} + +function get_host_from_url { + url=$1 + re="(http|https)://([^/:]+)" + if [[ $url =~ $re ]]; then + echo "${BASH_REMATCH[2]}" + return 0 + else + echo "" + return 1 + fi +} + +function get_port_from_url { + url=$1 + re="(http|https)://[^:]+:([0-9]+)" + if [[ $url =~ $re ]]; then + echo "${BASH_REMATCH[2]}" + return 0 + else + echo "" + return 1 + fi +} + +function get_ipv4_address { + hname=$1 + hipv4=$(grep "${hname}" /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) + # TODO try other methods if the one above does not work --> tool that verifies + # what method can be used? 
+ echo "${hipv4}" + return 0 +} From a333a741bb75a68f6d29cb718472af70e3b5c912 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 21 Dec 2023 12:19:19 +0100 Subject: [PATCH 25/31] Remove temporary test directory --- .../nvidia/install_cuda_host_injections.sh | 211 ------------------ .../nvidia/link_nvidia_host_libraries.sh | 136 ----------- temp/scripts/utils.sh | 144 ------------ 3 files changed, 491 deletions(-) delete mode 100755 temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh delete mode 100755 temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh delete mode 100644 temp/scripts/utils.sh diff --git a/temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh b/temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh deleted file mode 100755 index a9310d817a..0000000000 --- a/temp/scripts/gpu_support/nvidia/install_cuda_host_injections.sh +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env bash - -# This script can be used to install CUDA under the `.../host_injections` directory. -# This provides the parts of the CUDA installation that cannot be redistributed as -# part of EESSI due to license limitations. While GPU-based software from EESSI will -# _run_ without these, installation of additional CUDA software requires the CUDA -# installation(s) under `host_injections` to be present. -# -# The `host_injections` directory is a variant symlink that by default points to -# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see -# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the -# installation to be successful, this directory needs to be writeable by the user -# executing this script. - -# Initialise our bash functions -TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -source "$TOPDIR"/../../utils.sh - -# Function to display help message -show_help() { - echo "Usage: $0 [OPTIONS]" - echo "Options:" - echo " --help Display this help message" - echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" - echo " CUDA, see the EULA at" - echo " https://docs.nvidia.com/cuda/eula/index.html" - echo " -c, --cuda-version CUDA_VERSION Specify a version o CUDA to install (must" - echo " have a corresponding easyconfig in the" - echo " EasyBuild release)" - echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" - echo " storage during the CUDA install" - echo " (must have >10GB available)" -} - -# Initialize variables -install_cuda_version="" -eula_accepted=0 - -# Parse command-line options -while [[ $# -gt 0 ]]; do - case "$1" in - --help) - show_help - exit 0 - ;; - -c|--cuda-version) - if [ -n "$2" ]; then - install_cuda_version="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - --accept-cuda-eula) - eula_accepted=1 - shift 1 - ;; - -t|--temp-dir) - if [ -n "$2" ]; then - CUDA_TEMP_DIR="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - *) - show_help - fatal_error "Error: Unknown option: $1" - ;; - esac -done - -# Make sure EESSI is initialised -check_eessi_initialised - -# Make sure the CUDA version supplied is a semantic version -is_semantic_version() { - local version=$1 - local regex='^[0-9]+\.[0-9]+\.[0-9]+$' - - if [[ $version =~ $regex ]]; then - return 0 # Return success (0) if it's a semantic version - else - return 1 # Return failure (1) if it's not a semantic version - fi -} -if ! 
is_semantic_version "$install_cuda_version"; then - show_help - error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" - error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" - error="${error}version to provide is probably one of those available under\n" - error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n" - fatal_error "${error}" -fi - -# Make sure they have accepted the CUDA EULA -if [ "$eula_accepted" -ne 1 ]; then - show_help - error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" - fatal_error "${error}" -fi - -# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` -# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) -cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} - -# Only install CUDA if specified version is not found. -# (existence of easybuild subdir implies a successful install) -if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then - echo_green "CUDA software found! No need to install CUDA again." -else - # We need to be able write to the installation space so let's make sure we can - if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then - fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" - fi - - # we need a directory we can use for temporary storage - if [[ -z "${CUDA_TEMP_DIR}" ]]; then - tmpdir=$(mktemp -d) - else - tmpdir="${CUDA_TEMP_DIR}"/temp - if ! mkdir "$tmpdir" ; then - fatal_error "Could not create directory ${tmpdir}" - fi - fi - - required_space_in_tmpdir=50000 - # Let's see if we have sources and build locations defined if not, we use the temporary space - if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then - export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then - export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - - # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), - # need to do a space check before we proceed - avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < 5000000 )); then - fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." - fi - avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < required_space_in_tmpdir )); then - error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" - error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check." - error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " - error="${error}to reduce this requirement. Exiting now..." - fatal_error "${error}" - fi - - if ! command -v "eb" &>/dev/null; then - echo_yellow "Attempting to load an EasyBuild module to do actual install" - module load EasyBuild - # There are some scenarios where this may fail - if [ $? -ne 0 ]; then - error="'eb' command not found in your environment and\n" - error="${error} module load EasyBuild\n" - error="${error}failed for some reason.\n" - error="${error}Please re-run this script with the 'eb' command available." 
- fatal_error "${error}" - fi - fi - - cuda_easyconfig="CUDA-${install_cuda_version}.eb" - - # Check the easyconfig file is available in the release - # (eb search always returns 0, so we need a grep to ensure a usable exit code) - eb --search ^${cuda_easyconfig}|grep CUDA > /dev/null 2>&1 - # Check the exit code - if [ $? -ne 0 ]; then - eb_version=$(eb --version) - available_cuda_easyconfigs=$(eb --search ^CUDA-*.eb|grep CUDA) - - error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n" - error="${error} ${eb_version}\n" - error="${error}You either need to give a different version of CUDA to install _or_ \n" - error="${error}use a different version of EasyBuild for the installation.\n" - error="${error}\nThe versions of available with the current eb command are:\n" - error="${error}${available_cuda_easyconfigs}" - fatal_error "${error}" - fi - - # We need the --rebuild option, as the CUDA module may or may not be on the - # `MODULEPATH` yet. Even if it is, we still want to redo this installation - # since it will provide the symlinked targets for the parts of the CUDA - # installation in the `.../versions/...` prefix - # We install the module in our `tmpdir` since we do not need the modulefile, - # we only care about providing the targets for the symlinks. - extra_args="--rebuild --installpath-modules=${tmpdir}" - - # We don't want hooks used in this install, we need a vanilla CUDA installation - touch "$tmpdir"/none.py - # shellcheck disable=SC2086 # Intended splitting of extra_args - eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" - ret=$? - if [ $ret -ne 0 ]; then - eb_last_log=$(unset EB_VERBOSE; eb --last-log) - cp -a ${eb_last_log} . - fatal_error "CUDA installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." - else - echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" - fi - # clean up tmpdir - rm -rf "${tmpdir}" -fi diff --git a/temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh deleted file mode 100755 index e6ff110797..0000000000 --- a/temp/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/bin/bash - -# This script links host libraries related to GPU drivers to a location where -# they can be found by the EESSI linker - -# Initialise our bash functions -TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -source "$TOPDIR"/../../utils.sh - -# We rely on ldconfig to give us the location of the libraries on the host -command_name="ldconfig" -# We cannot use a version of ldconfig that's being shipped under CVMFS -exclude_prefix="/cvmfs" - -found_paths=() -# Always attempt to use /sbin/ldconfig -if [ -x "/sbin/$command_name" ]; then - found_paths+=("/sbin/$command_name") -fi -IFS=':' read -ra path_dirs <<< "$PATH" -for dir in "${path_dirs[@]}"; do - if [ "$dir" = "/sbin" ]; then - continue # we've already checked for $command_name in /sbin, don't need to do it twice - fi - if [[ ! 
"$dir" =~ ^$exclude_prefix ]]; then - if [ -x "$dir/$command_name" ]; then - found_paths+=("$dir/$command_name") - fi - fi -done - -if [ ${#found_paths[@]} -gt 0 ]; then - echo "Found $command_name in the following locations:" - printf -- "- %s\n" "${found_paths[@]}" - echo "Using first version" - host_ldconfig=${found_paths[0]} -else - error="$command_name not found in PATH or only found in paths starting with $exclude_prefix." - fatal_error "$error" -fi - -# Make sure EESSI is initialised (doesn't matter what version) -check_eessi_initialised - -# Find the CUDA version of the host CUDA drivers -# (making sure that this can still work inside prefix environment inside a container) -export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH -nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" -if $nvidia_smi_command > /dev/null; then - host_driver_version=$($nvidia_smi_command | tail -n1) - # If the first worked, this should work too - host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') -else - error="Failed to successfully execute\n $nvidia_smi_command\n" - fatal_error "$error" -fi - -# Let's make sure the driver libraries are not already in place -link_drivers=1 - -host_injections_nvidia_dir="${EESSI_CVMFS_REPO}/host_injections/nvidia/${EESSI_CPU_FAMILY}" -host_injection_driver_dir="${host_injections_nvidia_dir}/host" -host_injection_driver_version_file="$host_injection_driver_dir/driver_version.txt" -if [ -e "$host_injection_driver_version_file" ]; then - if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then - echo_green "The host CUDA driver libraries have already been linked!" - link_drivers=0 - else - # There's something there but it is out of date - echo_yellow "Cleaning out outdated symlinks" - rm $host_injection_driver_dir/* - if [ $? -ne 0 ]; then - error="Unable to remove files under '$host_injection_driver_dir'." - fatal_error "$error" - fi - fi -fi - -drivers_linked=0 -if [ "$link_drivers" -eq 1 ]; then - if ! 
create_directory_structure "${host_injection_driver_dir}" ; then - fatal_error "No write permissions to directory ${host_injection_driver_dir}" - fi - cd ${host_injection_driver_dir} - # Need a small temporary space to hold a couple of files - temp_dir=$(mktemp -d) - - # Gather libraries on the host (_must_ be host ldconfig) - $host_ldconfig -p | awk '{print $NF}' > "$temp_dir"/libs.txt - # Allow for the fact that we may be in a container so the CUDA libs might be in there - ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null - - # Leverage singularity to find the full list of libraries we should be linking to - echo_yellow "Downloading latest version of nvliblist.conf from Apptainer" - curl -o "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf - - # Make symlinks to all the interesting libraries - grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} - - # Inject driver and CUDA versions into dir - echo $host_driver_version > driver_version.txt - echo $host_cuda_version > cuda_version.txt - drivers_linked=1 - - # Remove the temporary directory when done - rm -r "$temp_dir" -fi - -# Make latest symlink for NVIDIA drivers -cd $host_injections_nvidia_dir -symlink="latest" -if [ -L "$symlink" ]; then - # Unless the drivers have been installed, leave the symlink alone - if [ "$drivers_linked" -eq 1 ]; then - ln -sf host latest - fi -else - # No link exists yet - ln -s host latest -fi - -# Make sure the libraries can be found by the EESSI linker -host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} -if [ -L "$host_injection_linker_dir/lib" ]; then - target_path=$(readlink -f "$host_injection_linker_dir/lib") - if [ "$target_path" != "$$host_injections_nvidia_dir/latest" ]; then - cd $host_injection_linker_dir - ln -sf $host_injections_nvidia_dir/latest lib - fi -else - create_directory_structure $host_injection_linker_dir - cd $host_injection_linker_dir - ln -s $host_injections_nvidia_dir/latest lib -fi - -echo_green "Host NVIDIA gpu drivers linked successfully for EESSI" diff --git a/temp/scripts/utils.sh b/temp/scripts/utils.sh deleted file mode 100644 index b2be3f6221..0000000000 --- a/temp/scripts/utils.sh +++ /dev/null @@ -1,144 +0,0 @@ -function echo_green() { - echo -e "\e[32m$1\e[0m" -} - -function echo_red() { - echo -e "\e[31m$1\e[0m" -} - -function echo_yellow() { - echo -e "\e[33m$1\e[0m" -} - -ANY_ERROR_EXITCODE=1 -function fatal_error() { - echo_red "ERROR: $1" >&2 - if [[ $# -gt 1 ]]; then - exit "$2" - else - exit "${ANY_ERROR_EXITCODE}" - fi -} - -function check_exit_code { - ec=$1 - ok_msg=$2 - fail_msg=$3 - - if [[ $ec -eq 0 ]]; then - echo_green "${ok_msg}" - else - fatal_error "${fail_msg}" - fi -} - -function check_eessi_initialised() { - if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then - fatal_error "EESSI has not been initialised!" - else - return 0 - fi -} - -function check_in_prefix_shell() { - # Make sure EPREFIX is defined - if [[ -z "${EPREFIX}" ]]; then - fatal_error "This script cannot be used without having first defined EPREFIX" - fi - if [[ ! ${SHELL} = ${EPREFIX}/bin/bash ]]; then - fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" 
- fi -} - -function create_directory_structure() { - # Ensure we are given a single path argument - if [ $# -ne 1 ]; then - echo_red "Function requires a single (relative or absolute) path argument" >&2 - return $ANY_ERROR_EXITCODE - fi - dir_structure="$1" - - # Attempt to create the directory structure - error_message=$(mkdir -p "$dir_structure" 2>&1) - return_code=$? - # If it fails be explicit about the error - if [ ${return_code} -ne 0 ]; then - real_dir=$(realpath -m "$dir_structure") - echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 - else - # If we're creating it, our use case is that we want to be able to write there - # (this is a check in case the directory already existed) - if [ ! -w "${dir_structure}" ]; then - real_dir=$(realpath -m "$dir_structure") - echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" - return_code=$ANY_ERROR_EXITCODE - fi - fi - - return $return_code -} - -function get_path_for_tool { - tool_name=$1 - tool_envvar_name=$2 - - which_out=$(which "${tool_name}" 2>&1) - exit_code=$? - if [[ ${exit_code} -eq 0 ]]; then - echo "INFO: found tool ${tool_name} in PATH (${which_out})" >&2 - echo "${which_out}" - return 0 - fi - if [[ -z "${tool_envvar_name}" ]]; then - msg="no env var holding the full path to tool '${tool_name}' provided" - echo "${msg}" >&2 - return 1 - else - tool_envvar_value=${!tool_envvar_name} - if [[ -x "${tool_envvar_value}" ]]; then - msg="INFO: found tool ${tool_envvar_value} via env var ${tool_envvar_name}" - echo "${msg}" >&2 - echo "${tool_envvar_value}" - return 0 - else - msg="ERROR: tool '${tool_name}' not in PATH\n" - msg+="ERROR: tool '${tool_envvar_value}' via '${tool_envvar_name}' not in PATH" - echo "${msg}" >&2 - echo "" - return 2 - fi - fi -} - -function get_host_from_url { - url=$1 - re="(http|https)://([^/:]+)" - if [[ $url =~ $re ]]; then - echo "${BASH_REMATCH[2]}" - return 0 - else - echo "" - return 1 - fi -} - -function get_port_from_url { - url=$1 - re="(http|https)://[^:]+:([0-9]+)" - if [[ $url =~ $re ]]; then - echo "${BASH_REMATCH[2]}" - return 0 - else - echo "" - return 1 - fi -} - -function get_ipv4_address { - hname=$1 - hipv4=$(grep "${hname}" /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) - # TODO try other methods if the one above does not work --> tool that verifies - # what method can be used? - echo "${hipv4}" - return 0 -} From 43c73c07643cf756ff8411b9068bf4cb2a64eec4 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 21 Dec 2023 13:06:53 +0100 Subject: [PATCH 26/31] Get rid of copy/paste unfriendly '.' --- install_scripts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_scripts.sh b/install_scripts.sh index 588248e8d2..6e6cd825ac 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -19,7 +19,7 @@ compare_and_copy() { if [ ! -f "$destination_file" ] || ! diff -q "$source_file" "$destination_file" ; then cp "$source_file" "$destination_file" - echo "File $1 copied to $2." + echo "File $1 copied to $2" else echo "Files $1 and $2 are identical. No copy needed." 
fi From 3ec3df8f4a91c6684ed83c6b021c231ca74ba7c0 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 21 Dec 2023 14:11:09 +0100 Subject: [PATCH 27/31] Update create_tarball.sh --- create_tarball.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/create_tarball.sh b/create_tarball.sh index 4d3ad37311..a172428af1 100755 --- a/create_tarball.sh +++ b/create_tarball.sh @@ -43,13 +43,13 @@ module_files_list=${tmpdir}/module_files.list.txt # include scripts that were copied by install_scripts.sh if [ -d ${eessi_version}/scripts ]; then # include scripts we wish to ship along with EESSI, - find ${eessi_version}/scripts -type f | grep -v '/\.wh\.' >> ${files_list} + find ${eessi_version}/scripts -type f | grep -v '/\.wh\.' > ${files_list} fi if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod ]; then # include Lmod cache and configuration file (lmodrc.lua), # skip whiteout files and backup copies of Lmod cache (spiderT.old.*) - find ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod -type f | egrep -v '/\.wh\.|spiderT.old' > ${files_list} + find ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod -type f | egrep -v '/\.wh\.|spiderT.old' >> ${files_list} fi if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/modules ]; then # module files From 42e3404a0015d7b4049c5cd3e5a01ee764261d44 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 21 Dec 2023 14:13:23 +0100 Subject: [PATCH 28/31] always append to list of files to include in tarball, to avoid overwriting it --- create_tarball.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/create_tarball.sh b/create_tarball.sh index a172428af1..faaa9fda6f 100755 --- a/create_tarball.sh +++ b/create_tarball.sh @@ -40,10 +40,10 @@ echo ">> Collecting list of files/directories to include in tarball via ${PWD}.. files_list=${tmpdir}/files.list.txt module_files_list=${tmpdir}/module_files.list.txt -# include scripts that were copied by install_scripts.sh -if [ -d ${eessi_version}/scripts ]; then - # include scripts we wish to ship along with EESSI, - find ${eessi_version}/scripts -type f | grep -v '/\.wh\.' > ${files_list} +# include Lmod cache and configuration file (lmodrc.lua), +if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod ]; then + # skip whiteout files and backup copies of Lmod cache (spiderT.old.*) + find ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod -type f | egrep -v '/\.wh\.|spiderT.old' >> ${files_list} fi if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod ]; then @@ -51,6 +51,12 @@ if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod ]; then # skip whiteout files and backup copies of Lmod cache (spiderT.old.*) find ${eessi_version}/software/${os}/${cpu_arch_subdir}/.lmod -type f | egrep -v '/\.wh\.|spiderT.old' >> ${files_list} fi + +# include scripts that were copied by install_scripts.sh, which we want to ship in EESSI repository +if [ -d ${eessi_version}/scripts ]; then + find ${eessi_version}/scripts -type f | grep -v '/\.wh\.' >> ${files_list} +fi + if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/modules ]; then # module files find ${eessi_version}/software/${os}/${cpu_arch_subdir}/modules -type f | grep -v '/\.wh\.' >> ${files_list} @@ -61,6 +67,7 @@ if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/modules ]; then | grep -v '/\.wh\.' 
| grep -v '/\.modulerc\.lua' | sed -e 's/.lua$//' | sed -e 's@.*/modules/all/@@g' | sort -u \ >> ${module_files_list} fi + if [ -d ${eessi_version}/software/${os}/${cpu_arch_subdir}/software -a -r ${module_files_list} ]; then # installation directories but only those for which module files were created # Note, we assume that module names (as defined by 'PACKAGE_NAME/VERSION.lua' From 60741ae979757b72c87551a9d0e15afefda4a4a6 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 21 Dec 2023 20:48:36 +0100 Subject: [PATCH 29/31] make link_nvidia_host_libraries.sh script a bit more robust, in case target of host_injections directory is a non-existing directory --- .../nvidia/link_nvidia_host_libraries.sh | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index e6ff110797..e8d7f0d0a7 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -48,8 +48,10 @@ export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" if $nvidia_smi_command > /dev/null; then host_driver_version=$($nvidia_smi_command | tail -n1) + echo_green "Found NVIDIA GPU driver version ${host_driver_version}" # If the first worked, this should work too host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') + echo_green "Found host CUDA version ${host_cuda_version}" else error="Failed to successfully execute\n $nvidia_smi_command\n" fatal_error "$error" @@ -58,12 +60,18 @@ fi # Let's make sure the driver libraries are not already in place link_drivers=1 +# first make sure that target of host_injections variant symlink is an existing directory +host_injections_target=$(realpath -m ${EESSI_CVMFS_REPO}/host_injections) +if [ ! -d ${host_injections_target} ]; then + create_directory_structure ${host_injections_target} +fi + host_injections_nvidia_dir="${EESSI_CVMFS_REPO}/host_injections/nvidia/${EESSI_CPU_FAMILY}" host_injection_driver_dir="${host_injections_nvidia_dir}/host" host_injection_driver_version_file="$host_injection_driver_dir/driver_version.txt" if [ -e "$host_injection_driver_version_file" ]; then if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then - echo_green "The host CUDA driver libraries have already been linked!" + echo_green "The host GPU driver libraries (v${host_driver_version}) have already been linked! 
(based on ${host_injection_driver_version_file})" link_drivers=0 else # There's something there but it is out of date @@ -91,8 +99,8 @@ if [ "$link_drivers" -eq 1 ]; then ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null # Leverage singularity to find the full list of libraries we should be linking to - echo_yellow "Downloading latest version of nvliblist.conf from Apptainer" - curl -o "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf + echo_yellow "Downloading latest version of nvliblist.conf from Apptainer to ${temp_dir}/nvliblist.conf" + curl --silent --output "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf # Make symlinks to all the interesting libraries grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} @@ -133,4 +141,4 @@ else ln -s $host_injections_nvidia_dir/latest lib fi -echo_green "Host NVIDIA gpu drivers linked successfully for EESSI" +echo_green "Host NVIDIA GPU drivers linked successfully for EESSI" From 5c248d1dcb844ec11a76f9f368ba523b7edf3e87 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 22 Dec 2023 11:39:31 +0100 Subject: [PATCH 30/31] Ensure that bot reports success if no EasyStacks were changed in a PR. Relevant for PRs that e.g. only update things in .../scripts --- EESSI-install-software.sh | 75 +++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index b61ca7a579..edbcf7040b 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -203,42 +203,47 @@ ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12 # ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh # use PR patch file to determine in which easystack files stuff was added -for easystack_file in $(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing'); do - - echo -e "Processing easystack file ${easystack_file}...\n\n" - - # determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file - eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g') - - # load EasyBuild module (will be installed if it's not available yet) - source ${TOPDIR}/load_easybuild_module.sh ${eb_version} - - ${EB} --show-config - - echo_green "All set, let's start installing some software with EasyBuild v${eb_version} in ${EASYBUILD_INSTALLPATH}..." - - if [ -f ${easystack_file} ]; then - echo_green "Feeding easystack file ${easystack_file} to EasyBuild..." - - ${EB} --easystack ${TOPDIR}/${easystack_file} --robot - ec=$? - - # copy EasyBuild log file if EasyBuild exited with an error - if [ ${ec} -ne 0 ]; then - eb_last_log=$(unset EB_VERBOSE; eb --last-log) - # copy to current working directory - cp -a ${eb_last_log} . 
- echo "Last EasyBuild log file copied from ${eb_last_log} to ${PWD}" - # copy to build logs dir (with context added) - copy_build_log "${eb_last_log}" "${build_logs_dir}" +changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing') +if [ -z ${changed_easystacks} ]; then + echo "No missing installations" # Ensure the bot report success, as there was nothing to be build here +else + for easystack_file in ${changed_easystacks}; do + + echo -e "Processing easystack file ${easystack_file}...\n\n" + + # determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file + eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g') + + # load EasyBuild module (will be installed if it's not available yet) + source ${TOPDIR}/load_easybuild_module.sh ${eb_version} + + ${EB} --show-config + + echo_green "All set, let's start installing some software with EasyBuild v${eb_version} in ${EASYBUILD_INSTALLPATH}..." + + if [ -f ${easystack_file} ]; then + echo_green "Feeding easystack file ${easystack_file} to EasyBuild..." + + ${EB} --easystack ${TOPDIR}/${easystack_file} --robot + ec=$? + + # copy EasyBuild log file if EasyBuild exited with an error + if [ ${ec} -ne 0 ]; then + eb_last_log=$(unset EB_VERBOSE; eb --last-log) + # copy to current working directory + cp -a ${eb_last_log} . + echo "Last EasyBuild log file copied from ${eb_last_log} to ${PWD}" + # copy to build logs dir (with context added) + copy_build_log "${eb_last_log}" "${build_logs_dir}" + fi + + $TOPDIR/check_missing_installations.sh ${TOPDIR}/${easystack_file} + else + fatal_error "Easystack file ${easystack_file} not found!" fi - - $TOPDIR/check_missing_installations.sh ${TOPDIR}/${easystack_file} - else - fatal_error "Easystack file ${easystack_file} not found!" - fi - -done + + done +fi ### add packages here From ac53cf0ea0160d47e302393ef60d2829586708af Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 22 Dec 2023 14:54:46 +0100 Subject: [PATCH 31/31] Make the pedantic deploy step of the bot happy... --- EESSI-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index edbcf7040b..69de9d1997 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -205,7 +205,7 @@ ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12 # use PR patch file to determine in which easystack files stuff was added changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing') if [ -z ${changed_easystacks} ]; then - echo "No missing installations" # Ensure the bot report success, as there was nothing to be build here + echo "No missing installations, party time!" # Ensure the bot report success, as there was nothing to be build here else for easystack_file in ${changed_easystacks}; do