From 3aac7f958463279f7099d5ed95ce1bf4c2219e68 Mon Sep 17 00:00:00 2001 From: Richard Top Date: Thu, 6 Jun 2024 08:59:59 +0000 Subject: [PATCH 1/3] {2023.06}[foss/2023b] NLTK V3.8.1 --- .../pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml index 29bb31e9aa..03da218a78 100644 --- a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml +++ b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023b.yml @@ -25,3 +25,4 @@ easyconfigs: - GDB-13.2-GCCcore-13.2.0.eb - IPython-8.17.2-GCCcore-13.2.0.eb - Qt5-5.15.13-GCCcore-13.2.0.eb + - NLTK-3.8.1-foss-2023b.eb From 9339f897510177490c8a3da91f3719adc98f84bc Mon Sep 17 00:00:00 2001 From: Richard Top Date: Fri, 7 Jun 2024 06:38:56 +0000 Subject: [PATCH 2/3] use directory of called script as base dir for other scripts in bot/inspect.sh --- bot/inspect.sh | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/bot/inspect.sh b/bot/inspect.sh index 9d1fa87e1f..533968bffc 100755 --- a/bot/inspect.sh +++ b/bot/inspect.sh @@ -27,6 +27,10 @@ # stop as soon as something fails set -e +# script_dir is the directory that contains THIS (inspect.sh) script, usually +# stored in the directory '.../bot' +script_dir=$(dirname $(realpath $BASH_SOURCE)) + display_help() { echo "usage: $0 [OPTIONS]" echo " -h | --help - display this usage information" @@ -81,8 +85,8 @@ done set -- "${POSITIONAL_ARGS[@]}" # source utils.sh and cfg_files.sh -source scripts/utils.sh -source scripts/cfg_files.sh +source ${script_dir}/../scripts/utils.sh +source ${script_dir}/../scripts/cfg_files.sh if [[ -z ${resume_tgz} ]]; then echo_red "path to tarball for resuming build job is missing" @@ -255,10 +259,8 @@ CMDLINE_ARGS+=("--storage" "${JOB_STORAGE}") # make sure some environment settings are available inside the shell 
started via # startprefix -base_dir=$(dirname $(realpath $0)) -# base_dir of inspect.sh script is '.../bot', 'init' dir is at the same level # TODO better use script from tarball??? -source ${base_dir}/../init/eessi_defaults +source ${script_dir}/../init/eessi_defaults if [ -z $EESSI_VERSION ]; then echo "ERROR: \$EESSI_VERSION must be set!" >&2 @@ -432,14 +434,14 @@ echo "Executing command to start interactive session to inspect build job:" # These initializations are combined into a single script that is executed when # the shell in startprefix is started. We set the env variable BASH_ENV here. if [[ -z ${run_in_prefix} ]]; then - echo "./eessi_container.sh ${CMDLINE_ARGS[@]}" + echo "${script_dir}/../eessi_container.sh ${CMDLINE_ARGS[@]}" echo " -- ${EESSI_COMPAT_LAYER_DIR}/startprefix" - ./eessi_container.sh "${CMDLINE_ARGS[@]}" \ + ${script_dir}/../eessi_container.sh "${CMDLINE_ARGS[@]}" \ -- ${EESSI_COMPAT_LAYER_DIR}/startprefix else - echo "./eessi_container.sh ${CMDLINE_ARGS[@]}" + echo "${script_dir}/../eessi_container.sh ${CMDLINE_ARGS[@]}" echo " -- ${EESSI_COMPAT_LAYER_DIR}/startprefix <<< ${run_in_prefix}" - ./eessi_container.sh "${CMDLINE_ARGS[@]}" \ + ${script_dir}/../eessi_container.sh "${CMDLINE_ARGS[@]}" \ -- ${EESSI_COMPAT_LAYER_DIR}/startprefix <<< ${run_in_prefix} fi From fc7707e2e3b05f89247b3a7db41f93a85f84f28c Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 9 Jun 2024 11:59:08 +0200 Subject: [PATCH 3/3] {2023.06,2023a} dependencies for PyTorch-bundle v2.1.2 --- EESSI-install-software.sh | 16 +- bot/build.sh | 16 +- .../2023.06/eessi-2023.06-eb-4.9.1-2023a.yml | 20 +++ eb_hooks.py | 86 +++++++++++ eessi_container.sh | 5 + install_scripts.sh | 1 + run_in_compat_layer_env.sh | 6 +- scripts/extra/custom_ctypes-1.2.eb | 29 ++++ .../extra/eessi-2023.06-extra-packages.yml | 2 + scripts/extra/install_extra_packages.sh | 95 ++++++++++++ .../nvidia/copy_nvidia_host_libraries.sh | 145 ++++++++++++++++++ scripts/utils.sh | 5 + 12 files changed, 
412 insertions(+), 14 deletions(-) create mode 100644 scripts/extra/custom_ctypes-1.2.eb create mode 100644 scripts/extra/eessi-2023.06-extra-packages.yml create mode 100755 scripts/extra/install_extra_packages.sh create mode 100755 scripts/gpu_support/nvidia/copy_nvidia_host_libraries.sh diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index ca6fed71d5..201ae6f8cb 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -17,11 +17,6 @@ display_help() { echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)" } -# Function to check if a command exists -function command_exists() { - command -v "$1" >/dev/null 2>&1 -} - function copy_build_log() { # copy specified build log to specified directory, with some context added build_log=${1} @@ -159,8 +154,13 @@ fi # are: # - .lmod/lmodrc.lua # - .lmod/SitePackage.lua +# # We run scripts to create them if they don't exist or if the scripts have been # changed in the PR. +# +# (TODO do we need to change the path if we have sub-directories for +# accelerators? And would we need different scripts for creating lua files under +# different directories?) 
# Set base directory for software and for Lmod config files _eessi_software_path=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE} @@ -256,6 +256,12 @@ if command_exists "nvidia-smi"; then ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh fi +# Install extra software that is needed (e.g., for providing a custom ctypes +# library when needed) +cd ${TOPDIR}/scripts/extra +./install_extra_packages.sh --temp-dir /tmp/temp --easystack eessi-2023.06-extra-packages.yml +cd ${TOPDIR} + # use PR patch file to determine in which easystack files stuff was added changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing') if [ -z "${changed_easystacks}" ]; then diff --git a/bot/build.sh b/bot/build.sh index 6e835cb6aa..de73faef0b 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -266,16 +266,20 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR} BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support -BUILD_STEP_ARGS+=("--nvidia" "all") +if command_exists "nvidia-smi"; then + echo "Command 'nvidia-smi' found, using available GPU" + BUILD_STEP_ARGS+=("--nvidia" "all") +else + echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" + BUILD_STEP_ARGS+=("--nvidia" "install") +fi +# Retain location for host injections so we don't reinstall CUDA +# (Always need to run the driver installation as available driver may change) + if [[ ! 
-z ${SHARED_FS_PATH} ]]; then BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections") fi -# Don't run the Lmod GPU driver check when doing builds (may not have a GPU, and it's not relevant for vanilla builds anyway) -echo "EESSI_OVERRIDE_GPU_CHECK='${EESSI_OVERRIDE_GPU_CHECK}'" -export EESSI_OVERRIDE_GPU_CHECK=1 -echo "EESSI_OVERRIDE_GPU_CHECK='${EESSI_OVERRIDE_GPU_CHECK}'" - # create tmp file for output of build step build_outerr=$(mktemp build.outerr.XXXX) diff --git a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml index 4f31c4dd08..81f4afd70e 100644 --- a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml +++ b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml @@ -55,3 +55,23 @@ easyconfigs: - PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb: options: cuda-compute-capabilities: 6.0,6.1,7.0,7.5,8.0,8.6,8.9,9.0 + # PyTorch-bundle-CUDA's dependencies without CUDA + - librosa-0.10.1-foss-2023a.eb + - NLTK-3.8.1-foss-2023a.eb + - parameterized-0.9.0-GCCcore-12.3.0.eb + - Scalene-1.5.26-GCCcore-12.3.0.eb + - scikit-image-0.22.0-foss-2023a.eb + - SentencePiece-0.2.0-GCC-12.3.0.eb: + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19987 + options: + from-pr: 19987 + - libmad-0.15.1b-GCCcore-12.3.0.eb: + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19987 + options: + from-pr: 19987 + - SoX-14.4.2-GCCcore-12.3.0.eb: + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19987 + options: + from-pr: 19987 + - tensorboard-2.15.1-gfbf-2023a.eb + - tqdm-4.66.1-GCCcore-12.3.0.eb diff --git a/eb_hooks.py b/eb_hooks.py index e4a957acdc..849a135fa8 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -6,6 +6,7 @@ import easybuild.tools.environment as env from easybuild.easyblocks.generic.configuremake import obtain_config_guess from easybuild.framework.easyconfig.constants import EASYCONFIG_CONSTANTS +from 
easybuild.easyblocks.python import EXTS_FILTER_PYTHON_PACKAGES from easybuild.tools.build_log import EasyBuildError, print_msg from easybuild.tools.config import build_option, update_build_option from easybuild.tools.filetools import apply_regex_substitutions, copy_file, remove_file, symlink, which @@ -311,6 +312,31 @@ def parse_hook_qt5_check_qtwebengine_disable(ec, eprefix): raise EasyBuildError("Qt5-specific hook triggered for non-Qt5 easyconfig?!") + +def parse_hook_sentencepiece_disable_tcmalloc_aarch64(ec, eprefix): + """ + Disable using TC_Malloc on 'aarch64/generic' + """ + cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR') + if ec.name == 'SentencePiece' and ec.version in ['0.2.0']: + if cpu_target == CPU_TARGET_AARCH64_GENERIC: + print_msg("parse_hook for SentencePiece: OLD '%s'", ec['components']) + new_components = [] + for item in ec['components']: + if item[2]['easyblock'] == 'CMakeMake': + new_item = item[2] + new_item['configopts'] = '-DSPM_ENABLE_TCMALLOC=OFF' + new_components.append((item[0], item[1], new_item)) + else: + new_components.append(item) + ec['components'] = new_components + print_msg("parse_hook for SentencePiece: NEW '%s'", ec['components']) + else: + print_msg("parse_hook for SentencePiece on %s -> leaving configopts unchanged", cpu_target) + else: + raise EasyBuildError("SentencePiece-specific hook triggered for non-SentencePiece easyconfig?!") + + def parse_hook_ucx_eprefix(ec, eprefix): """Make UCX aware of compatibility layer via additional configuration options.""" if ec.name == 'UCX': @@ -349,6 +375,30 @@ def parse_hook_lammps_remove_deps_for_CI_aarch64(ec, *args, **kwargs): raise EasyBuildError("LAMMPS-specific hook triggered for non-LAMMPS easyconfig?!") +def parse_hook_librosa_custom_ctypes(ec, *args, **kwargs): + """ + Add exts_filter to soundfile extension in exts_list + """ + if ec.name == 'librosa' and ec.version in ('0.10.1',): + ec_dict = ec.asdict() + eessi_software_path = get_eessi_envvar('EESSI_SOFTWARE_PATH') 
+ custom_ctypes_path = os.path.join(eessi_software_path, "software", "custom_ctypes", "1.2") + ebpythonprefixes = "EBPYTHONPREFIXES=%s" % custom_ctypes_path + exts_list_new = [] + for item in ec_dict['exts_list']: + if item[0] == 'soundfile': + ext_dict = item[2] + ext_dict['exts_filter'] = (ebpythonprefixes + ' ' + EXTS_FILTER_PYTHON_PACKAGES[0], + EXTS_FILTER_PYTHON_PACKAGES[1]) + exts_list_new.append((item[0], item[1], ext_dict)) + else: + exts_list_new.append(item) + ec['exts_list'] = exts_list_new + print_msg("New exts_list: '%s'", ec['exts_list']) + else: + raise EasyBuildError("librosa/0.10.1-specific hook triggered for non-librosa/0.10.1 easyconfig?!") + + def pre_prepare_hook_highway_handle_test_compilation_issues(self, *args, **kwargs): """ Solve issues with compiling or running the tests on both @@ -852,6 +902,36 @@ def inject_gpu_property(ec): return ec + +def pre_module_hook(self, *args, **kwargs): + """Main pre-module-check hook: trigger custom functions based on software name.""" + if self.name in PRE_MODULE_HOOKS: + PRE_MODULE_HOOKS[self.name](self, *args, **kwargs) + + +def pre_module_hook_librosa_augment_modluafooter(self, *args, **kwargs): + """ + Add EBPYTHONPREFIXES to modluafooter + """ + if self.name == 'librosa' and self.version == '0.10.1': + eessi_software_path = get_eessi_envvar('EESSI_SOFTWARE_PATH') + custom_ctypes_path = os.path.join(eessi_software_path, "software", "custom_ctypes", "1.2") + key = 'modluafooter' + values = ['prepend_path("EBPYTHONPREFIXES","%s")' % (custom_ctypes_path)] + print_msg("Adding '%s' to modluafooter", values[0]) + if not key in self.cfg: + self.cfg[key] = '\n'.join(values) + else: + new_value = self.cfg[key] + for value in values: + if not value in new_value: + new_value = '\n'.join([new_value, value]) + self.cfg[key] = new_value + print_msg("Full modluafooter is '%s'", self.cfg[key]) + else: + raise EasyBuildError("librosa/0.10.1-specific hook triggered for non-librosa/0.10.1 easyconfig?!") + + PARSE_HOOKS 
= { 'casacore': parse_hook_casacore_disable_vectorize, 'CGAL': parse_hook_cgal_toolchainopts_precise, @@ -859,10 +939,12 @@ def inject_gpu_property(ec): 'GPAW': parse_hook_gpaw_harcoded_path, 'ImageMagick': parse_hook_imagemagick_add_dependency, 'LAMMPS': parse_hook_lammps_remove_deps_for_CI_aarch64, + 'librosa': parse_hook_librosa_custom_ctypes, 'OpenBLAS': parse_hook_openblas_relax_lapack_tests_num_errors, 'Pillow-SIMD' : parse_hook_Pillow_SIMD_harcoded_paths, 'pybind11': parse_hook_pybind11_replace_catch2, 'Qt5': parse_hook_qt5_check_qtwebengine_disable, + 'SentencePiece': parse_hook_sentencepiece_disable_tcmalloc_aarch64, 'UCX': parse_hook_ucx_eprefix, } @@ -909,3 +991,7 @@ def inject_gpu_property(ec): 'cuDNN': post_sanitycheck_cudnn, 'cuTENSOR': post_sanitycheck_cutensor, } + +PRE_MODULE_HOOKS = { + 'librosa': pre_module_hook_librosa_augment_modluafooter, +} diff --git a/eessi_container.sh b/eessi_container.sh index a9405b6d8e..e6bb13cbe7 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -477,6 +477,11 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then mkdir -p ${EESSI_USR_LOCAL_CUDA} BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda" [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}" + if [[ "${NVIDIA_MODE}" == "install" ]] ; then + # No GPU so we need to "trick" Lmod to allow us to load CUDA modules even without a CUDA driver + # (this variable means EESSI_OVERRIDE_GPU_CHECK=1 will be set inside the container) + export SINGULARITYENV_EESSI_OVERRIDE_GPU_CHECK=1 + fi fi fi diff --git a/install_scripts.sh b/install_scripts.sh index 07643a39e6..d2e45466e3 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -113,6 +113,7 @@ nvidia_files=( eessi-2023.06-cuda-and-libraries.yml install_cuda_and_libraries.sh link_nvidia_host_libraries.sh + copy_nvidia_host_libraries.sh ) copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}" diff --git 
a/run_in_compat_layer_env.sh b/run_in_compat_layer_env.sh index 393956a0c1..cc2cdae034 100755 --- a/run_in_compat_layer_env.sh +++ b/run_in_compat_layer_env.sh @@ -26,12 +26,12 @@ fi if [ ! -z ${EESSI_VERSION_OVERRIDE} ]; then INPUT="export EESSI_VERSION_OVERRIDE=${EESSI_VERSION_OVERRIDE}; ${INPUT}" fi -if [ ! -z ${http_proxy} ]; then - INPUT="export http_proxy=${http_proxy}; ${INPUT}" -fi if [ ! -z ${EESSI_OVERRIDE_GPU_CHECK} ]; then INPUT="export EESSI_OVERRIDE_GPU_CHECK=${EESSI_OVERRIDE_GPU_CHECK}; ${INPUT}" fi +if [ ! -z ${http_proxy} ]; then + INPUT="export http_proxy=${http_proxy}; ${INPUT}" +fi if [ ! -z ${https_proxy} ]; then INPUT="export https_proxy=${https_proxy}; ${INPUT}" fi diff --git a/scripts/extra/custom_ctypes-1.2.eb b/scripts/extra/custom_ctypes-1.2.eb new file mode 100644 index 0000000000..35be6dcc41 --- /dev/null +++ b/scripts/extra/custom_ctypes-1.2.eb @@ -0,0 +1,29 @@ +## +# This is a contribution from the NESSI project +# Homepage: https://documentation.sigma2.no +# +# Authors:: Thomas Roeblitz +# License:: GPL-2.0-only +# +## + +easyblock = 'Tarball' + +name = 'custom_ctypes' +version = '1.2' + +homepage = 'https://github.com/ComputeCanada/custom_ctypes' +description = """custom_ctypes is a small Python package to fix the discovery of libraries with Python's ctypes module. 
It changes the behavior of find_library to return absolute paths to shared objects rather than just the names.""" + +toolchain = SYSTEM + +source_urls = ['https://github.com/ComputeCanada/custom_ctypes/archive/refs/tags'] +sources = ['%(version)s.tar.gz'] +checksums = ['3b30ce633c6a329169f2b10ff24b8eaaeef3fa208a66cdacdb53c22f02a88d9b'] + +sanity_check_paths = { + 'files': ['README.md'], + 'dirs': ['lib'], +} + +moduleclass = 'lib' diff --git a/scripts/extra/eessi-2023.06-extra-packages.yml b/scripts/extra/eessi-2023.06-extra-packages.yml new file mode 100644 index 0000000000..22670ec7a3 --- /dev/null +++ b/scripts/extra/eessi-2023.06-extra-packages.yml @@ -0,0 +1,2 @@ +easyconfigs: + - custom_ctypes-1.2.eb diff --git a/scripts/extra/install_extra_packages.sh b/scripts/extra/install_extra_packages.sh new file mode 100755 index 0000000000..ccd2890864 --- /dev/null +++ b/scripts/extra/install_extra_packages.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash + +# This script can be used to install extra packages under ${EESSI_SOFTWARE_PATH} + +# some logging +echo ">>> Running ${BASH_SOURCE}" + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath ${BASH_SOURCE})) +source "${TOPDIR}"/../utils.sh + +# Function to display help message +show_help() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --help Display this help message" + echo " -e, --easystack EASYSTACKFILE Easystack file which specifies easyconfigs to be installed." 
+ echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" + echo " storage during the installation" +} + +# Initialize variables +TEMP_DIR= +EASYSTACK_FILE= + +# Parse command-line options +while [[ $# -gt 0 ]]; do + case "$1" in + --help) + show_help + exit 0 + ;; + -e|--easystack) + if [ -n "$2" ]; then + EASYSTACK_FILE="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + -t|--temp-dir) + if [ -n "$2" ]; then + TEMP_DIR="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + *) + show_help + fatal_error "Error: Unknown option: $1" + ;; + esac +done + +if [[ -z ${EASYSTACK_FILE} ]]; then + show_help + fatal_error "Error: need to specify easystack file" +fi + +# Make sure NESSI is initialised +check_eessi_initialised + +# As an installation location just use $EESSI_SOFTWARE_PATH +export NESSI_CVMFS_INSTALL=${EESSI_SOFTWARE_PATH} + +# we need a directory we can use for temporary storage +if [[ -z "${TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) +else + mkdir -p ${TEMP_DIR} + tmpdir=$(mktemp -d --tmpdir=${TEMP_DIR} extra.XXX) + if [[ ! 
-d "$tmpdir" ]] ; then + fatal_error "Could not create directory ${tmpdir}" + fi +fi +echo "Created temporary directory '${tmpdir}'" +export WORKING_DIR=${tmpdir} + +# load EasyBuild +ml EasyBuild + +# load NESSI-extend/2023.06-easybuild +ml NESSI-extend/2023.06-easybuild + +eb --show-config + +eb --easystack ${EASYSTACK_FILE} --robot + +# clean up tmpdir +rm -rf "${tmpdir}" diff --git a/scripts/gpu_support/nvidia/copy_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/copy_nvidia_host_libraries.sh new file mode 100755 index 0000000000..ebc428a50d --- /dev/null +++ b/scripts/gpu_support/nvidia/copy_nvidia_host_libraries.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# This script copies host libraries related to GPU drivers to a location where +# they can be found by the EESSI linker + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../utils.sh + +# We rely on ldconfig to give us the location of the libraries on the host +command_name="ldconfig" +# We cannot use a version of ldconfig that's being shipped under CVMFS +exclude_prefix="/cvmfs" + +found_paths=() +# Always attempt to use /sbin/ldconfig +if [ -x "/sbin/$command_name" ]; then + found_paths+=("/sbin/$command_name") +fi +IFS=':' read -ra path_dirs <<< "$PATH" +for dir in "${path_dirs[@]}"; do + if [ "$dir" = "/sbin" ]; then + continue # we've already checked for $command_name in /sbin, don't need to do it twice + fi + if [[ ! "$dir" =~ ^$exclude_prefix ]]; then + if [ -x "$dir/$command_name" ]; then + found_paths+=("$dir/$command_name") + fi + fi +done + +if [ ${#found_paths[@]} -gt 0 ]; then + echo "Found $command_name in the following locations:" + printf -- "- %s\n" "${found_paths[@]}" + echo "Using first version" + host_ldconfig=${found_paths[0]} +else + error="$command_name not found in PATH or only found in paths starting with $exclude_prefix." 
+ fatal_error "$error" +fi + +# Make sure EESSI is initialised (doesn't matter what version) +check_eessi_initialised + +# Find the CUDA version of the host CUDA drivers +# (making sure that this can still work inside prefix environment inside a container) +export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH +nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" +if $nvidia_smi_command > /dev/null; then + host_driver_version=$($nvidia_smi_command | tail -n1) + echo_green "Found NVIDIA GPU driver version ${host_driver_version}" + # If the first worked, this should work too + host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') + echo_green "Found host CUDA version ${host_cuda_version}" +else + error="Failed to successfully execute\n $nvidia_smi_command\n" + fatal_error "$error" +fi + +# Let's make sure the driver libraries are not already in place +link_drivers=1 + +# first make sure that target of host_injections variant symlink is an existing directory +host_injections_target=$(realpath -m ${EESSI_CVMFS_REPO}/host_injections) +if [ ! -d ${host_injections_target} ]; then + create_directory_structure ${host_injections_target} +fi + +host_injections_nvidia_dir="${EESSI_CVMFS_REPO}/host_injections/nvidia/${EESSI_CPU_FAMILY}" +host_injection_driver_dir="${host_injections_nvidia_dir}/host" +host_injection_driver_version_file="$host_injection_driver_dir/driver_version.txt" +if [ -e "$host_injection_driver_version_file" ]; then + if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then + echo_green "The host GPU driver libraries (v${host_driver_version}) have already been linked! (based on ${host_injection_driver_version_file})" + link_drivers=0 + else + # There's something there but it is out of date + echo_yellow "Cleaning out outdated symlinks" + rm $host_injection_driver_dir/* + if [ $? -ne 0 ]; then + error="Unable to remove files under '$host_injection_driver_dir'." 
+ fatal_error "$error" + fi + fi +fi + +drivers_linked=0 +if [ "$link_drivers" -eq 1 ]; then + if ! create_directory_structure "${host_injection_driver_dir}" ; then + fatal_error "No write permissions to directory ${host_injection_driver_dir}" + fi + cd ${host_injection_driver_dir} + # Need a small temporary space to hold a couple of files + temp_dir=$(mktemp -d) + echo "temp_dir: '${temp_dir}'" + + # Gather libraries on the host (_must_ be host ldconfig) + $host_ldconfig -p | awk '{print $NF}' > "$temp_dir"/libs.txt + # Allow for the fact that we may be in a container so the CUDA libs might be in there + ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null + + # Leverage singularity to find the full list of libraries we should be linking to + echo_yellow "Downloading latest version of nvliblist.conf from Apptainer to ${temp_dir}/nvliblist.conf" + curl --silent --output "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf + + # Copy all the interesting libraries + grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i cp -a {} ${host_injection_driver_dir}/. 
+ + # Inject driver and CUDA versions into dir + echo $host_driver_version > driver_version.txt + echo $host_cuda_version > cuda_version.txt + drivers_linked=1 + + # Remove the temporary directory when done + rm -r "$temp_dir" +fi + +# Make latest symlink for NVIDIA drivers +cd $host_injections_nvidia_dir +symlink="latest" +if [ -L "$symlink" ]; then + # Unless the drivers have been installed, leave the symlink alone + if [ "$drivers_linked" -eq 1 ]; then + ln -sf host latest + fi +else + # No link exists yet + ln -s host latest +fi + +# Make sure the libraries can be found by the EESSI linker +host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} +if [ -L "$host_injection_linker_dir/lib" ]; then + target_path=$(readlink -f "$host_injection_linker_dir/lib") + if [ "$target_path" != "$host_injections_nvidia_dir/latest" ]; then + cd $host_injection_linker_dir + ln -sf $host_injections_nvidia_dir/latest lib + fi +else + create_directory_structure $host_injection_linker_dir + cd $host_injection_linker_dir + ln -s $host_injections_nvidia_dir/latest lib +fi + +echo_green "Host NVIDIA GPU drivers linked successfully for EESSI" diff --git a/scripts/utils.sh b/scripts/utils.sh index b2be3f6221..fec6368eb0 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -78,6 +78,11 @@ function create_directory_structure() { return $return_code } +# function to check if a command exists +function command_exists() { + command -v "$1" >/dev/null 2>&1 +} + function get_path_for_tool { tool_name=$1 tool_envvar_name=$2