diff --git a/EESSI-extend-2023.06-easybuild.eb b/EESSI-extend-2023.06-easybuild.eb index 8e328c3ece..bfe7931c8f 100644 --- a/EESSI-extend-2023.06-easybuild.eb +++ b/EESSI-extend-2023.06-easybuild.eb @@ -87,6 +87,16 @@ if (os.getenv("EESSI_CVMFS_INSTALL") ~= nil) then end eessi_cvmfs_install = true easybuild_installpath = os.getenv("EESSI_SOFTWARE_PATH") + eessi_accelerator_target = os.getenv("EESSI_ACCELERATOR_TARGET") + if (eessi_accelerator_target ~= nil) then + cuda_compute_capability = string.match(eessi_accelerator_target, "^nvidia/cc([0-9][0-9]+)$") + if (cuda_compute_capability ~= nil) then + easybuild_installpath = pathJoin(easybuild_installpath, 'accel', eessi_accelerator_target) + easybuild_cuda_compute_capabilities = cuda_compute_capability:sub(1, -2) .. "." .. cuda_compute_capability:sub(-1) + else + LmodError("Incorrect value for $EESSI_ACCELERATOR_TARGET: " .. eessi_accelerator_target) + end + end elseif (os.getenv("EESSI_SITE_INSTALL") ~= nil) then -- Make sure no other EESSI install environment variables are set if ((os.getenv("EESSI_PROJECT_INSTALL") ~= nil) or (os.getenv("EESSI_USER_INSTALL") ~= nil)) then @@ -146,6 +156,11 @@ setenv ("EASYBUILD_UMASK", "022") -- Allow this module to be loaded when running EasyBuild setenv ("EASYBUILD_ALLOW_LOADED_MODULES", "EasyBuild,EESSI-extend") +-- Set environment variables if building for CUDA compute capabilities +if (easybuild_cuda_compute_capabilities ~= nil) then + setenv ("EASYBUILD_CUDA_COMPUTE_CAPABILITIES", easybuild_cuda_compute_capabilities) +end + -- Set all related environment variables if we have project or user installations (including extending MODULEPATH) if (user_modulepath ~= nil) then -- Use a more restrictive umask for this case diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 65c43d5ac5..83c06c2184 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -229,22 +229,34 @@ if [[ "${EESSI_CVMFS_REPO}" != /cvmfs/dev.eessi.io ]]; then 
${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} fi +echo ">> Configuring EasyBuild..." + +# Make sure EESSI-extend is not loaded, and configure location variables for a +# CVMFS installation +module unload EESSI-extend +unset EESSI_USER_INSTALL +unset EESSI_PROJECT_INSTALL +unset EESSI_SITE_INSTALL +export EESSI_CVMFS_INSTALL=1 + +# We now run 'source load_eessi_extend_module.sh' to load or install and load the +# EESSI-extend module which sets up all build environment settings. +# The script requires the EESSI_VERSION given as argument, a couple of +# environment variables set (TMPDIR, EB and EASYBUILD_INSTALLPATH) and the +# function check_exit_code defined. +# NOTE 1, the script exits if those variables/functions are undefined. +# NOTE 2, loading the EESSI-extend module may adjust the value of EASYBUILD_INSTALLPATH, +# e.g., to point to the installation directory for accelerators. +# NOTE 3, we have to set a default for EASYBUILD_INSTALLPATH here in cases the +# EESSI-extend module itself needs to be installed. +export EASYBUILD_INSTALLPATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE} +source load_eessi_extend_module.sh ${EESSI_VERSION} + # Install full CUDA SDK and cu* libraries in host_injections # Hardcode this for now, see if it works # TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install # Allow skipping CUDA SDK install in e.g. CI environments -# The install_cuda... script uses EasyBuild. So, we need to check if we have EB -# or skip this step. echo "Going to install full CUDA SDK and cu* libraries under host_injections if necessary" -module_avail_out=$TMPDIR/ml.out -module avail 2>&1 | grep EasyBuild &> ${module_avail_out} -if [[ $? 
-eq 0 ]]; then - echo_green ">> Found an EasyBuild module" -else - echo_yellow ">> No EasyBuild module found: skipping step to install CUDA (see output in ${module_avail_out})" - export skip_cuda_install=True -fi - temp_install_storage=${TMPDIR}/temp_install_storage mkdir -p ${temp_install_storage} if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then @@ -253,7 +265,7 @@ if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then --accept-cuda-eula \ --accept-cudnn-eula else - echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" + echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed" fi # Install NVIDIA drivers in host_injections (if they exist) @@ -263,18 +275,6 @@ if command_exists "nvidia-smi"; then fi -echo ">> Configuring EasyBuild..." - -# Make sure that we use the EESSI_CVMFS_INSTALL -# Since the path is set when loading EESSI-extend, we reload it to make sure it works - even if it is already loaded -# Note we need to do this after running install_cuda_and_libraries, since that does installations in the EESSI_SITE_INSTALL -unset EESSI_USER_INSTALL -unset EESSI_PROJECT_INSTALL -unset EESSI_SITE_INSTALL -export EESSI_CVMFS_INSTALL=1 -module unload EESSI-extend -module load EESSI-extend/${EESSI_VERSION}-easybuild - if [ ! -z "${shared_fs_path}" ]; then shared_eb_sourcepath=${shared_fs_path}/easybuild/sources echo ">> Using ${shared_eb_sourcepath} as shared EasyBuild source path" diff --git a/EESSI-remove-software.sh b/EESSI-remove-software.sh index 98576efcb0..1a03a7af98 100755 --- a/EESSI-remove-software.sh +++ b/EESSI-remove-software.sh @@ -129,6 +129,9 @@ if [ $EUID -eq 0 ]; then echo_yellow "Removing ${app_dir} and ${app_module}..." 
rm -rf ${app_dir} rm -rf ${app_module} + # recreate some directory to work around permission denied + # issues when rebuilding the package + mkdir -p ${app_dir}/easybuild done else fatal_error "Easystack file ${easystack_file} not found!" diff --git a/bot/build.sh b/bot/build.sh index 3fd343e96f..81b3ef4660 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -208,6 +208,7 @@ else declare -a REMOVAL_STEP_ARGS=() REMOVAL_STEP_ARGS+=("--save" "${TARBALL_TMP_REMOVAL_STEP_DIR}") REMOVAL_STEP_ARGS+=("--storage" "${STORAGE}") + # add fakeroot option in order to be able to remove software, see: # https://github.com/EESSI/software-layer/issues/312 REMOVAL_STEP_ARGS+=("--fakeroot") diff --git a/bot/check-build.sh b/bot/check-build.sh index f185b18dda..41aeab258e 100755 --- a/bot/check-build.sh +++ b/bot/check-build.sh @@ -17,6 +17,7 @@ # - SUCCESS (all of) # - working directory contains slurm-JOBID.out file # - working directory contains eessi*tar.gz +# - no message FATAL # - no message ERROR # - no message FAILED # - no message ' required modules missing:' @@ -25,6 +26,7 @@ # - FAILED (one of ... implemented as NOT SUCCESS) # - no slurm-JOBID.out file # - no tarball +# - message with FATAL # - message with ERROR # - message with FAILED # - message with ' required modules missing:' @@ -105,6 +107,16 @@ else [[ ${VERBOSE} -ne 0 ]] && echo " Slurm output file '"${job_out}"' NOT found" fi +FATAL=-1 +if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then + GP_fatal='FATAL: ' + grep_out=$(grep -v "^>> searching for " ${job_dir}/${job_out} | grep "${GP_fatal}") + [[ $? 
-eq 0 ]] && FATAL=1 || FATAL=0 + # have to be careful to not add searched for pattern into slurm out file + [[ ${VERBOSE} -ne 0 ]] && echo ">> searching for '"${GP_fatal}"'" + [[ ${VERBOSE} -ne 0 ]] && echo "${grep_out}" +fi + ERROR=-1 if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then GP_error='ERROR: ' @@ -163,6 +175,7 @@ fi [[ ${VERBOSE} -ne 0 ]] && echo "SUMMARY: ${job_dir}/${job_out}" [[ ${VERBOSE} -ne 0 ]] && echo " : ()" +[[ ${VERBOSE} -ne 0 ]] && echo " FATAL......: $([[ $FATAL -eq 1 ]] && echo 'yes' || echo 'no') (no)" [[ ${VERBOSE} -ne 0 ]] && echo " ERROR......: $([[ $ERROR -eq 1 ]] && echo 'yes' || echo 'no') (no)" [[ ${VERBOSE} -ne 0 ]] && echo " FAILED.....: $([[ $FAILED -eq 1 ]] && echo 'yes' || echo 'no') (no)" [[ ${VERBOSE} -ne 0 ]] && echo " REQ_MISSING: $([[ $MISSING -eq 1 ]] && echo 'yes' || echo 'no') (no)" @@ -190,6 +203,7 @@ job_result_file=_bot_job${SLURM_JOB_ID}.result # Default reason: if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]] && \ + [[ ${FATAL} -eq 0 ]] && \ [[ ${ERROR} -eq 0 ]] && \ [[ ${FAILED} -eq 0 ]] && \ [[ ${MISSING} -eq 0 ]] && \ @@ -223,6 +237,7 @@ fi #
_Details_
#
# :white_check_mark: job output file slurm-4682.out
+# :white_check_mark: no message matching FATAL:
# :white_check_mark: no message matching ERROR:
# :white_check_mark: no message matching FAILED:
# :white_check_mark: no message matching required modules missing:
@@ -264,6 +279,7 @@ fi #
_Details_
#
# :white_check_mark: job output file slurm-4682.out
+# :x: no message matching FATAL:
# :x: no message matching ERROR:
# :white_check_mark: no message matching FAILED:
# :x: no message matching required modules missing:
@@ -381,6 +397,10 @@ success_msg="job output file ${job_out}" failure_msg="no job output file ${job_out}" comment_details_list=${comment_details_list}$(add_detail ${SLURM_OUTPUT_FOUND} 1 "${success_msg}" "${failure_msg}") +success_msg="no message matching ${GP_fatal}" +failure_msg="found message matching ${GP_fatal}" +comment_details_list=${comment_details_list}$(add_detail ${FATAL} 0 "${success_msg}" "${failure_msg}") + success_msg="no message matching ${GP_error}" failure_msg="found message matching ${GP_error}" comment_details_list=${comment_details_list}$(add_detail ${ERROR} 0 "${success_msg}" "${failure_msg}") diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml index 03c9ec8f98..d9b7dca5d5 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml @@ -12,3 +12,4 @@ easyconfigs: options: # see https://github.com/easybuilders/easybuild-easyconfigs/pull/21783 from-commit: 5fa3db9eb36f91cba3fbf351549f8ba2849abc33 + - GDRCopy-2.4-GCCcore-13.2.0.eb diff --git a/easystacks/software.eessi.io/2023.06/rebuilds/20241112-eb-4.9.4-EESSI-extend.yml b/easystacks/software.eessi.io/2023.06/rebuilds/20241112-eb-4.9.4-EESSI-extend.yml new file mode 100644 index 0000000000..e4c658784f --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/rebuilds/20241112-eb-4.9.4-EESSI-extend.yml @@ -0,0 +1,6 @@ +# 2024.11.12 +# for installations under /cvmfs, if EESSI_ACCELERATOR_TARGET is set, +# EESSI-extend should adjust EASYBUILD_INSTALLPATH and set +# EASYBUILD_CUDA_COMPUTE_CAPABILITIES +easyconfigs: + - EESSI-extend-2023.06-easybuild.eb diff --git a/load_eessi_extend_module.sh b/load_eessi_extend_module.sh new file mode 100755 index 0000000000..62b6e3f3ae --- /dev/null +++ b/load_eessi_extend_module.sh @@ -0,0 +1,116 @@ +# Script to load the environment module for EESSI-extend. 
+# If that module is not available yet, a specific version will be installed using the latest EasyBuild. +# +# This script must be sourced, since it makes changes in the current environment, like loading an EESSI-extend module. +# +# Assumptions (if one is not satisfied the script prints a message and exits) +# - EESSI version is given as first argument +# - TMPDIR is set +# - EB is set +# - EASYBUILD_INSTALLPATH needs to be set +# - Function check_exit_code is defined; +# scripts/utils.sh in EESSI/software-layer repository defines this function, hence +# scripts/utils.sh shall be sourced before this script is run +# +# This script is part of the EESSI software layer, see +# https://github.com/EESSI/software-layer.git +# +# author: Kenneth Hoste (@boegel, HPC-UGent) +# author: Alan O'Cais (@ocaisa, CECAM) +# author: Thomas Roeblitz (@trz42, University of Bergen) +# +# license: GPLv2 +# +# +set -o pipefail + +# this script is *sourced*, not executed, so can't rely on $0 to determine path to self or script name +# $BASH_SOURCE points to correct path or script name, see also http://mywiki.wooledge.org/BashFAQ/028 +if [ $# -ne 1 ]; then + echo "Usage: source ${BASH_SOURCE} " >&2 + exit 1 +fi + +EESSI_EXTEND_VERSION="${1}-easybuild" + +# make sure that environment variables that we expect to be set are indeed set +if [ -z "${TMPDIR}" ]; then + echo "\$TMPDIR is not set; exiting" >&2 + exit 2 +fi + +# ${EB} is used to specify which 'eb' command should be used; +# can potentially be more than just 'eb', for example when using 'eb --optarch=GENERIC' +if [ -z "${EB}" ]; then + echo "\$EB is not set; exiting" >&2 + exit 2 +fi + +# ${EASYBUILD_INSTALLPATH} points to the installation path and needs to be set +if [ -z "${EASYBUILD_INSTALLPATH}" ]; then + echo "\$EASYBUILD_INSTALLPATH is not set; exiting" >&2 + exit 2 +fi + +# make sure that utility functions are defined (cfr. scripts/utils.sh script in EESSI/software-layer repo) +type check_exit_code +if [ $? 
-ne 0 ]; then + echo "check_exit_code function is not defined; exiting" >&2 + exit 3 +fi + +echo ">> Checking for EESSI-extend module..." + +ml_av_eessi_extend_out=${TMPDIR}/ml_av_eessi_extend.out +# need to use --ignore_cache to avoid the case that the module was removed (to be +# rebuilt) but it is still in the cache +module --ignore_cache avail 2>&1 | grep -i EESSI-extend/${EESSI_EXTEND_VERSION} &> ${ml_av_eessi_extend_out} + +if [[ $? -eq 0 ]]; then + echo_green ">> Module for EESSI-extend/${EESSI_EXTEND_VERSION} found!" +else + echo_yellow ">> No module yet for EESSI-extend/${EESSI_EXTEND_VERSION}, installing it..." + + EB_TMPDIR=${TMPDIR}/ebtmp + echo ">> Using temporary installation of EasyBuild (in ${EB_TMPDIR})..." + pip_install_out=${TMPDIR}/pip_install.out + pip3 install --prefix ${EB_TMPDIR} easybuild &> ${pip_install_out} + + # keep track of original $PATH and $PYTHONPATH values, so we can restore them + ORIG_PATH=${PATH} + ORIG_PYTHONPATH=${PYTHONPATH} + + # source configure_easybuild to use correct eb settings + ( + export EASYBUILD_PREFIX=${TMPDIR}/easybuild + export EASYBUILD_READ_ONLY_INSTALLDIR=1 + + echo ">> Final installation in ${EASYBUILD_INSTALLPATH}..." + export PATH=${EB_TMPDIR}/bin:${PATH} + export PYTHONPATH=$(ls -d ${EB_TMPDIR}/lib/python*/site-packages):${PYTHONPATH} + eb_install_out=${TMPDIR}/eb_install.out + ok_msg="EESSI-extend/${EESSI_EXTEND_VERSION} installed, let's go!" + fail_msg="Installing EESSI-extend/${EESSI_EXTEND_VERSION} failed, that's not good... (output: ${eb_install_out})" + # while always adding --try-amend=keep... may do no harm, we could make + # an attempt to figure out if it is needed, e.g., when we are rebuilding + ${EB} "EESSI-extend-${EESSI_EXTEND_VERSION}.eb" --try-amend=keeppreviousinstall=True 2>&1 | tee ${eb_install_out} + check_exit_code $? 
"${ok_msg}" "${fail_msg}" + ) + + # restore origin $PATH and $PYTHONPATH values, and clean up environment variables that are no longer needed + export PATH=${ORIG_PATH} + export PYTHONPATH=${ORIG_PYTHONPATH} + unset EB_TMPDIR ORIG_PATH ORIG_PYTHONPATH + + module --ignore_cache avail EESSI-extend/${EESSI_EXTEND_VERSION} &> ${ml_av_eessi_extend_out} + if [[ $? -eq 0 ]]; then + echo_green ">> EESSI-extend/${EESSI_EXTEND_VERSION} module installed!" + else + fatal_error "EESSI-extend/${EESSI_EXTEND_VERSION} module failed to install?! (output of 'pip install' in ${pip_install_out}, output of 'eb' in ${eb_install_out}, output of 'module avail EESSI-extend' in ${ml_av_eessi_extend_out})" + fi +fi + +echo ">> Loading EESSI-extend/${EESSI_EXTEND_VERSION} module..." +module --ignore_cache load EESSI-extend/${EESSI_EXTEND_VERSION} + +unset EESSI_EXTEND_VERSION diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index ee219fb444..741ead0559 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -77,10 +77,6 @@ done # Make sure EESSI is initialised check_eessi_initialised -# Make sure that `EESSI-extend` will install in the site installation path EESSI_SITE_SOFTWARE_PATH -export EESSI_SITE_INSTALL=1 -echo "EESSI_SITE_SOFTWARE_PATH=${EESSI_SITE_SOFTWARE_PATH}" - # we need a directory we can use for temporary storage if [[ -z "${TEMP_DIR}" ]]; then tmpdir=$(mktemp -d) @@ -93,7 +89,7 @@ else fi echo "Created temporary directory '${tmpdir}'" -# use EESSI_SITE_SOFTWARE_PATH/.modules/all as MODULEPATH +# Store MODULEPATH so it can be restored at the end of each loop iteration SAVE_MODULEPATH=${MODULEPATH} for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do @@ -103,8 +99,16 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do eb_version=$(echo ${EASYSTACK_FILE} | sed 's/.*eb-\([0-9.]*\).*/\1/g') # Load 
EasyBuild version for this easystack file _before_ loading EESSI-extend - module avail EasyBuild + module_avail_out=${tmpdir}/ml.out + module avail 2>&1 | grep EasyBuild/${eb_version} &> ${module_avail_out} + if [[ $? -eq 0 ]]; then + echo_green ">> Found an EasyBuild/${eb_version} module" + else + echo_yellow ">> No EasyBuild/${eb_version} module found: skipping step to install easystack file ${EASYSTACK_FILE} (see output in ${module_avail_out})" + continue + fi module load EasyBuild/${eb_version} + # Make sure EESSI-extend does a site install here # We need to reload it with the current environment variables set unset EESSI_CVMFS_INSTALL @@ -112,7 +116,19 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do unset EESSI_USER_INSTALL export EESSI_SITE_INSTALL=1 module unload EESSI-extend - module load EESSI-extend/${EESSI_VERSION}-easybuild + ml_av_eessi_extend_out=${tmpdir}/ml_av_eessi_extend.out + # need to use --ignore_cache to avoid the case that the module was removed (to be + # rebuilt) but it is still in the cache and the rebuild failed + EESSI_EXTEND_VERSION=${EESSI_VERSION}-easybuild + module --ignore_cache avail 2>&1 | grep -i EESSI-extend/${EESSI_EXTEND_VERSION} &> ${ml_av_eessi_extend_out} + if [[ $? -eq 0 ]]; then + echo_green ">> Module for EESSI-extend/${EESSI_EXTEND_VERSION} found!" 
+ else + error="\nNo module for EESSI-extend/${EESSI_EXTEND_VERSION} found\nwhile EESSI has been initialised to use software under ${EESSI_SOFTWARE_PATH}\n" + fatal_error "${error}" + fi + module --ignore_cache load EESSI-extend/${EESSI_EXTEND_VERSION} + unset EESSI_EXTEND_VERSION # Install modules in hidden .modules dir to keep track of what was installed before # (this action is temporary, and we do not call Lmod again within the current shell context, but in EasyBuild @@ -245,3 +261,5 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do # Restore MODULEPATH for next loop iteration MODULEPATH=${SAVE_MODULEPATH} done +# Remove the temporary directory +rm -rf "${tmpdir}"