Skip to content

Commit

Permalink
Merge branch '2023.06-software.eessi.io' of github-trz:EESSI/software…
Browse files Browse the repository at this point in the history
…-layer into 2023.06-software.eessi.io-TensorFlow-2.15.1-2023a-CUDA-12.1.1-debug
  • Loading branch information
truib committed Nov 21, 2024
2 parents 4001d36 + db16c37 commit 4cfd111
Show file tree
Hide file tree
Showing 10 changed files with 218 additions and 34 deletions.
15 changes: 15 additions & 0 deletions EESSI-extend-2023.06-easybuild.eb
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,16 @@ if (os.getenv("EESSI_CVMFS_INSTALL") ~= nil) then
end
eessi_cvmfs_install = true
easybuild_installpath = os.getenv("EESSI_SOFTWARE_PATH")
-- Optional accelerator target, taken from $EESSI_ACCELERATOR_TARGET (nil if unset)
eessi_accelerator_target = os.getenv("EESSI_ACCELERATOR_TARGET")
if (eessi_accelerator_target ~= nil) then
-- Only values of the form "nvidia/cc" followed by exactly two digits are accepted;
-- the two digits are captured as the CUDA compute capability
cuda_compute_capability = string.match(eessi_accelerator_target, "^nvidia/cc([0-9][0-9])$")
if (cuda_compute_capability ~= nil) then
-- Extend the installation path with 'accel' and the accelerator target
easybuild_installpath = pathJoin(easybuild_installpath, 'accel', eessi_accelerator_target)
-- Put a dot between the two captured digits (e.g., "80" becomes "8.0");
-- the result is later exported as EASYBUILD_CUDA_COMPUTE_CAPABILITIES if it is not nil
easybuild_cuda_compute_capabilities = cuda_compute_capability:sub(1, 1) .. "." .. cuda_compute_capability:sub(2, 2)
else
-- Any other value of the variable is reported as an error, including the offending value
LmodError("Incorrect value for $EESSI_ACCELERATOR_TARGET: " .. eessi_accelerator_target)
end
end
elseif (os.getenv("EESSI_SITE_INSTALL") ~= nil) then
-- Make sure no other EESSI install environment variables are set
if ((os.getenv("EESSI_PROJECT_INSTALL") ~= nil) or (os.getenv("EESSI_USER_INSTALL") ~= nil)) then
Expand Down Expand Up @@ -146,6 +156,11 @@ setenv ("EASYBUILD_UMASK", "022")
-- Allow this module to be loaded when running EasyBuild
setenv ("EASYBUILD_ALLOW_LOADED_MODULES", "EasyBuild,EESSI-extend")
-- Set environment variables if building for CUDA compute capabilities
if (easybuild_cuda_compute_capabilities ~= nil) then
setenv ("EASYBUILD_CUDA_COMPUTE_CAPABILITIES", easybuild_cuda_compute_capabilities)
end
-- Set all related environment variables if we have project or user installations (including extending MODULEPATH)
if (user_modulepath ~= nil) then
-- Use a more restrictive umask for this case
Expand Down
48 changes: 24 additions & 24 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -229,22 +229,34 @@ if [[ "${EESSI_CVMFS_REPO}" != /cvmfs/dev.eessi.io ]]; then
${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}
fi

echo ">> Configuring EasyBuild..."

# Make sure EESSI-extend is not loaded, and configure location variables for a
# CVMFS installation
module unload EESSI-extend
unset EESSI_USER_INSTALL
unset EESSI_PROJECT_INSTALL
unset EESSI_SITE_INSTALL
export EESSI_CVMFS_INSTALL=1

# We now run 'source load_eessi_extend_module.sh' to load or install and load the
# EESSI-extend module which sets up all build environment settings.
# The script requires the EESSI_VERSION given as argument, a couple of
# environment variables set (TMPDIR, EB and EASYBUILD_INSTALLPATH) and the
# function check_exit_code defined.
# NOTE 1, the script exits if those variables/functions are undefined.
# NOTE 2, loading the EESSI-extend module may adjust the value of EASYBUILD_INSTALLPATH,
# e.g., to point to the installation directory for accelerators.
# NOTE 3, we have to set a default for EASYBUILD_INSTALLPATH here in case the
# EESSI-extend module itself needs to be installed.
export EASYBUILD_INSTALLPATH=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}
source load_eessi_extend_module.sh ${EESSI_VERSION}

# Install full CUDA SDK and cu* libraries in host_injections
# Hardcode this for now, see if it works
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
# Allow skipping CUDA SDK install in e.g. CI environments
# The install_cuda... script uses EasyBuild. So, we need to check if we have EB
# or skip this step.
echo "Going to install full CUDA SDK and cu* libraries under host_injections if necessary"
module_avail_out=$TMPDIR/ml.out
module avail 2>&1 | grep EasyBuild &> ${module_avail_out}
if [[ $? -eq 0 ]]; then
echo_green ">> Found an EasyBuild module"
else
echo_yellow ">> No EasyBuild module found: skipping step to install CUDA (see output in ${module_avail_out})"
export skip_cuda_install=True
fi

temp_install_storage=${TMPDIR}/temp_install_storage
mkdir -p ${temp_install_storage}
if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
Expand All @@ -253,7 +265,7 @@ if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
--accept-cuda-eula \
--accept-cudnn-eula
else
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed"
fi

# Install NVIDIA drivers in host_injections (if they exist)
Expand All @@ -263,18 +275,6 @@ if command_exists "nvidia-smi"; then
fi


echo ">> Configuring EasyBuild..."

# Make sure that we use the EESSI_CVMFS_INSTALL
# Since the path is set when loading EESSI-extend, we reload it to make sure it works - even if it is already loaded
# Note we need to do this after running install_cuda_and_libraries, since that does installations in the EESSI_SITE_INSTALL
unset EESSI_USER_INSTALL
unset EESSI_PROJECT_INSTALL
unset EESSI_SITE_INSTALL
export EESSI_CVMFS_INSTALL=1
module unload EESSI-extend
module load EESSI-extend/${EESSI_VERSION}-easybuild

if [ ! -z "${shared_fs_path}" ]; then
shared_eb_sourcepath=${shared_fs_path}/easybuild/sources
echo ">> Using ${shared_eb_sourcepath} as shared EasyBuild source path"
Expand Down
3 changes: 3 additions & 0 deletions EESSI-remove-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ if [ $EUID -eq 0 ]; then
echo_yellow "Removing ${app_dir} and ${app_module}..."
rm -rf ${app_dir}
rm -rf ${app_module}
# recreate some directory to work around permission denied
# issues when rebuilding the package
mkdir -p ${app_dir}/easybuild
done
else
fatal_error "Easystack file ${easystack_file} not found!"
Expand Down
1 change: 1 addition & 0 deletions bot/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ else
declare -a REMOVAL_STEP_ARGS=()
REMOVAL_STEP_ARGS+=("--save" "${TARBALL_TMP_REMOVAL_STEP_DIR}")
REMOVAL_STEP_ARGS+=("--storage" "${STORAGE}")

# add fakeroot option in order to be able to remove software, see:
# https://github.com/EESSI/software-layer/issues/312
REMOVAL_STEP_ARGS+=("--fakeroot")
Expand Down
20 changes: 20 additions & 0 deletions bot/check-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# - SUCCESS (all of)
# - working directory contains slurm-JOBID.out file
# - working directory contains eessi*tar.gz
# - no message FATAL
# - no message ERROR
# - no message FAILED
# - no message ' required modules missing:'
Expand All @@ -25,6 +26,7 @@
# - FAILED (one of ... implemented as NOT SUCCESS)
# - no slurm-JOBID.out file
# - no tarball
# - message with FATAL
# - message with ERROR
# - message with FAILED
# - message with ' required modules missing:'
Expand Down Expand Up @@ -105,6 +107,16 @@ else
[[ ${VERBOSE} -ne 0 ]] && echo " Slurm output file '"${job_out}"' NOT found"
fi

# Look for fatal messages in the Slurm output file.
# FATAL is -1 if the check was not run (no output file), 0 if no matching
# line was found and 1 if at least one matching line was found.
FATAL=-1
if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then
  GP_fatal='FATAL: '
  # lines starting with '>> searching for ' are dropped first so that a
  # pattern echoed in verbose mode (see below) is never counted as a hit
  if grep_out=$(grep -v "^>> searching for " ${job_dir}/${job_out} | grep "${GP_fatal}"); then
    FATAL=1
  else
    FATAL=0
  fi
  # in verbose mode, report the pattern searched for and the matching lines
  if [[ ${VERBOSE} -ne 0 ]]; then
    echo ">> searching for '"${GP_fatal}"'"
    echo "${grep_out}"
  fi
fi

ERROR=-1
if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then
GP_error='ERROR: '
Expand Down Expand Up @@ -163,6 +175,7 @@ fi

[[ ${VERBOSE} -ne 0 ]] && echo "SUMMARY: ${job_dir}/${job_out}"
[[ ${VERBOSE} -ne 0 ]] && echo " <test name>: <actual result> (<expected result>)"
[[ ${VERBOSE} -ne 0 ]] && echo " FATAL......: $([[ $FATAL -eq 1 ]] && echo 'yes' || echo 'no') (no)"
[[ ${VERBOSE} -ne 0 ]] && echo " ERROR......: $([[ $ERROR -eq 1 ]] && echo 'yes' || echo 'no') (no)"
[[ ${VERBOSE} -ne 0 ]] && echo " FAILED.....: $([[ $FAILED -eq 1 ]] && echo 'yes' || echo 'no') (no)"
[[ ${VERBOSE} -ne 0 ]] && echo " REQ_MISSING: $([[ $MISSING -eq 1 ]] && echo 'yes' || echo 'no') (no)"
Expand Down Expand Up @@ -190,6 +203,7 @@ job_result_file=_bot_job${SLURM_JOB_ID}.result

# Default reason:
if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]] && \
[[ ${FATAL} -eq 0 ]] && \
[[ ${ERROR} -eq 0 ]] && \
[[ ${FAILED} -eq 0 ]] && \
[[ ${MISSING} -eq 0 ]] && \
Expand Down Expand Up @@ -223,6 +237,7 @@ fi
# <dt>_Details_</dt>
# <dd>
# :white_check_mark: job output file <code>slurm-4682.out</code><br/>
# :white_check_mark: no message matching <code>FATAL: </code><br/>
# :white_check_mark: no message matching <code>ERROR: </code><br/>
# :white_check_mark: no message matching <code>FAILED: </code><br/>
# :white_check_mark: no message matching <code> required modules missing:</code><br/>
Expand Down Expand Up @@ -264,6 +279,7 @@ fi
# <dt>_Details_</dt>
# <dd>
# :white_check_mark: job output file <code>slurm-4682.out</code><br/>
# :x: no message matching <code>FATAL: </code><br/>
# :x: no message matching <code>ERROR: </code><br/>
# :white_check_mark: no message matching <code>FAILED: </code><br/>
# :x: no message matching <code> required modules missing:</code><br/>
Expand Down Expand Up @@ -381,6 +397,10 @@ success_msg="job output file <code>${job_out}</code>"
failure_msg="no job output file <code>${job_out}</code>"
comment_details_list=${comment_details_list}$(add_detail ${SLURM_OUTPUT_FOUND} 1 "${success_msg}" "${failure_msg}")

success_msg="no message matching <code>${GP_fatal}</code>"
failure_msg="found message matching <code>${GP_fatal}</code>"
comment_details_list=${comment_details_list}$(add_detail ${FATAL} 0 "${success_msg}" "${failure_msg}")

success_msg="no message matching <code>${GP_error}</code>"
failure_msg="found message matching <code>${GP_error}</code>"
comment_details_list=${comment_details_list}$(add_detail ${ERROR} 0 "${success_msg}" "${failure_msg}")
Expand Down
2 changes: 1 addition & 1 deletion create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
-- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/site_specific_config/gpu/.\\n"
if simpleName == 'CUDA' then
-- get the full host_injections path
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ easyconfigs:
# see https://github.com/easybuilders/easybuild-easyblocks/pull/3496
include-easyblocks-from-commit: 60633b0acfd41a0732992d9e16800dae71a056eb
- Cython-3.0.10-GCCcore-13.2.0.eb
- Mustache-1.3.3-foss-2023b.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/21783
from-commit: 5fa3db9eb36f91cba3fbf351549f8ba2849abc33
- GDRCopy-2.4-GCCcore-13.2.0.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# 2024.11.12
# for installations under /cvmfs, if EESSI_ACCELERATOR_TARGET is set,
# EESSI-extend should adjust EASYBUILD_INSTALLPATH and set
# EASYBUILD_CUDA_COMPUTE_CAPABILITIES
easyconfigs:
- EESSI-extend-2023.06-easybuild.eb
116 changes: 116 additions & 0 deletions load_eessi_extend_module.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Script to load the environment module for EESSI-extend.
# If that module is not available yet, a specific version will be installed using the latest EasyBuild.
#
# This script must be sourced, since it makes changes in the current environment, like loading an EESSI-extend module.
#
# Assumptions (if one is not satisfied the script prints a message and exits)
# - EESSI version is given as first argument
# - TMPDIR is set
# - EB is set
# - EASYBUILD_INSTALLPATH needs to be set
# - Function check_exit_code is defined;
# scripts/utils.sh in EESSI/software-layer repository defines this function, hence
# scripts/utils.sh shall be sourced before this script is run
#
# This script is part of the EESSI software layer, see
# https://github.com/EESSI/software-layer.git
#
# author: Kenneth Hoste (@boegel, HPC-UGent)
# author: Alan O'Cais (@ocaisa, CECAM)
# author: Thomas Roeblitz (@trz42, University of Bergen)
#
# license: GPLv2
#
#
# let a pipeline fail if any of its stages fails (needed for the 'eb ... | tee' call below)
set -o pipefail

# this script is *sourced*, not executed, so can't rely on $0 to determine path to self or script name
# $BASH_SOURCE points to correct path or script name, see also http://mywiki.wooledge.org/BashFAQ/028
# NOTE: since the script is sourced, every 'exit' below also ends the calling shell;
# this is the documented behaviour when an assumption is not satisfied
if [ $# -ne 1 ]; then
    echo "Usage: source ${BASH_SOURCE} <EESSI-extend version>" >&2
    exit 1
fi

# the module version is the given version with the suffix '-easybuild'
EESSI_EXTEND_VERSION="${1}-easybuild"

# make sure that environment variables that we expect to be set are indeed set
if [ -z "${TMPDIR}" ]; then
    echo "\$TMPDIR is not set; exiting" >&2
    exit 2
fi

# ${EB} is used to specify which 'eb' command should be used;
# can potentially be more than just 'eb', for example when using 'eb --optarch=GENERIC'
if [ -z "${EB}" ]; then
    echo "\$EB is not set; exiting" >&2
    exit 2
fi

# ${EASYBUILD_INSTALLPATH} points to the installation path and needs to be set
if [ -z "${EASYBUILD_INSTALLPATH}" ]; then
    echo "\$EASYBUILD_INSTALLPATH is not set; exiting" >&2
    exit 2
fi

# make sure that utility functions are defined (cfr. scripts/utils.sh script in EESSI/software-layer repo)
if ! type check_exit_code; then
    echo "check_exit_code function is not defined; exiting" >&2
    exit 3
fi

echo ">> Checking for EESSI-extend module..."

ml_av_eessi_extend_out="${TMPDIR}/ml_av_eessi_extend.out"
# need to use --ignore_cache to avoid the case that the module was removed (to be
# rebuilt) but it is still in the cache
if module --ignore_cache avail 2>&1 | grep -i "EESSI-extend/${EESSI_EXTEND_VERSION}" &> "${ml_av_eessi_extend_out}"; then
    echo_green ">> Module for EESSI-extend/${EESSI_EXTEND_VERSION} found!"
else
    echo_yellow ">> No module yet for EESSI-extend/${EESSI_EXTEND_VERSION}, installing it..."

    EB_TMPDIR="${TMPDIR}/ebtmp"
    echo ">> Using temporary installation of EasyBuild (in ${EB_TMPDIR})..."
    pip_install_out="${TMPDIR}/pip_install.out"
    pip3 install --prefix "${EB_TMPDIR}" easybuild &> "${pip_install_out}"

    # defined *outside* the subshell below: an assignment made inside '( ... )' is
    # lost when the subshell ends, but the error message further down needs this path
    eb_install_out="${TMPDIR}/eb_install.out"

    # keep track of original $PATH and $PYTHONPATH values, so we can restore them
    ORIG_PATH=${PATH}
    ORIG_PYTHONPATH=${PYTHONPATH}

    # run the installation in a subshell so that the EasyBuild settings exported
    # here do not end up in the environment of the calling shell
    (
        export EASYBUILD_PREFIX="${TMPDIR}/easybuild"
        export EASYBUILD_READ_ONLY_INSTALLDIR=1

        echo ">> Final installation in ${EASYBUILD_INSTALLPATH}..."
        export PATH="${EB_TMPDIR}/bin:${PATH}"
        export PYTHONPATH=$(ls -d "${EB_TMPDIR}"/lib/python*/site-packages):${PYTHONPATH}
        ok_msg="EESSI-extend/${EESSI_EXTEND_VERSION} installed, let's go!"
        fail_msg="Installing EESSI-extend/${EESSI_EXTEND_VERSION} failed, that's not good... (output: ${eb_install_out})"
        # while always adding --try-amend=keep... may do no harm, we could make
        # an attempt to figure out if it is needed, e.g., when we are rebuilding
        # ${EB} is deliberately left unquoted: it may hold a command plus options
        ${EB} "EESSI-extend-${EESSI_EXTEND_VERSION}.eb" --try-amend=keeppreviousinstall=True 2>&1 | tee "${eb_install_out}"
        # thanks to 'pipefail', $? reflects a failure of the eb command, not just of tee
        check_exit_code $? "${ok_msg}" "${fail_msg}"
    )

    # restore original $PATH and $PYTHONPATH values, and clean up environment variables that are no longer needed
    export PATH=${ORIG_PATH}
    export PYTHONPATH=${ORIG_PYTHONPATH}
    unset EB_TMPDIR ORIG_PATH ORIG_PYTHONPATH

    # keep the complete 'module avail' output for diagnostics, and decide on success by
    # searching that output for the module name (same criterion as the initial check)
    # instead of relying on the exit status of 'module avail'
    # NOTE(review): 'module avail' appears to exit with 0 even if nothing matches -- confirm
    module --ignore_cache avail "EESSI-extend/${EESSI_EXTEND_VERSION}" &> "${ml_av_eessi_extend_out}"
    if grep -qi "EESSI-extend/${EESSI_EXTEND_VERSION}" "${ml_av_eessi_extend_out}"; then
        echo_green ">> EESSI-extend/${EESSI_EXTEND_VERSION} module installed!"
    else
        fatal_error "EESSI-extend/${EESSI_EXTEND_VERSION} module failed to install?! (output of 'pip install' in ${pip_install_out}, output of 'eb' in ${eb_install_out}, output of 'module avail EESSI-extend' in ${ml_av_eessi_extend_out})"
    fi
fi

echo ">> Loading EESSI-extend/${EESSI_EXTEND_VERSION} module..."
module --ignore_cache load "EESSI-extend/${EESSI_EXTEND_VERSION}"

unset EESSI_EXTEND_VERSION
Loading

0 comments on commit 4cfd111

Please sign in to comment.