Skip to content

Commit

Permalink
Merge branch '2023.06-software.eessi.io' of github-trz:EESSI/software…
Browse files Browse the repository at this point in the history
…-layer into debug-2023.06-software.eessi.io-PyTorch-2.1.2-foss-2023a
  • Loading branch information
truib committed Sep 3, 2024
2 parents d12685a + f98f5cd commit 82d7e1d
Show file tree
Hide file tree
Showing 11 changed files with 155 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,18 @@ easyconfigs:
- SciPy-bundle-2023.07-gfbf-2023a.eb
- ESPResSo-4.2.2-foss-2023a.eb
- ParaView-5.11.2-foss-2023a.eb
- OpenFOAM-10-foss-2023a.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/20958
from-commit: dbadb2074464d816740ee0e95595c2cb31b6338f
- OpenFOAM-11-foss-2023a.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/20958
from-commit: dbadb2074464d816740ee0e95595c2cb31b6338f
- OpenFOAM-v2312-foss-2023a.eb:
options:
# https://github.com/easybuilders/easybuild-easyblocks/pull/3388
include-easyblocks-from-commit: c8256a36e7062bc09f5ce30552a9de9827054c9e
# https://github.com/easybuilders/easybuild-easyconfigs/pull/20841
from-commit: f0e91e6e430ebf902f7788ebb47f0203dee60649
- R-4.3.2-gfbf-2023a.eb
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ easyconfigs:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/21136
from-commit: d8076ebaf8cb915762adebf88d385cc672b350dc
- gnuplot-5.4.6-GCCcore-12.2.0.eb
- gnuplot-5.4.6-GCCcore-12.2.0.eb
- h5py-3.8.0-foss-2022b.eb
- MDAnalysis-2.4.2-foss-2022b.eb
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,8 @@ easyconfigs:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/21136
from-commit: d8076ebaf8cb915762adebf88d385cc672b350dc
- grpcio-1.57.0-GCCcore-12.3.0.eb
- orjson-3.9.15-GCCcore-12.3.0.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/20880
from-commit: bc6e08f89759b8b70166de5bfcb5056b9db8ec90
- PyTorch-bundle-2.1.2-foss-2023a.eb
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,7 @@ easyconfigs:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/21034
from-commit: 76e7fc6657bab64bfbec826540a3a8f0040258f2
- STAR-2.7.11b-GCC-13.2.0.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/21200
from-commit: 765ba900daf5953e306c4dad896febe52fdd6c00
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# 2024.08.14
# hatchling-1.18.0 rebuild to account for easyconfig changed upstream
# see https://gitlab.com/eessi/support/-/issues/85 and
# https://github.com/easybuilders/easybuild-easyconfigs/pull/20389
easyconfigs:
- hatchling-1.18.0-GCCcore-12.3.0.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/20389
from-commit: 9580c0d67d6dd97b160b768a839bfcba6d5b21b9
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# 2024.08.23
# GObject-Introspection sets $LD_LIBRARY_PATH (to many different paths, including $EPREFIX/lib)
# when calling gcc, and this causes a lot of issues for, especially, scripts using /bin/bash.
#
# This rebuild ensures (by using a new EasyBuild hook) that GObject-Introspection will not set
# environment variables that are configured to be filtered by EasyBuild.
# This rebuild was not done initially for A64FX. This file is meant to do the same as the
# previous rebuild of GObject-Introspection-1.76.1-GCCcore-12.3.0 in other architectures,
# but for A64FX.
#
# See https://github.com/EESSI/software-layer/issues/196
easyconfigs:
- GObject-Introspection-1.76.1-GCCcore-12.3.0.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
easyconfigs:
- ReFrame-4.3.3.eb
77 changes: 57 additions & 20 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ if [[ ${LIST_REPOS} -eq 1 ]]; then
default_label=", default"
else
default_label=""
fi
fi
echo " ${cvmfs_repo} [CVMFS config repo${default_label}]"
done
for cfg_repo in "${!cfg_cvmfs_repos[@]}"
Expand Down Expand Up @@ -323,7 +323,7 @@ do
if [[ ! -n "${eessi_cvmfs_repos[${cvmfs_repo_name}]}" ]] ; then
[[ ${VERBOSE} -eq 1 ]] && echo "repo '${cvmfs_repo_name}' is not an EESSI CVMFS repository..."
# cvmfs_repo_name is actually a repository ID, use that to obtain
# the actual name from the EESSI_REPOS_CFG_FILE
# the actual name from the EESSI_REPOS_CFG_FILE
cfg_repo_id=${cvmfs_repo_name}
cvmfs_repo_name=$(cfg_get_value ${cfg_repo_id} "repo_name")
fi
Expand Down Expand Up @@ -595,11 +595,11 @@ do
# that the necessary information for accessing a CVMFS repository is made
# available inside the container
if [[ -n "${cfg_cvmfs_repos[${cvmfs_repo_name}]}" ]] ; then
cfg_repo_id=${cvmfs_repo_name}
cfg_repo_id=${cvmfs_repo_name}

# obtain CVMFS repository name from section for the given ID
# obtain CVMFS repository name from section for the given ID
cfg_repo_name=$(cfg_get_value ${cfg_repo_id} "repo_name")
# derive domain part from (cfg_)repo_name (everything after first '.')
# derive domain part from (cfg_)repo_name (everything after first '.')
repo_name_domain=${repo_name#*.}

# cfg_cvmfs_repos is populated through reading the file pointed to by
Expand All @@ -609,15 +609,15 @@ do
# copy repos.cfg to job directory --> makes it easier to inspect the job
cp -a ${EESSI_REPOS_CFG_FILE} ${EESSI_TMPDIR}/repos_cfg/.

# cfg file should include sections (one per CVMFS repository to be mounted)
# with each section containing the settings:
# - repo_name,
# - repo_version,
# - config_bundle, and
# - a map { filepath_in_bundle -> container_filepath }
# cfg file should include sections (one per CVMFS repository to be mounted)
# with each section containing the settings:
# - repo_name,
# - repo_version,
# - config_bundle, and
# - a map { filepath_in_bundle -> container_filepath }
#
# The config_bundle includes the files which are mapped ('->') to a target
# location in container:
# The config_bundle includes the files which are mapped ('->') to a target
# location in container:
# - default.local -> /etc/cvmfs/default.local
# contains CVMFS settings, e.g., CVMFS_HTTP_PROXY, CVMFS_QUOTA_LIMIT, ...
# - ${repo_name_domain}.conf -> /etc/cvmfs/domain.d/${repo_name_domain}.conf
Expand All @@ -641,7 +641,7 @@ do
# use information to set up dir ${EESSI_TMPDIR}/repos_cfg and define
# BIND mounts
# check if config_bundle exists, if so, unpack it into
# ${EESSI_TMPDIR}/repos_cfg; if it doesn't, exit with an error
# ${EESSI_TMPDIR}/repos_cfg; if it doesn't, exit with an error
# if config_bundle is relative path (no '/' at start) prepend it with
# EESSI_REPOS_CFG_DIR
config_bundle_path=
Expand Down Expand Up @@ -726,7 +726,7 @@ do
if [[ ${cfg_cvmfs_repos[${cvmfs_repo_name}]} ]]; then
[[ ${VERBOSE} -eq 1 ]] && echo "repo '${cvmfs_repo_name}' is not an EESSI CVMFS repository..."
# cvmfs_repo_name is actually a repository ID, use that to obtain
# the actual name from the EESSI_REPOS_CFG_FILE
# the actual name from the EESSI_REPOS_CFG_FILE
cfg_repo_id=${cvmfs_repo_name}
cvmfs_repo_name=$(cfg_get_value ${cfg_repo_id} "repo_name")
fi
Expand All @@ -736,15 +736,52 @@ do

# add fusemount options depending on requested access mode ('ro' - read-only; 'rw' - read & write)
if [[ ${cvmfs_repo_access} == "ro" ]] ; then
export EESSI_READONLY="container:cvmfs2 ${cvmfs_repo_name} /cvmfs/${cvmfs_repo_name}"
# need to distinguish between basic "ro" access and "ro" after a "rw" session
if [[ -d ${EESSI_TMPDIR}/${cvmfs_repo_name}/overlay-upper ]]; then
# the overlay-upper directory is only created in a read-write-session, thus
# we are resuming from such a session here (otherwise there shouldn't be such
# a directory yet as it is only created for read-write-sessions a bit further
# below); the overlay-upper directory can only exist because it is part of
# the ${RESUME} directory or tarball
# to be able to see the contents of the read-write session we have to mount
# the fuse-overlayfs (in read-only mode) on top of the CernVM-FS repository

echo "While processing '${cvmfs_repo_name}' to be mounted 'read-only' we detected an overlay-upper"
echo " directory (${EESSI_TMPDIR}/${cvmfs_repo_name}/overlay-upper) likely from a previous"
echo " session. Will use it as left-most directory in 'lowerdir' argument for fuse-overlayfs."

# make the target CernVM-FS repository available under /cvmfs_ro
export EESSI_READONLY="container:cvmfs2 ${cvmfs_repo_name} /cvmfs_ro/${cvmfs_repo_name}"

EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY}")

# now, put the overlay-upper read-only on top of the repo and make it available under the usual prefix /cvmfs
EESSI_READONLY_OVERLAY="container:fuse-overlayfs"
# The contents of the previous session are available under
# ${EESSI_TMPDIR} which is bind mounted to ${TMP_IN_CONTAINER}.
# Hence, we have to use ${TMP_IN_CONTAINER}/${cvmfs_repo_name}/overlay-upper
# the left-most directory given for the lowerdir argument is put on top,
# and with no upperdir=... the whole overlayfs is made available read-only
EESSI_READONLY_OVERLAY+=" -o lowerdir=${TMP_IN_CONTAINER}/${cvmfs_repo_name}/overlay-upper:/cvmfs_ro/${cvmfs_repo_name}"
EESSI_READONLY_OVERLAY+=" /cvmfs/${cvmfs_repo_name}"
export EESSI_READONLY_OVERLAY

EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY_OVERLAY}")
export EESSI_FUSE_MOUNTS
else
# basic "ro" access that doesn't require any fuse-overlayfs
echo "Mounting '${cvmfs_repo_name}' 'read-only' without fuse-overlayfs."

EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY}")
export EESSI_FUSE_MOUNTS
export EESSI_READONLY="container:cvmfs2 ${cvmfs_repo_name} /cvmfs/${cvmfs_repo_name}"

EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY}")
export EESSI_FUSE_MOUNTS
fi
elif [[ ${cvmfs_repo_access} == "rw" ]] ; then
# use repo-specific overlay directories
mkdir -p ${EESSI_TMPDIR}/${cvmfs_repo_name}/overlay-upper
mkdir -p ${EESSI_TMPDIR}/${cvmfs_repo_name}/overlay-work
[[ ${VERBOSE} -eq 1 ]] && echo -e "TMP directory contents:\n$(ls -l ${EESSI_TMPDIR})"
[[ ${VERBOSE} -eq 1 ]] && echo -e "TMP directory contents:\n$(ls -l ${EESSI_TMPDIR})"

# set environment variables for fuse mounts in Singularity container
export EESSI_READONLY="container:cvmfs2 ${cvmfs_repo_name} /cvmfs_ro/${cvmfs_repo_name}"
Expand All @@ -762,7 +799,7 @@ do
export EESSI_FUSE_MOUNTS
else
echo -e "ERROR: access mode '${cvmfs_repo_access}' for CVMFS repository\n '${cvmfs_repo_name}' is not known"
exit ${REPOSITORY_ERROR_EXITCODE}
exit ${REPOSITORY_ERROR_EXITCODE}
fi
# create repo_settings.sh file in ${EESSI_TMPDIR}/${cvmfs_repo_name} to store
# (intention is that the file could be just sourced to obtain the settings)
Expand Down
2 changes: 2 additions & 0 deletions init/lmod_eessi_archdetect_wrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# This can be leveraged by the source_sh() feature of Lmod
#
# Runs eessi_archdetect.sh, which is looked up in the directory that holds this
# file (after resolving symlinks with 'readlink -f'), with the arguments
# '-a cpupath' and exports whatever it prints as EESSI_ARCHDETECT_OPTIONS.
#
# Every expansion is quoted so that a location containing whitespace or glob
# characters still resolves to the intended script instead of being word-split.
# The export and the assignment are deliberately kept in a single statement:
# that way the exit status seen by whoever sources this file is that of
# 'export', exactly as before.
export EESSI_ARCHDETECT_OPTIONS="$("$(dirname "$(readlink -f -- "${BASH_SOURCE[0]}")")/eessi_archdetect.sh" -a cpupath)"
2 changes: 1 addition & 1 deletion install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ TOPDIR=$(dirname $(realpath $0))
# Copy for init directory
init_files=(
bash eessi_archdetect.sh eessi_defaults eessi_environment_variables eessi_software_subdir_for_host.py
minimal_eessi_env README.md test.py
minimal_eessi_env README.md test.py lmod_eessi_archdetect_wrapper.sh
)
copy_files_by_list ${TOPDIR}/init ${INSTALL_PREFIX}/init "${init_files[@]}"

Expand Down
49 changes: 45 additions & 4 deletions test_suite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,17 @@ fi
TMPDIR=$(mktemp -d)

echo ">> Setting up environment..."
module --force purge
export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS)
# For this call to be successful, it needs to be able to import archspec (which is part of EESSI)
# Thus, we execute it in a subshell where EESSI is already initialized (a bit like a bootstrap)
export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(source $TOPDIR/init/bash > /dev/null 2>&1; python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS)
echo "EESSI_SOFTWARE_SUBDIR_OVERRIDE: $EESSI_SOFTWARE_SUBDIR_OVERRIDE"

source $TOPDIR/init/bash

# We have to ignore the LMOD cache, otherwise the software that is built in the build step cannot be found/loaded
# Reason is that the LMOD cache is normally only updated on the Stratum 0, once everything is ingested
export LMOD_IGNORE_CACHE=1

# Load the ReFrame module
# Currently, we load the default version. Maybe we should somehow make this configurable in the future?
module load ReFrame
Expand Down Expand Up @@ -136,40 +142,75 @@ echo "Configured reframe with the following environment variables:"
env | grep "RFM_"

# Inject correct CPU/memory properties into the ReFrame config file
echo "Collecting system-specific input for the ReFrame configuration file"
cpuinfo=$(lscpu)
if [[ "${cpuinfo}" =~ CPU\(s\):[^0-9]*([0-9]+) ]]; then
cpu_count=${BASH_REMATCH[1]}
echo "Detected CPU count: ${cpu_count}"
else
fatal_error "Failed to get the number of CPUs for the current test hardware with lscpu."
fi
if [[ "${cpuinfo}" =~ Socket\(s\):[^0-9]*([0-9]+) ]]; then
socket_count=${BASH_REMATCH[1]}
echo "Detected socket count: ${socket_count}"
else
fatal_error "Failed to get the number of sockets for the current test hardware with lscpu."
fi
if [[ "${cpuinfo}" =~ (Thread\(s\) per core:[^0-9]*([0-9]+)) ]]; then
threads_per_core=${BASH_REMATCH[2]}
echo "Detected threads per core: ${threads_per_core}"
else
fatal_error "Failed to get the number of threads per core for the current test hardware with lscpu."
fi
if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then
cores_per_socket=${BASH_REMATCH[2]}
echo "Detected cores per socket: ${cores_per_socket}"
else
fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu."
fi
cgroup_mem_bytes=$(cat /hostsys/fs/cgroup/memory/slurm/uid_${UID}/job_${SLURM_JOB_ID}/memory.limit_in_bytes)

# The /sys inside the container is not the same as the /sys of the host
# We want to extract the memory limit from the cgroup on the host (which is typically set by SLURM).
# Thus, bot/test.sh bind-mounts the host's /sys/fs/cgroup into /hostsys/fs/cgroup
# and that's the prefix we use to extract the memory limit from
cgroup_v1_mem_limit="/hostsys/fs/cgroup/memory/$(</proc/self/cpuset)/memory.limit_in_bytes"
cgroup_v2_mem_limit="/hostsys/fs/cgroup/$(</proc/self/cpuset)/memory.max"
if [ -f "$cgroup_v1_mem_limit" ]; then
echo "Getting memory limit from file $cgroup_v1_mem_limit"
cgroup_mem_bytes=$(cat "$cgroup_v1_mem_limit")
elif [ -f "$cgroup_v2_mem_limit" ]; then
echo "Getting memory limit from file $cgroup_v2_mem_limit"
cgroup_mem_bytes=$(cat "$cgroup_v2_mem_limit")
if [ "$cgroup_mem_bytes" = 'max' ]; then
# In cgroupsv2, the memory.max file may contain 'max', meaning the group can use the full system memory
# Here, we get the system memory from /proc/meminfo. Units are supposedly always in kB, but let's match them too
cgroup_mem_kilobytes=$(grep -oP 'MemTotal:\s+\K\d+(?=\s+kB)' /proc/meminfo)
if [[ $? -ne 0 ]] || [[ -z "$cgroup_mem_kilobytes" ]]; then
fatal_error "Failed to get memory limit from /proc/meminfo"
fi
cgroup_mem_bytes=$(("$cgroup_mem_kilobytes"*1024))
fi
else
fatal_error "Both files ${cgroup_v1_mem_limit} and ${cgroup_v2_mem_limit} couldn't be found. Failed to get the memory limit from the current cgroup"
fi
if [[ $? -eq 0 ]]; then
# Convert to MiB
cgroup_mem_mib=$((cgroup_mem_bytes/(1024*1024)))
cgroup_mem_mib=$(("$cgroup_mem_bytes"/(1024*1024)))
else
fatal_error "Failed to get the memory limit in bytes from the current cgroup"
fi
echo "Detected available memory: ${cgroup_mem_mib} MiB"

echo "Replacing detected system information in template ReFrame config file..."
cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES}
sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES
sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES
sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES
sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES
sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES
# Make debugging easier by printing the final config file:
echo "Final config file (after replacements):"
cat "${RFM_CONFIG_FILES}"

# Workaround for https://github.com/EESSI/software-layer/pull/467#issuecomment-1973341966
export PSM3_DEVICES='self,shm' # this is enough, since we only run single node for now
Expand Down

0 comments on commit 82d7e1d

Please sign in to comment.