Skip to content

Commit

Permalink
Merge branch '2023.06-software.eessi.io' into 2023.06-software.eessi.…
Browse files Browse the repository at this point in the history
…io_a64fx_R-bundle-CRAN
  • Loading branch information
boegel committed Sep 10, 2024
2 parents 703fdb1 + 9e363d7 commit e879f45
Show file tree
Hide file tree
Showing 10 changed files with 200 additions and 46 deletions.
85 changes: 85 additions & 0 deletions .github/workflows/tests_eessi_module.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions
# Workflow that exercises the EESSI Lmod module found in init/modules of this repository
# against the software.eessi.io CernVM-FS repository.
name: Tests for eessi_module_functionality in software.eessi.io
# Runs on pushes to branches whose name ends in -software.eessi.io, and on every pull request.
on:
push:
branches: [ "*-software.eessi.io" ]
pull_request:
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
build:
runs-on: ubuntu-latest
strategy:
# keep running the remaining matrix entries when one of them fails
fail-fast: false
matrix:
# EESSI versions to test; later steps refer to this value as matrix.EESSI_VERSION
EESSI_VERSION:
- 2023.06
steps:
# actions are pinned to a full commit hash; the trailing comment records the matching release tag
- name: Check out software-layer repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

# provides /cvmfs/software.eessi.io, which the test steps read from
- name: Mount EESSI CernVM-FS pilot repository
uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0
with:
cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
# DIRECT: no HTTP proxy is configured for CernVM-FS
cvmfs_http_proxy: DIRECT
cvmfs_repositories: software.eessi.io

- name: Test for making sure spider cache is being used and not being rebuilt
  run: |
    # Initialise Lmod from the EESSI compatibility layer
    . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
    # Only look for modules in the checked-out repository, so the EESSI module under test is the one being loaded
    export MODULEPATH=init/modules
    configfile="configfile.txt"
    module -T load EESSI/${{matrix.EESSI_VERSION}}
    module --config > "${configfile}" 2>&1
    # Informational only: show the spider cache entries Lmod reports for the software layer.
    # GitHub Actions runs this script with 'bash -e' by default, so without '|| true' a missing
    # entry would abort the step right here, before the failure message below gets printed.
    grep cache "${configfile}" | grep software | grep -v compat || true
    # Listing all modules must finish within 10 seconds (presumably a cache rebuild would take longer)
    # and the Lmod configuration must mention a cache under the software layer (not the compat layer).
    if timeout 10s bash -c "LMOD_PAGER=none module --terse avail" && grep cache "${configfile}" | grep software | grep -v compat; then
      echo "EESSI spider cache is being used"
    else
      echo "EESSI spider cache is being rebuilt" >&2
      exit 1
    fi
    env | grep LMOD
    module purge
    unset MODULEPATH
- name: Test for archdetect_cpu functionality with invalid path
  run: |
    # Initialise Lmod
    . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
    export MODULEPATH=init/modules
    # Offer only a CPU path that does not exist in the software layer
    export EESSI_ARCHDETECT_OPTIONS="dummy/cpu"
    load_log="outfile.txt"
    # Loading the module is expected to fail, so a non-zero status must not end the step
    set +e
    module load EESSI/${{matrix.EESSI_VERSION}} > "${load_log}" 2>&1
    cat "${load_log}"
    # The module must have reported its software directory check error
    if ! grep -q "Software directory check" "${load_log}"; then
      echo "Test for picking up invalid path on \${archdetect_cpu} FAILED" >&2
      exit 1
    fi
    echo "Test for picking up invalid path on \${archdetect_cpu} PASSED"
    unset EESSI_ARCHDETECT_OPTIONS
    # Re-enable exit on non-zero status
    set -e
- name: Test for expected variables while adding dummy cpu archs and loading EESSI module
  run: |
    # Initialise Lmod
    . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
    export MODULEPATH=init/modules
    from_module="moduleout.txt"
    from_script="sourceout.txt"
    # Print the EESSI_S* and EESSI_C* variables of the current environment, sorted
    eessi_vars_snapshot() {
      env | grep -E '^(EESSI_S|EESSI_C)' | sort
    }
    # Put the detected CPU path between two non-existing ones
    CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
    export EESSI_ARCHDETECT_OPTIONS="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
    # Environment as set up by the EESSI module ...
    module load EESSI/${{matrix.EESSI_VERSION}}
    eessi_vars_snapshot > "${from_module}"
    module unload EESSI/${{matrix.EESSI_VERSION}}
    # ... versus the environment as set up by sourcing the init script
    source /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/init/bash
    eessi_vars_snapshot > "${from_script}"
    cat "${from_module}"
    cat "${from_script}"
    # Both ways of initialising must yield the same variables
    if ! diff "${from_module}" "${from_script}" > /dev/null; then
      echo "Test for checking env variables FAILED" >&2
      exit 1
    fi
    echo "Test for checking env variables PASSED"
2 changes: 1 addition & 1 deletion create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
-- If EESSI_PREFIX wasn't defined, we cannot check if this module was from the EESSI environment
-- In that case, we assume it isn't, otherwise EESSI_PREFIX would (probably) have been set
if eessi_prefix == nil then
return False
return false
else
-- NOTE: exact paths for site so may need to be updated later.
-- See https://github.com/EESSI/software-layer/pull/371
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,9 @@ easyconfigs:
# https://github.com/easybuilders/easybuild-easyconfigs/pull/20841
from-commit: f0e91e6e430ebf902f7788ebb47f0203dee60649
- R-4.3.2-gfbf-2023a.eb
- Highway-1.0.4-GCCcore-12.3.0.eb
- Brunsli-0.1-GCCcore-12.3.0.eb:
options:
# https://github.com/easybuilders/easybuild-easyconfigs/pull/21366
from-commit: 1736a123b1685836452587a5c51793257570bb2d
- R-bundle-CRAN-2023.12-foss-2023a.eb
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,25 @@ easyconfigs:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/20880
from-commit: bc6e08f89759b8b70166de5bfcb5056b9db8ec90
- wradlib-2.0.3-foss-2023a.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/21094
from-commit: 3a2e0b8e6ee45277d01fb7e2eb93027a28c9461a
- MBX-1.1.0-foss-2023a.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/21155
from-commit: 6929a67401f2a2ec58f91fb306332a77497d73ff
- Transrate-1.0.3-GCC-12.3.0.eb:
options:
# https://github.com/easybuilders/easybuild-easyblocks/pull/3381
include-easyblocks-from-commit: bb86f05d4917b29e022023f152efdf0ca5c14ded
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/20964
from-commit: 7d539a9e599d8bc5ac2bda6ee9587ef62351ee03
- Critic2-1.2-foss-2023a.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/20833
from-commit: 78426c2383fc7e4b9b9e77d7a77f336e1bee3843
- LRBinner-0.1-foss-2023a.eb:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/21310
from-commit: 799d9101df2cf81aabe252f00cc82a7246363f53
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ easyconfigs:
options:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/21200
from-commit: 765ba900daf5953e306c4dad896febe52fdd6c00
- HPL-2.3-foss-2023b.eb
6 changes: 3 additions & 3 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def post_ready_hook(self, *args, **kwargs):
# 'parallel' easyconfig parameter is set via EasyBlock.set_parallel in ready step based on available cores.
# here we reduce parallelism to only use half of that for selected software,
# to avoid failing builds/tests due to out-of-memory problems;
memory_hungry_build = self.name in ['libxc', 'TensorFlow']
memory_hungry_build = self.name in ['libxc', 'MBX', 'TensorFlow']
# on A64FX systems, (HBM) memory is typically scarce, so we need to use fewer cores for some builds
cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR')
memory_hungry_build_a64fx = cpu_target == CPU_TARGET_A64FX and self.name in ['Qt5']
Expand Down Expand Up @@ -333,7 +333,7 @@ def pre_prepare_hook_highway_handle_test_compilation_issues(self, *args, **kwarg
Solve issues with compiling or running the tests on both
neoverse_n1 and neoverse_v1 with Highway 1.0.4 and GCC 12.3.0:
- for neoverse_n1 we set optarch to GENERIC
- for neoverse_v1 we completely disable the tests
- for neoverse_v1 and a64fx we completely disable the tests
cfr. https://github.com/EESSI/software-layer/issues/469
"""
if self.name == 'Highway':
Expand All @@ -342,7 +342,7 @@ def pre_prepare_hook_highway_handle_test_compilation_issues(self, *args, **kwarg
# note: keep condition in sync with the one used in
# post_prepare_hook_highway_handle_test_compilation_issues
if self.version in ['1.0.4'] and tcname == 'GCCcore' and tcversion == '12.3.0':
if cpu_target == CPU_TARGET_NEOVERSE_V1:
if cpu_target in [CPU_TARGET_A64FX, CPU_TARGET_NEOVERSE_V1]:
self.cfg.update('configopts', '-DHWY_ENABLE_TESTS=OFF')
if cpu_target == CPU_TARGET_NEOVERSE_N1:
self.orig_optarch = build_option('optarch')
Expand Down
70 changes: 70 additions & 0 deletions init/modules/EESSI/2023.06.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
-- Help text for this module
help([[
Description
===========
The European Environment for Scientific Software Installations (EESSI, pronounced as easy) is a collaboration between different European partners in HPC community. The goal of this project is to build a common stack of scientific software installations for HPC systems and beyond, including laptops, personal workstations and cloud infrastructure.
More information
================
- URL: https://www.eessi.io/docs/
]])
whatis("Description: The European Environment for Scientific Software Installations (EESSI, pronounced as easy) is a collaboration between different European partners in HPC community. The goal of this project is to build a common stack of scientific software installations for HPC systems and beyond, including laptops, personal workstations and cloud infrastructure.")
whatis("URL: https://www.eessi.io/docs/")
-- declare a conflict with any loaded EESSI module
conflict("EESSI")
-- the EESSI version comes from the version of this modulefile
local eessi_version = myModuleVersion()
local eessi_repo = "/cvmfs/software.eessi.io"
-- root of this EESSI version inside the CernVM-FS repository
local eessi_prefix = pathJoin(eessi_repo, "versions", eessi_version)
local eessi_os_type = "linux"
setenv("EESSI_VERSION", eessi_version)
setenv("EESSI_CVMFS_REPO", eessi_repo)
setenv("EESSI_OS_TYPE", eessi_os_type)
-- Select the CPU-specific software subdirectory to use (e.g. "x86_64/intel/haswell").
--
-- Candidates are read from the colon-separated EESSI_ARCHDETECT_OPTIONS environment variable.
-- When that variable is not set, the archdetect wrapper script is sourced first
-- (presumably that is what defines it; the variable is read again right afterwards).
-- The first candidate that has a "software" directory under this EESSI version is returned;
-- if none qualifies, loading is aborted with LmodError.
--
-- Declared local so the helper is not defined as a global from within the modulefile.
local function archdetect_cpu()
    local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper.sh')
    if not os.getenv("EESSI_ARCHDETECT_OPTIONS") then
        -- automatic detection goes through source_sh, which is only attempted with Lmod >= 8.6;
        -- with older versions the user has to export EESSI_ARCHDETECT_OPTIONS manually
        if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
            LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ARCHDETECT_OPTIONS to the available cpu architecture in the form of: x86_64/intel/haswell:x86_64/generic or aarch64/neoverse_v1:aarch64/generic")
        end
        source_sh("bash", script)
    end
    -- fall back to an empty list, so that the loop below is skipped and the error at the end is raised
    local archdetect_options = os.getenv("EESSI_ARCHDETECT_OPTIONS") or ""
    -- walk over the colon-separated candidates in the order given
    for archdetect_filter_cpu in string.gmatch(archdetect_options, "([^:]+)") do
        if isDir(pathJoin(eessi_prefix, "software", eessi_os_type, archdetect_filter_cpu, "software")) then
            -- use x86_64/amd/zen3 for now when AMD Genoa (Zen4) CPU is detected,
            -- since optimized software installations for Zen4 are a work-in-progress,
            -- see https://gitlab.com/eessi/support/-/issues/37
            if archdetect_filter_cpu == "x86_64/amd/zen4" then
                archdetect_filter_cpu = "x86_64/amd/zen3"
                -- only tell the user about the substitution when the module is actually being loaded
                if mode() == "load" then
                    LmodMessage("Sticking to " .. archdetect_filter_cpu .. " for now, since optimized installations for AMD Genoa (Zen4) are a work in progress.")
                end
            end
            return archdetect_filter_cpu
        end
    end
    -- none of the candidates has a software directory
    LmodError("Software directory check for the detected architecture failed")
end
-- Resolve the CPU target once; all paths below are derived from it.
local archdetect = archdetect_cpu()
-- the CPU family is the first component of the target, i.e. everything before the first "/"
local cpu_family = string.match(archdetect, "([^/]+)")
local software_subdir = archdetect
local eprefix = pathJoin(eessi_prefix, "compat", eessi_os_type, cpu_family)
local software_path = pathJoin(eessi_prefix, "software", eessi_os_type, software_subdir)
local module_path = pathJoin(software_path, "modules", "all")
-- same location as the module path, but with "versions" replaced by "host_injections"
local site_module_path = module_path:gsub("versions", "host_injections")

-- environment variables describing the selected target, defined in this order
local target_vars = {
    { "EPREFIX", eprefix },
    { "EESSI_CPU_FAMILY", cpu_family },
    { "EESSI_SITE_MODULEPATH", site_module_path },
    { "EESSI_SOFTWARE_SUBDIR", software_subdir },
    { "EESSI_PREFIX", eessi_prefix },
    { "EESSI_EPREFIX", eprefix },
}
for _, entry in ipairs(target_vars) do
    setenv(entry[1], entry[2])
end

-- prepend the bin directories under the compat prefix (usr/bin is prepended last, so it ends up first)
for _, bindir in ipairs({ "bin", "usr/bin" }) do
    prepend_path("PATH", pathJoin(eprefix, bindir))
end

setenv("EESSI_SOFTWARE_PATH", software_path)
setenv("EESSI_MODULEPATH", module_path)
-- the software layer modules are added to MODULEPATH in every mode except "spider"
if not (mode() == "spider") then
    prepend_path("MODULEPATH", module_path)
end
prepend_path("LMOD_RC", pathJoin(software_path, "/.lmod/lmodrc.lua"))
prepend_path("MODULEPATH", site_module_path)
setenv("LMOD_PACKAGE_PATH", pathJoin(software_path, ".lmod"))

-- confirm success to the user, but only on an actual load
if mode() == "load" then
    LmodMessage("EESSI/" .. eessi_version .. " loaded successfully")
end
6 changes: 6 additions & 0 deletions install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ mc_files=(
)
copy_files_by_list ${TOPDIR}/init/Magic_Castle ${INSTALL_PREFIX}/init/Magic_Castle "${mc_files[@]}"

# Copy for init/modules/EESSI directory
# NOTE(review): this reuses the mc_files array name of the Magic Castle copy above it;
# the name is kept because the rest of the script is not visible here - consider renaming.
mc_files=(
    2023.06.lua
)
# paths are quoted so that a TOPDIR or INSTALL_PREFIX containing whitespace is passed as a single argument
copy_files_by_list "${TOPDIR}/init/modules/EESSI" "${INSTALL_PREFIX}/init/modules/EESSI" "${mc_files[@]}"

# Copy for the scripts directory
script_files=(
utils.sh
Expand Down
11 changes: 2 additions & 9 deletions reframe_config_bot.py.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,13 @@ site_configuration = {
'modules_system': 'lmod',
'partitions': [
{
'name': 'default',
'name': '__RFM_PARTITION__',
'scheduler': 'local',
'launcher': 'mpirun',
'environs': ['default'],
'features': [
FEATURES[CPU]
] + list(SCALES.keys()),
'processor': {
'num_cpus': __NUM_CPUS__,
'num_sockets': __NUM_SOCKETS__,
'num_cpus_per_core': __NUM_CPUS_PER_CORE__,
'num_cpus_per_socket': __NUM_CPUS_PER_SOCKET__,
},
'resources': [
{
'name': 'memory',
Expand Down Expand Up @@ -56,8 +50,7 @@ site_configuration = {
{
'purge_environment': True,
'resolve_module_conflicts': False, # avoid loading the module before submitting the job
# disable automatic detection of CPU architecture (since we're using local scheduler)
'remote_detect': False,
'remote_detect': True,
}
],
'logging': common_logging_config(),
Expand Down
38 changes: 5 additions & 33 deletions test_suite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -141,34 +141,6 @@ export RFM_PREFIX=$PWD/reframe_runs
echo "Configured reframe with the following environment variables:"
env | grep "RFM_"

# Inject correct CPU/memory properties into the ReFrame config file
echo "Collecting system-specific input for the ReFrame configuration file"
cpuinfo=$(lscpu)
if [[ "${cpuinfo}" =~ CPU\(s\):[^0-9]*([0-9]+) ]]; then
cpu_count=${BASH_REMATCH[1]}
echo "Detected CPU count: ${cpu_count}"
else
fatal_error "Failed to get the number of CPUs for the current test hardware with lscpu."
fi
if [[ "${cpuinfo}" =~ Socket\(s\):[^0-9]*([0-9]+) ]]; then
socket_count=${BASH_REMATCH[1]}
echo "Detected socket count: ${socket_count}"
else
fatal_error "Failed to get the number of sockets for the current test hardware with lscpu."
fi
if [[ "${cpuinfo}" =~ (Thread\(s\) per core:[^0-9]*([0-9]+)) ]]; then
threads_per_core=${BASH_REMATCH[2]}
echo "Detected threads per core: ${threads_per_core}"
else
fatal_error "Failed to get the number of threads per core for the current test hardware with lscpu."
fi
if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then
cores_per_socket=${BASH_REMATCH[2]}
echo "Detected cores per socket: ${cores_per_socket}"
else
fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu."
fi

# The /sys inside the container is not the same as the /sys of the host
# We want to extract the memory limit from the cgroup on the host (which is typically set by SLURM).
# Thus, bot/test.sh bind-mounts the host's /sys/fs/cgroup into /hostsys/fs/cgroup
Expand Down Expand Up @@ -201,13 +173,13 @@ else
fi
echo "Detected available memory: ${cgroup_mem_mib} MiB"

echo "Replacing detected system information in template ReFrame config file..."
cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES}
sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES
sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES
sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES
sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES
echo "Replacing memory limit in the ReFrame config file with the detected CGROUP memory limit: ${cgroup_mem_mib} MiB"
sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES
RFM_PARTITION="${SLURM_JOB_PARTITION}"
echo "Replacing partition name in the template ReFrame config file: ${RFM_PARTITION}"
sed -i "s/__RFM_PARTITION__/${RFM_PARTITION}/g" $RFM_CONFIG_FILES

# Make debugging easier by printing the final config file:
echo "Final config file (after replacements):"
cat "${RFM_CONFIG_FILES}"
Expand Down

0 comments on commit e879f45

Please sign in to comment.