From 1341964ec4592ab178b282c6fca5655e7bec39aa Mon Sep 17 00:00:00 2001 From: Richard Top Date: Fri, 14 Jun 2024 09:29:17 +0000 Subject: [PATCH 1/2] Add mem per node setting in ReFrame configuration template --- reframe_config_bot.py.tmpl | 5 +++++ test_suite.sh | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/reframe_config_bot.py.tmpl b/reframe_config_bot.py.tmpl index 0cc3e9f530..607373767a 100644 --- a/reframe_config_bot.py.tmpl +++ b/reframe_config_bot.py.tmpl @@ -34,6 +34,11 @@ site_configuration = { 'options': ['--mem={size}'], } ], + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': __MEM_PER_NODE__, + }, 'max_jobs': 1 } ] diff --git a/test_suite.sh b/test_suite.sh index 2f304dd9bc..104f09fc2c 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -135,7 +135,7 @@ export RFM_PREFIX=$PWD/reframe_runs echo "Configured reframe with the following environment variables:" env | grep "RFM_" -# Inject correct CPU properties into the ReFrame config file +# Inject correct CPU/memory properties into the ReFrame config file cpuinfo=$(lscpu) if [[ "${cpuinfo}" =~ CPU\(s\):[^0-9]*([0-9]+) ]]; then cpu_count=${BASH_REMATCH[1]} @@ -165,6 +165,13 @@ if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then else fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu." fi +cgroup_mem_bytes=$(cat /sys/fs/cgroup/memory/slurm/uid_${UID}/job_${SLURM_JOB_ID}/memory.limit_in_bytes) +if [[ $? -eq 0 ]]; then + # Convert to MiB + cgroup_mem_mib=$((cgroup_mem_bytes/(1024*1024))) +else + fatal_error "Failed to get the memory limit in bytes from the current cgroup" +fi cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES From e05c9e70ef7a99c28b4f2c101751a24a27bf499c Mon Sep 17 00:00:00 2001 From: Richard Top Date: Fri, 14 Jun 2024 09:31:28 +0000 Subject: [PATCH 2/2] Add mem per node setting in ReFrame configuration template --- test_suite.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test_suite.sh b/test_suite.sh index 104f09fc2c..af2c5bfb2f 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -177,6 +177,7 @@ sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES +sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES # Workaround for https://github.com/EESSI/software-layer/pull/467#issuecomment-1973341966 export PSM3_DEVICES='self,shm' # this is enough, since we only run single node for now