From 1dcf08174c6b5118661a6508781422fee6bd3f23 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 21 Jun 2024 21:53:59 +0200 Subject: [PATCH 1/6] add setting to give all jobs a unique name --- README.md | 7 +++++++ app.cfg.example | 4 ++++ eessi_bot_event_handler.py | 1 + eessi_bot_job_manager.py | 8 +++++++- tasks/build.py | 9 +++++++++ tools/config.py | 1 + 6 files changed, 29 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e268fc41..9e556311 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,13 @@ package repositories. Typically these settings are set in the prologue of a Slurm job. However, when entering the [EESSI compatibility layer](https://www.eessi.io/docs/compatibility_layer), most environment settings are cleared. Hence, they need to be set again at a later stage. +``` +job_name = JOB_NAME +``` +Replace `JOB_NAME` with a string of at least 3 characters that is used as job +name when a job is submitted. The name is used to filter jobs, e.g., it should be +set so that multiple bot instances can run in the same Slurm environment. + ``` jobs_base_dir = PATH_TO_JOBS_BASE_DIR ``` diff --git a/app.cfg.example b/app.cfg.example index ae51ade6..7cbde15d 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -87,6 +87,10 @@ container_cachedir = PATH_TO_SHARED_DIRECTORY # http_proxy = http://PROXY_DNS:3128/ # https_proxy = http://PROXY_DNS:3128/ +# Used to give all jobs of a bot instance the same name. Can be used to allow +# multiple bot instances to run on the same Slurm cluster. 
+job_name = prod + # directory under which the bot prepares directories per job # structure created is as follows: YYYY.MM/pr_PR_NUMBER/event_EVENT_ID/run_RUN_NUMBER/OS+SUBDIR jobs_base_dir = $HOME/jobs diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 5677ed2c..d414f947 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -51,6 +51,7 @@ # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional # config.BUILDENV_SETTING_HTTPS_PROXY, # optional # config.BUILDENV_SETTING_HTTP_PROXY, # optional + config.BUILDENV_SETTING_JOB_NAME, # required config.BUILDENV_SETTING_JOBS_BASE_DIR, # required # config.BUILDENV_SETTING_LOAD_MODULES, # optional config.BUILDENV_SETTING_LOCAL_TMP, # required diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index e7473f00..aba40081 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -50,6 +50,8 @@ # settings that are required in 'app.cfg' REQUIRED_CONFIG = { + config.SECTION_BUILDENV: [ + config.BUILDENV_SETTING_JOB_NAME], # required config.SECTION_FINISHED_JOB_COMMENTS: [ config.FINISHED_JOB_COMMENTS_SETTING_JOB_RESULT_UNKNOWN_FMT, # required config.FINISHED_JOB_COMMENTS_SETTING_JOB_TEST_UNKNOWN_FMT], # required @@ -85,6 +87,10 @@ def __init__(self): cfg = config.read_config() job_manager_cfg = cfg[config.SECTION_JOB_MANAGER] self.logfile = job_manager_cfg.get(config.JOB_MANAGER_SETTING_LOG_PATH) + buildenv_cfg = cfg[config.SECTION_BUILDENV] + self.job_name = buildenv_cfg.get(config.BUILDENV_SETTING_JOB_NAME) + if len(self.job_name) < 3: + raise Exception(f"job name ({self.job_name}) is shorter than 3 characters") def get_current_jobs(self): """ @@ -105,7 +111,7 @@ def get_current_jobs(self): if username is None: raise Exception("Unable to find username") - squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) + squeue_cmd = "%s --long --noheader --user=%s --name='%s'" % (self.poll_command, username, self.job_name) squeue_output, 
squeue_err, squeue_exitcode = run_cmd( squeue_cmd, "get_current_jobs(): squeue command", diff --git a/tasks/build.py b/tasks/build.py index 82a0911e..5fa36076 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -67,6 +67,10 @@ def get_build_env_cfg(cfg): buildenv = cfg[config.SECTION_BUILDENV] + job_name = buildenv.get(config.BUILDENV_SETTING_JOB_NAME) + log(f"{fn}(): job_name '{job_name}'") + config_data = {config.BUILDENV_SETTING_JOB_NAME: job_name} + jobs_base_dir = buildenv.get(config.BUILDENV_SETTING_JOBS_BASE_DIR) log(f"{fn}(): jobs_base_dir '{jobs_base_dir}'") config_data = {config.BUILDENV_SETTING_JOBS_BASE_DIR: jobs_base_dir} @@ -640,6 +644,10 @@ def submit_job(job, cfg): build_env_cfg = get_build_env_cfg(cfg) + # the job_name is used to filter jobs in case multiple bot + # instances run on the same system + job_name = cfg[config.SECTION_BUILDENV].get(config.BUILDENV_SETTING_JOB_NAME) + # add a default time limit of 24h to the job submit command if no other time # limit is specified already all_opts_str = " ".join([build_env_cfg[config.BUILDENV_SETTING_SLURM_PARAMS], job.slurm_opts]) @@ -654,6 +662,7 @@ def submit_job(job, cfg): build_env_cfg[config.BUILDENV_SETTING_SLURM_PARAMS], time_limit, job.slurm_opts, + f"--job-name='{job_name}'", build_env_cfg[config.BUILDENV_SETTING_BUILD_JOB_SCRIPT], ]) diff --git a/tools/config.py b/tools/config.py index dcffe03d..11527702 100644 --- a/tools/config.py +++ b/tools/config.py @@ -43,6 +43,7 @@ BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS = 'cvmfs_customizations' BUILDENV_SETTING_HTTPS_PROXY = 'https_proxy' BUILDENV_SETTING_HTTP_PROXY = 'http_proxy' +BUILDENV_SETTING_JOB_NAME = 'job_name' BUILDENV_SETTING_JOBS_BASE_DIR = 'jobs_base_dir' BUILDENV_SETTING_LOAD_MODULES = 'load_modules' BUILDENV_SETTING_LOCAL_TMP = 'local_tmp' From 326dd38d55a7d937a7867767c184d4889f99aa93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 24 Jun 2024 21:20:04 +0200 Subject: [PATCH 2/6] Only use job_name if it is not 
None Co-authored-by: Kenneth Hoste --- eessi_bot_job_manager.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index aba40081..f8d4368a 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -89,7 +89,7 @@ def __init__(self): self.logfile = job_manager_cfg.get(config.JOB_MANAGER_SETTING_LOG_PATH) buildenv_cfg = cfg[config.SECTION_BUILDENV] self.job_name = buildenv_cfg.get(config.BUILDENV_SETTING_JOB_NAME) - if len(self.job_name) < 3: + if self.job_name and len(self.job_name) < 3: raise Exception(f"job name ({self.job_name}) is shorter than 3 characters") def get_current_jobs(self): @@ -111,7 +111,9 @@ def get_current_jobs(self): if username is None: raise Exception("Unable to find username") - squeue_cmd = "%s --long --noheader --user=%s --name='%s'" % (self.poll_command, username, self.job_name) + squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) + if self.job_name: + squeue_cmd += "--name='%s'" % self.job_name squeue_output, squeue_err, squeue_exitcode = run_cmd( squeue_cmd, "get_current_jobs(): squeue command", From 76a263d0a6843c9acc5bd7a3b8be48f47f5fcd72 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Mon, 24 Jun 2024 21:23:35 +0200 Subject: [PATCH 3/6] add space before name argument --- eessi_bot_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index f8d4368a..bb0c6dd8 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -113,7 +113,7 @@ def get_current_jobs(self): squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) if self.job_name: - squeue_cmd += "--name='%s'" % self.job_name + squeue_cmd += " --name='%s'" % self.job_name squeue_output, squeue_err, squeue_exitcode = run_cmd( squeue_cmd, "get_current_jobs(): squeue command", From c614a96f801ae2c79f9f1fa9cd5d3cdecbc156db Mon Sep 17 00:00:00 2001 From: 
Thomas Roeblitz Date: Mon, 24 Jun 2024 21:34:42 +0200 Subject: [PATCH 4/6] only add job-name submission arg if job_name is not None --- tasks/build.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 5fa36076..b2de809d 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -661,10 +661,9 @@ def submit_job(job, cfg): build_env_cfg[config.BUILDENV_SETTING_SUBMIT_COMMAND], build_env_cfg[config.BUILDENV_SETTING_SLURM_PARAMS], time_limit, - job.slurm_opts, - f"--job-name='{job_name}'", - build_env_cfg[config.BUILDENV_SETTING_BUILD_JOB_SCRIPT], - ]) + job.slurm_opts] + + ([f"--job-name='{job_name}'"] if job_name else []) + + [build_env_cfg[config.BUILDENV_SETTING_BUILD_JOB_SCRIPT]]) cmdline_output, cmdline_error, cmdline_exit_code = run_cmd(command_line, "submit job for target '%s'" % job.arch_target, From b285f81c59426854cb65a77ece21cce686e92127 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 25 Jun 2024 08:43:52 +0200 Subject: [PATCH 5/6] add missing config section to app.cfg for tests --- tests/test_app.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_app.cfg b/tests/test_app.cfg index f940c1df..fd91ed8b 100644 --- a/tests/test_app.cfg +++ b/tests/test_app.cfg @@ -11,6 +11,8 @@ # sample config file for tests (some functions run config.read_config() # which reads app.cfg by default) +[buildenv] + [job_manager] # variable 'comment' under 'submitted_job_comments' should not be changed as there are regular expression patterns matching it From 78e625e2e737192c4b6a45eeffbc237826a89170 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 7 Aug 2024 11:04:08 +0200 Subject: [PATCH 6/6] fix issue overwriting of config_data entry --- tasks/build.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index b2de809d..46b9543a 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -65,15 +65,16 @@ def get_build_env_cfg(cfg): """ fn = 
sys._getframe().f_code.co_name + config_data = {} buildenv = cfg[config.SECTION_BUILDENV] job_name = buildenv.get(config.BUILDENV_SETTING_JOB_NAME) log(f"{fn}(): job_name '{job_name}'") - config_data = {config.BUILDENV_SETTING_JOB_NAME: job_name} + config_data[config.BUILDENV_SETTING_JOB_NAME] = job_name jobs_base_dir = buildenv.get(config.BUILDENV_SETTING_JOBS_BASE_DIR) log(f"{fn}(): jobs_base_dir '{jobs_base_dir}'") - config_data = {config.BUILDENV_SETTING_JOBS_BASE_DIR: jobs_base_dir} + config_data[config.BUILDENV_SETTING_JOBS_BASE_DIR] = jobs_base_dir local_tmp = buildenv.get(config.BUILDENV_SETTING_LOCAL_TMP) log(f"{fn}(): local_tmp '{local_tmp}'")