From 75c00bbaeb07ebd79adc2cc8657e3ca6be8d2268 Mon Sep 17 00:00:00 2001 From: vsc46128 vscuser Date: Wed, 21 Feb 2024 12:56:03 +0100 Subject: [PATCH 1/6] fix for job manager crash: Unable to contact slurm controller --- eessi_bot_job_manager.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 7d40b545..58f68fbe 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -113,8 +113,14 @@ def get_current_jobs(self): squeue_cmd, "get_current_jobs(): squeue command", log_file=self.logfile, + raise_on_error=False, ) + if squeue_exitcode != 0: + current_jobs = {} + log("The squeue command failed will try again in {} seconds".format(config.read_config()["job_manager"].get("poll_interval"))) + return current_jobs + # create dictionary of jobs from output of 'squeue_cmd' # with the following information per job: jobid, state, # nodelist_reason From d458da17fd20a1aaed17132e929c96cb847b2ca5 Mon Sep 17 00:00:00 2001 From: vsc46128 vscuser Date: Wed, 21 Feb 2024 13:59:46 +0100 Subject: [PATCH 2/6] fix long line --- eessi_bot_job_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 58f68fbe..ebb00fd6 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -118,7 +118,8 @@ def get_current_jobs(self): if squeue_exitcode != 0: current_jobs = {} - log("The squeue command failed will try again in {} seconds".format(config.read_config()["job_manager"].get("poll_interval"))) + poll_interval = config.read_config()["job_manager"].get("poll_interval") + log("The squeue command failed will try again in {} seconds".format(poll_interval)) return current_jobs # create dictionary of jobs from output of 'squeue_cmd' From 307c62c85c2eada4a1be7c93ea4130294874685d Mon Sep 17 00:00:00 2001 From: vsc46128 vscuser Date: Thu, 22 Feb 2024 13:09:12 +0100 Subject: [PATCH 3/6] move poll_interval and skip loop if job_manager.get_current_jobs() fails --- eessi_bot_job_manager.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index ebb00fd6..288b8014 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -674,6 +674,13 @@ def main(): if max_iter != 0: known_jobs = job_manager.get_known_jobs() while max_iter < 0 or i < max_iter: + # sleep poll_interval seconds (not for the first iteration) + if i != 0 and (max_iter < 0 or i < max_iter): + log( + "job manager main loop: sleep %d seconds" % poll_interval, + job_manager.logfile, + ) + time.sleep(poll_interval) log("job manager main loop: iteration %d" % i, job_manager.logfile) log( "job manager main loop: known_jobs='%s'" % ",".join( @@ -681,7 +688,12 @@ def main(): job_manager.logfile, ) - current_jobs = job_manager.get_current_jobs() + try: + current_jobs = job_manager.get_current_jobs() + except: + i = i + 1 + continue + log( "job manager main loop: current_jobs='%s'" % ",".join( current_jobs.keys()), @@ -736,13 +748,7 @@ def main(): known_jobs = current_jobs - # sleep poll_interval seconds (only if at least one more iteration) - if max_iter < 0 or i + 1 < max_iter: - log( - "job manager main loop: sleep %d seconds" % poll_interval, - job_manager.logfile, - ) - time.sleep(poll_interval) + # add one iteration to the loop i = i + 1 From 43ab1a6493e7583bbdaa8b99fe2afd105f276302 Mon Sep 17 00:00:00 2001 From: vsc46128 vscuser Date: Thu, 22 Feb 2024 14:07:16 +0100 Subject: [PATCH 4/6] remove other fix --- eessi_bot_job_manager.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 288b8014..558060b4 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -113,15 +113,8 @@ def get_current_jobs(self): squeue_cmd, "get_current_jobs(): squeue command", log_file=self.logfile, - raise_on_error=False, ) - if squeue_exitcode != 0: - current_jobs = {} - poll_interval = config.read_config()["job_manager"].get("poll_interval") - log("The squeue command failed will try again in {} seconds".format(poll_interval)) - return current_jobs - # create dictionary of jobs from output of 'squeue_cmd' # with the following information per job: jobid, state, # nodelist_reason From f7b38df9792e327686229347de9051b56c0065ca Mon Sep 17 00:00:00 2001 From: vsc46128 vscuser Date: Thu, 22 Feb 2024 14:11:32 +0100 Subject: [PATCH 5/6] fix except --- eessi_bot_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 558060b4..224fc201 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -683,7 +683,7 @@ def main(): try: current_jobs = job_manager.get_current_jobs() - except: + except Exception as err: i = i + 1 continue From e73bfe62ea39bb67cbd4d140896aab85ce024d18 Mon Sep 17 00:00:00 2001 From: vsc46128 vscuser Date: Thu, 22 Feb 2024 14:15:51 +0100 Subject: [PATCH 6/6] implement suggested changes --- eessi_bot_job_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 224fc201..5c475898 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -668,7 +668,7 @@ def main(): known_jobs = job_manager.get_known_jobs() while max_iter < 0 or i < max_iter: # sleep poll_interval seconds (not for the first iteration) - if i != 0 and (max_iter < 0 or i < max_iter): + if i != 0: log( "job manager main loop: sleep %d seconds" % poll_interval, job_manager.logfile, @@ -683,7 +683,7 @@ def main(): try: current_jobs = job_manager.get_current_jobs() - except Exception as err: + except RuntimeError: i = i + 1 continue