From 95aa9909ca0b01b478f282c84b24d77a29c7cb9b Mon Sep 17 00:00:00 2001 From: Robert Schilling Date: Thu, 31 Oct 2024 07:32:25 +0100 Subject: [PATCH] [dvsim] Improve error handling on slurm launcher Incorporate post-merge feedback from @matutem. Signed-off-by: Robert Schilling --- util/dvsim/SlurmLauncher.py | 45 ++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/util/dvsim/SlurmLauncher.py b/util/dvsim/SlurmLauncher.py index 3f5c0dc0f8244e..14772635e89615 100644 --- a/util/dvsim/SlurmLauncher.py +++ b/util/dvsim/SlurmLauncher.py @@ -5,6 +5,7 @@ import logging as log import os import shlex +import shutil import subprocess from Launcher import ErrorMessage, Launcher, LauncherError @@ -46,21 +47,21 @@ def _do_launch(self): self._dump_env_vars(exports) + # Add a command delimiter if necessary + slurm_setup_cmd = SLURM_SETUP_CMD + if slurm_setup_cmd and not slurm_setup_cmd.endswith(';'): + slurm_setup_cmd += ';' + + # Encapsulate the run command with the slurm invocation + slurm_cmd = f'srun -p {SLURM_QUEUE} --mem={SLURM_MEM} --mincpus={SLURM_MINCPUS} ' \ + f'--time={SLURM_TIMEOUT} --cpus-per-task={SLURM_CPUS_PER_TASK} ' \ + f'bash -c "{slurm_setup_cmd} {self.deploy.cmd}"' + try: with open(self.slurm_log_file, 'w') as out_file: out_file.write("[Executing]:\n{}\n\n".format(self.deploy.cmd)) out_file.flush() - # Add a command delimiter if necessary - slurm_setup_cmd = SLURM_SETUP_CMD - if slurm_setup_cmd != '' and not slurm_setup_cmd.endswith(';'): - slurm_setup_cmd += ';' - - # Encapsulate the run command with the slurm invocation - slurm_cmd = f'srun -p {SLURM_QUEUE} --mem={SLURM_MEM} --mincpus={SLURM_MINCPUS} ' \ - f'--time={SLURM_TIMEOUT} --cpus-per-task={SLURM_CPUS_PER_TASK} ' \ - f'bash -c "{slurm_setup_cmd} {self.deploy.cmd}"' - log.info(f'Executing slurm command: {slurm_cmd}') self.process = subprocess.Popen(shlex.split(slurm_cmd), bufsize=4096, @@ -68,6 +69,8 @@ def _do_launch(self): stdout=out_file, stderr=out_file, env=exports) + except IOError as e: + raise LauncherError(f'File Error: {e}\nError while handling {self.slurm_log_file}') except subprocess.SubprocessError as e: raise LauncherError(f'IO Error: {e}\nSee {self.deploy.get_log_path()}') finally: @@ -91,13 +94,20 @@ def poll(self): # Copy slurm job results to log file if os.path.exists(self.slurm_log_file): - with open(self.slurm_log_file, 'r') as slurm_file: - lines = slurm_file.readlines() - with open(self.deploy.get_log_path(), 'a') as out_file: - for line in lines: - out_file.write(line) - out_file.flush() - os.remove(self.slurm_log_file) + + shutil.move(slurm_file, ) + try: + with open(self.slurm_log_file, 'r') as slurm_file: + try: + with open(self.deploy.get_log_path(), 'a') as out_file: + shutil.copyfileobj(slurm_file, out_file) + except IOError as e: + raise LauncherError(f'File Error: {e} when handling ' + f'{self.deploy.get_log_path()}') + # Remove the temporary file from the slurm process + os.remove(self.slurm_log_file) + except IOError as e: + raise LauncherError(f'File Error: {e} when handling {self.slurm_log_file}') self.exit_code = self.process.returncode status, err_msg = self._check_status() @@ -109,7 +119,6 @@ def kill(self): This must be called between dispatching and reaping the process (the same window as poll()). - ''' assert self.process is not None