diff --git a/util/dvsim/SlurmLauncher.py b/util/dvsim/SlurmLauncher.py index 3f5c0dc0f8244..254fd49628b7e 100644 --- a/util/dvsim/SlurmLauncher.py +++ b/util/dvsim/SlurmLauncher.py @@ -5,6 +5,7 @@ import logging as log import os import shlex +import shutil import subprocess from Launcher import ErrorMessage, Launcher, LauncherError @@ -46,21 +47,21 @@ def _do_launch(self): self._dump_env_vars(exports) + # Add a command delimiter if necessary + slurm_setup_cmd = SLURM_SETUP_CMD + if slurm_setup_cmd and not slurm_setup_cmd.endswith(';'): + slurm_setup_cmd += ';' + + # Encapsulate the run command with the slurm invocation + slurm_cmd = f'srun -p {SLURM_QUEUE} --mem={SLURM_MEM} --mincpus={SLURM_MINCPUS} ' \ + f'--time={SLURM_TIMEOUT} --cpus-per-task={SLURM_CPUS_PER_TASK} ' \ + f'bash -c "{slurm_setup_cmd} {self.deploy.cmd}"' + try: with open(self.slurm_log_file, 'w') as out_file: out_file.write("[Executing]:\n{}\n\n".format(self.deploy.cmd)) out_file.flush() - # Add a command delimiter if necessary - slurm_setup_cmd = SLURM_SETUP_CMD - if slurm_setup_cmd != '' and not slurm_setup_cmd.endswith(';'): - slurm_setup_cmd += ';' - - # Encapsulate the run command with the slurm invocation - slurm_cmd = f'srun -p {SLURM_QUEUE} --mem={SLURM_MEM} --mincpus={SLURM_MINCPUS} ' \ - f'--time={SLURM_TIMEOUT} --cpus-per-task={SLURM_CPUS_PER_TASK} ' \ - f'bash -c "{slurm_setup_cmd} {self.deploy.cmd}"' - log.info(f'Executing slurm command: {slurm_cmd}') self.process = subprocess.Popen(shlex.split(slurm_cmd), bufsize=4096, @@ -68,6 +69,8 @@ def _do_launch(self): stdout=out_file, stderr=out_file, env=exports) + except IOError as e: + raise LauncherError(f'File Error: {e}\nError while handling {self.slurm_log_file}') except subprocess.SubprocessError as e: raise LauncherError(f'IO Error: {e}\nSee {self.deploy.get_log_path()}') finally: @@ -91,13 +94,18 @@ def poll(self): # Copy slurm job results to log file if os.path.exists(self.slurm_log_file): - with open(self.slurm_log_file, 'r') as slurm_file: - lines = slurm_file.readlines() - with open(self.deploy.get_log_path(), 'a') as out_file: - for line in lines: - out_file.write(line) - out_file.flush() - os.remove(self.slurm_log_file) + try: + with open(self.slurm_log_file, 'r') as slurm_file: + try: + with open(self.deploy.get_log_path(), 'a') as out_file: + shutil.copyfileobj(slurm_file, out_file) + except IOError as e: + raise LauncherError(f'File Error: {e} when handling ' + f'{self.deploy.get_log_path()}') + # Remove the temporary file from the slurm process + os.remove(self.slurm_log_file) + except IOError as e: + raise LauncherError(f'File Error: {e} when handling {self.slurm_log_file}') self.exit_code = self.process.returncode status, err_msg = self._check_status() @@ -109,7 +117,6 @@ def kill(self): This must be called between dispatching and reaping the process (the same window as poll()). - ''' assert self.process is not None