Skip to content

Commit

Permalink
[dvsim] Improve error handling on slurm launcher
Browse files: browse the repository at this point in the history
Incorporate post-merge feedback from @matutem.

Signed-off-by: Robert Schilling <[email protected]>
  • Loading branch information
Razer6 committed Nov 5, 2024
1 parent 6581ef8 commit 0ead59f
Showing 1 changed file with 25 additions and 18 deletions.
43 changes: 25 additions & 18 deletions util/dvsim/SlurmLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging as log
import os
import shlex
import shutil
import subprocess

from Launcher import ErrorMessage, Launcher, LauncherError
Expand Down Expand Up @@ -46,28 +47,30 @@ def _do_launch(self):

self._dump_env_vars(exports)

# Add a command delimiter if necessary
slurm_setup_cmd = SLURM_SETUP_CMD
if slurm_setup_cmd and not slurm_setup_cmd.endswith(';'):
slurm_setup_cmd += ';'

# Encapsulate the run command with the slurm invocation
slurm_cmd = f'srun -p {SLURM_QUEUE} --mem={SLURM_MEM} --mincpus={SLURM_MINCPUS} ' \
f'--time={SLURM_TIMEOUT} --cpus-per-task={SLURM_CPUS_PER_TASK} ' \
f'bash -c "{slurm_setup_cmd} {self.deploy.cmd}"'

try:
with open(self.slurm_log_file, 'w') as out_file:
out_file.write("[Executing]:\n{}\n\n".format(self.deploy.cmd))
out_file.flush()

# Add a command delimiter if necessary
slurm_setup_cmd = SLURM_SETUP_CMD
if slurm_setup_cmd != '' and not slurm_setup_cmd.endswith(';'):
slurm_setup_cmd += ';'

# Encapsulate the run command with the slurm invocation
slurm_cmd = f'srun -p {SLURM_QUEUE} --mem={SLURM_MEM} --mincpus={SLURM_MINCPUS} ' \
f'--time={SLURM_TIMEOUT} --cpus-per-task={SLURM_CPUS_PER_TASK} ' \
f'bash -c "{slurm_setup_cmd} {self.deploy.cmd}"'

log.info(f'Executing slurm command: {slurm_cmd}')
self.process = subprocess.Popen(shlex.split(slurm_cmd),
bufsize=4096,
universal_newlines=True,
stdout=out_file,
stderr=out_file,
env=exports)
except IOError as e:
raise LauncherError(f'File Error: {e}\nError while handling {self.slurm_log_file}')
except subprocess.SubprocessError as e:
raise LauncherError(f'IO Error: {e}\nSee {self.deploy.get_log_path()}')
finally:
Expand All @@ -91,13 +94,18 @@ def poll(self):

# Copy slurm job results to log file
if os.path.exists(self.slurm_log_file):
with open(self.slurm_log_file, 'r') as slurm_file:
lines = slurm_file.readlines()
with open(self.deploy.get_log_path(), 'a') as out_file:
for line in lines:
out_file.write(line)
out_file.flush()
os.remove(self.slurm_log_file)
try:
with open(self.slurm_log_file, 'r') as slurm_file:
try:
with open(self.deploy.get_log_path(), 'a') as out_file:
shutil.copyfileobj(slurm_file, out_file)
except IOError as e:
raise LauncherError(f'File Error: {e} when handling '
f'{self.deploy.get_log_path()}')
# Remove the temporary file from the slurm process
os.remove(self.slurm_log_file)
except IOError as e:
raise LauncherError(f'File Error: {e} when handling {self.slurm_log_file}')

self.exit_code = self.process.returncode
status, err_msg = self._check_status()
Expand All @@ -109,7 +117,6 @@ def kill(self):
This must be called between dispatching and reaping the process (the
same window as poll()).
'''
assert self.process is not None

Expand Down

0 comments on commit 0ead59f

Please sign in to comment.