Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dvsim] Improve error handling on slurm launcher #24958

Merged
merged 1 commit into from
Nov 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 25 additions & 18 deletions util/dvsim/SlurmLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging as log
import os
import shlex
import shutil
import subprocess

from Launcher import ErrorMessage, Launcher, LauncherError
Expand Down Expand Up @@ -46,28 +47,30 @@ def _do_launch(self):

self._dump_env_vars(exports)

# Add a command delimiter if necessary
slurm_setup_cmd = SLURM_SETUP_CMD
if slurm_setup_cmd and not slurm_setup_cmd.endswith(';'):
slurm_setup_cmd += ';'

# Encapsulate the run command with the slurm invocation
slurm_cmd = f'srun -p {SLURM_QUEUE} --mem={SLURM_MEM} --mincpus={SLURM_MINCPUS} ' \
f'--time={SLURM_TIMEOUT} --cpus-per-task={SLURM_CPUS_PER_TASK} ' \
f'bash -c "{slurm_setup_cmd} {self.deploy.cmd}"'

try:
with open(self.slurm_log_file, 'w') as out_file:
out_file.write("[Executing]:\n{}\n\n".format(self.deploy.cmd))
out_file.flush()

# Add a command delimiter if necessary
slurm_setup_cmd = SLURM_SETUP_CMD
if slurm_setup_cmd != '' and not slurm_setup_cmd.endswith(';'):
slurm_setup_cmd += ';'

# Encapsulate the run command with the slurm invocation
slurm_cmd = f'srun -p {SLURM_QUEUE} --mem={SLURM_MEM} --mincpus={SLURM_MINCPUS} ' \
f'--time={SLURM_TIMEOUT} --cpus-per-task={SLURM_CPUS_PER_TASK} ' \
f'bash -c "{slurm_setup_cmd} {self.deploy.cmd}"'

log.info(f'Executing slurm command: {slurm_cmd}')
self.process = subprocess.Popen(shlex.split(slurm_cmd),
bufsize=4096,
universal_newlines=True,
stdout=out_file,
stderr=out_file,
env=exports)
except IOError as e:
raise LauncherError(f'File Error: {e}\nError while handling {self.slurm_log_file}')
except subprocess.SubprocessError as e:
raise LauncherError(f'IO Error: {e}\nSee {self.deploy.get_log_path()}')
finally:
Expand All @@ -91,13 +94,18 @@ def poll(self):

# Copy slurm job results to log file
if os.path.exists(self.slurm_log_file):
with open(self.slurm_log_file, 'r') as slurm_file:
lines = slurm_file.readlines()
with open(self.deploy.get_log_path(), 'a') as out_file:
for line in lines:
out_file.write(line)
out_file.flush()
os.remove(self.slurm_log_file)
try:
with open(self.slurm_log_file, 'r') as slurm_file:
try:
with open(self.deploy.get_log_path(), 'a') as out_file:
shutil.copyfileobj(slurm_file, out_file)
except IOError as e:
raise LauncherError(f'File Error: {e} when handling '
f'{self.deploy.get_log_path()}')
# Remove the temporary file from the slurm process
os.remove(self.slurm_log_file)
rswarbrick marked this conversation as resolved.
Show resolved Hide resolved
except IOError as e:
raise LauncherError(f'File Error: {e} when handling {self.slurm_log_file}')

self.exit_code = self.process.returncode
status, err_msg = self._check_status()
Expand All @@ -109,7 +117,6 @@ def kill(self):

This must be called between dispatching and reaping the process (the
same window as poll()).

'''
assert self.process is not None

Expand Down
Loading