From ff68e8f964429d2a355eecbd51bc58a4742605e9 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Mon, 9 Apr 2018 16:29:31 -0700 Subject: [PATCH 01/88] Added containertasks --- sciluigi/__init__.py | 5 + sciluigi/containertask.py | 201 ++++++++++++++++++++++++++++++++++++++ sciluigi/task.py | 10 +- 3 files changed, 215 insertions(+), 1 deletion(-) create mode 100755 sciluigi/containertask.py diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py index 97a5345..01fd6bf 100644 --- a/sciluigi/__init__.py +++ b/sciluigi/__init__.py @@ -43,3 +43,8 @@ from sciluigi.util import timepath from sciluigi.util import recordfile_to_dict from sciluigi.util import dict_to_recordfile + +from sciluigi import containertask +from sciluigi.containertask import ContainerInfo +from sciluigi.containertask import ContainerTask +from sciluigi.containertask import ContainerHelpers diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py new file mode 100755 index 0000000..cfa99c6 --- /dev/null +++ b/sciluigi/containertask.py @@ -0,0 +1,201 @@ +import luigi +import sciluigi +import json +import logging +import subprocess +import docker +import os +from string import Template + + +# Setup logging +log = logging.getLogger('sciluigi-interface') + + +class ContainerInfo(): + """ + A data object to store parameters related to running a specific + tasks in a container (docker / batch / etc). Mostly around resources. + """ + + # num vcpu required + vcpu = None + # max memory (mb) + mem = None + # Env + env = None + # Timeout in seconds + timeout = None + # Local Container cache location. For things like singularity that need to pull + # And create a local container + container_cache = None + + # engine + # Docker by default. Extensible in the future for batch, slurm-singularity, etc + + def __init__(self, + vcpu=1, + mem=4096, + timeout=604800, # Seven days of seconds + container_cache='.'): + self.vcpu = vcpu + self.mem = mem + self.timeout = timeout + self.container_cache = container_cache + + def __str__(self): + """ + Return string of this information + """ + return( + "Cpu {}, Mem {} MB, timeout {} secs, and container cache {}".format( + self.vcpu, + self.mem, + self.timeout, + self.container_cache + )) + + +class ContainerInfoParameter(sciluigi.parameter.Parameter): + ''' + A specialized luigi parameter, taking ContainerInfo objects. + ''' + + def parse(self, x): + if isinstance(x, ContainerInfo): + return x + else: + log.error('parameter is not instance of ContainerInfo. It is instead {}' + .format(type(x))) + raise Exception('parameter is not instance of ContainerInfo. It is instead {}' + .format(type(x))) + + +class ContainerHelpers(): + """ + Mixin with various methods and variables for running commands in containers using (Sci)-Luigi + """ + # Other class-fields + # Resource guidance for this container at runtime. + containerinfo = ContainerInfoParameter(default=None) + + # The ID of the container (docker registry style). + container = None + # Choices include docker right now. 
Eventually we can add batch, slurm-singularity, etc + engine = 'docker' + + def map_paths_to_container(self, paths, container_base_path='/mnt'): + """ + Accepts a dictionary where the keys are identifiers for various targets + and the value is the HOST path for that target + + What this does is find a common HOST prefix + and remaps to the CONTAINER BASE PATH + + Returns a dict of the paths for the targets as they would be seen + if the common prefix is mounted within the container at the container_base_path + """ + common_prefix = os.path.commonprefix( + [os.path.dirname(p) for p in paths.values()] + ) + container_paths = { + i: os.path.join( + container_base_path, + os.path.relpath(paths[i], common_prefix)) + for i in paths + } + return os.path.abspath(common_prefix), container_paths + + def ex( + self, + command, + input_paths={}, + output_paths={}, + mounts={}, + inputs_mode='ro', + outputs_mode='rw'): + if self.engine == 'docker': + return self.ex_docker(command, input_paths, output_paths, mounts, inputs_mode, outputs_mode) + else: + raise Exception("Container engine {} is invalid".format(self.engine)) + + def ex_docker( + self, + command, + input_paths={}, + output_paths={}, + mounts={}, + inputs_mode='ro', + outputs_mode='rw'): + """ + Run command in the container using docker, with mountpoints + command is assumed to be in python template substitution format + """ + client = docker.from_env() + container_paths = {} + + if len(output_paths) > 0: + output_host_path_ca, output_container_paths = self.map_paths_to_container( + output_paths, + container_base_path='/mnt/outputs' + ) + container_paths.update(output_container_paths) + mounts[output_host_path_ca] = {'bind': '/mnt/outputs', 'mode': outputs_mode} + + if len(input_paths) > 0: + input_host_path_ca, input_container_paths = self.map_paths_to_container( + input_paths, + container_base_path='/mnt/inputs' + ) + # Handle the edge case where the common directory for inputs is equal to the outputs + if len(output_paths) > 0 and (output_host_path_ca == input_host_path_ca): + log.warn("Input and Output host paths the same {}".format(output_host_path_ca)) + # Repeat our mapping, now using the outputs path for both + input_host_path_ca, input_container_paths = self.map_paths_to_container( + input_paths, + container_base_path='/mnt/outputs' + ) + else: # output and input paths different OR there are only input paths + mounts[input_host_path_ca] = {'bind': '/mnt/inputs', 'mode': inputs_mode} + + # No matter what, add our mappings + container_paths.update(input_container_paths) + + command = Template(command).substitute(container_paths) + + try: + log.info("Attempting to run {} in {}".format( + command, + self.container + )) + stdout = client.containers.run( + image=self.container, + command=command, + volumes=mounts, + mem_limit="{}m".format(self.containerinfo.mem), + ) + log.info(stdout) + return (0, stdout, "") + except docker.errors.ContainerError as e: + log.error("Non-zero return code from the container: {}".format(e)) + return (-1, "", "") + except docker.errors.ImageNotFound: + log.error("Could not find container {}".format( + self.container) + ) + return (-1, "", "") + except docker.errors.APIError as e: + log.error("Docker Server failed {}".format(e)) + return (-1, "", "") + except Exception as e: + log.error("Unknown error occurred: {}".format(e)) + return (-1, "", "") + + +# ================================================================================ + +class ContainerTask(ContainerHelpers, sciluigi.task.Task): + ''' + luigi 
task that includes the ContainerHelpers mixin. + ''' + pass diff --git a/sciluigi/task.py b/sciluigi/task.py index 6a9e4f6..6d02eb8 100644 --- a/sciluigi/task.py +++ b/sciluigi/task.py @@ -15,12 +15,14 @@ # ============================================================================== + def new_task(name, cls, workflow_task, **kwargs): ''' Instantiate a new task. Not supposed to be used by the end-user (use WorkflowTask.new_task() instead). ''' slurminfo = None + containerinfo = None for key, val in [(key, val) for key, val in iteritems(kwargs)]: # Handle non-string keys if not isinstance(key, string_types): @@ -29,19 +31,24 @@ def new_task(name, cls, workflow_task, **kwargs): if isinstance(val, sciluigi.slurm.SlurmInfo): slurminfo = val kwargs[key] = val + if isinstance(val, sciluigi.containertask.ContainerInfo): + containerinfo = val + kwargs[key] = val elif not isinstance(val, string_types): try: - kwargs[key] = json.dumps(val) # Force conversion into string + kwargs[key] = json.dumps(val) # Force conversion into string except TypeError: kwargs[key] = str(val) kwargs['instance_name'] = name kwargs['workflow_task'] = workflow_task kwargs['slurminfo'] = slurminfo + kwargs['containerinfo'] = containerinfo newtask = cls.from_str_params(kwargs) if slurminfo is not None: newtask.slurminfo = slurminfo return newtask + class Task(sciluigi.audit.AuditTrailHelpers, sciluigi.dependencies.DependencyHelpers, luigi.Task): ''' SciLuigi Task, implementing SciLuigi specific functionality for dependency resolution @@ -88,6 +95,7 @@ def ex(self, command): # ============================================================================== + class ExternalTask( sciluigi.audit.AuditTrailHelpers, sciluigi.dependencies.DependencyHelpers, From 9dabb6d3d6a20c81683860905ec62aa91b954d7e Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Mon, 9 Apr 2018 16:43:46 -0700 Subject: [PATCH 02/88] Added engine to containerinfo --- sciluigi/containertask.py | 22 +++++++++++----------- sciluigi/task.py | 2 ++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index cfa99c6..7c515e2 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -17,7 +17,9 @@ class ContainerInfo(): A data object to store parameters related to running a specific tasks in a container (docker / batch / etc). Mostly around resources. """ - + # Which container system to use + # Docker by default. Extensible in the future for batch, slurm-singularity, etc + engine = None # num vcpu required vcpu = None # max memory (mb) @@ -30,14 +32,13 @@ class ContainerInfo(): # And create a local container container_cache = None - # engine - # Docker by default. Extensible in the future for batch, slurm-singularity, etc - def __init__(self, + engine='docker', vcpu=1, mem=4096, timeout=604800, # Seven days of seconds container_cache='.'): + self.engine = engine self.vcpu = vcpu self.mem = mem self.timeout = timeout @@ -48,7 +49,8 @@ def __str__(self): Return string of this information """ return( - "Cpu {}, Mem {} MB, timeout {} secs, and container cache {}".format( + "{} with Cpu {}, Mem {} MB, timeout {} secs, and container cache {}".format( + self.engine, self.vcpu, self.mem, self.timeout, @@ -81,18 +83,16 @@ class ContainerHelpers(): # The ID of the container (docker registry style). container = None - # Choices include docker right now. 
Eventually we can add batch, slurm-singularity, etc - engine = 'docker' def map_paths_to_container(self, paths, container_base_path='/mnt'): """ Accepts a dictionary where the keys are identifiers for various targets and the value is the HOST path for that target - What this does is find a common HOST prefix + What this does is find a common HOST prefix and remaps to the CONTAINER BASE PATH - Returns a dict of the paths for the targets as they would be seen + Returns a dict of the paths for the targets as they would be seen if the common prefix is mounted within the container at the container_base_path """ common_prefix = os.path.commonprefix( @@ -114,10 +114,10 @@ def ex( mounts={}, inputs_mode='ro', outputs_mode='rw'): - if self.engine == 'docker': + if self.containerinfo.engine == 'docker': return self.ex_docker(command, input_paths, output_paths, mounts, inputs_mode, outputs_mode) else: - raise Exception("Container engine {} is invalid".format(self.engine)) + raise Exception("Container engine {} is invalid".format(self.containerinfo.engine)) def ex_docker( self, diff --git a/sciluigi/task.py b/sciluigi/task.py index 6d02eb8..42609e4 100644 --- a/sciluigi/task.py +++ b/sciluigi/task.py @@ -46,6 +46,8 @@ def new_task(name, cls, workflow_task, **kwargs): newtask = cls.from_str_params(kwargs) if slurminfo is not None: newtask.slurminfo = slurminfo + if containerinfo is not None: + newtask.containerinfo = containerinfo return newtask From 3c3e878a80295ff8efbaf7046386a8a57b94ab66 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Mon, 9 Apr 2018 16:55:55 -0700 Subject: [PATCH 03/88] added singularity_slurm engine basics --- sciluigi/containertask.py | 90 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 7c515e2..6cd78ac 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -115,10 +115,98 @@ def ex( inputs_mode='ro', outputs_mode='rw'): if self.containerinfo.engine == 'docker': - return self.ex_docker(command, input_paths, output_paths, mounts, inputs_mode, outputs_mode) + return self.ex_docker( + command, + input_paths, + output_paths, + mounts, + inputs_mode, + outputs_mode + ) + elif self.containerinfo.engine == 'singularity_slurm': + return self.ex_singularity_slurm( + command, + input_paths, + output_paths, + mounts, + inputs_mode, + outputs_mode + ) else: raise Exception("Container engine {} is invalid".format(self.containerinfo.engine)) + def ex_singularity_slurm( + self, + command, + input_paths={}, + output_paths={}, + mounts={}, + inputs_mode='ro', + outputs_mode='rw'): + """ + Run command in the container using docker, with mountpoints + command is assumed to be in python template substitution format + """ + client = docker.from_env() + container_paths = {} + + if len(output_paths) > 0: + output_host_path_ca, output_container_paths = self.map_paths_to_container( + output_paths, + container_base_path='/mnt/outputs' + ) + container_paths.update(output_container_paths) + mounts[output_host_path_ca] = {'bind': '/mnt/outputs', 'mode': outputs_mode} + + if len(input_paths) > 0: + input_host_path_ca, input_container_paths = self.map_paths_to_container( + input_paths, + container_base_path='/mnt/inputs' + ) + # Handle the edge case where the common directory for inputs is equal to the outputs + if len(output_paths) > 0 and (output_host_path_ca == input_host_path_ca): + log.warn("Input and Output host paths the same {}".format(output_host_path_ca)) + # Repeat 
our mapping, now using the outputs path for both + input_host_path_ca, input_container_paths = self.map_paths_to_container( + input_paths, + container_base_path='/mnt/outputs' + ) + else: # output and input paths different OR there are only input paths + mounts[input_host_path_ca] = {'bind': '/mnt/inputs', 'mode': inputs_mode} + + # No matter what, add our mappings + container_paths.update(input_container_paths) + + command = Template(command).substitute(container_paths) + + try: + log.info("Attempting to run {} in {}".format( + command, + self.container + )) + stdout = client.containers.run( + image=self.container, + command=command, + volumes=mounts, + mem_limit="{}m".format(self.containerinfo.mem), + ) + log.info(stdout) + return (0, stdout, "") + except docker.errors.ContainerError as e: + log.error("Non-zero return code from the container: {}".format(e)) + return (-1, "", "") + except docker.errors.ImageNotFound: + log.error("Could not find container {}".format( + self.container) + ) + return (-1, "", "") + except docker.errors.APIError as e: + log.error("Docker Server failed {}".format(e)) + return (-1, "", "") + except Exception as e: + log.error("Unknown error occurred: {}".format(e)) + return (-1, "", "") + def ex_docker( self, command, From bcbb8f5843c40cb3d0089b931101256937d11075 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 10 Apr 2018 10:32:47 -0700 Subject: [PATCH 04/88] Added bucket_command_wrapper.py to tools --- .gitignore | 0 LICENSE | 0 MANIFEST.in | 0 README.md | 0 README.rst | 0 sciluigi/__init__.py | 0 sciluigi/audit.py | 0 sciluigi/containertask.py | 89 ++++++++++++++++++++++----------- sciluigi/dependencies.py | 0 sciluigi/interface.py | 0 sciluigi/parameter.py | 0 sciluigi/slurm.py | 0 sciluigi/task.py | 0 sciluigi/util.py | 0 sciluigi/workflow.py | 0 setup.py | 0 test/test_dependencies.py | 0 test/test_paramval.py | 0 tools/bucket_command_wrapper.py | 53 ++++++++++++++++++++ 19 files changed, 114 insertions(+), 28 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 LICENSE mode change 100644 => 100755 MANIFEST.in mode change 100644 => 100755 README.md mode change 100644 => 100755 README.rst mode change 100644 => 100755 sciluigi/__init__.py mode change 100644 => 100755 sciluigi/audit.py mode change 100644 => 100755 sciluigi/dependencies.py mode change 100644 => 100755 sciluigi/interface.py mode change 100644 => 100755 sciluigi/parameter.py mode change 100644 => 100755 sciluigi/slurm.py mode change 100644 => 100755 sciluigi/task.py mode change 100644 => 100755 sciluigi/util.py mode change 100644 => 100755 sciluigi/workflow.py mode change 100644 => 100755 setup.py mode change 100644 => 100755 test/test_dependencies.py mode change 100644 => 100755 test/test_paramval.py create mode 100755 tools/bucket_command_wrapper.py diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/MANIFEST.in b/MANIFEST.in old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/README.rst b/README.rst old mode 100644 new mode 100755 diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py old mode 100644 new mode 100755 diff --git a/sciluigi/audit.py b/sciluigi/audit.py old mode 100644 new mode 100755 diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 6cd78ac..75f09ee 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -6,7 +6,7 @@ import docker import os from 
string import Template - +import shlex # Setup logging log = logging.getLogger('sciluigi-interface') @@ -106,6 +106,15 @@ def map_paths_to_container(self, paths, container_base_path='/mnt'): } return os.path.abspath(common_prefix), container_paths + def make_fs_name(self, uri): + uri_list = uri.split('://') + if len(uri_list) == 1: + name = uri_list[0] + else: + name = uri_list[1] + keepcharacters = ('.', '_') + return "".join(c if (c.isalnum() or c in keepcharacters) else '_' for c in name).rstrip() + def ex( self, command, @@ -144,10 +153,9 @@ def ex_singularity_slurm( inputs_mode='ro', outputs_mode='rw'): """ - Run command in the container using docker, with mountpoints + Run command in the container using singularity, with mountpoints command is assumed to be in python template substitution format """ - client = docker.from_env() container_paths = {} if len(output_paths) > 0: @@ -177,35 +185,60 @@ def ex_singularity_slurm( # No matter what, add our mappings container_paths.update(input_container_paths) - command = Template(command).substitute(container_paths) + img_location = os.path.join( + self.containerinfo.container_cache, + "{}.singularity.img".format(self.make_fs_name(self.container)) + ) + log.info("Looking for singularity image {}".format(img_location)) + if not os.path.exists(img_location): + log.info("No image at {} Creating....".format(img_location)) + try: + os.makedirs(os.path.dirname(img_location)) + except FileExistsError: + # No big deal + pass + # Singularity is dumb and can only pull images to the working dir + # So, get our current working dir. + cwd = os.getcwd() + # Move to our target dir + os.chdir(os.path.dirname(img_location)) + # Attempt to pull our image + pull_proc = subprocess.run( + [ + 'singularity', + 'pull', + '--name', + os.path.basename(img_location), + self.container + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + print(pull_proc) + # Move back + os.chdir(cwd) - try: - log.info("Attempting to run {} in {}".format( + command = Template(command).substitute(container_paths) + log.info("Attempting to run {} in {}".format( command, self.container )) - stdout = client.containers.run( - image=self.container, - command=command, - volumes=mounts, - mem_limit="{}m".format(self.containerinfo.mem), - ) - log.info(stdout) - return (0, stdout, "") - except docker.errors.ContainerError as e: - log.error("Non-zero return code from the container: {}".format(e)) - return (-1, "", "") - except docker.errors.ImageNotFound: - log.error("Could not find container {}".format( - self.container) - ) - return (-1, "", "") - except docker.errors.APIError as e: - log.error("Docker Server failed {}".format(e)) - return (-1, "", "") - except Exception as e: - log.error("Unknown error occurred: {}".format(e)) - return (-1, "", "") + + command_list = [ + 'singularity', 'exec' + ] + for mp in mounts: + command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] + command_list.append(img_location) + command_list += shlex.split(command) + command_proc = subprocess.run( + command_list, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + log.info(command_proc.stdout) + if command_proc.stderr: + log.warn(command_proc.stderr) def ex_docker( self, diff --git a/sciluigi/dependencies.py b/sciluigi/dependencies.py old mode 100644 new mode 100755 diff --git a/sciluigi/interface.py b/sciluigi/interface.py old mode 100644 new mode 100755 diff --git a/sciluigi/parameter.py b/sciluigi/parameter.py old mode 100644 new mode 100755 diff --git 
a/sciluigi/slurm.py b/sciluigi/slurm.py old mode 100644 new mode 100755 diff --git a/sciluigi/task.py b/sciluigi/task.py old mode 100644 new mode 100755 diff --git a/sciluigi/util.py b/sciluigi/util.py old mode 100644 new mode 100755 diff --git a/sciluigi/workflow.py b/sciluigi/workflow.py old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/test/test_dependencies.py b/test/test_dependencies.py old mode 100644 new mode 100755 diff --git a/test/test_paramval.py b/test/test_paramval.py old mode 100644 new mode 100755 diff --git a/tools/bucket_command_wrapper.py b/tools/bucket_command_wrapper.py new file mode 100755 index 0000000..6525001 --- /dev/null +++ b/tools/bucket_command_wrapper.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +import argparse +import os + +# +# Script to be placed in containers to help with making mount points +# and pulling / pushing to buckets (eg S3) to accomodate utilities +# that are unaware of S3 / buckets in containers +# (to run on things like AWS Batch) +# + +def build_parser(): + parser = argparse.ArgumentParser(description=""" + Wrapper to pull from buckets, run a command, and push back to buckets.""") + parser.add_argument( + '--command', + '-c', + type=str, + help=""" + Command to be run AFTER downloads BEFORE uploads. + Will be passed unaltered as a shell command.""" + ) + parser.add_argument( + '--download-files', + '-DF', + nargs='+', + help="""Format is + bucket_file_uri:container_path:mode + Where mode can be 'ro' or 'rw'. + If 'rw' the file will be pushed back to the bucket after the command + IF 'ro, the file will only be pulled from the bucket + e.g: s3://bucket/key/path.txt:/mnt/inputs/path.txt:ro""", + ) + parser.add_argument( + '--upload-files', + '-UF', + nargs='+', + help="""Format is + container_path:bucket_file_uri + Mode is presumed to be w. 
(If you want rw / a / use input in mode 'rw') + e.g: /mnt/outputs/path.txt:s3://bucket/key/path.txt""", + ) + + return parser + + +def main(): + """Entrypoint for main script.""" + parser = build_parser() + args = parser.parse_args() + +if __name__ == "__main__": + main() \ No newline at end of file From 8e763da215aae82dbda4a7dbeee35da76ba5bc6d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 11 Apr 2018 09:00:41 -0700 Subject: [PATCH 05/88] Pre pull commit --- tools/.logging.conf.template | 0 tools/init_projdir.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/.logging.conf.template mode change 100644 => 100755 tools/init_projdir.py diff --git a/tools/.logging.conf.template b/tools/.logging.conf.template old mode 100644 new mode 100755 diff --git a/tools/init_projdir.py b/tools/init_projdir.py old mode 100644 new mode 100755 From cccc0ac26fd84039828b27fa695b146f7317ec7e Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 11 Apr 2018 11:00:36 -0700 Subject: [PATCH 06/88] Working bucket_command_wrapper.py --- tools/bucket_command_wrapper.py | 238 +++++++++++++++++++++++++++----- 1 file changed, 206 insertions(+), 32 deletions(-) diff --git a/tools/bucket_command_wrapper.py b/tools/bucket_command_wrapper.py index 6525001..e6a0297 100755 --- a/tools/bucket_command_wrapper.py +++ b/tools/bucket_command_wrapper.py @@ -1,6 +1,9 @@ #!/usr/bin/env python import argparse import os +import sys +import re +import subprocess # # Script to be placed in containers to help with making mount points @@ -9,45 +12,216 @@ # (to run on things like AWS Batch) # -def build_parser(): - parser = argparse.ArgumentParser(description=""" - Wrapper to pull from buckets, run a command, and push back to buckets.""") - parser.add_argument( - '--command', - '-c', - type=str, - help=""" - Command to be run AFTER downloads BEFORE uploads. - Will be passed unaltered as a shell command.""" + +class BCW(): + INPUT_RE = re.compile( + r'^(?P\w+)://(?P[^/]+)/(?P.+?(?P[^/]+))::(?P[^\0]+)::(?Prw|ro)$' ) - parser.add_argument( - '--download-files', - '-DF', - nargs='+', - help="""Format is - bucket_file_uri:container_path:mode - Where mode can be 'ro' or 'rw'. - If 'rw' the file will be pushed back to the bucket after the command - IF 'ro, the file will only be pulled from the bucket - e.g: s3://bucket/key/path.txt:/mnt/inputs/path.txt:ro""", + OUTPUT_RE = re.compile( + r'^(?P[^\0]+)::(?P\w+)://(?P[^/]+)/(?P.+?(?P[^/]+))$' ) - parser.add_argument( - '--upload-files', - '-UF', - nargs='+', - help="""Format is - container_path:bucket_file_uri - Mode is presumed to be w. (If you want rw / a / use input in mode 'rw') - e.g: /mnt/outputs/path.txt:s3://bucket/key/path.txt""", + VALID_BUCKET_PROVIDERS = ( + 's3', ) - return parser + def __init__(self): + parser = self.build_parser() + args = parser.parse_args() + + self.command = args.command + + if not args.download_files: + self.download_files = [] + else: + self.download_files = self.parse_download_files(args.download_files) + + if not args.upload_files: + self.upload_files = [] + else: + self.upload_files = self.parse_upload_files(args.upload_files) + + # Download from the bucket + self.download_files_from_bucket() + + # Run the command + subprocess.run( + args.command, + shell=True + ) + + # Upload files + self.upload_files_to_bucket() + + def build_parser(self): + parser = argparse.ArgumentParser(description=""" + Wrapper to pull from buckets, run a command, and push back to buckets. 
+ example: + bucket_command_wrapper.py -c 'echo hello' \ + -DF s3://bucket/key/path.txt::/mnt/inputs/path.txt::rw \ + s3://bucket/key/path2.txt::/mnt/inputs/path2.txt::ro \ + -UF /mnt/outputs/path.txt::s3://bucket/key/path.txt + """) + + if len(sys.argv) < 2: + parser.print_help() + + # Implicit else + parser.add_argument( + '--command', + '-c', + type=str, + required=True, + help=""" + Command to be run AFTER downloads BEFORE uploads. + Please enclose in quotes. + Will be passed unaltered as a shell command.""" + ) + parser.add_argument( + '--download-files', + '-DF', + nargs='+', + help="""Format is + bucket_file_uri::container_path::mode + Where mode can be 'ro' or 'rw'. + If 'rw' the file will be pushed back to the bucket after the command + IF 'ro, the file will only be pulled from the bucket + e.g: s3://bucket/key/path.txt::/mnt/inputs/path.txt::ro""", + ) + parser.add_argument( + '--upload-files', + '-UF', + nargs='+', + help="""Format is + container_path::bucket_file_uri + Mode is presumed to be w. (If you want rw / a / use input in mode 'rw') + e.g: /mnt/outputs/path.txt::s3://bucket/key/path.txt""", + ) + + return parser + + def parse_upload_files(self, raw_upload_files): + upload_files = [] + for d in raw_upload_files: + m = self.OUTPUT_RE.search(d.strip()) + if not m: + raise Exception("Invalid upload file {}".format(d)) + # Implicit else + bucket_provider = m.group('bucket_provider').lower() + bucket = m.group('bucket') + key = m.group('key') + bucket_fn = m.group('bucket_fn') + container_path = os.path.abspath(m.group('container_path')) + + # Be sure this is one of the providers we know how to handle + if bucket_provider not in self.VALID_BUCKET_PROVIDERS: + raise Exception("Invalid bucket provider {}. Valid choices are {}".format( + bucket_provider, + ", ".join(self.VALID_BUCKET_PROVIDERS) + )) + + # Be sure we can create the path to the proposed container mount point + try: + os.makedirs( + os.path.dirname( + container_path + ) + ) + except FileExistsError: + # Fine if this path already exists + pass + + upload_files.append({ + 'bucket_provider': bucket_provider, + 'bucket': bucket, + 'key': key, + 'bucket_fn': bucket_fn, + 'container_path': container_path, + }) + return(upload_files) + + def parse_download_files(self, raw_download_files): + download_files = [] + for d in raw_download_files: + m = self.INPUT_RE.search(d.strip()) + if not m: + raise Exception("Invalid download file {}".format(d)) + # Implicit else + bucket_provider = m.group('bucket_provider') + bucket = m.group('bucket') + key = m.group('key') + bucket_fn = m.group('bucket_fn') + container_path = os.path.abspath(m.group('container_path')) + mode = m.group('mode').lower() + + # Be sure this is one of the providers we know how to handle + if bucket_provider not in self.VALID_BUCKET_PROVIDERS: + raise Exception("Invalid bucket provider {}. 
Valid choices are {}".format( + bucket_provider, + ", ".join(self.VALID_BUCKET_PROVIDERS) + )) + + # Be sure we can create the path to the proposed container mount point + try: + os.makedirs( + os.path.dirname( + container_path + ) + ) + except FileExistsError: + # Fine if this path already exists + pass + + download_files.append({ + 'bucket_provider': bucket_provider, + 'bucket': bucket, + 'key': key, + 'bucket_fn': bucket_fn, + 'container_path': container_path, + 'mode': mode, + }) + return(download_files) + + def download_file_s3(self, df): + import boto3 + s3_client = boto3.client('s3') + s3_client.download_file( + Bucket=df['bucket'], + Key=df['key'], + Filename=df['container_path'], + ) + + def download_files_from_bucket(self): + for df in self.download_files: + if df['bucket_provider'] == 's3': + self.download_file_s3(df) + else: + raise Exception("Invalid bucket provider {}".format( + df['bucket_provider']) + ) + + def upload_file_s3(self, df): + import boto3 + s3_client = boto3.client('s3') + s3_client.upload_file( + Bucket=df['bucket'], + Key=df['key'], + Filename=df['container_path'], + ) + + def upload_files_to_bucket(self): + upload_files = self.upload_files+[f for f in self.download_files if f['mode']=='rw'] + for df in upload_files: + if df['bucket_provider'] == 's3': + self.upload_file_s3(df) + else: + raise Exception("Invalid bucket provider {}".format( + df['bucket_provider']) + ) def main(): """Entrypoint for main script.""" - parser = build_parser() - args = parser.parse_args() + BCW() if __name__ == "__main__": - main() \ No newline at end of file + main() From f771ba070b55f30df82681ef6c0f8059f1bcec08 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 12 Apr 2018 10:31:34 -0700 Subject: [PATCH 07/88] Changed BCW to allow repeated file commands and accept command as env variable --- tools/bucket_command_wrapper.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/tools/bucket_command_wrapper.py b/tools/bucket_command_wrapper.py index e6a0297..d6adbc6 100755 --- a/tools/bucket_command_wrapper.py +++ b/tools/bucket_command_wrapper.py @@ -28,17 +28,35 @@ def __init__(self): parser = self.build_parser() args = parser.parse_args() - self.command = args.command + if args.command: + self.command = args.command + elif os.environ.get('bcw_command'): + self.command = os.environ.get('bcw_command').strip() + else: + raise Exception(""" + No command provided on command line or as an environmental variable (bcw_command) + """) + + print(type(args.download_files)) + print(args.download_files) if not args.download_files: self.download_files = [] else: - self.download_files = self.parse_download_files(args.download_files) + raw_download_files = [f.strip() for f in args.download_files if f.strip() != ""] + if len(raw_download_files) > 0: + self.download_files = self.parse_download_files(raw_download_files) + else: + self.download_files = [] if not args.upload_files: self.upload_files = [] else: - self.upload_files = self.parse_upload_files(args.upload_files) + raw_upload_files = [f.strip() for f in args.upload_files if f.strip() != ""] + if len(raw_upload_files) > 0: + self.upload_files = self.parse_upload_files(raw_upload_files) + else: + self.upload_files = [] # Download from the bucket self.download_files_from_bucket() @@ -70,16 +88,16 @@ def build_parser(self): '--command', '-c', type=str, - required=True, help=""" Command to be run AFTER downloads BEFORE uploads. Please enclose in quotes. 
- Will be passed unaltered as a shell command.""" + Will be passed unaltered as a shell command. + Can also be provided as an environmental variable bcw_command""" ) parser.add_argument( '--download-files', '-DF', - nargs='+', + action='append', help="""Format is bucket_file_uri::container_path::mode Where mode can be 'ro' or 'rw'. @@ -90,7 +108,7 @@ def build_parser(self): parser.add_argument( '--upload-files', '-UF', - nargs='+', + action='append', help="""Format is container_path::bucket_file_uri Mode is presumed to be w. (If you want rw / a / use input in mode 'rw') From 52f0426cd4ba93b6c454994ad9b87a9225874437 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 12 Apr 2018 12:15:57 -0700 Subject: [PATCH 08/88] Added ContainerTarget --- sciluigi/dependencies.py | 52 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/sciluigi/dependencies.py b/sciluigi/dependencies.py index 2a83454..2f92b1b 100755 --- a/sciluigi/dependencies.py +++ b/sciluigi/dependencies.py @@ -8,8 +8,47 @@ from luigi.contrib.s3 import S3Target from luigi.six import iteritems +try: + from urlparse import urlsplit +except ImportError: + from urllib.parse import urlsplit + # ============================================================================== + +class ContainerTargetInfo(object): + ''' + Class to be used for sending specification of which target, from which + task, to use, when stitching workflow tasks' outputs and inputs together. + Accepts a url as a path, and then can properly create the proper target type + for a given scheme (e.g. s3, file, etc) + ''' + task = None + path = None + target = None + scheme = None + + def __init__(self, task, path, format=None, is_tmp=False, client=None): + self.task = task + self.path = path + sr = urlsplit(path) + self.scheme = sr.scheme + + if sr.scheme == 's3': + self.target = S3Target(path, format=format, client=client) + elif sr.scheme == 'file' or sr.scheme == '': + self.target = luigi.LocalTarget(path, format, is_tmp) + self.scheme = 'file' + else: + raise ValueError("URL scheme {} is not supported".format(sr.scheme)) + + def open(self, *args, **kwargs): + ''' + Forward open method, from target class + ''' + return self.target.open(*args, **kwargs) + + class TargetInfo(object): ''' Class to be used for sending specification of which target, from which @@ -32,6 +71,7 @@ def open(self, *args, **kwargs): # ============================================================================== + class S3TargetInfo(TargetInfo): def __init__(self, task, path, format=None, client=None): self.task = task @@ -40,6 +80,7 @@ def __init__(self, task, path, format=None, client=None): # ============================================================================== + class PostgresTargetInfo(TargetInfo): def __init__(self, task, host, database, user, password, update_id, table=None, port=None): self.task = task @@ -50,10 +91,19 @@ def __init__(self, task, host, database, user, password, update_id, table=None, self.update_id = update_id self.table = table self.port = port - self.target = PostgresTarget(host=host, database=database, user=user, password=password, table=table, update_id=update_id, port=port) + self.target = PostgresTarget( + host=host, + database=database, + user=user, + password=password, + table=table, + update_id=update_id, + port=port + ) # ============================================================================== + class DependencyHelpers(object): ''' Mixin implementing methods for supporting dynamic, and 
target-based From 034765374ee7a731f4edefe6a98db5d16b4a3631 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 12 Apr 2018 12:16:50 -0700 Subject: [PATCH 09/88] Made filesytem path just path component of url --- sciluigi/dependencies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sciluigi/dependencies.py b/sciluigi/dependencies.py index 2f92b1b..eb58da8 100755 --- a/sciluigi/dependencies.py +++ b/sciluigi/dependencies.py @@ -39,6 +39,7 @@ def __init__(self, task, path, format=None, is_tmp=False, client=None): elif sr.scheme == 'file' or sr.scheme == '': self.target = luigi.LocalTarget(path, format, is_tmp) self.scheme = 'file' + self.path = sr.path else: raise ValueError("URL scheme {} is not supported".format(sr.scheme)) From 93737401ef24924b7fefb4bded604155b42a5340 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 12 Apr 2018 12:25:48 -0700 Subject: [PATCH 10/88] Added ContainerTaskInfo to base import --- sciluigi/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py index 01fd6bf..e81acf3 100755 --- a/sciluigi/__init__.py +++ b/sciluigi/__init__.py @@ -9,6 +9,7 @@ from sciluigi.audit import AuditTrailHelpers from sciluigi import dependencies +from sciluigi.dependencies import ContainerTargetInfo from sciluigi.dependencies import TargetInfo from sciluigi.dependencies import S3TargetInfo from sciluigi.dependencies import DependencyHelpers From 09e9032eacedf8f15bc7602e5d4741e550d6d9d6 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 12 Apr 2018 12:31:49 -0700 Subject: [PATCH 11/88] Made containertaskinfo inherit from taskinfo --- sciluigi/dependencies.py | 50 ++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/sciluigi/dependencies.py b/sciluigi/dependencies.py index eb58da8..a41d5bd 100755 --- a/sciluigi/dependencies.py +++ b/sciluigi/dependencies.py @@ -16,59 +16,53 @@ # ============================================================================== -class ContainerTargetInfo(object): +class TargetInfo(object): ''' Class to be used for sending specification of which target, from which task, to use, when stitching workflow tasks' outputs and inputs together. - Accepts a url as a path, and then can properly create the proper target type - for a given scheme (e.g. s3, file, etc) ''' task = None path = None target = None - scheme = None - def __init__(self, task, path, format=None, is_tmp=False, client=None): + def __init__(self, task, path, format=None, is_tmp=False): self.task = task self.path = path - sr = urlsplit(path) - self.scheme = sr.scheme - - if sr.scheme == 's3': - self.target = S3Target(path, format=format, client=client) - elif sr.scheme == 'file' or sr.scheme == '': - self.target = luigi.LocalTarget(path, format, is_tmp) - self.scheme = 'file' - self.path = sr.path - else: - raise ValueError("URL scheme {} is not supported".format(sr.scheme)) + self.target = luigi.LocalTarget(path, format, is_tmp) def open(self, *args, **kwargs): ''' - Forward open method, from target class + Forward open method, from luigi's target class ''' return self.target.open(*args, **kwargs) +# ============================================================================== -class TargetInfo(object): + +class ContainerTargetInfo(TargetInfo): ''' Class to be used for sending specification of which target, from which task, to use, when stitching workflow tasks' outputs and inputs together. 
+ Accepts a url as a path, and then can properly create the proper target type + for a given scheme (e.g. s3, file, etc) ''' - task = None - path = None - target = None + scheme = None - def __init__(self, task, path, format=None, is_tmp=False): + def __init__(self, task, path, format=None, is_tmp=False, client=None): self.task = task self.path = path - self.target = luigi.LocalTarget(path, format, is_tmp) + sr = urlsplit(path) + self.scheme = sr.scheme + + if sr.scheme == 's3': + self.target = S3Target(path, format=format, client=client) + elif sr.scheme == 'file' or sr.scheme == '': + self.target = luigi.LocalTarget(path, format, is_tmp) + self.scheme = 'file' + self.path = sr.path + else: + raise ValueError("URL scheme {} is not supported".format(sr.scheme)) - def open(self, *args, **kwargs): - ''' - Forward open method, from luigi's target class - ''' - return self.target.open(*args, **kwargs) # ============================================================================== From c2e630afefc440108c5943e6540465a2aa484b18 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 13 Apr 2018 10:21:25 -0700 Subject: [PATCH 12/88] Partially completed AWS batch engine --- sciluigi/containertask.py | 90 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 75f09ee..2382996 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -7,6 +7,7 @@ import os from string import Template import shlex +import uuid # Setup logging log = logging.getLogger('sciluigi-interface') @@ -26,23 +27,29 @@ class ContainerInfo(): mem = None # Env env = None - # Timeout in seconds + # Timeout in minutes timeout = None # Local Container cache location. For things like singularity that need to pull # And create a local container container_cache = None + # AWS specific stuff + aws_jobRoleArn = None + def __init__(self, engine='docker', vcpu=1, mem=4096, timeout=604800, # Seven days of seconds - container_cache='.'): + container_cache='.', + aws_jobRoleArn='', + ): self.engine = engine self.vcpu = vcpu self.mem = mem self.timeout = timeout self.container_cache = container_cache + self.aws_jobRoleArn = aws_jobRoleArn def __str__(self): """ @@ -132,6 +139,15 @@ def ex( inputs_mode, outputs_mode ) + elif self.containerinfo.engine == 'aws_batch': + return self.ex_aws_batch( + command, + input_paths, + output_paths, + mounts, + inputs_mode, + outputs_mode + ) elif self.containerinfo.engine == 'singularity_slurm': return self.ex_singularity_slurm( command, @@ -240,6 +256,76 @@ def ex_singularity_slurm( if command_proc.stderr: log.warn(command_proc.stderr) + def ex_aws_batch( + self, + command, + input_paths={}, + output_paths={}, + mounts={}, + inputs_mode='ro', + outputs_mode='rw'): + """ + Run a command in a container using AWS batch. + Handles uploading of files to / from s3 and then into the container. 
+ Assumes the container has batch_command_wrapper.py + """ + # + # The steps: + # 1) Register / retrieve the job definition + # 2) Upload local input files to S3 scratch bucket/key + # 3) submit the job definition with parameters filled with this specific command + # 4) Retrieve the output paths from the s3 scratch bucket / key + # + + # Only import AWS libs as needed + import boto3 + batch_client = boto3.client('batch') + s3_client = boto3.client('s3') + + # 1) Register / retrieve job definition + + # Make a UUID based on the container / command + job_def_name = "sl_containertask__{}".format( + uuid.uuid5(uuid.NAMESPACE_URL, self.container+command) + ) + + # Search to see if this job is ALREADY defined. + job_def_search = batch_client.describe_job_definitions( + maxResults=1, + jobDefinitionName=job_def_name, + ) + if len(job_def_search['jobDefinitions']) == 0: + # Not registered yet. Register it now + log.info('Registering job definition for {} in {} under name {}'.format( + command, + self.container, + job_def_name, + )) + batch_client.register_job_definition( + jobDefinitionName=job_def_name, + type='container', + containerProperties={ + 'image': self.container, + 'vcpus': 123, + 'memory': 123, + 'command': shlex.split(command), + 'jobRoleArn': self.containerinfo.aws_jobRoleArn, + }, + timeout={ + 'attemptDurationSeconds': self.containerinfo.timeout * 60 + } + ) + else: # Already registered + aws_job_def = job_def_search['jobDefinitions'][0] + log.info('Found job definition for {} in {} under name {}'.format( + aws_job_def['containerProperties']['command'], + aws_job_def['containerProperties']['image'], + job_def_name, + )) + + + + def ex_docker( self, command, From 652df087788d3eaf4f054123aac4e7a1e1517950 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 13 Apr 2018 10:29:44 -0700 Subject: [PATCH 13/88] working on aws batch --- sciluigi/containertask.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 2382996..92e54ac 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -35,6 +35,7 @@ class ContainerInfo(): # AWS specific stuff aws_jobRoleArn = None + aws_s3_scratch_loc = None def __init__(self, engine='docker', @@ -43,6 +44,7 @@ def __init__(self, timeout=604800, # Seven days of seconds container_cache='.', aws_jobRoleArn='', + aws_s3_scratch_loc='', ): self.engine = engine self.vcpu = vcpu @@ -50,6 +52,7 @@ def __init__(self, self.timeout = timeout self.container_cache = container_cache self.aws_jobRoleArn = aws_jobRoleArn + self.aws_s3_scratch_loc = aws_s3_scratch_loc def __str__(self): """ @@ -282,6 +285,11 @@ def ex_aws_batch( batch_client = boto3.client('batch') s3_client = boto3.client('s3') + # First a bit of file mapping / uploading of input items + for (key, path) in input_paths.items(): + + print(key, "::", path) + # 1) Register / retrieve job definition # Make a UUID based on the container / command From a594b5f5cb084c2f87c6e2400aa878f3a30a916c Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Mon, 16 Apr 2018 10:03:44 -0700 Subject: [PATCH 14/88] Working towards complete AWS-batch engine --- sciluigi/containertask.py | 75 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 5 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 92e54ac..3705af3 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -8,6 +8,10 @@ from string import Template import shlex import uuid +try: + from urlparse import urlsplit, 
urljoin +except ImportError: + from urllib.parse import urlsplit, urljoin # Setup logging log = logging.getLogger('sciluigi-interface') @@ -274,8 +278,8 @@ def ex_aws_batch( """ # # The steps: - # 1) Register / retrieve the job definition - # 2) Upload local input files to S3 scratch bucket/key + # 1) Upload local input files to S3 scratch bucket/key + # 2) Register / retrieve the job definition # 3) submit the job definition with parameters filled with this specific command # 4) Retrieve the output paths from the s3 scratch bucket / key # @@ -285,12 +289,73 @@ def ex_aws_batch( batch_client = boto3.client('batch') s3_client = boto3.client('s3') - # First a bit of file mapping / uploading of input items + run_uuid = uuid.uuid4() + + # 1. First a bit of file mapping / uploading of input items + s3_input_paths = {} + need_s3_uploads = set() for (key, path) in input_paths.items(): + # First split the path, to see which scheme it is + path_split = urlsplit(path) + if path_split.scheme == 's3': + # Nothing to do. Already an S3 path. + s3_input_paths[key] = path + elif path_split.scheme == 'file' or path_split.scheme == '': + # File path. Will need to upload to S3 to a temporary key within a bucket + need_s3_uploads.add((key, path_split)) + else: + raise ValueError("File storage scheme {} is not supported".format( + path_split.scheme + )) + + input_common_prefix = os.path.commonpath([ + os.path.dirname(os.path.abspath(ps[1].path)) + for ps in need_s3_uploads + ]) + for k, ps in need_s3_uploads: + s3_file_temp_path = "{}{}/{}".format( + self.containerinfo.aws_s3_scratch_loc, + run_uuid, + os.path.relpath(ps.path, input_common_prefix) + ) + s3_input_paths[k] = urlsplit(s3_file_temp_path) + s3_client.upload_file( + Filename=input_paths[k], + Bucket=s3_input_paths[k].netloc, + Key=s3_input_paths[k].path + ) - print(key, "::", path) + # While we are at it, make mappings for our outputs. + s3_output_paths = {} + need_s3_downloads = set() + + for (key, path) in output_paths.items(): + # First split the path, to see which scheme it is + path_split = urlsplit(path) + if path_split.scheme == 's3': + # Nothing to do. Already an S3 path. + s3_output_paths[key] = path + elif path_split.scheme == 'file' or path_split.scheme == '': + # File path. Will need to upload to S3 to a temporary key within a bucket + need_s3_downloads.add((key, path_split)) + else: + raise ValueError("File storage scheme {} is not supported".format( + path_split.scheme + )) + output_common_prefix = os.path.commonpath([ + os.path.dirname(os.path.abspath(ps[1].path)) + for ps in need_s3_downloads + ]) + + for k, ps in need_s3_downloads: + s3_file_temp_path = "{}{}/{}".format( + self.containerinfo.aws_s3_scratch_loc, + run_uuid, + os.path.relpath(ps.path, output_common_prefix) + ) + s3_output_paths[k] = urlsplit(s3_file_temp_path) - # 1) Register / retrieve job definition + # 2) Register / retrieve job definition # Make a UUID based on the container / command job_def_name = "sl_containertask__{}".format( From c141ef8d8d81d0e66375c75099fdad763f2209c4 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 20 Apr 2018 09:12:06 -0700 Subject: [PATCH 15/88] Pre work --- sciluigi/containertask.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 3705af3..aef7a0a 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -292,11 +292,16 @@ def ex_aws_batch( run_uuid = uuid.uuid4() # 1. 
First a bit of file mapping / uploading of input items + # We need mappings for both two and from S3 and from S3 to within the container + # <-> <-> + container_paths = {} s3_input_paths = {} need_s3_uploads = set() + ip = set() for (key, path) in input_paths.items(): # First split the path, to see which scheme it is path_split = urlsplit(path) + ip.add(path_split) if path_split.scheme == 's3': # Nothing to do. Already an S3 path. s3_input_paths[key] = path @@ -309,11 +314,11 @@ def ex_aws_batch( )) input_common_prefix = os.path.commonpath([ - os.path.dirname(os.path.abspath(ps[1].path)) - for ps in need_s3_uploads + os.path.dirname(os.path.abspath(ip)) + for ps in ip ]) for k, ps in need_s3_uploads: - s3_file_temp_path = "{}{}/{}".format( + s3_file_temp_path = "{}{}/in/{}".format( self.containerinfo.aws_s3_scratch_loc, run_uuid, os.path.relpath(ps.path, input_common_prefix) @@ -325,7 +330,7 @@ def ex_aws_batch( Key=s3_input_paths[k].path ) - # While we are at it, make mappings for our outputs. + # Outputs s3_output_paths = {} need_s3_downloads = set() @@ -348,7 +353,7 @@ def ex_aws_batch( ]) for k, ps in need_s3_downloads: - s3_file_temp_path = "{}{}/{}".format( + s3_file_temp_path = "{}{}/out/{}".format( self.containerinfo.aws_s3_scratch_loc, run_uuid, os.path.relpath(ps.path, output_common_prefix) @@ -385,7 +390,7 @@ def ex_aws_batch( 'jobRoleArn': self.containerinfo.aws_jobRoleArn, }, timeout={ - 'attemptDurationSeconds': self.containerinfo.timeout * 60 + 'attemptDurationSeconds': self.containerinfo.timeout * 60 } ) else: # Already registered From 7f1ae5d5be408b411f7f916721f7709d84d8031e Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 25 Apr 2018 11:14:11 -0700 Subject: [PATCH 16/88] Working AWS batch engine with ability to upload / download from S3 as needed --- sciluigi/containertask.py | 186 ++++++++++++++++++++++++++++++++++---- 1 file changed, 166 insertions(+), 20 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index aef7a0a..3bedc00 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -8,6 +8,7 @@ from string import Template import shlex import uuid +import time try: from urlparse import urlsplit, urljoin except ImportError: @@ -40,6 +41,7 @@ class ContainerInfo(): # AWS specific stuff aws_jobRoleArn = None aws_s3_scratch_loc = None + aws_batch_job_queue = None def __init__(self, engine='docker', @@ -49,6 +51,7 @@ def __init__(self, container_cache='.', aws_jobRoleArn='', aws_s3_scratch_loc='', + aws_batch_job_queue='' ): self.engine = engine self.vcpu = vcpu @@ -57,6 +60,7 @@ def __init__(self, self.container_cache = container_cache self.aws_jobRoleArn = aws_jobRoleArn self.aws_s3_scratch_loc = aws_s3_scratch_loc + self.aws_batch_job_queue = aws_batch_job_queue def __str__(self): """ @@ -289,50 +293,92 @@ def ex_aws_batch( batch_client = boto3.client('batch') s3_client = boto3.client('s3') - run_uuid = uuid.uuid4() + run_uuid = str(uuid.uuid4()) # 1. First a bit of file mapping / uploading of input items # We need mappings for both two and from S3 and from S3 to within the container # <-> <-> + # The script in the container, bucket_command_wrapper.py, handles the second half + # practically, but we need to provide the link s3://bucket/key::/container/path/file::mode + # the first half we have to do here. 
+ # s3_input_paths will hold the s3 path container_paths = {} + + in_container_paths_from_s3 = {} + in_container_paths_from_local_fs = {} s3_input_paths = {} need_s3_uploads = set() - ip = set() for (key, path) in input_paths.items(): # First split the path, to see which scheme it is path_split = urlsplit(path) - ip.add(path_split) if path_split.scheme == 's3': # Nothing to do. Already an S3 path. + in_container_paths_from_s3[key] = os.path.join( + path_split.netloc, + path_split.path + ) s3_input_paths[key] = path elif path_split.scheme == 'file' or path_split.scheme == '': # File path. Will need to upload to S3 to a temporary key within a bucket + in_container_paths_from_local_fs[key] = path_split.path need_s3_uploads.add((key, path_split)) else: raise ValueError("File storage scheme {} is not supported".format( path_split.scheme )) - input_common_prefix = os.path.commonpath([ - os.path.dirname(os.path.abspath(ip)) - for ps in ip - ]) + in_from_local_fs_common_prefix = os.path.dirname( + os.path.commonprefix([ + p for p in in_container_paths_from_local_fs.values() + ]) + ) + for k, ps in need_s3_uploads: s3_file_temp_path = "{}{}/in/{}".format( self.containerinfo.aws_s3_scratch_loc, run_uuid, - os.path.relpath(ps.path, input_common_prefix) + os.path.relpath(ps.path, in_from_local_fs_common_prefix) ) - s3_input_paths[k] = urlsplit(s3_file_temp_path) + s3_input_paths[k] = s3_file_temp_path + log.info("Uploading {} to {}".format( + input_paths[k], + s3_input_paths[k], + )) s3_client.upload_file( Filename=input_paths[k], - Bucket=s3_input_paths[k].netloc, - Key=s3_input_paths[k].path + Bucket=urlsplit(s3_input_paths[k]).netloc, + Key=urlsplit(s3_input_paths[k]).path.strip('/'), + ExtraArgs={ + 'ServerSideEncryption': 'AES256' + } + ) + # build our container paths for inputs from fs and S3 + for k in in_container_paths_from_local_fs: + container_paths[k] = os.path.join( + '/mnt/inputs/fs/', + os.path.relpath( + in_container_paths_from_local_fs[k], + in_from_local_fs_common_prefix) ) + in_from_s3_common_prefix = os.path.dirname( + os.path.commonprefix([ + p for p in in_container_paths_from_s3.values() + ]) + ) + for k in in_container_paths_from_s3: + container_paths[k] = os.path.join( + '/mnt/inputs/s3/', + os.path.relpath( + in_container_paths_from_s3[k], + in_from_s3_common_prefix) + ) + # Outputs s3_output_paths = {} need_s3_downloads = set() + out_container_paths_from_s3 = {} + out_container_paths_from_local_fs = {} for (key, path) in output_paths.items(): # First split the path, to see which scheme it is @@ -340,9 +386,14 @@ def ex_aws_batch( if path_split.scheme == 's3': # Nothing to do. Already an S3 path. s3_output_paths[key] = path + out_container_paths_from_s3[key] = os.path.join( + path_split.netloc, + path_split.path + ) elif path_split.scheme == 'file' or path_split.scheme == '': # File path. 
Will need to upload to S3 to a temporary key within a bucket need_s3_downloads.add((key, path_split)) + out_container_paths_from_local_fs[key] = path_split.path else: raise ValueError("File storage scheme {} is not supported".format( path_split.scheme @@ -358,13 +409,40 @@ def ex_aws_batch( run_uuid, os.path.relpath(ps.path, output_common_prefix) ) - s3_output_paths[k] = urlsplit(s3_file_temp_path) + s3_output_paths[k] = s3_file_temp_path - # 2) Register / retrieve job definition + # Make our container paths for outputs + out_from_local_fs_common_prefix = os.path.dirname( + os.path.commonprefix([ + p for p in out_container_paths_from_local_fs.values() + ]) + ) + for k in out_container_paths_from_local_fs: + container_paths[k] = os.path.join( + '/mnt/outputs/fs/', + os.path.relpath( + out_container_paths_from_local_fs[k], + out_from_local_fs_common_prefix) + ) + + out_from_s3_common_prefix = os.path.dirname( + os.path.commonprefix([ + p for p in out_container_paths_from_s3.values() + ]) + ) + for k in out_container_paths_from_s3: + container_paths[k] = os.path.join( + '/mnt/outputs/s3/', + os.path.relpath( + out_container_paths_from_s3[k], + out_from_s3_common_prefix) + ) + + # 2) Register / retrieve job definition for this container, command, and job role arn # Make a UUID based on the container / command job_def_name = "sl_containertask__{}".format( - uuid.uuid5(uuid.NAMESPACE_URL, self.container+command) + uuid.uuid5(uuid.NAMESPACE_URL, self.container+self.containerinfo.aws_jobRoleArn) ) # Search to see if this job is ALREADY defined. @@ -374,9 +452,9 @@ def ex_aws_batch( ) if len(job_def_search['jobDefinitions']) == 0: # Not registered yet. Register it now - log.info('Registering job definition for {} in {} under name {}'.format( - command, + log.info('Registering job definition for {} with role {} under name {}'.format( self.container, + self.containerinfo.aws_jobRoleArn, job_def_name, )) batch_client.register_job_definition( @@ -384,8 +462,8 @@ def ex_aws_batch( type='container', containerProperties={ 'image': self.container, - 'vcpus': 123, - 'memory': 123, + 'vcpus': 1, + 'memory': 1024, 'command': shlex.split(command), 'jobRoleArn': self.containerinfo.aws_jobRoleArn, }, @@ -401,8 +479,76 @@ def ex_aws_batch( job_def_name, )) - - + # Build our container command list + container_command_list = [ + 'bucket_command_wrapper.py', + '--command', Template(command).safe_substitute(container_paths) + ] + # Add in our inputs + for k in s3_input_paths: + container_command_list += [ + '-DF', + "{}::{}::{}".format( + s3_input_paths[k], + container_paths[k], + inputs_mode.lower() + ) + ] + + # And our outputs + for k in s3_output_paths: + container_command_list += [ + '-UF', + "{}::{}".format( + container_paths[k], + s3_output_paths[k] + ) + ] + + # Submit the job + job_submission = batch_client.submit_job( + jobName=run_uuid, + jobQueue=self.containerinfo.aws_batch_job_queue, + jobDefinition=job_def_name, + containerOverrides={ + 'vcpus': self.containerinfo.vcpu, + 'memory': self.containerinfo.mem, + 'command': container_command_list, + }, + ) + job_submission_id = job_submission.get('jobId') + log.info("Running {} under jobId {}".format( + container_command_list, + job_submission_id + )) + while True: + job_status = batch_client.describe_jobs( + jobs=[job_submission_id] + ).get('jobs')[0] + if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': + break + time.sleep(10) + if job_status.get('status') != 'SUCCEEDED': + raise Exception("Batch job failed. 
{}".format( + job_status.get('statusReason') + )) + # Implicit else we succeeded + # Now we need to copy back from S3 to our local filesystem + for k, ps in need_s3_downloads: + s3_client.download_file( + Filename=ps.path, + Bucket=urlsplit(s3_output_paths[k]).netloc, + Key=urlsplit(s3_output_paths[k]).path.strip('/') + ) + # And the inputs if we are rw + if inputs_mode == 'rw': + for k, ps in need_s3_uploads: + s3_client.download_file( + Filename=ps.path, + Bucket=urlsplit(s3_input_paths[k]).netloc, + Key=urlsplit(s3_input_paths[k]).path.strip('/') + ) + # And done def ex_docker( self, From 78023560caaaab0e1c715f7eb2b3b7f3a10768f3 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 25 Apr 2018 11:59:46 -0700 Subject: [PATCH 17/88] Tidied up a bit of the logging for AWS-batch engine --- sciluigi/containertask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 3bedc00..7081a0c 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -473,9 +473,9 @@ def ex_aws_batch( ) else: # Already registered aws_job_def = job_def_search['jobDefinitions'][0] - log.info('Found job definition for {} in {} under name {}'.format( - aws_job_def['containerProperties']['command'], + log.info('Found job definition for {} with job role {} under name {}'.format( aws_job_def['containerProperties']['image'], + aws_job_def['containerProperties']['jobRoleArn'], job_def_name, )) From 8ef016de30c0f7c7dd8161f15ef5d6c206bb8cb8 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 25 Apr 2018 13:03:40 -0700 Subject: [PATCH 18/88] Allowed for piped and && / || commands in both docker and aws_batch --- sciluigi/containertask.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 7081a0c..bc72fb0 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -138,6 +138,7 @@ def ex( command, input_paths={}, output_paths={}, + extra_params={}, mounts={}, inputs_mode='ro', outputs_mode='rw'): @@ -146,6 +147,7 @@ def ex( command, input_paths, output_paths, + extra_params, mounts, inputs_mode, outputs_mode @@ -155,6 +157,7 @@ def ex( command, input_paths, output_paths, + extra_params, mounts, inputs_mode, outputs_mode @@ -164,6 +167,7 @@ def ex( command, input_paths, output_paths, + extra_params, mounts, inputs_mode, outputs_mode @@ -176,6 +180,7 @@ def ex_singularity_slurm( command, input_paths={}, output_paths={}, + extra_params={}, mounts={}, inputs_mode='ro', outputs_mode='rw'): @@ -272,6 +277,7 @@ def ex_aws_batch( command, input_paths={}, output_paths={}, + extra_params={}, mounts={}, inputs_mode='ro', outputs_mode='rw'): @@ -480,9 +486,11 @@ def ex_aws_batch( )) # Build our container command list + template_dict = container_paths.copy() + template_dict.update(extra_params) container_command_list = [ 'bucket_command_wrapper.py', - '--command', Template(command).safe_substitute(container_paths) + '--command', Template(command).safe_substitute(template_dict) ] # Add in our inputs for k in s3_input_paths: @@ -555,6 +563,7 @@ def ex_docker( command, input_paths={}, output_paths={}, + extra_params={}, mounts={}, inputs_mode='ro', outputs_mode='rw'): @@ -592,7 +601,9 @@ def ex_docker( # No matter what, add our mappings container_paths.update(input_container_paths) - command = Template(command).substitute(container_paths) + template_dict = container_paths.copy() + template_dict.update(extra_params) + command = 
Template(command).substitute(template_dict) try: log.info("Attempting to run {} in {}".format( @@ -601,7 +612,7 @@ def ex_docker( )) stdout = client.containers.run( image=self.container, - command=command, + command=['bash', '-c', command], volumes=mounts, mem_limit="{}m".format(self.containerinfo.mem), ) From 21055c8f8afa17a1d183806de8cd960aa563025b Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 26 Apr 2018 09:48:38 -0700 Subject: [PATCH 19/88] Moved custom mounts to containerinfo --- sciluigi/containertask.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index bc72fb0..889411f 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -34,6 +34,8 @@ class ContainerInfo(): env = None # Timeout in minutes timeout = None + # Format is {'source_path': {'bind': '/container/path', 'mode': mode}} + mounts = None # Local Container cache location. For things like singularity that need to pull # And create a local container container_cache = None @@ -47,7 +49,8 @@ def __init__(self, engine='docker', vcpu=1, mem=4096, - timeout=604800, # Seven days of seconds + timeout=10080, # Seven days of minutes + mounts={}, container_cache='.', aws_jobRoleArn='', aws_s3_scratch_loc='', @@ -57,6 +60,7 @@ def __init__(self, self.vcpu = vcpu self.mem = mem self.timeout = timeout + self.mounts = mounts self.container_cache = container_cache self.aws_jobRoleArn = aws_jobRoleArn self.aws_s3_scratch_loc = aws_s3_scratch_loc @@ -139,7 +143,6 @@ def ex( input_paths={}, output_paths={}, extra_params={}, - mounts={}, inputs_mode='ro', outputs_mode='rw'): if self.containerinfo.engine == 'docker': @@ -148,7 +151,6 @@ def ex( input_paths, output_paths, extra_params, - mounts, inputs_mode, outputs_mode ) @@ -158,7 +160,6 @@ def ex( input_paths, output_paths, extra_params, - mounts, inputs_mode, outputs_mode ) @@ -168,7 +169,6 @@ def ex( input_paths, output_paths, extra_params, - mounts, inputs_mode, outputs_mode ) @@ -181,7 +181,6 @@ def ex_singularity_slurm( input_paths={}, output_paths={}, extra_params={}, - mounts={}, inputs_mode='ro', outputs_mode='rw'): """ @@ -189,6 +188,7 @@ def ex_singularity_slurm( command is assumed to be in python template substitution format """ container_paths = {} + mounts = self.containerinfo.mounts.copy() if len(output_paths) > 0: output_host_path_ca, output_container_paths = self.map_paths_to_container( @@ -278,7 +278,6 @@ def ex_aws_batch( input_paths={}, output_paths={}, extra_params={}, - mounts={}, inputs_mode='ro', outputs_mode='rw'): """ @@ -564,7 +563,6 @@ def ex_docker( input_paths={}, output_paths={}, extra_params={}, - mounts={}, inputs_mode='ro', outputs_mode='rw'): """ @@ -573,6 +571,7 @@ def ex_docker( """ client = docker.from_env() container_paths = {} + mounts = self.containerinfo.mounts.copy() if len(output_paths) > 0: output_host_path_ca, output_container_paths = self.map_paths_to_container( From 3041e8820a2b5b46d2113d7614c70d095546315d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 26 Apr 2018 16:02:00 -0700 Subject: [PATCH 20/88] Moved mounts to a containerinfo parameter, and implemented for AWS batch --- sciluigi/containertask.py | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 889411f..cb9eadd 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -447,7 +447,10 @@ def ex_aws_batch( # Make a UUID based on the 
container / command job_def_name = "sl_containertask__{}".format( - uuid.uuid5(uuid.NAMESPACE_URL, self.container+self.containerinfo.aws_jobRoleArn) + uuid.uuid5( + uuid.NAMESPACE_URL, + self.container+self.containerinfo.aws_jobRoleArn+str(self.containerinfo.mounts) + ) ) # Search to see if this job is ALREADY defined. @@ -457,11 +460,32 @@ def ex_aws_batch( ) if len(job_def_search['jobDefinitions']) == 0: # Not registered yet. Register it now - log.info('Registering job definition for {} with role {} under name {}'.format( - self.container, - self.containerinfo.aws_jobRoleArn, - job_def_name, - )) + log.info( + """Registering job definition for {} with role {} and mounts {} under name {} + """.format( + self.container, + self.containerinfo.aws_jobRoleArn, + self.containerinfo.mounts, + job_def_name, + )) + # To be passed along for container properties + aws_volumes = set() + aws_mountPoints = set() + for (host_path, container_details) in self.containerinfo.mounts.items(): + aws_volumes.add({ + 'host': {'sourcePath': host_path}, + 'name': host_path + }) + if container_details['mode'].lower() == 'ro': + read_only = 'True' + else: + read_only = 'False' + aws_mountPoints.add({ + 'containerPath': container_details['bind'], + 'sourceVolume': host_path, + 'readOnly': read_only, + }) + batch_client.register_job_definition( jobDefinitionName=job_def_name, type='container', @@ -471,6 +495,8 @@ def ex_aws_batch( 'memory': 1024, 'command': shlex.split(command), 'jobRoleArn': self.containerinfo.aws_jobRoleArn, + 'mountPoints': list(aws_mountPoints), + 'volumes': list(aws_volumes) }, timeout={ 'attemptDurationSeconds': self.containerinfo.timeout * 60 From 814ac0a5cd51b87e601cf68a148479d28debd2f8 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 26 Apr 2018 16:54:37 -0700 Subject: [PATCH 21/88] Switch from set to list --- sciluigi/containertask.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index cb9eadd..8e7a4cc 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -469,10 +469,10 @@ def ex_aws_batch( job_def_name, )) # To be passed along for container properties - aws_volumes = set() - aws_mountPoints = set() + aws_volumes = [] + aws_mountPoints = [] for (host_path, container_details) in self.containerinfo.mounts.items(): - aws_volumes.add({ + aws_volumes.append({ 'host': {'sourcePath': host_path}, 'name': host_path }) @@ -480,7 +480,7 @@ def ex_aws_batch( read_only = 'True' else: read_only = 'False' - aws_mountPoints.add({ + aws_mountPoints.append({ 'containerPath': container_details['bind'], 'sourceVolume': host_path, 'readOnly': read_only, @@ -495,8 +495,8 @@ def ex_aws_batch( 'memory': 1024, 'command': shlex.split(command), 'jobRoleArn': self.containerinfo.aws_jobRoleArn, - 'mountPoints': list(aws_mountPoints), - 'volumes': list(aws_volumes) + 'mountPoints': aws_mountPoints, + 'volumes': aws_volumes }, timeout={ 'attemptDurationSeconds': self.containerinfo.timeout * 60 From f532674b29e9c310d608971665b70d51b78d4a5d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 26 Apr 2018 16:56:16 -0700 Subject: [PATCH 22/88] From string to bool for readonly --- sciluigi/containertask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 8e7a4cc..d3f5de1 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -477,9 +477,9 @@ def ex_aws_batch( 'name': host_path }) if 
container_details['mode'].lower() == 'ro': - read_only = 'True' + read_only = True else: - read_only = 'False' + read_only = False aws_mountPoints.append({ 'containerPath': container_details['bind'], 'sourceVolume': host_path, From c3c54dd4e3223065a6af83db5cc058f594fd0f1d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 26 Apr 2018 16:59:01 -0700 Subject: [PATCH 23/88] name for volume through uuid --- sciluigi/containertask.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index d3f5de1..2c75249 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -472,9 +472,10 @@ def ex_aws_batch( aws_volumes = [] aws_mountPoints = [] for (host_path, container_details) in self.containerinfo.mounts.items(): + name = uuid.uuid5(uuid.NAMESPACE_URL, host_path) aws_volumes.append({ 'host': {'sourcePath': host_path}, - 'name': host_path + 'name': name }) if container_details['mode'].lower() == 'ro': read_only = True @@ -482,7 +483,7 @@ def ex_aws_batch( read_only = False aws_mountPoints.append({ 'containerPath': container_details['bind'], - 'sourceVolume': host_path, + 'sourceVolume': name, 'readOnly': read_only, }) From 7e572c68c75cdad1af9856a8681624bd23df5b5d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 26 Apr 2018 17:06:52 -0700 Subject: [PATCH 24/88] check not just registered job def, but active --- sciluigi/containertask.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 2c75249..d6e069f 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -456,6 +456,7 @@ def ex_aws_batch( # Search to see if this job is ALREADY defined. job_def_search = batch_client.describe_job_definitions( maxResults=1, + status='ACTIVE', jobDefinitionName=job_def_name, ) if len(job_def_search['jobDefinitions']) == 0: From 23dcc64a5bd5878b0222d6cde21b1869cd87cfec Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 26 Apr 2018 17:08:39 -0700 Subject: [PATCH 25/88] String for name not class uuid --- sciluigi/containertask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index d6e069f..ce8c8da 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -473,7 +473,7 @@ def ex_aws_batch( aws_volumes = [] aws_mountPoints = [] for (host_path, container_details) in self.containerinfo.mounts.items(): - name = uuid.uuid5(uuid.NAMESPACE_URL, host_path) + name = str(uuid.uuid5(uuid.NAMESPACE_URL, host_path)) aws_volumes.append({ 'host': {'sourcePath': host_path}, 'name': name From 554e4c86dbbe72ed2d34ee70fa56d909ed665743 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 27 Apr 2018 15:21:21 -0700 Subject: [PATCH 26/88] Fixed a few bugs, and moved over to the packaged bucket_container_wrapper --- sciluigi/containertask.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index ce8c8da..7318bd7 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -50,7 +50,7 @@ def __init__(self, vcpu=1, mem=4096, timeout=10080, # Seven days of minutes - mounts={}, + mounts={}, container_cache='.', aws_jobRoleArn='', aws_s3_scratch_loc='', @@ -516,7 +516,7 @@ def ex_aws_batch( template_dict = container_paths.copy() template_dict.update(extra_params) container_command_list = [ - 'bucket_command_wrapper.py', + 'bucket_command_wrapper', '--command', 
Template(command).safe_substitute(template_dict) ] # Add in our inputs @@ -583,6 +583,14 @@ def ex_aws_batch( Bucket=urlsplit(s3_input_paths[k]).netloc, Key=urlsplit(s3_input_paths[k]).path.strip('/') ) + + # Cleanup the temp S3 + for k, ps in need_s3_uploads: + s3_client.delete_object( + Bucket=urlsplit(s3_input_paths[k]).netloc, + Key=urlsplit(s3_input_paths[k]).path.strip('/'), + ) + # And done def ex_docker( From 1c62815f6b212040391a91a79defa93ab76a6de9 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 27 Apr 2018 15:49:13 -0700 Subject: [PATCH 27/88] Fixed bug where no files are being uploaded to S3 temp --- sciluigi/containertask.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 7318bd7..7e65e46 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -403,10 +403,13 @@ def ex_aws_batch( raise ValueError("File storage scheme {} is not supported".format( path_split.scheme )) - output_common_prefix = os.path.commonpath([ - os.path.dirname(os.path.abspath(ps[1].path)) - for ps in need_s3_downloads - ]) + if len(need_s3_downloads) > 0: + output_common_prefix = os.path.commonpath([ + os.path.dirname(os.path.abspath(ps[1].path)) + for ps in need_s3_downloads + ]) + else: + output_common_prefix = '' for k, ps in need_s3_downloads: s3_file_temp_path = "{}{}/out/{}".format( From ab6a6cbc810b0a5146a80a722aca30f196356282 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 27 Apr 2018 17:49:29 -0700 Subject: [PATCH 28/88] Docker engine working, with switch to targets rather than paths to be faancy --- sciluigi/containertask.py | 189 +++++++++++++++++++++++++++----------- 1 file changed, 136 insertions(+), 53 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 7e65e46..38617c9 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -106,27 +106,54 @@ class ContainerHelpers(): # The ID of the container (docker registry style). container = None - def map_paths_to_container(self, paths, container_base_path='/mnt'): + def map_targets_to_container(self, targets): """ Accepts a dictionary where the keys are identifiers for various targets - and the value is the HOST path for that target - - What this does is find a common HOST prefix - and remaps to the CONTAINER BASE PATH - - Returns a dict of the paths for the targets as they would be seen - if the common prefix is mounted within the container at the container_base_path - """ - common_prefix = os.path.commonprefix( - [os.path.dirname(p) for p in paths.values()] - ) - container_paths = { - i: os.path.join( - container_base_path, - os.path.relpath(paths[i], common_prefix)) - for i in paths + and the value is the target + + This breaks down the targets by their schema (file, s3, etc). 
+ For each schema a lowest-common-path is found and a suggested container + mountpoint is generated + + What one gets back is a nested dict + { + 'scheme': { + 'common_prefix': '/path/on/source/shared/by/all/targets/of/schema', + 'rel_paths': { + 'identifier': 'path_rel_to_common_prefix' + } + 'targets': { + 'identifier': target, + } + } } - return os.path.abspath(common_prefix), container_paths + """ + # Determine the schema for these targets via comprehension + schema = {t.scheme for t in targets.values()} + return_dict = {} + for scheme in schema: + return_dict[scheme] = {} + # Get only the targets for this scheme + scheme_targets = {i: t for i, t in targets.items() if t.scheme == scheme} + common_prefix = os.path.commonprefix( + [os.path.dirname( + os.path.join( + urlsplit(t.path).netloc, + urlsplit(t.path).path + ) + ) for t in scheme_targets.values()]) + return_dict[scheme]['common_prefix'] = common_prefix + return_dict[scheme]['targets'] = scheme_targets + return_dict[scheme]['relpaths'] = { + i: os.path.relpath( + os.path.join( + urlsplit(t.path).netloc, + urlsplit(t.path).path + ), + common_prefix) + for i, t in scheme_targets.items() + } + return return_dict def make_fs_name(self, uri): uri_list = uri.split('://') @@ -140,16 +167,16 @@ def make_fs_name(self, uri): def ex( self, command, - input_paths={}, - output_paths={}, + input_targets={}, + output_targets={}, extra_params={}, inputs_mode='ro', outputs_mode='rw'): if self.containerinfo.engine == 'docker': return self.ex_docker( command, - input_paths, - output_paths, + input_targets, + output_targets, extra_params, inputs_mode, outputs_mode @@ -599,11 +626,13 @@ def ex_aws_batch( def ex_docker( self, command, - input_paths={}, - output_paths={}, + input_targets={}, + output_targets={}, extra_params={}, inputs_mode='ro', - outputs_mode='rw'): + outputs_mode='rw', + input_mount_point='/mnt/inputs', + output_mount_point='/mnt/outputs'): """ Run command in the container using docker, with mountpoints command is assumed to be in python template substitution format @@ -611,46 +640,100 @@ def ex_docker( client = docker.from_env() container_paths = {} mounts = self.containerinfo.mounts.copy() + UF = [] + DF = [] - if len(output_paths) > 0: - output_host_path_ca, output_container_paths = self.map_paths_to_container( - output_paths, - container_base_path='/mnt/outputs' + if len(output_targets) > 0: + output_target_maps = self.map_targets_to_container( + output_targets, ) - container_paths.update(output_container_paths) - mounts[output_host_path_ca] = {'bind': '/mnt/outputs', 'mode': outputs_mode} - - if len(input_paths) > 0: - input_host_path_ca, input_container_paths = self.map_paths_to_container( - input_paths, - container_base_path='/mnt/inputs' + out_schema = set(output_target_maps.keys()) + # Local file targets can just be mapped. 
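        # Illustrative sketch with hypothetical paths: given local output targets
        # /data/run1/a.txt and /data/run1/sub/b.txt, the shared prefix /data/run1 is
        # bind-mounted at /mnt/outputs/file, so the command template would see
        #   a -> /mnt/outputs/file/a.txt
        #   b -> /mnt/outputs/file/sub/b.txt
        # Non-file schemes fall through to the UF handling below instead of a mount.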
+ file_output_common_prefix = None + if 'file' in out_schema: + file_output_common_prefix = output_target_maps['file']['common_prefix'] + mounts[os.path.abspath(output_target_maps['file']['common_prefix'])] = { + 'bind': os.path.join(output_mount_point, 'file'), + 'mode': outputs_mode + } + container_paths.update({ + i: os.path.join(output_mount_point, 'file', rp) + for i, rp in output_target_maps['file']['relpaths'].items() + }) + out_schema.remove('file') + # Handle other schema here using BCW, creating the appropriate UF parameters + for scheme in out_schema: + for identifier in output_target_maps[scheme]['targets']: + container_paths[identifier] = os.path.join( + output_mount_point, + scheme, + output_target_maps[scheme]['relpaths'][identifier] + ) + UF.append("{}::{}".format( + container_paths[identifier], + output_target_maps[scheme]['targets'][identifier].path + )) + + if len(input_targets) > 0: + input_target_maps = self.map_targets_to_container( + input_targets ) - # Handle the edge case where the common directory for inputs is equal to the outputs - if len(output_paths) > 0 and (output_host_path_ca == input_host_path_ca): - log.warn("Input and Output host paths the same {}".format(output_host_path_ca)) - # Repeat our mapping, now using the outputs path for both - input_host_path_ca, input_container_paths = self.map_paths_to_container( - input_paths, - container_base_path='/mnt/outputs' - ) - else: # output and input paths different OR there are only input paths - mounts[input_host_path_ca] = {'bind': '/mnt/inputs', 'mode': inputs_mode} - - # No matter what, add our mappings - container_paths.update(input_container_paths) + in_schema = set(input_target_maps.keys()) + if 'file' in in_schema: + # Check for the edge case where our common prefix for input and output is the same + if file_output_common_prefix and file_output_common_prefix == input_target_maps['file']['common_prefix']: + # It is! 
Skip adding a mount for inputs then, and reset our input mountpoint + input_mount_point = output_mount_point + pass + else: # Add our mount + mounts[os.path.abspath(input_target_maps['file']['common_prefix'])] = { + 'bind': os.path.join(input_mount_point, 'file'), + 'mode': inputs_mode + } + container_paths.update({ + i: os.path.join(input_mount_point, 'file', rp) + for i, rp in input_target_maps['file']['relpaths'].items() + }) + in_schema.remove('file') + + # Handle other schema here using BCW, creating the appropriate DF parameters + for scheme in in_schema: + for identifier in input_target_maps[scheme]['targets']: + container_paths[identifier] = os.path.join( + input_mount_point, + scheme, + input_target_maps[scheme]['relpaths'][identifier] + ) + DF.append("{}::{}::{}".format( + input_target_maps[scheme]['targets'][identifier].path, + container_paths[identifier], + inputs_mode, + )) template_dict = container_paths.copy() template_dict.update(extra_params) command = Template(command).substitute(template_dict) + command_list = [ + 'bucket_command_wrapper', + '--command', command, + ] + for df in DF: + command_list.append('-DF') + command_list.append(df) + for uf in UF: + command_list.append('-UF') + command_list.append(uf) + try: - log.info("Attempting to run {} in {}".format( - command, - self.container + log.info("Attempting to run {} in {} with mounts {}".format( + command_list, + self.container, + mounts, )) stdout = client.containers.run( image=self.container, - command=['bash', '-c', command], + command=command_list, volumes=mounts, mem_limit="{}m".format(self.containerinfo.mem), ) From ec3dcd6703f8cf1ba89257683fa418ec75620c38 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 1 May 2018 14:27:41 -0700 Subject: [PATCH 29/88] basic engine for docker and aws_batch done --- sciluigi/containertask.py | 466 +++++++++++++++++++------------------- 1 file changed, 228 insertions(+), 238 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 38617c9..98c523d 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -9,6 +9,8 @@ import shlex import uuid import time +import io + try: from urlparse import urlsplit, urljoin except ImportError: @@ -44,6 +46,7 @@ class ContainerInfo(): aws_jobRoleArn = None aws_s3_scratch_loc = None aws_batch_job_queue = None + aws_secrets_loc = None def __init__(self, engine='docker', @@ -54,7 +57,8 @@ def __init__(self, container_cache='.', aws_jobRoleArn='', aws_s3_scratch_loc='', - aws_batch_job_queue='' + aws_batch_job_queue='', + aws_secrets_loc=os.path.expanduser('~/.aws') ): self.engine = engine self.vcpu = vcpu @@ -65,6 +69,7 @@ def __init__(self, self.aws_jobRoleArn = aws_jobRoleArn self.aws_s3_scratch_loc = aws_s3_scratch_loc self.aws_batch_job_queue = aws_batch_job_queue + self.aws_secrets_loc = aws_secrets_loc def __str__(self): """ @@ -111,7 +116,7 @@ def map_targets_to_container(self, targets): Accepts a dictionary where the keys are identifiers for various targets and the value is the target - This breaks down the targets by their schema (file, s3, etc). + This breaks down the targets by their schema (file, s3, etc). 
For each schema a lowest-common-path is found and a suggested container mountpoint is generated @@ -171,7 +176,9 @@ def ex( output_targets={}, extra_params={}, inputs_mode='ro', - outputs_mode='rw'): + outputs_mode='rw', + input_mount_point='/mnt/inputs', + output_mount_point='/mnt/outputs'): if self.containerinfo.engine == 'docker': return self.ex_docker( command, @@ -179,25 +186,31 @@ def ex( output_targets, extra_params, inputs_mode, - outputs_mode + outputs_mode, + input_mount_point, + output_mount_point ) elif self.containerinfo.engine == 'aws_batch': return self.ex_aws_batch( command, - input_paths, - output_paths, + input_targets, + output_targets, extra_params, inputs_mode, - outputs_mode + outputs_mode, + input_mount_point, + output_mount_point ) elif self.containerinfo.engine == 'singularity_slurm': return self.ex_singularity_slurm( command, - input_paths, - output_paths, + input_targets, + output_targets, extra_params, inputs_mode, - outputs_mode + outputs_mode, + input_mount_point, + output_mount_point ) else: raise Exception("Container engine {} is invalid".format(self.containerinfo.engine)) @@ -302,11 +315,13 @@ def ex_singularity_slurm( def ex_aws_batch( self, command, - input_paths={}, - output_paths={}, + input_targets={}, + output_targets={}, extra_params={}, inputs_mode='ro', - outputs_mode='rw'): + outputs_mode='rw', + input_mount_point='/mnt/inputs', + output_mount_point='/mnt/outputs'): """ Run a command in a container using AWS batch. Handles uploading of files to / from s3 and then into the container. @@ -327,151 +342,132 @@ def ex_aws_batch( run_uuid = str(uuid.uuid4()) - # 1. First a bit of file mapping / uploading of input items - # We need mappings for both two and from S3 and from S3 to within the container + # We need mappings for both to and from S3 and from S3 to within the container # <-> <-> # The script in the container, bucket_command_wrapper.py, handles the second half # practically, but we need to provide the link s3://bucket/key::/container/path/file::mode # the first half we have to do here. - # s3_input_paths will hold the s3 path - container_paths = {} - in_container_paths_from_s3 = {} - in_container_paths_from_local_fs = {} - s3_input_paths = {} - need_s3_uploads = set() - for (key, path) in input_paths.items(): - # First split the path, to see which scheme it is - path_split = urlsplit(path) - if path_split.scheme == 's3': - # Nothing to do. Already an S3 path. - in_container_paths_from_s3[key] = os.path.join( - path_split.netloc, - path_split.path - ) - s3_input_paths[key] = path - elif path_split.scheme == 'file' or path_split.scheme == '': - # File path. Will need to upload to S3 to a temporary key within a bucket - in_container_paths_from_local_fs[key] = path_split.path - need_s3_uploads.add((key, path_split)) - else: - raise ValueError("File storage scheme {} is not supported".format( - path_split.scheme - )) + container_paths = {} # Dict key is command template key. Value is in-container path + UF = set() # Set of UF lines to be added. Format is container_path::bucket_file_uri + DF = set() # Set of UF lines to be added. Format is bucket_file_uri::container_path::mode + needs_s3_download = set() # Set of Tuples. (s3::/bucket/key, target) + s3_temp_to_be_deleted = set() # S3 paths to be deleted. 
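        # Illustrative sketch with hypothetical bucket and file names: using the
        # formats assembled below, a DF entry for one S3 input and a UF entry for one
        # local-filesystem output staged through the S3 scratch location would look
        # roughly like:
        #   DF: "s3://my-bucket/in/reads.fastq::/mnt/inputs/s3/reads.fastq::ro"
        #   UF: "/mnt/outputs/file/result.txt::s3://scratch-bucket/<run_uuid>/file/out/result.txt"
        # bucket_command_wrapper inside the container consumes these to pull inputs
        # down before the command runs and push outputs back up afterwards.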
- in_from_local_fs_common_prefix = os.path.dirname( - os.path.commonprefix([ - p for p in in_container_paths_from_local_fs.values() - ]) + # Group our output targets by schema + output_target_maps = self.map_targets_to_container( + output_targets, ) - - for k, ps in need_s3_uploads: - s3_file_temp_path = "{}{}/in/{}".format( - self.containerinfo.aws_s3_scratch_loc, - run_uuid, - os.path.relpath(ps.path, in_from_local_fs_common_prefix) - ) - s3_input_paths[k] = s3_file_temp_path - log.info("Uploading {} to {}".format( - input_paths[k], - s3_input_paths[k], - )) - s3_client.upload_file( - Filename=input_paths[k], - Bucket=urlsplit(s3_input_paths[k]).netloc, - Key=urlsplit(s3_input_paths[k]).path.strip('/'), - ExtraArgs={ - 'ServerSideEncryption': 'AES256' - } - ) - # build our container paths for inputs from fs and S3 - for k in in_container_paths_from_local_fs: - container_paths[k] = os.path.join( - '/mnt/inputs/fs/', - os.path.relpath( - in_container_paths_from_local_fs[k], - in_from_local_fs_common_prefix) - ) - - in_from_s3_common_prefix = os.path.dirname( - os.path.commonprefix([ - p for p in in_container_paths_from_s3.values() - ]) + out_schema = set(output_target_maps.keys()) + # Make our container paths + for schema, schema_targets in output_target_maps.items(): + for k, relpath in schema_targets['relpaths'].items(): + container_paths[k] = os.path.join( + output_mount_point, + schema, + relpath + ) + # Inputs too + # Group by schema + input_target_maps = self.map_targets_to_container( + input_targets, ) - for k in in_container_paths_from_s3: - container_paths[k] = os.path.join( - '/mnt/inputs/s3/', - os.path.relpath( - in_container_paths_from_s3[k], - in_from_s3_common_prefix) + in_schema = set(input_target_maps.keys()) + # Make our container paths + for schema, schema_targets in input_target_maps.items(): + for k, relpath in schema_targets['relpaths'].items(): + container_paths[k] = os.path.join( + input_mount_point, + schema, + relpath ) + # Container paths should be done now. + + # Now the need to handle our mapping to-from S3. + # Inputs + for scheme, schema_targets in input_target_maps.items(): + if scheme == 's3': # Already coming from S3. Just make our DF entry + for k, target in schema_targets['targets'].items(): + DF.add('{}::{}::{}'.format( + target.path, + container_paths[k], + inputs_mode + )) + else: # NOT in S3. Will need to be upload to a temp location + for k, target in schema_targets['targets'].items(): + s3_temp_loc = os.path.join( + self.containerinfo.aws_s3_scratch_loc, + run_uuid, + scheme, + 'in', + schema_targets['relpaths'][k] + ) + # Add to DF for inside the container + DF.add('{}::{}::{}'.format( + s3_temp_loc, + container_paths[k], + inputs_mode + )) + # If we are read-write, we can add this to our todo list later + if inputs_mode == 'rw': + needs_s3_download.add(( + s3_temp_loc, + target + )) + # And actually upload to the S3 temp location now + if scheme == 'file' or scheme == '': + s3_client.upload_file( + Filename=target.path, + Bucket=urlsplit(s3_temp_loc).netloc, + Key=urlsplit(s3_temp_loc).path.strip('/'), + ExtraArgs={ + 'ServerSideEncryption': 'AES256' + } + ) + else: + # Have to use BytesIO because luigi targets can ONLY be opened in + # binary mode, and upload / download fileobj can ONLY accept binary mode files + # For reasons. 
+ s3_client.upload_fileobj( + Fileobj=io.BytesIO( + target.open('r').read().encode('utf-8') + ), + Bucket=urlsplit(s3_temp_loc).netloc, + Key=urlsplit(s3_temp_loc).path.strip('/'), + ExtraArgs={ + 'ServerSideEncryption': 'AES256' + } + ) + s3_temp_to_be_deleted.add(s3_temp_loc) # Outputs - s3_output_paths = {} - need_s3_downloads = set() - out_container_paths_from_s3 = {} - out_container_paths_from_local_fs = {} - - for (key, path) in output_paths.items(): - # First split the path, to see which scheme it is - path_split = urlsplit(path) - if path_split.scheme == 's3': - # Nothing to do. Already an S3 path. - s3_output_paths[key] = path - out_container_paths_from_s3[key] = os.path.join( - path_split.netloc, - path_split.path - ) - elif path_split.scheme == 'file' or path_split.scheme == '': - # File path. Will need to upload to S3 to a temporary key within a bucket - need_s3_downloads.add((key, path_split)) - out_container_paths_from_local_fs[key] = path_split.path - else: - raise ValueError("File storage scheme {} is not supported".format( - path_split.scheme - )) - if len(need_s3_downloads) > 0: - output_common_prefix = os.path.commonpath([ - os.path.dirname(os.path.abspath(ps[1].path)) - for ps in need_s3_downloads - ]) - else: - output_common_prefix = '' - - for k, ps in need_s3_downloads: - s3_file_temp_path = "{}{}/out/{}".format( - self.containerinfo.aws_s3_scratch_loc, - run_uuid, - os.path.relpath(ps.path, output_common_prefix) - ) - s3_output_paths[k] = s3_file_temp_path - - # Make our container paths for outputs - out_from_local_fs_common_prefix = os.path.dirname( - os.path.commonprefix([ - p for p in out_container_paths_from_local_fs.values() - ]) - ) - for k in out_container_paths_from_local_fs: - container_paths[k] = os.path.join( - '/mnt/outputs/fs/', - os.path.relpath( - out_container_paths_from_local_fs[k], - out_from_local_fs_common_prefix) - ) - - out_from_s3_common_prefix = os.path.dirname( - os.path.commonprefix([ - p for p in out_container_paths_from_s3.values() - ]) - ) - for k in out_container_paths_from_s3: - container_paths[k] = os.path.join( - '/mnt/outputs/s3/', - os.path.relpath( - out_container_paths_from_s3[k], - out_from_s3_common_prefix) - ) + for scheme, schema_targets in output_target_maps.items(): + if scheme == 's3': # Already going to S3. Just make our UF entry + for k, target in schema_targets['targets'].items(): + UF.add('{}::{}'.format( + container_paths[k], + target.path, + )) + else: # NOT ending in S3. 
Will need to download to target and make a temp destination in s3 + for k, target in schema_targets['targets'].items(): + s3_temp_loc = os.path.join( + self.containerinfo.aws_s3_scratch_loc, + run_uuid, + scheme, + 'out', + schema_targets['relpaths'][k] + ) + # Add to UF for inside the container + UF.add('{}::{}'.format( + container_paths[k], + s3_temp_loc + )) + # add this to our download from s3 list later + needs_s3_download.add(( + s3_temp_loc, + target + )) + s3_temp_to_be_deleted.add(s3_temp_loc) # 2) Register / retrieve job definition for this container, command, and job role arn @@ -550,24 +546,17 @@ def ex_aws_batch( '--command', Template(command).safe_substitute(template_dict) ] # Add in our inputs - for k in s3_input_paths: + for df in DF: container_command_list += [ '-DF', - "{}::{}::{}".format( - s3_input_paths[k], - container_paths[k], - inputs_mode.lower() - ) + df ] # And our outputs - for k in s3_output_paths: + for uf in UF: container_command_list += [ '-UF', - "{}::{}".format( - container_paths[k], - s3_output_paths[k] - ) + uf ] # Submit the job @@ -599,26 +588,25 @@ def ex_aws_batch( )) # Implicit else we succeeded # Now we need to copy back from S3 to our local filesystem - for k, ps in need_s3_downloads: - s3_client.download_file( - Filename=ps.path, - Bucket=urlsplit(s3_output_paths[k]).netloc, - Key=urlsplit(s3_output_paths[k]).path.strip('/') - ) - # And the inputs if we are rw - if inputs_mode == 'rw': - for k, ps in need_s3_uploads: + for s3_loc, target in needs_s3_download: + if target.scheme == 'file': s3_client.download_file( - Filename=ps.path, - Bucket=urlsplit(s3_input_paths[k]).netloc, - Key=urlsplit(s3_input_paths[k]).path.strip('/') + Bucket=urlsplit(s3_loc).netloc, + Key=urlsplit(s3_loc).path.split('/'), + Filename=target.path, ) - + else: + with target.open('w') as target_h: + s3_client.download_file( + Bucket=urlsplit(s3_loc).netloc, + Key=urlsplit(s3_loc).path.split('/'), + Fileobj=target_h, + ) # Cleanup the temp S3 - for k, ps in need_s3_uploads: + for s3_path in s3_temp_to_be_deleted: s3_client.delete_object( - Bucket=urlsplit(s3_input_paths[k]).netloc, - Key=urlsplit(s3_input_paths[k]).path.strip('/'), + Bucket=urlsplit(s3_path).netloc, + Key=urlsplit(s3_path).path.strip('/'), ) # And done @@ -643,72 +631,74 @@ def ex_docker( UF = [] DF = [] - if len(output_targets) > 0: - output_target_maps = self.map_targets_to_container( - output_targets, - ) - out_schema = set(output_target_maps.keys()) - # Local file targets can just be mapped. - file_output_common_prefix = None - if 'file' in out_schema: - file_output_common_prefix = output_target_maps['file']['common_prefix'] - mounts[os.path.abspath(output_target_maps['file']['common_prefix'])] = { - 'bind': os.path.join(output_mount_point, 'file'), - 'mode': outputs_mode + output_target_maps = self.map_targets_to_container( + output_targets, + ) + out_schema = set(output_target_maps.keys()) + # Local file targets can just be mapped. 
+ file_output_common_prefix = None + if 'file' in out_schema: + file_output_common_prefix = output_target_maps['file']['common_prefix'] + mounts[os.path.abspath(output_target_maps['file']['common_prefix'])] = { + 'bind': os.path.join(output_mount_point, 'file'), + 'mode': outputs_mode + } + container_paths.update({ + i: os.path.join(output_mount_point, 'file', rp) + for i, rp in output_target_maps['file']['relpaths'].items() + }) + out_schema.remove('file') + # Handle other schema here using BCW, creating the appropriate UF parameters + for scheme in out_schema: + for identifier in output_target_maps[scheme]['targets']: + container_paths[identifier] = os.path.join( + output_mount_point, + scheme, + output_target_maps[scheme]['relpaths'][identifier] + ) + UF.append("{}::{}".format( + container_paths[identifier], + output_target_maps[scheme]['targets'][identifier].path + )) + + input_target_maps = self.map_targets_to_container( + input_targets + ) + in_schema = set(input_target_maps.keys()) + if 'file' in in_schema: + # Check for the edge case where our common prefix for input and output is the same + if file_output_common_prefix and file_output_common_prefix == input_target_maps['file']['common_prefix']: + # It is! Skip adding a mount for inputs then, and reset our input mountpoint + input_mount_point = output_mount_point + pass + else: # Add our mount + mounts[os.path.abspath(input_target_maps['file']['common_prefix'])] = { + 'bind': os.path.join(input_mount_point, 'file'), + 'mode': inputs_mode } - container_paths.update({ - i: os.path.join(output_mount_point, 'file', rp) - for i, rp in output_target_maps['file']['relpaths'].items() - }) - out_schema.remove('file') - # Handle other schema here using BCW, creating the appropriate UF parameters - for scheme in out_schema: - for identifier in output_target_maps[scheme]['targets']: - container_paths[identifier] = os.path.join( - output_mount_point, - scheme, - output_target_maps[scheme]['relpaths'][identifier] - ) - UF.append("{}::{}".format( - container_paths[identifier], - output_target_maps[scheme]['targets'][identifier].path - )) + container_paths.update({ + i: os.path.join(input_mount_point, 'file', rp) + for i, rp in input_target_maps['file']['relpaths'].items() + }) + in_schema.remove('file') + + # Handle other schema here using BCW, creating the appropriate DF parameters + for scheme in in_schema: + for identifier in input_target_maps[scheme]['targets']: + container_paths[identifier] = os.path.join( + input_mount_point, + scheme, + input_target_maps[scheme]['relpaths'][identifier] + ) + DF.append("{}::{}::{}".format( + input_target_maps[scheme]['targets'][identifier].path, + container_paths[identifier], + inputs_mode, + )) - if len(input_targets) > 0: - input_target_maps = self.map_targets_to_container( - input_targets - ) - in_schema = set(input_target_maps.keys()) - if 'file' in in_schema: - # Check for the edge case where our common prefix for input and output is the same - if file_output_common_prefix and file_output_common_prefix == input_target_maps['file']['common_prefix']: - # It is! 
Skip adding a mount for inputs then, and reset our input mountpoint - input_mount_point = output_mount_point - pass - else: # Add our mount - mounts[os.path.abspath(input_target_maps['file']['common_prefix'])] = { - 'bind': os.path.join(input_mount_point, 'file'), - 'mode': inputs_mode - } - container_paths.update({ - i: os.path.join(input_mount_point, 'file', rp) - for i, rp in input_target_maps['file']['relpaths'].items() - }) - in_schema.remove('file') - - # Handle other schema here using BCW, creating the appropriate DF parameters - for scheme in in_schema: - for identifier in input_target_maps[scheme]['targets']: - container_paths[identifier] = os.path.join( - input_mount_point, - scheme, - input_target_maps[scheme]['relpaths'][identifier] - ) - DF.append("{}::{}::{}".format( - input_target_maps[scheme]['targets'][identifier].path, - container_paths[identifier], - inputs_mode, - )) + # Mount the AWS secrets if we have some AND s3 is in one of our schema + if self.containerinfo.aws_secrets_loc and ('s3' in out_schema or 's3' in in_schema): + mounts[self.containerinfo.aws_secrets_loc] = {'bind': '/root/.aws', 'mode': 'ro'} template_dict = container_paths.copy() template_dict.update(extra_params) From a860c348e8206055f3d6fb45653037f7935c4b1d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 1 May 2018 15:54:48 -0700 Subject: [PATCH 30/88] Effort to get basics of slurm_singularity engine working and share with docker engine code --- sciluigi/containertask.py | 257 +++++++++++++++++++++----------------- 1 file changed, 140 insertions(+), 117 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 98c523d..74ea617 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -48,6 +48,9 @@ class ContainerInfo(): aws_batch_job_queue = None aws_secrets_loc = None + # SLURM specifics + slurm_partition = None + def __init__(self, engine='docker', vcpu=1, @@ -58,7 +61,8 @@ def __init__(self, aws_jobRoleArn='', aws_s3_scratch_loc='', aws_batch_job_queue='', - aws_secrets_loc=os.path.expanduser('~/.aws') + aws_secrets_loc=os.path.expanduser('~/.aws', + slurm_partition='') ): self.engine = engine self.vcpu = vcpu @@ -66,11 +70,14 @@ def __init__(self, self.timeout = timeout self.mounts = mounts self.container_cache = container_cache + self.aws_jobRoleArn = aws_jobRoleArn self.aws_s3_scratch_loc = aws_s3_scratch_loc self.aws_batch_job_queue = aws_batch_job_queue self.aws_secrets_loc = aws_secrets_loc + self.slurm_partition = slurm_partition + def __str__(self): """ Return string of this information @@ -156,10 +163,95 @@ def map_targets_to_container(self, targets): urlsplit(t.path).path ), common_prefix) - for i, t in scheme_targets.items() + for i, t in scheme_targets.items() } return return_dict + def mounts_CP_DF_UF( + self, + input_targets, + output_targets, + inputs_mode, + outputs_mode, + input_mount_point, + output_mount_point): + + container_paths = {} + mounts = self.containerinfo.mounts.copy() + UF = [] + DF = [] + + output_target_maps = self.map_targets_to_container( + output_targets, + ) + out_schema = set(output_target_maps.keys()) + # Local file targets can just be mapped. 
+ file_output_common_prefix = None + if 'file' in out_schema: + file_output_common_prefix = output_target_maps['file']['common_prefix'] + mounts[os.path.abspath(output_target_maps['file']['common_prefix'])] = { + 'bind': os.path.join(output_mount_point, 'file'), + 'mode': outputs_mode + } + container_paths.update({ + i: os.path.join(output_mount_point, 'file', rp) + for i, rp in output_target_maps['file']['relpaths'].items() + }) + out_schema.remove('file') + # Handle other schema here using BCW, creating the appropriate UF parameters + for scheme in out_schema: + for identifier in output_target_maps[scheme]['targets']: + container_paths[identifier] = os.path.join( + output_mount_point, + scheme, + output_target_maps[scheme]['relpaths'][identifier] + ) + UF.append("{}::{}".format( + container_paths[identifier], + output_target_maps[scheme]['targets'][identifier].path + )) + + input_target_maps = self.map_targets_to_container( + input_targets + ) + in_schema = set(input_target_maps.keys()) + if 'file' in in_schema: + # Check for the edge case where our common prefix for input and output is the same + if file_output_common_prefix and file_output_common_prefix == input_target_maps['file']['common_prefix']: + # It is! Skip adding a mount for inputs then, and reset our input mountpoint + input_mount_point = output_mount_point + pass + else: # Add our mount + mounts[os.path.abspath(input_target_maps['file']['common_prefix'])] = { + 'bind': os.path.join(input_mount_point, 'file'), + 'mode': inputs_mode + } + container_paths.update({ + i: os.path.join(input_mount_point, 'file', rp) + for i, rp in input_target_maps['file']['relpaths'].items() + }) + in_schema.remove('file') + + # Handle other schema here using BCW, creating the appropriate DF parameters + for scheme in in_schema: + for identifier in input_target_maps[scheme]['targets']: + container_paths[identifier] = os.path.join( + input_mount_point, + scheme, + input_target_maps[scheme]['relpaths'][identifier] + ) + DF.append("{}::{}::{}".format( + input_target_maps[scheme]['targets'][identifier].path, + container_paths[identifier], + inputs_mode, + )) + + # Mount the AWS secrets if we have some AND s3 is in one of our schema + if self.containerinfo.aws_secrets_loc and ('s3' in out_schema or 's3' in in_schema): + mounts[self.containerinfo.aws_secrets_loc] = {'bind': '/root/.aws', 'mode': 'ro'} + + return (mounts, container_paths, DF, UF) + def make_fs_name(self, uri): uri_list = uri.split('://') if len(uri_list) == 1: @@ -218,44 +310,24 @@ def ex( def ex_singularity_slurm( self, command, - input_paths={}, - output_paths={}, + input_targets={}, + output_targets={}, extra_params={}, inputs_mode='ro', - outputs_mode='rw'): + outputs_mode='rw', + input_mount_point='/mnt/inputs', + output_mount_point='/mnt/outputs'): """ - Run command in the container using singularity, with mountpoints + Run command in the container using singularity on slurm, with mountpoints command is assumed to be in python template substitution format """ - container_paths = {} - mounts = self.containerinfo.mounts.copy() - - if len(output_paths) > 0: - output_host_path_ca, output_container_paths = self.map_paths_to_container( - output_paths, - container_base_path='/mnt/outputs' - ) - container_paths.update(output_container_paths) - mounts[output_host_path_ca] = {'bind': '/mnt/outputs', 'mode': outputs_mode} - - if len(input_paths) > 0: - input_host_path_ca, input_container_paths = self.map_paths_to_container( - input_paths, - container_base_path='/mnt/inputs' - ) - # Handle the 
edge case where the common directory for inputs is equal to the outputs - if len(output_paths) > 0 and (output_host_path_ca == input_host_path_ca): - log.warn("Input and Output host paths the same {}".format(output_host_path_ca)) - # Repeat our mapping, now using the outputs path for both - input_host_path_ca, input_container_paths = self.map_paths_to_container( - input_paths, - container_base_path='/mnt/outputs' - ) - else: # output and input paths different OR there are only input paths - mounts[input_host_path_ca] = {'bind': '/mnt/inputs', 'mode': inputs_mode} - - # No matter what, add our mappings - container_paths.update(input_container_paths) + mounts, container_paths, DF, UF = self.mounts_CP_DF_UF( + input_targets, + output_targets, + inputs_mode, + outputs_mode, + input_mount_point, + output_mount_point) img_location = os.path.join( self.containerinfo.container_cache, @@ -281,7 +353,7 @@ def ex_singularity_slurm( 'pull', '--name', os.path.basename(img_location), - self.container + "docker://{}".format(self.container) ], stdout=subprocess.PIPE, stderr=subprocess.PIPE @@ -290,19 +362,32 @@ def ex_singularity_slurm( # Move back os.chdir(cwd) - command = Template(command).substitute(container_paths) + template_dict = container_paths.copy() + template_dict.update(extra_params) + command = Template(command).substitute(template_dict) + log.info("Attempting to run {} in {}".format( command, self.container )) command_list = [ - 'singularity', 'exec' + 'salloc', + '-c', self.containerinfo.vcpu, + '--mem={}M'.format(self.containerinfo.mem), + '-t', self.containerinfo.timeout, + '-p', self.containerinfo.slurm_partition, + 'singularity', 'exec', '-c', ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] command_list.append(img_location) - command_list += shlex.split(command) + command_list+=['bucket_command_wrapper', '-c', command] + for uf in UF: + command_list+=['-UF', uf] + for df in DF: + command_list+=['-DF', df] + command_proc = subprocess.run( command_list, stdout=subprocess.PIPE, @@ -417,7 +502,7 @@ def ex_aws_batch( # And actually upload to the S3 temp location now if scheme == 'file' or scheme == '': s3_client.upload_file( - Filename=target.path, + Filename=os.path.abspath(target.path), Bucket=urlsplit(s3_temp_loc).netloc, Key=urlsplit(s3_temp_loc).path.strip('/'), ExtraArgs={ @@ -588,18 +673,21 @@ def ex_aws_batch( )) # Implicit else we succeeded # Now we need to copy back from S3 to our local filesystem - for s3_loc, target in needs_s3_download: + for (s3_loc, target) in needs_s3_download: + print(urlsplit(s3_loc).netloc) + print(urlsplit(s3_loc).path.strip('/')) + print(os.path.abspath(target.path)) if target.scheme == 'file': s3_client.download_file( Bucket=urlsplit(s3_loc).netloc, - Key=urlsplit(s3_loc).path.split('/'), - Filename=target.path, + Key=urlsplit(s3_loc).path.strip('/'), + Filename=os.path.abspath(target.path), ) else: with target.open('w') as target_h: s3_client.download_file( Bucket=urlsplit(s3_loc).netloc, - Key=urlsplit(s3_loc).path.split('/'), + Key=urlsplit(s3_loc).path.strip('/'), Fileobj=target_h, ) # Cleanup the temp S3 @@ -626,80 +714,15 @@ def ex_docker( command is assumed to be in python template substitution format """ client = docker.from_env() - container_paths = {} - mounts = self.containerinfo.mounts.copy() - UF = [] - DF = [] - - output_target_maps = self.map_targets_to_container( + + mounts, container_paths, DF, UF = self.mounts_CP_DF_UF( + input_targets, output_targets, - ) - out_schema = 
set(output_target_maps.keys()) - # Local file targets can just be mapped. - file_output_common_prefix = None - if 'file' in out_schema: - file_output_common_prefix = output_target_maps['file']['common_prefix'] - mounts[os.path.abspath(output_target_maps['file']['common_prefix'])] = { - 'bind': os.path.join(output_mount_point, 'file'), - 'mode': outputs_mode - } - container_paths.update({ - i: os.path.join(output_mount_point, 'file', rp) - for i, rp in output_target_maps['file']['relpaths'].items() - }) - out_schema.remove('file') - # Handle other schema here using BCW, creating the appropriate UF parameters - for scheme in out_schema: - for identifier in output_target_maps[scheme]['targets']: - container_paths[identifier] = os.path.join( - output_mount_point, - scheme, - output_target_maps[scheme]['relpaths'][identifier] - ) - UF.append("{}::{}".format( - container_paths[identifier], - output_target_maps[scheme]['targets'][identifier].path - )) - - input_target_maps = self.map_targets_to_container( - input_targets - ) - in_schema = set(input_target_maps.keys()) - if 'file' in in_schema: - # Check for the edge case where our common prefix for input and output is the same - if file_output_common_prefix and file_output_common_prefix == input_target_maps['file']['common_prefix']: - # It is! Skip adding a mount for inputs then, and reset our input mountpoint - input_mount_point = output_mount_point - pass - else: # Add our mount - mounts[os.path.abspath(input_target_maps['file']['common_prefix'])] = { - 'bind': os.path.join(input_mount_point, 'file'), - 'mode': inputs_mode - } - container_paths.update({ - i: os.path.join(input_mount_point, 'file', rp) - for i, rp in input_target_maps['file']['relpaths'].items() - }) - in_schema.remove('file') - - # Handle other schema here using BCW, creating the appropriate DF parameters - for scheme in in_schema: - for identifier in input_target_maps[scheme]['targets']: - container_paths[identifier] = os.path.join( - input_mount_point, - scheme, - input_target_maps[scheme]['relpaths'][identifier] - ) - DF.append("{}::{}::{}".format( - input_target_maps[scheme]['targets'][identifier].path, - container_paths[identifier], - inputs_mode, - )) - - # Mount the AWS secrets if we have some AND s3 is in one of our schema - if self.containerinfo.aws_secrets_loc and ('s3' in out_schema or 's3' in in_schema): - mounts[self.containerinfo.aws_secrets_loc] = {'bind': '/root/.aws', 'mode': 'ro'} - + inputs_mode, + outputs_mode, + input_mount_point, + output_mount_point) + template_dict = container_paths.copy() template_dict.update(extra_params) command = Template(command).substitute(template_dict) From 06ba0258131045fd98ef30fb5fd118074ee0092f Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 1 May 2018 16:04:49 -0700 Subject: [PATCH 31/88] Fixed bug in containerinfo init --- sciluigi/containertask.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 74ea617..c7f1141 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -61,8 +61,8 @@ def __init__(self, aws_jobRoleArn='', aws_s3_scratch_loc='', aws_batch_job_queue='', - aws_secrets_loc=os.path.expanduser('~/.aws', - slurm_partition='') + aws_secrets_loc=os.path.expanduser('~/.aws'), + slurm_partition='' ): self.engine = engine self.vcpu = vcpu @@ -168,13 +168,13 @@ def map_targets_to_container(self, targets): return return_dict def mounts_CP_DF_UF( - self, - input_targets, - output_targets, - 
inputs_mode, - outputs_mode, - input_mount_point, - output_mount_point): + self, + input_targets, + output_targets, + inputs_mode, + outputs_mode, + input_mount_point, + output_mount_point): container_paths = {} mounts = self.containerinfo.mounts.copy() @@ -674,9 +674,6 @@ def ex_aws_batch( # Implicit else we succeeded # Now we need to copy back from S3 to our local filesystem for (s3_loc, target) in needs_s3_download: - print(urlsplit(s3_loc).netloc) - print(urlsplit(s3_loc).path.strip('/')) - print(os.path.abspath(target.path)) if target.scheme == 'file': s3_client.download_file( Bucket=urlsplit(s3_loc).netloc, From b212c62dba6c57c52a1fa8d33ef67f16d6749303 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 1 May 2018 17:56:13 -0700 Subject: [PATCH 32/88] Working on singularity_slurm code --- .gitignore | 0 LICENSE | 0 MANIFEST.in | 0 README.md | 0 README.rst | 0 sciluigi/__init__.py | 0 sciluigi/audit.py | 0 sciluigi/containertask.py | 39 ++++++++++++++++++++++-------------- sciluigi/dependencies.py | 0 sciluigi/interface.py | 0 sciluigi/parameter.py | 0 sciluigi/slurm.py | 0 sciluigi/task.py | 0 sciluigi/util.py | 0 sciluigi/workflow.py | 0 setup.py | 0 test/test_dependencies.py | 0 test/test_paramval.py | 0 tools/.logging.conf.template | 0 tools/init_projdir.py | 0 20 files changed, 24 insertions(+), 15 deletions(-) mode change 100755 => 100644 .gitignore mode change 100755 => 100644 LICENSE mode change 100755 => 100644 MANIFEST.in mode change 100755 => 100644 README.md mode change 100755 => 100644 README.rst mode change 100755 => 100644 sciluigi/__init__.py mode change 100755 => 100644 sciluigi/audit.py mode change 100755 => 100644 sciluigi/dependencies.py mode change 100755 => 100644 sciluigi/interface.py mode change 100755 => 100644 sciluigi/parameter.py mode change 100755 => 100644 sciluigi/slurm.py mode change 100755 => 100644 sciluigi/task.py mode change 100755 => 100644 sciluigi/util.py mode change 100755 => 100644 sciluigi/workflow.py mode change 100755 => 100644 setup.py mode change 100755 => 100644 test/test_dependencies.py mode change 100755 => 100644 test/test_paramval.py mode change 100755 => 100644 tools/.logging.conf.template mode change 100755 => 100644 tools/init_projdir.py diff --git a/.gitignore b/.gitignore old mode 100755 new mode 100644 diff --git a/LICENSE b/LICENSE old mode 100755 new mode 100644 diff --git a/MANIFEST.in b/MANIFEST.in old mode 100755 new mode 100644 diff --git a/README.md b/README.md old mode 100755 new mode 100644 diff --git a/README.rst b/README.rst old mode 100755 new mode 100644 diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py old mode 100755 new mode 100644 diff --git a/sciluigi/audit.py b/sciluigi/audit.py old mode 100755 new mode 100644 diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index c7f1141..2e029b6 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -62,7 +62,7 @@ def __init__(self, aws_s3_scratch_loc='', aws_batch_job_queue='', aws_secrets_loc=os.path.expanduser('~/.aws'), - slurm_partition='' + slurm_partition=None ): self.engine = engine self.vcpu = vcpu @@ -372,27 +372,36 @@ def ex_singularity_slurm( )) command_list = [ - 'salloc', - '-c', self.containerinfo.vcpu, - '--mem={}M'.format(self.containerinfo.mem), - '-t', self.containerinfo.timeout, - '-p', self.containerinfo.slurm_partition, 'singularity', 'exec', '-c', ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] command_list.append(img_location) - 
command_list+=['bucket_command_wrapper', '-c', command] + command_list += ['bucket_command_wrapper', '-c', command] for uf in UF: - command_list+=['-UF', uf] + command_list += ['-UF', uf] for df in DF: - command_list+=['-DF', df] + command_list += ['-DF', df] - command_proc = subprocess.run( - command_list, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) + if not self.containerinfo.slurm_partition: # No slurm partition. Run without slurm + command_proc = subprocess.run( + command_list, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + else: + command_proc = subprocess.run( + [ + 'salloc', + '-c', str(self.containerinfo.vcpu), + '--mem={}M'.format(self.containerinfo.mem), + '-t', str(self.containerinfo.timeout), + '-p', self.containerinfo.slurm_partition, + ]+command_list, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + log.info(command_proc.stdout) if command_proc.stderr: log.warn(command_proc.stderr) @@ -559,7 +568,7 @@ def ex_aws_batch( # Make a UUID based on the container / command job_def_name = "sl_containertask__{}".format( uuid.uuid5( - uuid.NAMESPACE_URL, + uuid.NAMESPACE_URL, self.container+self.containerinfo.aws_jobRoleArn+str(self.containerinfo.mounts) ) ) diff --git a/sciluigi/dependencies.py b/sciluigi/dependencies.py old mode 100755 new mode 100644 diff --git a/sciluigi/interface.py b/sciluigi/interface.py old mode 100755 new mode 100644 diff --git a/sciluigi/parameter.py b/sciluigi/parameter.py old mode 100755 new mode 100644 diff --git a/sciluigi/slurm.py b/sciluigi/slurm.py old mode 100755 new mode 100644 diff --git a/sciluigi/task.py b/sciluigi/task.py old mode 100755 new mode 100644 diff --git a/sciluigi/util.py b/sciluigi/util.py old mode 100755 new mode 100644 diff --git a/sciluigi/workflow.py b/sciluigi/workflow.py old mode 100755 new mode 100644 diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 diff --git a/test/test_dependencies.py b/test/test_dependencies.py old mode 100755 new mode 100644 diff --git a/test/test_paramval.py b/test/test_paramval.py old mode 100755 new mode 100644 diff --git a/tools/.logging.conf.template b/tools/.logging.conf.template old mode 100755 new mode 100644 diff --git a/tools/init_projdir.py b/tools/init_projdir.py old mode 100755 new mode 100644 From 0160a00039df62e344c8e340b3df62bbe3617e11 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 2 May 2018 09:29:46 -0700 Subject: [PATCH 33/88] Generally working slurm-sciluigi version --- sciluigi/containertask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 2e029b6..02dacc9 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -372,7 +372,7 @@ def ex_singularity_slurm( )) command_list = [ - 'singularity', 'exec', '-c', + 'singularity', 'exec', '--contain', '--scratch', '/working/' ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] From 771c5e4e46cb655dd1492dc3f5ed3fb4b3084805 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 2 May 2018 09:36:12 -0700 Subject: [PATCH 34/88] Pre pull commit --- .gitignore | 0 LICENSE | 0 MANIFEST.in | 0 README.md | 0 README.rst | 0 examples/clean.sh | 0 examples/data/a.txt | 0 examples/data/acgt.txt | 0 examples/data/afolder/hej.txt | 0 examples/data/c.txt | 0 examples/data/g.txt | 0 examples/data/t.txt | 0 examples/example1.py | 0 examples/example2_ngi.py | 0 examples/example3_components.py | 0 examples/example3_workflow.py | 0 examples/example4_multiwf.py 
| 0 examples/sciluigi | 1 - sciluigi/__init__.py | 0 sciluigi/audit.py | 0 sciluigi/dependencies.py | 0 sciluigi/interface.py | 0 sciluigi/parameter.py | 0 sciluigi/slurm.py | 0 sciluigi/task.py | 0 sciluigi/util.py | 0 sciluigi/workflow.py | 0 setup.py | 0 test/test_dependencies.py | 0 test/test_paramval.py | 0 tools/.logging.conf.template | 0 tools/init_projdir.py | 0 32 files changed, 1 deletion(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 LICENSE mode change 100644 => 100755 MANIFEST.in mode change 100644 => 100755 README.md mode change 100644 => 100755 README.rst mode change 100644 => 100755 examples/clean.sh mode change 100644 => 100755 examples/data/a.txt mode change 100644 => 100755 examples/data/acgt.txt mode change 100644 => 100755 examples/data/afolder/hej.txt mode change 100644 => 100755 examples/data/c.txt mode change 100644 => 100755 examples/data/g.txt mode change 100644 => 100755 examples/data/t.txt mode change 100644 => 100755 examples/example1.py mode change 100644 => 100755 examples/example2_ngi.py mode change 100644 => 100755 examples/example3_components.py mode change 100644 => 100755 examples/example3_workflow.py mode change 100644 => 100755 examples/example4_multiwf.py delete mode 120000 examples/sciluigi mode change 100644 => 100755 sciluigi/__init__.py mode change 100644 => 100755 sciluigi/audit.py mode change 100644 => 100755 sciluigi/dependencies.py mode change 100644 => 100755 sciluigi/interface.py mode change 100644 => 100755 sciluigi/parameter.py mode change 100644 => 100755 sciluigi/slurm.py mode change 100644 => 100755 sciluigi/task.py mode change 100644 => 100755 sciluigi/util.py mode change 100644 => 100755 sciluigi/workflow.py mode change 100644 => 100755 setup.py mode change 100644 => 100755 test/test_dependencies.py mode change 100644 => 100755 test/test_paramval.py mode change 100644 => 100755 tools/.logging.conf.template mode change 100644 => 100755 tools/init_projdir.py diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/MANIFEST.in b/MANIFEST.in old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/README.rst b/README.rst old mode 100644 new mode 100755 diff --git a/examples/clean.sh b/examples/clean.sh old mode 100644 new mode 100755 diff --git a/examples/data/a.txt b/examples/data/a.txt old mode 100644 new mode 100755 diff --git a/examples/data/acgt.txt b/examples/data/acgt.txt old mode 100644 new mode 100755 diff --git a/examples/data/afolder/hej.txt b/examples/data/afolder/hej.txt old mode 100644 new mode 100755 diff --git a/examples/data/c.txt b/examples/data/c.txt old mode 100644 new mode 100755 diff --git a/examples/data/g.txt b/examples/data/g.txt old mode 100644 new mode 100755 diff --git a/examples/data/t.txt b/examples/data/t.txt old mode 100644 new mode 100755 diff --git a/examples/example1.py b/examples/example1.py old mode 100644 new mode 100755 diff --git a/examples/example2_ngi.py b/examples/example2_ngi.py old mode 100644 new mode 100755 diff --git a/examples/example3_components.py b/examples/example3_components.py old mode 100644 new mode 100755 diff --git a/examples/example3_workflow.py b/examples/example3_workflow.py old mode 100644 new mode 100755 diff --git a/examples/example4_multiwf.py b/examples/example4_multiwf.py old mode 100644 new mode 100755 diff --git a/examples/sciluigi b/examples/sciluigi deleted file mode 120000 index 79eca18..0000000 --- 
a/examples/sciluigi +++ /dev/null @@ -1 +0,0 @@ -../sciluigi \ No newline at end of file diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py old mode 100644 new mode 100755 diff --git a/sciluigi/audit.py b/sciluigi/audit.py old mode 100644 new mode 100755 diff --git a/sciluigi/dependencies.py b/sciluigi/dependencies.py old mode 100644 new mode 100755 diff --git a/sciluigi/interface.py b/sciluigi/interface.py old mode 100644 new mode 100755 diff --git a/sciluigi/parameter.py b/sciluigi/parameter.py old mode 100644 new mode 100755 diff --git a/sciluigi/slurm.py b/sciluigi/slurm.py old mode 100644 new mode 100755 diff --git a/sciluigi/task.py b/sciluigi/task.py old mode 100644 new mode 100755 diff --git a/sciluigi/util.py b/sciluigi/util.py old mode 100644 new mode 100755 diff --git a/sciluigi/workflow.py b/sciluigi/workflow.py old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/test/test_dependencies.py b/test/test_dependencies.py old mode 100644 new mode 100755 diff --git a/test/test_paramval.py b/test/test_paramval.py old mode 100644 new mode 100755 diff --git a/tools/.logging.conf.template b/tools/.logging.conf.template old mode 100644 new mode 100755 diff --git a/tools/init_projdir.py b/tools/init_projdir.py old mode 100644 new mode 100755 From d22e6d6593fef2090923e5920c015aa57a002e82 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 2 May 2018 12:51:50 -0700 Subject: [PATCH 35/88] Working singularity_slurm code --- sciluigi/containertask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 02dacc9..d625342 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -372,7 +372,7 @@ def ex_singularity_slurm( )) command_list = [ - 'singularity', 'exec', '--contain', '--scratch', '/working/' + 'singularity', 'exec', '--contain', '--scratch', '/scratch/' ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] From 56ab1bdde37e21ff5e7d05d5033e60aa0d47bc2a Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 2 May 2018 14:46:13 -0700 Subject: [PATCH 36/88] Continued refinement of slurm_singularity engine --- sciluigi/containertask.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index d625342..4077b72 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -372,7 +372,7 @@ def ex_singularity_slurm( )) command_list = [ - 'singularity', 'exec', '--contain', '--scratch', '/scratch/' + 'singularity', 'exec', '--contain', '--scratch', '/scratch' ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] @@ -401,7 +401,7 @@ def ex_singularity_slurm( stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - + log.info(command_proc.stdout) if command_proc.stderr: log.warn(command_proc.stderr) @@ -418,7 +418,7 @@ def ex_aws_batch( output_mount_point='/mnt/outputs'): """ Run a command in a container using AWS batch. - Handles uploading of files to / from s3 and then into the container. + Handles uploading of files to / from s3 and then into the container. 
Assumes the container has batch_command_wrapper.py """ # From eef05afda1861a203dee4fdf40e4538d5c5df44b Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 3 May 2018 09:20:21 -0700 Subject: [PATCH 37/88] Catch ClientError for AWS Batch API calls --- sciluigi/containertask.py | 97 ++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 37 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index c7f1141..3d5774b 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -10,6 +10,7 @@ import uuid import time import io +from botocore.exceptions import ClientError try: from urlparse import urlsplit, urljoin @@ -565,11 +566,17 @@ def ex_aws_batch( ) # Search to see if this job is ALREADY defined. - job_def_search = batch_client.describe_job_definitions( - maxResults=1, - status='ACTIVE', - jobDefinitionName=job_def_name, - ) + while True: + try: + job_def_search = batch_client.describe_job_definitions( + maxResults=1, + status='ACTIVE', + jobDefinitionName=job_def_name, + ) + break + except ClientError: + log.info("Caught boto3 client error, sleeping for 10 seconds") + time.sleep(10) if len(job_def_search['jobDefinitions']) == 0: # Not registered yet. Register it now log.info( @@ -599,22 +606,28 @@ def ex_aws_batch( 'readOnly': read_only, }) - batch_client.register_job_definition( - jobDefinitionName=job_def_name, - type='container', - containerProperties={ - 'image': self.container, - 'vcpus': 1, - 'memory': 1024, - 'command': shlex.split(command), - 'jobRoleArn': self.containerinfo.aws_jobRoleArn, - 'mountPoints': aws_mountPoints, - 'volumes': aws_volumes - }, - timeout={ - 'attemptDurationSeconds': self.containerinfo.timeout * 60 - } - ) + while True: + try: + batch_client.register_job_definition( + jobDefinitionName=job_def_name, + type='container', + containerProperties={ + 'image': self.container, + 'vcpus': 1, + 'memory': 1024, + 'command': shlex.split(command), + 'jobRoleArn': self.containerinfo.aws_jobRoleArn, + 'mountPoints': aws_mountPoints, + 'volumes': aws_volumes + }, + timeout={ + 'attemptDurationSeconds': self.containerinfo.timeout * 60 + } + ) + break + except ClientError: + log.info("Caught boto3 client error, sleeping for 10 seconds") + time.sleep(10) else: # Already registered aws_job_def = job_def_search['jobDefinitions'][0] log.info('Found job definition for {} with job role {} under name {}'.format( @@ -645,28 +658,38 @@ def ex_aws_batch( ] # Submit the job - job_submission = batch_client.submit_job( - jobName=run_uuid, - jobQueue=self.containerinfo.aws_batch_job_queue, - jobDefinition=job_def_name, - containerOverrides={ - 'vcpus': self.containerinfo.vcpu, - 'memory': self.containerinfo.mem, - 'command': container_command_list, - }, - ) + while True: + try: + job_submission = batch_client.submit_job( + jobName=run_uuid, + jobQueue=self.containerinfo.aws_batch_job_queue, + jobDefinition=job_def_name, + containerOverrides={ + 'vcpus': self.containerinfo.vcpu, + 'memory': self.containerinfo.mem, + 'command': container_command_list, + }, + ) + break + except ClientError: + log.info("Caught boto3 client error, sleeping for 10 seconds") + time.sleep(10) job_submission_id = job_submission.get('jobId') log.info("Running {} under jobId {}".format( container_command_list, job_submission_id )) while True: - job_status = batch_client.describe_jobs( - jobs=[job_submission_id] - ).get('jobs')[0] - if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': - break - time.sleep(10) + try: + job_status 
= batch_client.describe_jobs( + jobs=[job_submission_id] + ).get('jobs')[0] + if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': + break + time.sleep(10) + except ClientError: + log.info("Caught boto3 client error, sleeping for 10 seconds") + time.sleep(10) if job_status.get('status') != 'SUCCEEDED': raise Exception("Batch job failed. {}".format( job_status.get('statusReason') From f617a293b42e1ce722fed50569998944170b3f27 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 3 May 2018 10:07:27 -0700 Subject: [PATCH 38/88] Add MAX_BOTO_TRIES To prevent infinite loops --- sciluigi/containertask.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 3d5774b..13b2006 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -407,7 +407,8 @@ def ex_aws_batch( inputs_mode='ro', outputs_mode='rw', input_mount_point='/mnt/inputs', - output_mount_point='/mnt/outputs'): + output_mount_point='/mnt/outputs', + MAX_BOTO_TRIES=10): """ Run a command in a container using AWS batch. Handles uploading of files to / from s3 and then into the container. @@ -566,7 +567,9 @@ def ex_aws_batch( ) # Search to see if this job is ALREADY defined. - while True: + boto_tries = 0 + while boto_tries < MAX_BOTO_TRIES: + boto_tries += 1 try: job_def_search = batch_client.describe_job_definitions( maxResults=1, @@ -606,7 +609,9 @@ def ex_aws_batch( 'readOnly': read_only, }) - while True: + boto_tries = 0 + while boto_tries < MAX_BOTO_TRIES: + boto_tries += 1 try: batch_client.register_job_definition( jobDefinitionName=job_def_name, @@ -658,7 +663,9 @@ def ex_aws_batch( ] # Submit the job - while True: + boto_tries = 0 + while boto_tries < MAX_BOTO_TRIES: + boto_tries += 1 try: job_submission = batch_client.submit_job( jobName=run_uuid, @@ -680,16 +687,19 @@ def ex_aws_batch( job_submission_id )) while True: - try: - job_status = batch_client.describe_jobs( - jobs=[job_submission_id] - ).get('jobs')[0] - if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': - break - time.sleep(10) - except ClientError: - log.info("Caught boto3 client error, sleeping for 10 seconds") - time.sleep(10) + boto_tries = 0 + while boto_tries < MAX_BOTO_TRIES: + boto_tries += 1 + try: + job_status = batch_client.describe_jobs( + jobs=[job_submission_id] + ).get('jobs')[0] + except ClientError: + log.info("Caught boto3 client error, sleeping for 10 seconds") + time.sleep(10) + if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': + break + time.sleep(10) if job_status.get('status') != 'SUCCEEDED': raise Exception("Batch job failed. 
{}".format( job_status.get('statusReason') From a7e166930d4d8c3f18d4709dd64d8eca39440b6c Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 3 May 2018 12:02:46 -0700 Subject: [PATCH 39/88] Pre pull commit --- .gitignore | 0 LICENSE | 0 MANIFEST.in | 0 README.md | 0 README.rst | 0 examples/clean.sh | 0 examples/data/a.txt | 0 examples/data/acgt.txt | 0 examples/data/afolder/hej.txt | 0 examples/data/c.txt | 0 examples/data/g.txt | 0 examples/data/t.txt | 0 examples/example1.py | 0 examples/example2_ngi.py | 0 examples/example3_components.py | 0 examples/example3_workflow.py | 0 examples/example4_multiwf.py | 0 sciluigi/__init__.py | 0 sciluigi/audit.py | 0 sciluigi/dependencies.py | 0 sciluigi/interface.py | 0 sciluigi/parameter.py | 0 sciluigi/slurm.py | 0 sciluigi/task.py | 0 sciluigi/util.py | 0 sciluigi/workflow.py | 0 setup.py | 0 test/test_dependencies.py | 0 test/test_paramval.py | 0 tools/.logging.conf.template | 0 tools/init_projdir.py | 0 31 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 .gitignore mode change 100755 => 100644 LICENSE mode change 100755 => 100644 MANIFEST.in mode change 100755 => 100644 README.md mode change 100755 => 100644 README.rst mode change 100755 => 100644 examples/clean.sh mode change 100755 => 100644 examples/data/a.txt mode change 100755 => 100644 examples/data/acgt.txt mode change 100755 => 100644 examples/data/afolder/hej.txt mode change 100755 => 100644 examples/data/c.txt mode change 100755 => 100644 examples/data/g.txt mode change 100755 => 100644 examples/data/t.txt mode change 100755 => 100644 examples/example1.py mode change 100755 => 100644 examples/example2_ngi.py mode change 100755 => 100644 examples/example3_components.py mode change 100755 => 100644 examples/example3_workflow.py mode change 100755 => 100644 examples/example4_multiwf.py mode change 100755 => 100644 sciluigi/__init__.py mode change 100755 => 100644 sciluigi/audit.py mode change 100755 => 100644 sciluigi/dependencies.py mode change 100755 => 100644 sciluigi/interface.py mode change 100755 => 100644 sciluigi/parameter.py mode change 100755 => 100644 sciluigi/slurm.py mode change 100755 => 100644 sciluigi/task.py mode change 100755 => 100644 sciluigi/util.py mode change 100755 => 100644 sciluigi/workflow.py mode change 100755 => 100644 setup.py mode change 100755 => 100644 test/test_dependencies.py mode change 100755 => 100644 test/test_paramval.py mode change 100755 => 100644 tools/.logging.conf.template mode change 100755 => 100644 tools/init_projdir.py diff --git a/.gitignore b/.gitignore old mode 100755 new mode 100644 diff --git a/LICENSE b/LICENSE old mode 100755 new mode 100644 diff --git a/MANIFEST.in b/MANIFEST.in old mode 100755 new mode 100644 diff --git a/README.md b/README.md old mode 100755 new mode 100644 diff --git a/README.rst b/README.rst old mode 100755 new mode 100644 diff --git a/examples/clean.sh b/examples/clean.sh old mode 100755 new mode 100644 diff --git a/examples/data/a.txt b/examples/data/a.txt old mode 100755 new mode 100644 diff --git a/examples/data/acgt.txt b/examples/data/acgt.txt old mode 100755 new mode 100644 diff --git a/examples/data/afolder/hej.txt b/examples/data/afolder/hej.txt old mode 100755 new mode 100644 diff --git a/examples/data/c.txt b/examples/data/c.txt old mode 100755 new mode 100644 diff --git a/examples/data/g.txt b/examples/data/g.txt old mode 100755 new mode 100644 diff --git a/examples/data/t.txt b/examples/data/t.txt old mode 100755 new mode 100644 diff --git a/examples/example1.py 
b/examples/example1.py old mode 100755 new mode 100644 diff --git a/examples/example2_ngi.py b/examples/example2_ngi.py old mode 100755 new mode 100644 diff --git a/examples/example3_components.py b/examples/example3_components.py old mode 100755 new mode 100644 diff --git a/examples/example3_workflow.py b/examples/example3_workflow.py old mode 100755 new mode 100644 diff --git a/examples/example4_multiwf.py b/examples/example4_multiwf.py old mode 100755 new mode 100644 diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py old mode 100755 new mode 100644 diff --git a/sciluigi/audit.py b/sciluigi/audit.py old mode 100755 new mode 100644 diff --git a/sciluigi/dependencies.py b/sciluigi/dependencies.py old mode 100755 new mode 100644 diff --git a/sciluigi/interface.py b/sciluigi/interface.py old mode 100755 new mode 100644 diff --git a/sciluigi/parameter.py b/sciluigi/parameter.py old mode 100755 new mode 100644 diff --git a/sciluigi/slurm.py b/sciluigi/slurm.py old mode 100755 new mode 100644 diff --git a/sciluigi/task.py b/sciluigi/task.py old mode 100755 new mode 100644 diff --git a/sciluigi/util.py b/sciluigi/util.py old mode 100755 new mode 100644 diff --git a/sciluigi/workflow.py b/sciluigi/workflow.py old mode 100755 new mode 100644 diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 diff --git a/test/test_dependencies.py b/test/test_dependencies.py old mode 100755 new mode 100644 diff --git a/test/test_paramval.py b/test/test_paramval.py old mode 100755 new mode 100644 diff --git a/tools/.logging.conf.template b/tools/.logging.conf.template old mode 100755 new mode 100644 diff --git a/tools/init_projdir.py b/tools/init_projdir.py old mode 100755 new mode 100644 From b319c4df4393f1bb51c926eff2ec2bc63a92d3bd Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 3 May 2018 12:03:07 -0700 Subject: [PATCH 40/88] Pre-pull commit --- examples/sciluigi | 1 + 1 file changed, 1 insertion(+) create mode 120000 examples/sciluigi diff --git a/examples/sciluigi b/examples/sciluigi new file mode 120000 index 0000000..79eca18 --- /dev/null +++ b/examples/sciluigi @@ -0,0 +1 @@ +../sciluigi \ No newline at end of file From 8bcb241450085ca399a7309a300942d09efd66fe Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 3 May 2018 12:25:32 -0700 Subject: [PATCH 41/88] Switch boto_max_tries to a containerinfo variable --- sciluigi/containertask.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 13b2006..e802d77 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -48,6 +48,7 @@ class ContainerInfo(): aws_s3_scratch_loc = None aws_batch_job_queue = None aws_secrets_loc = None + aws_boto_max_tries = None # SLURM specifics slurm_partition = None @@ -63,6 +64,7 @@ def __init__(self, aws_s3_scratch_loc='', aws_batch_job_queue='', aws_secrets_loc=os.path.expanduser('~/.aws'), + aws_boto_max_tries=10, slurm_partition='' ): self.engine = engine @@ -76,6 +78,7 @@ def __init__(self, self.aws_s3_scratch_loc = aws_s3_scratch_loc self.aws_batch_job_queue = aws_batch_job_queue self.aws_secrets_loc = aws_secrets_loc + self.aws_boto_max_tries = aws_boto_max_tries self.slurm_partition = slurm_partition @@ -407,8 +410,7 @@ def ex_aws_batch( inputs_mode='ro', outputs_mode='rw', input_mount_point='/mnt/inputs', - output_mount_point='/mnt/outputs', - MAX_BOTO_TRIES=10): + output_mount_point='/mnt/outputs'): """ Run a command in a container using AWS batch. 
Handles uploading of files to / from s3 and then into the container. @@ -439,7 +441,7 @@ def ex_aws_batch( UF = set() # Set of UF lines to be added. Format is container_path::bucket_file_uri DF = set() # Set of UF lines to be added. Format is bucket_file_uri::container_path::mode needs_s3_download = set() # Set of Tuples. (s3::/bucket/key, target) - s3_temp_to_be_deleted = set() # S3 paths to be deleted. + s3_temp_to_be_deleted = set() # S3 paths to be deleted. # Group our output targets by schema output_target_maps = self.map_targets_to_container( @@ -568,7 +570,7 @@ def ex_aws_batch( # Search to see if this job is ALREADY defined. boto_tries = 0 - while boto_tries < MAX_BOTO_TRIES: + while boto_tries < self.containerinfo.aws_boto_max_tries: boto_tries += 1 try: job_def_search = batch_client.describe_job_definitions( @@ -610,7 +612,7 @@ def ex_aws_batch( }) boto_tries = 0 - while boto_tries < MAX_BOTO_TRIES: + while boto_tries < self.containerinfo.aws_boto_max_tries: boto_tries += 1 try: batch_client.register_job_definition( @@ -664,7 +666,7 @@ def ex_aws_batch( # Submit the job boto_tries = 0 - while boto_tries < MAX_BOTO_TRIES: + while boto_tries < self.containerinfo.aws_boto_max_tries: boto_tries += 1 try: job_submission = batch_client.submit_job( @@ -687,16 +689,12 @@ def ex_aws_batch( job_submission_id )) while True: - boto_tries = 0 - while boto_tries < MAX_BOTO_TRIES: - boto_tries += 1 - try: - job_status = batch_client.describe_jobs( - jobs=[job_submission_id] - ).get('jobs')[0] - except ClientError: - log.info("Caught boto3 client error, sleeping for 10 seconds") - time.sleep(10) + try: + job_status = batch_client.describe_jobs( + jobs=[job_submission_id] + ).get('jobs')[0] + except ClientError: + log.info("Caught boto3 client error, sleeping for 10 seconds") if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': break time.sleep(10) From 0fbf5dac23f9660da6df6da7c7defb6856c9f3b3 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 3 May 2018 12:53:19 -0700 Subject: [PATCH 42/88] Fixed a minor bug when checking batch job status fails --- sciluigi/containertask.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index c587c0e..cdc2bfb 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -703,7 +703,8 @@ def ex_aws_batch( jobs=[job_submission_id] ).get('jobs')[0] except ClientError: - log.info("Caught boto3 client error, sleeping for 10 seconds") + job_status = {} + log.info("Caught boto3 client error") if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': break time.sleep(10) From 329452f63120f7a3c49221b4938bdca4f0d04d06 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 10 May 2018 14:28:13 -0700 Subject: [PATCH 43/88] Add custom aws batch job name param --- sciluigi/containertask.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index cdc2bfb..7c144a7 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -47,6 +47,7 @@ class ContainerInfo(): aws_jobRoleArn = None aws_s3_scratch_loc = None aws_batch_job_queue = None + aws_batch_job_name = None aws_secrets_loc = None aws_boto_max_tries = None @@ -63,6 +64,7 @@ def __init__(self, aws_jobRoleArn='', aws_s3_scratch_loc='', aws_batch_job_queue='', + aws_batch_job_name=None, aws_secrets_loc=os.path.expanduser('~/.aws'), aws_boto_max_tries=10, slurm_partition=None, @@ -77,6 +79,7 
@@ def __init__(self, self.aws_jobRoleArn = aws_jobRoleArn self.aws_s3_scratch_loc = aws_s3_scratch_loc self.aws_batch_job_queue = aws_batch_job_queue + self.aws_batch_job_name = aws_batch_job_name self.aws_secrets_loc = aws_secrets_loc self.aws_boto_max_tries = aws_boto_max_tries @@ -438,7 +441,13 @@ def ex_aws_batch( batch_client = boto3.client('batch') s3_client = boto3.client('s3') - run_uuid = str(uuid.uuid4()) + if self.containerinfo.aws_batch_job_name is None: + run_uuid = str(uuid.uuid4()) + else: + run_uuid = "{}-{}".format( + self.containerinfo.aws_batch_job_name, + str(uuid.uuid4()) + ) # We need mappings for both to and from S3 and from S3 to within the container # <-> <-> From 3e4f1fe100400cc24a45e55e69a74ccf1bb410db Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 10 May 2018 14:52:49 -0700 Subject: [PATCH 44/88] Change to "_prefix" to match functionality --- sciluigi/containertask.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 7c144a7..a21a77b 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -47,7 +47,7 @@ class ContainerInfo(): aws_jobRoleArn = None aws_s3_scratch_loc = None aws_batch_job_queue = None - aws_batch_job_name = None + aws_batch_job_prefix = None aws_secrets_loc = None aws_boto_max_tries = None @@ -64,7 +64,7 @@ def __init__(self, aws_jobRoleArn='', aws_s3_scratch_loc='', aws_batch_job_queue='', - aws_batch_job_name=None, + aws_batch_job_prefix=None, aws_secrets_loc=os.path.expanduser('~/.aws'), aws_boto_max_tries=10, slurm_partition=None, @@ -79,7 +79,7 @@ def __init__(self, self.aws_jobRoleArn = aws_jobRoleArn self.aws_s3_scratch_loc = aws_s3_scratch_loc self.aws_batch_job_queue = aws_batch_job_queue - self.aws_batch_job_name = aws_batch_job_name + self.aws_batch_job_prefix = aws_batch_job_prefix self.aws_secrets_loc = aws_secrets_loc self.aws_boto_max_tries = aws_boto_max_tries @@ -441,11 +441,11 @@ def ex_aws_batch( batch_client = boto3.client('batch') s3_client = boto3.client('s3') - if self.containerinfo.aws_batch_job_name is None: + if self.containerinfo.aws_batch_job_prefix is None: run_uuid = str(uuid.uuid4()) else: run_uuid = "{}-{}".format( - self.containerinfo.aws_batch_job_name, + self.containerinfo.aws_batch_job_prefix, str(uuid.uuid4()) ) From 3820e9d9aac464057ba241bec727095b33b3078c Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 11 May 2018 16:25:43 -0700 Subject: [PATCH 45/88] Mild changes --- sciluigi/containertask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index a21a77b..8a005e0 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -532,8 +532,8 @@ def ex_aws_batch( } ) else: - # Have to use BytesIO because luigi targets can ONLY be opened in - # binary mode, and upload / download fileobj can ONLY accept binary mode files + # Have to use BytesIO because luigi targets can ONLY be opened in + # text mode, and upload / download fileobj can ONLY accept binary mode files # For reasons. 
s3_client.upload_fileobj( Fileobj=io.BytesIO( From 44dde95724bad39bee4ec504c9707c2090567552 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Mon, 14 May 2018 09:53:59 -0700 Subject: [PATCH 46/88] Fixed bug involving need to make directories when batch -> local FS --- sciluigi/containertask.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 8a005e0..59f0581 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -555,7 +555,9 @@ def ex_aws_batch( container_paths[k], target.path, )) - else: # NOT ending in S3. Will need to download to target and make a temp destination in s3 + else: + # NOT ending in S3. Will need to download to target + # and make a temp destination in s3 for k, target in schema_targets['targets'].items(): s3_temp_loc = os.path.join( self.containerinfo.aws_s3_scratch_loc, @@ -725,6 +727,14 @@ def ex_aws_batch( # Now we need to copy back from S3 to our local filesystem for (s3_loc, target) in needs_s3_download: if target.scheme == 'file': + try: + os.makedirs( + os.path.dirname( + target.path + ) + ) + except FileExistsError: + pass s3_client.download_file( Bucket=urlsplit(s3_loc).netloc, Key=urlsplit(s3_loc).path.strip('/'), From 38942c9b1e35a6e1d2deb8a97ed5a728f96b00b9 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 17 May 2018 12:28:30 -0700 Subject: [PATCH 47/88] More logging for boto3 ClientError --- sciluigi/containertask.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index a21a77b..2dcb0fc 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -597,8 +597,10 @@ def ex_aws_batch( jobDefinitionName=job_def_name, ) break - except ClientError: - log.info("Caught boto3 client error, sleeping for 10 seconds") + except ClientError as e: + log.info("Caught boto3 client error, sleeping for 10 seconds ({})".format( + e.response['Error']['Message'] + )) time.sleep(10) if len(job_def_search['jobDefinitions']) == 0: # Not registered yet. Register it now @@ -650,8 +652,10 @@ def ex_aws_batch( } ) break - except ClientError: - log.info("Caught boto3 client error, sleeping for 10 seconds") + except ClientError as e: + log.info("Caught boto3 client error, sleeping for 10 seconds ({})".format( + e.response['Error']['Message'] + )) time.sleep(10) else: # Already registered aws_job_def = job_def_search['jobDefinitions'][0] @@ -698,8 +702,10 @@ def ex_aws_batch( }, ) break - except ClientError: - log.info("Caught boto3 client error, sleeping for 10 seconds") + except ClientError as e: + log.info("Caught boto3 client error, sleeping for 10 seconds ({})".format( + e.response['Error']['Message'] + )) time.sleep(10) job_submission_id = job_submission.get('jobId') log.info("Running {} under jobId {}".format( @@ -711,7 +717,10 @@ def ex_aws_batch( job_status = batch_client.describe_jobs( jobs=[job_submission_id] ).get('jobs')[0] - except ClientError: + except ClientError as e: + log.info("Caught boto3 client error, sleeping for 10 seconds ({})".format( + e.response['Error']['Message'] + )) job_status = {} log.info("Caught boto3 client error") if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': From 7bedecc5c089899cee47b3311e307bb3943e5a43 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 25 May 2018 09:07:35 -0700 Subject: [PATCH 48/88] Parameterized aws batch job poll time. 
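
The surrounding patches (37 through 48) keep reworking the same idea: wrap each boto3 Batch call in a bounded retry loop that catches ClientError, sleeps, and gives up after a configurable number of attempts, with the sleep interval now parameterized as aws_batch_job_poll_sec. A minimal, self-contained sketch of that pattern follows; retry_boto and the example job-definition name are illustrative only and are not part of the sciluigi API.

```python
import time
import boto3
from botocore.exceptions import ClientError


def retry_boto(fn, max_tries=10, poll_sec=10, **kwargs):
    """Call a boto3 client method, retrying on ClientError up to max_tries times."""
    for attempt in range(1, max_tries + 1):
        try:
            return fn(**kwargs)
        except ClientError as e:
            if attempt == max_tries:
                raise
            # Log and back off before trying again
            print("Caught boto3 client error ({}), sleeping {} seconds".format(
                e.response['Error']['Message'], poll_sec
            ))
            time.sleep(poll_sec)


batch_client = boto3.client('batch')
# e.g. look up an existing job definition without hammering the API
job_def_search = retry_boto(
    batch_client.describe_job_definitions,
    maxResults=1,
    status='ACTIVE',
    jobDefinitionName='sl_containertask__example',  # placeholder name
)
```
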
--- sciluigi/containertask.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 59f0581..24cc94d 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -48,8 +48,10 @@ class ContainerInfo(): aws_s3_scratch_loc = None aws_batch_job_queue = None aws_batch_job_prefix = None + aws_batch_job_pol_sec = None aws_secrets_loc = None aws_boto_max_tries = None + aws_batch_job_poll_sec = None # SLURM specifics slurm_partition = None @@ -65,6 +67,7 @@ def __init__(self, aws_s3_scratch_loc='', aws_batch_job_queue='', aws_batch_job_prefix=None, + aws_batch_job_poll_sec=10, aws_secrets_loc=os.path.expanduser('~/.aws'), aws_boto_max_tries=10, slurm_partition=None, @@ -80,6 +83,7 @@ def __init__(self, self.aws_s3_scratch_loc = aws_s3_scratch_loc self.aws_batch_job_queue = aws_batch_job_queue self.aws_batch_job_prefix = aws_batch_job_prefix + self.aws_batch_job_pol_sec = aws_batch_job_poll_sec self.aws_secrets_loc = aws_secrets_loc self.aws_boto_max_tries = aws_boto_max_tries @@ -601,7 +605,7 @@ def ex_aws_batch( break except ClientError: log.info("Caught boto3 client error, sleeping for 10 seconds") - time.sleep(10) + time.sleep(self.containerinfo.aws_batch_job_poll_sec) if len(job_def_search['jobDefinitions']) == 0: # Not registered yet. Register it now log.info( @@ -654,7 +658,7 @@ def ex_aws_batch( break except ClientError: log.info("Caught boto3 client error, sleeping for 10 seconds") - time.sleep(10) + time.sleep(self.containerinfo.aws_batch_job_poll_sec) else: # Already registered aws_job_def = job_def_search['jobDefinitions'][0] log.info('Found job definition for {} with job role {} under name {}'.format( @@ -702,7 +706,7 @@ def ex_aws_batch( break except ClientError: log.info("Caught boto3 client error, sleeping for 10 seconds") - time.sleep(10) + time.sleep(self.containerinfo.aws_batch_job_poll_sec) job_submission_id = job_submission.get('jobId') log.info("Running {} under jobId {}".format( container_command_list, @@ -718,7 +722,7 @@ def ex_aws_batch( log.info("Caught boto3 client error") if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': break - time.sleep(10) + time.sleep(self.containerinfo.aws_batch_job_poll_sec) if job_status.get('status') != 'SUCCEEDED': raise Exception("Batch job failed. {}".format( job_status.get('statusReason') From 699165b17fb4ed2f33c36c1b7795ba72e1987864 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 25 May 2018 09:19:34 -0700 Subject: [PATCH 49/88] removed extra param --- sciluigi/containertask.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index dbf4bfb..bd5c2b1 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -608,7 +608,7 @@ def ex_aws_batch( log.info("Caught boto3 client error, sleeping for 10 seconds ({})".format( e.response['Error']['Message'] )) - time.sleep(self.containerinfo.aws_batch_job_poll_sec)) + time.sleep(self.containerinfo.aws_batch_job_poll_sec) if len(job_def_search['jobDefinitions']) == 0: # Not registered yet. 
Register it now @@ -665,7 +665,7 @@ def ex_aws_batch( log.info("Caught boto3 client error, sleeping for 10 seconds ({})".format( e.response['Error']['Message'] )) - time.sleep(self.containerinfo.aws_batch_job_poll_sec)) + time.sleep(self.containerinfo.aws_batch_job_poll_sec) else: # Already registered aws_job_def = job_def_search['jobDefinitions'][0] @@ -717,7 +717,7 @@ def ex_aws_batch( log.info("Caught boto3 client error, sleeping for 10 seconds ({})".format( e.response['Error']['Message'] )) - time.sleep(self.containerinfo.aws_batch_job_poll_sec)) + time.sleep(self.containerinfo.aws_batch_job_poll_sec) job_submission_id = job_submission.get('jobId') log.info("Running {} under jobId {}".format( From 27dfc9879a7e6275d49ac71b023251a0525216b8 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 25 May 2018 10:02:02 -0700 Subject: [PATCH 50/88] commonprefix -> commonpath to fix a bug --- sciluigi/containertask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index bd5c2b1..d9eef34 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -158,7 +158,7 @@ def map_targets_to_container(self, targets): return_dict[scheme] = {} # Get only the targets for this scheme scheme_targets = {i: t for i, t in targets.items() if t.scheme == scheme} - common_prefix = os.path.commonprefix( + common_prefix = os.path.commonpath( [os.path.dirname( os.path.join( urlsplit(t.path).netloc, From 611f53f31676f950695bceb38e89d2e4785b4006 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 25 May 2018 15:40:42 -0700 Subject: [PATCH 51/88] Fixed bug in pol vs poll --- sciluigi/containertask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index d9eef34..043e745 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -48,7 +48,7 @@ class ContainerInfo(): aws_s3_scratch_loc = None aws_batch_job_queue = None aws_batch_job_prefix = None - aws_batch_job_pol_sec = None + aws_batch_job_poll_sec = None aws_secrets_loc = None aws_boto_max_tries = None aws_batch_job_poll_sec = None @@ -83,7 +83,7 @@ def __init__(self, self.aws_s3_scratch_loc = aws_s3_scratch_loc self.aws_batch_job_queue = aws_batch_job_queue self.aws_batch_job_prefix = aws_batch_job_prefix - self.aws_batch_job_pol_sec = aws_batch_job_poll_sec + self.aws_batch_job_poll_sec = aws_batch_job_poll_sec self.aws_secrets_loc = aws_secrets_loc self.aws_boto_max_tries = aws_boto_max_tries From 637c5762c851e35dce173bf6cd776afdbaa1c9dd Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 27 Jun 2018 16:28:25 -0700 Subject: [PATCH 52/88] Updated the readme to document how containers work now. --- README.md | 133 ++++++++++++++++++++++++++---------------------------- 1 file changed, 65 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index af2efd8..094b5f3 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,13 @@ ![SciLuigi Logo](http://i.imgur.com/2aMT04J.png) -* ***UPDATE, Nov, 2016: A paper with the motivation and design decisions behind SciLuigi [now available](http://dx.doi.org/10.1186/s13321-016-0179-6)*** - * If you use SciLuigi in your research, please cite it like this:
- Lampa S, Alvarsson J, Spjuth O. Towards agile large-scale predictive modelling in drug discovery with flow-based programming design principles. *J Cheminform*. 2016. doi:[10.1186/s13321-016-0179-6](http://dx.doi.org/10.1186/s13321-016-0179-6). -* ***A Virtual Machine with a realistic, runnable, example workflow in a Jupyter Notebook, is available [here](https://github.com/pharmbio/bioimg-sciluigi-casestudy)*** -* ***Watch a 10 minute screencast going through the basics of using SciLuigi [here](https://www.youtube.com/watch?v=gkKUWskRbjw)*** -* ***See a poster describing the motivations behind SciLuigi [here](http://dx.doi.org/10.13140/RG.2.1.1143.6246)*** +# Scientific Luigi +(SciLuigi for short) is a light-weight wrapper library around [Spotify](http://spotify.com)'s [Luigi](http://github.com/spotify/luigi) workflow system that aims to make writing scientific workflows more fluent, flexible and modular. -Scientific Luigi (SciLuigi for short) is a light-weight wrapper library around [Spotify](http://spotify.com)'s [Luigi](http://github.com/spotify/luigi) -workflow system that aims to make writing scientific workflows more fluent, flexible and -modular. +Luigi is a flexile and fun-to-use library. It has turned out though that its default way of defining dependencies by hard coding them in each task's requires() function is not optimal for some type of workflows common e.g. in bioinformatics where multiple inputs and outputs, complex dependencies, and the need to quickly try different workflow connectivity in an explorative fashion is central to the way of working. -Luigi is a flexile and fun-to-use library. It has turned out though -that its default way of defining dependencies by hard coding them in each task's -requires() function is not optimal for some type of workflows common e.g. in bioinformatics where multiple inputs and outputs, complex dependencies, -and the need to quickly try different workflow connectivity in an explorative fashion is central to the way of working. +Sciluigi can (optionally) complete tasks by running commands in containers. This can improve reproducibility (as a container can be portably run on the cloud, on private clusters, or for lightweight tasks on a users computer via docker) and ease of use (not requiring the end-user of a workflow to install finicky bioinformatics software while avoiding the problem of conflicting dependencies). Sciluigi can facilitate running software that only runs on linux when hosted on a Windows or Macintosh computer, and leverage cloud computing resources (AWS batch). -SciLuigi was designed to solve some of these problems, by providing the following -"features" over vanilla Luigi: +SciLuigi was designed to solve some of these problems, by providing the following "features" over vanilla Luigi: - Separation of dependency definitions from the tasks themselves, for improved modularity and composability. @@ -30,39 +20,21 @@ SciLuigi was designed to solve some of these problems, by providing the followin - Inputs and outputs are connected with an intuitive "single-assignment syntax". - "Good default" high-level logging of workflow tasks and execution times. - Produces an easy to read audit-report with high level information per task. -- Integration with some HPC workload managers. - (So far only [SLURM](http://slurm.schedmd.com/) though). +- Integration with some HPC workload managers, currently AWS batch. +- Integration with cloud-bucket stores (currently AWS S3). 
+- When containers are used, one can prototype and test a task on test data locally + with docker, and then run it on cloud resources (e.g. AWS batch) when confronted + with a large dataset with only a change in a single parameter. Because of Luigi's easy-to-use API these changes have been implemented as a very thin layer on top of luigi's own API with no changes at all to the luigi core, which means that you can continue leveraging the work already being put into maintaining and further developing luigi by the team at Spotify and others. -## Workflow code quick demo - -***For a brief 10 minute screencast going through the basics below, see [this link](https://www.youtube.com/watch?v=gkKUWskRbjw)*** - -Just to give a quick feel for how a workflow definition might look like in SciLuigi, check this code example -(implementation of tasks hidden here for brevity. See Usage section further below for more details): - -```python -import sciluigi as sl - -class MyWorkflow(sl.WorkflowTask): - def workflow(self): - # Initialize tasks: - foowrt = self.new_task('foowriter', MyFooWriter) - foorpl = self.new_task('fooreplacer', MyFooReplacer, - replacement='bar') - - # Here we do the *magic*: Connecting outputs to inputs: - foorpl.in_foo = foowrt.out_foo - - # Return the last task(s) in the workflow chain. - return foorpl -``` - -That's it! And again, see the "usage" section just below for a more detailed description of getting to this! +* ***UPDATE, Nov, 2016: A paper with the motivation and design decisions behind SciLuigi [now available](http://dx.doi.org/10.1186/s13321-016-0179-6)*** + * If you use SciLuigi in your research, please cite it like this:
+ Lampa S, Alvarsson J, Spjuth O. Towards agile large-scale predictive modelling in drug discovery with flow-based programming design principles. *J Cheminform*. 2016. doi:[10.1186/s13321-016-0179-6](http://dx.doi.org/10.1186/s13321-016-0179-6).* +* ***See a poster describing the motivations behind SciLuigi [here](http://dx.doi.org/10.13140/RG.2.1.1143.6246)*** ## Support: Getting help @@ -72,6 +44,8 @@ Please use the [issue queue](https://github.com/pharmbio/sciluigi/issues) for an - Python 2.7 - 3.4 - Luigi 1.3.x - 2.0.1 +- boto3 > 1.7.10 +- docker >= 3.2.1 ## Install @@ -129,31 +103,37 @@ Then, you need to define some tasks that can be done in this workflow. This is done by: -1. Creating a subclass of `sciluigi.Task` (or `sciluigi.SlurmTask` if you want Slurm support) +1. Creating a subclass of `sciluigi.ContainerTask` 2. Adding fields named `in_` for each input, in the new task class -3. Define methods named `out_()` for each output, that return `sciluigi.TargetInfo` objects. (sciluigi.TargetInfo is initialized with a reference to the task object itself - typically `self` - and a path name, where upstream tasks paths can be used). +3. Define methods named `out_()` for each output, that return `sciluigi.ContainerTargetInfo` objects. sciluigi.TargetInfo is initialized with a reference to the task object itself - typically `self` - and an url. ContainerTargets can silently change where they are hosted, including on local filesystems (/path/to/file.txt) or in buckets (s3://bucket/key/file.txt). 4. Define luigi parameters to the task. -5. Implement the `run()` method of the task. +5. Define the container engine and parameters that the container will be run. +6. Implement the `run()` method of the task. #### Example: -Let's define a simple task that just writes "foo" to a file named `foo.txt`: +##### Let's define a simple task that just writes "foo" to a file named `foo.txt`. + +For this very simple task, we do not need a container, and thus we can base the task on the sciluigi.Task class. We do use the sciluigi.ContainerTargetInfo class here. The path/url we gave is for the local filesystem. If instead we gave an S3 bucket/key url (s3://bucket/foo.txt), this class will handle uploading (and later downloading if needed) from S3. ```python class MyFooWriter(sciluigi.Task): # We have no inputs here # Define outputs: def out_foo(self): - return sciluigi.TargetInfo(self, 'foo.txt') + return sciluigi.ContainerTargetInfo(self, 'foo.txt') def run(self): with self.out_foo().open('w') as foofile: foofile.write('foo\n') ``` -Then, let's create a task that replaces "foo" with "bar": +##### Then, let's create a task that replaces "foo" with "bar": + +This task will be run in a container, in this case, the alpine linux container. This way (say if we are running sciluigi on a Windows machine without sed), we can still run the command wihtout fuss. In fact, no matter where this is hosted, the task will reliably run in the docker container the same way. ```python -class MyFooReplacer(sciluigi.Task): +class MyFooReplacer(sciluigi.ContainerTask): + container = 'alpine:3.7' replacement = sciluigi.Parameter() # Here, we take as a parameter # what to replace foo with. 
# Here we have one input, a "foo file": @@ -162,24 +142,27 @@ class MyFooReplacer(sciluigi.Task): def out_replaced(self): # As the path to the returned target(info), we # use the path of the foo file: - return sciluigi.TargetInfo(self, self.in_foo().path + '.bar.txt') + return sciluigi.ContainerTargetInfo(self, self.in_foo().path + '.bar.txt') def run(self): - with self.in_foo().open() as in_f: - with self.out_replaced().open('w') as out_f: - # Here we see that we use the parameter self.replacement: - out_f.write(in_f.read().replace('foo', self.replacement)) + self.ex( + command="sed 's/foo/$repl/g' $infile > $outfile", + input_targets={ + 'infile': self.in_foo(), + }, + output_targets={ + 'outfile': self.out_replaced(), + }, + extra_parameters={ + 'repl': self.replacement, + } + ) ``` +Several things have happened here: -The last lines, we could have instead written using the command-line `sed` utility, available in linux, by calling it on the commandline, with the built-in `ex()` method: - -```python - def run(self): - # Here, we use the in-built self.ex() method, to execute commands: - self.ex("sed 's/foo/{repl}/g' {inpath} > {outpath}".format( - repl=self.replacement, - inpath=self.in_foo().path, - outpath=self.out_replaced().path)) -``` +- We've specified which container the command should be run in. This can be any docker-style URI +- The command now uses the [python string template system](https://docs.python.org/3.5/library/string.html#string.Template) to replace parameters, input and output targets +- We use a ContainerTargetInfo in place of a ContainerTarget. This replacement target takes a URL, and can seemlessly handle +local files, S3 buckets (and in the future SFTP, etc). ### Write the workflow definition @@ -189,7 +172,8 @@ We do this by: 1. Instantiating the tasks, using the `self.new_task(, , *args, **kwargs)` method, of the workflow task. 2. Connect the tasks together, by pointing the right `out_*` method to the right `in_*` field. -3. Returning the last task in the chain, from the workflow method. +3. Giving some basic parameters as to which sort of container engine should be used for the container task via defining a `sciluigi.ContainerInfo` class. +4. Returning the last task in the chain, from the workflow method. #### Example: @@ -197,9 +181,20 @@ We do this by: import sciluigi class MyWorkflow(sciluigi.WorkflowTask): def workflow(self): - foowriter = self.new_task('foowriter', MyFooWriter) - fooreplacer = self.new_task('fooreplacer', MyFooReplacer, - replacement='bar') + foowriter = self.new_task( + 'foowriter', + MyFooWriter + ) + fooreplacer = self.new_task( + 'fooreplacer', + MyFooReplacer, + containerinfo=sciluigi.ContainerInfo( + vcpu=1, + mem=512, + engine='docker', + ), + replacement='bar' + ) # Here we do the *magic*: Connecting outputs to inputs: fooreplacer.in_foo = foowriter.out_foo @@ -269,6 +264,8 @@ If you run into any of these problems, you might be interested in a new workflow Changelog --------- +- 0.9.4.ct + - Support for containerized tasks and `ContainerTargetInfo` - 0.9.3b4 - Support for Python 3 (Thanks to @jeffcjohnson for contributing this!). - Bug fixes. 
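
The README example above only exercises the docker engine. Below is a minimal sketch of how the AWS Batch parameters introduced over the preceding patches could be collected into a single ContainerInfo; the engine string 'aws_batch' and the role ARN, queue, bucket, and prefix values are illustrative assumptions, not values taken from these patches.

```python
import sciluigi

# Sketch only: a ContainerInfo aimed at the AWS Batch engine.
batch_containerinfo = sciluigi.ContainerInfo(
    engine='aws_batch',                      # assumed engine name
    vcpu=4,
    mem=8192,                                # MB
    timeout=120,                             # minutes; ex_aws_batch converts to attemptDurationSeconds
    aws_jobRoleArn='arn:aws:iam::123456789012:role/example-batch-role',
    aws_s3_scratch_loc='s3://example-bucket/scratch/',
    aws_batch_job_queue='example-queue',
    aws_batch_job_prefix='myfooreplacer',    # prepended to the job UUID
    aws_batch_job_poll_sec=10,               # how often to poll job status
    aws_boto_max_tries=10,                   # bounded retries on ClientError
)
```
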
From 004dcd6a469a7f665a37fca2e011681eb29dc621 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 27 Jun 2018 16:32:01 -0700 Subject: [PATCH 53/88] Added dependencies --- README.md | 2 +- setup.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 094b5f3..6c8c9f7 100644 --- a/README.md +++ b/README.md @@ -264,7 +264,7 @@ If you run into any of these problems, you might be interested in a new workflow Changelog --------- -- 0.9.4.ct +- 0.9.6b7_ct - Support for containerized tasks and `ContainerTargetInfo` - 0.9.3b4 - Support for Python 3 (Thanks to @jeffcjohnson for contributing this!). diff --git a/setup.py b/setup.py index 02e3f24..8548728 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ setup( name='sciluigi', - version='0.9.6b7', + version='0.9.6b7_ct', description='Helper library for writing dynamic, flexible workflows in luigi', long_description=long_description, author='Samuel Lampa', @@ -30,7 +30,9 @@ 'sciluigi', ], install_requires=[ - 'luigi' + 'luigi', + 'boto3', + 'mongo', ], classifiers=[ 'Development Status :: 4 - Beta', From 352b63eddcbcb7ff7566e74d8efff2c994e96f5b Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 7 Aug 2018 16:06:33 -0400 Subject: [PATCH 54/88] Permissions changes. setup.py modified to better fit actual dependencies --- LICENSE | 0 MANIFEST.in | 0 README.md | 0 README.rst | 0 examples/clean.sh | 0 examples/data/a.txt | 0 examples/data/acgt.txt | 0 examples/data/afolder/hej.txt | 0 examples/data/c.txt | 0 examples/data/g.txt | 0 examples/data/t.txt | 0 examples/example1.py | 0 examples/example2_ngi.py | 0 examples/example3_components.py | 0 examples/example3_workflow.py | 0 examples/example4_multiwf.py | 0 examples/sciluigi | 1 - sciluigi/__init__.py | 0 sciluigi/audit.py | 0 sciluigi/dependencies.py | 0 sciluigi/interface.py | 0 sciluigi/parameter.py | 0 sciluigi/slurm.py | 0 sciluigi/task.py | 0 sciluigi/util.py | 0 sciluigi/workflow.py | 0 setup.py | 1 + test/test_dependencies.py | 0 test/test_paramval.py | 0 tools/.logging.conf.template | 0 tools/init_projdir.py | 0 31 files changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 LICENSE mode change 100644 => 100755 MANIFEST.in mode change 100644 => 100755 README.md mode change 100644 => 100755 README.rst mode change 100644 => 100755 examples/clean.sh mode change 100644 => 100755 examples/data/a.txt mode change 100644 => 100755 examples/data/acgt.txt mode change 100644 => 100755 examples/data/afolder/hej.txt mode change 100644 => 100755 examples/data/c.txt mode change 100644 => 100755 examples/data/g.txt mode change 100644 => 100755 examples/data/t.txt mode change 100644 => 100755 examples/example1.py mode change 100644 => 100755 examples/example2_ngi.py mode change 100644 => 100755 examples/example3_components.py mode change 100644 => 100755 examples/example3_workflow.py mode change 100644 => 100755 examples/example4_multiwf.py delete mode 120000 examples/sciluigi mode change 100644 => 100755 sciluigi/__init__.py mode change 100644 => 100755 sciluigi/audit.py mode change 100644 => 100755 sciluigi/dependencies.py mode change 100644 => 100755 sciluigi/interface.py mode change 100644 => 100755 sciluigi/parameter.py mode change 100644 => 100755 sciluigi/slurm.py mode change 100644 => 100755 sciluigi/task.py mode change 100644 => 100755 sciluigi/util.py mode change 100644 => 100755 sciluigi/workflow.py mode change 100644 => 100755 setup.py mode change 100644 => 100755 test/test_dependencies.py mode change 100644 => 100755 
test/test_paramval.py mode change 100644 => 100755 tools/.logging.conf.template mode change 100644 => 100755 tools/init_projdir.py diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/MANIFEST.in b/MANIFEST.in old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/README.rst b/README.rst old mode 100644 new mode 100755 diff --git a/examples/clean.sh b/examples/clean.sh old mode 100644 new mode 100755 diff --git a/examples/data/a.txt b/examples/data/a.txt old mode 100644 new mode 100755 diff --git a/examples/data/acgt.txt b/examples/data/acgt.txt old mode 100644 new mode 100755 diff --git a/examples/data/afolder/hej.txt b/examples/data/afolder/hej.txt old mode 100644 new mode 100755 diff --git a/examples/data/c.txt b/examples/data/c.txt old mode 100644 new mode 100755 diff --git a/examples/data/g.txt b/examples/data/g.txt old mode 100644 new mode 100755 diff --git a/examples/data/t.txt b/examples/data/t.txt old mode 100644 new mode 100755 diff --git a/examples/example1.py b/examples/example1.py old mode 100644 new mode 100755 diff --git a/examples/example2_ngi.py b/examples/example2_ngi.py old mode 100644 new mode 100755 diff --git a/examples/example3_components.py b/examples/example3_components.py old mode 100644 new mode 100755 diff --git a/examples/example3_workflow.py b/examples/example3_workflow.py old mode 100644 new mode 100755 diff --git a/examples/example4_multiwf.py b/examples/example4_multiwf.py old mode 100644 new mode 100755 diff --git a/examples/sciluigi b/examples/sciluigi deleted file mode 120000 index 79eca18..0000000 --- a/examples/sciluigi +++ /dev/null @@ -1 +0,0 @@ -../sciluigi \ No newline at end of file diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py old mode 100644 new mode 100755 diff --git a/sciluigi/audit.py b/sciluigi/audit.py old mode 100644 new mode 100755 diff --git a/sciluigi/dependencies.py b/sciluigi/dependencies.py old mode 100644 new mode 100755 diff --git a/sciluigi/interface.py b/sciluigi/interface.py old mode 100644 new mode 100755 diff --git a/sciluigi/parameter.py b/sciluigi/parameter.py old mode 100644 new mode 100755 diff --git a/sciluigi/slurm.py b/sciluigi/slurm.py old mode 100644 new mode 100755 diff --git a/sciluigi/task.py b/sciluigi/task.py old mode 100644 new mode 100755 diff --git a/sciluigi/util.py b/sciluigi/util.py old mode 100644 new mode 100755 diff --git a/sciluigi/workflow.py b/sciluigi/workflow.py old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 8548728..5cdce26 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ 'luigi', 'boto3', 'mongo', + 'docker', ], classifiers=[ 'Development Status :: 4 - Beta', diff --git a/test/test_dependencies.py b/test/test_dependencies.py old mode 100644 new mode 100755 diff --git a/test/test_paramval.py b/test/test_paramval.py old mode 100644 new mode 100755 diff --git a/tools/.logging.conf.template b/tools/.logging.conf.template old mode 100644 new mode 100755 diff --git a/tools/init_projdir.py b/tools/init_projdir.py old mode 100644 new mode 100755 From 5621e16de31c90fe59fe499203bd02ba106725d1 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 15 Aug 2018 14:00:44 -0400 Subject: [PATCH 55/88] Working version for singularity on PBS. 
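
This patch adds a PBS engine that pulls the container into a local Singularity image, writes the wrapped command to a temporary script, and submits it with qsub. A minimal configuration sketch using the parameter names the patch introduces follows; the account, queue, and path values are placeholders.

```python
import sciluigi

# Sketch only: a ContainerInfo for the new 'singularity_pbs' engine.
pbs_containerinfo = sciluigi.ContainerInfo(
    engine='singularity_pbs',
    vcpu=4,
    mem=8192,                                  # MB, passed to qsub as mem=...mb
    timeout=240,                               # minutes; the qsub call multiplies by 60 for walltime
    container_cache='/path/to/singularity_images',  # where pulled .simg images are stored
    pbs_account='example_account',
    pbs_queue='example_queue',
    pbs_scriptpath='/path/to/pbs_scripts',     # directory for the temporary scripts handed to qsub
)
```
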
--- sciluigi/containertask.py | 161 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 043e745..61f95e0 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -5,12 +5,15 @@ import subprocess import docker import os +import stat from string import Template import shlex import uuid import time import io from botocore.exceptions import ClientError +import tempfile +import datetime try: from urlparse import urlsplit, urljoin @@ -52,6 +55,9 @@ class ContainerInfo(): aws_secrets_loc = None aws_boto_max_tries = None aws_batch_job_poll_sec = None + pbs_account = None + pbs_queue = None + pbs_scriptpath = None # SLURM specifics slurm_partition = None @@ -71,6 +77,9 @@ def __init__(self, aws_secrets_loc=os.path.expanduser('~/.aws'), aws_boto_max_tries=10, slurm_partition=None, + pbs_account='', + pbs_queue='', + pbs_scriptpath=None, ): self.engine = engine self.vcpu = vcpu @@ -89,6 +98,10 @@ def __init__(self, self.slurm_partition = slurm_partition + self.pbs_account = pbs_account + self.pbs_queue = pbs_queue + self.pbs_scriptpath = pbs_scriptpath + def __str__(self): """ Return string of this information @@ -200,6 +213,12 @@ def mounts_CP_DF_UF( file_output_common_prefix = None if 'file' in out_schema: file_output_common_prefix = output_target_maps['file']['common_prefix'] + # Be sure the output directory exists + try: + os.makedirs(os.path.abspath(output_target_maps['file']['common_prefix'])) + except FileExistsError: + # No big deal + pass mounts[os.path.abspath(output_target_maps['file']['common_prefix'])] = { 'bind': os.path.join(output_mount_point, 'file'), 'mode': outputs_mode @@ -272,6 +291,18 @@ def make_fs_name(self, uri): keepcharacters = ('.', '_') return "".join(c if (c.isalnum() or c in keepcharacters) else '_' for c in name).rstrip() + def timeout_to_walltime(self): + td = datetime.timedelta(minutes=self.containerinfo.timeout) + hours = td.days * 7 + td.seconds//3600 + if hours > 99: + hours = 99 + minutes = (td.seconds - (td.seconds//3600)*3600) // 60 + seconds = 0 + return "{:02d}:{:02d}:{:02d}".format( + hours, + minutes, + seconds + ) def ex( self, command, @@ -315,9 +346,139 @@ def ex( input_mount_point, output_mount_point ) + elif self.containerinfo.engine == 'singularity_pbs': + return self.ex_singularity_pbs( + command, + input_targets, + output_targets, + extra_params, + inputs_mode, + outputs_mode, + input_mount_point, + output_mount_point + ) else: raise Exception("Container engine {} is invalid".format(self.containerinfo.engine)) + def ex_singularity_pbs( + self, + command, + input_targets={}, + output_targets={}, + extra_params={}, + inputs_mode='ro', + outputs_mode='rw', + input_mount_point='/mnt/inputs', + output_mount_point='/mnt/outputs'): + """ + Run command in the container using singularity on slurm, with mountpoints + command is assumed to be in python template substitution format + """ + mounts, container_paths, DF, UF = self.mounts_CP_DF_UF( + input_targets, + output_targets, + inputs_mode, + outputs_mode, + input_mount_point, + output_mount_point) + + img_location = os.path.join( + self.containerinfo.container_cache, + "{}.singularity.simg".format(self.make_fs_name(self.container)) + ) + log.info("Looking for singularity image {}".format(img_location)) + if not os.path.exists(img_location): + log.info("No image at {} Creating....".format(img_location)) + try: + os.makedirs(os.path.dirname(img_location)) + except FileExistsError: + # No big deal + 
pass + # Singularity is dumb and can only pull images to the working dir + # So, get our current working dir. + cwd = os.getcwd() + # Move to our target dir + os.chdir(os.path.dirname(img_location)) + # Attempt to pull our image + pull_proc = subprocess.run( + [ + 'singularity', + 'pull', + '--name', + os.path.basename(img_location), + "docker://{}".format(self.container) + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + log.info(pull_proc) + # Move back + os.chdir(cwd) + + template_dict = container_paths.copy() + template_dict.update(extra_params) + command = Template(command).substitute(template_dict) + + log.info( + "Attempting to run {} in {}".format( + command, + self.container + ) + ) + + command_list = [ + 'singularity', 'exec', '--contain', + ] + for mp in mounts: + command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] + command_list.append(img_location) + command_list += ['bucket_command_wrapper', '-c', shlex.quote(command)] + for uf in UF: + command_list += ['-UF', uf] + for df in DF: + command_list += ['-DF', df] + + # Write the command to a script for PBS / QSUB to consume + + with tempfile.NamedTemporaryFile( + mode='wt', + dir=self.containerinfo.pbs_scriptpath, + delete=False) as script_h: + # Make executable, readable, and writable by owner + os.chmod( + script_h.name, + stat.S_IRUSR | + stat.S_IWUSR | + stat.S_IXUSR + ) + script_h.write("!#/bin/bash\n") + script_h.write(" ".join(command_list)) + script_h.close() + command_proc = subprocess.run( + [ + 'qsub', + '-I', + '-x', + '-V', + '-A', self.containerinfo.pbs_account, + '-q', self.containerinfo.pbs_queue, + '-l', + 'nodes={}:ppn={},mem={}mb,walltime={}'.format( + 1, + self.containerinfo.vcpu, + int(self.containerinfo.mem), + self.containerinfo.timeout * 60 + ), + script_h.name + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + os.unlink(script_h.name) + log.info(command_proc.stdout) + if command_proc.stderr: + log.warn(command_proc.stderr) + def ex_singularity_slurm( self, command, From 8f08f62153da82111c1ca55d7405130dd223914d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 28 Aug 2018 10:04:00 -0400 Subject: [PATCH 56/88] Many changes to optimize with PBS-singularity --- sciluigi/containertask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 61f95e0..7910ee5 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -451,7 +451,7 @@ def ex_singularity_pbs( stat.S_IWUSR | stat.S_IXUSR ) - script_h.write("!#/bin/bash\n") + script_h.write("#!/bin/bash\n") script_h.write(" ".join(command_list)) script_h.close() command_proc = subprocess.run( From 0ad11acc56b69f6c8cbe631caaba32214ef6797e Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 5 Sep 2018 14:26:18 -0400 Subject: [PATCH 57/88] Fixes for PBS --- sciluigi/containertask.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 7910ee5..f18c25a 100755 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -425,9 +425,9 @@ def ex_singularity_pbs( self.container ) ) - + working_dir = tempfile.mkdtemp() command_list = [ - 'singularity', 'exec', '--contain', + 'singularity', 'exec', '--contain', '--workdir', working_dir ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] @@ -530,7 +530,7 @@ def ex_singularity_slurm( stdout=subprocess.PIPE, stderr=subprocess.PIPE ) - 
print(pull_proc) + log.info(pull_proc) # Move back os.chdir(cwd) From feab1e63777c3381c1700358287191910bf9af31 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 6 Sep 2018 12:04:34 -0400 Subject: [PATCH 58/88] Added example for ContainerTask as well as an example Dockerfile to make a container --- examples/Dockerfile | 20 +++ examples/example-containertask-1.py | 209 ++++++++++++++++++++++++++++ sciluigi/containertask.py | 4 +- 3 files changed, 231 insertions(+), 2 deletions(-) create mode 100644 examples/Dockerfile create mode 100755 examples/example-containertask-1.py mode change 100755 => 100644 sciluigi/containertask.py diff --git a/examples/Dockerfile b/examples/Dockerfile new file mode 100644 index 0000000..d07c024 --- /dev/null +++ b/examples/Dockerfile @@ -0,0 +1,20 @@ +# sciluigi-example +# +# VERSION 0.1.0__bcw.0.3.0 + +FROM ubuntu:16.04 +# Create some mount points in the container for use by bucket-command-wrapper +RUN mkdir -p /mnt/inputs/file && mkdir -p /mnt/outputs/file && mkdir /scratch && mkdir /working +# Install at least python3 (used by BCW). It's OK to change the specific version of python3 used. +RUN apt-get update && apt-get install -y \ +python3>=3.5.1-3 \ +python3-pip>=3.5.1-3 +# Since we are ONLY installing python3 link to it to make it the default python +RUN ln -s /usr/bin/python3 /usr/bin/python +# Install bucket_command_wrapper via pip, along with boto3 / awscli if we want to use AWS at all +RUN pip3 install \ +awscli>=1.15.14 \ +boto3>=1.7.14 \ +bucket_command_wrapper==0.3.0 + +# Feel free to make this more useful by installing software, etc diff --git a/examples/example-containertask-1.py b/examples/example-containertask-1.py new file mode 100755 index 0000000..68217b6 --- /dev/null +++ b/examples/example-containertask-1.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 + +import logging +import luigi +import sciluigi as sl +import argparse +import os +from subprocess import call + +log = logging.getLogger('sciluigi-interface') + +# ------------------------------------------------------------------------ +# Workflow class(es) +# ------------------------------------------------------------------------ + + +class MyWorkflow(sl.WorkflowTask): + # Here are some parameters to define how we want to run our container + engine = sl.Parameter() + aws_secrets_loc = sl.Parameter() + # Only when using AWS_batch + jobRoleArn = sl.Parameter(default="") + s3_scratch_loc = sl.Parameter(default="") + batch_job_queue = sl.Parameter(default="") + + def workflow(self): + rawdata = self.new_task('rawdata', RawData) + + # Run first without a container + atot = self.new_task( + 'atot', + AToT) + atot.in_data = rawdata.out_rawdata + + # And now in a container! + # To run in a container, we have to + # specify which engine, and parameters we need + # This is done through a ContainerInfo class + # We will initialize via the parameters we recieved + # from the command line + test_containerinfo = sl.ContainerInfo( + engine=self.engine, + vcpu=1, # Number of vCPU to request + mem=256, # Memory in MB + timeout=5, # time in minutes + aws_secrets_loc=self.aws_secrets_loc, + aws_jobRoleArn=self.jobRoleArn, + aws_s3_scratch_loc=self.s3_scratch_loc, + aws_batch_job_queue=self.batch_job_queue, + ) + # Now actually start the task. 
+ # Note: This allows different instances of the same task + # to use different engines, queues, etc as needed + atot_in_container = self.new_task( + 'atot_in_container', + AToT_ContainerTask, + containerinfo=test_containerinfo, + ) + atot_in_container.in_data = rawdata.out_rawdata + return (atot, atot_in_container) + +# ------------------------------------------------------------------------ +# Task classes +# ------------------------------------------------------------------------ + + +class RawData(sl.ExternalTask): + # It's perfectly fine to combine local, external and container tasks + # all in one workflow. + def out_rawdata(self): + return sl.ContainerTargetInfo(self, 'data/acgt.txt') + + +class AToT(sl.Task): + # Here is the non-containerized version of this task + in_data = None + + def out_replatot(self): + return sl.TargetInfo(self, self.in_data().path + '.atot') + + # ------------------------------------------------ + + def run(self): + cmd = 'cat ' + self.in_data().path + ' | sed "s/A/T/g" > ' + self.out_replatot().path + log.info("COMMAND TO EXECUTE: " + cmd) + call(cmd, shell=True) + + +class AToT_ContainerTask(sl.ContainerTask): + # Here is the containerized version of this task. + # In this simple example, there isn't much advantage of running in a container + # But when dealing with specialized software (requiring complex and brittle dependencies) + # or with heavy tasks needing big harware to run, there is an advantage. + + # ALL ContainerTasks must specify which container is to be used + container = 'golob/sciluigi-example:0.1.0__bcw.0.3.0' + + # Dependencies (inputs) are the same as in a non-containerized task + in_data = None + + def out_replatot(self): + # ContainerTargetInfo will take care of shifting files to and from + # cloud providers (S3 at this time) and your local filesystems + return sl.ContainerTargetInfo(self, self.in_data().path + '.container.atot') + + # ------------------------------------------------ + + def run(self): + # ContainerTasks use the python string template system to handle inputs and outputs + # Same command as above, but with template placeholders $inFile for in and $outFile + cmd = 'cat $inFile | sed "s/A/T/g" > $outFile' + self.ex( + command=cmd, + input_targets={ # A dictionary with the key being the template placeholder + 'inFile': self.in_data(), # Value is a ContainerTarget + }, + output_targets={ + 'outFile': self.out_replatot() # Same drill for outputs + }, + extra_params={} # Optional dict of other placeholders to fill in the command string + ) + + +# Run this file as script +# ------------------------------------------------------------------------ + +if __name__ == '__main__': + # Depending on the container engine used, you must specify some basic settings + # This can be done (as here) via CLI settings, a config file, or even hard-wired + # into scripts. + + parser = argparse.ArgumentParser(description=""" + Containertask example for sciluigi""") + subparsers = parser.add_subparsers( + help='Which container engine to use', + dest='engine', + required=True, + ) + # If we are going to shuffle to-from AWS-S3 we need to know secrets + parser.add_argument( + '--aws-secrets-loc', + help="""Where are the AWS secrets located""", + default=os.path.expanduser('~/.aws'), + metavar='~/.aws' + ) + # The simplest case is docker; all one needs for docker is to have it installed. 
+ docker_parser = subparsers.add_parser('docker') + + # AWS-batch has a few options that must be specified to work + # Including which account, queue, and a directory to store temporary scripts. + aws_parser = subparsers.add_parser("aws_batch") + aws_parser.add_argument( + '--jobRoleArn', + help="""Job role to use when submitting to batch""", + required=True, + metavar='arn:aws:iam::12345:role/somerole' + ) + aws_parser.add_argument( + '--s3-scratch-loc', + help="""Temporary S3 location to transiently keep input/output files. + format: s3://bucket/key/prefix/""", + required=True, + metavar='s3://bucket/key/prefix/to/temp/loc/' + ) + aws_parser.add_argument( + '--batch-job-queue', + help="""To which batch queue should the jobs be submitted?""", + required=True, + metavar='some_queue_name' + ) + # PBS has a few options that must be specified to work + # Including which account, queue, and distinct from AWS, + # a directory to store temporary scripts AND singularity containers; + # these directories must be on a shared file system visible to nodes + pbs_parser = subparsers.add_parser("pbs") + pbs_parser.add_argument( + '--container_cache', '-cc', + help="""Location to store temporary singularity containers for pbs / slurm. + Must be on a shared file system to work properly.""", + required=True, + ) + pbs_parser.add_argument( + '--account', + help="""Account to use for PBS job submission""", + required=True, + ) + pbs_parser.add_argument( + '--queue', + help="""Into which PBS queue should the jobs be submitted.""", + required=True, + ) + pbs_parser.add_argument( + '--scriptpath', + help="""Location on a shared file system to store temporary scripts""", + required=True, + ) + + args = parser.parse_args() + # Extract these parameters to the arguments for our workflow + args_list = [ + "--{}={}".format(k.replace('_', '-'), v) + for k, v in vars(args).items() + ] + print(args_list) + sl.run( + local_scheduler=True, + main_task_cls=MyWorkflow, + cmdline_args=args_list, + ) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py old mode 100755 new mode 100644 index f18c25a..250eb2a --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -335,7 +335,7 @@ def ex( input_mount_point, output_mount_point ) - elif self.containerinfo.engine == 'singularity_slurm': + elif self.containerinfo.engine == 'slurm': return self.ex_singularity_slurm( command, input_targets, @@ -346,7 +346,7 @@ def ex( input_mount_point, output_mount_point ) - elif self.containerinfo.engine == 'singularity_pbs': + elif self.containerinfo.engine == 'pbs': return self.ex_singularity_pbs( command, input_targets, From 81e99c386e1694d3d61cdff0ce200d2b2bea1743 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 18 Sep 2018 12:16:36 -0400 Subject: [PATCH 59/88] Working example plus new class to poll AWS --- examples/example-containertask-1.py | 12 ++++- sciluigi/AWSBatchTaskWatcher.py | 77 +++++++++++++++++++++++++++++ sciluigi/containertask.py | 2 - 3 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 sciluigi/AWSBatchTaskWatcher.py diff --git a/examples/example-containertask-1.py b/examples/example-containertask-1.py index 68217b6..5a33ecc 100755 --- a/examples/example-containertask-1.py +++ b/examples/example-containertask-1.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +# This is an example of how the ContainerTask and ContainerTargetInfo +# classes extend sciluigi, and allow commands to be run in containers +# seamlessly on different container engines / HPC systems +# Start, ironically, at the 
bottom and work your way up! + import logging import luigi import sciluigi as sl @@ -91,6 +96,7 @@ class AToT_ContainerTask(sl.ContainerTask): # In this simple example, there isn't much advantage of running in a container # But when dealing with specialized software (requiring complex and brittle dependencies) # or with heavy tasks needing big harware to run, there is an advantage. + # This task will run identically locally via docker, on AWS, or via PBS. # ALL ContainerTasks must specify which container is to be used container = 'golob/sciluigi-example:0.1.0__bcw.0.3.0' @@ -107,7 +113,9 @@ def out_replatot(self): def run(self): # ContainerTasks use the python string template system to handle inputs and outputs - # Same command as above, but with template placeholders $inFile for in and $outFile + # Same command as above, but with template placeholders $inFile for in and $outFile. + # This often works out neater than the more complex string combinations as above + # in the non-containerized task. cmd = 'cat $inFile | sed "s/A/T/g" > $outFile' self.ex( command=cmd, @@ -201,7 +209,7 @@ def run(self): "--{}={}".format(k.replace('_', '-'), v) for k, v in vars(args).items() ] - print(args_list) + sl.run( local_scheduler=True, main_task_cls=MyWorkflow, diff --git a/sciluigi/AWSBatchTaskWatcher.py b/sciluigi/AWSBatchTaskWatcher.py new file mode 100644 index 0000000..97121d5 --- /dev/null +++ b/sciluigi/AWSBatchTaskWatcher.py @@ -0,0 +1,77 @@ +# Class to monitor and wait on AWS Batch Jobs + +import boto3 +import multiprocessing as mp +import time +import logging + + +class AWSBatchTaskWatcher(): + COMPLETED_JOB_STATES = { + 'SUCCEEDED', + 'FAILED', + 'DOESNOTEXIST' + } + POLLING_DELAY_SEC = 10 + JOB_WAIT_SECS = 1 + + def pollJobState(self): + while True: + jobIDs_needing_update = [ + jID for jID, state in self.jobStateDict.items() + if state not in self.COMPLETED_JOB_STATES + ] + if len(jobIDs_needing_update) > 0: + update_result = self.batch_client.describe_jobs( + jobs=jobIDs_needing_update + ) + update_result_jobs = update_result.get('jobs', []) + updated_job_status = { + j['jobId']: j['status'] + for j in update_result_jobs + } + jobIdsWithoutResult = list(set(jobIDs_needing_update) - set(updated_job_status.keys())) + updated_job_status.update({ + jID: "DOESNOTEXIST" + for jID in jobIdsWithoutResult + }) + self.jobStateDict.update(updated_job_status) + + time.sleep(self.POLLING_DELAY_SEC) + + def waitOnJob(self, jobID): + # Works by adding this jobID to the dict if it does not exist + if jobID not in self.jobStateDict: + self.log.info("Adding jobId {} to our list".format(jobID)) + self.jobStateDict[jobID] = None + # And then waiting for the polling child process to update the job status + while self.jobStateDict[jobID] not in self.COMPLETED_JOB_STATES: + time.sleep(self.JOB_WAIT_SECS) + # Implicitly our job has reached a completed state + if self.jobStateDict[jobID] == 'DOESNOTEXIST': + self.log.warning("JobID {} did not exist on batch".format(jobID)) + + def __init__( + self, + session_options={}): + # Logging first: + self.log = logging.getLogger('AWSBatchTaskWatcher') + self.log.setLevel(logging.INFO) + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + self.log.addHandler(console_handler) + # BOTO3 / Batch client + self.session = boto3.session(session_options) + self.batch_client = self.session.client( + 'batch' + ) + # Use the multiprocessing manager to create a job state dict + # that can safely be shared among processes + self.manager = mp.Manager() + 
self.jobStateDict = self.manager.dict() + # Start a child process to poll batch for job status + self.jobStatePoller = mp.Process(target=self.pollJobState) + + def __del__(self): + # Explicitly stop the polling process when this class is destroyed. + self.jobStatePoller.terminate() diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 250eb2a..774a35d 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -630,7 +630,6 @@ def ex_aws_batch( output_target_maps = self.map_targets_to_container( output_targets, ) - out_schema = set(output_target_maps.keys()) # Make our container paths for schema, schema_targets in output_target_maps.items(): for k, relpath in schema_targets['relpaths'].items(): @@ -644,7 +643,6 @@ def ex_aws_batch( input_target_maps = self.map_targets_to_container( input_targets, ) - in_schema = set(input_target_maps.keys()) # Make our container paths for schema, schema_targets in input_target_maps.items(): for k, relpath in schema_targets['relpaths'].items(): From d63f558e190d8a367d28fcc2749e157c44efddaa Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 18 Sep 2018 13:01:24 -0400 Subject: [PATCH 60/88] Working with a few bits of fuss batch poller --- examples/example-containertask-1.py | 2 +- sciluigi/AWSBatchTaskWatcher.py | 59 +++++++++++++++++++---------- sciluigi/containertask.py | 35 ++++++++--------- 3 files changed, 58 insertions(+), 38 deletions(-) diff --git a/examples/example-containertask-1.py b/examples/example-containertask-1.py index 5a33ecc..ae1d451 100755 --- a/examples/example-containertask-1.py +++ b/examples/example-containertask-1.py @@ -115,7 +115,7 @@ def run(self): # ContainerTasks use the python string template system to handle inputs and outputs # Same command as above, but with template placeholders $inFile for in and $outFile. # This often works out neater than the more complex string combinations as above - # in the non-containerized task. + # in the non-containerized task. cmd = 'cat $inFile | sed "s/A/T/g" > $outFile' self.ex( command=cmd, diff --git a/sciluigi/AWSBatchTaskWatcher.py b/sciluigi/AWSBatchTaskWatcher.py index 97121d5..becc25e 100644 --- a/sciluigi/AWSBatchTaskWatcher.py +++ b/sciluigi/AWSBatchTaskWatcher.py @@ -10,6 +10,8 @@ class AWSBatchTaskWatcher(): COMPLETED_JOB_STATES = { 'SUCCEEDED', 'FAILED', + # A state I've added for jobs that no longer exist on batch. + # This can be for older jobs whose status is deleted from AWS 'DOESNOTEXIST' } POLLING_DELAY_SEC = 10 @@ -17,12 +19,18 @@ class AWSBatchTaskWatcher(): def pollJobState(self): while True: + self.__log__.debug("Poll tick. 
{} jobs".format( + len(self.__jobStateDict__)) + ) jobIDs_needing_update = [ - jID for jID, state in self.jobStateDict.items() + jID for jID, state in self.__jobStateDict__.items() if state not in self.COMPLETED_JOB_STATES ] if len(jobIDs_needing_update) > 0: - update_result = self.batch_client.describe_jobs( + self.__log__.info("Polling AWS about {} jobs".format( + len(jobIDs_needing_update)) + ) + update_result = self.__batch_client__.describe_jobs( jobs=jobIDs_needing_update ) update_result_jobs = update_result.get('jobs', []) @@ -35,43 +43,54 @@ def pollJobState(self): jID: "DOESNOTEXIST" for jID in jobIdsWithoutResult }) - self.jobStateDict.update(updated_job_status) + self.__jobStateDict__.update(updated_job_status) time.sleep(self.POLLING_DELAY_SEC) def waitOnJob(self, jobID): # Works by adding this jobID to the dict if it does not exist - if jobID not in self.jobStateDict: - self.log.info("Adding jobId {} to our list".format(jobID)) - self.jobStateDict[jobID] = None + if jobID not in self.__jobStateDict__: + self.__log__.info("Adding jobId {} to our list".format(jobID)) + self.__jobStateDict__[jobID] = None # And then waiting for the polling child process to update the job status - while self.jobStateDict[jobID] not in self.COMPLETED_JOB_STATES: + while self.__jobStateDict__[jobID] not in self.COMPLETED_JOB_STATES: + self.__log__.debug("Still waiting on {}".format(jobID)) time.sleep(self.JOB_WAIT_SECS) # Implicitly our job has reached a completed state - if self.jobStateDict[jobID] == 'DOESNOTEXIST': - self.log.warning("JobID {} did not exist on batch".format(jobID)) + self.__log__.info("JobID {} returned with status {}".format( + jobID, + self.__jobStateDict__[jobID] + )) + if self.__jobStateDict__[jobID] == 'DOESNOTEXIST': + self.__log__.warning("JobID {} did not exist on batch".format(jobID)) + return self.__jobStateDict__[jobID] def __init__( self, - session_options={}): + session_options={}, + debug=False): # Logging first: - self.log = logging.getLogger('AWSBatchTaskWatcher') - self.log.setLevel(logging.INFO) + self.__log__ = logging.getLogger('AWSBatchTaskWatcher') + self.__log__.setLevel(logging.INFO) console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - self.log.addHandler(console_handler) + if debug: + console_handler.setLevel(logging.DEBUG) + else: + console_handler.setLevel(logging.INFO) + self.__log__.addHandler(console_handler) # BOTO3 / Batch client - self.session = boto3.session(session_options) - self.batch_client = self.session.client( + self.__session__ = boto3.Session(session_options) + self.__batch_client__ = self.__session__.client( 'batch' ) # Use the multiprocessing manager to create a job state dict # that can safely be shared among processes - self.manager = mp.Manager() - self.jobStateDict = self.manager.dict() + self.__manager__ = mp.Manager() + self.__jobStateDict__ = self.__manager__.dict() # Start a child process to poll batch for job status - self.jobStatePoller = mp.Process(target=self.pollJobState) + self.__jobStatePoller__ = mp.Process(target=self.pollJobState) + self.__jobStatePoller__.start() def __del__(self): # Explicitly stop the polling process when this class is destroyed. 
- self.jobStatePoller.terminate() + self.__jobStatePoller__.terminate() diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 774a35d..6394bcc 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -23,6 +23,10 @@ # Setup logging log = logging.getLogger('sciluigi-interface') +# Create a holder variable for an AWSBatchTaskWatcher +# So we can ONLY load if needed +batch_task_watcher = None + class ContainerInfo(): """ @@ -605,6 +609,11 @@ def ex_aws_batch( import boto3 batch_client = boto3.client('batch') s3_client = boto3.client('s3') + # And batch_task_watcher if not already done + global batch_task_watcher + if batch_task_watcher is None: + from sciluigi.AWSBatchTaskWatcher import AWSBatchTaskWatcher + batch_task_watcher = AWSBatchTaskWatcher() if self.containerinfo.aws_batch_job_prefix is None: run_uuid = str(uuid.uuid4()) @@ -883,24 +892,16 @@ def ex_aws_batch( container_command_list, job_submission_id )) - while True: - try: - job_status = batch_client.describe_jobs( - jobs=[job_submission_id] - ).get('jobs')[0] - except ClientError as e: - log.info("Caught boto3 client error, sleeping for 10 seconds ({})".format( - e.response['Error']['Message'] - )) - job_status = {} - log.info("Caught boto3 client error") - if job_status.get('status') == 'SUCCEEDED' or job_status.get('status') == 'FAILED': - break - time.sleep(self.containerinfo.aws_batch_job_poll_sec) - if job_status.get('status') != 'SUCCEEDED': - raise Exception("Batch job failed. {}".format( - job_status.get('statusReason') + # Wait for the job here + job_final_status = batch_task_watcher.waitOnJob( + job_submission_id + ) + if job_final_status != 'SUCCEEDED': + log.error("Job {} failed with status".format( + job_submission_id, + job_final_status )) + return # Implicit else we succeeded # Now we need to copy back from S3 to our local filesystem for (s3_loc, target) in needs_s3_download: From 6e9d3bf62e19e09555e4ff6383270dbb0f86cbad Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 18 Sep 2018 13:08:14 -0400 Subject: [PATCH 61/88] Working consolidation of polling for job status on AWS --- sciluigi/AWSBatchTaskWatcher.py | 57 ++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/sciluigi/AWSBatchTaskWatcher.py b/sciluigi/AWSBatchTaskWatcher.py index becc25e..278ac98 100644 --- a/sciluigi/AWSBatchTaskWatcher.py +++ b/sciluigi/AWSBatchTaskWatcher.py @@ -19,38 +19,43 @@ class AWSBatchTaskWatcher(): def pollJobState(self): while True: - self.__log__.debug("Poll tick. {} jobs".format( - len(self.__jobStateDict__)) - ) - jobIDs_needing_update = [ - jID for jID, state in self.__jobStateDict__.items() - if state not in self.COMPLETED_JOB_STATES - ] - if len(jobIDs_needing_update) > 0: - self.__log__.info("Polling AWS about {} jobs".format( - len(jobIDs_needing_update)) + try: + self.__log__.debug("Poll tick. 
{} jobs".format( + len(self.__jobStateDict__)) ) - update_result = self.__batch_client__.describe_jobs( - jobs=jobIDs_needing_update - ) - update_result_jobs = update_result.get('jobs', []) - updated_job_status = { - j['jobId']: j['status'] - for j in update_result_jobs - } - jobIdsWithoutResult = list(set(jobIDs_needing_update) - set(updated_job_status.keys())) - updated_job_status.update({ - jID: "DOESNOTEXIST" - for jID in jobIdsWithoutResult - }) - self.__jobStateDict__.update(updated_job_status) + jobIDs_needing_update = [ + jID for jID, state in self.__jobStateDict__.items() + if state not in self.COMPLETED_JOB_STATES + ] + if len(jobIDs_needing_update) > 0: + self.__log__.info("Polling AWS about {} jobs".format( + len(jobIDs_needing_update)) + ) + update_result = self.__batch_client__.describe_jobs( + jobs=jobIDs_needing_update + ) + update_result_jobs = update_result.get('jobs', []) + updated_job_status = { + j['jobId']: j['status'] + for j in update_result_jobs + } + jobIdsWithoutResult = list(set(jobIDs_needing_update) - set(updated_job_status.keys())) + updated_job_status.update({ + jID: "DOESNOTEXIST" + for jID in jobIdsWithoutResult + }) + self.__jobStateDict__.update(updated_job_status) - time.sleep(self.POLLING_DELAY_SEC) + time.sleep(self.POLLING_DELAY_SEC) + except BrokenPipeError: + # Handle if the calling process ends, destroying the manager. + # We should terminate too + return def waitOnJob(self, jobID): # Works by adding this jobID to the dict if it does not exist if jobID not in self.__jobStateDict__: - self.__log__.info("Adding jobId {} to our list".format(jobID)) + self.__log__.info("Adding jobId {} to our watch list".format(jobID)) self.__jobStateDict__[jobID] = None # And then waiting for the polling child process to update the job status while self.__jobStateDict__[jobID] not in self.COMPLETED_JOB_STATES: From a6d288ec77e5c88b2778aa7b9347c6138271c981 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 28 Sep 2018 16:51:46 -0400 Subject: [PATCH 62/88] Added ability to read containerinfo from a config file --- sciluigi/containertask.py | 85 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 6394bcc..2e9b940 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -14,6 +14,7 @@ from botocore.exceptions import ClientError import tempfile import datetime +import configparser try: from urlparse import urlsplit, urljoin @@ -106,6 +107,90 @@ def __init__(self, self.pbs_queue = pbs_queue self.pbs_scriptpath = pbs_scriptpath + # Method to allow population from a config file + # Sparing the user from having to repeat this + def from_config( + self, + configfile_path=os.path.expanduser('~/.sciluigi/containerinfo.ini'), + section='DEFAULT'): + config = configparser.ConfigParser() + if not os.path.exists(configfile_path): + log.error( + """Could not find a sciluigi configuration file at {}""".format( + configfile_path) + ) + return + # Implicit else + config.read(configfile_path) + if section not in config.sections(): + log.error( + """Section {} not found in the sciluigi configuration file at {}""".format( + section, + configfile_path + ) + ) + return + # Implicit else, override values if the config value is not a blank string + config_values = config[section] + if config_values['engine'] != "": + self.engine = config_values['engine'] + if config_values['vcpu'] != "": + try: + self.vcpu = int(config_values['vcpu']) + except ValueError: + log.error("Could not 
convert vcpu {} to int".format(config_values['vcpu'])) + if config_values['mem'] != "": + try: + self.mem = int(config_values['mem']) + except ValueError: + log.error("Could not convert mem {} to int".format(config_values['mem'])) + if config_values['timeout'] != "": + try: + self.timeout = int(config_values['timeout']) + except ValueError: + log.error("Could not convert timeout {} to int".format(config_values['timeout'])) + + if config_values['container_cache'] != "": + self.container_cache = config_values['container_cache'] + + if config_values['aws_jobRoleArn'] != "": + self.aws_jobRoleArn = config_values['aws_jobRoleArn'] + if config_values['aws_s3_scratch_loc'] != "": + self.aws_s3_scratch_loc = config_values['aws_s3_scratch_loc'] + if config_values['aws_batch_job_queue'] != "": + self.aws_batch_job_queue = config_values['aws_batch_job_queue'] + if config_values['aws_batch_job_prefix'] != "": + self.aws_batch_job_prefix = config_values['aws_batch_job_prefix'] + if config_values['aws_batch_job_poll_sec'] != "": + try: + self.aws_batch_job_poll_sec = int(config_values['aws_batch_job_poll_sec']) + except ValueError: + log.error("Could not convert batch poll time of {} to int".format( + config_values['aws_batch_job_poll_sec']) + ) + if config_values['aws_secrets_loc'] != "": + self.aws_secrets_loc = config_values['aws_secrets_loc'] + + if config_values['aws_boto_max_tries'] != "": + try: + self.aws_boto_max_tries = int(config_values['aws_boto_max_tries']) + except ValueError: + log.error("Could not convert boto max tries {} to int".format( + config_values['aws_boto_max_tries']) + ) + + if config_values['slurm_partition'] != "": + self.slurm_partition = config_values['slurm_partition'] + + if config_values['pbs_account'] != "": + self.pbs_account = config_values['pbs_account'] + + if config_values['pbs_queue'] != "": + self.pbs_queue = config_values['pbs_queue'] + + if config_values['pbs_scriptpath'] != "": + self.pbs_scriptpath = config_values['pbs_scriptpath'] + def __str__(self): """ Return string of this information From 8a1ace873c2c0c93e06bd150d63d93ed002c94de Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 7 Nov 2018 11:10:08 -0500 Subject: [PATCH 63/88] Fix of singularity run commands --- sciluigi/containertask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 2e9b940..307841f 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -516,7 +516,7 @@ def ex_singularity_pbs( ) working_dir = tempfile.mkdtemp() command_list = [ - 'singularity', 'exec', '--contain', '--workdir', working_dir + 'singularity', 'exec', '--contain', '-e', '--workdir', working_dir ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] @@ -633,7 +633,7 @@ def ex_singularity_slurm( )) command_list = [ - 'singularity', 'exec', '--contain', '--scratch', '/scratch' + 'singularity', 'exec', '-e','--contain', '--scratch', '/scratch' ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] From 75d6b765e3ac85d4c26a2dabfe8c19d4a0403f5d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 15 Nov 2018 16:01:54 -0500 Subject: [PATCH 64/88] Corrected loading of mounts from ini --- sciluigi/containertask.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 307841f..df1f855 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ 
-150,6 +150,12 @@ def from_config( except ValueError: log.error("Could not convert timeout {} to int".format(config_values['timeout'])) + if config_values['mounts'] != "": + try: + json.loads(config_values['mounts']) + except ValueError: + log.error("Could not convert {} to a dict".format(config_values['mounts'])) + if config_values['container_cache'] != "": self.container_cache = config_values['container_cache'] From 906445c36ebc4118caf2d91fd0fc15db5e341b6c Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Mon, 19 Nov 2018 11:01:01 -0500 Subject: [PATCH 65/88] Reading of config from ini more robust --- sciluigi/containertask.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index df1f855..fbd3a55 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -132,52 +132,52 @@ def from_config( return # Implicit else, override values if the config value is not a blank string config_values = config[section] - if config_values['engine'] != "": + if config_values.get('engine', "") != "": self.engine = config_values['engine'] - if config_values['vcpu'] != "": + if config_values.get('vcpu', "") != "": try: self.vcpu = int(config_values['vcpu']) except ValueError: log.error("Could not convert vcpu {} to int".format(config_values['vcpu'])) - if config_values['mem'] != "": + if config_values.get('mem', "") != "": try: self.mem = int(config_values['mem']) except ValueError: log.error("Could not convert mem {} to int".format(config_values['mem'])) - if config_values['timeout'] != "": + if config_values.get('timeout', "") != "": try: self.timeout = int(config_values['timeout']) except ValueError: log.error("Could not convert timeout {} to int".format(config_values['timeout'])) - if config_values['mounts'] != "": + if config_values.get('mounts',"") != "": try: json.loads(config_values['mounts']) except ValueError: log.error("Could not convert {} to a dict".format(config_values['mounts'])) - if config_values['container_cache'] != "": + if config_values.get('container_cache', "") != "": self.container_cache = config_values['container_cache'] - if config_values['aws_jobRoleArn'] != "": + if config_values.get('aws_jobRoleArn', "") != "": self.aws_jobRoleArn = config_values['aws_jobRoleArn'] - if config_values['aws_s3_scratch_loc'] != "": + if config_values.get('aws_s3_scratch_loc', "") != "": self.aws_s3_scratch_loc = config_values['aws_s3_scratch_loc'] - if config_values['aws_batch_job_queue'] != "": + if config_values.get('aws_batch_job_queue', "") != "": self.aws_batch_job_queue = config_values['aws_batch_job_queue'] - if config_values['aws_batch_job_prefix'] != "": + if config_values.get('aws_batch_job_prefix', "") != "": self.aws_batch_job_prefix = config_values['aws_batch_job_prefix'] - if config_values['aws_batch_job_poll_sec'] != "": + if config_values.get('aws_batch_job_poll_sec', "") != "": try: self.aws_batch_job_poll_sec = int(config_values['aws_batch_job_poll_sec']) except ValueError: log.error("Could not convert batch poll time of {} to int".format( config_values['aws_batch_job_poll_sec']) ) - if config_values['aws_secrets_loc'] != "": + if config_values('aws_secrets_loc', "") != "": self.aws_secrets_loc = config_values['aws_secrets_loc'] - if config_values['aws_boto_max_tries'] != "": + if config_values.get('aws_boto_max_tries', "") != "": try: self.aws_boto_max_tries = int(config_values['aws_boto_max_tries']) except ValueError: @@ -185,16 +185,16 @@ def from_config( 
config_values['aws_boto_max_tries']) ) - if config_values['slurm_partition'] != "": + if config_values.get('slurm_partition', "") != "": self.slurm_partition = config_values['slurm_partition'] - if config_values['pbs_account'] != "": + if config_values.get('pbs_account', "") != "": self.pbs_account = config_values['pbs_account'] - if config_values['pbs_queue'] != "": + if config_values.get('pbs_queue', "") != "": self.pbs_queue = config_values['pbs_queue'] - if config_values['pbs_scriptpath'] != "": + if config_values.get('pbs_scriptpath', "") != "": self.pbs_scriptpath = config_values['pbs_scriptpath'] def __str__(self): From 4e03a248d40394e419e352fece90ae8727408d4e Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Mon, 19 Nov 2018 11:13:03 -0500 Subject: [PATCH 66/88] Fixed bug in readconfig --- sciluigi/containertask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index fbd3a55..727e377 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -174,7 +174,7 @@ def from_config( log.error("Could not convert batch poll time of {} to int".format( config_values['aws_batch_job_poll_sec']) ) - if config_values('aws_secrets_loc', "") != "": + if config_values.get('aws_secrets_loc', "") != "": self.aws_secrets_loc = config_values['aws_secrets_loc'] if config_values.get('aws_boto_max_tries', "") != "": From 438c91be8f3d759ae02a2f8c8030476977f5812f Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 20 Nov 2018 12:24:29 -0500 Subject: [PATCH 67/88] Some debugging of aws mounts --- sciluigi/containertask.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 727e377..8c62ea1 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -150,7 +150,7 @@ def from_config( except ValueError: log.error("Could not convert timeout {} to int".format(config_values['timeout'])) - if config_values.get('mounts',"") != "": + if config_values.get('mounts', "") != "": try: json.loads(config_values['mounts']) except ValueError: @@ -176,7 +176,7 @@ def from_config( ) if config_values.get('aws_secrets_loc', "") != "": self.aws_secrets_loc = config_values['aws_secrets_loc'] - + if config_values.get('aws_boto_max_tries', "") != "": try: self.aws_boto_max_tries = int(config_values['aws_boto_max_tries']) @@ -845,11 +845,11 @@ def ex_aws_batch( # Make a UUID based on the container / command job_def_name = "sl_containertask__{}".format( - uuid.uuid5( - uuid.NAMESPACE_URL, - self.container+self.containerinfo.aws_jobRoleArn+str(self.containerinfo.mounts) - ) + uuid.uuid5( + uuid.NAMESPACE_URL, + self.container + self.containerinfo.aws_jobRoleArn + str(self.containerinfo.mounts) ) + ) # Search to see if this job is ALREADY defined. 
boto_tries = 0 @@ -897,7 +897,8 @@ def ex_aws_batch( 'sourceVolume': name, 'readOnly': read_only, }) - + log.info("AWS Volumes: {}".format(str(aws_volumes))) + log.info("AWS mounts: {}".format(str(aws_mountPoints))) boto_tries = 0 while boto_tries < self.containerinfo.aws_boto_max_tries: boto_tries += 1 From eef7eae0a2dbaf7fc1a032249eb291f5e13f8ea4 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 20 Nov 2018 12:27:21 -0500 Subject: [PATCH 68/88] Switch job def --- sciluigi/containertask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 8c62ea1..6f8c7cb 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -847,7 +847,7 @@ def ex_aws_batch( job_def_name = "sl_containertask__{}".format( uuid.uuid5( uuid.NAMESPACE_URL, - self.container + self.containerinfo.aws_jobRoleArn + str(self.containerinfo.mounts) + self.container + str(self.containerinfo.mounts) + self.containerinfo.aws_jobRoleArn ) ) From 6d906ff774471d0e2bec3d9cb6524262da53e13d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 20 Nov 2018 12:31:44 -0500 Subject: [PATCH 69/88] Fixed loading of mounts from config --- sciluigi/containertask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 6f8c7cb..736949d 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -152,7 +152,7 @@ def from_config( if config_values.get('mounts', "") != "": try: - json.loads(config_values['mounts']) + self.mounts = json.loads(config_values['mounts']) except ValueError: log.error("Could not convert {} to a dict".format(config_values['mounts'])) From 99a477a0f685e2090f02bde0495f9be97bc03fe6 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 21 Nov 2018 09:53:10 -0500 Subject: [PATCH 70/88] A bit more logging around task watcher --- sciluigi/containertask.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 736949d..2d0e0cb 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -703,8 +703,11 @@ def ex_aws_batch( # And batch_task_watcher if not already done global batch_task_watcher if batch_task_watcher is None: + log.info("Creating new batch task watcher") from sciluigi.AWSBatchTaskWatcher import AWSBatchTaskWatcher batch_task_watcher = AWSBatchTaskWatcher() + else: + log.info("Using existing batch task watcher") if self.containerinfo.aws_batch_job_prefix is None: run_uuid = str(uuid.uuid4()) From e1ff25b89f3311cbf7fb9966400b5de0cac6b6e5 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 21 Nov 2018 10:05:55 -0500 Subject: [PATCH 71/88] Moved batch_task_watcher global holder to __init__.py --- sciluigi/__init__.py | 4 ++++ sciluigi/containertask.py | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py index e81acf3..5939086 100755 --- a/sciluigi/__init__.py +++ b/sciluigi/__init__.py @@ -49,3 +49,7 @@ from sciluigi.containertask import ContainerInfo from sciluigi.containertask import ContainerTask from sciluigi.containertask import ContainerHelpers + +# Create a holder variable for an AWSBatchTaskWatcher +# So we can ONLY load if needed +batch_task_watcher = None diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 2d0e0cb..ae6b53d 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -24,10 +24,6 @@ # Setup logging log = logging.getLogger('sciluigi-interface') 
-# Create a holder variable for an AWSBatchTaskWatcher -# So we can ONLY load if needed -batch_task_watcher = None - class ContainerInfo(): """ From 94bf2062232c73fff8b8cb4cb9f3467c971c3c36 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 21 Nov 2018 10:18:44 -0500 Subject: [PATCH 72/88] Moved global --- sciluigi/__init__.py | 3 --- sciluigi/interface.py | 7 +++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py index 5939086..97aaace 100755 --- a/sciluigi/__init__.py +++ b/sciluigi/__init__.py @@ -50,6 +50,3 @@ from sciluigi.containertask import ContainerTask from sciluigi.containertask import ContainerHelpers -# Create a holder variable for an AWSBatchTaskWatcher -# So we can ONLY load if needed -batch_task_watcher = None diff --git a/sciluigi/interface.py b/sciluigi/interface.py index cca0532..7a5d6dc 100755 --- a/sciluigi/interface.py +++ b/sciluigi/interface.py @@ -11,6 +11,7 @@ LOGFMT_SCILUIGI = '%(asctime)s %(levelname)8s SCILUIGI %(message)s' DATEFMT = '%Y-%m-%d %H:%M:%S' + def setup_logging(): ''' Set up SciLuigi specific logging @@ -51,12 +52,18 @@ def setup_logging(): setup_logging() +# Create a holder variable for an AWSBatchTaskWatcher +# So we can ONLY load if needed +batch_task_watcher = None + + def run(*args, **kwargs): ''' Forwarding luigi's run method ''' luigi.run(*args, **kwargs) + def run_local(*args, **kwargs): ''' Forwarding luigi's run method, with local scheduler From 59269c0300b832eefd020155eb6350d36c1bd8ca Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Wed, 21 Nov 2018 07:44:54 -0800 Subject: [PATCH 73/88] Moved batch watcher to module global --- sciluigi/__init__.py | 4 ++++ sciluigi/containertask.py | 8 +++++--- sciluigi/interface.py | 4 ---- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py index 97aaace..ed18385 100755 --- a/sciluigi/__init__.py +++ b/sciluigi/__init__.py @@ -50,3 +50,7 @@ from sciluigi.containertask import ContainerTask from sciluigi.containertask import ContainerHelpers +# Create a holder variable for an AWSBatchTaskWatcher +# So we can ONLY load if needed +batch_task_watcher = None + diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index ae6b53d..bc635c9 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -697,13 +697,15 @@ def ex_aws_batch( batch_client = boto3.client('batch') s3_client = boto3.client('s3') # And batch_task_watcher if not already done - global batch_task_watcher - if batch_task_watcher is None: + + if sciluigi.batch_task_watcher is None: log.info("Creating new batch task watcher") from sciluigi.AWSBatchTaskWatcher import AWSBatchTaskWatcher - batch_task_watcher = AWSBatchTaskWatcher() + sciluigi.batch_task_watcher = AWSBatchTaskWatcher() + batch_task_watcher = sciluigi.batch_task_watcher else: log.info("Using existing batch task watcher") + batch_task_watcher = sciluigi.batch_task_watcher if self.containerinfo.aws_batch_job_prefix is None: run_uuid = str(uuid.uuid4()) diff --git a/sciluigi/interface.py b/sciluigi/interface.py index 7a5d6dc..206f7c5 100755 --- a/sciluigi/interface.py +++ b/sciluigi/interface.py @@ -52,10 +52,6 @@ def setup_logging(): setup_logging() -# Create a holder variable for an AWSBatchTaskWatcher -# So we can ONLY load if needed -batch_task_watcher = None - def run(*args, **kwargs): ''' From 0dc340ec83095186787b8700833522074ec90421 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 27 Nov 2018 08:49:17 -0800 Subject: [PATCH 
74/88] Change to loading of batch task watcher --- sciluigi/AWSBatchTaskWatcher.py | 2 +- sciluigi/__init__.py | 7 +++++-- sciluigi/containertask.py | 12 ++---------- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/sciluigi/AWSBatchTaskWatcher.py b/sciluigi/AWSBatchTaskWatcher.py index 278ac98..db0214f 100644 --- a/sciluigi/AWSBatchTaskWatcher.py +++ b/sciluigi/AWSBatchTaskWatcher.py @@ -28,7 +28,7 @@ def pollJobState(self): if state not in self.COMPLETED_JOB_STATES ] if len(jobIDs_needing_update) > 0: - self.__log__.info("Polling AWS about {} jobs".format( + self.__log__.debug("Polling AWS about {} jobs".format( len(jobIDs_needing_update)) ) update_result = self.__batch_client__.describe_jobs( diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py index ed18385..96076b4 100755 --- a/sciluigi/__init__.py +++ b/sciluigi/__init__.py @@ -49,8 +49,11 @@ from sciluigi.containertask import ContainerInfo from sciluigi.containertask import ContainerTask from sciluigi.containertask import ContainerHelpers +from sciluigi.AWSBatchTaskWatcher import AWSBatchTaskWatcher -# Create a holder variable for an AWSBatchTaskWatcher -# So we can ONLY load if needed batch_task_watcher = None +batch_task_watcher = AWSBatchTaskWatcher() + +def getBatchTaskWatcher(): + return batch_task_watcher diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index bc635c9..b58a30f 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -696,16 +696,8 @@ def ex_aws_batch( import boto3 batch_client = boto3.client('batch') s3_client = boto3.client('s3') - # And batch_task_watcher if not already done - - if sciluigi.batch_task_watcher is None: - log.info("Creating new batch task watcher") - from sciluigi.AWSBatchTaskWatcher import AWSBatchTaskWatcher - sciluigi.batch_task_watcher = AWSBatchTaskWatcher() - batch_task_watcher = sciluigi.batch_task_watcher - else: - log.info("Using existing batch task watcher") - batch_task_watcher = sciluigi.batch_task_watcher + # And batch_task_watcher from module + batch_task_watcher = sciluigi.getBatchTaskWatcher() if self.containerinfo.aws_batch_job_prefix is None: run_uuid = str(uuid.uuid4()) From 957fa210d9d0db6a25453db87f11c3f370e76caf Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 27 Nov 2018 09:03:02 -0800 Subject: [PATCH 75/88] Made import a bit more robust in broken AWS environments --- sciluigi/AWSBatchTaskWatcher.py | 2 +- sciluigi/__init__.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sciluigi/AWSBatchTaskWatcher.py b/sciluigi/AWSBatchTaskWatcher.py index db0214f..278ac98 100644 --- a/sciluigi/AWSBatchTaskWatcher.py +++ b/sciluigi/AWSBatchTaskWatcher.py @@ -28,7 +28,7 @@ def pollJobState(self): if state not in self.COMPLETED_JOB_STATES ] if len(jobIDs_needing_update) > 0: - self.__log__.debug("Polling AWS about {} jobs".format( + self.__log__.info("Polling AWS about {} jobs".format( len(jobIDs_needing_update)) ) update_result = self.__batch_client__.describe_jobs( diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py index 96076b4..eb7523d 100755 --- a/sciluigi/__init__.py +++ b/sciluigi/__init__.py @@ -49,11 +49,16 @@ from sciluigi.containertask import ContainerInfo from sciluigi.containertask import ContainerTask from sciluigi.containertask import ContainerHelpers -from sciluigi.AWSBatchTaskWatcher import AWSBatchTaskWatcher -batch_task_watcher = None -batch_task_watcher = AWSBatchTaskWatcher() +from sciluigi.AWSBatchTaskWatcher import AWSBatchTaskWatcher +try: + batch_task_watcher = 
AWSBatchTaskWatcher() +except: + batch_task_watcher = None def getBatchTaskWatcher(): + global batch_task_watcher + if batch_task_watcher is None: + raise NotImplementedError return batch_task_watcher From 59ac40e63c0b379308e09b3890b98e0711ce8fac Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 27 Nov 2018 10:09:33 -0800 Subject: [PATCH 76/88] Switch to debug for task count --- sciluigi/AWSBatchTaskWatcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sciluigi/AWSBatchTaskWatcher.py b/sciluigi/AWSBatchTaskWatcher.py index 278ac98..db0214f 100644 --- a/sciluigi/AWSBatchTaskWatcher.py +++ b/sciluigi/AWSBatchTaskWatcher.py @@ -28,7 +28,7 @@ def pollJobState(self): if state not in self.COMPLETED_JOB_STATES ] if len(jobIDs_needing_update) > 0: - self.__log__.info("Polling AWS about {} jobs".format( + self.__log__.debug("Polling AWS about {} jobs".format( len(jobIDs_needing_update)) ) update_result = self.__batch_client__.describe_jobs( From 8b0e400f8f9d8dbcb7dc83ebe25aef9ead474df5 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 27 Nov 2018 11:54:16 -0800 Subject: [PATCH 77/88] back to info for polling --- sciluigi/containertask.py | 93 ++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 45 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index b58a30f..246fa65 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -267,19 +267,19 @@ def map_targets_to_container(self, targets): os.path.join( urlsplit(t.path).netloc, urlsplit(t.path).path - ) + ) ) for t in scheme_targets.values()]) return_dict[scheme]['common_prefix'] = common_prefix return_dict[scheme]['targets'] = scheme_targets return_dict[scheme]['relpaths'] = { - i: os.path.relpath( - os.path.join( - urlsplit(t.path).netloc, - urlsplit(t.path).path - ), - common_prefix) - for i, t in scheme_targets.items() - } + i: os.path.relpath( + os.path.join( + urlsplit(t.path).netloc, + urlsplit(t.path).path + ), + common_prefix) + for i, t in scheme_targets.items() + } return return_dict def mounts_CP_DF_UF( @@ -326,7 +326,7 @@ def mounts_CP_DF_UF( output_mount_point, scheme, output_target_maps[scheme]['relpaths'][identifier] - ) + ) UF.append("{}::{}".format( container_paths[identifier], output_target_maps[scheme]['targets'][identifier].path @@ -360,7 +360,7 @@ def mounts_CP_DF_UF( input_mount_point, scheme, input_target_maps[scheme]['relpaths'][identifier] - ) + ) DF.append("{}::{}::{}".format( input_target_maps[scheme]['targets'][identifier].path, container_paths[identifier], @@ -384,16 +384,17 @@ def make_fs_name(self, uri): def timeout_to_walltime(self): td = datetime.timedelta(minutes=self.containerinfo.timeout) - hours = td.days * 7 + td.seconds//3600 + hours = td.days * 7 + td.seconds // 3600 if hours > 99: hours = 99 - minutes = (td.seconds - (td.seconds//3600)*3600) // 60 + minutes = (td.seconds - (td.seconds // 3600) * 3600) // 60 seconds = 0 return "{:02d}:{:02d}:{:02d}".format( hours, minutes, seconds ) + def ex( self, command, @@ -452,15 +453,16 @@ def ex( raise Exception("Container engine {} is invalid".format(self.containerinfo.engine)) def ex_singularity_pbs( - self, - command, - input_targets={}, - output_targets={}, - extra_params={}, - inputs_mode='ro', - outputs_mode='rw', - input_mount_point='/mnt/inputs', - output_mount_point='/mnt/outputs'): + self, + command, + input_targets={}, + output_targets={}, + extra_params={}, + inputs_mode='ro', + outputs_mode='rw', + input_mount_point='/mnt/inputs', + 
output_mount_point='/mnt/outputs' + ): """ Run command in the container using singularity on slurm, with mountpoints command is assumed to be in python template substitution format @@ -534,17 +536,18 @@ def ex_singularity_pbs( with tempfile.NamedTemporaryFile( mode='wt', dir=self.containerinfo.pbs_scriptpath, - delete=False) as script_h: - # Make executable, readable, and writable by owner - os.chmod( - script_h.name, - stat.S_IRUSR | - stat.S_IWUSR | - stat.S_IXUSR - ) - script_h.write("#!/bin/bash\n") - script_h.write(" ".join(command_list)) - script_h.close() + delete=False + ) as script_h: + # Make executable, readable, and writable by owner + os.chmod( + script_h.name, + stat.S_IRUSR | + stat.S_IWUSR | + stat.S_IXUSR + ) + script_h.write("#!/bin/bash\n") + script_h.write(" ".join(command_list)) + script_h.close() command_proc = subprocess.run( [ 'qsub', @@ -595,7 +598,7 @@ def ex_singularity_slurm( img_location = os.path.join( self.containerinfo.container_cache, "{}.singularity.img".format(self.make_fs_name(self.container)) - ) + ) log.info("Looking for singularity image {}".format(img_location)) if not os.path.exists(img_location): log.info("No image at {} Creating....".format(img_location)) @@ -630,12 +633,12 @@ def ex_singularity_slurm( command = Template(command).substitute(template_dict) log.info("Attempting to run {} in {}".format( - command, - self.container - )) + command, + self.container + )) command_list = [ - 'singularity', 'exec', '-e','--contain', '--scratch', '/scratch' + 'singularity', 'exec', '-e', '--contain', '--scratch', '/scratch' ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] @@ -660,7 +663,7 @@ def ex_singularity_slurm( '--mem={}M'.format(self.containerinfo.mem), '-t', str(self.containerinfo.timeout), '-p', self.containerinfo.slurm_partition, - ]+command_list, + ] + command_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) @@ -816,12 +819,12 @@ def ex_aws_batch( # and make a temp destination in s3 for k, target in schema_targets['targets'].items(): s3_temp_loc = os.path.join( - self.containerinfo.aws_s3_scratch_loc, - run_uuid, - scheme, - 'out', - schema_targets['relpaths'][k] - ) + self.containerinfo.aws_s3_scratch_loc, + run_uuid, + scheme, + 'out', + schema_targets['relpaths'][k] + ) # Add to UF for inside the container UF.add('{}::{}'.format( container_paths[k], From 613670ec9d20eb7aaeb36a3cbf7df265a867285e Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 29 Nov 2018 09:59:26 -0800 Subject: [PATCH 78/88] Attempt to terminate running aws jobs on workflow exit --- sciluigi/AWSBatchTaskWatcher.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sciluigi/AWSBatchTaskWatcher.py b/sciluigi/AWSBatchTaskWatcher.py index db0214f..6d58f21 100644 --- a/sciluigi/AWSBatchTaskWatcher.py +++ b/sciluigi/AWSBatchTaskWatcher.py @@ -27,6 +27,7 @@ def pollJobState(self): jID for jID, state in self.__jobStateDict__.items() if state not in self.COMPLETED_JOB_STATES ] + self.__active_job_ids__ = set(jobIDs_needing_update) if len(jobIDs_needing_update) > 0: self.__log__.debug("Polling AWS about {} jobs".format( len(jobIDs_needing_update)) @@ -88,6 +89,8 @@ def __init__( self.__batch_client__ = self.__session__.client( 'batch' ) + # Holder for active jobs + self.__active_job_ids__ = set() # Use the multiprocessing manager to create a job state dict # that can safely be shared among processes self.__manager__ = mp.Manager() @@ -97,5 +100,12 @@ def __init__( self.__jobStatePoller__.start() def 
__del__(self): + # Delete active jobs + for jobId in self.__active_job_ids__: + self.__batch_client__.terminate_job( + jobId=jobId, + reason='Workflow cancelled' + ) # Explicitly stop the polling process when this class is destroyed. self.__jobStatePoller__.terminate() + From 2010738ee35a821fe56106f46144e3bab0e72e94 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 29 Nov 2018 16:30:15 -0500 Subject: [PATCH 79/88] Added lock for singularity --- sciluigi/__init__.py | 4 +++ sciluigi/containertask.py | 65 ++++++++++++++++++++------------------- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/sciluigi/__init__.py b/sciluigi/__init__.py index eb7523d..76db16a 100755 --- a/sciluigi/__init__.py +++ b/sciluigi/__init__.py @@ -56,6 +56,10 @@ except: batch_task_watcher = None +import threading + +# Lock to ensure only one singularity image is created +singularity_lock = threading.Lock() def getBatchTaskWatcher(): global batch_task_watcher diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 246fa65..be7387a 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -474,39 +474,40 @@ def ex_singularity_pbs( outputs_mode, input_mount_point, output_mount_point) - - img_location = os.path.join( - self.containerinfo.container_cache, - "{}.singularity.simg".format(self.make_fs_name(self.container)) - ) - log.info("Looking for singularity image {}".format(img_location)) - if not os.path.exists(img_location): - log.info("No image at {} Creating....".format(img_location)) - try: - os.makedirs(os.path.dirname(img_location)) - except FileExistsError: - # No big deal - pass - # Singularity is dumb and can only pull images to the working dir - # So, get our current working dir. - cwd = os.getcwd() - # Move to our target dir - os.chdir(os.path.dirname(img_location)) - # Attempt to pull our image - pull_proc = subprocess.run( - [ - 'singularity', - 'pull', - '--name', - os.path.basename(img_location), - "docker://{}".format(self.container) - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE + # Use singularity_lock to ensure only one singularity image is created at a time + with sciluigi.singularity_lock: + img_location = os.path.join( + self.containerinfo.container_cache, + "{}.singularity.simg".format(self.make_fs_name(self.container)) ) - log.info(pull_proc) - # Move back - os.chdir(cwd) + log.info("Looking for singularity image {}".format(img_location)) + if not os.path.exists(img_location): + log.info("No image at {} Creating....".format(img_location)) + try: + os.makedirs(os.path.dirname(img_location)) + except FileExistsError: + # No big deal + pass + # Singularity is dumb and can only pull images to the working dir + # So, get our current working dir. 
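
Patch 78 has the watcher keep the set of job IDs it is still polling and terminate them when the watcher is destroyed, so cancelling a workflow does not leave AWS Batch jobs running. __del__ is a fragile hook (it may not run at all, and attributes can already be gone, which is what the hasattr guards added later in patch 84 deal with), so here is a sketch of the same cleanup wired to atexit instead; the job IDs and reason string are illustrative, and a configured AWS region and credentials are assumed:

import atexit
import boto3

active_job_ids = set()   # add IDs on submission, discard when a job reaches a terminal state

def terminate_outstanding_jobs():
    # Best-effort cleanup when the workflow process exits.
    batch = boto3.client('batch')
    for job_id in list(active_job_ids):
        try:
            batch.terminate_job(jobId=job_id, reason='Workflow cancelled')
        except Exception as err:
            print("Could not terminate {}: {}".format(job_id, err))

atexit.register(terminate_outstanding_jobs)
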
+ cwd = os.getcwd() + # Move to our target dir + os.chdir(os.path.dirname(img_location)) + # Attempt to pull our image + pull_proc = subprocess.run( + [ + 'singularity', + 'pull', + '--name', + os.path.basename(img_location), + "docker://{}".format(self.container) + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + log.info(pull_proc) + # Move back + os.chdir(cwd) template_dict = container_paths.copy() template_dict.update(extra_params) From 0c0cddaf2608040f195d9659eba5fbe12740efab Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 14 Feb 2019 10:53:16 -0800 Subject: [PATCH 80/88] Fixed a few bugs in the slurm engine --- sciluigi/containertask.py | 65 ++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index be7387a..500c5c6 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -596,38 +596,39 @@ def ex_singularity_slurm( input_mount_point, output_mount_point) - img_location = os.path.join( - self.containerinfo.container_cache, - "{}.singularity.img".format(self.make_fs_name(self.container)) - ) - log.info("Looking for singularity image {}".format(img_location)) - if not os.path.exists(img_location): - log.info("No image at {} Creating....".format(img_location)) - try: - os.makedirs(os.path.dirname(img_location)) - except FileExistsError: - # No big deal - pass - # Singularity is dumb and can only pull images to the working dir - # So, get our current working dir. - cwd = os.getcwd() - # Move to our target dir - os.chdir(os.path.dirname(img_location)) - # Attempt to pull our image - pull_proc = subprocess.run( - [ - 'singularity', - 'pull', - '--name', - os.path.basename(img_location), - "docker://{}".format(self.container) - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE + with sciluigi.singularity_lock: + img_location = os.path.join( + self.containerinfo.container_cache, + "{}.singularity.img".format(self.make_fs_name(self.container)) ) - log.info(pull_proc) - # Move back - os.chdir(cwd) + log.info("Looking for singularity image {}".format(img_location)) + if not os.path.exists(img_location): + log.info("No image at {} Creating....".format(img_location)) + try: + os.makedirs(os.path.dirname(img_location)) + except FileExistsError: + # No big deal + pass + # Singularity is dumb and can only pull images to the working dir + # So, get our current working dir. + cwd = os.getcwd() + # Move to our target dir + os.chdir(os.path.dirname(img_location)) + # Attempt to pull our image + pull_proc = subprocess.run( + [ + 'singularity', + 'pull', + '--name', + os.path.basename(img_location), + "docker://{}".format(self.container) + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + log.info(pull_proc) + # Move back + os.chdir(cwd) template_dict = container_paths.copy() template_dict.update(extra_params) @@ -659,7 +660,7 @@ def ex_singularity_slurm( else: command_proc = subprocess.run( [ - 'salloc', + 'srun', '-c', str(self.containerinfo.vcpu), '--mem={}M'.format(self.containerinfo.mem), '-t', str(self.containerinfo.timeout), From b9e4348c0569c585083d7c39cff2d221dfaf8690 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 15 Feb 2019 10:35:05 -0800 Subject: [PATCH 81/88] Fixed a few bugs in slurm. 
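
Patch 79 wraps the pull-if-missing step for the PBS engine in a module-level threading.Lock, and patch 80 does the same for the SLURM engine, so concurrent tasks that share a container image do not race to create the same cache file. A stripped-down sketch of that check-then-pull under a lock; the cache layout and name mangling here are simplified stand-ins for make_fs_name:

import os
import subprocess
import threading

singularity_lock = threading.Lock()

def ensure_singularity_image(container, cache_dir):
    # e.g. container 'golob/bwa:0.7.17' -> <cache_dir>/golob__bwa__0.7.17.simg
    img_name = container.replace('/', '__').replace(':', '__') + '.simg'
    img = os.path.join(cache_dir, img_name)
    with singularity_lock:                 # one pull at a time within this process
        if not os.path.exists(img):
            os.makedirs(cache_dir, exist_ok=True)
            cwd = os.getcwd()
            os.chdir(cache_dir)            # older singularity releases pull into the cwd
            try:
                subprocess.run(
                    ['singularity', 'pull', '--name', img_name,
                     'docker://{}'.format(container)],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            finally:
                os.chdir(cwd)
    return img

Note that a threading.Lock only serializes pulls within a single process; workers running in separate processes would still need something like a lock file on the shared cache to get the same guarantee.
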
Added container_working_dir --- sciluigi/containertask.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 500c5c6..226c240 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -43,6 +43,9 @@ class ContainerInfo(): timeout = None # Format is {'source_path': {'bind': '/container/path', 'mode': mode}} mounts = None + + # Location within the container for scratch work. Can be paired with a mount + container_working_dir = None # Local Container cache location. For things like singularity that need to pull # And create a local container container_cache = None @@ -56,6 +59,8 @@ class ContainerInfo(): aws_secrets_loc = None aws_boto_max_tries = None aws_batch_job_poll_sec = None + + # PBS STUFF pbs_account = None pbs_queue = None pbs_scriptpath = None @@ -81,6 +86,7 @@ def __init__(self, pbs_account='', pbs_queue='', pbs_scriptpath=None, + container_working_dir='/tmp/' ): self.engine = engine self.vcpu = vcpu @@ -193,6 +199,9 @@ def from_config( if config_values.get('pbs_scriptpath', "") != "": self.pbs_scriptpath = config_values['pbs_scriptpath'] + if config_values.get('container_working_dir', "") != "": + self.container_working_dir = config_values['container_working_dir'] + def __str__(self): """ Return string of this information @@ -639,8 +648,9 @@ def ex_singularity_slurm( self.container )) + working_dir = tempfile.mkdtemp() command_list = [ - 'singularity', 'exec', '-e', '--contain', '--scratch', '/scratch' + 'singularity', 'exec', '--contain', '-e', '--workdir', working_dir ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] From 0df2b08bf573d10905a9d09b562f0452083424af Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Sat, 16 Feb 2019 17:42:53 -0800 Subject: [PATCH 82/88] SLURM changes --- sciluigi/containertask.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 226c240..0fcba41 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -668,6 +668,36 @@ def ex_singularity_slurm( stderr=subprocess.PIPE, ) else: + """ + out_fn = os.path.join( + next(tempfile._get_candidate_names()) + ) + command_proc = subprocess.run( + [ + 'sbatch', + '-c', str(self.containerinfo.vcpu), + '--mem={}M'.format(self.containerinfo.mem), + '-t', str(self.containerinfo.timeout), + '-p', self.containerinfo.slurm_partition, + '--wait', + '--output={}'.format(out_fn) + ], + input="#!/bin/bash\n"+subprocess.list2cmdline(command_list)+"\n", + encoding='ascii' + ) + if command_proc.returncode == 0 and os.path.exists(out_fn): + log.info( + open(out_fn, 'rt').read() + ) + elif command_proc.returncode != 0 and os.path.exists(out_fn): + log.error( + open(out_fn, 'rt').read() + ) + try: + os.remove(out_fn) + except: + pass + """ command_proc = subprocess.run( [ 'srun', @@ -679,10 +709,10 @@ def ex_singularity_slurm( stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - - log.info(command_proc.stdout) - if command_proc.stderr: - log.warn(command_proc.stderr) + log.info(command_proc.stdout) + if command_proc.stderr: + log.warn(command_proc.stderr) + def ex_aws_batch( self, From dd6a7a740af313bc4229413035d65283e2eed8a6 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 19 Apr 2019 13:58:21 -0400 Subject: [PATCH 83/88] ugly fix for interface change in luigi --- sciluigi/interface.py | 5 ++++- 1 file changed, 4 insertions(+), 1 
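
Patch 80 also switches the SLURM engine from salloc to srun, and patch 82 leaves an sbatch --wait variant behind as a commented-out alternative. The srun path is simple: the already-assembled singularity command line is prefixed with the resource flags and run synchronously. A minimal sketch, with placeholder resource values and partition:

import subprocess

def run_under_slurm(command_list, vcpu=1, mem_mb=4096, timeout_min=60, partition='general'):
    # Prefix the container command argv with srun resource flags and block until it finishes.
    proc = subprocess.run(
        ['srun',
         '-c', str(vcpu),
         '--mem={}M'.format(mem_mb),
         '-t', str(timeout_min),
         '-p', partition,
         ] + command_list,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    return proc.returncode, proc.stdout, proc.stderr

# run_under_slurm(['singularity', 'exec', 'img.simg', 'echo', 'done'], vcpu=4, mem_mb=8192)
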
deletion(-) diff --git a/sciluigi/interface.py b/sciluigi/interface.py index 206f7c5..85a9813 100755 --- a/sciluigi/interface.py +++ b/sciluigi/interface.py @@ -43,7 +43,10 @@ def setup_logging(): luigi_logger.addHandler(luigi_file_handler) luigi_logger.addHandler(stream_handler) luigi_logger.setLevel(logging.WARN) - luigi.interface.setup_interface_logging.has_run = True + try: + luigi.interface.setup_interface_logging.has_run = True + except: + pass sciluigi_logger = logging.getLogger('sciluigi-interface') sciluigi_logger.addHandler(stream_handler) From 1c5d27abde6d84b4cb9f7c5c62c51dcf11bb9cc2 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Tue, 23 Apr 2019 10:12:38 -0400 Subject: [PATCH 84/88] Added sample config. Bumped version to 2.0.0 --- example-config/containerinfo.ini | 79 ++++++++++++++++++++++++++++++++ sciluigi/AWSBatchTaskWatcher.py | 14 +++--- sciluigi/interface.py | 2 +- setup.py | 2 +- 4 files changed, 89 insertions(+), 8 deletions(-) create mode 100644 example-config/containerinfo.ini diff --git a/example-config/containerinfo.ini b/example-config/containerinfo.ini new file mode 100644 index 0000000..af104f8 --- /dev/null +++ b/example-config/containerinfo.ini @@ -0,0 +1,79 @@ +# Sciluigi needs to know how to run your containers +# This configuration file helps specify the options needed +[DEFAULT] +# Which container engine to use. Options include: +# docker -> docker on the hosting machine +# aws_batch -> AWS batch +# pbs -> PBS / torque via qsub +# slurm -> slurm HPC management engine, via srun +engine = docker + +# How many vcpu to request (concurrent threads) +vcpu = 1 + +# Maximum memory, in MB +mem = 4096 + +# Time limit in minutes +timeout = 10080 + +container_working_dir = /tmp/ + +# Some engine specific options +# ** singularity (for slurm and pbs) ** +# where should we store our singularity containers. +# Should be some shared filesystem between nodes +container_cache = + +# ** slurm ** +# To which partition should we submit +slurm_partition = + +# ** PBS ** +# Under which account should jobs be submitted +pbs_account = +# to which queue? +pbs_queue = +# Path on shared filesystem between nodes +# To use to store scripts. 
+pbs_scriptpath = + + +# ** AWS batch ** +# The role ID needed for tasks to access S3 +aws_jobRoleArn = +# S3 bucket to use for temporary upload / download of files +aws_s3_scratch_loc = +# Which batch job queue should jobs be submitted +aws_batch_job_queue = +# Prefix to add to jobs (human readable) +aws_batch_job_prefix = +# How often should we poll batch (secs) +aws_batch_job_poll_sec = 10 +# Where can we find credentials (defaults to ~/.aws if not specified) +aws_secrets_loc = +# How many times to try submitting via boto before being killed +aws_boto_max_tries = 10 + +# Now specify some defaults for tasks with specific resource need types +# Overriding only the relevant options + +# High memory relative to number of CPU +[highmem] +mem = 120000 +vcpu = 8 + +# Mixed needs for moderate mulitthreaded tasks +[midcpu] +mem = 4096 +vcpu = 4 + +# Big cpu and memory +[heavy] +mem = 120000 +vcpu = 12 + +# Minimal CPU and memory needs (suitable for IO limited tasks) +[light] +vcpu = 2 +mem = 2048 diff --git a/sciluigi/AWSBatchTaskWatcher.py b/sciluigi/AWSBatchTaskWatcher.py index 6d58f21..b4734d5 100644 --- a/sciluigi/AWSBatchTaskWatcher.py +++ b/sciluigi/AWSBatchTaskWatcher.py @@ -101,11 +101,13 @@ def __init__( def __del__(self): # Delete active jobs - for jobId in self.__active_job_ids__: - self.__batch_client__.terminate_job( - jobId=jobId, - reason='Workflow cancelled' - ) + if hasattr(self, '__active_job_ids__') and self.__active_job_ids__ is not None: + for jobId in self.__active_job_ids__: + self.__batch_client__.terminate_job( + jobId=jobId, + reason='Workflow cancelled' + ) # Explicitly stop the polling process when this class is destroyed. - self.__jobStatePoller__.terminate() + if hasattr(self, '__jobStatePoller__'): + self.__jobStatePoller__.terminate() diff --git a/sciluigi/interface.py b/sciluigi/interface.py index 85a9813..3d1db01 100755 --- a/sciluigi/interface.py +++ b/sciluigi/interface.py @@ -51,7 +51,7 @@ def setup_logging(): sciluigi_logger = logging.getLogger('sciluigi-interface') sciluigi_logger.addHandler(stream_handler) sciluigi_logger.addHandler(sciluigi_file_handler) - sciluigi_logger.setLevel(logging.DEBUG) + sciluigi_logger.setLevel(logging.INFO) setup_logging() diff --git a/setup.py b/setup.py index 5cdce26..2ca1ca7 100755 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ setup( name='sciluigi', - version='0.9.6b7_ct', + version='2.0.0_ct', description='Helper library for writing dynamic, flexible workflows in luigi', long_description=long_description, author='Samuel Lampa', From b43a1d11c124c870e611b528e0405f64823950ed Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Fri, 26 Apr 2019 14:58:23 -0400 Subject: [PATCH 85/88] made easier defaults in config --- example-config/containerinfo.ini | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/example-config/containerinfo.ini b/example-config/containerinfo.ini index af104f8..390d281 100644 --- a/example-config/containerinfo.ini +++ b/example-config/containerinfo.ini @@ -61,7 +61,7 @@ aws_boto_max_tries = 10 # High memory relative to number of CPU [highmem] mem = 120000 -vcpu = 8 +vcpu = 1 # Mixed needs for moderate mulitthreaded tasks [midcpu] @@ -71,9 +71,9 @@ vcpu = 4 # Big cpu and memory [heavy] mem = 120000 -vcpu = 12 +;vcpu = 12 # Minimal CPU and memory needs (suitable for IO limited tasks) [light] -vcpu = 2 -mem = 2048 +vcpu = 1 +mem = 1024 From 79bb7669c32a6fc99ed74ce4f1bb4834cfea2d8d Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 2 May 2019 12:51:12 -0700 Subject: [PATCH 86/88] 
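
Patch 84 also adds example-config/containerinfo.ini, which keeps engine-agnostic defaults in [DEFAULT] and lets named profiles such as [highmem], [midcpu], [heavy] and [light] override only what differs; ContainerInfo.from_config reads one such section by name. The useful property, sketched below, is that configparser falls back to [DEFAULT] for any key a profile does not set (the file path matches the example added in the patch):

import configparser

config = configparser.ConfigParser()
config.read('example-config/containerinfo.ini')

section = config['highmem']                          # per-key fallback to [DEFAULT]
engine = section.get('engine', 'docker')             # inherited from [DEFAULT]
vcpu = section.getint('vcpu', fallback=1)            # overridden by the profile
mem = section.getint('mem', fallback=4096)           # overridden by the profile
timeout = section.getint('timeout', fallback=10080)  # minutes, inherited from [DEFAULT]

print(engine, vcpu, mem, timeout)
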
Adjustments to singularity settings. --- sciluigi/containertask.py | 28 ++++++++----- setup.py | 84 ++++++++++++++------------------------- 2 files changed, 48 insertions(+), 64 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 0fcba41..e639f46 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -528,9 +528,8 @@ def ex_singularity_pbs( self.container ) ) - working_dir = tempfile.mkdtemp() command_list = [ - 'singularity', 'exec', '--contain', '-e', '--workdir', working_dir + 'singularity', 'exec', '--contain', '-e', '--scratch', self.containerinfo.container_working_dir, ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] @@ -648,9 +647,8 @@ def ex_singularity_slurm( self.container )) - working_dir = tempfile.mkdtemp() command_list = [ - 'singularity', 'exec', '--contain', '-e', '--workdir', working_dir + 'singularity', 'exec', '--contain', '-e', '--scratch', self.containerinfo.container_working_dir, ] for mp in mounts: command_list += ['-B', "{}:{}:{}".format(mp, mounts[mp]['bind'], mounts[mp]['mode'])] @@ -712,7 +710,6 @@ def ex_singularity_slurm( log.info(command_proc.stdout) if command_proc.stderr: log.warn(command_proc.stderr) - def ex_aws_batch( self, @@ -722,8 +719,8 @@ def ex_aws_batch( extra_params={}, inputs_mode='ro', outputs_mode='rw', - input_mount_point='/mnt/inputs', - output_mount_point='/mnt/outputs'): + input_mount_point='/working/inputs', + output_mount_point='/working/outputs'): """ Run a command in a container using AWS batch. Handles uploading of files to / from s3 and then into the container. @@ -751,6 +748,17 @@ def ex_aws_batch( self.containerinfo.aws_batch_job_prefix, str(uuid.uuid4()) ) + # Get a task-specific working dir + input_container_path = os.path.join( + self.containerinfo.container_working_dir, + run_uuid, + 'inputs' + ) + output_container_path = os.path.join( + self.containerinfo.container_working_dir, + run_uuid, + 'outputs' + ) # We need mappings for both to and from S3 and from S3 to within the container # <-> <-> @@ -772,10 +780,10 @@ def ex_aws_batch( for schema, schema_targets in output_target_maps.items(): for k, relpath in schema_targets['relpaths'].items(): container_paths[k] = os.path.join( - output_mount_point, + output_container_path, schema, relpath - ) + ) # Inputs too # Group by schema input_target_maps = self.map_targets_to_container( @@ -785,7 +793,7 @@ def ex_aws_batch( for schema, schema_targets in input_target_maps.items(): for k, relpath in schema_targets['relpaths'].items(): container_paths[k] = os.path.join( - input_mount_point, + input_container_path, schema, relpath ) diff --git a/setup.py b/setup.py index 2ca1ca7..aec7412 100755 --- a/setup.py +++ b/setup.py @@ -1,54 +1,30 @@ -import os -import sys - -try: - from setuptools import setup -except: - from distutils.core import setup - -readme_note = '''\ -.. 
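
Patch 86 also reworks the AWS Batch path handling: instead of fixed /mnt mount points, each run gets its own inputs/ and outputs/ directories under the container working dir, keyed by the run UUID, and every output target is paired with a scratch S3 destination through a 'container_path::s3_path' upload descriptor. A rough sketch of that bookkeeping; the bucket, prefix and target names here are made up:

import os
import uuid

def plan_outputs(output_relpaths, working_dir='/tmp', s3_scratch='s3://example-bucket/scratch'):
    # output_relpaths: {identifier: path relative to the targets' common prefix}
    run_uuid = str(uuid.uuid4())
    out_dir = os.path.join(working_dir, run_uuid, 'outputs')
    container_paths = {}
    upload_descriptors = []    # "container_path::s3_path" pairs describing what to upload afterwards
    for key, relpath in output_relpaths.items():
        container_paths[key] = os.path.join(out_dir, relpath)
        s3_tmp = '/'.join([s3_scratch, run_uuid, 'out', relpath])
        upload_descriptors.append('{}::{}'.format(container_paths[key], s3_tmp))
    return container_paths, upload_descriptors

# plan_outputs({'hits': 'sample1/hits.csv'})
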
note:: - - For the latest source, issues and discussion, etc, please visit the - `GitHub repository `_\n\n -''' - -with open('README.rst') as fobj: - long_description = readme_note + fobj.read() - -setup( - name='sciluigi', - version='2.0.0_ct', - description='Helper library for writing dynamic, flexible workflows in luigi', - long_description=long_description, - author='Samuel Lampa', - author_email='samuel.lampa@farmbio.uu.se', - url='https://github.com/pharmbio/sciluigi', - license='MIT', - keywords='workflows workflow pipeline luigi', - packages=[ - 'sciluigi', - ], - install_requires=[ - 'luigi', - 'boto3', - 'mongo', - 'docker', - ], - classifiers=[ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.4', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Chemistry', - ], -) +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="geneshot", + version="0.0.2", + author="Jonathan Golob", + author_email="j-dev@golob.org", + description="A gene-level metagenomics pipeline", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/jgolob/geneshot", + packages=setuptools.find_packages(), + dependency_links=[ + 'https://github.com/jgolob/sciluigi/tarball/containertask', + ], + install_requires=[ + 'sciluigi==2.0.1' + ], + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + entry_points={ + 'console_scripts': ['geneshot=geneshot.geneshot:main'] + } +) From d83ed1da3582b9059bf3b46e898f3136d25ac3b8 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 2 May 2019 12:55:11 -0700 Subject: [PATCH 87/88] Bumped version to 2.0.1 --- setup.py | 84 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 30 deletions(-) diff --git a/setup.py b/setup.py index aec7412..9e54fbc 100755 --- a/setup.py +++ b/setup.py @@ -1,30 +1,54 @@ -import setuptools - -with open("README.md", "r") as fh: - long_description = fh.read() - -setuptools.setup( - name="geneshot", - version="0.0.2", - author="Jonathan Golob", - author_email="j-dev@golob.org", - description="A gene-level metagenomics pipeline", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/jgolob/geneshot", - packages=setuptools.find_packages(), - dependency_links=[ - 'https://github.com/jgolob/sciluigi/tarball/containertask', - ], - install_requires=[ - 'sciluigi==2.0.1' - ], - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - ], - entry_points={ - 'console_scripts': ['geneshot=geneshot.geneshot:main'] - } -) +import os +import sys + +try: + from setuptools import setup +except: + from distutils.core import setup + +readme_note = '''\ +.. 
note:: + + For the latest source, issues and discussion, etc, please visit the + `GitHub repository `_\n\n +''' + +with open('README.rst') as fobj: + long_description = readme_note + fobj.read() + +setup( + name='sciluigi', + version='2.0.1', + description='Helper library for writing dynamic, flexible workflows in luigi', + long_description=long_description, + author='Samuel Lampa', + author_email='samuel.lampa@farmbio.uu.se', + url='https://github.com/pharmbio/sciluigi', + license='MIT', + keywords='workflows workflow pipeline luigi', + packages=[ + 'sciluigi', + ], + install_requires=[ + 'luigi', + 'boto3', + 'mongo', + 'docker', + ], + classifiers=[ + 'Development Status :: 4 - Beta', + 'Environment :: Console', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.4', + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'Topic :: Scientific/Engineering :: Chemistry', + ], +) From 2631917eeab6b4765a071bd00a05b993bd61c090 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 2 May 2019 16:03:48 -0400 Subject: [PATCH 88/88] pre pull --- sciluigi/containertask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sciluigi/containertask.py b/sciluigi/containertask.py index 0fcba41..93764a6 100644 --- a/sciluigi/containertask.py +++ b/sciluigi/containertask.py @@ -567,10 +567,10 @@ def ex_singularity_pbs( '-A', self.containerinfo.pbs_account, '-q', self.containerinfo.pbs_queue, '-l', - 'nodes={}:ppn={},mem={}mb,walltime={}'.format( + 'nodes={}:ppn={},mem={}gb,walltime={}'.format( 1, self.containerinfo.vcpu, - int(self.containerinfo.mem), + int(self.containerinfo.mem / 1024), self.containerinfo.timeout * 60 ), script_h.name
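
The closing patch in the series tightens the PBS resource request: ContainerInfo stores memory in MB, so it is converted to whole gigabytes for qsub, and the timeout in minutes is handed over as a walltime in seconds. A small sketch of assembling that -l resource string, mirroring the patched call; the account, queue and surrounding qsub arguments are placeholders:

def pbs_resource_list(vcpu=1, mem_mb=4096, timeout_min=10080):
    # nodes/ppn, memory in whole GB, walltime in seconds, as requested by the patched qsub call
    return 'nodes={}:ppn={},mem={}gb,walltime={}'.format(
        1, vcpu, int(mem_mb / 1024), timeout_min * 60)

# pbs_resource_list()                -> 'nodes=1:ppn=1,mem=4gb,walltime=604800'
# pbs_resource_list(4, 120000, 2880) -> 'nodes=1:ppn=4,mem=117gb,walltime=172800'
# used roughly as: qsub -A <account> -q <queue> -l <resource_list> <script>
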