From 54751c1503791018f4f19a4920caf8a182ae6481 Mon Sep 17 00:00:00 2001 From: M Bernt Date: Mon, 7 Jan 2019 15:40:59 +0100 Subject: [PATCH 1/6] library_upload_dir: improvements major improvement: when using upload_from_galaxy_filesystem, the PATH needs to be given as an absolute path and the script generates the complete hierarchy in the Galaxy library (i.e. starting from the root). Now the script has an additional parameter `--root_folder ROOT` which allows uploading ROOT/PATH into the library folder PATH. minor improvements: - added `--library_name`, which allows specifying the library by name - fixed `--folder`, which now works as documented, i.e. the root folder of the library is used if not given - added options to preserve the directory structure and to tag data sets using file names - also started adding documentation --- scripts/api/library_upload_dir.py | 93 +++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 16 deletions(-) diff --git a/scripts/api/library_upload_dir.py b/scripts/api/library_upload_dir.py index 421f2d1b072c..3e8722736d64 100644 --- a/scripts/api/library_upload_dir.py +++ b/scripts/api/library_upload_dir.py @@ -4,20 +4,53 @@ import argparse import os import sys +import textwrap from bioblend import galaxy class Uploader(object): - def __init__(self, url, api, library_id, folder_id, should_link, - non_local): + def __init__(self, url, api, library_id, library_name, folder_id, + should_link, non_local, root_folder, dbkey, preserve_dirs, + tag_using_filenames): + """ + initialize uploader + + url Galaxy URL + api API key to use + library_id id of the library + library_name name of the library + folder_id id of the folder to upload to (None: root_folder) + should_link link data sets instead of uploading + non_local set to true iff not running on Galaxy head node + root_folder path from which files are to be uploaded + ie uploaded files are given relative to this path + dbkey data base key (aka genome build) + preserve_dirs preserve directory structure (boolean) + tag_using_filenames tag data sets using file name + """ self.gi = galaxy.GalaxyInstance(url=url, key=api) - self.library_id = library_id - self.folder_id = folder_id + libs = self.gi.libraries.get_libraries(library_id=library_id, + name=library_name) + if len(libs) == 0: + raise Exception("Unknown library [%s,%s]" % (library_id, library_name)) + elif len(libs) > 1: + raise Exception("Ambiguous library [%s,%s]" % (library_id, library_name)) + else: + libs = libs[0] + + self.library_id = libs['id'] + if folder_id: + self.folder_id = folder_id + else: + self.folder_id = libs['root_folder_id'] self.should_link = should_link self.non_local = non_local - + self.root_folder = root_folder + self.dbkey = dbkey + self.preserve_dirs = preserve_dirs + self.tag_using_filenames = tag_using_filenames self.memo_path = {} self.prepopulate_memo() @@ -34,6 +67,7 @@ def prepopulate_memo(self): contents of the data library, and then filter out things that are interesting to us based on a folder prefix.
""" + existing = self.gi.libraries.show_library(self.library_id, contents=True) uploading_to = [x for x in existing if x['id'] == self.folder_id] @@ -119,9 +153,9 @@ def upload(self): all_files = [x.strip() for x in list(sys.stdin.readlines())] for idx, path in enumerate(all_files): - (dirName, fname) = path.rsplit(os.path.sep, 1) - if not os.path.exists(os.path.join(dirName, fname)): + if not os.path.exists(os.path.join(self.root_folder, path)): continue + (dirName, fname) = path.rsplit(os.path.sep, 1) # Figure out what the memo key will be early basepath = self.rec_split(dirName) if len(basepath) == 0: @@ -137,31 +171,58 @@ def upload(self): if not already_uploaded: if self.non_local: self.gi.libraries.upload_file_from_local_path( - self.library_id, - os.path.join(dirName, fname), + library_id=self.library_id, + file_local_path=os.path.join(self.root_folder, path), folder_id=fid, + dbkey=self.dbkey ) else: self.gi.libraries.upload_from_galaxy_filesystem( - self.library_id, - os.path.join(dirName, fname), + library_id=self.library_id, + filesystem_paths=os.path.join(self.root_folder, path), folder_id=fid, + dbkey=self.dbkey, link_data_only='link_to_files' if self.should_link else 'copy_files', + preserve_dirs=self.preserve_dirs, + tag_using_filenames=self.tag_using_filenames ) if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Upload a directory into a data library') + parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent(''' + Upload files or directories given on stdin into a data library. + + If PATH/TO/FILE_OR_DIR is given on stdin the contents of + ROOT_FOLDER/PATH/TO/FILE_OR_DIR will be uploaded to PATH/TO/FILE_OR_DIR + in the specified folder and libary. If ROOT_FOLDER is empty, then + the path should be absolute. Data sets and folders will only be + created in the library if they are not present yet. If --preserve_dirs + is set then the structure in ROOT_FOLDER/PATH/TO/FILE_OR_DIR will be + preserved in the libary.''')) + parser.add_argument("-u", "--url", dest="url", required=True, help="Galaxy URL") parser.add_argument("-a", "--api", dest="api", required=True, help="API Key") - parser.add_argument("-l", "--lib", dest="library_id", required=True, help="Library ID") - parser.add_argument("-f", "--folder", dest="folder_id", help="Folder ID. If not specified, will go to root of library.") + libparser = parser.add_mutually_exclusive_group(required=True) + libparser.add_argument("-l", "--lib", dest="library_id", help="Library ID") + libparser.add_argument("-L", "--library_name", help="Library name") + parser.add_argument("-f", "--folder", dest="folder_id", + help="Folder ID. If not specified upload to root folder of the library.") + parser.add_argument("--root_folder", default="/", + help="files are relative to this dir") parser.add_argument("--nonlocal", dest="non_local", action="store_true", default=False, help="Set this flag if you are NOT running this script on your Galaxy head node with access to the full filesystem") - parser.add_argument("--link", dest="should_link", action="store_true", default=False, - help="Link datasets only, do not upload to Galaxy. 
ONLY Avaialble if you run 'locally' relative to your Galaxy head node/filesystem ") + localparser = parser.add_argument_group('Options for local upload', 'options that only apply if --nonlocal is not set') + localparser.add_argument("--link", dest="should_link", action="store_true", default=False, + help="Link datasets only, do not upload to Galaxy") + localparser.add_argument("--preserve_dirs", action="store_true", default=False, + help="Preserve directory structure") + localparser.add_argument("--tag_using_filenames", action="store_true", default=False, + help="Tag data sets with file name") + parser.add_argument("--dbkey", default="?", help="Genome build") + args = parser.parse_args() u = Uploader(**vars(args)) From 4b321a1db57bb1318360247d7b35b22dc627f066 Mon Sep 17 00:00:00 2001 From: M Bernt Date: Fri, 8 Feb 2019 12:24:12 +0100 Subject: [PATCH 2/6] cont --- scripts/api/library_upload_dir.py | 59 +++++++++++++++++++------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/scripts/api/library_upload_dir.py b/scripts/api/library_upload_dir.py index 3e8722736d64..0cff2066021d 100644 --- a/scripts/api/library_upload_dir.py +++ b/scripts/api/library_upload_dir.py @@ -11,9 +11,9 @@ class Uploader(object): - def __init__(self, url, api, library_id, library_name, folder_id, - should_link, non_local, root_folder, dbkey, preserve_dirs, - tag_using_filenames): + def __init__(self, url, api, library_id, library_name, folder_id, folder_name, + should_link, non_local, dbkey, preserve_dirs, + tag_using_filenames, description, paths): """ initialize uploader @@ -21,14 +21,15 @@ def __init__(self, url, api, library_id, library_name, folder_id, api API key to use library_id id of the library library_name name of the library - folder_id id of the folder to upload to (None: root_folder) + folder_id id of the folder to upload to (None: root folder of the library) + folder_name path in the Galaxy library where the files should be uploaded to should_link link data sets instead of uploading non_local set to true iff not running on Galaxy head node - root_folder path from which files are to be uploaded - ie uploaded files are given relative to this path dbkey data base key (aka genome build) preserve_dirs preserve directory structure (boolean) tag_using_filenames tag data sets using file name + description description of the topmost of the created folders + paths list of paths to upload if empty will be read from stdin """ self.gi = galaxy.GalaxyInstance(url=url, key=api) libs = self.gi.libraries.get_libraries(library_id=library_id, @@ -39,20 +40,28 @@ def __init__(self, url, api, library_id, library_name, folder_id, raise Exception("Ambiguous library [%s,%s]" % (library_id, library_name)) else: libs = libs[0] - self.library_id = libs['id'] - if folder_id: - self.folder_id = folder_id - else: - self.folder_id = libs['root_folder_id'] + + # set folder_id to root folder + # - will be overwritten if folder_id or folder_name != None + # - needs to be done after prepopulate_memo + self.folder_id = libs['root_folder_id'] self.should_link = should_link self.non_local = non_local self.root_folder = root_folder self.dbkey = dbkey self.preserve_dirs = preserve_dirs self.tag_using_filenames = tag_using_filenames + self.description = description + self.paths = paths + # name -> id map for all folders 'below' folder_id self.memo_path = {} self.prepopulate_memo() + # now really init the folder_id if folder_id or folder_name != None + if folder_id: + self.folder_id = folder_id + elif folder_name: + 
self.folder_id = self.memoized_path(self.rec_split(folder_name), base_folder=None) def prepopulate_memo(self): """ @@ -118,14 +127,14 @@ def memoized_path(self, path_parts, base_folder=None): # Recursively create the path from our base_folder starting points, # getting the IDs of each folder per path component - ids = self.recursively_build_path(path_parts, base_folder) + ids = self.recursively_build_path(path_parts, base_folder, description=self.description) # These are then associated with the paths. for (key, fid) in zip(nfk, ids): self.memo_path[key] = fid return ids[-1] - def recursively_build_path(self, path_parts, parent_folder_id, ids=None): + def recursively_build_path(self, path_parts, parent_folder_id, ids=None, description=""): """Given an iterable of path components and a parent folder id, recursively create directories below parent_folder_id""" if ids is None: @@ -133,7 +142,7 @@ def recursively_build_path(self, path_parts, parent_folder_id, ids=None): if len(path_parts) == 0: return ids else: - pf = self.gi.libraries.create_folder(self.library_id, path_parts[0], base_folder_id=parent_folder_id) + pf = self.gi.libraries.create_folder(self.library_id, path_parts[0], base_folder_id=parent_folder_id, description=description) ids.append(pf[0]['id']) return self.recursively_build_path(path_parts[1:], pf[0]['id'], ids=ids) @@ -150,10 +159,14 @@ def rec_split(self, s): return self.rec_split(rest) + (tail,) def upload(self): - all_files = [x.strip() for x in list(sys.stdin.readlines())] + if len(self.paths) == 0: + all_files = [x.strip() for x in list(sys.stdin.readlines())] + else: + all_files = self.paths for idx, path in enumerate(all_files): - if not os.path.exists(os.path.join(self.root_folder, path)): + if not os.path.exists(path): + raise Exception("no such file or directory %s" %(path)) continue (dirName, fname) = path.rsplit(os.path.sep, 1) # Figure out what the memo key will be early @@ -172,14 +185,14 @@ def upload(self): if self.non_local: self.gi.libraries.upload_file_from_local_path( library_id=self.library_id, - file_local_path=os.path.join(self.root_folder, path), + file_local_path=path, folder_id=fid, dbkey=self.dbkey ) else: self.gi.libraries.upload_from_galaxy_filesystem( library_id=self.library_id, - filesystem_paths=os.path.join(self.root_folder, path), + filesystem_paths=path, folder_id=fid, dbkey=self.dbkey, link_data_only='link_to_files' if self.should_link else 'copy_files', @@ -191,7 +204,7 @@ def upload(self): if __name__ == '__main__': parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent(''' - Upload files or directories given on stdin into a data library. + Upload files or directories into a data library. If PATH/TO/FILE_OR_DIR is given on stdin the contents of ROOT_FOLDER/PATH/TO/FILE_OR_DIR will be uploaded to PATH/TO/FILE_OR_DIR @@ -208,9 +221,8 @@ def upload(self): libparser.add_argument("-l", "--lib", dest="library_id", help="Library ID") libparser.add_argument("-L", "--library_name", help="Library name") parser.add_argument("-f", "--folder", dest="folder_id", - help="Folder ID. If not specified upload to root folder of the library.") - parser.add_argument("--root_folder", default="/", - help="files are relative to this dir") + help="Folder ID. 
If not specified upload to root folder of the library or the folder specified with --folder_name.") + parser.add_argument("-F", "--folder_name", help="Folder to upload to, can be a path") parser.add_argument("--nonlocal", dest="non_local", action="store_true", default=False, help="Set this flag if you are NOT running this script on your Galaxy head node with access to the full filesystem") @@ -222,7 +234,8 @@ def upload(self): localparser.add_argument("--tag_using_filenames", action="store_true", default=False, help="Tag data sets with file name") parser.add_argument("--dbkey", default="?", help="Genome build") - + parser.add_argument("--description", default="", help="description for the topmost of the created folders") + parser.add_argument('paths', nargs='*', help="path(s) to upload, will be read from stdin if not given") args = parser.parse_args() u = Uploader(**vars(args)) From f0c510317699ef43d083b121b8083ffc4d7c7a74 Mon Sep 17 00:00:00 2001 From: Martin Cech Date: Wed, 13 Feb 2019 15:49:29 -0500 Subject: [PATCH 3/6] remove trailing whitespace --- scripts/api/library_upload_dir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/api/library_upload_dir.py b/scripts/api/library_upload_dir.py index 3e8722736d64..b4b1a069b393 100644 --- a/scripts/api/library_upload_dir.py +++ b/scripts/api/library_upload_dir.py @@ -22,7 +22,7 @@ def __init__(self, url, api, library_id, library_name, folder_id, library_id id of the library library_name name of the library folder_id id of the folder to upload to (None: root_folder) - should_link link data sets instead of uploading + should_link link data sets instead of uploading non_local set to true iff not running on Galaxy head node root_folder path from which files are to be uploaded ie uploaded files are given relative to this path From 3aae1c4786c4275cc200b0bfa1b127a2f639ded5 Mon Sep 17 00:00:00 2001 From: M Bernt Date: Thu, 28 Mar 2019 12:44:56 +0100 Subject: [PATCH 4/6] cont --- scripts/api/library_upload_dir.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/api/library_upload_dir.py b/scripts/api/library_upload_dir.py index 0cff2066021d..cd7d25f44f6c 100644 --- a/scripts/api/library_upload_dir.py +++ b/scripts/api/library_upload_dir.py @@ -204,15 +204,15 @@ def upload(self): if __name__ == '__main__': parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent(''' - Upload files or directories into a data library. + Takes one or more paths given as positional arguments (or via stdin) and uploads + the files or directories contained therein into a data library. - If PATH/TO/FILE_OR_DIR is given on stdin the contents of - ROOT_FOLDER/PATH/TO/FILE_OR_DIR will be uploaded to PATH/TO/FILE_OR_DIR - in the specified folder and libary. If ROOT_FOLDER is empty, then - the path should be absolute. Data sets and folders will only be - created in the library if they are not present yet. If --preserve_dirs - is set then the structure in ROOT_FOLDER/PATH/TO/FILE_OR_DIR will be - preserved in the libary. + The contents of path (PATH/TO/FILE_OR_DIR) will be uploaded to + PATH/TO/FILE_OR_DIR in the specified folder (-f/-F) and library (-l/-L). Note that + the path can be given relative to the current working directory. Data sets and + folders will only be created in the library if they are not present yet.
If + --preserve_dirs is set then the structure in PATH/TO/FILE_OR_DIR will be preserved + in the library.''')) parser.add_argument("-u", "--url", dest="url", required=True, help="Galaxy URL") parser.add_argument("-a", "--api", dest="api", required=True, help="API Key") @@ -220,9 +220,10 @@ def upload(self): libparser = parser.add_mutually_exclusive_group(required=True) libparser.add_argument("-l", "--lib", dest="library_id", help="Library ID") libparser.add_argument("-L", "--library_name", help="Library name") - parser.add_argument("-f", "--folder", dest="folder_id", + folderparser = parser.add_mutually_exclusive_group(required=False) + folderparser.add_argument("-f", "--folder", dest="folder_id", help="Folder ID. If not specified upload to root folder of the library or the folder specified with --folder_name.") - parser.add_argument("-F", "--folder_name", help="Folder to upload to, can be a path") + folderparser.add_argument("-F", "--folder_name", help="Folder to upload to, can be a path") parser.add_argument("--nonlocal", dest="non_local", action="store_true", default=False, help="Set this flag if you are NOT running this script on your Galaxy head node with access to the full filesystem") From 8bae49a860a50d0fcdc17d13da9475746099bdd1 Mon Sep 17 00:00:00 2001 From: M Bernt Date: Thu, 28 Mar 2019 20:17:32 +0100 Subject: [PATCH 5/6] workaround for bioblend also returning deleted libraries --- scripts/api/library_upload_dir.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/api/library_upload_dir.py b/scripts/api/library_upload_dir.py index cd7d25f44f6c..4608027cc5e5 100644 --- a/scripts/api/library_upload_dir.py +++ b/scripts/api/library_upload_dir.py @@ -34,10 +34,17 @@ def __init__(self, url, api, library_id, library_name, folder_id, folder_name, self.gi = galaxy.GalaxyInstance(url=url, key=api) libs = self.gi.libraries.get_libraries(library_id=library_id, name=library_name) + + # TODO libs also contains deleted libraries even though it should not + # https://github.com/galaxyproject/bioblend/issues/239 fixed by + # https://github.com/galaxyproject/bioblend/pull/273 + # remove if bioblend>0.12 is released + libs = [d for d in libs if not d['deleted']] + if len(libs) == 0: raise Exception("Unknown library [%s,%s]" % (library_id, library_name)) elif len(libs) > 1: - raise Exception("Ambiguous library [%s,%s]" % (library_id, library_name)) + raise Exception("Ambiguous libraries for [%s,%s]: %s" % (library_id, library_name, libs)) else: libs = libs[0] self.library_id = libs['id'] @@ -48,7 +55,6 @@ def __init__(self, url, api, library_id, library_name, folder_id, folder_name, self.folder_id = libs['root_folder_id'] self.should_link = should_link self.non_local = non_local - self.root_folder = root_folder self.dbkey = dbkey self.preserve_dirs = preserve_dirs self.tag_using_filenames = tag_using_filenames From 938c32c902a69c6806929746ba19322c3a9d1e6b Mon Sep 17 00:00:00 2001 From: M Bernt Date: Fri, 29 Mar 2019 17:32:24 +0100 Subject: [PATCH 6/6] cont --- scripts/api/library_upload_dir.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/api/library_upload_dir.py b/scripts/api/library_upload_dir.py index 4608027cc5e5..3c3356ffa20f 100644 --- a/scripts/api/library_upload_dir.py +++ b/scripts/api/library_upload_dir.py @@ -82,14 +82,16 @@ def prepopulate_memo(self): contents of the data library, and then filter out things that are interesting to us based on a folder prefix.
""" - + print("###prepopulate_memo") existing = self.gi.libraries.show_library(self.library_id, contents=True) + print("existing %s"%existing) uploading_to = [x for x in existing if x['id'] == self.folder_id] if len(uploading_to) == 0: raise Exception("Unknown folder [%s] in library [%s]" % (self.folder_id, self.library_id)) else: + print("uploading_to %s"%(uploading_to)) uploading_to = uploading_to[0] for x in existing: @@ -101,6 +103,7 @@ def prepopulate_memo(self): if name_part.startswith('/'): name_part = name_part[1:] self.memo_path[name_part] = x['id'] + print("self.memo_path"%(self.memo_path)) def memoized_path(self, path_parts, base_folder=None): """Get the folder ID for a given folder path specified by path_parts. @@ -111,6 +114,7 @@ def memoized_path(self, path_parts, base_folder=None): previously existing paths and will not respect those. TODO: handle existing paths. """ + print("###memoized_path") if base_folder is None: base_folder = self.folder_id dropped_prefix = [] @@ -185,8 +189,8 @@ def upload(self): # So that we can check if it really needs to be uploaded. already_uploaded = memo_key in self.memo_path.keys() fid = self.memoized_path(basepath, base_folder=self.folder_id) + print("%s %s"%( memo_key, self.memo_path)) print('[%s/%s] %s/%s uploaded=%s' % (idx + 1, len(all_files), fid, fname, already_uploaded)) - if not already_uploaded: if self.non_local: self.gi.libraries.upload_file_from_local_path( @@ -196,6 +200,7 @@ def upload(self): dbkey=self.dbkey ) else: + print("libid %s\npath %s\nfolderid %s\nlink %s\npreserve %s\ntag %s"%(self.library_id,path,fid,self.dbkey,'link_to_files' if self.should_link else 'copy_files',self.preserve_dirs,self.tag_using_filenames)) self.gi.libraries.upload_from_galaxy_filesystem( library_id=self.library_id, filesystem_paths=path,