From d7067bb1082e41bbcf61b542428b21bd41a47264 Mon Sep 17 00:00:00 2001 From: bebatut Date: Tue, 31 Oct 2023 10:51:42 +0100 Subject: [PATCH 01/20] Split script into 2 commands: 1 to extract, 1 to filter tools --- bin/extract_galaxy_tools.py | 122 +++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 29991cb2..71c55f37 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -17,6 +17,7 @@ # BIOTOOLS_API_URL = "https://bio.tools" BIOTOOLS_API_URL = "https://130.226.25.21" + def read_file(filepath): ''' Read an optional file with 1 element per line @@ -101,6 +102,7 @@ def get_biotools(el): return xref.text return None + def get_conda_package(el): ''' Get conda package information @@ -132,28 +134,25 @@ def check_categories(ts_categories, ts_cat): :param ts_cat: list of ToolShed categories to keep in the extraction ''' if ts_categories is not None and len(ts_cat) > 0: + ts_cats = ts_categories.split(', ') to_keep = False - for cat in ts_categories: + for cat in ts_cats: if cat in ts_cat: to_keep = True return to_keep return True -def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools): +def get_tool_metadata(tool, repo): ''' Get tool information - Check the `.shed.yaml` file - Extract metadata from the `.shed.yaml` - - Filter for specific ToolShed categories - Extract the requirements in the macros or xml file to get tool version supported in Galaxy - Extract bio.tools information if available in the macros or xml :param tool: GitHub ContentFile object :param repo: GitHub Repository object - :param ts_cat: list of ToolShed categories to keep in the extraction - :param excluded_tools: list of tools to skip - :param keep_tools: list of tools to keep ''' if tool.type != 'dir': return None @@ -176,13 +175,7 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools): 'bio.tool id': None, 'Conda id': None, 'Conda 
version': None, - 'Reviewed': tool.name in keep_tools or tool.name in excluded_tools, - 'To keep':'' } - if tool.name in keep_tools: - metadata['To keep'] = True - elif tool.name in excluded_tools: - metadata['To keep'] = False # extract .shed.yml information and check macros.xml try: shed = repo.get_contents(f"{tool.path}/.shed.yml") @@ -203,10 +196,7 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools): metadata['Source'] = yaml_content['homepage_url'] metadata['ToolShed categories'] = get_shed_attribute('categories', yaml_content, []) if metadata['ToolShed categories'] is None: - metadata['ToolShed categories'] = [] - # filter ToolShed categories and leave function if not in expected categories - if not check_categories(metadata['ToolShed categories'], ts_cat): - return None + metadata['ToolShed categories'] = [] # find and parse macro file for file in repo.get_contents(tool.path): if 'macro' in file.name and file.name.endswith('xml'): @@ -221,7 +211,6 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools): biotools = get_biotools(child) if biotools is not None: metadata['bio.tool id'] = biotools - # parse XML file and get meta data from there, also tool ids for file in repo.get_contents(tool.path): if file.name.endswith('xml') and 'macro' not in file.name: @@ -256,7 +245,6 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools): # tool ids if 'id' in root.attrib: metadata['Galaxy tool ids'].append(root.attrib['id']) - # get latest conda version and compare to the wrapper version if metadata["Conda id"] is not None: r = requests.get(f'https://api.anaconda.org/package/bioconda/{metadata["Conda id"]}') @@ -286,14 +274,11 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools): return metadata -def parse_tools(repo, ts_cat=[], excluded_tools=[], keep_tools=[]): +def parse_tools(repo): ''' - Parse tools in a GitHub repository to expact + Parse tools in a GitHub repository, extract them and their 
metadata :param repo: GitHub Repository object - :param ts_cat: list of ToolShed categories to keep in the extraction - :param excluded_tools: list of tools to skip - :param keep_tools: list of tools to keep ''' # get tool folders tool_folders = [] @@ -328,11 +313,11 @@ def parse_tools(repo, ts_cat=[], excluded_tools=[], keep_tools=[]): if tool.type != 'dir': continue for content in repo.get_contents(tool.path): - metadata = get_tool_metadata(content, repo, ts_cat, excluded_tools, keep_tools) + metadata = get_tool_metadata(content, repo) if metadata is not None: tools.append(metadata) else: - metadata = get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools) + metadata = get_tool_metadata(tool, repo) if metadata is not None: tools.append(metadata) return tools @@ -353,32 +338,67 @@ def export_tools(tools, output_fp): df.to_csv(output_fp, sep="\t", index=False) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Extract a GitHub project to CSV') - parser.add_argument('--api', '-a', required=True, help="GitHub access token") - parser.add_argument('--output', '-o', required=True, help="Output filepath") - parser.add_argument('--categories', '-c', help="Path to a file with ToolShed category to keep in the extraction (one per line)") - parser.add_argument('--excluded', '-e', help="Path to a file with ToolShed ids of tools to exclude (one per line)") - parser.add_argument('--keep', '-ek', help="Path to a file with ToolShed ids of tools to keep (one per line)") - args = parser.parse_args() +def filter_tools(tools, ts_cat, excluded_tools, keep_tools): + ''' + Filter tools for specific ToolShed categories and add information if to keep or to exclude - # connect to GitHub - g = Github(args.api) - # get list of GitHub repositories to parse - repo_list = get_tool_github_repositories(g) + :param tools: dictionary with tools and their metadata + :param ts_cat: list of ToolShed categories to keep in the extraction + :param excluded_tools: list of 
tools to skip + :param keep_tools: list of tools to keep + ''' + filtered_tools = [] + for tool in tools: + # filter ToolShed categories and leave function if not in expected categories + if check_categories(tool['ToolShed categories'], ts_cat): + name = tool['Galaxy wrapper id'] + tool['Reviewed'] = tool.name in keep_tools or tool.name in excluded_tools + tool['To keep'] = None + if name in keep_tools: + tool['To keep'] = True + elif name in excluded_tools: + tool['To keep'] = False + filtered_tools.append(tool) - # get categories and tools to exclude - categories = read_file(args.categories) - excl_tools = read_file(args.excluded) - keep_tools = read_file(args.keep) - # parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file - tools = [] - for r in repo_list: - print(r) - if "github" not in r: - continue - repo = get_github_repo(r, g) - tools += parse_tools(repo, categories, excl_tools, keep_tools) - export_tools(tools, args.output) - print() +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Extract Galaxy tools from GitHub repositories together with biotools and conda metadata') + subparser = parser.add_subparsers(dest='command') + # Extract tools + extractools = subparser.add_parser('extractools', help="Extract tools") + extractools.add_argument('--api', '-a', required=True, help="GitHub access token") + extractools.add_argument('--all_tools', '-o', required=True, help="Filepath to CSV with all extracted tools") + # Filter tools + filtertools = subparser.add_parser('filtertools', help="Filter tools") + filtertools.add_argument('--tools', '-t', required=True, help="Filepath to CSV with all extracted tools, generated by extractools command") + filtertools.add_argument('--filtered_tools', '-f', required=True, help="Filepath to CSV with filtered tools") + filtertools.add_argument('--categories', '-c', help="Path to a file with ToolShed category to keep in the extraction (one per line)") + 
filtertools.add_argument('--exclude', '-e', help="Path to a file with ToolShed ids of tools to exclude (one per line)") + filtertools.add_argument('--keep', '-k', help="Path to a file with ToolShed ids of tools to keep (one per line)") + args = parser.parse_args() + + if args.command == 'extractools': + # connect to GitHub + g = Github(args.api) + # get list of GitHub repositories to parse + repo_list = get_tool_github_repositories(g) + # parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file + tools = [] + for r in repo_list: + print(r) + if "github" not in r: + continue + repo = get_github_repo(r, g) + tools += parse_tools(repo) + export_tools(tools, args.all_tools) + print() + elif args.command == 'filtertools': + tools = pd.read_csv(Path(args.tools)).to_dict('records') + # get categories and tools to exclude + categories = read_file(args.categories) + excl_tools = read_file(args.exclude) + keep_tools = read_file(args.keep) + # filter tool lists + filtered_tools = filter_tools(tools, categories, excl_tools, keep_tools) + export_tools(filtered_tools, args.filtered_tools) + From 8ec29ec00d82b4ff24e05f32b33647ae0842176a Mon Sep 17 00:00:00 2001 From: bebatut Date: Tue, 31 Oct 2023 10:52:21 +0100 Subject: [PATCH 02/20] Update README and bash scripts --- README.md | 63 ++++++++++++++++++-------------- bin/extract_all_tools.sh | 6 +++ bin/extract_microgalaxy_tools.sh | 16 ++++++-- 3 files changed, 54 insertions(+), 31 deletions(-) create mode 100644 bin/extract_all_tools.sh diff --git a/README.md b/README.md index a4670193..33ae4c0e 100644 --- a/README.md +++ b/README.md @@ -38,40 +38,22 @@ Galaxy Tool extractor $ python3 -m pip install -r requirements.txt ``` -# Extract tools for categories in the ToolShed +## Extract all tools 1. Get an API key ([personal token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens)) for GitHub -2. 
(Optional) Create a text file with ToolShed categories for which tools need to be extracted: 1 ToolShed category per row ([example for microbial data analysis](data/microgalaxy/categories)) -3. (Optional) Create a text file with list of tools to exclude: 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_exclude)) -4. (Optional) Create a text file with list of tools to really keep (already reviewed): 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_keep)) -4. Run the tool extractor script +2. Export the GitHub API key as an environment variable: ``` - $ python bin/extract_galaxy_tools.py \ - --api \ - --output \ - [--categories ] \ - [--excluded ]\ - [--keep ] + $ export GITHUB_API_KEY= ``` - For microGalaxy, a Bash script in `bin` can used by: - 1. Exporting the GitHub API key as an environment variable: - - ``` - $ export GITHUB_API_KEY= - ``` - - 2. Running the script - - ``` - $ bash bin/extract_microgalaxy_tools.sh - ``` - - It will take the files in the `data/microgalaxy` folder and export the tools into `microgalaxy_tools.csv` +3. Run the script + ``` + $ python bin/extract_all_tools.sh + ``` -The script will generate a CSV file with each tool found in the list of GitHub repository and several information for these tools: +The script will generate a CSV file with each tool found in the list of GitHub repositories and metadata for these tools: 1. Galaxy wrapper id 2. Description @@ -89,5 +71,30 @@ The script will generate a CSV file with each tool found in the list of GitHub r 14. Galaxy wrapper version 15. Conda id 16. Conda version -17. Reviewed -18. To keep \ No newline at end of file + +## Filter tools based on their categories in the ToolShed + +1. Run the extraction as explained before +2. (Optional) Create a text file with ToolShed categories for which tools need to be extracted: 1 ToolShed category per row ([example for microbial data analysis](data/microgalaxy/categories)) +3. 
(Optional) Create a text file with list of tools to exclude: 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_exclude)) +4. (Optional) Create a text file with list of tools to really keep (already reviewed): 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_keep)) +4. Run the tool extractor script + + ``` + $ python bin/extract_galaxy_tools.py \ + --tools \ + --filtered_tools \ + [--categories ] \ + [--excluded ]\ + [--keep ] + ``` + +### Filter tools for microbial data analysis + +For microGalaxy, a Bash script in `bin` can used by running the script + +``` +$ bash bin/extract_microgalaxy_tools.sh +``` + +It will take the files in the `data/microgalaxy` folder and export the tools into `microgalaxy_tools.csv` diff --git a/bin/extract_all_tools.sh b/bin/extract_all_tools.sh new file mode 100644 index 00000000..f68a00c1 --- /dev/null +++ b/bin/extract_all_tools.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +python bin/extract_galaxy_tools.py \ + extractools \ + --api $GITHUB_API_KEY \ + --all_tools 'results/all_tools.csv' \ No newline at end of file diff --git a/bin/extract_microgalaxy_tools.sh b/bin/extract_microgalaxy_tools.sh index fcc15e5a..12bf25ce 100644 --- a/bin/extract_microgalaxy_tools.sh +++ b/bin/extract_microgalaxy_tools.sh @@ -1,8 +1,18 @@ #!/usr/bin/env bash +curl \ + -L \ + "https://docs.google.com/spreadsheets/d/1Nq_g-CPc8t_eC4M1NAS9XFJDflA7yE3b9hfSg3zu9L4/export?format=tsv&gid=1533244711" \ + -o "data/microgalaxy/tools_to_keep" + +curl \ + -L \ + "https://docs.google.com/spreadsheets/d/1Nq_g-CPc8t_eC4M1NAS9XFJDflA7yE3b9hfSg3zu9L4/export?format=tsv&gid=672552331" \ + -o "data/microgalaxy/tools_to_exclude" + python bin/extract_galaxy_tools.py \ - --api $GITHUB_API_KEY \ - --output microgalaxy_tools.csv \ + filtertools \ + --tools 'results/all_tools.csv' \ --categories "data/microgalaxy/categories" \ - --excluded "data/microgalaxy/tools_to_exclude" \ + --exclude 
"data/microgalaxy/tools_to_exclude" \ --keep "data/microgalaxy/tools_to_keep" \ No newline at end of file From ccc2063f12d4f48d33d319e3e04c87b3b2cbd968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 12:58:53 +0100 Subject: [PATCH 03/20] Fix a function call and verify to False for bio.tools API --- bin/extract_galaxy_tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 09d4d44f..e28449ac 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -258,7 +258,7 @@ def get_tool_metadata(tool, repo): metadata["Status"] = "Up-to-date" # get bio.tool information if metadata["bio.tool id"] is not None: - r = requests.get(f'{BIOTOOLS_API_URL}/api/tool/{metadata["bio.tool id"]}/?format=json') + r = requests.get(f'{BIOTOOLS_API_URL}/api/tool/{metadata["bio.tool id"]}/?format=json', verify=False) if r.status_code == requests.codes.ok: biotool_info = r.json() if "function" in biotool_info: @@ -319,7 +319,7 @@ def parse_tools(repo): file_list = repo.get_contents(tool.path) assert isinstance(file_list, list) for content in file_list: - metadata = get_tool_metadata(content, repo, ts_cat, excluded_tools, keep_tools) + metadata = get_tool_metadata(content, repo) if metadata is not None: tools.append(metadata) else: From 34d9057134d7950bf0c6024d735ffaf69ee32d47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 13:08:38 +0100 Subject: [PATCH 04/20] Fix linting --- bin/extract_galaxy_tools.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index e28449ac..e0656f5c 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -135,7 +135,7 @@ def check_categories(ts_categories, ts_cat): :param ts_cat: list of ToolShed categories to keep in the extraction """ if 
ts_categories is not None and len(ts_cat) > 0: - ts_cats = ts_categories.split(', ') + ts_cats = ts_categories.split(", ") to_keep = False for cat in ts_cats: if cat in ts_cat: @@ -145,12 +145,10 @@ def check_categories(ts_categories, ts_cat): def get_tool_metadata(tool, repo): - """" - Get tool information - - Check the `.shed.yaml` file - - Extract metadata from the `.shed.yaml` - - Extract the requirements in the macros or xml file to get tool version supported in Galaxy - - Extract bio.tools information if available in the macros or xml + """" " + Get tool metadata from the .shed.yaml, requirements in the macros or xml + file, bio.tools information if available in the macros or xml, EDAM + annotations using bio.tools API, recent conda version using conda API :param tool: GitHub ContentFile object :param repo: GitHub Repository object @@ -174,7 +172,7 @@ def get_tool_metadata(tool, repo): "Galaxy wrapper source": None, "Galaxy wrapper version": None, "Conda id": None, - "Conda version": None + "Conda version": None, } # extract .shed.yml information and check macros.xml try: @@ -368,7 +366,9 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Extract Galaxy tools from GitHub repositories together with biotools and conda metadata") + parser = argparse.ArgumentParser( + description="Extract Galaxy tools from GitHub repositories together with biotools and conda metadata" + ) subparser = parser.add_subparsers(dest="command") # Extract tools extractools = subparser.add_parser("extractools", help="Extract tools") @@ -376,10 +376,19 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools): extractools.add_argument("--all_tools", "-o", required=True, help="Filepath to CSV with all extracted tools") # Filter tools filtertools = subparser.add_parser("filtertools", help="Filter tools") - filtertools.add_argument("--tools", "-t", required=True, help="Filepath to CSV with all 
extracted tools, generated by extractools command") + filtertools.add_argument( + "--tools", + "-t", + required=True, + help="Filepath to CSV with all extracted tools, generated by extractools command", + ) filtertools.add_argument("--filtered_tools", "-f", required=True, help="Filepath to CSV with filtered tools") - filtertools.add_argument("--categories", "-c", help="Path to a file with ToolShed category to keep in the extraction (one per line)") - filtertools.add_argument("--exclude", "-e", help="Path to a file with ToolShed ids of tools to exclude (one per line)") + filtertools.add_argument( + "--categories", "-c", help="Path to a file with ToolShed category to keep in the extraction (one per line)" + ) + filtertools.add_argument( + "--exclude", "-e", help="Path to a file with ToolShed ids of tools to exclude (one per line)" + ) filtertools.add_argument("--keep", "-k", help="Path to a file with ToolShed ids of tools to keep (one per line)") args = parser.parse_args() From 0be309d5c67492be9288a7928161a3503783eea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 14:51:36 +0100 Subject: [PATCH 05/20] Fix filtering by categories --- bin/extract_galaxy_tools.py | 23 ++++++++++++++----- ...y_tools.sh => filter_microgalaxy_tools.sh} | 3 +++ 2 files changed, 20 insertions(+), 6 deletions(-) rename bin/{extract_microgalaxy_tools.sh => filter_microgalaxy_tools.sh} (88%) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index e0656f5c..15dcfb24 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -327,6 +327,16 @@ def parse_tools(repo): return tools +def format_list_column(col): + """ + Format a column that could be a list before exporting + """ + if isinstance(col, list): + return col.apply(lambda x: ", ".join([str(i) for i in x])) + else: + return col + + def export_tools(tools: list, output_fp: str) -> None: """ Export tool metadata to tsv output file @@ -335,10 +345,10 @@ def 
export_tools(tools: list, output_fp: str) -> None: :param output_fp: path to output file """ df = pd.DataFrame(tools) - df["ToolShed categories"] = df["ToolShed categories"].apply(lambda x: ", ".join([str(i) for i in x])) - df["EDAM operation"] = df["EDAM operation"].apply(lambda x: ", ".join([str(i) for i in x])) - df["EDAM topic"] = df["EDAM topic"].apply(lambda x: ", ".join([str(i) for i in x])) - df["Galaxy tool ids"] = df["Galaxy tool ids"].apply(lambda x: ", ".join([str(i) for i in x])) + df["ToolShed categories"] = format_list_column(df["ToolShed categories"]) + df["EDAM operation"] = format_list_column(df["EDAM operation"]) + df["EDAM topic"] = format_list_column(df["EDAM topic"]) + df["Galaxy tool ids"] = format_list_column(df["Galaxy tool ids"]) df.to_csv(output_fp, sep="\t", index=False) @@ -356,13 +366,14 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools): # filter ToolShed categories and leave function if not in expected categories if check_categories(tool["ToolShed categories"], ts_cat): name = tool["Galaxy wrapper id"] - tool["Reviewed"] = tool.name in keep_tools or tool.name in excluded_tools + tool["Reviewed"] = name in keep_tools or name in excluded_tools tool["To keep"] = None if name in keep_tools: tool["To keep"] = True elif name in excluded_tools: tool["To keep"] = False filtered_tools.append(tool) + return filtered_tools if __name__ == "__main__": @@ -408,7 +419,7 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools): export_tools(tools, args.all_tools) print() elif args.command == "filtertools": - tools = pd.read_csv(Path(args.tools)).to_dict("records") + tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records") # get categories and tools to exclude categories = read_file(args.categories) excl_tools = read_file(args.exclude) diff --git a/bin/extract_microgalaxy_tools.sh b/bin/filter_microgalaxy_tools.sh similarity index 88% rename from bin/extract_microgalaxy_tools.sh rename to 
bin/filter_microgalaxy_tools.sh index 12bf25ce..b7597276 100644 --- a/bin/extract_microgalaxy_tools.sh +++ b/bin/filter_microgalaxy_tools.sh @@ -10,9 +10,12 @@ curl \ "https://docs.google.com/spreadsheets/d/1Nq_g-CPc8t_eC4M1NAS9XFJDflA7yE3b9hfSg3zu9L4/export?format=tsv&gid=672552331" \ -o "data/microgalaxy/tools_to_exclude" +mkdir -p 'results/microgalaxy' + python bin/extract_galaxy_tools.py \ filtertools \ --tools 'results/all_tools.csv' \ + --filtered_tools 'results/microgalaxy/tools.csv' \ --categories "data/microgalaxy/categories" \ --exclude "data/microgalaxy/tools_to_exclude" \ --keep "data/microgalaxy/tools_to_keep" \ No newline at end of file From 8fdaeec0b3b5a38e10e34f822704363cbc0729e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 14:56:10 +0100 Subject: [PATCH 06/20] Apply suggestions from code review Co-authored-by: Nicola Soranzo --- bin/extract_galaxy_tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 15dcfb24..eb905ae4 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -144,8 +144,8 @@ def check_categories(ts_categories, ts_cat): return True -def get_tool_metadata(tool, repo): - """" " +def get_tool_metadata(tool: ContentFile, repo: Repository) -> : + """ Get tool metadata from the .shed.yaml, requirements in the macros or xml file, bio.tools information if available in the macros or xml, EDAM annotations using bio.tools API, recent conda version using conda API @@ -274,7 +274,7 @@ def get_tool_metadata(tool, repo): return metadata -def parse_tools(repo): +def parse_tools(repo: Repository): """ Parse tools in a GitHub repository, extract them and their metadata @@ -352,7 +352,7 @@ def export_tools(tools: list, output_fp: str) -> None: df.to_csv(output_fp, sep="\t", index=False) -def filter_tools(tools, ts_cat, excluded_tools, keep_tools): +def filter_tools(tools, ts_cat: List[str], 
excluded_tools: List[str], keep_tools: List[str]): """ Filter tools for specific ToolShed categories and add information if to keep or to exclude From 907004aad8da00eff8fc045f379be3318c154006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 15:01:55 +0100 Subject: [PATCH 07/20] Restructure check_categories function Co-authored-by: Nicola Soranzo --- bin/extract_galaxy_tools.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index eb905ae4..bd2124c2 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -134,14 +134,12 @@ def check_categories(ts_categories, ts_cat): :param ts_categories: tool ToolShed categories :param ts_cat: list of ToolShed categories to keep in the extraction """ - if ts_categories is not None and len(ts_cat) > 0: + if not ts_cat: + return True + if not ts_categories: + return False ts_cats = ts_categories.split(", ") - to_keep = False - for cat in ts_cats: - if cat in ts_cat: - to_keep = True - return to_keep - return True + return bool(set(ts_cat) & set(ts_cats)) def get_tool_metadata(tool: ContentFile, repo: Repository) -> : From b3c34f41d4822d27b52f7fc6e3ff5e13f201a895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 15:02:31 +0100 Subject: [PATCH 08/20] Apply suggestions from code review Co-authored-by: Nicola Soranzo --- bin/extract_galaxy_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index bd2124c2..bf9afd79 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -142,7 +142,7 @@ def check_categories(ts_categories, ts_cat): return bool(set(ts_cat) & set(ts_cats)) -def get_tool_metadata(tool: ContentFile, repo: Repository) -> : +def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, Any]]: """ Get tool metadata 
from the .shed.yaml, requirements in the macros or xml file, bio.tools information if available in the macros or xml, EDAM From 4c5db4764ecf41d1b039ef470294305b078a9dd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 15:03:37 +0100 Subject: [PATCH 09/20] Update extract_galaxy_tools.py --- bin/extract_galaxy_tools.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index bf9afd79..23dd95a6 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -5,7 +5,12 @@ import time import xml.etree.ElementTree as et from pathlib import Path -from typing import List +from typing import ( + Any, + Dict, + List, + Optional, +) import pandas as pd import requests From f462b9d46ce483fe4c2892a68c0542fcf3f8fba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 15:05:16 +0100 Subject: [PATCH 10/20] Update extract_galaxy_tools.py --- bin/extract_galaxy_tools.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 23dd95a6..82ffc8eb 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -5,12 +5,7 @@ import time import xml.etree.ElementTree as et from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) +from typing import Any, Dict, List, Optional import pandas as pd import requests From a560b019042e0c8ba9b3ac1769e147467aa30180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 15:09:52 +0100 Subject: [PATCH 11/20] Update bin/extract_galaxy_tools.py Co-authored-by: Nicola Soranzo --- bin/extract_galaxy_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 82ffc8eb..3825c1b6 100644 --- a/bin/extract_galaxy_tools.py +++ 
b/bin/extract_galaxy_tools.py @@ -138,7 +138,7 @@ def check_categories(ts_categories, ts_cat): return True if not ts_categories: return False - ts_cats = ts_categories.split(", ") + ts_cats = ts_categories.split(", ") return bool(set(ts_cat) & set(ts_cats)) From e2868e1065342b2aaa3c1c225b5259c636b6bca6 Mon Sep 17 00:00:00 2001 From: Nicola Soranzo Date: Tue, 31 Oct 2023 14:23:47 +0000 Subject: [PATCH 12/20] Fix type annotations + add `.isort.cfg` --- .isort.cfg | 11 +++++++++++ bin/extract_galaxy_tools.py | 23 +++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) create mode 100644 .isort.cfg diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 00000000..b7d1f8b2 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,11 @@ +[settings] +combine_as_imports=true +force_alphabetical_sort_within_sections=true +# Override force_grid_wrap value from profile=black, but black is still happy +force_grid_wrap=2 +# Same line length as for black +line_length=120 +no_lines_before=LOCALFOLDER +profile=black +reverse_relative=true +skip_gitignore=true diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 3825c1b6..19694a0d 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -5,7 +5,12 @@ import time import xml.etree.ElementTree as et from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import ( + Any, + Dict, + List, + Optional, +) import pandas as pd import requests @@ -19,7 +24,7 @@ BIOTOOLS_API_URL = "https://130.226.25.21" -def read_file(filepath): +def read_file(filepath) -> List[str]: """ Read an optional file with 1 element per line @@ -194,7 +199,9 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, if metadata["ToolShed categories"] is None: metadata["ToolShed categories"] = [] # find and parse macro file - for file in repo.get_contents(tool.path): + file_list = repo.get_contents(tool.path) + assert isinstance(file_list, list) + for file in 
file_list: if "macro" in file.name and file.name.endswith("xml"): file_content = get_string_content(file) root = et.fromstring(file_content) @@ -208,7 +215,7 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, if biotools is not None: metadata["bio.tool id"] = biotools # parse XML file and get meta data from there, also tool ids - for file in repo.get_contents(tool.path): + for file in file_list: if file.name.endswith("xml") and "macro" not in file.name: file_content = get_string_content(file) try: @@ -272,7 +279,7 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, return metadata -def parse_tools(repo: Repository): +def parse_tools(repo: Repository) -> List[Dict[str, Any]]: """ Parse tools in a GitHub repository, extract them and their metadata @@ -335,7 +342,7 @@ def format_list_column(col): return col -def export_tools(tools: list, output_fp: str) -> None: +def export_tools(tools: List[Dict], output_fp: str) -> None: """ Export tool metadata to tsv output file @@ -350,7 +357,7 @@ def export_tools(tools: list, output_fp: str) -> None: df.to_csv(output_fp, sep="\t", index=False) -def filter_tools(tools, ts_cat: List[str], excluded_tools: List[str], keep_tools: List[str]): +def filter_tools(tools: List[Dict], ts_cat: List[str], excluded_tools: List[str], keep_tools: List[str]) -> List[Dict]: """ Filter tools for specific ToolShed categories and add information if to keep or to exclude @@ -407,7 +414,7 @@ def filter_tools(tools, ts_cat: List[str], excluded_tools: List[str], keep_tools # get list of GitHub repositories to parse repo_list = get_tool_github_repositories(g) # parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file - tools = [] + tools: List[Dict] = [] for r in repo_list: print(r) if "github" not in r: From 4383361245648105d61a118607001bf3dfbc16f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 
15:52:20 +0100 Subject: [PATCH 13/20] Fix format col --- bin/extract_galaxy_tools.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 19694a0d..bad5775a 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -336,24 +336,23 @@ def format_list_column(col): """ Format a column that could be a list before exporting """ - if isinstance(col, list): - return col.apply(lambda x: ", ".join([str(i) for i in x])) - else: - return col + return col.apply(lambda x: ", ".join([str(i) for i in x])) -def export_tools(tools: List[Dict], output_fp: str) -> None: +def export_tools(tools: List[Dict], output_fp: str, format_list_col=False) -> None: """ Export tool metadata to tsv output file :param tools: dictionary with tools :param output_fp: path to output file + :param format_list_col: boolean indicating if list columns should be formatting """ df = pd.DataFrame(tools) - df["ToolShed categories"] = format_list_column(df["ToolShed categories"]) - df["EDAM operation"] = format_list_column(df["EDAM operation"]) - df["EDAM topic"] = format_list_column(df["EDAM topic"]) - df["Galaxy tool ids"] = format_list_column(df["Galaxy tool ids"]) + if format_list_col: + df["ToolShed categories"] = format_list_column(df["ToolShed categories"]) + df["EDAM operation"] = format_list_column(df["EDAM operation"]) + df["EDAM topic"] = format_list_column(df["EDAM topic"]) + df["Galaxy tool ids"] = format_list_column(df["Galaxy tool ids"]) df.to_csv(output_fp, sep="\t", index=False) @@ -421,7 +420,7 @@ def filter_tools(tools: List[Dict], ts_cat: List[str], excluded_tools: List[str] continue repo = get_github_repo(r, g) tools += parse_tools(repo) - export_tools(tools, args.all_tools) + export_tools(tools, args.all_tools, format_col=True) print() elif args.command == "filtertools": tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records") From 
e28f24d14ebeff0bc0e29587666eb24618a18be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 15:53:31 +0100 Subject: [PATCH 14/20] Change CSV to TSV --- README.md | 2 +- bin/extract_all_tools.sh | 2 +- bin/extract_galaxy_tools.py | 6 +++--- bin/filter_microgalaxy_tools.sh | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 33ae4c0e..82c6689e 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Galaxy Tool extractor $ python bin/extract_all_tools.sh ``` -The script will generate a CSV file with each tool found in the list of GitHub repositories and metadata for these tools: +The script will generate a TSV file with each tool found in the list of GitHub repositories and metadata for these tools: 1. Galaxy wrapper id 2. Description diff --git a/bin/extract_all_tools.sh b/bin/extract_all_tools.sh index f68a00c1..dcfb47c9 100644 --- a/bin/extract_all_tools.sh +++ b/bin/extract_all_tools.sh @@ -3,4 +3,4 @@ python bin/extract_galaxy_tools.py \ extractools \ --api $GITHUB_API_KEY \ - --all_tools 'results/all_tools.csv' \ No newline at end of file + --all_tools 'results/all_tools.tsv' \ No newline at end of file diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index bad5775a..bf814057 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -388,16 +388,16 @@ def filter_tools(tools: List[Dict], ts_cat: List[str], excluded_tools: List[str] # Extract tools extractools = subparser.add_parser("extractools", help="Extract tools") extractools.add_argument("--api", "-a", required=True, help="GitHub access token") - extractools.add_argument("--all_tools", "-o", required=True, help="Filepath to CSV with all extracted tools") + extractools.add_argument("--all_tools", "-o", required=True, help="Filepath to TSV with all extracted tools") # Filter tools filtertools = subparser.add_parser("filtertools", help="Filter tools") filtertools.add_argument( 
"--tools", "-t", required=True, - help="Filepath to CSV with all extracted tools, generated by extractools command", + help="Filepath to TSV with all extracted tools, generated by extractools command", ) - filtertools.add_argument("--filtered_tools", "-f", required=True, help="Filepath to CSV with filtered tools") + filtertools.add_argument("--filtered_tools", "-f", required=True, help="Filepath to TSV with filtered tools") filtertools.add_argument( "--categories", "-c", help="Path to a file with ToolShed category to keep in the extraction (one per line)" ) diff --git a/bin/filter_microgalaxy_tools.sh b/bin/filter_microgalaxy_tools.sh index b7597276..b3613fd6 100644 --- a/bin/filter_microgalaxy_tools.sh +++ b/bin/filter_microgalaxy_tools.sh @@ -14,8 +14,8 @@ mkdir -p 'results/microgalaxy' python bin/extract_galaxy_tools.py \ filtertools \ - --tools 'results/all_tools.csv' \ - --filtered_tools 'results/microgalaxy/tools.csv' \ + --tools 'results/all_tools.tsv' \ + --filtered_tools 'results/microgalaxy/tools.tsv' \ --categories "data/microgalaxy/categories" \ --exclude "data/microgalaxy/tools_to_exclude" \ --keep "data/microgalaxy/tools_to_keep" \ No newline at end of file From df13bef321f482f6d4435c282b81f4b4dda7c292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Tue, 31 Oct 2023 15:56:26 +0100 Subject: [PATCH 15/20] Fix format col --- bin/extract_galaxy_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index bf814057..a7bfb5be 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -420,7 +420,7 @@ def filter_tools(tools: List[Dict], ts_cat: List[str], excluded_tools: List[str] continue repo = get_github_repo(r, g) tools += parse_tools(repo) - export_tools(tools, args.all_tools, format_col=True) + export_tools(tools, args.all_tools, format_list_col=True) print() elif args.command == "filtertools": tools = 
pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records") From 1e42376947423b1a152ffd77ce7ee2eecfd88fd3 Mon Sep 17 00:00:00 2001 From: Nicola Soranzo Date: Tue, 31 Oct 2023 16:47:46 +0000 Subject: [PATCH 16/20] bio.tools DNS entry is back --- bin/extract_galaxy_tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index a7bfb5be..56266d21 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -20,8 +20,8 @@ from github.Repository import Repository # Config variables -# BIOTOOLS_API_URL = "https://bio.tools" -BIOTOOLS_API_URL = "https://130.226.25.21" +BIOTOOLS_API_URL = "https://bio.tools" +# BIOTOOLS_API_URL = "https://130.226.25.21" def read_file(filepath) -> List[str]: @@ -261,7 +261,7 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, metadata["Status"] = "Up-to-date" # get bio.tool information if metadata["bio.tool id"] is not None: - r = requests.get(f'{BIOTOOLS_API_URL}/api/tool/{metadata["bio.tool id"]}/?format=json', verify=False) + r = requests.get(f'{BIOTOOLS_API_URL}/api/tool/{metadata["bio.tool id"]}/?format=json') if r.status_code == requests.codes.ok: biotool_info = r.json() if "function" in biotool_info: From 7d04335d5fb66556ff912c8d8d11873d18e206da Mon Sep 17 00:00:00 2001 From: Nicola Soranzo Date: Tue, 31 Oct 2023 17:08:26 +0000 Subject: [PATCH 17/20] Add missing type annotations --- bin/extract_galaxy_tools.py | 18 +++++++++--------- mypy.ini | 1 + 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 56266d21..1191db12 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -24,7 +24,7 @@ # BIOTOOLS_API_URL = "https://130.226.25.21" -def read_file(filepath) -> List[str]: +def read_file(filepath: Optional[str]) -> List[str]: """ Read an optional file with 1 element per line @@ -40,7 +40,7 
@@ def read_file(filepath) -> List[str]: return [] -def get_string_content(cf): +def get_string_content(cf: ContentFile) -> str: """ Get string of the content from a ContentFile @@ -81,7 +81,7 @@ def get_github_repo(url: str, g: Github) -> Repository: return g.get_user(u_split[-2]).get_repo(u_split[-1]) -def get_shed_attribute(attrib, shed_content, empty_value): +def get_shed_attribute(attrib: str, shed_content: Dict[str, Any], empty_value: Any) -> Any: """ Get a shed attribute @@ -95,7 +95,7 @@ def get_shed_attribute(attrib, shed_content, empty_value): return empty_value -def get_biotools(el): +def get_biotools(el: et.Element) -> Optional[str]: """ Get bio.tools information @@ -109,7 +109,7 @@ def get_biotools(el): return None -def get_conda_package(el): +def get_conda_package(el: et.Element) -> Optional[str]: """ Get conda package information @@ -132,7 +132,7 @@ def get_conda_package(el): return None -def check_categories(ts_categories, ts_cat): +def check_categories(ts_categories: str, ts_cat: List[str]) -> bool: """ Check if tool fit in ToolShed categories to keep @@ -332,14 +332,14 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]: return tools -def format_list_column(col): +def format_list_column(col: pd.Series) -> pd.Series: """ Format a column that could be a list before exporting """ - return col.apply(lambda x: ", ".join([str(i) for i in x])) + return col.apply(lambda x: ", ".join(str(i) for i in x)) -def export_tools(tools: List[Dict], output_fp: str, format_list_col=False) -> None: +def export_tools(tools: List[Dict], output_fp: str, format_list_col: bool = False) -> None: """ Export tool metadata to tsv output file diff --git a/mypy.ini b/mypy.ini index 180dae24..d801b021 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,5 +1,6 @@ [mypy] check_untyped_defs = True +disallow_untyped_defs = True ignore_missing_imports = True pretty = True no_implicit_optional = True From 0ab36c2ca1074144ca366a2296f716a3a4183bdb Mon Sep 17 00:00:00 2001 From: Nicola 
Soranzo Date: Tue, 31 Oct 2023 18:00:26 +0000 Subject: [PATCH 18/20] Exclude broken pandas-stubs version for Python 3.8 mypy --- tox.ini | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tox.ini b/tox.ini index 9ffdce31..6a29c3f3 100644 --- a/tox.ini +++ b/tox.ini @@ -1,18 +1,18 @@ [tox] envlist = lint -[testenv:lint] +[testenv] commands = - ruff . - black --check --diff . - isort --check --diff . - mypy . + lint: ruff . + lint: black --check --diff . + lint: isort --check --diff . + lint: mypy . deps = - black - isort - mypy - pandas-stubs - ruff - types-PyYAML - types-requests + lint: black + lint: isort + lint: mypy + lint: pandas-stubs != 2.0.3.230814 + lint: ruff + lint: types-PyYAML + lint: types-requests skip_install = true From c7daa34ad9a4215f5c1ec1025091abf591b9c454 Mon Sep 17 00:00:00 2001 From: Nicola Soranzo Date: Wed, 1 Nov 2023 08:20:49 +0000 Subject: [PATCH 19/20] Write extracted tools only once Also: - Use `.extend()` instead of `+=` (slightly faster) - Write error messages to `sys.stderr` --- bin/extract_galaxy_tools.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 1191db12..54bfbcea 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -2,6 +2,7 @@ import argparse import base64 +import sys import time import xml.etree.ElementTree as et from pathlib import Path @@ -60,7 +61,7 @@ def get_tool_github_repositories(g: Github) -> List[str]: for i in range(1, 5): repo_f = repo.get_contents(f"repositories0{i}.list") repo_l = get_string_content(repo_f).rstrip() - repo_list += repo_l.split("\n") + repo_list.extend(repo_l.split("\n")) return repo_list @@ -221,7 +222,7 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, try: root = et.fromstring(file_content) except Exception: - print(file_content) + print(file_content, file=sys.stderr) else: # version if
metadata["Galaxy wrapper version"] is None: @@ -293,7 +294,7 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]: try: repo_tools = repo.get_contents("wrappers") except Exception: - print("No tool folder found") + print("No tool folder found", file=sys.stderr) return [] assert isinstance(repo_tools, list) tool_folders.append(repo_tools) @@ -418,10 +419,12 @@ def filter_tools(tools: List[Dict], ts_cat: List[str], excluded_tools: List[str] print(r) if "github" not in r: continue - repo = get_github_repo(r, g) - tools += parse_tools(repo) - export_tools(tools, args.all_tools, format_list_col=True) - print() + try: + repo = get_github_repo(r, g) + tools.extend(parse_tools(repo)) + except Exception as e: + print(f"Error while extracting tools from repo {r}: {e}", file=sys.stderr) + export_tools(tools, args.all_tools, format_list_col=True) elif args.command == "filtertools": tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records") # get categories and tools to exclude From e0bc3db675d3c33ce45fdd68bc0c9f67b2dc9d19 Mon Sep 17 00:00:00 2001 From: paulzierep Date: Wed, 1 Nov 2023 11:12:27 +0100 Subject: [PATCH 20/20] create the folder if needed --- bin/extract_all_tools.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/extract_all_tools.sh b/bin/extract_all_tools.sh index dcfb47c9..1dc96028 100644 --- a/bin/extract_all_tools.sh +++ b/bin/extract_all_tools.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +mkdir -p 'results/' + python bin/extract_galaxy_tools.py \ extractools \ --api $GITHUB_API_KEY \