From 6c04f4fd504a347c880fb5522005e99a9bbe5395 Mon Sep 17 00:00:00 2001 From: paulzierep Date: Mon, 4 Mar 2024 09:54:17 +0100 Subject: [PATCH] * store the location of the parsed folder * add some docu comments --- bin/extract_galaxy_tools.py | 26 +++++++++++++++++++++++--- results/test.list_tools.tsv | 8 ++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 4b36a619..eb9de460 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -246,6 +246,8 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, """ if tool.type != "dir": return None + + # the folder of the tool is used as Galaxy wrapper id (maybe rather use the .shed.yml name) metadata = { "Galaxy wrapper id": tool.name, "Galaxy tool ids": [], @@ -261,7 +263,8 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, "ToolShed categories": [], "ToolShed id": None, "Galaxy wrapper owner": None, - "Galaxy wrapper source": None, + "Galaxy wrapper source": None, # this is what is written in the .shed.yml + "Galaxy wrapper parsed folder": None, # this is the actual parsed folder "Galaxy wrapper version": None, "Conda id": None, "Conda version": None, @@ -271,6 +274,7 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, shed = repo.get_contents(f"{tool.path}/.shed.yml") except Exception: return None + # parse the .shed.yml else: file_content = get_string_content(shed) yaml_content = yaml.load(file_content, Loader=yaml.FullLoader) @@ -287,9 +291,15 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, metadata["ToolShed categories"] = get_shed_attribute("categories", yaml_content, []) if metadata["ToolShed categories"] is None: metadata["ToolShed categories"] = [] - # find and parse macro file + + # get all files in the folder file_list = repo.get_contents(tool.path) assert isinstance(file_list, list) + + 
# store the github location where the folder was parsed + metadata["Galaxy wrapper parsed folder"] = tool.html_url + + # find and parse macro file for file in file_list: if "macro" in file.name and file.name.endswith("xml"): file_content = get_string_content(file) @@ -308,7 +318,8 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, biii = get_xref(child, attrib_type="biii") if biii is not None: metadata["biii"] = biii - # parse XML file and get meta data from there, also tool ids + + # parse XML file and get meta data from there for file in file_list: if file.name.endswith("xml") and "macro" not in file.name: file_content = get_string_content(file) @@ -350,6 +361,7 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, # tool ids if "id" in root.attrib: metadata["Galaxy tool ids"].append(root.attrib["id"]) + # get latest conda version and compare to the wrapper version if metadata["Conda id"] is not None: r = requests.get(f'https://api.anaconda.org/package/bioconda/{metadata["Conda id"]}') @@ -396,6 +408,7 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]: print("No tool folder found", sys.stderr) return [] assert isinstance(repo_tools, list) + tool_folders.append(repo_tools) try: repo_tools = repo.get_contents("tool_collections") @@ -404,6 +417,10 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]: else: assert isinstance(repo_tools, list) tool_folders.append(repo_tools) + + # tool_folders will contain a list of all folders in the + # repository named wrappers/tools/tool_collections + # parse folders tools = [] for folder in tool_folders: @@ -413,7 +430,10 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]: print("WAITING for 1 hour to retrieve GitHub API request access !!!") print() time.sleep(60 * 60) + # parse tool + # if the folder (tool) has a .shed.yml file run get_tool_metadata on that folder, + # otherwise go one level down and check if there is a .shed.yml in a 
subfolder try: repo.get_contents(f"{tool.path}/.shed.yml") except Exception: diff --git a/results/test.list_tools.tsv b/results/test.list_tools.tsv index c52820d8..975b4e21 100644 --- a/results/test.list_tools.tsv +++ b/results/test.list_tools.tsv @@ -1,4 +1,4 @@ -Galaxy wrapper id Total tool usage (usegalaxy.eu) No. of tool users (2022-2023) (usegalaxy.eu) Galaxy tool ids Description bio.tool id biii bio.tool name bio.tool description EDAM operation EDAM topic Status Source ToolShed categories ToolShed id Galaxy wrapper owner Galaxy wrapper source Galaxy wrapper version Conda id Conda version https://usegalaxy.org https://usegalaxy.org.au https://usegalaxy.eu -2d_auto_threshold 6541.0 39.0 ip_threshold Automatic thresholding scikit-image scikit-image scikit-image Scikit-image contains image processing algorithms for SciPy, including IO, morphology, filtering, warping, color manipulation, object detection, etc. Image analysis, Image annotation, Visualisation, Data handling Imaging, Software engineering, Literature and language To update https://github.com/bmcv Imaging 2d_auto_threshold imgteam https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/2d_auto_threshold/ 0.0.5-2 scikit-image (0/1) (1/1) (1/1) -abritamr abritamr A pipeline for running AMRfinderPlus and collating results into functional classes Up-to-date https://zenodo.org/record/7370628 Sequence Analysis abritamr iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/abritamr 1.0.14 abritamr 1.0.14 (0/1) (0/1) (0/1) -aldex2 129.0 13.1 aldex2 Performs analysis Of differential abundance taking sample variation into account aldex2 ALDEx2 A differential abundance analysis for the comparison of two or more conditions. It uses a Dirichlet-multinomial model to infer abundance from counts, that has been optimized for three or more experimental replicates. 
Infers sampling variation and calculates the expected FDR given the biological and sampling variation using the Wilcox rank test and Welches t-test, or the glm and Kruskal Wallis tests. Reports both P and fdr values calculated by the Benjamini Hochberg correction. Statistical inference Gene expression, Statistics and probability To update https://github.com/ggloor/ALDEx_bioc Metagenomics aldex2 iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/aldex2 1.26.0 bioconductor-aldex2 1.34.0 (0/1) (0/1) (1/1) +Galaxy wrapper id Total tool usage (usegalaxy.eu) No. of tool users (2022-2023) (usegalaxy.eu) Galaxy tool ids Description bio.tool id biii bio.tool name bio.tool description EDAM operation EDAM topic Status Source ToolShed categories ToolShed id Galaxy wrapper owner Galaxy wrapper source Galaxy wrapper parsed folder Galaxy wrapper version Conda id Conda version https://usegalaxy.org https://usegalaxy.org.au https://usegalaxy.eu +2d_auto_threshold 6541.0 39.0 ip_threshold Automatic thresholding scikit-image scikit-image scikit-image Scikit-image contains image processing algorithms for SciPy, including IO, morphology, filtering, warping, color manipulation, object detection, etc. 
Image analysis, Image annotation, Visualisation, Data handling Imaging, Software engineering, Literature and language To update https://github.com/bmcv Imaging 2d_auto_threshold imgteam https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/2d_auto_threshold/ https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/2d_auto_threshold 0.0.6-2 scikit-image (0/1) (1/1) (1/1) +abritamr abritamr A pipeline for running AMRfinderPlus and collating results into functional classes Up-to-date https://zenodo.org/record/7370628 Sequence Analysis abritamr iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/abritamr https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/abritamr 1.0.14 abritamr 1.0.14 (0/1) (0/1) (0/1) +aldex2 129.0 13.0 aldex2 Performs analysis Of differential abundance taking sample variation into account aldex2 ALDEx2 A differential abundance analysis for the comparison of two or more conditions. It uses a Dirichlet-multinomial model to infer abundance from counts, that has been optimized for three or more experimental replicates. Infers sampling variation and calculates the expected FDR given the biological and sampling variation using the Wilcox rank test and Welches t-test, or the glm and Kruskal Wallis tests. Reports both P and fdr values calculated by the Benjamini Hochberg correction. Statistical inference Gene expression, Statistics and probability To update https://github.com/ggloor/ALDEx_bioc Metagenomics aldex2 iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/aldex2 https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/aldex2 1.26.0 bioconductor-aldex2 1.34.0 (0/1) (0/1) (1/1)