Skip to content

Commit

Permalink
* store the location of the parsed folder
Browse files Browse the repository at this point in the history
* add some docu comments
  • Loading branch information
paulzierep committed Mar 4, 2024
1 parent 8fe3173 commit 6c04f4f
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 7 deletions.
26 changes: 23 additions & 3 deletions bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,8 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str,
"""
if tool.type != "dir":
return None

# the folder of the tool is used as Galaxy wrapper id (maybe rather use the .shed.yml name)
metadata = {
"Galaxy wrapper id": tool.name,
"Galaxy tool ids": [],
Expand All @@ -261,7 +263,8 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str,
"ToolShed categories": [],
"ToolShed id": None,
"Galaxy wrapper owner": None,
"Galaxy wrapper source": None,
"Galaxy wrapper source": None, # this is what is written in the .shed.yml
"Galaxy wrapper parsed folder": None, # this is the folder that was actually parsed
"Galaxy wrapper version": None,
"Conda id": None,
"Conda version": None,
Expand All @@ -271,6 +274,7 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str,
shed = repo.get_contents(f"{tool.path}/.shed.yml")
except Exception:
return None
# parse the .shed.yml
else:
file_content = get_string_content(shed)
yaml_content = yaml.load(file_content, Loader=yaml.FullLoader)
Expand All @@ -287,9 +291,15 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str,
metadata["ToolShed categories"] = get_shed_attribute("categories", yaml_content, [])
if metadata["ToolShed categories"] is None:
metadata["ToolShed categories"] = []
# find and parse macro file

# get all files in the folder
file_list = repo.get_contents(tool.path)
assert isinstance(file_list, list)

# store the github location where the folder was parsed
metadata["Galaxy wrapper parsed folder"] = tool.html_url

# find and parse macro file
for file in file_list:
if "macro" in file.name and file.name.endswith("xml"):
file_content = get_string_content(file)
Expand All @@ -308,7 +318,8 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str,
biii = get_xref(child, attrib_type="biii")
if biii is not None:
metadata["biii"] = biii
# parse XML file and get meta data from there, also tool ids

# parse XML file and get meta data from there
for file in file_list:
if file.name.endswith("xml") and "macro" not in file.name:
file_content = get_string_content(file)
Expand Down Expand Up @@ -350,6 +361,7 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str,
# tool ids
if "id" in root.attrib:
metadata["Galaxy tool ids"].append(root.attrib["id"])

# get latest conda version and compare to the wrapper version
if metadata["Conda id"] is not None:
r = requests.get(f'https://api.anaconda.org/package/bioconda/{metadata["Conda id"]}')
Expand Down Expand Up @@ -396,6 +408,7 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]:
print("No tool folder found", sys.stderr)
return []
assert isinstance(repo_tools, list)

tool_folders.append(repo_tools)
try:
repo_tools = repo.get_contents("tool_collections")
Expand All @@ -404,6 +417,10 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]:
else:
assert isinstance(repo_tools, list)
tool_folders.append(repo_tools)

# tool_folders will contain a list of all folders in the
# repository named wrappers/tools/tool_collections

# parse folders
tools = []
for folder in tool_folders:
Expand All @@ -413,7 +430,10 @@ def parse_tools(repo: Repository) -> List[Dict[str, Any]]:
print("WAITING for 1 hour to retrieve GitHub API request access !!!")
print()
time.sleep(60 * 60)

# parse tool
# if the folder (tool) has a .shed.yml file, run get_tool_metadata on that folder,
# otherwise go one level down and check if there is a .shed.yml in a subfolder
try:
repo.get_contents(f"{tool.path}/.shed.yml")
except Exception:
Expand Down
8 changes: 4 additions & 4 deletions results/test.list_tools.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Galaxy wrapper id Total tool usage (usegalaxy.eu) No. of tool users (2022-2023) (usegalaxy.eu) Galaxy tool ids Description bio.tool id biii bio.tool name bio.tool description EDAM operation EDAM topic Status Source ToolShed categories ToolShed id Galaxy wrapper owner Galaxy wrapper source Galaxy wrapper version Conda id Conda version https://usegalaxy.org https://usegalaxy.org.au https://usegalaxy.eu
2d_auto_threshold 6541.0 39.0 ip_threshold Automatic thresholding scikit-image scikit-image scikit-image Scikit-image contains image processing algorithms for SciPy, including IO, morphology, filtering, warping, color manipulation, object detection, etc. Image analysis, Image annotation, Visualisation, Data handling Imaging, Software engineering, Literature and language To update https://github.com/bmcv Imaging 2d_auto_threshold imgteam https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/2d_auto_threshold/ 0.0.5-2 scikit-image (0/1) (1/1) (1/1)
abritamr abritamr A pipeline for running AMRfinderPlus and collating results into functional classes Up-to-date https://zenodo.org/record/7370628 Sequence Analysis abritamr iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/abritamr 1.0.14 abritamr 1.0.14 (0/1) (0/1) (0/1)
aldex2 129.0 13.1 aldex2 Performs analysis Of differential abundance taking sample variation into account aldex2 ALDEx2 A differential abundance analysis for the comparison of two or more conditions. It uses a Dirichlet-multinomial model to infer abundance from counts, that has been optimized for three or more experimental replicates. Infers sampling variation and calculates the expected FDR given the biological and sampling variation using the Wilcox rank test and Welches t-test, or the glm and Kruskal Wallis tests. Reports both P and fdr values calculated by the Benjamini Hochberg correction. Statistical inference Gene expression, Statistics and probability To update https://github.com/ggloor/ALDEx_bioc Metagenomics aldex2 iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/aldex2 1.26.0 bioconductor-aldex2 1.34.0 (0/1) (0/1) (1/1)
Galaxy wrapper id Total tool usage (usegalaxy.eu) No. of tool users (2022-2023) (usegalaxy.eu) Galaxy tool ids Description bio.tool id biii bio.tool name bio.tool description EDAM operation EDAM topic Status Source ToolShed categories ToolShed id Galaxy wrapper owner Galaxy wrapper source Galaxy wrapper parsed folder Galaxy wrapper version Conda id Conda version https://usegalaxy.org https://usegalaxy.org.au https://usegalaxy.eu
2d_auto_threshold 6541.0 39.0 ip_threshold Automatic thresholding scikit-image scikit-image scikit-image Scikit-image contains image processing algorithms for SciPy, including IO, morphology, filtering, warping, color manipulation, object detection, etc. Image analysis, Image annotation, Visualisation, Data handling Imaging, Software engineering, Literature and language To update https://github.com/bmcv Imaging 2d_auto_threshold imgteam https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/2d_auto_threshold/ https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/2d_auto_threshold 0.0.6-2 scikit-image (0/1) (1/1) (1/1)
abritamr abritamr A pipeline for running AMRfinderPlus and collating results into functional classes Up-to-date https://zenodo.org/record/7370628 Sequence Analysis abritamr iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/abritamr https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/abritamr 1.0.14 abritamr 1.0.14 (0/1) (0/1) (0/1)
aldex2 129.0 13.0 aldex2 Performs analysis Of differential abundance taking sample variation into account aldex2 ALDEx2 A differential abundance analysis for the comparison of two or more conditions. It uses a Dirichlet-multinomial model to infer abundance from counts, that has been optimized for three or more experimental replicates. Infers sampling variation and calculates the expected FDR given the biological and sampling variation using the Wilcox rank test and Welches t-test, or the glm and Kruskal Wallis tests. Reports both P and fdr values calculated by the Benjamini Hochberg correction. Statistical inference Gene expression, Statistics and probability To update https://github.com/ggloor/ALDEx_bioc Metagenomics aldex2 iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/aldex2 https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/aldex2 1.26.0 bioconductor-aldex2 1.34.0 (0/1) (0/1) (1/1)

0 comments on commit 6c04f4f

Please sign in to comment.