Skip to content

Commit

Permalink
Merge pull request galaxyproject#52 from EngyNasr/scripts
Browse files Browse the repository at this point in the history
Updating the extract galaxy tools script to add two more columns one for reduced EDAM operation and one for reduced EDAM topic
  • Loading branch information
bebatut authored Jun 4, 2024
2 parents 64a325d + 667eee6 commit 05bc0ba
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 3 deletions.
2 changes: 1 addition & 1 deletion bin/extract_all_tools_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ json_output="results/${1}_tools.json"
python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all-tools $output \
--all-tools $tsv_output \
--all-tools-json $json_output \
--planemo-repository-list $1 \
--test
Expand Down
59 changes: 58 additions & 1 deletion bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from github import Github
from github.ContentFile import ContentFile
from github.Repository import Repository
from owlready2 import get_ontology

# Config variables
BIOTOOLS_API_URL = "https://bio.tools"
Expand Down Expand Up @@ -576,6 +577,10 @@ def export_tools_to_tsv(
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
df["EDAM operation"] = format_list_column(df["EDAM operation"])
df["EDAM topic"] = format_list_column(df["EDAM topic"])

df["EDAM operation (no superclasses)"] = format_list_column(df["EDAM operation (no superclasses)"])
df["EDAM topic (no superclasses)"] = format_list_column(df["EDAM topic (no superclasses)"])

df["bio.tool ids"] = format_list_column(df["bio.tool ids"])

# the Galaxy tools need to be formatted for the add_instances_to_table to work
Expand Down Expand Up @@ -620,6 +625,48 @@ def filter_tools(
return ts_filtered_tools, filtered_tools


def reduce_ontology_terms(terms: List, ontology: Any) -> List:
"""
Reduces a list of Ontology terms, to include only terms that are not super-classes of one of the other terms.
In other terms all classes that have a subclass in the terms are removed.
:terms: list of terms from that ontology
:ontology: Ontology
"""

# if list is empty do nothing
if not terms:
return terms

classes = [ontology.search_one(label=term) for term in terms]
check_classes = [cla for cla in classes if cla is not None] # Remove None values

new_classes = []
for cla in check_classes:
try:
# get all subclasses
subclasses = list(cla.subclasses())

# check if any of the other classes is a subclass
include_class = True
for subcla in subclasses:
for cla2 in check_classes:
if subcla == cla2:
include_class = False

# only keep the class if it is not a parent class
if include_class:
new_classes.append(cla)

except Exception as e:
print(f"Error processing class {cla}: {e}")

# convert back to terms, skipping None values
new_terms = [cla.label[0] for cla in new_classes if cla is not None]
# print(f"Terms: {len(terms)}, New terms: {len(new_terms)}")
return new_terms


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extract Galaxy tools from GitHub repositories together with biotools and conda metadata"
Expand Down Expand Up @@ -695,7 +742,7 @@ def filter_tools(
run_test=args.test,
add_extra_repositories=not args.avoid_extra_repositories,
)
# parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file
# parse tools in GitHub repositories to extract metadata, filter by TS categories and export to output file
tools: List[Dict] = []
for r in repo_list:
print("Parsing tools from:", (r))
Expand All @@ -709,6 +756,16 @@ def filter_tools(
f"Error while extracting tools from repo {r}: {e}",
file=sys.stderr,
)

# add additional information to the List[Dict] object
edam_ontology = get_ontology("https://edamontology.org/EDAM_1.25.owl").load()

for tool in tools:
tool["EDAM operation (no superclasses)"] = reduce_ontology_terms(
tool["EDAM operation"], ontology=edam_ontology
)
tool["EDAM topic (no superclasses)"] = reduce_ontology_terms(tool["EDAM topic"], ontology=edam_ontology)

export_tools_to_json(tools, args.all_tools_json)
export_tools_to_tsv(tools, args.all_tools, format_list_col=True, add_usage_stats=True)

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ pyyaml
numpy
Pillow
matplotlib
wordcloud
wordcloud
owlready2
Binary file removed results/microgalaxy/tools_wordcloud.png
Binary file not shown.

0 comments on commit 05bc0ba

Please sign in to comment.