Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add reduced EDAM terms without terms being subclasses of other terms #111

Closed
Closed
2 changes: 1 addition & 1 deletion bin/extract_all_tools_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ json_output="results/${1}_tools.json"
python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all-tools $output \
--all-tools $tsv_output \
--all-tools-json $json_output \
--planemo-repository-list $1 \
--test
Expand Down
59 changes: 58 additions & 1 deletion bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from github import Github
from github.ContentFile import ContentFile
from github.Repository import Repository
from owlready2 import get_ontology

# Config variables
BIOTOOLS_API_URL = "https://bio.tools"
Expand Down Expand Up @@ -576,6 +577,10 @@ def export_tools_to_tsv(
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
df["EDAM operation"] = format_list_column(df["EDAM operation"])
df["EDAM topic"] = format_list_column(df["EDAM topic"])

df["EDAM operation (no superclasses)"] = format_list_column(df["EDAM operation (no superclasses)"])
df["EDAM topic (no superclasses)"] = format_list_column(df["EDAM topic (no superclasses)"])

df["bio.tool ids"] = format_list_column(df["bio.tool ids"])

# the Galaxy tools need to be formatted for the add_instances_to_table to work
Expand Down Expand Up @@ -620,6 +625,48 @@ def filter_tools(
return ts_filtered_tools, filtered_tools


def reduce_ontology_terms(terms: List, ontology: Any) -> List:
    """
    Reduce a list of ontology terms to the most specific ones.

    A term is removed when another term of the list is one of its descendants
    in the ontology, i.e. a direct or an indirect subclass. Only terms that are
    not a super-class of any other listed term are kept.

    Terms whose label cannot be found in the ontology are dropped from the
    result. Classes whose hierarchy cannot be inspected are reported on stdout
    and dropped as well (best effort, no exception is propagated).

    :param terms: list of term labels from that ontology
    :param ontology: ontology object providing ``search_one(label=...)``; the
        returned classes must provide ``descendants()`` and ``label``
    :return: labels of the remaining classes, in the order of the input terms;
        an empty (or ``None``) input is returned unchanged
    """

    # if list is empty do nothing
    if not terms:
        return terms

    # resolve each label to its class; search_one gives None for unknown labels
    classes = [ontology.search_one(label=term) for term in terms]
    check_classes = [cla for cla in classes if cla is not None]
    candidates = set(check_classes)

    new_classes = []
    for cla in check_classes:
        try:
            # descendants() covers the whole hierarchy below the class, whereas
            # subclasses() would only report the direct children and miss
            # e.g. a grandchild listed without its intermediate parent
            descendants = set(cla.descendants())
        except Exception as e:
            print(f"Error processing class {cla}: {e}")
            continue

        # descendants() may contain the class itself, which must not count
        descendants.discard(cla)

        # only keep the class if none of the other terms is below it
        if descendants.isdisjoint(candidates):
            new_classes.append(cla)

    # convert back to terms
    return [cla.label[0] for cla in new_classes]


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extract Galaxy tools from GitHub repositories together with biotools and conda metadata"
Expand Down Expand Up @@ -695,7 +742,7 @@ def filter_tools(
run_test=args.test,
add_extra_repositories=not args.avoid_extra_repositories,
)
# parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file
# parse tools in GitHub repositories to extract metadata, filter by TS categories and export to output file
tools: List[Dict] = []
for r in repo_list:
print("Parsing tools from:", (r))
Expand All @@ -709,6 +756,16 @@ def filter_tools(
f"Error while extracting tools from repo {r}: {e}",
file=sys.stderr,
)

# add additional information to the List[Dict] object
edam_ontology = get_ontology("https://edamontology.org/EDAM_1.25.owl").load()

for tool in tools:
tool["EDAM operation (no superclasses)"] = reduce_ontology_terms(
tool["EDAM operation"], ontology=edam_ontology
)
tool["EDAM topic (no superclasses)"] = reduce_ontology_terms(tool["EDAM topic"], ontology=edam_ontology)

export_tools_to_json(tools, args.all_tools_json)
export_tools_to_tsv(tools, args.all_tools, format_list_col=True, add_usage_stats=True)

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ pyyaml
numpy
Pillow
matplotlib
wordcloud
wordcloud
owlready2