From 9fe3cb213b920c206e75f6f9ff16dc2efb56794c Mon Sep 17 00:00:00 2001 From: paulzierep Date: Tue, 4 Jun 2024 12:20:09 +0200 Subject: [PATCH 1/6] add reduce_ontology_terms func and apply to EDAM --- bin/extract_galaxy_tools.py | 55 ++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index c558d6fa..c0fb2774 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -22,6 +22,8 @@ from github.ContentFile import ContentFile from github.Repository import Repository +from owlready2 import get_ontology, Thing + # Config variables BIOTOOLS_API_URL = "https://bio.tools" # BIOTOOLS_API_URL = "https://130.226.25.21" @@ -620,6 +622,47 @@ def filter_tools( return ts_filtered_tools, filtered_tools +def reduce_ontology_terms(terms: List, ontology: Any) -> List: + """ + Reduces a list of Ontology terms, to include only terms that are not subclasses of one of the other terms. + + :terms: list of terms from that ontology + :ontology: Ontology + """ + + # if list is empty do nothing + if not terms: + return terms + + classes = [ontology.search_one(label=term) for term in terms] + check_classes = [cla for cla in classes if cla is not None] # Remove None values + + new_classes = [] + for cla in check_classes: + try: + # get all subclasses + subclasses = list(cla.subclasses()) + + # check if any of the other classes is a subclass + include_class = True + for subcla in subclasses: + for cla2 in check_classes: + if subcla == cla2: + include_class = False + + # only keep the class if it is not a parent class + if include_class: + new_classes.append(cla) + + except Exception as e: + print(f"Error processing class {cla}: {e}") + + # convert back to terms, skipping None values + new_terms = [cla.label[0] for cla in new_classes if cla is not None] + # print(f"Terms: {len(terms)}, New terms: {len(new_terms)}") + return new_terms + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Extract Galaxy tools from GitHub repositories together with biotools and conda metadata" @@ -695,7 +738,7 @@ def filter_tools( run_test=args.test, add_extra_repositories=not args.avoid_extra_repositories, ) - # parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file + # parse tools in GitHub repositories to extract metadata, filter by TS categories and export to output file tools: List[Dict] = [] for r in repo_list: print("Parsing tools from:", (r)) @@ -709,6 +752,16 @@ def filter_tools( f"Error while extracting tools from repo {r}: {e}", file=sys.stderr, ) + + # add additional information to the List[Dict] object + edam_ontology = get_ontology("https://edamontology.org/EDAM_1.25.owl").load() + + for tool in tools: + tool["EDAM operation (no subclasses)"] = reduce_ontology_terms( + tool["EDAM operation"], ontology=edam_ontology + ) + tool["EDAM topic (no subclasses)"] = reduce_ontology_terms(tool["EDAM topic"], ontology=edam_ontology) + export_tools_to_json(tools, args.all_tools_json) export_tools_to_tsv(tools, args.all_tools, format_list_col=True, add_usage_stats=True) From 22cae73598a4019a01b0b4b63600e5af88a0c3bd Mon Sep 17 00:00:00 2001 From: paulzierep Date: Tue, 4 Jun 2024 12:23:00 +0200 Subject: [PATCH 2/6] linting --- bin/extract_galaxy_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index c0fb2774..d50f0a6c 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -22,7 +22,7 @@ from github.ContentFile import ContentFile from github.Repository import Repository -from owlready2 import get_ontology, Thing +from owlready2 import get_ontology # Config variables BIOTOOLS_API_URL = "https://bio.tools" From 594a88f6a54b07eeee85e79eb08ac6c492a90a18 Mon Sep 17 00:00:00 2001 From: paulzierep Date: Tue, 4 Jun 2024 12:38:44 +0200 Subject: [PATCH 3/6] linting --- bin/extract_galaxy_tools.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index d50f0a6c..a2dfe7b1 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -21,7 +21,6 @@ from github import Github from github.ContentFile import ContentFile from github.Repository import Repository - from owlready2 import get_ontology # Config variables From 9deb14bc46a742cf25d99b8f103edc7db4e96f9b Mon Sep 17 00:00:00 2001 From: paulzierep Date: Tue, 4 Jun 2024 13:45:38 +0200 Subject: [PATCH 4/6] add requirement, fix test --- bin/extract_all_tools_test.sh | 2 +- requirements.txt | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/extract_all_tools_test.sh b/bin/extract_all_tools_test.sh index f24b3c5c..2ba40be5 100755 --- a/bin/extract_all_tools_test.sh +++ b/bin/extract_all_tools_test.sh @@ -8,7 +8,7 @@ json_output="results/${1}_tools.json" python bin/extract_galaxy_tools.py \ extractools \ --api $GITHUB_API_KEY \ - --all-tools $output \ + --all-tools $tsv_output \ --all-tools-json $json_output \ --planemo-repository-list $1 \ --test diff --git a/requirements.txt b/requirements.txt index 34ad35e7..f3246526 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ pyyaml numpy Pillow matplotlib -wordcloud \ No newline at end of file +wordcloud +owlready2 \ No newline at end of file From 68d3d8b4c1c4bc5eacccee89845b9a1de0a3f652 Mon Sep 17 00:00:00 2001 From: paulzierep Date: Tue, 4 Jun 2024 13:51:41 +0200 Subject: [PATCH 5/6] format EDAM no subclasses correctly --- bin/extract_galaxy_tools.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index a2dfe7b1..7003390c 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -577,6 +577,10 @@ def export_tools_to_tsv( df["ToolShed categories"] = format_list_column(df["ToolShed categories"]) df["EDAM operation"] = format_list_column(df["EDAM operation"]) df["EDAM topic"] = format_list_column(df["EDAM topic"]) + + df["EDAM operation (no subclasses)"] = format_list_column(df["EDAM operation (no subclasses)"]) + df["EDAM topic (no subclasses)"] = format_list_column(df["EDAM topic (no subclasses)"]) + df["bio.tool ids"] = format_list_column(df["bio.tool ids"]) # the Galaxy tools need to be formatted for the add_instances_to_table to work From 4b87b0369439864bedcd0b30cdb520c87115b1ef Mon Sep 17 00:00:00 2001 From: paulzierep Date: Tue, 4 Jun 2024 14:09:36 +0200 Subject: [PATCH 6/6] change subclass with superclass --- bin/extract_galaxy_tools.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py index 7003390c..e71e5525 100644 --- a/bin/extract_galaxy_tools.py +++ b/bin/extract_galaxy_tools.py @@ -578,8 +578,8 @@ def export_tools_to_tsv( df["EDAM operation"] = format_list_column(df["EDAM operation"]) df["EDAM topic"] = format_list_column(df["EDAM topic"]) - df["EDAM operation (no subclasses)"] = format_list_column(df["EDAM operation (no subclasses)"]) - df["EDAM topic (no subclasses)"] = format_list_column(df["EDAM topic (no subclasses)"]) + df["EDAM operation (no superclasses)"] = format_list_column(df["EDAM operation (no superclasses)"]) + df["EDAM topic (no superclasses)"] = format_list_column(df["EDAM topic (no superclasses)"]) df["bio.tool ids"] = format_list_column(df["bio.tool ids"]) @@ -627,7 +627,8 @@ def filter_tools( def reduce_ontology_terms(terms: List, ontology: Any) -> List: """ - Reduces a list of Ontology terms, to include only terms that are not subclasses of one of the other terms. + Reduces a list of Ontology terms, to include only terms that are not super-classes of one of the other terms. + In other terms all classes that have a subclass in the terms are removed. :terms: list of terms from that ontology :ontology: Ontology @@ -760,10 +761,10 @@ def reduce_ontology_terms(terms: List, ontology: Any) -> List: edam_ontology = get_ontology("https://edamontology.org/EDAM_1.25.owl").load() for tool in tools: - tool["EDAM operation (no subclasses)"] = reduce_ontology_terms( + tool["EDAM operation (no superclasses)"] = reduce_ontology_terms( tool["EDAM operation"], ontology=edam_ontology ) - tool["EDAM topic (no subclasses)"] = reduce_ontology_terms(tool["EDAM topic"], ontology=edam_ontology) + tool["EDAM topic (no superclasses)"] = reduce_ontology_terms(tool["EDAM topic"], ontology=edam_ontology) export_tools_to_json(tools, args.all_tools_json) export_tools_to_tsv(tools, args.all_tools, format_list_col=True, add_usage_stats=True)