def reduce_ontology_terms(terms: List, ontology: Any) -> List:
    """
    Reduce a list of ontology terms to the most specific ones.

    A term is dropped when another term of the list is one of its direct or
    indirect subclasses in the ontology; in other words, only terms that are
    not super-classes of another listed term are kept.
    Terms whose label cannot be found in the ontology are dropped as well.

    :param terms: list of term labels from that ontology
    :param ontology: loaded ontology, queried with ``search_one(label=...)``
    :return: first label of every retained class, in the order of ``terms``
        (the input list itself is returned when it is empty)
    """
    # if list is empty do nothing
    if not terms:
        return terms

    # resolve labels to ontology classes, skipping labels that are not found
    found = [ontology.search_one(label=term) for term in terms]
    candidates = [cla for cla in found if cla is not None]
    candidate_set = set(candidates)

    kept = []
    for cla in candidates:
        try:
            # descendants() walks the whole hierarchy below the class, whereas
            # subclasses() would only return the direct children and therefore
            # miss grandparent -> grandchild relations between listed terms
            descendants = set(cla.descendants(include_self=False))
        except Exception as e:
            # best effort: a class that cannot be inspected is left out, and
            # the problem is reported on stderr like other errors in this file
            print(f"Error processing class {cla}: {e}", file=sys.stderr)
            continue

        # a class never disqualifies itself (e.g. when a term is listed twice)
        descendants.discard(cla)

        # only keep the class if none of the other terms is more specific
        if descendants.isdisjoint(candidate_set):
            kept.append(cla)

    # convert back to terms
    return [cla.label[0] for cla in kept]
reduce_ontology_terms(tool["EDAM topic"], ontology=edam_ontology) + export_tools_to_json(tools, args.all_tools_json) export_tools_to_tsv(tools, args.all_tools, format_list_col=True, add_usage_stats=True) diff --git a/requirements.txt b/requirements.txt index 34ad35e7..f3246526 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ pyyaml numpy Pillow matplotlib -wordcloud \ No newline at end of file +wordcloud +owlready2 \ No newline at end of file diff --git a/results/microgalaxy/tools_wordcloud.png b/results/microgalaxy/tools_wordcloud.png deleted file mode 100644 index 319f636f..00000000 Binary files a/results/microgalaxy/tools_wordcloud.png and /dev/null differ