-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: integrate BioPortal annotator for term recommendation
Implement functionality to access the BioPortal annotator API. This enables: - Text input analysis - Relevant class retrieval - Recommendations for data authors and curators
- Loading branch information
Showing
4 changed files
with
150 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
"""The annotator module""" | ||
|
||
from typing import Union | ||
from requests import get, exceptions | ||
|
||
|
||
# pylint: disable=too-many-locals | ||
def get_bioportal_annotation( | ||
text: str, | ||
api_key: str, | ||
ontologies: str, | ||
semantic_types: str = "", | ||
expand_semantic_types_hierarchy: bool = False, | ||
expand_class_hierarchy: bool = False, | ||
class_hierarchy_max_level: int = 0, | ||
expand_mappings: bool = False, | ||
stop_words: str = "", | ||
minimum_match_length: int = 3, | ||
exclude_numbers: bool = False, | ||
whole_word_only: bool = True, | ||
exclude_synonyms: bool = False, | ||
longest_only: bool = False, | ||
) -> Union[list, None]: | ||
"""Get an annotation from the BioPortal API | ||
:param text: The text to be annotated. | ||
:param api_key: The BioPortal API key. | ||
:param ontologies: The ontologies to use for annotation. | ||
:param semantic_types: The semantic types to use for annotation. | ||
:param expand_semantic_types_hierarchy: true means to use the semantic | ||
types passed in the "semantic_types" parameter as well as all their | ||
immediate children. false means to use ONLY the semantic types passed | ||
in the "semantic_types" parameter. | ||
:param expand_class_hierarchy: used only in conjunction with | ||
"class_hierarchy_max_level" parameter; determines whether or not to | ||
include ancestors of the given class when performing an annotation. | ||
:param class_hierarchy_max_level: the depth of the hierarchy to use when | ||
performing an annotation. | ||
:param expand_mappings: true means that the following manual mappings will | ||
be used in annotation: UMLS, REST, CUI, OBOXREF. | ||
:param stop_words: a comma-separated list of words to ignore in the text. | ||
:param minimum_match_length: the minimum number of characters in a term | ||
that must be matched in the text. | ||
:param exclude_numbers: true means to exclude numbers from annotation. | ||
:param whole_word_only: true means to match whole words only. | ||
:param exclude_synonyms: true means to exclude synonyms from annotation. | ||
:param longest_only: true means that only the longest match for a given | ||
phrase will be returned. | ||
:returns: A list of dictionaries, each with the annotation keys `label` | ||
and `uri`, corresponding to the preferred label and URI of the | ||
annotated concept. None if the request fails. | ||
:notes: This function is a wrapper for the BioPortal API. The BioPortal API | ||
is a repository of biomedical ontologies with a RESTful API that allows | ||
users to annotate text with ontology concepts. The API is documented at | ||
https://data.bioontology.org/documentation#nav_annotator. | ||
This function requires an API key from BioPortal. To obtain an API key, | ||
users must register at https://bioportal.bioontology.org/account. The | ||
key can be loaded as an environment variable from the configuration | ||
file (see `utilities.load_configuration`). | ||
""" | ||
# Construct the query | ||
url = "https://data.bioontology.org/annotator" | ||
payload = { | ||
"text": text, | ||
"apikey": api_key, | ||
"ontologies": ontologies, | ||
"semantic_types": semantic_types, | ||
"expand_semantic_types_hierarchy": expand_semantic_types_hierarchy, | ||
"expand_class_hierarchy": expand_class_hierarchy, | ||
"class_hierarchy_max_level": class_hierarchy_max_level, | ||
"expand_mappings": expand_mappings, | ||
"stop_words": stop_words, | ||
"minimum_match_length": minimum_match_length, | ||
"exclude_numbers": exclude_numbers, | ||
"whole_word_only": whole_word_only, | ||
"exclude_synonyms": exclude_synonyms, | ||
"longest_only": longest_only, | ||
"page_size": 100, # to circumvent pagination | ||
"format": "json", # being explicit here, even though it's the default | ||
} | ||
# Get annotations | ||
try: | ||
r = get(url, params=payload, timeout=10) | ||
r.raise_for_status() | ||
except exceptions.RequestException as e: | ||
print(f"Error calling https://data.bioontology.org/annotator: {e}") | ||
return None | ||
|
||
# Parse the results | ||
annotations = [] | ||
for item in r.json(): | ||
self_link = item.get("annotatedClass", {}).get("links").get("self", None) | ||
try: | ||
r = get(self_link, params={"apikey": api_key}, timeout=10) | ||
r.raise_for_status() | ||
except exceptions.RequestException as e: | ||
print(f"Error calling {self_link}: {e}") | ||
return None | ||
uri = r.json().get("@id", None) | ||
label = r.json().get("prefLabel", None) | ||
annotations.append({"label": label, "uri": uri}) | ||
return annotations |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
"""Test annotator code""" | ||
|
||
import os | ||
import pytest | ||
from spinneret.annotator import get_bioportal_annotation | ||
from spinneret.utilities import load_configuration | ||
|
||
|
||
def test_get_bioportal_annotation(): | ||
"""Test get_bioportal_annotation""" | ||
text = """ | ||
This dataset contains cover of kelp forest sessile | ||
invertebrates, understory macroalgae, and substrate types by | ||
integrating data from four contributing projects working in the kelp | ||
forests of the Santa Barbara Channel, USA. Divers collect data on | ||
using either uniform point contact (UPC) or random point contact (RPC) | ||
methods. | ||
""" | ||
if not os.path.exists("config.json"): | ||
pytest.skip("Skipping test due to missing config.json file in package root.") | ||
load_configuration("config.json") | ||
res = get_bioportal_annotation( | ||
text, | ||
api_key=os.environ["BIOPORTAL_API_KEY"], | ||
ontologies="ENVO", | ||
exclude_synonyms=True, | ||
) | ||
assert res is not None | ||
assert len(res) > 0 | ||
assert isinstance(res, list) | ||
for item in res: | ||
assert isinstance(item, dict) | ||
assert "label" in item | ||
assert "uri" in item | ||
assert isinstance(item["label"], str) | ||
assert isinstance(item["uri"], str) | ||
assert item["label"] != "" | ||
assert item["uri"] != "" |