Skip to content

Commit

Permalink
feat: integrate BioPortal annotator for term recommendation
Browse files Browse the repository at this point in the history
Implement functionality to access the BioPortal annotator API. This
enables:
- Text input analysis
- Relevant class retrieval
- Recommendations for data authors and curators
  • Loading branch information
clnsmth authored Aug 27, 2024
1 parent 1231973 commit 9822406
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 1 deletion.
6 changes: 6 additions & 0 deletions docs/source/user/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
Developer Interface
===================

Annotator Module
----------------

.. automodule:: spinneret.annotator
:members:

Datasets Module
---------------

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ patch_without_tag = false # patch release by default
template_dir = "docs/source/_templates/" # changelog template directory

[tool.pylint.'MESSAGES.CONTROL']
disable = "too-many-public-methods,c-extension-no-member"
disable = "too-many-arguments,c-extension-no-member"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
105 changes: 105 additions & 0 deletions src/spinneret/annotator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""The annotator module"""

from typing import Union
from requests import get, exceptions


# pylint: disable=too-many-locals
def get_bioportal_annotation(
text: str,
api_key: str,
ontologies: str,
semantic_types: str = "",
expand_semantic_types_hierarchy: bool = False,
expand_class_hierarchy: bool = False,
class_hierarchy_max_level: int = 0,
expand_mappings: bool = False,
stop_words: str = "",
minimum_match_length: int = 3,
exclude_numbers: bool = False,
whole_word_only: bool = True,
exclude_synonyms: bool = False,
longest_only: bool = False,
) -> Union[list, None]:
"""Get an annotation from the BioPortal API
:param text: The text to be annotated.
:param api_key: The BioPortal API key.
:param ontologies: The ontologies to use for annotation.
:param semantic_types: The semantic types to use for annotation.
:param expand_semantic_types_hierarchy: true means to use the semantic
types passed in the "semantic_types" parameter as well as all their
immediate children. false means to use ONLY the semantic types passed
in the "semantic_types" parameter.
:param expand_class_hierarchy: used only in conjunction with
"class_hierarchy_max_level" parameter; determines whether or not to
include ancestors of the given class when performing an annotation.
:param class_hierarchy_max_level: the depth of the hierarchy to use when
performing an annotation.
:param expand_mappings: true means that the following manual mappings will
be used in annotation: UMLS, REST, CUI, OBOXREF.
:param stop_words: a comma-separated list of words to ignore in the text.
:param minimum_match_length: the minimum number of characters in a term
that must be matched in the text.
:param exclude_numbers: true means to exclude numbers from annotation.
:param whole_word_only: true means to match whole words only.
:param exclude_synonyms: true means to exclude synonyms from annotation.
:param longest_only: true means that only the longest match for a given
phrase will be returned.
:returns: A list of dictionaries, each with the annotation keys `label`
and `uri`, corresponding to the preferred label and URI of the
annotated concept. None if the request fails.
:notes: This function is a wrapper for the BioPortal API. The BioPortal API
is a repository of biomedical ontologies with a RESTful API that allows
users to annotate text with ontology concepts. The API is documented at
https://data.bioontology.org/documentation#nav_annotator.
This function requires an API key from BioPortal. To obtain an API key,
users must register at https://bioportal.bioontology.org/account. The
key can be loaded as an environment variable from the configuration
file (see `utilities.load_configuration`).
"""
# Construct the query
url = "https://data.bioontology.org/annotator"
payload = {
"text": text,
"apikey": api_key,
"ontologies": ontologies,
"semantic_types": semantic_types,
"expand_semantic_types_hierarchy": expand_semantic_types_hierarchy,
"expand_class_hierarchy": expand_class_hierarchy,
"class_hierarchy_max_level": class_hierarchy_max_level,
"expand_mappings": expand_mappings,
"stop_words": stop_words,
"minimum_match_length": minimum_match_length,
"exclude_numbers": exclude_numbers,
"whole_word_only": whole_word_only,
"exclude_synonyms": exclude_synonyms,
"longest_only": longest_only,
"page_size": 100, # to circumvent pagination
"format": "json", # being explicit here, even though it's the default
}
# Get annotations
try:
r = get(url, params=payload, timeout=10)
r.raise_for_status()
except exceptions.RequestException as e:
print(f"Error calling https://data.bioontology.org/annotator: {e}")
return None

# Parse the results
annotations = []
for item in r.json():
self_link = item.get("annotatedClass", {}).get("links").get("self", None)
try:
r = get(self_link, params={"apikey": api_key}, timeout=10)
r.raise_for_status()
except exceptions.RequestException as e:
print(f"Error calling {self_link}: {e}")
return None
uri = r.json().get("@id", None)
label = r.json().get("prefLabel", None)
annotations.append({"label": label, "uri": uri})
return annotations
38 changes: 38 additions & 0 deletions tests/test_annotator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Test annotator code"""

import os
import pytest
from spinneret.annotator import get_bioportal_annotation
from spinneret.utilities import load_configuration


def test_get_bioportal_annotation():
"""Test get_bioportal_annotation"""
text = """
This dataset contains cover of kelp forest sessile
invertebrates, understory macroalgae, and substrate types by
integrating data from four contributing projects working in the kelp
forests of the Santa Barbara Channel, USA. Divers collect data on
using either uniform point contact (UPC) or random point contact (RPC)
methods.
"""
if not os.path.exists("config.json"):
pytest.skip("Skipping test due to missing config.json file in package root.")
load_configuration("config.json")
res = get_bioportal_annotation(
text,
api_key=os.environ["BIOPORTAL_API_KEY"],
ontologies="ENVO",
exclude_synonyms=True,
)
assert res is not None
assert len(res) > 0
assert isinstance(res, list)
for item in res:
assert isinstance(item, dict)
assert "label" in item
assert "uri" in item
assert isinstance(item["label"], str)
assert isinstance(item["uri"], str)
assert item["label"] != ""
assert item["uri"] != ""

0 comments on commit 9822406

Please sign in to comment.