Skip to content

Commit

Permalink
feat: annotate research topic with OntoGPT
Browse files Browse the repository at this point in the history
Add a function to annotate the `research topic` using the OntoGPT
package to be more precise and accurate than currently possible using
the BioPortal annotator.
  • Loading branch information
clnsmth authored Nov 5, 2024
1 parent 48b3246 commit 42dbf77
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 0 deletions.
87 changes: 87 additions & 0 deletions src/spinneret/annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
expand_curie,
)

# pylint: disable=too-many-lines


# pylint: disable=too-many-locals
def get_bioportal_annotation(
Expand Down Expand Up @@ -979,3 +981,88 @@ def add_env_medium_annotations_to_workbook(
if output_path:
write_workbook(wb, output_path)
return wb


def add_research_topic_annotations_to_workbook(
    workbook: Union[str, pd.core.frame.DataFrame],
    eml: Union[str, etree._ElementTree],
    output_path: Union[str, None] = None,
    overwrite: bool = False,
    local_model: Union[str, None] = None,
    return_ungrounded: bool = False,
) -> pd.core.frame.DataFrame:
    """Add research topic annotations for the dataset element to a workbook.

    The description of the first `dataset` element found in the EML is sent
    to `get_ontogpt_annotation` with the `research_topic` template, and one
    workbook row is appended for each annotation returned.

    :param workbook: Either the path to the workbook to be annotated, or the
        workbook itself as a pandas DataFrame.
    :param eml: Either the path to the EML file corresponding to the workbook,
        or the EML file itself as an lxml etree.
    :param output_path: The path to write the annotated workbook. Nothing is
        written when this is None (or otherwise falsy).
    :param overwrite: If True, overwrite existing research topic annotations in
        the workbook. This enables updating the annotations in the workbook
        with the latest research topic annotations.
    :param local_model: See `get_ontogpt_annotation` documentation for details.
    :param return_ungrounded: See `get_ontogpt_annotation` documentation for
        details.
    :returns: Workbook with research topic annotations.
    :notes: This function retrieves research topic annotations using OntoGPT,
        which requires setup and configuration described in the
        `get_ontogpt_annotation` function.
    """

    # Load the workbook and EML for processing
    wb = load_workbook(workbook)
    eml = load_eml(eml)

    # Set the author identifier for consistent reference below. The same
    # value is used to select rows for deletion and to stamp new rows, so the
    # two must stay in sync.
    author = "spinneret.annotator.get_onto_gpt_annotation"

    # Remove existing research topic annotations if overwrite is True, using a
    # set of criteria that accurately define the annotations to remove.
    if overwrite:
        wb = delete_annotations(
            workbook=wb,
            criteria={
                "element": "dataset",
                "element_xpath": "/eml:eml/dataset",
                "predicate": "research topic",
                "author": author,
            },
        )

    # Get the research topic annotations
    dataset_element = eml.xpath("//dataset")[0]
    element_description = get_description(dataset_element)
    annotations = get_ontogpt_annotation(
        text=element_description,
        template="research_topic",
        local_model=local_model,
        return_ungrounded=return_ungrounded,
    )

    # Add research topic annotations to the workbook. A None or empty result
    # means there is nothing to add.
    if annotations:
        # Everything except the annotation label/URI is identical for every
        # row, so resolve it once instead of once per annotation.
        package_id = get_package_id(eml)
        package_url = get_package_url(eml)
        element_id = dataset_element.attrib.get("id", pd.NA)
        element_xpath = eml.getpath(dataset_element)
        subject_and_context = get_subject_and_context(dataset_element)

        new_rows = []
        for annotation in annotations:
            row = initialize_workbook_row()
            row["package_id"] = package_id
            row["url"] = package_url
            row["element"] = dataset_element.tag
            row["element_id"] = element_id
            row["element_xpath"] = element_xpath
            row["context"] = subject_and_context["context"]
            row["description"] = element_description
            row["subject"] = subject_and_context["subject"]
            row["predicate"] = "research topic"
            row["predicate_id"] = "http://vocabs.lter-europe.net/EnvThes/21604"
            row["object"] = annotation["label"]
            row["object_id"] = annotation["uri"]
            row["author"] = author
            row["date"] = pd.Timestamp.now()
            new_rows.append(pd.DataFrame([row], dtype=str))

        # Concatenate once; growing the frame inside the loop recopies the
        # whole workbook for every annotation.
        wb = pd.concat([wb, *new_rows], ignore_index=True)

    if output_path:
        write_workbook(wb, output_path)
    return wb
51 changes: 51 additions & 0 deletions tests/test_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
add_env_broad_scale_annotations_to_workbook,
add_env_local_scale_annotations_to_workbook,
add_env_medium_annotations_to_workbook,
add_research_topic_annotations_to_workbook,
)
from spinneret.utilities import load_configuration, load_eml, load_workbook
from spinneret.datasets import get_example_eml_dir
Expand Down Expand Up @@ -766,3 +767,53 @@ def test_add_env_medium_annotations_to_workbook(tmp_path, use_mock, mocker):
# Original annotations are gone
assert not wb["object"].str.contains("a label").any()
assert not wb["object_id"].str.contains("a uri").any()


@pytest.mark.parametrize("use_mock", [True])  # False tests with real local LLM queries
def test_add_research_topic_annotations_to_workbook(tmp_path, use_mock, mocker):
    """Test add_research_topic_annotations_to_workbook

    Checks that annotations are added to a workbook that has none, and that
    calling again with `overwrite=True` replaces the earlier annotations.
    """

    # Parameterize the test
    workbook_path = "tests/edi.3.9_annotation_workbook.tsv"
    eml_path = get_example_eml_dir() + "/" + "edi.3.9.xml"
    # Join with the path operator so the file is created inside tmp_path.
    # Plain string concatenation drops the separator and writes a sibling
    # file outside the per-test temporary directory.
    output_path = str(tmp_path / "edi.3.9_annotation_workbook.tsv")

    # The workbook shouldn't have any annotations yet
    wb = load_workbook(workbook_path)
    assert not has_annotations(wb)

    # The workbook has annotations after calling the function
    if use_mock:
        mocker.patch(
            "spinneret.annotator.get_ontogpt_annotation",
            return_value=[{"label": "a label", "uri": "a uri"}],
        )
    wb = add_research_topic_annotations_to_workbook(
        workbook=workbook_path,
        eml=eml_path,
        output_path=output_path,
        local_model="llama3.2",
        return_ungrounded=True,  # ensures we get at least one annotation back
    )
    assert has_annotations(wb)

    # Overwriting changes the annotations. Note, we can't test this with real
    # requests because we'll expect the same results as the first call.
    if use_mock:
        mocker.patch(  # an arbitrary response to check for
            "spinneret.annotator.get_ontogpt_annotation",
            return_value=[{"label": "a different label", "uri": "a different uri"}],
        )
        wb = add_research_topic_annotations_to_workbook(
            workbook=output_path,  # the output from the first call
            eml=eml_path,
            output_path=output_path,
            local_model="llama3.2",
            return_ungrounded=True,  # ensures we get at least one annotation back
            overwrite=True,
        )
        assert wb["object"].str.contains("a different label").any()
        assert wb["object_id"].str.contains("a different uri").any()

        # Original annotations are gone
        assert not wb["object"].str.contains("a label").any()
        assert not wb["object_id"].str.contains("a uri").any()

0 comments on commit 42dbf77

Please sign in to comment.