Skip to content

Commit

Permalink
feat: annotate local environmental context with OntoGPT
Browse files Browse the repository at this point in the history
Add a function to annotate the `local scale environmental context` using
the OntoGPT package to be more precise and accurate than currently
possible using the BioPortal annotator.
  • Loading branch information
clnsmth authored Nov 5, 2024
1 parent 1c825bb commit 2335413
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 0 deletions.
89 changes: 89 additions & 0 deletions src/spinneret/annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,3 +793,92 @@ def add_env_broad_scale_annotations_to_workbook(
if output_path:
write_workbook(wb, output_path)
return wb


def add_env_local_scale_annotations_to_workbook(
    workbook: Union[str, pd.core.frame.DataFrame],
    eml: Union[str, etree._ElementTree],
    output_path: Union[str, None] = None,
    overwrite: bool = False,
    local_model: Union[str, None] = None,
    return_ungrounded: bool = False,
) -> pd.core.frame.DataFrame:
    """
    Add local scale environmental context annotations to a workbook.

    One workbook row is appended for each annotation returned by
    `get_ontogpt_annotation` for the description of the EML dataset element.

    :param workbook: Either the path to the workbook to be annotated, or the
        workbook itself as a pandas DataFrame.
    :param eml: Either the path to the EML file corresponding to the workbook,
        or the EML file itself as an lxml etree.
    :param output_path: The path to write the annotated workbook. If not
        given, the workbook is not written.
    :param overwrite: If True, overwrite existing local scale environmental
        context annotations in the workbook. This enables updating the
        annotations in the workbook with the latest local scale environmental
        context annotations.
    :param local_model: See `get_ontogpt_annotation` documentation for details.
    :param return_ungrounded: See `get_ontogpt_annotation` documentation for
        details.
    :returns: Workbook with local scale environmental context annotations.
    :notes: This function retrieves local scale environmental context
        annotations using OntoGPT, which requires setup and configuration
        described in the `get_ontogpt_annotation` function.
    """

    # Load the workbook and EML for processing
    wb = load_workbook(workbook)
    eml = load_eml(eml)

    # Set the author identifier for consistent reference below. It is used
    # both to stamp new rows and to select rows to delete on overwrite.
    # NOTE(review): the identifier spells "get_onto_gpt_annotation" while the
    # function called below is `get_ontogpt_annotation`; left unchanged so
    # that previously written rows still match the overwrite criteria.
    author = "spinneret.annotator.get_onto_gpt_annotation"
    predicate = "env_local_scale"

    # Remove existing local scale environmental context annotations if
    # overwrite is True, using a set of criteria that accurately define the
    # annotations to remove.
    if overwrite:
        wb = delete_annotations(
            workbook=wb,
            criteria={
                "element": "dataset",
                "element_xpath": "/eml:eml/dataset",
                "predicate": predicate,
                "author": author,
            },
        )

    # Get the local scale environmental context annotations
    dataset_element = eml.xpath("//dataset")[0]
    element_description = get_description(dataset_element)
    annotations = get_ontogpt_annotation(
        text=element_description,
        template="env_local_scale",
        local_model=local_model,
        return_ungrounded=return_ungrounded,
    )

    # Add local scale environmental context annotations to the workbook
    if annotations:
        # These values describe the dataset element, not the individual
        # annotation, so compute them once rather than once per row.
        package_id = get_package_id(eml)
        package_url = get_package_url(eml)
        element_id = dataset_element.attrib.get("id", pd.NA)
        element_xpath = eml.getpath(dataset_element)
        subject_and_context = get_subject_and_context(dataset_element)

        rows = []
        for annotation in annotations:
            row = initialize_workbook_row()
            row["package_id"] = package_id
            row["url"] = package_url
            row["element"] = dataset_element.tag
            row["element_id"] = element_id
            row["element_xpath"] = element_xpath
            row["context"] = subject_and_context["context"]
            row["description"] = element_description
            row["subject"] = subject_and_context["subject"]
            row["predicate"] = predicate
            row["predicate_id"] = (
                "https://genomicsstandardsconsortium.github.io/mixs/0000013/"
            )
            row["object"] = annotation["label"]
            row["object_id"] = annotation["uri"]
            row["author"] = author
            row["date"] = pd.Timestamp.now()
            rows.append(row)

        # Concatenate once; concatenating inside the loop would copy the
        # whole workbook for every annotation.
        new_rows = pd.DataFrame(rows, dtype=str)
        wb = pd.concat([wb, new_rows], ignore_index=True)

    if output_path:
        write_workbook(wb, output_path)
    return wb
51 changes: 51 additions & 0 deletions tests/test_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
add_measurement_type_annotations_to_workbook,
add_process_annotations_to_workbook,
add_env_broad_scale_annotations_to_workbook,
add_env_local_scale_annotations_to_workbook,
)
from spinneret.utilities import load_configuration, load_eml, load_workbook
from spinneret.datasets import get_example_eml_dir
Expand Down Expand Up @@ -664,3 +665,53 @@ def test_add_env_broad_scale_annotations_to_workbook(tmp_path, use_mock, mocker)
# Original annotations are gone
assert not wb["object"].str.contains("a label").any()
assert not wb["object_id"].str.contains("a uri").any()


@pytest.mark.parametrize("use_mock", [True])  # False tests with real local LLM queries
def test_add_env_local_scale_annotations_to_workbook(tmp_path, use_mock, mocker):
    """Test add_env_local_scale_annotations_to_workbook

    Annotates an un-annotated workbook, then annotates the result again with
    `overwrite=True` and checks that the first annotations are replaced.
    """

    # Parameterize the test. The output file is joined onto tmp_path so that
    # it is created inside the per-test temporary directory (plain string
    # concatenation would omit the separator and write beside it).
    workbook_path = "tests/edi.3.9_annotation_workbook.tsv"
    output_path = str(tmp_path / "edi.3.9_annotation_workbook.tsv")
    eml_path = get_example_eml_dir() + "/" + "edi.3.9.xml"

    # The workbook shouldn't have any annotations yet
    wb = load_workbook(workbook_path)
    assert not has_annotations(wb)

    # The workbook has annotations after calling the function
    if use_mock:
        mocker.patch(  # a response returned in real requests
            "spinneret.annotator.get_ontogpt_annotation",
            return_value=[{"label": "a label", "uri": "a uri"}],
        )
    wb = add_env_local_scale_annotations_to_workbook(
        workbook=workbook_path,
        eml=eml_path,
        output_path=output_path,
        local_model="llama3.2",
        return_ungrounded=True,  # ensures we get at least one annotation back
    )
    assert has_annotations(wb)

    # Overwriting changes the annotations. Note, we can't test this with real
    # requests because we'll expect the same results as the first call.
    if use_mock:
        mocker.patch(  # an arbitrary response to check for
            "spinneret.annotator.get_ontogpt_annotation",
            return_value=[{"label": "a different label", "uri": "a different uri"}],
        )
    wb = add_env_local_scale_annotations_to_workbook(
        workbook=output_path,  # the output from the first call
        eml=eml_path,
        output_path=output_path,
        local_model="llama3.2",
        return_ungrounded=True,  # ensures we get at least one annotation back
        overwrite=True,
    )

    # The label/uri values below exist only in the mocked responses, so these
    # checks are meaningful only when mocking is enabled.
    if use_mock:
        assert wb["object"].str.contains("a different label").any()
        assert wb["object_id"].str.contains("a different uri").any()

        # Original annotations are gone
        assert not wb["object"].str.contains("a label").any()
        assert not wb["object_id"].str.contains("a uri").any()

0 comments on commit 2335413

Please sign in to comment.