From 233541304edb3ec5b98609367bc64ade468022ac Mon Sep 17 00:00:00 2001
From: Colin Smith <colin.smith@wisc.edu>
Date: Tue, 5 Nov 2024 11:37:01 -0800
Subject: [PATCH] feat: annotate `local environmental context` with OntoGPT

Add a function that annotates the `local scale environmental context`
using the OntoGPT package, which gives more precise and accurate results
than are currently possible with the BioPortal annotator.
---
 src/spinneret/annotator.py | 89 ++++++++++++++++++++++++++++++++++++++
 tests/test_annotator.py    | 51 ++++++++++++++++++++++
 2 files changed, 140 insertions(+)

diff --git a/src/spinneret/annotator.py b/src/spinneret/annotator.py
index f3cf2f6..e557262 100644
--- a/src/spinneret/annotator.py
+++ b/src/spinneret/annotator.py
@@ -793,3 +793,92 @@ def add_env_broad_scale_annotations_to_workbook(
     if output_path:
         write_workbook(wb, output_path)
     return wb
+
+
+def add_env_local_scale_annotations_to_workbook(
+    workbook: Union[str, pd.core.frame.DataFrame],
+    eml: Union[str, etree._ElementTree],
+    output_path: str = None,
+    overwrite: bool = False,
+    local_model: str = None,
+    return_ungrounded: bool = False,
+) -> pd.core.frame.DataFrame:
+    """Add OntoGPT local scale environmental context annotations to a workbook.
+
+    :param workbook: Either the path to the workbook to be annotated, or the
+        workbook itself as a pandas DataFrame.
+    :param eml: Either the path to the EML file corresponding to the workbook,
+        or the EML file itself as an lxml etree.
+    :param output_path: The path to write the annotated workbook. If not
+        given, the workbook is not written to file.
+    :param overwrite: If True, replace existing local scale environmental
+        context annotations in the workbook with the latest ones.
+    :param local_model: See `get_ontogpt_annotation` documentation for details.
+    :param return_ungrounded: See `get_ontogpt_annotation` documentation.
+    :returns: Workbook with local scale environmental context annotations.
+    :notes: This function retrieves local scale environmental context
+        annotations using OntoGPT, which requires setup and configuration
+        described in the `get_ontogpt_annotation` function.
+    """
+
+    # Load the workbook and EML for processing
+    wb = load_workbook(workbook)
+    eml = load_eml(eml)
+
+    # Set the author identifier for consistent reference below
+    author = "spinneret.annotator.get_onto_gpt_annotation"
+
+    # When overwriting, first delete the dataset-level annotations that match
+    # this function's predicate and author.
+    if overwrite:
+        wb = delete_annotations(
+            workbook=wb,
+            criteria={
+                "element": "dataset",
+                "element_xpath": "/eml:eml/dataset",
+                "predicate": "env_local_scale",
+                "author": author,
+            },
+        )
+
+    # Get the local scale environmental context annotations
+    dataset_element = eml.xpath("//dataset")[0]
+    element_description = get_description(dataset_element)
+    annotations = get_ontogpt_annotation(
+        text=element_description,
+        template="env_local_scale",
+        local_model=local_model,
+        return_ungrounded=return_ungrounded,
+    )
+
+    # Dataset-level fields are the same for every row, so resolve them once.
+    if annotations:
+        package_id = get_package_id(eml)
+        package_url = get_package_url(eml)
+        element_id = dataset_element.attrib.get("id", pd.NA)
+        element_xpath = eml.getpath(dataset_element)
+        subject_and_context = get_subject_and_context(dataset_element)
+        for annotation in annotations:
+            row = initialize_workbook_row()
+            row["package_id"] = package_id
+            row["url"] = package_url
+            row["element"] = dataset_element.tag
+            row["element_id"] = element_id
+            row["element_xpath"] = element_xpath
+            row["context"] = subject_and_context["context"]
+            row["description"] = element_description
+            row["subject"] = subject_and_context["subject"]
+            row["predicate"] = "env_local_scale"
+            row["predicate_id"] = (
+                "https://genomicsstandardsconsortium.github.io/mixs/0000013/"
+            )
+            row["object"] = annotation["label"]
+            row["object_id"] = annotation["uri"]
+            row["author"] = author
+            row["date"] = pd.Timestamp.now()
+            row = pd.DataFrame([row], dtype=str)
+            wb = pd.concat([wb, row], ignore_index=True)
+
+    if output_path:
+        write_workbook(wb, output_path)
+    return wb
diff --git a/tests/test_annotator.py b/tests/test_annotator.py
index bdc26c0..cbbe999 100644
--- a/tests/test_annotator.py
+++ b/tests/test_annotator.py
@@ -16,6 +16,7 @@
     add_measurement_type_annotations_to_workbook,
     add_process_annotations_to_workbook,
     add_env_broad_scale_annotations_to_workbook,
+    add_env_local_scale_annotations_to_workbook,
 )
 from spinneret.utilities import load_configuration, load_eml, load_workbook
 from spinneret.datasets import get_example_eml_dir
@@ -664,3 +665,53 @@ def test_add_env_broad_scale_annotations_to_workbook(tmp_path, use_mock, mocker)
     # Original annotations are gone
     assert not wb["object"].str.contains("a label").any()
     assert not wb["object_id"].str.contains("a uri").any()
+
+
+@pytest.mark.parametrize("use_mock", [True])  # False tests with real local LLM queries
+def test_add_env_local_scale_annotations_to_workbook(tmp_path, use_mock, mocker):
+    """Test that annotations are added, then replaced when overwrite=True"""
+
+    # Read the fixture workbook; write results inside pytest's temporary directory
+    workbook_path = "tests/edi.3.9_annotation_workbook.tsv"
+    output_path = str(tmp_path / "edi.3.9_annotation_workbook.tsv")
+
+    # The workbook shouldn't have any annotations yet
+    wb = load_workbook(workbook_path)
+    assert not has_annotations(wb)
+
+    # The workbook has annotations after calling the function
+    if use_mock:
+        mocker.patch(  # stand in for a real OntoGPT response
+            "spinneret.annotator.get_ontogpt_annotation",
+            return_value=[{"label": "a label", "uri": "a uri"}],
+        )
+    wb = add_env_local_scale_annotations_to_workbook(
+        workbook=workbook_path,
+        eml=get_example_eml_dir() + "/" + "edi.3.9.xml",
+        output_path=output_path,
+        local_model="llama3.2",
+        return_ungrounded=True,  # ensures we get at least one annotation back
+    )
+    assert has_annotations(wb)
+
+    # Overwriting changes the annotations. Note, we can't test this with real
+    # requests because they are expected to repeat the first call's results.
+    if use_mock:
+        mocker.patch(  # an arbitrary response to check for
+            "spinneret.annotator.get_ontogpt_annotation",
+            return_value=[{"label": "a different label", "uri": "a different uri"}],
+        )
+    wb = add_env_local_scale_annotations_to_workbook(
+        workbook=output_path,  # the output from the first call
+        eml=get_example_eml_dir() + "/" + "edi.3.9.xml",
+        output_path=output_path,
+        local_model="llama3.2",
+        return_ungrounded=True,  # ensures we get at least one annotation back
+        overwrite=True,
+    )
+    assert wb["object"].str.contains("a different label").any()
+    assert wb["object_id"].str.contains("a different uri").any()
+
+    # Original annotations are gone
+    assert not wb["object"].str.contains("a label").any()
+    assert not wb["object_id"].str.contains("a uri").any()