From 42dbf7727a242aa0c554b947e067bec71da8d180 Mon Sep 17 00:00:00 2001
From: Colin Smith <colin.smith@wisc.edu>
Date: Tue, 5 Nov 2024 14:55:58 -0800
Subject: [PATCH] feat: annotate `research topic` with OntoGPT

Add a function that annotates the dataset `research topic` using the
OntoGPT package, which is more precise and accurate than what is
currently possible with the BioPortal annotator.
---
 src/spinneret/annotator.py | 87 ++++++++++++++++++++++++++++++++++++++
 tests/test_annotator.py    | 51 ++++++++++++++++++++++
 2 files changed, 138 insertions(+)

diff --git a/src/spinneret/annotator.py b/src/spinneret/annotator.py
index f6adbbd..e3bfac9 100644
--- a/src/spinneret/annotator.py
+++ b/src/spinneret/annotator.py
@@ -24,6 +24,8 @@
     expand_curie,
 )
 
+# pylint: disable=too-many-lines
+
 
 # pylint: disable=too-many-locals
 def get_bioportal_annotation(
@@ -979,3 +981,88 @@ def add_env_medium_annotations_to_workbook(
     if output_path:
         write_workbook(wb, output_path)
     return wb
+
+
+def add_research_topic_annotations_to_workbook(
+    workbook: Union[str, pd.core.frame.DataFrame],
+    eml: Union[str, etree._ElementTree],
+    output_path: str = None,
+    overwrite: bool = False,
+    local_model: str = None,
+    return_ungrounded: bool = False,
+) -> pd.core.frame.DataFrame:
+    """Annotate the `research topic` of a dataset in a workbook using OntoGPT.
+
+    :param workbook: Either the path to the workbook to be annotated, or the
+        workbook itself as a pandas DataFrame.
+    :param eml: Either the path to the EML file corresponding to the workbook,
+        or the EML file itself as an lxml etree.
+    :param output_path: The path to write the annotated workbook.
+    :param overwrite: If True, overwrite existing research topic annotations in the
+        workbook. This enables updating the annotations in the workbook with
+        the latest research topic annotations.
+    :param local_model: See `get_ontogpt_annotation` documentation for details.
+    :param return_ungrounded: See `get_ontogpt_annotation` documentation for
+        details.
+    :returns: Workbook with research topic annotations.
+    :notes: This function retrieves research topic annotations using OntoGPT, which
+        requires setup and configuration described in the
+        `get_ontogpt_annotation` function.
+    """
+
+    # Load the workbook and EML for processing
+    wb = load_workbook(workbook)
+    eml = load_eml(eml)
+
+    # Author identifier recorded on new rows and used to match rows on overwrite
+    author = "spinneret.annotator.get_onto_gpt_annotation"
+
+    # Remove existing research topic annotations if overwrite is True, using a set of
+    # criteria that accurately define the annotations to remove.
+    if overwrite:
+        wb = delete_annotations(
+            workbook=wb,
+            criteria={
+                "element": "dataset",
+                "element_xpath": "/eml:eml/dataset",
+                "predicate": "research topic",
+                "author": author,
+            },
+        )
+
+    # Get the research topic annotations
+    dataset_element = eml.xpath("//dataset")[0]
+    element_description = get_description(dataset_element)
+    annotations = get_ontogpt_annotation(
+        text=element_description,
+        template="research_topic",
+        local_model=local_model,
+        return_ungrounded=return_ungrounded,
+    )
+
+    # Add research topic annotations to the workbook. The subject and context are
+    # the same for every annotation, so they are resolved once before the loop.
+    if annotations is not None:
+        subject_and_context = get_subject_and_context(dataset_element)
+        for annotation in annotations:
+            row = initialize_workbook_row()
+            row["package_id"] = get_package_id(eml)
+            row["url"] = get_package_url(eml)
+            row["element"] = dataset_element.tag
+            row["element_id"] = dataset_element.attrib.get("id", pd.NA)
+            row["element_xpath"] = eml.getpath(dataset_element)
+            row["context"] = subject_and_context["context"]
+            row["description"] = element_description
+            row["subject"] = subject_and_context["subject"]
+            row["predicate"] = "research topic"
+            row["predicate_id"] = "http://vocabs.lter-europe.net/EnvThes/21604"
+            row["object"] = annotation["label"]
+            row["object_id"] = annotation["uri"]
+            row["author"] = author
+            row["date"] = pd.Timestamp.now()
+            row = pd.DataFrame([row], dtype=str)
+            wb = pd.concat([wb, row], ignore_index=True)
+
+    if output_path:
+        write_workbook(wb, output_path)
+    return wb
diff --git a/tests/test_annotator.py b/tests/test_annotator.py
index c632f9e..5f23313 100644
--- a/tests/test_annotator.py
+++ b/tests/test_annotator.py
@@ -18,6 +18,7 @@
     add_env_broad_scale_annotations_to_workbook,
     add_env_local_scale_annotations_to_workbook,
     add_env_medium_annotations_to_workbook,
+    add_research_topic_annotations_to_workbook,
 )
 from spinneret.utilities import load_configuration, load_eml, load_workbook
 from spinneret.datasets import get_example_eml_dir
@@ -766,3 +767,53 @@ def test_add_env_medium_annotations_to_workbook(tmp_path, use_mock, mocker):
     # Original annotations are gone
     assert not wb["object"].str.contains("a label").any()
     assert not wb["object_id"].str.contains("a uri").any()
+
+
+@pytest.mark.parametrize("use_mock", [True])  # False tests with real local LLM queries
+def test_add_research_topic_annotations_to_workbook(tmp_path, use_mock, mocker):
+    """Test add_research_topic_annotations_to_workbook"""
+
+    # Input fixture, and an output path inside this test's temporary directory
+    workbook_path = "tests/edi.3.9_annotation_workbook.tsv"
+    output_path = str(tmp_path / "edi.3.9_annotation_workbook.tsv")
+
+    # The workbook shouldn't have any annotations yet
+    wb = load_workbook(workbook_path)
+    assert not has_annotations(wb)
+
+    # The workbook has annotations after calling the function
+    if use_mock:
+        mocker.patch(
+            "spinneret.annotator.get_ontogpt_annotation",
+            return_value=[{"label": "a label", "uri": "a uri"}],
+        )
+    wb = add_research_topic_annotations_to_workbook(
+        workbook=workbook_path,
+        eml=get_example_eml_dir() + "/" + "edi.3.9.xml",
+        output_path=output_path,
+        local_model="llama3.2",
+        return_ungrounded=True,  # ensures we get at least one annotation back
+    )
+    assert has_annotations(wb)
+
+    # Overwriting changes the annotations. Note, we can't test this with real
+    # requests because we'll expect the same results as the first call.
+    if use_mock:
+        mocker.patch(  # an arbitrary response to check for
+            "spinneret.annotator.get_ontogpt_annotation",
+            return_value=[{"label": "a different label", "uri": "a different uri"}],
+        )
+    wb = add_research_topic_annotations_to_workbook(
+        workbook=output_path,  # the output from the first call
+        eml=get_example_eml_dir() + "/" + "edi.3.9.xml",
+        output_path=output_path,
+        local_model="llama3.2",
+        return_ungrounded=True,  # ensures we get at least one annotation back
+        overwrite=True,
+    )
+    assert wb["object"].str.contains("a different label").any()
+    assert wb["object_id"].str.contains("a different uri").any()
+
+    # Original annotations are gone
+    assert not wb["object"].str.contains("a label").any()
+    assert not wb["object_id"].str.contains("a uri").any()