From 42dbf7727a242aa0c554b947e067bec71da8d180 Mon Sep 17 00:00:00 2001
From: Colin Smith
Date: Tue, 5 Nov 2024 14:55:58 -0800
Subject: [PATCH] feat: annotate `research topic` with OntoGPT

Add a function to annotate the `research topic` using the OntoGPT package
to be more precise and accurate than currently possible using the
BioPortal annotator.
---
 src/spinneret/annotator.py | 100 +++++++++++++++++++++++++++++++++++++
 tests/test_annotator.py    |  53 ++++++++++++++++++++
 2 files changed, 153 insertions(+)

diff --git a/src/spinneret/annotator.py b/src/spinneret/annotator.py
index f6adbbd..e3bfac9 100644
--- a/src/spinneret/annotator.py
+++ b/src/spinneret/annotator.py
@@ -24,6 +24,8 @@
     expand_curie,
 )
 
+# pylint: disable=too-many-lines
+
 
 # pylint: disable=too-many-locals
 def get_bioportal_annotation(
@@ -979,3 +981,101 @@ def add_env_medium_annotations_to_workbook(
     if output_path:
         write_workbook(wb, output_path)
     return wb
+
+
+def add_research_topic_annotations_to_workbook(
+    workbook: Union[str, pd.core.frame.DataFrame],
+    eml: Union[str, etree._ElementTree],
+    output_path: Union[str, None] = None,
+    overwrite: bool = False,
+    local_model: Union[str, None] = None,
+    return_ungrounded: bool = False,
+) -> pd.core.frame.DataFrame:
+    """
+    Add `research topic` annotations for the dataset element to a workbook.
+
+    :param workbook: Either the path to the workbook to be annotated, or the
+        workbook itself as a pandas DataFrame.
+    :param eml: Either the path to the EML file corresponding to the workbook,
+        or the EML file itself as an lxml etree.
+    :param output_path: The path to write the annotated workbook. Nothing is
+        written when this is None or empty.
+    :param overwrite: If True, overwrite existing research topic annotations in the
+        workbook. This enables updating the annotations in the workbook with
+        the latest research topic annotations.
+    :param local_model: See `get_ontogpt_annotation` documentation for details.
+    :param return_ungrounded: See `get_ontogpt_annotation` documentation for
+        details.
+    :returns: Workbook with research topic annotations.
+    :notes: This function retrieves research topic annotations using OntoGPT, which
+        requires setup and configuration described in the
+        `get_ontogpt_annotation` function.
+    """
+
+    # Load the workbook and EML for processing
+    wb = load_workbook(workbook)
+    eml = load_eml(eml)
+
+    # Set the author identifier for consistent reference below.
+    # NOTE(review): this is spelled `get_onto_gpt_annotation`, while the function
+    # called below is `get_ontogpt_annotation`. It is left unchanged because the
+    # same string selects the rows deleted when overwriting; confirm before renaming.
+    author = "spinneret.annotator.get_onto_gpt_annotation"
+
+    # Remove existing research topic annotations if overwrite is True, using a set of
+    # criteria that accurately define the annotations to remove.
+    if overwrite:
+        wb = delete_annotations(
+            workbook=wb,
+            criteria={
+                "element": "dataset",
+                "element_xpath": "/eml:eml/dataset",
+                "predicate": "research topic",
+                "author": author,
+            },
+        )
+
+    # Get the research topic annotations
+    dataset_element = eml.xpath("//dataset")[0]
+    element_description = get_description(dataset_element)
+    annotations = get_ontogpt_annotation(
+        text=element_description,
+        template="research_topic",
+        local_model=local_model,
+        return_ungrounded=return_ungrounded,
+    )
+
+    # Add research topic annotations to the workbook (skipped for None or empty)
+    if annotations:
+        # These values are identical for every annotation, so look them up once
+        package_id = get_package_id(eml)
+        package_url = get_package_url(eml)
+        element_id = dataset_element.attrib.get("id", pd.NA)
+        element_xpath = eml.getpath(dataset_element)
+        subject_and_context = get_subject_and_context(dataset_element)
+
+        new_rows = []
+        for annotation in annotations:
+            row = initialize_workbook_row()
+            row["package_id"] = package_id
+            row["url"] = package_url
+            row["element"] = dataset_element.tag
+            row["element_id"] = element_id
+            row["element_xpath"] = element_xpath
+            row["context"] = subject_and_context["context"]
+            row["description"] = element_description
+            row["subject"] = subject_and_context["subject"]
+            row["predicate"] = "research topic"
+            row["predicate_id"] = "http://vocabs.lter-europe.net/EnvThes/21604"
+            row["object"] = annotation["label"]
+            row["object_id"] = annotation["uri"]
+            row["author"] = author
+            row["date"] = pd.Timestamp.now()
+            new_rows.append(row)
+
+        # Append every new row with one concat instead of one concat per row
+        wb = pd.concat([wb, pd.DataFrame(new_rows, dtype=str)], ignore_index=True)
+
+    if output_path:
+        write_workbook(wb, output_path)
+    return wb
diff --git a/tests/test_annotator.py b/tests/test_annotator.py
index c632f9e..5f23313 100644
--- a/tests/test_annotator.py
+++ b/tests/test_annotator.py
@@ -18,6 +18,7 @@
     add_env_broad_scale_annotations_to_workbook,
     add_env_local_scale_annotations_to_workbook,
     add_env_medium_annotations_to_workbook,
+    add_research_topic_annotations_to_workbook,
 )
 from spinneret.utilities import load_configuration, load_eml, load_workbook
 from spinneret.datasets import get_example_eml_dir
@@ -766,3 +767,55 @@ def test_add_env_medium_annotations_to_workbook(tmp_path, use_mock, mocker):
         # Original annotations are gone
         assert not wb["object"].str.contains("a label").any()
         assert not wb["object_id"].str.contains("a uri").any()
+
+
+@pytest.mark.parametrize("use_mock", [True])  # False tests with real local LLM queries
+def test_add_research_topic_annotations_to_workbook(tmp_path, use_mock, mocker):
+    """Test add_research_topic_annotations_to_workbook"""
+
+    # Parameterize the test
+    workbook_path = "tests/edi.3.9_annotation_workbook.tsv"
+    eml_path = get_example_eml_dir() + "/" + "edi.3.9.xml"
+    # Join with `/` so the output file lands inside tmp_path, not beside it
+    output_path = str(tmp_path / "edi.3.9_annotation_workbook.tsv")
+
+    # The workbook shouldn't have any annotations yet
+    wb = load_workbook(workbook_path)
+    assert not has_annotations(wb)
+
+    # The workbook has annotations after calling the function
+    if use_mock:
+        mocker.patch(
+            "spinneret.annotator.get_ontogpt_annotation",
+            return_value=[{"label": "a label", "uri": "a uri"}],
+        )
+    wb = add_research_topic_annotations_to_workbook(
+        workbook=workbook_path,
+        eml=eml_path,
+        output_path=output_path,
+        local_model="llama3.2",
+        return_ungrounded=True,  # ensures we get at least one annotation back
+    )
+    assert has_annotations(wb)
+
+    # Overwriting changes the annotations. Note, we can't test this with real
+    # requests because we'll expect the same results as the first call.
+    if use_mock:
+        mocker.patch(  # an arbitrary response to check for
+            "spinneret.annotator.get_ontogpt_annotation",
+            return_value=[{"label": "a different label", "uri": "a different uri"}],
+        )
+        wb = add_research_topic_annotations_to_workbook(
+            workbook=output_path,  # the output from the first call
+            eml=eml_path,
+            output_path=output_path,
+            local_model="llama3.2",
+            return_ungrounded=True,  # ensures we get at least one annotation back
+            overwrite=True,
+        )
+        assert wb["object"].str.contains("a different label").any()
+        assert wb["object_id"].str.contains("a different uri").any()
+
+        # Original annotations are gone
+        assert not wb["object"].str.contains("a label").any()
+        assert not wb["object_id"].str.contains("a uri").any()