From 233541304edb3ec5b98609367bc64ade468022ac Mon Sep 17 00:00:00 2001
From: Colin Smith
Date: Tue, 5 Nov 2024 11:37:01 -0800
Subject: [PATCH] feat: annotate `local environmental context` with OntoGPT

Add a function to annotate the `local scale environmental context` using
the OntoGPT package to be more precise and accurate than currently
possible using the BioPortal annotator.
---
 src/spinneret/annotator.py | 101 +++++++++++++++++++++++++++++++++++++
 tests/test_annotator.py    |  51 +++++++++++++++++++
 2 files changed, 152 insertions(+)

diff --git a/src/spinneret/annotator.py b/src/spinneret/annotator.py
index f3cf2f6..e557262 100644
--- a/src/spinneret/annotator.py
+++ b/src/spinneret/annotator.py
@@ -793,3 +793,104 @@ def add_env_broad_scale_annotations_to_workbook(
     if output_path:
         write_workbook(wb, output_path)
     return wb
+
+
+def add_env_local_scale_annotations_to_workbook(
+    workbook: Union[str, pd.core.frame.DataFrame],
+    eml: Union[str, etree._ElementTree],
+    output_path: Union[str, None] = None,
+    overwrite: bool = False,
+    local_model: Union[str, None] = None,
+    return_ungrounded: bool = False,
+) -> pd.core.frame.DataFrame:
+    """Add local scale environmental context annotations to a workbook
+
+    :param workbook: Either the path to the workbook to be annotated, or the
+        workbook itself as a pandas DataFrame.
+    :param eml: Either the path to the EML file corresponding to the workbook,
+        or the EML file itself as an lxml etree.
+    :param output_path: The path to write the annotated workbook. If not
+        provided, the workbook is returned without being written.
+    :param overwrite: If True, overwrite existing local scale environmental
+        context annotations in the workbook. This enables updating the
+        annotations in the workbook with the latest local scale environmental
+        context annotations.
+    :param local_model: See `get_ontogpt_annotation` documentation for details.
+    :param return_ungrounded: See `get_ontogpt_annotation` documentation for
+        details.
+    :returns: Workbook with local scale environmental context annotations.
+    :notes: This function retrieves local scale environmental context
+        annotations using OntoGPT, which requires setup and configuration
+        described in the `get_ontogpt_annotation` function. Only the first
+        `dataset` element found in the EML is annotated.
+    """
+
+    # Load the workbook and EML for processing
+    wb = load_workbook(workbook)
+    eml = load_eml(eml)
+
+    # Set the author identifier for consistent reference below. It both
+    # selects the annotations to delete and labels the new rows.
+    # NOTE(review): spelled `get_onto_gpt_annotation`, unlike the
+    # `get_ontogpt_annotation` helper called below; confirm that is intended
+    # before changing it, since existing workbooks may carry this value.
+    author = "spinneret.annotator.get_onto_gpt_annotation"
+
+    # Remove existing local scale environmental context annotations if
+    # overwrite is True, using a set of criteria that accurately define the
+    # annotations to remove.
+    if overwrite:
+        wb = delete_annotations(
+            workbook=wb,
+            criteria={
+                "element": "dataset",
+                "element_xpath": "/eml:eml/dataset",
+                "predicate": "env_local_scale",
+                "author": author,
+            },
+        )
+
+    # Get the local scale environmental context annotations
+    dataset_element = eml.xpath("//dataset")[0]
+    element_description = get_description(dataset_element)
+    annotations = get_ontogpt_annotation(
+        text=element_description,
+        template="env_local_scale",
+        local_model=local_model,
+        return_ungrounded=return_ungrounded,
+    )
+
+    # Add local scale environmental context annotations to the workbook
+    if annotations:
+        # These values are the same for every annotation of the dataset
+        # element, so look them up once rather than once per row.
+        package_id = get_package_id(eml)
+        url = get_package_url(eml)
+        element_id = dataset_element.attrib.get("id", pd.NA)
+        element_xpath = eml.getpath(dataset_element)
+        subject_and_context = get_subject_and_context(dataset_element)
+
+        for annotation in annotations:
+            row = initialize_workbook_row()
+            row["package_id"] = package_id
+            row["url"] = url
+            row["element"] = dataset_element.tag
+            row["element_id"] = element_id
+            row["element_xpath"] = element_xpath
+            row["context"] = subject_and_context["context"]
+            row["description"] = element_description
+            row["subject"] = subject_and_context["subject"]
+            row["predicate"] = "env_local_scale"
+            row["predicate_id"] = (
+                "https://genomicsstandardsconsortium.github.io/mixs/0000013/"
+            )
+            row["object"] = annotation["label"]
+            row["object_id"] = annotation["uri"]
+            row["author"] = author
+            row["date"] = pd.Timestamp.now()
+            row = pd.DataFrame([row], dtype=str)
+            wb = pd.concat([wb, row], ignore_index=True)
+
+    if output_path:
+        write_workbook(wb, output_path)
+    return wb
diff --git a/tests/test_annotator.py b/tests/test_annotator.py
index bdc26c0..cbbe999 100644
--- a/tests/test_annotator.py
+++ b/tests/test_annotator.py
@@ -16,6 +16,7 @@
     add_measurement_type_annotations_to_workbook,
     add_process_annotations_to_workbook,
     add_env_broad_scale_annotations_to_workbook,
+    add_env_local_scale_annotations_to_workbook,
 )
 from spinneret.utilities import load_configuration, load_eml, load_workbook
 from spinneret.datasets import get_example_eml_dir
@@ -664,3 +665,53 @@ def test_add_env_broad_scale_annotations_to_workbook(tmp_path, use_mock, mocker)
         # Original annotations are gone
         assert not wb["object"].str.contains("a label").any()
         assert not wb["object_id"].str.contains("a uri").any()
+
+
+@pytest.mark.parametrize("use_mock", [True])  # False tests with real local LLM queries
+def test_add_env_local_scale_annotations_to_workbook(tmp_path, use_mock, mocker):
+    """Test add_env_local_scale_annotations_to_workbook"""
+
+    # Parameterize the test
+    workbook_path = "tests/edi.3.9_annotation_workbook.tsv"
+    output_path = str(tmp_path / "edi.3.9_annotation_workbook.tsv")
+
+    # The workbook shouldn't have any annotations yet
+    wb = load_workbook(workbook_path)
+    assert not has_annotations(wb)
+
+    # The workbook has annotations after calling the function
+    if use_mock:
+        mocker.patch(  # a response returned in real requests
+            "spinneret.annotator.get_ontogpt_annotation",
+            return_value=[{"label": "a label", "uri": "a uri"}],
+        )
+    wb = add_env_local_scale_annotations_to_workbook(
+        workbook=workbook_path,
+        eml=get_example_eml_dir() + "/" + "edi.3.9.xml",
+        output_path=output_path,
+        local_model="llama3.2",
+        return_ungrounded=True,  # ensures we get at least one annotation back
+    )
+    assert has_annotations(wb)
+
+    # Overwriting changes the annotations. Note, we can't test this with real
+    # requests because we'll expect the same results as the first call.
+    if use_mock:
+        mocker.patch(  # an arbitrary response to check for
+            "spinneret.annotator.get_ontogpt_annotation",
+            return_value=[{"label": "a different label", "uri": "a different uri"}],
+        )
+        wb = add_env_local_scale_annotations_to_workbook(
+            workbook=output_path,  # the output from the first call
+            eml=get_example_eml_dir() + "/" + "edi.3.9.xml",
+            output_path=output_path,
+            local_model="llama3.2",
+            return_ungrounded=True,  # ensures we get at least one annotation back
+            overwrite=True,
+        )
+        assert wb["object"].str.contains("a different label").any()
+        assert wb["object_id"].str.contains("a different uri").any()
+
+        # Original annotations are gone
+        assert not wb["object"].str.contains("a label").any()
+        assert not wb["object_id"].str.contains("a uri").any()