Add a template specifically for extracting human phenotypes (#214)

This includes some extra prompt text to encourage the model to return a list compatible with parsing, since one input text can contain numerous entities to extract, and GPT-3+ appears more likely to separate list items with commas than with semicolons as lists grow longer. This suggests that most text should be parsed this way, or at least that an additional validation step could re-parse outputs when they look like "entity1, entity2, entity3" and don't ground.
monarch-initiative · Oct 19, 2023 · 63ba04a · 63ba04a
2 parents 3f1202a + c8d04cf
commit 63ba04a
Show file tree

Hide file tree

Showing 2 changed files with 166 additions and 0 deletions.
diff --git a/src/ontogpt/templates/human_phenotype.py b/src/ontogpt/templates/human_phenotype.py
@@ -0,0 +1,126 @@
+from __future__ import annotations
+from datetime import datetime, date
+from enum import Enum
+from typing import List, Dict, Optional, Any, Union
+from pydantic import BaseModel as BaseModel, Field
+import sys
+if sys.version_info >= (3, 8):
+    from typing import Literal
+else:
+    from typing_extensions import Literal
+
+
+metamodel_version = "None"  # NOTE: the four-character string "None", not the None singleton
+version = "None"  # likewise the string "None"
+
+class ConfiguredBaseModel(BaseModel,
+                validate_assignment = True,
+                validate_default = True,
+                extra = 'forbid',
+                arbitrary_types_allowed = True,
+                use_enum_values = True):
+    pass
+
+
+class NullDataOptions(str, Enum):
+    # String-valued enumeration: each member's value is identical to its name.
+    # No other definition in this module refers to these members.
+    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
+
+    NOT_APPLICABLE = "NOT_APPLICABLE"
+
+    NOT_MENTIONED = "NOT_MENTIONED"
+
+
+
+class ExtractionResult(ConfiguredBaseModel):
+    """
+    A result of extracting knowledge on text
+    """
+    input_id: Optional[str] = Field(None)
+    input_title: Optional[str] = Field(None)
+    input_text: Optional[str] = Field(None)
+    raw_completion_output: Optional[str] = Field(None)
+    prompt: Optional[str] = Field(None)
+    extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""")
+    named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""")
+
+
+class NamedEntity(ConfiguredBaseModel):
+
+    id: Optional[str] = Field(None, description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class HumanPhenotypeSet(NamedEntity):
+
+    phenotypes: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of human phenotypes, including symptoms of disease. It must be semicolon-separated. Labels containing the word 'with' should be split into multiple phenotypes.""")
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class HumanPhenotype(NamedEntity):
+
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class CompoundExpression(ConfiguredBaseModel):
+
+    None
+
+
+class Triple(CompoundExpression):
+    """
+    Abstract parent for Relation Extraction tasks
+    """
+    subject: Optional[str] = Field(None)
+    predicate: Optional[str] = Field(None)
+    object: Optional[str] = Field(None)
+    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""")
+    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""")
+    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""")
+
+
+class TextWithTriples(ConfiguredBaseModel):
+
+    publication: Optional[Publication] = Field(None)
+    triples: Optional[List[Triple]] = Field(default_factory=list)
+
+
+class RelationshipType(NamedEntity):
+
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class Publication(ConfiguredBaseModel):
+
+    id: Optional[str] = Field(None, description="""The publication identifier""")
+    title: Optional[str] = Field(None, description="""The title of the publication""")
+    abstract: Optional[str] = Field(None, description="""The abstract of the publication""")
+    combined_text: Optional[str] = Field(None)
+    full_text: Optional[str] = Field(None, description="""The full text of the publication""")
+
+
+class AnnotatorResult(ConfiguredBaseModel):
+
+    subject_text: Optional[str] = Field(None)
+    object_id: Optional[str] = Field(None)
+    object_text: Optional[str] = Field(None)
+
+
+
+# Model rebuild
+# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
+ExtractionResult.model_rebuild()
+NamedEntity.model_rebuild()
+HumanPhenotypeSet.model_rebuild()
+HumanPhenotype.model_rebuild()
+CompoundExpression.model_rebuild()
+Triple.model_rebuild()
+TextWithTriples.model_rebuild()
+RelationshipType.model_rebuild()
+Publication.model_rebuild()
+AnnotatorResult.model_rebuild()
+
diff --git a/src/ontogpt/templates/human_phenotype.yaml b/src/ontogpt/templates/human_phenotype.yaml
@@ -0,0 +1,40 @@
+id: http://w3id.org/ontogpt/human_phenotype
+name: human_phenotype-template
+title: Human Phenotype Extraction Template
+description: >-
+  A template for extracting human phenotypes to HPO terms
+license: https://creativecommons.org/publicdomain/zero/1.0/
+prefixes:
+  linkml: https://w3id.org/linkml/
+  human_phenotype: http://w3id.org/ontogpt/human_phenotype  # NOTE(review): no trailing "/" or "#", so expanded names append directly to "human_phenotype" - confirm intended
+  HP: http://purl.obolibrary.org/obo/HP_  # the prefix named in HumanPhenotype's id_prefixes
+
+default_prefix: human_phenotype
+default_range: string  # range applied to any slot that does not declare one
+
+imports:
+  - linkml:types
+  - core  # NamedEntity is not defined in this file; presumably it is provided by this import
+
+classes:
+  HumanPhenotypeSet:
+    tree_root: true  # marks this class as the root class of the template
+    is_a: NamedEntity  # parent class; not defined in this file
+    attributes:
+      phenotypes:
+        range: HumanPhenotype  # each value is a HumanPhenotype (class defined below)
+        multivalued: true  # the slot holds a list rather than a single value
+        description: >- 
+          A semicolon-separated list of human phenotypes, including symptoms of disease.
+          It must be semicolon-separated.
+          Labels containing the word 'with' should be split into multiple phenotypes.
+
+  HumanPhenotype:
+    is_a: NamedEntity  # parent class; not defined in this file
+    id_prefixes:
+      - HP  # expands via the HP entry under the schema's prefixes
+    annotations:
+      annotators: sqlite:obo:hp, sqlite:obo:mondo, sqlite:obo:mesh, sqlite:obo:ncit  # comma-separated annotator list
+      prompt: >-  # folded scalar: continuation lines share one indent so they join with a single space
+        the name of a human phenotype or symptom.
+        Examples are ascites, fever, pain, seizure, increased intracranial pressure, lactic acidosis.