Skip to content

Commit

Permalink
Add a template specifically for extracting human phenotypes (#214)
Browse files Browse the repository at this point in the history
This includes some extra prompt text to encourage the model to return a
list compatible with parsing, as one input text can have numerous
entities to extract and GPT-3+ appears to be more likely to separate
lists with commas than semicolons as lists grow longer.
This suggests that most text should be parsed this way, or at least that
an additional validation step could re-parse outputs when they look like
"entity1, entity2, entity3" and do not ground.
  • Loading branch information
caufieldjh authored Oct 19, 2023
2 parents 3f1202a + c8d04cf commit 63ba04a
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 0 deletions.
126 changes: 126 additions & 0 deletions src/ontogpt/templates/human_phenotype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from __future__ import annotations
from datetime import datetime, date
from enum import Enum
from typing import List, Dict, Optional, Any, Union
from pydantic import BaseModel as BaseModel, Field
import sys
# `Literal` joined the standard `typing` module in Python 3.8; older
# interpreters have to take it from the `typing_extensions` backport.
if sys.version_info >= (3, 8):
    from typing import Literal
else:
    from typing_extensions import Literal


# Version metadata for this module. Both values are the literal string
# "None", not the None singleton.
version = "None"
metamodel_version = "None"

# Shared base class for every model in this module. The pydantic
# configuration is supplied as class keyword arguments, so all subclasses
# inherit the same settings:
#   - validate_assignment / validate_default: validation is also applied
#     to attribute assignment and to default values
#   - extra='forbid': unknown fields are rejected
#   - arbitrary_types_allowed: non-pydantic types may be used as field types
#   - use_enum_values: enum fields are stored as their underlying values
# (A comment is used instead of a docstring so that no description is
# added to the generated model schema.)
class ConfiguredBaseModel(
    BaseModel,
    validate_assignment=True,
    validate_default=True,
    extra='forbid',
    arbitrary_types_allowed=True,
    use_enum_values=True,
):
    pass


# String-valued enumeration of markers for absent data. Each member's value
# is identical to its name, and because the class also derives from `str`
# the members compare equal to those plain strings.
class NullDataOptions(str, Enum):

    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"

    NOT_APPLICABLE = "NOT_APPLICABLE"

    NOT_MENTIONED = "NOT_MENTIONED"



class ExtractionResult(ConfiguredBaseModel):
    """
    A result of extracting knowledge on text
    """

    # Every field is optional. All default to None except `named_entities`,
    # which defaults to a new empty list per instance (default_factory).
    input_id: Optional[str] = Field(None)
    input_title: Optional[str] = Field(None)
    input_text: Optional[str] = Field(None)
    raw_completion_output: Optional[str] = Field(None)
    prompt: Optional[str] = Field(None)
    extracted_object: Optional[Any] = Field(
        None,
        description="""The complex objects extracted from the text""",
    )
    named_entities: Optional[List[Any]] = Field(
        default_factory=list,
        description="""Named entities extracted from the text""",
    )


# Base model for identified, labelled things. Both fields are optional
# here; the subclasses in this module redeclare `id` as required.
class NamedEntity(ConfiguredBaseModel):

    id: Optional[str] = Field(
        None,
        description="""A unique identifier for the named entity""",
    )
    label: Optional[str] = Field(
        None,
        description="""The label (name) of the named thing""",
    )


# A named entity carrying a list of phenotypes. The phenotypes are held as
# plain strings, and `id` is redeclared as a required field (the parent
# NamedEntity leaves it optional).
class HumanPhenotypeSet(NamedEntity):

    # The description doubles as prompt guidance: it insists on semicolons
    # as the list separator. Defaults to a new empty list per instance.
    phenotypes: Optional[List[str]] = Field(
        default_factory=list,
        description="""A semicolon-separated list of human phenotypes, including symptoms of disease. It must be semicolon-separated. Labels containing the word 'with' should be split into multiple phenotypes.""",
    )
    id: str = Field(
        ...,
        description="""A unique identifier for the named entity""",
    )
    label: Optional[str] = Field(
        None,
        description="""The label (name) of the named thing""",
    )


# A single human phenotype. It adds no fields beyond NamedEntity, but
# redeclares `id` as required.
class HumanPhenotype(NamedEntity):

    id: str = Field(
        ...,
        description="""A unique identifier for the named entity""",
    )
    label: Optional[str] = Field(
        None,
        description="""The label (name) of the named thing""",
    )


# Field-less model; in this module it serves only as the parent of Triple.
class CompoundExpression(ConfiguredBaseModel):

    # The original body was the bare expression `None`, a no-op; `pass`
    # states the same thing in the conventional way.
    pass


class Triple(CompoundExpression):
    """
    Abstract parent for Relation Extraction tasks
    """

    # All parts of the statement are optional strings defaulting to None.
    # NOTE: the field name `object` shadows the builtin inside the class
    # body; it is part of the model's public schema, so it is kept.
    subject: Optional[str] = Field(None)
    predicate: Optional[str] = Field(None)
    object: Optional[str] = Field(None)
    qualifier: Optional[str] = Field(
        None,
        description="""A qualifier for the statements, e.g. \"NOT\" for negation""",
    )
    subject_qualifier: Optional[str] = Field(
        None,
        description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""",
    )
    object_qualifier: Optional[str] = Field(
        None,
        description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""",
    )


# Pairs an optional Publication with a list of Triple statements.
class TextWithTriples(ConfiguredBaseModel):

    # `Publication` is defined further down the module. The annotation is
    # a forward reference, made possible by
    # `from __future__ import annotations` and resolved by the
    # model_rebuild() calls at the end of the file.
    publication: Optional[Publication] = Field(None)
    # Defaults to a new empty list per instance.
    triples: Optional[List[Triple]] = Field(default_factory=list)


# A named entity for relationship types. It adds no fields beyond
# NamedEntity, but redeclares `id` as required.
class RelationshipType(NamedEntity):

    id: str = Field(
        ...,
        description="""A unique identifier for the named entity""",
    )
    label: Optional[str] = Field(
        None,
        description="""The label (name) of the named thing""",
    )


# Metadata and text of a publication. Every field is an optional string
# defaulting to None.
class Publication(ConfiguredBaseModel):

    id: Optional[str] = Field(
        None,
        description="""The publication identifier""",
    )
    title: Optional[str] = Field(
        None,
        description="""The title of the publication""",
    )
    abstract: Optional[str] = Field(
        None,
        description="""The abstract of the publication""",
    )
    combined_text: Optional[str] = Field(None)
    full_text: Optional[str] = Field(
        None,
        description="""The full text of the publication""",
    )


# Three optional string fields: a subject text plus an object id and text.
# All default to None.
class AnnotatorResult(ConfiguredBaseModel):

    subject_text: Optional[str] = Field(None)
    object_id: Optional[str] = Field(None)
    object_text: Optional[str] = Field(None)



# Rebuild every model now that all classes exist, in definition order, so
# that annotations naming a class declared later in the file (e.g.
# TextWithTriples.publication -> Publication) can be resolved.
# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
for _model in (
    ExtractionResult,
    NamedEntity,
    HumanPhenotypeSet,
    HumanPhenotype,
    CompoundExpression,
    Triple,
    TextWithTriples,
    RelationshipType,
    Publication,
    AnnotatorResult,
):
    _model.model_rebuild()
del _model  # keep the loop variable out of the module namespace

40 changes: 40 additions & 0 deletions src/ontogpt/templates/human_phenotype.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# LinkML schema: template for extracting human phenotypes and grounding
# them to HPO terms.
id: http://w3id.org/ontogpt/human_phenotype
name: human_phenotype-template
title: Human Phenotype Extraction Template
description: >-
  A template for extracting human phenotypes to HPO terms
license: https://creativecommons.org/publicdomain/zero/1.0/
prefixes:
  linkml: https://w3id.org/linkml/
  # NOTE(review): unlike the other two prefixes, this expansion has no
  # trailing separator ("/" or "_"), so CURIEs in this namespace expand by
  # plain concatenation -- confirm that is intended.
  human_phenotype: http://w3id.org/ontogpt/human_phenotype
  HP: http://purl.obolibrary.org/obo/HP_

default_prefix: human_phenotype
default_range: string

imports:
  - linkml:types
  - core

classes:
  # Root class of the template: the set of phenotypes found in one text.
  HumanPhenotypeSet:
    tree_root: true
    is_a: NamedEntity
    attributes:
      phenotypes:
        range: HumanPhenotype
        multivalued: true
        # The description repeats the separator requirement on purpose, to
        # steer the model toward a semicolon-delimited list.
        description: >-
          A semicolon-separated list of human phenotypes, including symptoms of disease.
          It must be semicolon-separated.
          Labels containing the word 'with' should be split into multiple phenotypes.
  # A single phenotype; identifiers use the HP prefix.
  HumanPhenotype:
    is_a: NamedEntity
    id_prefixes:
      - HP
    annotations:
      annotators: sqlite:obo:hp, sqlite:obo:mondo, sqlite:obo:mesh, sqlite:obo:ncit
      prompt: >-
        the name of a human phenotype or symptom.
        Examples are ascites, fever, pain, seizure, increased intracranial pressure, lactic acidosis.

0 comments on commit 63ba04a

Please sign in to comment.