-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a template specifically for extracting human phenotypes (#214)
This includes some extra prompt text to encourage the model to return a list compatible with parsing: one input text can contain numerous entities to extract, and GPT-3+ appears more likely to separate lists with commas than with semicolons as the lists grow longer. This suggests that most text should be parsed this way, or at least that an additional validation step could re-parse outputs that look like "entity1, entity2, entity3" and don't ground.
- Loading branch information
Showing
2 changed files
with
166 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
from __future__ import annotations | ||
from datetime import datetime, date | ||
from enum import Enum | ||
from typing import List, Dict, Optional, Any, Union | ||
from pydantic import BaseModel as BaseModel, Field | ||
import sys | ||
if sys.version_info >= (3, 8): | ||
from typing import Literal | ||
else: | ||
from typing_extensions import Literal | ||
|
||
|
||
# Module-level version markers. Both hold the literal string "None",
# not the ``None`` singleton.
metamodel_version = "None"
version = "None"
|
||
# Shared pydantic base class for the models in this module. The class
# keyword arguments are pydantic model-configuration settings, so every
# subclass inherits the same validation behaviour.
class ConfiguredBaseModel(
    BaseModel,
    validate_assignment=True,      # validate values on attribute assignment too
    validate_default=True,         # run validation on default values as well
    extra='forbid',                # reject fields that are not declared
    arbitrary_types_allowed=True,  # permit field types pydantic has no schema for
    use_enum_values=True,          # store an enum member's value, not the member
):
    pass
|
||
|
||
# String-valued enumeration; each member's value is identical to its name.
# NOTE(review): presumably these are placeholders for data that could not be
# extracted -- confirm against the schema this module was generated from.
class NullDataOptions(str, Enum):

    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"

    NOT_APPLICABLE = "NOT_APPLICABLE"

    NOT_MENTIONED = "NOT_MENTIONED"
|
||
|
||
|
||
class ExtractionResult(ConfiguredBaseModel):
    """
    A result of extracting knowledge on text
    """
    # Every field is optional. The scalar fields default to None, while
    # named_entities defaults to a fresh empty list per instance.
    input_id: Optional[str] = Field(None)
    input_title: Optional[str] = Field(None)
    input_text: Optional[str] = Field(None)
    raw_completion_output: Optional[str] = Field(None)
    prompt: Optional[str] = Field(None)
    extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""")
    named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""")
|
||
|
||
# Base model for named things: an optional identifier and an optional label.
# Subclasses in this module redeclare `id` as a required field.
class NamedEntity(ConfiguredBaseModel):

    id: Optional[str] = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
# A named entity carrying a list of phenotype strings. `phenotypes` defaults
# to a fresh empty list; `id` is redeclared here as required (`Field(...)`).
class HumanPhenotypeSet(NamedEntity):

    phenotypes: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of human phenotypes, including symptoms of disease. It must be semicolon-separated. Labels containing the word 'with' should be split into multiple phenotypes.""")
    id: str = Field(..., description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
# A single phenotype entity. Same fields as NamedEntity, except that `id`
# is required here (`Field(...)`) instead of optional.
class HumanPhenotype(NamedEntity):

    id: str = Field(..., description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
# Field-less base model; in this module it only serves as the parent of Triple.
class CompoundExpression(ConfiguredBaseModel):

    pass
|
||
|
||
class Triple(CompoundExpression):
    """
    Abstract parent for Relation Extraction tasks
    """
    # All six fields are optional strings defaulting to None.
    subject: Optional[str] = Field(None)
    predicate: Optional[str] = Field(None)
    object: Optional[str] = Field(None)
    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""")
    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""")
    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""")
|
||
|
||
# Pairs an optional Publication with a list of Triple objects; the list
# defaults to a fresh empty list.
class TextWithTriples(ConfiguredBaseModel):

    # `Publication` is declared further down the module. The annotation is a
    # forward reference, which works because of
    # `from __future__ import annotations` at the top of the file.
    publication: Optional[Publication] = Field(None)
    triples: Optional[List[Triple]] = Field(default_factory=list)
|
||
|
||
# A named entity with a required `id` (`Field(...)`) and an optional label.
class RelationshipType(NamedEntity):

    id: str = Field(..., description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
# Publication record: identifier, title, abstract, combined text and full text.
# Every field is an optional string that defaults to None.
class Publication(ConfiguredBaseModel):

    id: Optional[str] = Field(None, description="""The publication identifier""")
    title: Optional[str] = Field(None, description="""The title of the publication""")
    abstract: Optional[str] = Field(None, description="""The abstract of the publication""")
    combined_text: Optional[str] = Field(None)
    full_text: Optional[str] = Field(None, description="""The full text of the publication""")
|
||
|
||
# Holds a subject text plus an object id and object text; all three are
# optional strings defaulting to None.
class AnnotatorResult(ConfiguredBaseModel):

    subject_text: Optional[str] = Field(None)
    object_id: Optional[str] = Field(None)
    object_text: Optional[str] = Field(None)
|
||
|
||
|
||
# Model rebuild
# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
#
# Annotations in this module are postponed (`from __future__ import
# annotations`) and at least one model refers to a class declared after it
# (TextWithTriples -> Publication). Each model is therefore rebuilt once all
# classes exist.
ExtractionResult.model_rebuild()
NamedEntity.model_rebuild()
HumanPhenotypeSet.model_rebuild()
HumanPhenotype.model_rebuild()
CompoundExpression.model_rebuild()
Triple.model_rebuild()
TextWithTriples.model_rebuild()
RelationshipType.model_rebuild()
Publication.model_rebuild()
AnnotatorResult.model_rebuild()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# LinkML schema: template for extracting human phenotypes from text.
id: http://w3id.org/ontogpt/human_phenotype
name: human_phenotype-template
title: Human Phenotype Extraction Template
description: >-
  A template for extracting human phenotypes to HPO terms
license: https://creativecommons.org/publicdomain/zero/1.0/
prefixes:
  linkml: https://w3id.org/linkml/
  human_phenotype: http://w3id.org/ontogpt/human_phenotype
  HP: http://purl.obolibrary.org/obo/HP_

default_prefix: human_phenotype
default_range: string

imports:
  - linkml:types
  - core

classes:
  # Root class of the template: a set of phenotypes found in one input text.
  HumanPhenotypeSet:
    tree_root: true
    is_a: NamedEntity
    attributes:
      phenotypes:
        range: HumanPhenotype
        multivalued: true
        description: >-
          A semicolon-separated list of human phenotypes, including symptoms of disease.
          It must be semicolon-separated.
          Labels containing the word 'with' should be split into multiple phenotypes.
  # A single phenotype; identifiers are expected to use the HP prefix.
  HumanPhenotype:
    is_a: NamedEntity
    id_prefixes:
      - HP
    annotations:
      annotators: sqlite:obo:hp, sqlite:obo:mondo, sqlite:obo:mesh, sqlite:obo:ncit
      prompt: >-
        the name of a human phenotype or symptom.
        Examples are ascites, fever, pain, seizure, increased intracranial pressure, lactic acidosis.