From 0006a7b76cd8b23c372c6a438fc70c92496c32e9 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Mon, 4 Nov 2024 14:55:35 -0500 Subject: [PATCH] Add initial version of micronutrient extraction template --- src/ontogpt/templates/ecosim_methods.py | 316 +++++++++++++++ src/ontogpt/templates/ecosim_methods.yaml | 99 +++++ src/ontogpt/templates/ecosim_simple.py | 243 +++++++++++ src/ontogpt/templates/ecosim_simple.yaml | 39 ++ src/ontogpt/templates/micronutrient.py | 396 ++++++++++++++++++ src/ontogpt/templates/micronutrient.yaml | 77 ++++ src/ontogpt/templates/vbo_char.py | 469 ++++++++++++++++++++++ src/ontogpt/templates/vbo_char.yaml | 124 ++++++ src/ontogpt/templates/vbo_names.py | 351 ++++++++++++++++ src/ontogpt/templates/vbo_names.yaml | 57 +++ 10 files changed, 2171 insertions(+) create mode 100644 src/ontogpt/templates/ecosim_methods.py create mode 100644 src/ontogpt/templates/ecosim_methods.yaml create mode 100644 src/ontogpt/templates/ecosim_simple.py create mode 100644 src/ontogpt/templates/ecosim_simple.yaml create mode 100644 src/ontogpt/templates/micronutrient.py create mode 100644 src/ontogpt/templates/micronutrient.yaml create mode 100644 src/ontogpt/templates/vbo_char.py create mode 100644 src/ontogpt/templates/vbo_char.yaml create mode 100644 src/ontogpt/templates/vbo_names.py create mode 100644 src/ontogpt/templates/vbo_names.yaml diff --git a/src/ontogpt/templates/ecosim_methods.py b/src/ontogpt/templates/ecosim_methods.py new file mode 100644 index 000000000..b3280f677 --- /dev/null +++ b/src/ontogpt/templates/ecosim_methods.py @@ -0,0 +1,316 @@ +from __future__ import annotations +from datetime import ( + datetime, + date, + time +) +from decimal import Decimal +from enum import Enum +import re +import sys +from typing import ( + Any, + ClassVar, + List, + Literal, + Dict, + Optional, + Union +) +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + field_validator +) +metamodel_version = "None" +version = "None" + + +class 
ConfiguredBaseModel(BaseModel): + model_config = ConfigDict( + validate_assignment = True, + validate_default = True, + extra = "forbid", + arbitrary_types_allowed = True, + use_enum_values = True, + strict = False, + ) + pass + + + + +class LinkMLMeta(RootModel): + root: Dict[str, Any] = {} + model_config = ConfigDict(frozen=True) + + def __getattr__(self, key:str): + return getattr(self.root, key) + + def __getitem__(self, key:str): + return self.root[key] + + def __setitem__(self, key:str, value): + self.root[key] = value + + def __contains__(self, key:str) -> bool: + return key in self.root + + +linkml_meta = LinkMLMeta({'default_prefix': 'ecosim_methods', + 'default_range': 'string', + 'description': 'EcoSIM Methods Extraction Template', + 'id': 'http://w3id.org/ontogpt/ecosim_methods', + 'imports': ['linkml:types', 'core'], + 'license': 'https://creativecommons.org/publicdomain/zero/1.0/', + 'name': 'ecosim_methods', + 'prefixes': {'ecosim': {'prefix_prefix': 'ecosim', + 'prefix_reference': 'http://purl.obolibrary.org/obo/ecosim'}, + 'ecosim_simple': {'prefix_prefix': 'ecosim_simple', + 'prefix_reference': 'http://w3id.org/ontogpt/ecosim_simple'}, + 'linkml': {'prefix_prefix': 'linkml', + 'prefix_reference': 'https://w3id.org/linkml/'}, + 'rdf': {'prefix_prefix': 'rdf', + 'prefix_reference': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'}}, + 'source_file': '/home/harry/ontogpt/src/ontogpt/templates/ecosim_methods.yaml', + 'title': 'EcoSIM Methods Extraction Template'} ) + +class NullDataOptions(str, Enum): + UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION" + NOT_APPLICABLE = "NOT_APPLICABLE" + NOT_MENTIONED = "NOT_MENTIONED" + + + +class ExtractionResult(ConfiguredBaseModel): + """ + A result of extracting knowledge on text + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + input_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_id', 
'domain_of': ['ExtractionResult']} }) + input_title: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_title', 'domain_of': ['ExtractionResult']} }) + input_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_text', 'domain_of': ['ExtractionResult']} }) + raw_completion_output: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'raw_completion_output', 'domain_of': ['ExtractionResult']} }) + prompt: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'prompt', 'domain_of': ['ExtractionResult']} }) + extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'extracted_object', 'domain_of': ['ExtractionResult']} }) + named_entities: Optional[List[Any]] = Field(None, description="""Named entities extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'named_entities', 'domain_of': ['ExtractionResult']} }) + + +class NamedEntity(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class CompoundExpression(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': 
True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + pass + + +class Triple(CompoundExpression): + """ + Abstract parent for Relation Extraction tasks + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject', 'domain_of': ['Triple']} }) + predicate: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'predicate', 'domain_of': ['Triple']} }) + object: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object', 'domain_of': ['Triple']} }) + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""", json_schema_extra = { "linkml_meta": {'alias': 'qualifier', 'domain_of': ['Triple']} }) + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""", json_schema_extra = { "linkml_meta": {'alias': 'subject_qualifier', 'domain_of': ['Triple']} }) + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""", json_schema_extra = { "linkml_meta": {'alias': 'object_qualifier', 'domain_of': ['Triple']} }) + + +class TextWithTriples(ConfiguredBaseModel): + """ + A text containing one or more relations of the Triple type. 
+ """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + triples: Optional[List[Triple]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'triples', 'domain_of': ['TextWithTriples']} }) + + +class TextWithEntity(ConfiguredBaseModel): + """ + A text containing one or more instances of a single type of entity. + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + entities: Optional[List[str]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'entities', 'domain_of': ['TextWithEntity']} }) + + +class RelationshipType(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core', + 'id_prefixes': ['RO', 'biolink']}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class 
Publication(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + id: Optional[str] = Field(None, description="""The publication identifier""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'domain_of': ['NamedEntity', 'Publication']} }) + title: Optional[str] = Field(None, description="""The title of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'title', 'domain_of': ['Publication']} }) + abstract: Optional[str] = Field(None, description="""The abstract of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'abstract', 'domain_of': ['Publication']} }) + combined_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'combined_text', 'domain_of': ['Publication']} }) + full_text: Optional[str] = Field(None, description="""The full text of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'full_text', 'domain_of': ['Publication']} }) + + +class AnnotatorResult(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject_text', 'domain_of': ['AnnotatorResult']} }) + object_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_id', 'domain_of': ['AnnotatorResult']} }) + object_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_text', 'domain_of': ['AnnotatorResult']} }) + + +class TermSet(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/ecosim_methods', 'tree_root': True}) + + locations: Optional[List[str]] = Field(None, description="""A semicolon-separated list of research locations. 
Examples include: Vermont, New York City, Ethiopia""", json_schema_extra = { "linkml_meta": {'alias': 'locations', 'domain_of': ['TermSet']} }) + methods: Optional[List[str]] = Field(None, description="""A semicolon-separated list of methods used in environmental and earth science research. Examples include: sampling, spectroscopy""", json_schema_extra = { "linkml_meta": {'alias': 'methods', 'domain_of': ['TermSet']} }) + variables: Optional[str] = Field(None, description="""A semicolon-separated list of variables measured in environmental and earth science research. Examples include: root shape, biomass, water turbidity""", json_schema_extra = { "linkml_meta": {'alias': 'variables', 'domain_of': ['TermSet']} }) + equipments: Optional[str] = Field(None, description="""A semicolon-separated list of equipment used in environmental and earth science research.""", json_schema_extra = { "linkml_meta": {'alias': 'equipments', 'domain_of': ['TermSet']} }) + equipment_to_variable_relationships: Optional[List[EquipmentMeasuresVariable]] = Field(None, description="""A semicolon separated list of relationships between specific equipment and variables they are used to measure as described in the input. 
Example: NMR spectrometer was used to measure chemical content""", json_schema_extra = { "linkml_meta": {'alias': 'equipment_to_variable_relationships', 'domain_of': ['TermSet']} }) + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class Location(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'prompt': {'tag': 'prompt', + 'value': 'The name of a location used in ' + 'research.'}}, + 'from_schema': 'http://w3id.org/ontogpt/ecosim_methods'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class Method(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', + 'value': 'bioportal:ECOSIM'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a 
method used in environment ' + 'and earth science research.'}}, + 'from_schema': 'http://w3id.org/ontogpt/ecosim_methods'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class Variable(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', + 'value': 'bioportal:ECOSIM'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a variable measured in ' + 'environment and earth science ' + 'research.'}}, + 'from_schema': 'http://w3id.org/ontogpt/ecosim_methods'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class Equipment(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'prompt': {'tag': 'prompt', + 'value': 'The name of a piece of 
equipment used in ' + 'environment and earth science ' + 'research.'}}, + 'from_schema': 'http://w3id.org/ontogpt/ecosim_methods'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class EquipmentMeasuresVariable(CompoundExpression): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/ecosim_methods'}) + + equipment: Optional[str] = Field(None, description="""Name of the equipment used to measure a variable.""", json_schema_extra = { "linkml_meta": {'alias': 'equipment', 'domain_of': ['EquipmentMeasuresVariable']} }) + variable: Optional[str] = Field(None, description="""Name of the variable being measured.""", json_schema_extra = { "linkml_meta": {'alias': 'variable', 'domain_of': ['EquipmentMeasuresVariable']} }) + + +# Model rebuild +# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model +ExtractionResult.model_rebuild() +NamedEntity.model_rebuild() +CompoundExpression.model_rebuild() +Triple.model_rebuild() +TextWithTriples.model_rebuild() +TextWithEntity.model_rebuild() +RelationshipType.model_rebuild() +Publication.model_rebuild() +AnnotatorResult.model_rebuild() +TermSet.model_rebuild() +Location.model_rebuild() +Method.model_rebuild() +Variable.model_rebuild() +Equipment.model_rebuild() +EquipmentMeasuresVariable.model_rebuild() diff --git a/src/ontogpt/templates/ecosim_methods.yaml 
b/src/ontogpt/templates/ecosim_methods.yaml new file mode 100644 index 000000000..5900af1e7 --- /dev/null +++ b/src/ontogpt/templates/ecosim_methods.yaml @@ -0,0 +1,99 @@ +id: http://w3id.org/ontogpt/ecosim_methods +name: ecosim_methods +title: EcoSIM Methods Extraction Template +description: >- + EcoSIM Methods Extraction Template +license: https://creativecommons.org/publicdomain/zero/1.0/ +prefixes: + rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# + linkml: https://w3id.org/linkml/ + ecosim_simple: http://w3id.org/ontogpt/ecosim_simple + ecosim: http://purl.obolibrary.org/obo/ecosim + +default_prefix: ecosim_methods +default_range: string + +imports: + - linkml:types + - core + +classes: + TermSet: + tree_root: true + is_a: NamedEntity + attributes: + locations: + range: Location + multivalued: true + description: >- + A semicolon-separated list of research locations. + Examples include: Vermont, New York City, + Ethiopia + methods: + range: Method + multivalued: true + description: >- + A semicolon-separated list of methods used in + environmental and earth science research. Examples + include: sampling, spectroscopy + variables: + range: Variable + description: >- + A semicolon-separated list of variables measured in + environmental and earth science research. Examples + include: root shape, biomass, water turbidity + equipments: + range: Equipment + description: >- + A semicolon-separated list of equipment used in + environmental and earth science research. + equipment_to_variable_relationships: + range: EquipmentMeasuresVariable + description: >- + A semicolon separated list of relationships + between specific equipment and variables + they are used to measure as described in the input. + Example: NMR spectrometer was used to measure + chemical content + multivalued: true + inlined: true + + Location: + is_a: NamedEntity + annotations: + prompt: >- + The name of a location used in research. 
+ + Method: + is_a: NamedEntity + annotations: + annotators: bioportal:ECOSIM + prompt: >- + The name of a method used in environment and + earth science research. + + Variable: + is_a: NamedEntity + annotations: + annotators: bioportal:ECOSIM + prompt: >- + The name of a variable measured in environment and + earth science research. + + Equipment: + is_a: NamedEntity + annotations: + prompt: >- + The name of a piece of equipment used in + environment and earth science research. + + EquipmentMeasuresVariable: + is_a: CompoundExpression + attributes: + equipment: + range: Equipment + description: Name of the equipment used to measure a variable. + variable: + range: Variable + description: Name of the variable being measured. + diff --git a/src/ontogpt/templates/ecosim_simple.py b/src/ontogpt/templates/ecosim_simple.py new file mode 100644 index 000000000..1ee531917 --- /dev/null +++ b/src/ontogpt/templates/ecosim_simple.py @@ -0,0 +1,243 @@ +from __future__ import annotations +from datetime import ( + datetime, + date, + time +) +from decimal import Decimal +from enum import Enum +import re +import sys +from typing import ( + Any, + ClassVar, + List, + Literal, + Dict, + Optional, + Union +) +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + field_validator +) +metamodel_version = "None" +version = "None" + + +class ConfiguredBaseModel(BaseModel): + model_config = ConfigDict( + validate_assignment = True, + validate_default = True, + extra = "forbid", + arbitrary_types_allowed = True, + use_enum_values = True, + strict = False, + ) + pass + + + + +class LinkMLMeta(RootModel): + root: Dict[str, Any] = {} + model_config = ConfigDict(frozen=True) + + def __getattr__(self, key:str): + return getattr(self.root, key) + + def __getitem__(self, key:str): + return self.root[key] + + def __setitem__(self, key:str, value): + self.root[key] = value + + def __contains__(self, key:str) -> bool: + return key in self.root + + +linkml_meta = 
LinkMLMeta({'default_prefix': 'ecosim_simple', + 'default_range': 'string', + 'description': 'Simple EcoSIM Extraction Template', + 'id': 'http://w3id.org/ontogpt/ecosim_simple', + 'imports': ['linkml:types', 'core'], + 'license': 'https://creativecommons.org/publicdomain/zero/1.0/', + 'name': 'ecosim_simple', + 'prefixes': {'ecosim': {'prefix_prefix': 'ecosim', + 'prefix_reference': 'http://purl.obolibrary.org/obo/ecosim'}, + 'ecosim_simple': {'prefix_prefix': 'ecosim_simple', + 'prefix_reference': 'http://w3id.org/ontogpt/ecosim_simple'}, + 'linkml': {'prefix_prefix': 'linkml', + 'prefix_reference': 'https://w3id.org/linkml/'}, + 'rdf': {'prefix_prefix': 'rdf', + 'prefix_reference': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'}}, + 'source_file': '/home/harry/ontogpt/src/ontogpt/templates/ecosim_simple.yaml', + 'title': 'Simple EcoSIM Extraction Template'} ) + +class NullDataOptions(str, Enum): + UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION" + NOT_APPLICABLE = "NOT_APPLICABLE" + NOT_MENTIONED = "NOT_MENTIONED" + + + +class ExtractionResult(ConfiguredBaseModel): + """ + A result of extracting knowledge on text + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + input_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_id', 'domain_of': ['ExtractionResult']} }) + input_title: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_title', 'domain_of': ['ExtractionResult']} }) + input_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_text', 'domain_of': ['ExtractionResult']} }) + raw_completion_output: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'raw_completion_output', 'domain_of': ['ExtractionResult']} }) + prompt: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'prompt', 'domain_of': ['ExtractionResult']} }) + 
extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'extracted_object', 'domain_of': ['ExtractionResult']} }) + named_entities: Optional[List[Any]] = Field(None, description="""Named entities extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'named_entities', 'domain_of': ['ExtractionResult']} }) + + +class NamedEntity(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class CompoundExpression(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + pass + + +class Triple(CompoundExpression): + """ + Abstract parent for Relation Extraction tasks + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject', 'domain_of': ['Triple']} }) + predicate: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'predicate', 'domain_of': ['Triple']} }) + object: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 
'object', 'domain_of': ['Triple']} }) + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""", json_schema_extra = { "linkml_meta": {'alias': 'qualifier', 'domain_of': ['Triple']} }) + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""", json_schema_extra = { "linkml_meta": {'alias': 'subject_qualifier', 'domain_of': ['Triple']} }) + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""", json_schema_extra = { "linkml_meta": {'alias': 'object_qualifier', 'domain_of': ['Triple']} }) + + +class TextWithTriples(ConfiguredBaseModel): + """ + A text containing one or more relations of the Triple type. + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + triples: Optional[List[Triple]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'triples', 'domain_of': ['TextWithTriples']} }) + + +class TextWithEntity(ConfiguredBaseModel): + """ + A text containing one or more instances of a single type of entity. 
+ """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + entities: Optional[List[str]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'entities', 'domain_of': ['TextWithEntity']} }) + + +class RelationshipType(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core', + 'id_prefixes': ['RO', 'biolink']}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class Publication(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + id: Optional[str] = Field(None, description="""The publication identifier""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'domain_of': ['NamedEntity', 'Publication']} }) + title: Optional[str] = Field(None, description="""The title of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'title', 'domain_of': ['Publication']} }) + abstract: Optional[str] = Field(None, description="""The abstract of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'abstract', 
'domain_of': ['Publication']} }) + combined_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'combined_text', 'domain_of': ['Publication']} }) + full_text: Optional[str] = Field(None, description="""The full text of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'full_text', 'domain_of': ['Publication']} }) + + +class AnnotatorResult(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject_text', 'domain_of': ['AnnotatorResult']} }) + object_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_id', 'domain_of': ['AnnotatorResult']} }) + object_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_text', 'domain_of': ['AnnotatorResult']} }) + + +class TermSet(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/ecosim_simple', 'tree_root': True}) + + terms: Optional[List[str]] = Field(None, description="""A semicolon-separated list of variables for earth system simulation. Do not include abbreviations in parentheses, e.g., \"Carbon (C)\" should be represented as \"carbon\". 
Examples include: carboxylation, sodium, underground irrigation.""", json_schema_extra = { "linkml_meta": {'alias': 'terms', 'domain_of': ['TermSet']} }) + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +class Term(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', + 'value': 'bioportal:ECOSIM'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a variable for earth system ' + 'simulation.'}}, + 'from_schema': 'http://w3id.org/ontogpt/ecosim_simple'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + + +# Model rebuild +# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model +ExtractionResult.model_rebuild() +NamedEntity.model_rebuild() 
+CompoundExpression.model_rebuild() +Triple.model_rebuild() +TextWithTriples.model_rebuild() +TextWithEntity.model_rebuild() +RelationshipType.model_rebuild() +Publication.model_rebuild() +AnnotatorResult.model_rebuild() +TermSet.model_rebuild() +Term.model_rebuild() diff --git a/src/ontogpt/templates/ecosim_simple.yaml b/src/ontogpt/templates/ecosim_simple.yaml new file mode 100644 index 000000000..eff28b9d5 --- /dev/null +++ b/src/ontogpt/templates/ecosim_simple.yaml @@ -0,0 +1,39 @@ +id: http://w3id.org/ontogpt/ecosim_simple +name: ecosim_simple +title: Simple EcoSIM Extraction Template +description: >- + Simple EcoSIM Extraction Template +license: https://creativecommons.org/publicdomain/zero/1.0/ +prefixes: + rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# + linkml: https://w3id.org/linkml/ + ecosim_simple: http://w3id.org/ontogpt/ecosim_simple + ecosim: http://purl.obolibrary.org/obo/ecosim + +default_prefix: ecosim_simple +default_range: string + +imports: + - linkml:types + - core + +classes: + TermSet: + tree_root: true + is_a: NamedEntity + attributes: + terms: + range: Term + multivalued: true + description: >- + A semicolon-separated list of variables + for earth system simulation. Do not include + abbreviations in parentheses, e.g., "Carbon (C)" + should be represented as "carbon". Examples include: carboxylation, sodium, underground irrigation. + + Term: + is_a: NamedEntity + annotations: + annotators: bioportal:ECOSIM + prompt: >- + The name of a variable for earth system simulation. 
diff --git a/src/ontogpt/templates/micronutrient.py b/src/ontogpt/templates/micronutrient.py new file mode 100644 index 000000000..6703d9cec --- /dev/null +++ b/src/ontogpt/templates/micronutrient.py @@ -0,0 +1,396 @@ +from __future__ import annotations + +import re +import sys +from datetime import ( + date, + datetime, + time +) +from decimal import Decimal +from enum import Enum +from typing import ( + Any, + ClassVar, + Dict, + List, + Literal, + Optional, + Union +) + +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + field_validator +) + + +metamodel_version = "None" +version = "None" + + +class ConfiguredBaseModel(BaseModel): + model_config = ConfigDict( + validate_assignment = True, + validate_default = True, + extra = "forbid", + arbitrary_types_allowed = True, + use_enum_values = True, + strict = False, + ) + pass + + + + +class LinkMLMeta(RootModel): + root: Dict[str, Any] = {} + model_config = ConfigDict(frozen=True) + + def __getattr__(self, key:str): + return getattr(self.root, key) + + def __getitem__(self, key:str): + return self.root[key] + + def __setitem__(self, key:str, value): + self.root[key] = value + + def __contains__(self, key:str) -> bool: + return key in self.root + + +linkml_meta = LinkMLMeta({'default_prefix': 'micronutrient', + 'default_range': 'string', + 'description': 'A template for micronutrient information from text, including ' + 'its participation in biochemical pathways and relationships ' + 'to genes and diseases.', + 'id': 'http://w3id.org/ontogpt/micronutrient', + 'imports': ['linkml:types', 'core'], + 'license': 'https://creativecommons.org/publicdomain/zero/1.0/', + 'name': 'micronutrient', + 'prefixes': {'GO': {'prefix_prefix': 'GO', + 'prefix_reference': 'http://purl.obolibrary.org/obo/GO_'}, + 'chebi': {'prefix_prefix': 'chebi', + 'prefix_reference': 'http://purl.obolibrary.org/obo/CHEBI_'}, + 'foodon': {'prefix_prefix': 'foodon', + 'prefix_reference': 'http://purl.obolibrary.org/obo/foodon_'}, 
+ 'linkml': {'prefix_prefix': 'linkml', + 'prefix_reference': 'https://w3id.org/linkml/'}, + 'micronutrient': {'prefix_prefix': 'micronutrient', + 'prefix_reference': 'http://w3id.org/ontogpt/micronutrient'}, + 'rdf': {'prefix_prefix': 'rdf', + 'prefix_reference': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'}}, + 'source_file': 'src/ontogpt/templates/micronutrient.yaml', + 'title': 'Food Extraction Template'} ) + +class NullDataOptions(str, Enum): + UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION" + NOT_APPLICABLE = "NOT_APPLICABLE" + NOT_MENTIONED = "NOT_MENTIONED" + + + +class ExtractionResult(ConfiguredBaseModel): + """ + A result of extracting knowledge on text + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + input_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_id', 'domain_of': ['ExtractionResult']} }) + input_title: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_title', 'domain_of': ['ExtractionResult']} }) + input_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_text', 'domain_of': ['ExtractionResult']} }) + raw_completion_output: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'raw_completion_output', 'domain_of': ['ExtractionResult']} }) + prompt: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'prompt', 'domain_of': ['ExtractionResult']} }) + extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'extracted_object', 'domain_of': ['ExtractionResult']} }) + named_entities: Optional[List[Any]] = Field(None, description="""Named entities extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'named_entities', 'domain_of': ['ExtractionResult']} }) + + +class 
NamedEntity(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. 
Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): # check each span, not the list itself + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class CompoundExpression(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + pass + + +class Triple(CompoundExpression): + """ + Abstract parent for Relation Extraction tasks + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject', 'domain_of': ['Triple']} }) + predicate: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'predicate', 'domain_of': ['Triple']} }) + object: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object', 'domain_of': ['Triple']} }) + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""", json_schema_extra = { "linkml_meta": {'alias': 'qualifier', 'domain_of': ['Triple']} }) + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g.
\"high dose\" or \"intravenously administered\"""", json_schema_extra = { "linkml_meta": {'alias': 'subject_qualifier', 'domain_of': ['Triple']} }) + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""", json_schema_extra = { "linkml_meta": {'alias': 'object_qualifier', 'domain_of': ['Triple']} }) + + +class TextWithTriples(ConfiguredBaseModel): + """ + A text containing one or more relations of the Triple type. + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + triples: Optional[List[Triple]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'triples', 'domain_of': ['TextWithTriples']} }) + + +class TextWithEntity(ConfiguredBaseModel): + """ + A text containing one or more instances of a single type of entity. 
+ """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + entities: Optional[List[str]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'entities', 'domain_of': ['TextWithEntity']} }) + + +class RelationshipType(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core', + 'id_prefixes': ['RO', 'biolink']}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. 
Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): # check each span, not the list itself + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class Publication(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + id: Optional[str] = Field(None, description="""The publication identifier""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'domain_of': ['NamedEntity', 'Publication']} }) + title: Optional[str] = Field(None, description="""The title of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'title', 'domain_of': ['Publication']} }) + abstract: Optional[str] = Field(None, description="""The abstract of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'abstract', 'domain_of': ['Publication']} }) + combined_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'combined_text', 'domain_of': ['Publication']} }) + full_text: Optional[str] = Field(None, description="""The full text of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'full_text', 'domain_of': ['Publication']} }) + + +class AnnotatorResult(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta":
{'alias': 'subject_text', 'domain_of': ['AnnotatorResult']} }) + object_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_id', 'domain_of': ['AnnotatorResult']} }) + object_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_text', 'domain_of': ['AnnotatorResult']} }) + + +class Document(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/micronutrient', 'tree_root': True}) + + nutrientTerms: Optional[List[str]] = Field(None, description="""A semicolon-separated list of any names of nutrients or micronutrients, e.g., riboflavin, chromium, fiber""", json_schema_extra = { "linkml_meta": {'alias': 'nutrientTerms', 'domain_of': ['Document']} }) + nutrientToPathwayRelationships: Optional[List[str]] = Field(None, description="""A semicolon-separated list of relationships between nutrients and biochemical pathways, e.g., riboflavin IS INVOLVED IN citric acid cycle""", json_schema_extra = { "linkml_meta": {'alias': 'nutrientToPathwayRelationships', 'domain_of': ['Document']} }) + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. 
For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): # check each span, not the list itself + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class NutrientTerm(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', + 'value': 'sqlite:obo:foodon, sqlite:obo:chebi'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a nutrient.'}}, + 'from_schema': 'http://w3id.org/ontogpt/micronutrient', + 'id_prefixes': ['FOODON', 'CHEBI']}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans:
Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): # check each span, not the list itself + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class Pathway(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', 'value': 'sqlite:obo:go'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a biochemical pathway.'}}, + 'from_schema': 'http://w3id.org/ontogpt/micronutrient', + 'id_prefixes': ['GO']}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value':
'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): # check each span, not the list itself + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class NutrientToPathwayRelationship(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/micronutrient'}) + + nutrient: Optional[str] = Field(None, description="""The name of a nutrient.""", json_schema_extra = { "linkml_meta": {'alias': 'nutrient', 'domain_of': ['NutrientToPathwayRelationship']} }) + pathway: Optional[str] = Field(None, description="""The name of a biochemical pathway.""", json_schema_extra = { "linkml_meta": {'alias': 'pathway', 'domain_of': ['NutrientToPathwayRelationship']} }) + relationship: Optional[str] = Field(None, description="""The relationship between the nutrient and the pathway, for example \"IS INVOLVED IN\"""", json_schema_extra = { "linkml_meta": {'alias': 'relationship',
'domain_of': ['NutrientToPathwayRelationship']} }) + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. 
Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): # check each span, not the list itself + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +# Model rebuild +# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model +ExtractionResult.model_rebuild() +NamedEntity.model_rebuild() +CompoundExpression.model_rebuild() +Triple.model_rebuild() +TextWithTriples.model_rebuild() +TextWithEntity.model_rebuild() +RelationshipType.model_rebuild() +Publication.model_rebuild() +AnnotatorResult.model_rebuild() +Document.model_rebuild() +NutrientTerm.model_rebuild() +Pathway.model_rebuild() +NutrientToPathwayRelationship.model_rebuild() + diff --git a/src/ontogpt/templates/micronutrient.yaml b/src/ontogpt/templates/micronutrient.yaml new file mode 100644 index 000000000..4c2f82875 --- /dev/null +++ b/src/ontogpt/templates/micronutrient.yaml @@ -0,0 +1,77 @@ +id: http://w3id.org/ontogpt/micronutrient +name: micronutrient +title: Food Extraction Template +description: >- + A template for micronutrient information from text, + including its participation in biochemical pathways + and relationships to genes and diseases.
+license: https://creativecommons.org/publicdomain/zero/1.0/ +prefixes: + rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# + foodon: http://purl.obolibrary.org/obo/foodon_ + chebi: http://purl.obolibrary.org/obo/CHEBI_ + GO: http://purl.obolibrary.org/obo/GO_ + micronutrient: http://w3id.org/ontogpt/micronutrient + linkml: https://w3id.org/linkml/ + +default_prefix: micronutrient +default_range: string + +imports: + - linkml:types + - core + +classes: + Document: + tree_root: true + is_a: NamedEntity + attributes: + nutrientTerms: + range: NutrientTerm + multivalued: true + description: >- + A semicolon-separated list of any names of nutrients + or micronutrients, e.g., riboflavin, chromium, fiber + nutrientToPathwayRelationships: + range: NutrientToPathwayRelationship + multivalued: true + description: >- + A semicolon-separated list of relationships between + nutrients and biochemical pathways, e.g., riboflavin + IS INVOLVED IN citric acid cycle + + NutrientTerm: + is_a: NamedEntity + id_prefixes: + - FOODON + - CHEBI + annotations: + annotators: sqlite:obo:foodon, sqlite:obo:chebi + prompt: >- + The name of a nutrient. + + Pathway: + is_a: NamedEntity + id_prefixes: + - GO + annotations: + annotators: sqlite:obo:go + prompt: >- + The name of a biochemical pathway. + + NutrientToPathwayRelationship: + is_a: NamedEntity + attributes: + nutrient: + range: NutrientTerm + description: >- + The name of a nutrient. + pathway: + range: Pathway + description: >- + The name of a biochemical pathway. 
+ relationship: + range: string + description: >- + The relationship between the nutrient and the pathway, + for example "IS INVOLVED IN" diff --git a/src/ontogpt/templates/vbo_char.py b/src/ontogpt/templates/vbo_char.py new file mode 100644 index 000000000..6e8950fe5 --- /dev/null +++ b/src/ontogpt/templates/vbo_char.py @@ -0,0 +1,469 @@ +from __future__ import annotations + +import re +import sys +from datetime import ( + date, + datetime, + time +) +from decimal import Decimal +from enum import Enum +from typing import ( + Any, + ClassVar, + Dict, + List, + Literal, + Optional, + Union +) + +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + field_validator +) + + +metamodel_version = "None" +version = "None" + + +class ConfiguredBaseModel(BaseModel): + model_config = ConfigDict( + validate_assignment = True, + validate_default = True, + extra = "forbid", + arbitrary_types_allowed = True, + use_enum_values = True, + strict = False, + ) + pass + + + + +class LinkMLMeta(RootModel): + root: Dict[str, Any] = {} + model_config = ConfigDict(frozen=True) + + def __getattr__(self, key:str): + return getattr(self.root, key) + + def __getitem__(self, key:str): + return self.root[key] + + def __setitem__(self, key:str, value): + self.root[key] = value + + def __contains__(self, key:str) -> bool: + return key in self.root + + +linkml_meta = LinkMLMeta({'default_prefix': 'vbo_char', + 'default_range': 'string', + 'description': 'An extraction template for animal names present in VBO, along ' + 'with the characteristics of each breed', + 'id': 'http://w3id.org/ontogpt/vbo_char', + 'imports': ['linkml:types', 'core'], + 'license': 'https://creativecommons.org/publicdomain/zero/1.0/', + 'name': 'vbo_char', + 'prefixes': {'linkml': {'prefix_prefix': 'linkml', + 'prefix_reference': 'https://w3id.org/linkml/'}, + 'rdf': {'prefix_prefix': 'rdf', + 'prefix_reference': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'}, + 'vbo': {'prefix_prefix': 'vbo', + 
'prefix_reference': 'http://purl.obolibrary.org/obo/vbo'}, + 'vbo_char': {'prefix_prefix': 'vbo_char', + 'prefix_reference': 'http://w3id.org/ontogpt/vbo_char'}}, + 'source_file': '/home/harry/ontogpt/src/ontogpt/templates/vbo_char.yaml', + 'title': 'Extraction Template for Animal Breeds and their Characteristics'} ) + +class NullDataOptions(str, Enum): + UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION" + NOT_APPLICABLE = "NOT_APPLICABLE" + NOT_MENTIONED = "NOT_MENTIONED" + + + +class ExtractionResult(ConfiguredBaseModel): + """ + A result of extracting knowledge on text + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + input_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_id', 'domain_of': ['ExtractionResult']} }) + input_title: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_title', 'domain_of': ['ExtractionResult']} }) + input_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_text', 'domain_of': ['ExtractionResult']} }) + raw_completion_output: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'raw_completion_output', 'domain_of': ['ExtractionResult']} }) + prompt: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'prompt', 'domain_of': ['ExtractionResult']} }) + extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'extracted_object', 'domain_of': ['ExtractionResult']} }) + named_entities: Optional[List[Any]] = Field(None, description="""Named entities extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'named_entities', 'domain_of': ['ExtractionResult']} }) + + +class NamedEntity(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 
'http://w3id.org/ontogpt/core'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. 
Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): # check each span, not the list itself + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class CompoundExpression(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + pass + + +class Triple(CompoundExpression): + """ + Abstract parent for Relation Extraction tasks + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject', 'domain_of': ['Triple']} }) + predicate: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'predicate', 'domain_of': ['Triple']} }) + object: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object', 'domain_of': ['Triple']} }) + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""", json_schema_extra = { "linkml_meta": {'alias': 'qualifier', 'domain_of': ['Triple']} }) + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g.
\"high dose\" or \"intravenously administered\"""", json_schema_extra = { "linkml_meta": {'alias': 'subject_qualifier', 'domain_of': ['Triple']} }) + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""", json_schema_extra = { "linkml_meta": {'alias': 'object_qualifier', 'domain_of': ['Triple']} }) + + +class TextWithTriples(ConfiguredBaseModel): + """ + A text containing one or more relations of the Triple type. + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + triples: Optional[List[Triple]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'triples', 'domain_of': ['TextWithTriples']} }) + + +class TextWithEntity(ConfiguredBaseModel): + """ + A text containing one or more instances of a single type of entity. 
+ """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + entities: Optional[List[str]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'entities', 'domain_of': ['TextWithEntity']} }) + + +class RelationshipType(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core', + 'id_prefixes': ['RO', 'biolink']}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. 
Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class Publication(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + id: Optional[str] = Field(None, description="""The publication identifier""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'domain_of': ['NamedEntity', 'Publication']} }) + title: Optional[str] = Field(None, description="""The title of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'title', 'domain_of': ['Publication']} }) + abstract: Optional[str] = Field(None, description="""The abstract of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'abstract', 'domain_of': ['Publication']} }) + combined_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'combined_text', 'domain_of': ['Publication']} }) + full_text: Optional[str] = Field(None, description="""The full text of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'full_text', 'domain_of': ['Publication']} }) + + +class AnnotatorResult(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": 
{'alias': 'subject_text', 'domain_of': ['AnnotatorResult']} }) + object_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_id', 'domain_of': ['AnnotatorResult']} }) + object_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_text', 'domain_of': ['AnnotatorResult']} }) + + +class NameSet(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/vbo_char', 'tree_root': True}) + + animal_names: Optional[List[str]] = Field(None, description="""A semicolon-separated list of names of animals used in the input text. These are general names, e.g., if any breed of cat is mentioned, this list should include \"Cat breed\", or for any pig, include \"Pig breed\".""", json_schema_extra = { "linkml_meta": {'alias': 'animal_names', 'domain_of': ['NameSet']} }) + names: Optional[List[str]] = Field(None, description="""A semicolon-separated list of names of animal breeds used in the input text. These should be as specific as possible about the breed of the animal. Examples include: Gimbsheimer Enten, Debao pony, Baixi""", json_schema_extra = { "linkml_meta": {'alias': 'names', 'domain_of': ['NameSet']} }) + characteristics: Optional[List[BreedToCharacteristic]] = Field(None, description="""A semicolon-separated list of names of animal breeds used in the input text, along with a single characteristic mentioned for that breed. These should be as specific as possible about the breed of the animal. The characteristic may be color, dimensions, physical properties, abilities, or other features. Each statement should contain just one pair of breed name and characteristic. 
It should be formatted as \"Breed IS Characteristic\", or \"Breed HAS Characteristic\", e.g., Gimbsheimer Enten IS blue, Debao pony IS short, Baixi IS circular Each additional characteristic for a breed should get its own statement, e.g., \"breed name is tall and wide\" should become \"Breed IS tall\" and \"Breed is wide\".""", json_schema_extra = { "linkml_meta": {'alias': 'characteristics', 'domain_of': ['NameSet']} }) + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. 
Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class AnimalName(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', 'value': 'sqlite:obo:vbo'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a grouping category of ' + 'vertebrate animal breeds.'}}, + 'from_schema': 'http://w3id.org/ontogpt/vbo_char'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. 
For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class BreedName(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', 'value': 'sqlite:obo:vbo'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a vertebrate animal breed.'}}, + 'from_schema': 'http://w3id.org/ontogpt/vbo_char'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The 
coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class Characteristic(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', + 'value': 'sqlite:obo:uberon, sqlite:obo:pato'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a characteristic of an ' + 'animal.'}}, + 'from_schema': 'http://w3id.org/ontogpt/vbo_char'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 
'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class Descriptor(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', + 'value': 'sqlite:obo:pato'}, + 'prompt': {'tag': 'prompt', + 'value': 'A descriptor for a characteristic.'}}, + 'from_schema': 'http://w3id.org/ontogpt/vbo_char'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 
'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class BreedToCharacteristic(Triple): + """ + A triple in which the subject is an animal breed, the object is a characteristic, and the predicate is usually \"IS\" or \"HAS\". + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/vbo_char', + 'slot_usage': {'object': {'description': 'The specific characteristic. This ' + 'is generally a noun of the ' + 'characteristic, e.g., with "blue ' + 'fin" the object is "fin".', + 'name': 'object', + 'range': 'Characteristic'}, + 'object_qualifier': {'description': 'An optional qualifier or ' + 'modifier for the ' + 'characteristic. 
This is ' + 'generally a descriptor of ' + 'the characteristic, e.g., ' + 'with "blue fin" the ' + 'qualifier is "blue".', + 'name': 'object_qualifier', + 'range': 'Descriptor'}, + 'predicate': {'description': 'The relationship type, generally ' + 'IS or HAS to indicate a breed is ' + 'defined by having a specific ' + 'characteristic.', + 'name': 'predicate', + 'range': 'NamedEntity'}, + 'subject': {'name': 'subject', 'range': 'BreedName'}, + 'subject_qualifier': {'description': 'An optional qualifier or ' + 'modifier for the breed.', + 'name': 'subject_qualifier', + 'range': 'NamedEntity'}}}) + + subject: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject', 'domain_of': ['Triple']} }) + predicate: Optional[str] = Field(None, description="""The relationship type, generally IS or HAS to indicate a breed is defined by having a specific characteristic.""", json_schema_extra = { "linkml_meta": {'alias': 'predicate', 'domain_of': ['Triple']} }) + object: Optional[str] = Field(None, description="""The specific characteristic. This is generally a noun of the characteristic, e.g., with \"blue fin\" the object is \"fin\".""", json_schema_extra = { "linkml_meta": {'alias': 'object', 'domain_of': ['Triple']} }) + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""", json_schema_extra = { "linkml_meta": {'alias': 'qualifier', 'domain_of': ['Triple']} }) + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the breed.""", json_schema_extra = { "linkml_meta": {'alias': 'subject_qualifier', 'domain_of': ['Triple']} }) + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the characteristic. 
This is generally a descriptor of the characteristic, e.g., with \"blue fin\" the qualifier is \"blue\".""", json_schema_extra = { "linkml_meta": {'alias': 'object_qualifier', 'domain_of': ['Triple']} }) + + +# Model rebuild +# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model +ExtractionResult.model_rebuild() +NamedEntity.model_rebuild() +CompoundExpression.model_rebuild() +Triple.model_rebuild() +TextWithTriples.model_rebuild() +TextWithEntity.model_rebuild() +RelationshipType.model_rebuild() +Publication.model_rebuild() +AnnotatorResult.model_rebuild() +NameSet.model_rebuild() +AnimalName.model_rebuild() +BreedName.model_rebuild() +Characteristic.model_rebuild() +Descriptor.model_rebuild() +BreedToCharacteristic.model_rebuild() diff --git a/src/ontogpt/templates/vbo_char.yaml b/src/ontogpt/templates/vbo_char.yaml new file mode 100644 index 000000000..4bc6441cc --- /dev/null +++ b/src/ontogpt/templates/vbo_char.yaml @@ -0,0 +1,124 @@ +id: http://w3id.org/ontogpt/vbo_char +name: vbo_char +title: Extraction Template for Animal Breeds and their Characteristics +description: >- + An extraction template for animal names present in VBO, + along with the characteristics of each breed +license: https://creativecommons.org/publicdomain/zero/1.0/ +prefixes: + rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# + linkml: https://w3id.org/linkml/ + vbo_char: http://w3id.org/ontogpt/vbo_char + vbo: http://purl.obolibrary.org/obo/vbo + +default_prefix: vbo_char +default_range: string + +imports: + - linkml:types + - core + +classes: + NameSet: + tree_root: true + is_a: NamedEntity + attributes: + animal_names: + range: AnimalName + multivalued: true + description: >- + A semicolon-separated list of names of animals + used in the input text. These are general names, + e.g., if any breed of cat is mentioned, this + list should include "Cat breed", or for any pig, + include "Pig breed". 
+ names: + range: BreedName + multivalued: true + description: >- + A semicolon-separated list of names of animal breeds + used in the input text. These should be as specific + as possible about the breed of the animal. + Examples include: + Gimbsheimer Enten, Debao pony, Baixi + characteristics: + range: BreedToCharacteristic + multivalued: true + inlined: true + inlined_as_list: true + description: >- + A semicolon-separated list of names of animal breeds + used in the input text, along with a single characteristic + mentioned for that breed. These should be as specific + as possible about the breed of the animal. The characteristic + may be color, dimensions, physical properties, abilities, + or other features. Each statement should contain just one + pair of breed name and characteristic. + It should be formatted as + "Breed IS Characteristic", or "Breed HAS Characteristic", + e.g., Gimbsheimer Enten IS blue, + Debao pony IS short, + Baixi IS circular + Each additional characteristic for a breed should get + its own statement, e.g., "breed name is tall and wide" + should become "Breed IS tall" and "Breed is wide". + + AnimalName: + is_a: NamedEntity + annotations: + annotators: sqlite:obo:vbo + prompt: >- + The name of a grouping category of vertebrate + animal breeds. + + BreedName: + is_a: NamedEntity + annotations: + annotators: sqlite:obo:vbo + prompt: >- + The name of a vertebrate animal breed. + + Characteristic: + is_a: NamedEntity + annotations: + annotators: sqlite:obo:uberon, sqlite:obo:pato + prompt: >- + The name of a characteristic of an animal. + + Descriptor: + is_a: NamedEntity + annotations: + annotators: sqlite:obo:pato + prompt: >- + A descriptor for a characteristic. + + BreedToCharacteristic: + is_a: Triple + description: >- + A triple in which the subject is an animal breed, + the object is a characteristic, and the predicate + is usually "IS" or "HAS". 
+ slot_usage: + subject: + range: BreedName + object: + description: >- + The specific characteristic. + This is generally a noun of the characteristic, + e.g., with "blue fin" the object is "fin". + range: Characteristic + predicate: + range: NamedEntity + description: >- + The relationship type, generally IS or HAS to indicate a breed + is defined by having a specific characteristic. + subject_qualifier: + range: NamedEntity + description: >- + An optional qualifier or modifier for the breed. + object_qualifier: + range: Descriptor + description: >- + An optional qualifier or modifier for the characteristic. + This is generally a descriptor of the characteristic, + e.g., with "blue fin" the qualifier is "blue". diff --git a/src/ontogpt/templates/vbo_names.py b/src/ontogpt/templates/vbo_names.py new file mode 100644 index 000000000..117cacbcb --- /dev/null +++ b/src/ontogpt/templates/vbo_names.py @@ -0,0 +1,351 @@ +from __future__ import annotations + +import re +import sys +from datetime import ( + date, + datetime, + time +) +from decimal import Decimal +from enum import Enum +from typing import ( + Any, + ClassVar, + Dict, + List, + Literal, + Optional, + Union +) + +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + field_validator +) + + +metamodel_version = "None" +version = "None" + + +class ConfiguredBaseModel(BaseModel): + model_config = ConfigDict( + validate_assignment = True, + validate_default = True, + extra = "forbid", + arbitrary_types_allowed = True, + use_enum_values = True, + strict = False, + ) + pass + + + + +class LinkMLMeta(RootModel): + root: Dict[str, Any] = {} + model_config = ConfigDict(frozen=True) + + def __getattr__(self, key:str): + return getattr(self.root, key) + + def __getitem__(self, key:str): + return self.root[key] + + def __setitem__(self, key:str, value): + self.root[key] = value + + def __contains__(self, key:str) -> bool: + return key in self.root + + +linkml_meta = LinkMLMeta({'default_prefix': 
'vbo_names', + 'default_range': 'string', + 'description': 'An extraction template for animal names present in VBO', + 'id': 'http://w3id.org/ontogpt/vbo_names', + 'imports': ['linkml:types', 'core'], + 'license': 'https://creativecommons.org/publicdomain/zero/1.0/', + 'name': 'vbo_names', + 'prefixes': {'linkml': {'prefix_prefix': 'linkml', + 'prefix_reference': 'https://w3id.org/linkml/'}, + 'rdf': {'prefix_prefix': 'rdf', + 'prefix_reference': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'}, + 'vbo': {'prefix_prefix': 'vbo', + 'prefix_reference': 'http://purl.obolibrary.org/obo/vbo'}, + 'vbo_names': {'prefix_prefix': 'vbo_names', + 'prefix_reference': 'http://w3id.org/ontogpt/ecosim_simple'}}, + 'source_file': '/home/harry/ontogpt/src/ontogpt/templates/vbo_names.yaml', + 'title': 'Extraction Template for Animal Names'} ) + +class NullDataOptions(str, Enum): + UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION" + NOT_APPLICABLE = "NOT_APPLICABLE" + NOT_MENTIONED = "NOT_MENTIONED" + + + +class ExtractionResult(ConfiguredBaseModel): + """ + A result of extracting knowledge on text + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + input_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_id', 'domain_of': ['ExtractionResult']} }) + input_title: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_title', 'domain_of': ['ExtractionResult']} }) + input_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_text', 'domain_of': ['ExtractionResult']} }) + raw_completion_output: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'raw_completion_output', 'domain_of': ['ExtractionResult']} }) + prompt: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'prompt', 'domain_of': ['ExtractionResult']} }) + extracted_object: Optional[Any] = Field(None, 
description="""The complex objects extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'extracted_object', 'domain_of': ['ExtractionResult']} }) + named_entities: Optional[List[Any]] = Field(None, description="""Named entities extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'named_entities', 'domain_of': ['ExtractionResult']} }) + + +class NamedEntity(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. 
Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class CompoundExpression(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + pass + + +class Triple(CompoundExpression): + """ + Abstract parent for Relation Extraction tasks + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject', 'domain_of': ['Triple']} }) + predicate: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'predicate', 'domain_of': ['Triple']} }) + object: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object', 'domain_of': ['Triple']} }) + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""", json_schema_extra = { "linkml_meta": {'alias': 'qualifier', 'domain_of': ['Triple']} }) + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. 
\"high dose\" or \"intravenously administered\"""", json_schema_extra = { "linkml_meta": {'alias': 'subject_qualifier', 'domain_of': ['Triple']} }) + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""", json_schema_extra = { "linkml_meta": {'alias': 'object_qualifier', 'domain_of': ['Triple']} }) + + +class TextWithTriples(ConfiguredBaseModel): + """ + A text containing one or more relations of the Triple type. + """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + triples: Optional[List[Triple]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'triples', 'domain_of': ['TextWithTriples']} }) + + +class TextWithEntity(ConfiguredBaseModel): + """ + A text containing one or more instances of a single type of entity. 
+ """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'domain_of': ['TextWithTriples', 'TextWithEntity']} }) + entities: Optional[List[str]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'entities', 'domain_of': ['TextWithEntity']} }) + + +class RelationshipType(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core', + 'id_prefixes': ['RO', 'biolink']}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. 
Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class Publication(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + id: Optional[str] = Field(None, description="""The publication identifier""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'domain_of': ['NamedEntity', 'Publication']} }) + title: Optional[str] = Field(None, description="""The title of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'title', 'domain_of': ['Publication']} }) + abstract: Optional[str] = Field(None, description="""The abstract of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'abstract', 'domain_of': ['Publication']} }) + combined_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'combined_text', 'domain_of': ['Publication']} }) + full_text: Optional[str] = Field(None, description="""The full text of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'full_text', 'domain_of': ['Publication']} }) + + +class AnnotatorResult(ConfiguredBaseModel): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'}) + + subject_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta":
{'alias': 'subject_text', 'domain_of': ['AnnotatorResult']} }) + object_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_id', 'domain_of': ['AnnotatorResult']} }) + object_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_text', 'domain_of': ['AnnotatorResult']} }) + + +class NameSet(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/vbo_names', 'tree_root': True}) + + animal_names: Optional[List[str]] = Field(None, description="""A semicolon-separated list of names of animals used in the input text. These are general names, e.g., if any breed of cat is mentioned, this list should include \"Cat breed\", or for any pig, include \"Pig breed\".""", json_schema_extra = { "linkml_meta": {'alias': 'animal_names', 'domain_of': ['NameSet']} }) + names: Optional[List[str]] = Field(None, description="""A semicolon-separated list of names of animal breeds used in the input text. These should be as specific as possible about the breed of the animal. 
Examples include: Gimbsheimer Enten, Debao pony, Baixi""", json_schema_extra = { "linkml_meta": {'alias': 'names', 'domain_of': ['NameSet']} }) + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. 
Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class AnimalName(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', 'value': 'sqlite:obo:vbo'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a grouping category of ' + 'vertebrate animal breeds.'}}, + 'from_schema': 'http://w3id.org/ontogpt/vbo_names'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive.
For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +class BreedName(NamedEntity): + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'annotations': {'annotators': {'tag': 'annotators', 'value': 'sqlite:obo:vbo'}, + 'prompt': {'tag': 'prompt', + 'value': 'The name of a vertebrate animal breed.'}}, + 'from_schema': 'http://w3id.org/ontogpt/vbo_names'}) + + id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['this is populated during the grounding and normalization step'], + 'domain_of': ['NamedEntity', 'Publication']} }) + label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label', + 'aliases': ['name'], + 'annotations': {'owl': {'tag': 'owl', + 'value': 'AnnotationProperty, AnnotationAssertion'}}, + 'domain_of': ['NamedEntity'], + 'slot_uri': 'rdfs:label'} }) + original_spans: Optional[List[str]] = Field(None, description="""The
coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans', + 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, + 'comments': ['This is determined during grounding and normalization', + 'But is based on the full input text'], + 'domain_of': ['NamedEntity']} }) + + @field_validator('original_spans') + def pattern_original_spans(cls, v): + pattern=re.compile(r"^\d+:\d+$") + if isinstance(v,list): + for element in v: + if isinstance(element, str) and not pattern.match(element): + raise ValueError(f"Invalid original_spans format: {element}") + elif isinstance(v,str): + if not pattern.match(v): + raise ValueError(f"Invalid original_spans format: {v}") + return v + + +# Model rebuild +# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model +ExtractionResult.model_rebuild() +NamedEntity.model_rebuild() +CompoundExpression.model_rebuild() +Triple.model_rebuild() +TextWithTriples.model_rebuild() +TextWithEntity.model_rebuild() +RelationshipType.model_rebuild() +Publication.model_rebuild() +AnnotatorResult.model_rebuild() +NameSet.model_rebuild() +AnimalName.model_rebuild() +BreedName.model_rebuild() diff --git a/src/ontogpt/templates/vbo_names.yaml b/src/ontogpt/templates/vbo_names.yaml new file mode 100644 index 000000000..433d75620 --- /dev/null +++ b/src/ontogpt/templates/vbo_names.yaml @@ -0,0 +1,57 @@ +id: http://w3id.org/ontogpt/vbo_names +name: vbo_names +title: Extraction Template for Animal Names +description: >- + An extraction template for animal names present in VBO +license: https://creativecommons.org/publicdomain/zero/1.0/ +prefixes: + rdf:
http://www.w3.org/1999/02/22-rdf-syntax-ns# + linkml: https://w3id.org/linkml/ + vbo_names: http://w3id.org/ontogpt/vbo_names + vbo: http://purl.obolibrary.org/obo/vbo + +default_prefix: vbo_names +default_range: string + +imports: + - linkml:types + - core + +classes: + NameSet: + tree_root: true + is_a: NamedEntity + attributes: + animal_names: + range: AnimalName + multivalued: true + description: >- + A semicolon-separated list of names of animals + used in the input text. These are general names, + e.g., if any breed of cat is mentioned, this + list should include "Cat breed", or for any pig, + include "Pig breed". + names: + range: BreedName + multivalued: true + description: >- + A semicolon-separated list of names of animal breeds + used in the input text. These should be as specific + as possible about the breed of the animal. + Examples include: + Gimbsheimer Enten, Debao pony, Baixi + + AnimalName: + is_a: NamedEntity + annotations: + annotators: sqlite:obo:vbo + prompt: >- + The name of a grouping category of vertebrate + animal breeds. + + BreedName: + is_a: NamedEntity + annotations: + annotators: sqlite:obo:vbo + prompt: >- + The name of a vertebrate animal breed.