Skip to content

Commit

Permalink
Fix #204 and the remainder of the HPOA evaluation (#207)
Browse files (browse the repository at this point in the history)
Repair the test for HPOA evaluation, its fixtures, and the associated evaluation functions.
  • Loading branch information
caufieldjh authored Sep 18, 2023
2 parents ba59fc2 + e046924 commit bb45238
Show file tree
Hide file tree
Showing 7 changed files with 1,717 additions and 28 deletions.
26 changes: 12 additions & 14 deletions src/ontogpt/evaluation/hpoa/eval_hpoa.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from random import shuffle
from typing import Dict, Iterable, Iterator, List, Tuple

from oaklib import BasicOntologyInterface, get_implementation_from_shorthand
from oaklib import BasicOntologyInterface, get_adapter
from oaklib.datamodels.search import SearchConfiguration
from oaklib.datamodels.search_datamodel import SearchProperty
from oaklib.interfaces import SearchInterface
Expand All @@ -18,12 +18,10 @@
from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine
from ontogpt.templates.mendelian_disease import MendelianDisease

THIS_DIR = Path(__file__).parent
DATABASE_DIR = Path(__file__).parent / "database"
TEST_CASES_DIR = THIS_DIR / "test_cases"
EXEMPLARS_DIR = THIS_DIR / "exemplars"
EXEMPLAR_CASES = EXEMPLARS_DIR / "drugmechdb-exemplars.yaml"

TEST_CASES_DIR = Path("tests").joinpath("input")
TEST_HPOA_FILE = "test_sample.hpoa.tsv"
NUM_TESTS = 3 # Note: each test requires input text; see provided test cases

DISEASE_ID = str
TERM = str
Expand Down Expand Up @@ -72,18 +70,18 @@ class EvalHPOA(SPIRESEvaluationEngine):

def __post_init__(self):
self.extractor = SPIRESEngine("mendelian_disease.MendelianDisease")
self.mondo = get_implementation_from_shorthand("sqlite:obo:mondo")
self.mondo = get_adapter("sqlite:obo:mondo")

def load_test_cases(self) -> List[MendelianDisease]:
return []

def disease_text(self, id: str):
id = id.replace("OMIM:", "omim-")
with open(TEST_CASES_DIR / f"{id}.txt") as f:
id = id.lower().replace(":", "-")
with open(TEST_CASES_DIR / "cases" / f"{id}.txt") as f:
return f.read()

def parse_hpoa(self) -> Iterator[HPOAnnotation]:
with open(TEST_CASES_DIR / "test.hpoa.tsv") as file:
with open(TEST_CASES_DIR / TEST_HPOA_FILE) as file:
reader = csv.reader(file, delimiter="\t")
for row in reader:
yield HPOAnnotation(
Expand Down Expand Up @@ -155,7 +153,7 @@ def enhance(self, obj: MendelianDisease):
obj.name = mondo.label(entity)
obj.label = obj.name
obj.description = mondo.definition(entity)
obj.subclass_of = list(mondo.hierararchical_parents(entity))
obj.subclass_of = list(mondo.hierarchical_parents(entity))
obj.synonyms = list(mondo.entity_aliases(entity))
for _s, _p, gene in mondo.relationships([entity], ["RO:0004003"]):
gene = (
Expand All @@ -176,15 +174,15 @@ def eval(self, task: str = None, **kwargs) -> EvaluationObjectSetHPOA:
else:
raise ValueError(f"Unknown task {task}")

def eval_against_pubs(self, num_tests=3) -> EvaluationObjectSetHPOA:
def eval_against_pubs(self, num_tests=NUM_TESTS) -> EvaluationObjectSetHPOA:
ke = self.extractor
pmc = PubmedClient()
eos = EvaluationObjectSetHPOA()
eos.test = list(self.diseases_by_publication().values())
eos.training = []
eos.predictions = []
shuffle(eos.test)
for test_case in eos.test[0:num_tests]:
for test_case in eos.test[0:num_tests-1]:
# text = self.disease_text(test_case.id)
if len(test_case.publications) != 1:
raise ValueError(f"Expected 1 publication, got {len(test_case.publications)}")
Expand All @@ -204,7 +202,7 @@ def eval_against_omim_plus_pubs(self, **kwargs) -> EvaluationObjectSetHPOA:
return self.eval_against_omim_or_pubs(use_publications=True)

def eval_against_omim_or_pubs(
self, num_tests=3, use_publications=False
self, num_tests=NUM_TESTS, use_publications=False
) -> EvaluationObjectSetHPOA:
ke = self.extractor
eos = EvaluationObjectSetHPOA()
Expand Down
1 change: 1 addition & 0 deletions tests/input/cases/omim-608716.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Autosomal recessive primary microcephaly-5 (MCPH5) is characterized by decreased occipitofrontal circumference (OFC), usually less than 3 standard deviations (SD) of the mean, present at birth and associated with mental retardation and speech delay. Other features may include short stature or mild seizures. MCPH5 is associated with a simplification of the cerebral cortical gyral pattern in some cases, which is considered within the phenotypic spectrum of primary microcephaly (review by Woods et al., 2005; Saadi et al., 2009; Passemard et al., 2009).
1 change: 1 addition & 0 deletions tests/input/cases/omim-619428.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Focal segmental glomerulosclerosis and neurodevelopmental syndrome (FSGSNEDS) is characterized by global developmental delay and renal dysfunction manifest as proteinuria and nephrotic syndrome apparent from infancy or early childhood. Some patients present with renal disease, whereas others present with developmental delay and develop renal disease later in childhood. Renal biopsy shows focal segmental glomerulosclerosis (FSGS), but the course of the disease is variable: some patients have transient proteinuria and others require renal transplant. Neurodevelopmental features are also variable, with some patients having only mildly impaired intellectual development, and others having a severe developmental disorder associated with early-onset refractory seizures or epileptic encephalopathy. Additional features, including feeding difficulties, poor overall growth, and nonspecific dysmorphic facial features, are commonly observed (summary by Assoum et al., 2018 and Weng et al., 2021).
1 change: 1 addition & 0 deletions tests/input/cases/omim-620038.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Neurodevelopmental disorder with microcephaly, hypotonia, and absent language (NEDMHAL) is a severe autosomal recessive disorder characterized by the constellation of these features. Behavioral problems and hearing loss are also present (Ansar et al., 2020).
Loading

0 comments on commit bb45238

Please sign in to comment.