Skip to content

Commit

Permalink
feat: split all_ontology into individual files. (#93)
Browse files Browse the repository at this point in the history
## Reason for Change

- #81 
- #82 

## Changes

- add semantic_version as a requirement for all_ontology build and
API. It is used to find the latest version of cellxgene_schema.
- modify *tools/ontology-builder/src/all_ontology_generator.py* to
generate a separate file for each ontology in ontology_info.json. By
default all_ontology_generator.py generates json.gz file for the latest
cellxgene_schema version.
- update *artifact-schemas/ontology_info_schema.json* to support split
ontology.json.gz formats
- move `ontologyCategory` to the top level in
*artifact-schemas/all_ontology_schema.json*. This is because each
ontology is saved into its own file, so the terms do not need to be
nested under their ontology name.
- update *artifact-schemas/all_ontology_schema.json* to support multiple
cellxgene schema versions
- nest ontology_info.json properties under a Cellxgene Schema version.
This allows us to support multiple schema versions from
ontology_info.json file.
- It is used by *tools/ontology-builder/src/all_ontology_generator.py*
to generate the ontology artifacts for the current schema version.
- It is used by
*api/python/src/cellxgene_ontology_guide/supported_versions.py* to get
the schema version and determine what ontology versions to use.
- update *api/python/src/cellxgene_ontology_guide/ontology_parser.py* to
use `CXGSchema` to access ontology terms.
- add *api/python/src/cellxgene_ontology_guide/supported_versions.py* to
load the ontologies associated with a schema version.
- add caching of loaded ontology files to avoid excessive unzipping of
ontology files.

## Testing steps

- Add unit tests for all new functionality
- Updated unit tests for existing functionality.
- Verified multiple ontology assets are created
- Verified the API can use the multiple assets.

## Notes for Reviewer
- can artifact_download.py be removed?

---------

Co-authored-by: github-actions <[email protected]>
  • Loading branch information
Bento007 and github-actions authored Mar 6, 2024
1 parent 2512163 commit ead59e5
Show file tree
Hide file tree
Showing 27 changed files with 498 additions and 357 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/generate_all_ontology.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ name: Updates to Ontology Files
on:
push:
paths:
- "**/api/python/src/cellxgene_ontology_guide/artifacts/ontology_info.json"
- "**/artifact-schemas/ontology_info_schema.json"
- "**/artifact-schemas/all_ontology_schema.json"
- "**/ontology-assets/ontology_info.json"
- "**/tools/ontology-builder/src/all_ontology_generator.py"
branches-ignore:
- main

Expand Down Expand Up @@ -37,7 +39,7 @@ jobs:
- name: ontology-processing
run: |
python3 ./tools/ontology-builder/src/all_ontology_generator.py
git add ./api/python/src/cellxgene_ontology_guide/artifacts/all_ontology.json.gz
git add ./ontology-assets/*.json.gz
- name: Commit
run: |
git commit -m "AUTO: update ontologies"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/validate_json_schemas.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,6 @@ jobs:
- name: install requirements
run: |
pip install -r tools/ontology-builder/requirements.txt
- name: validate curated lists
- name: validate json schemas
run: |
python3 ./tools/ontology-builder/src/validate_curated_lists.py
python3 ./tools/ontology-builder/src/validate_json_schemas.py
2 changes: 1 addition & 1 deletion api/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ authors = [
license = { file = "LICENSE" }
readme = "README.md"
requires-python = "~= 3.11"
dependencies = []
dependencies = ["semantic_version==2.8.5"]

[project.optional-dependencies]
test = ["pytest"]
Expand Down
44 changes: 0 additions & 44 deletions api/python/src/cellxgene_ontology_guide/artifact_download.py

This file was deleted.

4 changes: 1 addition & 3 deletions api/python/src/cellxgene_ontology_guide/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,5 @@

PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
DATA_ROOT = os.path.join(PACKAGE_ROOT, "data")
ALL_ONTOLOGY_FILENAME = "all_ontology.json.gz"
ONTOLOGY_FILENAME_SUFFIX = ".json.gz"
ONTOLOGY_INFO_FILENAME = "ontology_info.json"
ONTOLOGY_ASSET_RELEASE_URL = "https://github.com/chanzuckerberg/cellxgene-ontology-guide/releases/download"
SCHEMA_VERSION_TO_ONTOLOGY_ASSET_TAG = {"5.0.0": "ontology-assets-v0.0.1"}
25 changes: 12 additions & 13 deletions api/python/src/cellxgene_ontology_guide/ontology_parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import re
from typing import Any, Dict, List, Union

from artifact_download import load_artifact_by_schema
from constants import ALL_ONTOLOGY_FILENAME, ONTOLOGY_INFO_FILENAME
from entities import Ontology, OntologyFileType, OntologyVariant

from cellxgene_ontology_guide.supported_versions import CXGSchema


class OntologyParser:
"""
Expand All @@ -19,8 +19,7 @@ def __init__(self, schema_version: str):
:param schema_version: str version of the schema to load ontology metadata for
"""
self.ontology_dict = load_artifact_by_schema(schema_version, ALL_ONTOLOGY_FILENAME)
self.supported_ontologies = load_artifact_by_schema(schema_version, ONTOLOGY_INFO_FILENAME)
self.cxg_schema = CXGSchema(version=schema_version)

def _parse_ontology_name(self, term_id: str) -> str:
"""
Expand All @@ -35,7 +34,7 @@ def _parse_ontology_name(self, term_id: str) -> str:
raise ValueError(f"{term_id} does not conform to expected regex pattern {pattern} and cannot be queried.")

ontology_name = term_id.split(":")[0]
if ontology_name not in self.supported_ontologies:
if ontology_name not in self.cxg_schema.supported_ontologies:
raise ValueError(f"{term_id} is not part of a supported ontology, its metadata cannot be fetched.")

return ontology_name
Expand All @@ -52,7 +51,7 @@ def get_term_ancestors(self, term_id: str, include_self: bool = False) -> List[s
:return: flattened List[str] of ancestor terms
"""
ontology_name = self._parse_ontology_name(term_id)
ancestors: List[str] = self.ontology_dict[ontology_name][term_id]["ancestors"]
ancestors: List[str] = self.cxg_schema.ontology(ontology_name)[term_id]["ancestors"]
return ancestors + [term_id] if include_self else ancestors

def get_term_list_ancestors(self, term_ids: str, include_self: bool = False) -> Dict[str, List[str]]:
Expand Down Expand Up @@ -95,7 +94,7 @@ def get_terms_descendants(self, term_ids: List[str], include_self: bool = False)
ontology_names.add(ontology_name)

for ontology in ontology_names:
for candidate_descendant, candidate_metadata in self.ontology_dict[ontology].items():
for candidate_descendant, candidate_metadata in self.cxg_schema.ontology(ontology).items():
for ancestor_id in descendants_dict:
if ancestor_id in candidate_metadata["ancestors"]:
descendants_dict[ancestor_id].append(candidate_descendant)
Expand All @@ -112,7 +111,7 @@ def is_term_deprecated(self, term_id: str) -> bool:
:return: boolean flag indicating whether the term is deprecated
"""
ontology_name = self._parse_ontology_name(term_id)
is_deprecated: bool = self.ontology_dict[ontology_name][term_id].get("deprecated")
is_deprecated: bool = self.cxg_schema.ontology(ontology_name)[term_id].get("deprecated")
return is_deprecated

def get_term_replacement(self, term_id: str) -> Union[str, None]:
Expand All @@ -125,7 +124,7 @@ def get_term_replacement(self, term_id: str) -> Union[str, None]:
:return: replacement str term ID if it exists, None otherwise
"""
ontology_name = self._parse_ontology_name(term_id)
replaced_by: str = self.ontology_dict[ontology_name][term_id].get("replaced_by")
replaced_by: str = self.cxg_schema.ontology(ontology_name)[term_id].get("replaced_by")
return replaced_by if replaced_by else None

def get_term_metadata(self, term_id: str) -> Dict[str, Any]:
Expand All @@ -145,7 +144,7 @@ def get_term_metadata(self, term_id: str) -> Dict[str, Any]:
"""
ontology_name = self._parse_ontology_name(term_id)
return {
key: self.ontology_dict[ontology_name][term_id].get(key, None)
key: self.cxg_schema.ontology(ontology_name)[term_id].get(key, None)
for key in {"comments", "term_tracker", "consider"}
}

Expand All @@ -159,7 +158,7 @@ def get_term_label(self, term_id: str) -> str:
:return: str human-readable label for the term
"""
ontology_name = self._parse_ontology_name(term_id)
label: str = self.ontology_dict[ontology_name][term_id]["label"]
label: str = self.cxg_schema.ontology(ontology_name)[term_id]["label"]
return label

def get_ontology_download_url(
Expand All @@ -178,8 +177,8 @@ def get_ontology_download_url(
:param ontology_variant: OntologyVariant enum of the ontology variant to fetch
:return: str download URL for the requested ontology file
"""
source_url = self.supported_ontologies[ontology.name]["source"]
version = self.supported_ontologies[ontology.name]["version"]
source_url = self.cxg_schema.supported_ontologies[ontology.name]["source"]
version = self.cxg_schema.supported_ontologies[ontology.name]["version"]
return (
f"{source_url}/{version}/{ontology.value}-{ontology_variant.value}.{ontology_filetype.value}"
if ontology_variant
Expand Down
77 changes: 77 additions & 0 deletions api/python/src/cellxgene_ontology_guide/supported_versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import functools
import gzip
import json
import os
from typing import Any, Dict, List, Optional

from constants import DATA_ROOT, ONTOLOGY_FILENAME_SUFFIX, ONTOLOGY_INFO_FILENAME
from semantic_version import Version

from cellxgene_ontology_guide.entities import Ontology


@functools.cache
def load_ontology_file(file_name: str) -> Any:
    """Load a gzipped JSON ontology file from the data directory.

    Results are memoized per ``file_name`` by ``functools.cache``: until the cache is cleared, each file is
    decompressed and parsed only once and every caller receives the same parsed object.

    :param file_name: str name of the gzipped JSON file, relative to DATA_ROOT
    :return: the parsed JSON content of the file
    """
    # JSON is UTF-8 by specification; be explicit so decoding does not depend on the locale's default encoding.
    with gzip.open(os.path.join(DATA_ROOT, file_name), "rt", encoding="utf-8") as f:
        return json.load(f)


def clear_ontology_file_cache() -> None:
    """Drop every memoized result of load_ontology_file so ontology files are re-read on next access."""
    load_ontology_file.cache_clear()


def get_latest_schema_version(versions: List[str]) -> str:
    """Given a list of schema versions, return the latest version.

    Each entry is parsed with ``Version.coerce`` after an optional leading "v" is removed, so the returned
    string is the coerced form of the winning entry and may not be spelled exactly like the input.

    :param versions: List[str] list of schema versions. Versions can be in the format "v5.0.0" or "5.0.0"
    :return: str latest version with a "v" prefix
    :raises ValueError: if versions is empty
    """
    parsed = [Version.coerce(version.removeprefix("v")) for version in versions]
    if not parsed:
        # Fail with an explicit message instead of an IndexError from indexing an empty list.
        raise ValueError("Cannot determine the latest schema version from an empty list of versions.")
    return f"v{max(parsed)}"


def load_supported_versions() -> Any:
    """Load the ontology_info.json file from the data directory.

    :return: the parsed JSON content of ontology_info.json
    """
    # JSON is UTF-8 by specification; be explicit so decoding does not depend on the locale's default encoding.
    with open(os.path.join(DATA_ROOT, ONTOLOGY_INFO_FILENAME), encoding="utf-8") as f:
        return json.load(f)


class CXGSchema:
    """A class to represent the ontology information used by a cellxgene schema version."""

    def __init__(self, version: Optional[str] = None):
        """
        :param version: The schema version to use. It must match a key of ontology_info.json exactly. If not
            provided, the latest schema version will be used.
        :raises ValueError: if the schema version is not a key of ontology_info.json
        """
        ontology_info = load_supported_versions()
        if version is None:
            latest = get_latest_schema_version(list(ontology_info))
            # get_latest_schema_version always returns a "v"-prefixed string but also accepts keys written
            # without the prefix; fall back to the bare form so the lookup below can hit such a key.
            version = latest if latest in ontology_info else latest.removeprefix("v")
        if version not in ontology_info:
            raise ValueError(f"Schema version {version} is not supported in this package version.")

        self.version = version
        self.supported_ontologies = ontology_info[version]
        # Maps ontology name -> ontology file name; filled lazily by ontology().
        self.ontology_file_names: Dict[str, str] = {}

    def ontology(self, name: str) -> Any:
        """Return the ontology terms for the given ontology name. Load from the file cache if available.

        :param name: str name of the ontology to get the terms for
        :return: dict representation of the ontology terms
        :raises ValueError: if name is not an attribute of Ontology, or has no version entry for this schema version
        """
        if name not in self.ontology_file_names:
            if getattr(Ontology, name, None) is None:
                raise ValueError(f"Ontology {name} is not supported in this package version.")

            try:
                onto_version = self.supported_ontologies[name]["version"]
            except KeyError as e:
                raise ValueError(f"Ontology {name} is not supported for schema version {self.version}") from e
            # Remember the file name so later calls go straight to the load_ontology_file cache.
            self.ontology_file_names[name] = f"{name}-ontology-{onto_version}{ONTOLOGY_FILENAME_SUFFIX}"
        return load_ontology_file(self.ontology_file_names[name])
15 changes: 15 additions & 0 deletions api/python/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from unittest.mock import patch

import pytest


@pytest.fixture
def mock_load_supported_versions():
    """Patch load_supported_versions in supported_versions and yield the mock for the test to configure.

    The patch is undone when the test finishes.
    """
    with patch("cellxgene_ontology_guide.supported_versions.load_supported_versions") as mock:
        yield mock


@pytest.fixture
def mock_load_ontology_file():
    """Replace load_ontology_file in supported_versions with a mock for the duration of a test."""
    target = "cellxgene_ontology_guide.supported_versions.load_ontology_file"
    with patch(target) as patched_loader:
        yield patched_loader
94 changes: 0 additions & 94 deletions api/python/tests/test_artifact_download.py

This file was deleted.

Loading

0 comments on commit ead59e5

Please sign in to comment.