Skip to content

Commit

Permalink
Create, cache and use derived schema for 10x faster validation
Browse files Browse the repository at this point in the history
  • Loading branch information
bede committed May 14, 2024
1 parent aada08f commit 90b0017
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 28 deletions.
20 changes: 12 additions & 8 deletions src/primaschema/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,47 +32,51 @@ def hash_ref(ref_path: Path):
print(hex_digest)


def validate(scheme_dir: Path):
def validate(scheme_dir: Path, full: bool = False):
"""
Validate a primer scheme bundle containing info.yml, primer.bed and reference.fasta
:arg scheme_dir: Path of scheme.bed file
:arg out_dir: Path of directory in which to save primer.bed
:arg force: Overwrite existing output files
:arg full: Perform meticulous validation using full model
"""
return lib.validate(scheme_dir)
return lib.validate(scheme_dir, full=full)


def validate_recursive(root_dir: Path, force: bool = False):
def validate_recursive(root_dir: Path, full: bool = False, force: bool = False):
"""
Recursively validate primer scheme bundles in the specified directory
:arg root_dir: Path in which to search for schemes
:arg full: Perform meticulous validation using full model
:arg force: Overwrite existing schemes and ignore hash check failures
"""
lib.validate_recursive(root_dir=root_dir, force=force)
lib.validate_recursive(root_dir=root_dir, full=full, force=force)


def build(scheme_dir: Path, out_dir: Path = Path(), force: bool = False):
def build(scheme_dir: Path, out_dir: Path = Path(), full: bool = False, force: bool = False):
"""
Build a primer scheme bundle containing info.yml, primer.bed and reference.fasta
:arg scheme_dir: Path of input scheme directory
:arg out_dir: Path of directory in which to save scheme
:arg full: Perform meticulous validation using full model
:arg force: Overwrite existing output files
"""
lib.build(scheme_dir=scheme_dir, out_dir=out_dir, force=force)
lib.build(scheme_dir=scheme_dir, out_dir=out_dir, full=full, force=force)


def build_recursive(root_dir: Path, force: bool = False, nested: bool = False):
def build_recursive(root_dir: Path, full: bool = False, force: bool = False, nested: bool = False):
"""
Recursively build primer scheme bundles in the specified directory
:arg root_dir: Path in which to search for schemes
:arg full: Perform meticulous validation using full model
:arg force: Overwrite existing schemes and ignore hash check failures
:arg nested: Build definitions inside a nested dir structure of family/version
"""
lib.build_recursive(root_dir=root_dir, force=force, nested=nested)
lib.build_recursive(root_dir=root_dir, full=full, force=force, nested=nested)


def build_manifest(root_dir: Path, schema_dir: Path = Path(), out_dir: Path = Path()):
Expand Down
52 changes: 36 additions & 16 deletions src/primaschema/lib.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import hashlib
import importlib.util
import json
import logging
import os
Expand All @@ -14,10 +15,12 @@
import pandas as pd
import yaml
from Bio import SeqIO
# from linkml.generators.pydanticgen import PydanticGenerator
from linkml.generators.pythongen import PythonGenerator
from linkml.validators import JsonSchemaDataValidator
from linkml_runtime.utils.schemaview import SchemaView

from primaschema.util import run


SCHEME_BED_FIELDS = ["chrom", "chromStart", "chromEnd", "name", "poolName", "strand"]
PRIMER_BED_FIELDS = SCHEME_BED_FIELDS + ["sequence"]
Expand All @@ -33,12 +36,21 @@ def scan(path):
yield entry


def import_class_from_path(file_path, class_name="PrimerScheme"):
    """
    Execute a Python source file as a module and return one attribute from it.

    The module is built from an import spec named after ``class_name`` and is
    executed on every call.

    :arg file_path: Path of the Python file to load
    :arg class_name: Name of the class to fetch from the loaded module
    :returns: The attribute called ``class_name`` in the loaded module
    :raises ImportError: If no import spec can be created for ``file_path``
    """
    module_spec = importlib.util.spec_from_file_location(class_name, file_path)
    if module_spec is not None:
        loaded_module = importlib.util.module_from_spec(module_spec)
        module_spec.loader.exec_module(loaded_module)
        return getattr(loaded_module, class_name)
    raise ImportError(f"Failed to load schema from {file_path}")


def get_primer_schemes_path():
"""Locate primer-schemes repo root using environment variable"""
env_var = "PRIMER_SCHEMES_PATH"
if (
not env_var in os.environ
or not (Path(os.environ[env_var]).resolve() / "schema").exists()
or not (Path(os.environ[env_var]).resolve() / Path("schema") / Path("primer_scheme.yml")).exists()
):
raise RuntimeError(
f'Invalid or unset environment variable {env_var} ({os.environ.get(env_var)}).\n\nSet {env_var} to the path of a local copy of the primer-schemes repo to proceed. For example, do `git clone https://github.com/pha4ge/primer-schemes` followed by `export {env_var}="/path/to/primer-schemes"`'
Expand Down Expand Up @@ -190,15 +202,22 @@ def validate_yaml_with_json_schema(yaml_path: Path, schema_path: Path):
return jsonschema.validate(yaml_data, schema=schema)


def validate_with_linkml_schema(yaml_path: Path, schema_path: Path):
schema_view = SchemaView(schema_path)
schema_gen = PythonGenerator(schema_view.schema)
schema_compiled = schema_gen.compile_module()
def validate_with_linkml_schema(yaml_path: Path, full: bool = False):
data = parse_yaml(yaml_path)
data_instance = schema_compiled.PrimerScheme(**data)
# print(yaml_dumper.dumps(data_instance))
validator = JsonSchemaDataValidator(schema_view.schema)
validator.validate_object(data_instance)
schema_path = get_primer_schemes_path() / "schema/primer_scheme.yml"
pythonised_schema_path = get_primer_schemes_path() / "schema/primer_scheme.py"
if full:
schema_view = SchemaView(schema_path)
schema_gen = PythonGenerator(schema_view.schema)
schema_compiled = schema_gen.compile_module()
data_instance = schema_compiled.PrimerScheme(**data)
else:
if not pythonised_schema_path.exists():
run(f"gen-python {schema_path} > {pythonised_schema_path}")
logging.info(f"Wrote Pythonised schema to {pythonised_schema_path}")
print(run("ls").stdout)
PrimerScheme = import_class_from_path(pythonised_schema_path)
data_instance = PrimerScheme(**data)


def validate_bed(bed_path: Path, bed_type=Literal["primer", "scheme"]):
Expand Down Expand Up @@ -235,7 +254,7 @@ def infer_bed_type(bed_path: Path) -> str:
return bed_type


def validate(scheme_dir: Path, force: bool = False):
def validate(scheme_dir: Path, full: bool = False, force: bool = False):
# schema_path = get_primer_schemes_path() / "schema/scheme_schema.latest.json"
logging.info(f"Validating {scheme_dir}")
validate_bed(scheme_dir / "primer.bed", bed_type="primer")
Expand All @@ -244,7 +263,8 @@ def validate(scheme_dir: Path, force: bool = False):
# )
schema_path = get_primer_schemes_path() / "schema/primer_scheme.yml"
validate_with_linkml_schema(
yaml_path=scheme_dir / "info.yml", schema_path=schema_path
yaml_path=scheme_dir / "info.yml",
full=full
)
scheme = parse_yaml(scheme_dir / "info.yml")
existing_primer_checksum = scheme.get("primer_checksum")
Expand Down Expand Up @@ -292,15 +312,15 @@ def validate_recursive(root_dir: Path, force: bool = False):


def build(
scheme_dir: Path, out_dir: Path = Path(), force: bool = False, nested: bool = True
scheme_dir: Path, out_dir: Path = Path(), full: bool = False, force: bool = False, nested: bool = True
):
"""
Build a PHA4GE primer scheme bundle.
Given a directory path containing info.yml, reference.fasta, and either
primer.bed or reference.bed, generate a directory containing info.yml including
primer and reference checksums and a canonical primer.bed representation.
"""
validate(scheme_dir=scheme_dir, force=force)
validate(scheme_dir=scheme_dir, full=full, force=force)
scheme = parse_yaml(scheme_dir / "info.yml")
if nested:
family = Path(scheme["name"].partition("-")[0])
Expand All @@ -327,7 +347,7 @@ def build(
os.remove("scheme.bed")


def build_recursive(root_dir: Path, force: bool = False, nested: bool = False):
def build_recursive(root_dir: Path, full: bool = False, force: bool = False, nested: bool = False):
"""Build all schemes in a directory tree"""
schemes_paths = {}
for entry in scan(root_dir):
Expand All @@ -336,7 +356,7 @@ def build_recursive(root_dir: Path, force: bool = False, nested: bool = False):
scheme_dir = Path(entry.path).parent
schemes_paths[scheme.get("name")] = scheme_dir
for scheme, path in schemes_paths.items():
build(scheme_dir=path, force=force)
build(scheme_dir=path, full=full, force=force)


def build_manifest(root_dir: Path, schema_dir: Path, out_dir: Path = Path()):
Expand Down
6 changes: 6 additions & 0 deletions src/primaschema/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import subprocess

def run(cmd, cwd="./"):
    """
    Run a command string through the shell and return the completed process.

    Written as a helper for CLI testing. Standard output and standard error are
    captured as text, and a non-zero exit status raises
    ``subprocess.CalledProcessError`` because ``check=True`` is set.

    :arg cmd: Command line handed to the shell
    :arg cwd: Working directory in which the command is executed
    """
    options = dict(cwd=cwd, shell=True, check=True, text=True, capture_output=True)
    return subprocess.run(cmd, **options)
16 changes: 12 additions & 4 deletions test/test_all.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import os
import subprocess

from pathlib import Path

import pytest

import primaschema.lib as lib


data_dir = Path("test/data")
schema_dir = Path(os.environ["PRIMER_SCHEMES_PATH"]).resolve() / "schema"

Expand Down Expand Up @@ -53,17 +55,22 @@ def test_artic_v41_scheme_hash_matches_primer_hash():
assert scheme_bed_hash == primer_bed_hash


def test_eden_v1_schema_full():
    """Check the Eden v1 info.yml with full=True, i.e. the full-model validation path."""
    info_yml = data_dir / "primer-schemes/eden/v1/info.yml"
    lib.validate_with_linkml_schema(info_yml, full=True)


def test_eden_v1_schema():
lib.validate_with_linkml_schema(
data_dir / "primer-schemes/eden/v1/info.yml",
schema_dir / "primer_scheme.yml",
)


def test_artic_v41_schema():
lib.validate_with_linkml_schema(
data_dir / "primer-schemes/artic/v4.1/info.yml",
schema_dir / "primer_scheme.yml",
)


Expand Down Expand Up @@ -108,11 +115,12 @@ def test_build_from_scheme_bed():

def test_build_recursive():
lib.build_recursive(data_dir / "primer-schemes", force=True)
run("rm -rf built", cwd="./")
run("rm -rf built")


def test_build_manifest():
lib.build_manifest(root_dir=data_dir / "primer-schemes", schema_dir=schema_dir)
run("rm -rf built index.yml", cwd="./")


def test_primer_bed_to_scheme_bed():
Expand Down Expand Up @@ -156,4 +164,4 @@ def test_print_intervals():

assert (
"""MN908947.3\t29452\t29854\tSARS-CoV-2_99\n""" in run_cmd.stdout
)
)

0 comments on commit 90b0017

Please sign in to comment.