diff --git a/.guix-run b/.guix-run new file mode 100644 index 00000000..93907e24 --- /dev/null +++ b/.guix-run @@ -0,0 +1,20 @@ +#! /bin/sh +# +# This script sets up a Guix container. Make sure guix is in the path +# - after installing Guix (on Debian). +# +# Note that pyshex etc are part of the guix-bioinformatics channel at +# +# https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics + +env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ guix environment -C guix --ad-hoc git python python-pyyaml python-pycurl python-magic nss-certs python-pyshex --network openssl python-schema-salad python-pytest which less vim python-toml python-iniconfig python-tox python-mypy python-pylint + +# Once in the shell you can do +# --- run tests (takes 1 minute, skips lint and mypy) +# tox +# --- install and run +# python3 setup.py install --user +# ~/.local/bin/schema-salad-tool --help +# --- Example +# ~/.local/bin/schema-salad-tool --debug --print-rdf schema_salad/tests/data/pubseq/pubseq-schema.yml schema_salad/tests/data/pubseq/MW084447.1.json schema_salad/tests/data/pubseq/MW343767.1.json + diff --git a/README.rst b/README.rst index d4287705..e7fe4111 100644 --- a/README.rst +++ b/README.rst @@ -10,15 +10,21 @@ Schema Salad ------------ -Salad is a schema language for describing JSON or YAML structured -linked data documents. Salad schema describes rules for -preprocessing, structural validation, and hyperlink checking for -documents described by a Salad schema. Salad supports rich data -modeling with inheritance, template specialization, object -identifiers, object references, documentation generation, code -generation, and transformation to RDF_. Salad provides a bridge -between document and record oriented data modeling and the Semantic -Web. +Schema Salad is a schema language for YAML (or JSON) that also lets +you map your YAML data structures into RDF structured linked data +documents via JSON-LD. 
In other words, a schema validates and +transforms YAML or JSON documents into structured linked data +documents - the missing link between a NoSQL JSON document and a +linked data document that can be reasoned about, e.g. for human and +machine processing. + +Salad schema describes rules for preprocessing, structural validation, +and hyperlink checking for documents described by a Salad +schema. Salad supports rich data modeling with inheritance, template +specialization, object identifiers, object references, documentation +generation, code generation, and transformation to RDF_. Salad +provides a bridge between document and record oriented data modeling +and the Semantic Web. The Schema Salad library is Python 3.6+ only. @@ -63,6 +69,14 @@ Validate a document using a schema:: $ schema-salad-tool myschema.yml mydocument.yml +Validate a JSON document using a schema:: + + $ schema-salad-tool myschema.yml mydocument.json + +Multiple documents and (lazy) expansion can be used:: + + $ schema-salad-tool myschema.yml 'my*.yml' + Generate HTML documentation:: $ schema-salad-tool myschema.yml > myschema.html diff --git a/schema_salad/main.py b/schema_salad/main.py index 399ed1a8..7f282630 100644 --- a/schema_salad/main.py +++ b/schema_salad/main.py @@ -1,6 +1,7 @@ """Command line interface to schema-salad.""" import argparse +import glob import logging import os import sys @@ -200,7 +201,7 @@ def main(argsl: Optional[List[str]] = None) -> int: ) parser.add_argument("schema", type=str, nargs="?", default=None) - parser.add_argument("document", type=str, nargs="?", default=None) + parser.add_argument("documents", nargs="*", default=None) parser.add_argument( "--version", "-v", action="store_true", help="Print version", default=None ) @@ -263,12 +264,15 @@ def main(argsl: Optional[List[str]] = None) -> int: makedoc(args) return 0 + # Expand globs (sorted, for stable output) and flatten; a name with no match (URL, typo) is kept so loading reports it + args.documents = [item for sublist in map(lambda fn: sorted(glob.glob(fn)) or [fn], 
args.documents) for item in sublist] + # Optionally print the schema after ref resolution - if not args.document and args.print_pre: + if not args.documents and args.print_pre: json_dump(schema_doc, fp=sys.stdout, indent=4) return 0 - if not args.document and args.print_index: + if not args.documents and args.print_index: json_dump(list(metaschema_loader.idx.keys()), fp=sys.stdout, indent=4) return 0 @@ -344,7 +348,7 @@ def main(argsl: Optional[List[str]] = None) -> int: rdfs.serialize(destination=stdout(), format=args.rdf_serializer) return 0 - if args.print_metadata and not args.document: + if args.print_metadata and not args.documents: json_dump(schema_metadata, fp=sys.stdout, indent=4) return 0 @@ -357,25 +361,36 @@ def main(argsl: Optional[List[str]] = None) -> int: return 0 # If no document specified, all done. - if not args.document: + if not args.documents: print(f"Schema `{args.schema}` is valid") return 0 - # Load target document and resolve refs - try: - uri = args.document - document, doc_metadata = document_loader.resolve_ref( - uri, strict_foreign_properties=args.strict_foreign_properties - ) - except ValidationException as e: - msg = to_one_line_messages(e) if args.print_oneline else str(e) - _logger.error( - "Document `%s` failed validation:\n%s", - args.document, - msg, - exc_info=args.debug, - ) - return 1 + # Load target document and resolve refs. Note that this can now + # take multiple document files. 
doc_metadata only returns the + # metadata for the last document as they should be the same + document = [] + ids = {} # check for duplicate use of document id as it creates + # unpredictable output + for uri in args.documents: + try: + document1, doc_metadata = document_loader.resolve_ref( + uri, strict_foreign_properties=args.strict_foreign_properties + ) + if "id" in document1: + doc_id = document1["id"] + if doc_id in ids: + raise ValidationException(f"Document id {doc_id} is duplicated in {uri}!") + ids[doc_id] = True + document.append(document1) + except ValidationException as e: + msg = to_one_line_messages(e) if args.print_oneline else str(e) + _logger.error( + "Document `%s` failed validation:\n%s", + uri, + msg, + exc_info=args.debug, + ) + return 1 # Optionally print the document after ref resolution if args.print_pre: @@ -397,13 +412,13 @@ def main(argsl: Optional[List[str]] = None) -> int: ) except ValidationException as e: msg2 = to_one_line_messages(e) if args.print_oneline else str(e) - _logger.error(f"While validating document `{args.document}`:\n{msg2}") + _logger.error(f"While validating document `{args.documents}`:\n{msg2}") return 1 # Optionally convert the document to RDF if args.print_rdf: if isinstance(document, (Mapping, MutableSequence)): - printrdf(args.document, document, schema_ctx, args.rdf_serializer) + printrdf(args.documents, document, schema_ctx, args.rdf_serializer) return 0 else: print("Document must be a dictionary or list.") @@ -413,7 +428,7 @@ def main(argsl: Optional[List[str]] = None) -> int: json_dump(doc_metadata, fp=sys.stdout, indent=4) return 0 - print(f"Document `{args.document}` is valid") + print(f"Document `{args.documents}` is valid") return 0 diff --git a/schema_salad/tests/data/pubseq/MW084447.1.json b/schema_salad/tests/data/pubseq/MW084447.1.json new file mode 100644 index 00000000..b2346010 --- /dev/null +++ b/schema_salad/tests/data/pubseq/MW084447.1.json @@ -0,0 +1,46 @@ +{ + "id": "placeholder", + "update_date": 
"2020-11-12", + "host": { + "host_species": "http://purl.obolibrary.org/obo/NCBITaxon_9606" + }, + "sample": { + "sample_id": "MW084447.1", + "database": "https://www.ncbi.nlm.nih.gov/genbank/", + "source_database_accession": [ + "http://identifiers.org/insdc/MW084447.1#sequence" + ], + "collection_date": "2020-04-14", + "original_collection_location": "USA", + "collection_location": "http://www.wikidata.org/entity/Q23337", + "country": "USA", + "place": "Salt Lake City" + }, + "virus": { + "virus_strain": "SARS-CoV-2/human/USA/UT-02140/2020", + "virus_species": "http://purl.obolibrary.org/obo/NCBITaxon_2697049" + }, + "technology": { + "alignment_protocol": "bwa v. 0.7.17-r1188", + "sample_sequencing_technology": [ + "http://purl.obolibrary.org/obo/OBI_0000759" + ], + "assembly_method": "http://purl.obolibrary.org/obo/GENEPIO_0002028" + }, + "submitter": { + "authors": [ + "Young,E.L.", + "Oakeson,K.", + "Sangster,A.", + "Hirschi,B.", + "Butz,H." + ], + "submitter_name": [ + " Utah Public Health Laboratory" + ], + "submitter_address": "Utah Public Health Laboratory Infectious Disease submission group, 4431 S 2700 W, Salt Lake City, UT 84129, USA" + }, + "warnings": [ + "Missing specimen_source" + ] +} \ No newline at end of file diff --git a/schema_salad/tests/data/pubseq/MW343767.1.json b/schema_salad/tests/data/pubseq/MW343767.1.json new file mode 100644 index 00000000..4b766d01 --- /dev/null +++ b/schema_salad/tests/data/pubseq/MW343767.1.json @@ -0,0 +1,53 @@ +{ + "id": "placeholder1", + "update_date": "2020-12-08", + "host": { + "host_species": "http://purl.obolibrary.org/obo/NCBITaxon_9606" + }, + "sample": { + "sample_id": "MW343767.1", + "database": "https://www.ncbi.nlm.nih.gov/genbank/", + "source_database_accession": [ + "http://identifiers.org/insdc/MW343767.1#sequence" + ], + "collection_date": "2020-10-27", + "specimen_source": [ + "http://purl.obolibrary.org/obo/NCIT_C155831" + ], + "original_collection_location": "USA", + "collection_location": 
"http://www.wikidata.org/entity/Q23556", + "country": "USA", + "place": "Atlanta" + }, + "virus": { + "virus_strain": "SARS-CoV-2/human/USA/GA-CDC-7701/2020", + "virus_species": "http://purl.obolibrary.org/obo/NCBITaxon_2697049" + }, + "technology": { + "alignment_protocol": "freebayes v. 1.3 ", + "sample_sequencing_technology": [ + "http://purl.obolibrary.org/obo/OBI_0000759" + ], + "assembly_method": "http://purl.obolibrary.org/obo/GENEPIO_0002028" + }, + "submitter": { + "authors": [ + "Li,Y.", + "Tao,Y.", + "Zhang,J.", + "Queen,K.", + "Uehara,A.", + "Cook,P.", + "Paden,C.R.", + "Wang,H.", + "Tong,S." + ], + "submitter_name": [ + " Respiratory Viruses Branch" + ], + "submitter_address": "Centers for Disease Control and Prevention, 1600 Clifton Rd, Atlanta, GA 30329, USA" + }, + "warnings": [ + + ] +} diff --git a/schema_salad/tests/data/pubseq/pubseq-schema.yml b/schema_salad/tests/data/pubseq/pubseq-schema.yml new file mode 100644 index 00000000..e1947c1a --- /dev/null +++ b/schema_salad/tests/data/pubseq/pubseq-schema.yml @@ -0,0 +1,326 @@ +$base: http://biohackathon.org/bh20-seq-schema +$namespaces: + cc: https://creativecommons.org/ns# + dc: http://purl.org/metadata/dublin_core_elements# + sch: https://schema.org/ + efo: http://www.ebi.ac.uk/efo/ + obo: http://purl.obolibrary.org/obo/ + sio: http://semanticscience.org/resource/ + edam: http://edamontology.org/ + # Vocabulary for clinical care, translational and basic research, + # and public information and administrative activities + evs: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# + # Software Ontology + swo: http://www.ebi.ac.uk/swo/ + +$graph: + +- name: licenseSchema + type: record + fields: + license_type: + doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf + type: string + jsonldPredicate: + _id: https://creativecommons.org/ns#License + _type: "@id" + noLinkCheck: true + title: + doc: Attribution title related to data license + type: string? 
+ jsonldPredicate: + _id: http://purl.org/metadata/dublin_core_elements#Title + attribution_name: + doc: Attribution NAME related to data license + type: string? + jsonldPredicate: + _id: https://creativecommons.org/ns#attributionName + attribution_url: + doc: Attribution URL related to data license + type: string? + jsonldPredicate: + _id: https://creativecommons.org/ns#attributionURL + _type: "@id" + noLinkCheck: true + attribution_source: + doc: Attribution source URL related to data license + type: string? + jsonldPredicate: + _id: https://creativecommons.org/ns#attributionSource + _type: "@id" + noLinkCheck: true + +- name: hostSchema + type: record + fields: + host_species: + doc: Host species as defined in NCBITaxon, e.g. http://purl.obolibrary.org/obo/NCBITaxon_9606 for Homo sapiens + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 + _type: "@id" + noLinkCheck: true + host_id: + doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 + host_sex: + doc: Sex of the host as defined in PATO, expect Male (http://purl.obolibrary.org/obo/PATO_0000384) or Female (http://purl.obolibrary.org/obo/PATO_0000383) or in Intersex (http://purl.obolibrary.org/obo/PATO_0001340) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/PATO_0000047 + _type: "@id" + noLinkCheck: true + host_age: + doc: Age of the host as number (e.g. 50) + type: int? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/PATO_0000011 + host_age_unit: + doc: Unit of host age e.g. http://purl.obolibrary.org/obo/UO_0000036 + type: string? 
+ jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C42574 + _type: "@id" + noLinkCheck: true + host_health_status: + doc: A condition or state at a particular time, must be one of the following (obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C25688 + _type: "@id" + noLinkCheck: true + host_treatment: + doc: Process in which the act is intended to modify or alter host status + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000727 + host_vaccination: + doc: List of vaccines given to the host + type: string[]? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/VO_0000002 + ethnicity: + doc: Ethinicity of the host e.g. http://purl.obolibrary.org/obo/HANCESTRO_0010 + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001014 + _type: "@id" + noLinkCheck: true + additional_host_information: + doc: Field for additional host information + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + +- name: sampleSchema + type: record + fields: + sample_id: + doc: Unique sample identifier as defined by the submitter + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 + collection_date: + doc: Date when the sample was taken + type: string + jsonldPredicate: + _id: https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&ns=ncit&code=C164024 + collection_location: + doc: Geographical location where the sample was collected as wikidata reference, e.g. http://www.wikidata.org/entity/Q148 (China) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/GAZ_00000448 + _type: "@id" + noLinkCheck: true + original_collection_location: + doc: Original geographical location where the sample was collected as a string + type: string? 
+ jsonldPredicate: + _id: http://purl.obolibrary.org/obo/GAZ_00000448 + country: + doc: Original country location as text + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001627 + place: + doc: Original place as text + type: string? + jsonldPredicate: + # Geographical region + _id: http://purl.obolibrary.org/obo/GEO_000000372 + collector_name: + doc: Name of the person that took the sample + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001895 + collecting_institution: + doc: Institute that was responsible for sampling + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C41206 + specimen_source: + doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155831 (=nasopharyngeal swab) + type: string[]? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001479 + _type: "@id" + noLinkCheck: true + sample_storage_conditions: + doc: Information about storage of a specified type, e.g. frozen specimen, paraffin, fresh .... + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001472 + additional_collection_information: + doc: Add additional comment about the circumstances that a sample was taken + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + database: + doc: URI to database, e.g. https://www.ncbi.nlm.nih.gov/genbank/ + type: string? + jsonldPredicate: + _id: https://schema.org/maintainer + source_database_accession: + doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here. Please use a resolveable URL (e.g. http://identifiers.org/insdc/LC522350.1#sequence) + type: string[]? + jsonldPredicate: + _id: http://edamontology.org/data_2091 + _type: "@id" + noLinkCheck: true + +- name: virusSchema + type: record + fields: + virus_species: + doc: The name of virus species from the NCBI taxonomy database, e.g. 
http://purl.obolibrary.org/obo/NCBITaxon_2697049 for Severe acute respiratory syndrome coronavirus 2 + type: string + jsonldPredicate: + _id: http://edamontology.org/data_1875 + _type: "@id" + noLinkCheck: true + virus_strain: + doc: Name of the virus strain + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_010055 + +- name: technologySchema + type: record + fields: + sample_sequencing_technology: + doc: Technology device that was used to sequence the sample (e.g Sanger, Nanopore MiniION) + type: string[] + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C153598 + _type: "@id" + noLinkCheck: true + assembly_method: + doc: Assembly method refers to how the reads were assembled into contigs for which either a de novo (http://purl.obolibrary.org/obo/GENEPIO_0001628) or mapping/reference based (http://purl.obolibrary.org/obo/GENEPIO_0002028) strategy is used. + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/GENEPIO_0000090 + _type: "@id" + alignment_protocol: + doc: Protocol which provides detailed instructions to obtain the assembly + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0004917 + sequencing_coverage: + doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. [100]) - if multiple technologies were used multiple float values can be submitted e.g. [100, 20] + type: double[]? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/FLU_0000848 + additional_technology_information: + doc: Field for additional technology information + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + +- name: submitterSchema + type: record + fields: + authors: + doc: Name(s) of the author(s) of the sequence data in the scientific publication + type: string[] + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C42781 + submitter_name: + doc: Name of the submitter(s) of the sequence data + type: string[]? 
+ jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000116 + submitter_address: + doc: Address of the submitter of the sequence data + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000172 + originating_lab: + doc: Laboratory name or identifier where the sample to sequence was produced + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C37984 + lab_address: + doc: Address of the laboratory where the sample was produced + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C25407 + provider: + doc: Name or identifier of the provider of the sample + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C37900 + submitter_sample_id: + doc: Identifier given to the sample by the submitter + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C164332 + publication: + doc: Reference to the scientifc publication of the sequence (e.g. DOI, pubmed ID, ...) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C19026 + submitter_orcid: + # An identifier curated by ORCID, Inc. to denote some academic author + doc: ORCID of the submitter as a full URI, e.g. https://orcid.org/0000-0002-1825-0097 + type: string[]? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/APOLLO_SV_00000496 + _type: "@id" + noLinkCheck: true + additional_submitter_information: + doc: Field for additional submitter information + type: string? 
+ jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + + +- name: MainSchema + type: record + documentRoot: true + fields: + host: hostSchema + sample: sampleSchema + virus: virusSchema + technology: technologySchema + submitter: submitterSchema + license: ["null", licenseSchema] + id: + doc: The subject (eg the fasta/fastq file) that the metadata describes + type: string + jsonldPredicate: + _id: "@id" + _type: "@id" + noLinkCheck: true + update_date: + doc: Date when record was last updated + type: string + jsonldPredicate: + _id: https://schema.org/dateModified + warnings: + doc: List processing warnings An error correction is a data transformation objective where the aim is to remove (correct for) erroneous contributions arising from the input data, or the transformation itself. + type: string[]? + jsonldPredicate: + _id: http://www.ebi.ac.uk/swo/objective/SWO_7000012 + _type: string? diff --git a/schema_salad/tests/test_examples.py b/schema_salad/tests/test_examples.py index 1bc389e8..a8c5a6f3 100644 --- a/schema_salad/tests/test_examples.py +++ b/schema_salad/tests/test_examples.py @@ -41,7 +41,7 @@ def test_self_validate() -> None: path = get_data("metaschema/metaschema.yml") assert path assert 0 == schema_salad.main.main(argsl=[path]) - assert 0 == schema_salad.main.main(argsl=[path, path]) + assert 1 == schema_salad.main.main(argsl=[path, path]) # passing in 2 schemas should throw an error def test_print_rdf(capfdbinary: CaptureFixture[bytes]) -> None: @@ -512,3 +512,13 @@ def test_nullable_links() -> None: ra, _ = ldr.resolve_all(cmap({"link": None}), "http://example.com", checklinks=True) assert {"link": None} == ra + +def test_pubseq_multidoc_example() -> None: + schema = get_data("tests/data/pubseq/pubseq-schema.yml") + doc1 = get_data("tests/data/pubseq/MW084447.1.json") + doc2 = get_data("tests/data/pubseq/MW343767.1.json") + assert schema # each fixture path must resolve before main() is exercised + assert doc1 + assert doc2 + assert 0 == schema_salad.main.main(argsl=[schema,doc1]) # one schema + one document: main() returns 0 + 
assert 0 == schema_salad.main.main(argsl=[schema,doc1,doc2]) # one schema + two documents (ids "placeholder" / "placeholder1"): main() returns 0