diff --git a/scripts/after_download.sh b/scripts/after_download.sh
index f1da15a8..9e50166f 100755
--- a/scripts/after_download.sh
+++ b/scripts/after_download.sh
@@ -16,3 +16,6 @@ tar xfO data/monarch/kg-phenio.tar.gz merged-kg_nodes.tsv | grep ^NCBITaxon | cu
 
 # Repair Orphanet prefixes in MONDO sssom rows as necessary
 sed -i 's/\torphanet.ordo\:/\tOrphanet\:/g' data/monarch/mondo.sssom.tsv
+
+# Repair mesh: prefixes in MONDO sssom rows as necessary
+sed -i 's@mesh:@MESH:@g' data/monarch/mondo.sssom.tsv
diff --git a/src/monarch_ingest/cli_utils.py b/src/monarch_ingest/cli_utils.py
index e36a7e8c..656f9089 100644
--- a/src/monarch_ingest/cli_utils.py
+++ b/src/monarch_ingest/cli_utils.py
@@ -9,6 +9,7 @@
 from biolink_model.datamodel import model  # import the pythongen biolink model to get the version
 from linkml_runtime import SchemaView
 from linkml.utils.helpers import convert_to_snake_case
+import requests
 
 # from loguru import logger
 import pandas
@@ -46,6 +47,28 @@ def transform_one(
         # if log: logger.removeHandler(fh)
         raise ValueError(f"{ingest} is not a valid ingest - see ingests.yaml for a list of options")
 
+    # Pass-through ingests list one or more urls instead of a config: download each
+    # file into the transform output directory and skip the config-driven transform.
+    if "url" in ingests[ingest]:
+        transform_dir = Path(output_dir) / "transform_output"
+        # open() below does not create missing directories, so make sure it exists
+        transform_dir.mkdir(parents=True, exist_ok=True)
+        for url in ingests[ingest]["url"]:
+            target = transform_dir / url.split("/")[-1]
+
+            # keep an already-downloaded file unless a refresh is forced
+            if target.is_file() and not force:
+                continue
+
+            # a timeout keeps a stalled connection from hanging the whole run
+            response = requests.get(url, allow_redirects=True, timeout=300)
+            # fail loudly instead of saving an HTTP error page as a .tsv, which the
+            # is_file() check above would then treat as a finished download
+            response.raise_for_status()
+            with open(target, "wb") as f:
+                f.write(response.content)
+        return
+
     source_file = Path(Path(__file__).parent, ingests[ingest]["config"])
 
     if not Path(source_file).is_file():
diff --git a/src/monarch_ingest/ingests.yaml b/src/monarch_ingest/ingests.yaml
index 2745d6dc..44d5b642 100644
--- a/src/monarch_ingest/ingests.yaml
+++ b/src/monarch_ingest/ingests.yaml
@@ -1,9 +1,31 @@
+## Pass-through modular ingests
+alliance_genotype:
+  url:
+    - 'https://github.com/monarch-initiative/alliance-genotype-ingest/releases/latest/download/alliance_genotype_nodes.tsv'
+alliance_phenotype:
+  url:
+    - 'https://github.com/monarch-initiative/alliance-phenotype-association-ingest/releases/latest/download/alliance_phenotype_edges.tsv'
+alliance_disease_association:
+  url:
+    - 'https://github.com/monarch-initiative/alliance-disease-association-ingest/releases/latest/download/alliance_disease_edges.tsv'
+zfin_genotype_to_phenotype:
+  url:
+    - 'https://github.com/monarch-initiative/zfin-genotype-to-phenotype-ingest/releases/latest/download/zfin_genotype_to_phenotype_edges.tsv'
+clingen_variant:
+  url:
+    - 'https://github.com/monarch-initiative/clingen-ingest/releases/latest/download/clingen_variant_nodes.tsv'
+    - 'https://github.com/monarch-initiative/clingen-ingest/releases/latest/download/clingen_variant_edges.tsv'
+clinvar_variant:
+  url:
+    - 'https://github.com/monarch-initiative/clinvar-ingest/releases/latest/download/clinvar_variant_nodes.tsv'
+    - 'https://github.com/monarch-initiative/clinvar-ingest/releases/latest/download/clinvar_variant_edges.tsv'
+
+## Ingests within this repository
+
 alliance_gene:
   config: 'ingests/alliance/gene.yaml'
 alliance_gene_to_expression:
   config: 'ingests/alliance/gene_to_expression.yaml'
-alliance_gene_to_phenotype:
-  config: 'ingests/alliance/gene_to_phenotype.yaml'
 # alliance_publication:
 #   config: 'ingests/alliance/publication.yaml'
 bgee_gene_to_expression:
@@ -64,3 +86,4 @@ zfin_gene_to_phenotype:
   config: 'ingests/zfin/gene_to_phenotype.yaml'
 # zfin_publication_to_gene:
 #   config: 'ingests/zfin/publication_to_gene.yaml'
+
diff --git a/src/monarch_ingest/qc_expect.yaml b/src/monarch_ingest/qc_expect.yaml
index c28daf97..3d840a54 100644
--- a/src/monarch_ingest/qc_expect.yaml
+++ b/src/monarch_ingest/qc_expect.yaml
@@ -2,6 +2,8 @@ nodes:
   provided_by:
     alliance_gene_nodes:
       min: 290000
+    alliance_genotype_nodes:
+      min: 130000
     dictybase_gene_nodes:
       min: 14000
     hgnc_gene_nodes:
@@ -14,12 +16,14 @@ nodes:
       min: 5000
     reactome_pathway_nodes:
       min: 21000
+    clinvar_variant_nodes:
+      min: 1280000
+    clingen_variant_nodes:
+      min: 7000
 edges:
   provided_by:
     alliance_gene_to_expression_edges:
       min: 1870000
-    alliance_gene_to_phenotype_edges:
-      min: 300000
     bgee_gene_to_expression_edges:
       min: 430000
     biogrid_edges:
@@ -49,8 +53,16 @@ edges:
     reactome_gene_to_pathway_edges:
       min: 200000
     string_protein_links_edges:
-      min: 1490000
+      min: 1470000
     xenbase_gene_to_phenotype_edges:
       min: 2000
-    zfin_gene_to_phenotype_edges:
-      min: 148000
+    alliance_phenotype_edges:
+      min: 650000
+    alliance_disease_edges:
+      min: 10000
+    zfin_genotype_to_phenotype_edges:
+      min: 125000
+    clinvar_variant_edges:
+      min: 1400000
+    clingen_variant_edges:
+      min: 5000