From a2929f0d8fb792f309c5b70508f9cd7a4a40c168 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Mon, 14 Oct 2024 14:17:22 -0700 Subject: [PATCH 1/3] Bring ncbi gene nodes in from modular ingest --- src/monarch_ingest/ingests.yaml | 10 ++++++++-- src/monarch_ingest/qc_expect.yaml | 15 ++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/monarch_ingest/ingests.yaml b/src/monarch_ingest/ingests.yaml index 44d5b642..dc8a8300 100644 --- a/src/monarch_ingest/ingests.yaml +++ b/src/monarch_ingest/ingests.yaml @@ -19,6 +19,14 @@ clinvar_variant: url: - 'https://github.com/monarch-initiative/clinvar-ingest/releases/latest/download/clinvar_variant_nodes.tsv' - 'https://github.com/monarch-initiative/clinvar-ingest/releases/latest/download/clinvar_variant_edges.tsv' +ncbi_gene: + url: + - 'https://github.com/monarch-initiative/ncbi-gene/releases/latest/download/ncbi_gene_9615_nodes.tsv' + - 'https://github.com/monarch-initiative/ncbi-gene/releases/latest/download/ncbi_gene_9913_nodes.tsv' + - 'https://github.com/monarch-initiative/ncbi-gene/releases/latest/download/ncbi_gene_9823_nodes.tsv' + - 'https://github.com/monarch-initiative/ncbi-gene/releases/latest/download/ncbi_gene_9031_nodes.tsv' + - 'https://github.com/monarch-initiative/ncbi-gene/releases/latest/download/ncbi_gene_227321_nodes.tsv' + ## Ingests within this repository @@ -54,8 +62,6 @@ hpoa_gene_to_phenotype: config: 'ingests/hpoa/gene_to_phenotype.yaml' # mgi_publication_to_gene: # config: 'ingests/mgi/publication_to_gene.yaml' -ncbi_gene: - config: 'ingests/ncbi/gene.yaml' panther_genome_orthologs: config: 'ingests/panther/genome_orthologs.yaml' pombase_gene: diff --git a/src/monarch_ingest/qc_expect.yaml b/src/monarch_ingest/qc_expect.yaml index 6e0d7cee..38abfd44 100644 --- a/src/monarch_ingest/qc_expect.yaml +++ b/src/monarch_ingest/qc_expect.yaml @@ -8,8 +8,17 @@ nodes: min: 14000 hgnc_gene_nodes: min: 43000 - ncbi_gene_nodes: - min: 196000 + ncbi_gene_9823_nodes: + min: 45000 + ncbi_gene_9031_nodes: + min: 32000 + ncbi_gene_227321_nodes: + min: 10000 + ncbi_gene_9615_nodes: + min: 50000 + ncbi_gene_9913_nodes: + min: 57000 + phenio_nodes: min: 288000 pombase_gene_nodes: @@ -19,7 +28,7 @@ nodes: clinvar_variant_nodes: min: 7000 clingen_variant_nodes: - min: 7000 + min: 5600 edges: provided_by: alliance_gene_to_expression_edges: From 4d62df7344c150b9f4295f6bba9092b78724398c Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Mon, 14 Oct 2024 14:21:18 -0700 Subject: [PATCH 2/3] remove ncbi non-modular gene ingest code --- src/monarch_ingest/ingests/ncbi/gene.py | 33 ------- src/monarch_ingest/ingests/ncbi/gene.yaml | 86 ------------------- src/monarch_ingest/ingests/ncbi/metadata.yaml | 11 --- 3 files changed, 130 deletions(-) delete mode 100644 src/monarch_ingest/ingests/ncbi/gene.py delete mode 100644 src/monarch_ingest/ingests/ncbi/gene.yaml delete mode 100644 src/monarch_ingest/ingests/ncbi/metadata.yaml diff --git a/src/monarch_ingest/ingests/ncbi/gene.py b/src/monarch_ingest/ingests/ncbi/gene.py deleted file mode 100644 index 0b7f42c4..00000000 --- a/src/monarch_ingest/ingests/ncbi/gene.py +++ /dev/null @@ -1,33 +0,0 @@ -from koza.cli_utils import get_koza_app - -from biolink_model.datamodel.pydanticmodel_v2 import Gene - -koza_app = get_koza_app("ncbi_gene") -taxon_labels = koza_app.get_map("taxon-labels") - -# If a taxon label we need isn't in phenio's NCBITaxon subset, we can add it here -extra_taxon_labels = {'NCBITaxon:227321': 'Dictyostelium discoideum'} - -while (row := koza_app.get_row()) is not None: - - in_taxon = 'NCBITaxon:' + row["tax_id"] - - if in_taxon in taxon_labels: - in_taxon_label = taxon_labels[in_taxon]['label'] - elif in_taxon in extra_taxon_labels: - in_taxon_label = extra_taxon_labels[in_taxon] - else: - raise ValueError(f"Taxon {in_taxon} not found in taxon-labels") - - gene = Gene( - id='NCBIGene:' + row["GeneID"], - symbol=row["Symbol"], - name=row["Symbol"], - full_name=row["Full_name_from_nomenclature_authority"], - description=row["description"], - in_taxon=[in_taxon], - in_taxon_label=in_taxon_label, - provided_by=["infores:ncbi-gene"], - ) - - koza_app.write(gene) diff --git a/src/monarch_ingest/ingests/ncbi/gene.yaml b/src/monarch_ingest/ingests/ncbi/gene.yaml deleted file mode 100644 index 44931e0c..00000000 --- a/src/monarch_ingest/ingests/ncbi/gene.yaml +++ /dev/null @@ -1,86 +0,0 @@ -name: 'ncbi_gene' - -files: - - './data/ncbi/gene_info.gz' - -metadata: './src/monarch_ingest/ingests/ncbi/metadata.yaml' - -global_table: './src/monarch_ingest/translation_table.yaml' - -format: 'csv' - -delimiter: '\t' - -header: 0 - -columns: - - 'tax_id' - - 'GeneID' - - 'Symbol' - - 'LocusTag' - - 'Synonyms' - - 'dbXrefs' - - 'chromosome' - - 'map_location' - - 'description' - - 'type_of_gene' - - 'Symbol_from_nomenclature_authority' - - 'Full_name_from_nomenclature_authority' - - 'Nomenclature_status' - - 'Other_designations' - - 'Modification_date' - - 'Feature_type' - -filters: - - inclusion: 'include' - column: 'tax_id' - filter_code: 'in' - value: - - '9615' - - '9913' - - '9823' - - '9031' - # multiple Aspergillus genus taxon:5052 - # Aspergillus nidulans FGSC A4 - primary Aspergillus nidulans genomic reference - - '227321' - # - # Monarch Technical, 26 Sept 2022: decision taken to *exclude* - # non-nidulans Aspergillus (sub-)species from the ingest for now - # - # - '330879' # Aspergillus fumigatus Af293 - # - '425011' # Aspergillus niger CBS 513.88 - # - '510516' # Aspergillus oryzae RIB40 - # - '341663' # Aspergillus terreus NIH2624 - # - '767769' # Aspergillus brasiliensis - not in GOA - # - '41063' # Aspergillus zonatus - not in GOA - # - '1137211' # Aspergillus acidus CBS 106.47 - not in GOA - # - '1073089' # Aspergillus wentii DTO 134E9 - # - '46472' # Aspergillus versicolor - # - '602072' # Aspergillus carbonarius ITEM 5010 - # - '690307' # Aspergillus aculeatus ATCC16872 - # - '1160497' # Aspergillus glaucus CBS 516.65 - # - '331117' # Neosartorya fischeri NRRL 181 - # - '344612' # Aspergillus clavatus NRRL 1 - # - '332952' # Aspergillus flavus NRRL 3357 - # - '767770' # Aspergillus tubingensis CBS 134.48 - not in GOA - # - '75750' # Aspergillus sydowii - not in GOA - # - '451804' # Aspergillus fumigatus A1163 - not in GOA - # - '1033177' # Aspergillus kawachii - not in GOA - -depends_on: - - './src/monarch_ingest/maps/taxon-labels.yaml' - -node_properties: - - 'id' - - 'category' - - 'name' - - 'symbol' - - 'full_name' - - 'description' - - 'in_taxon' - - 'in_taxon_label' - - 'provided_by' - - -transform_mode: 'flat' - diff --git a/src/monarch_ingest/ingests/ncbi/metadata.yaml b/src/monarch_ingest/ingests/ncbi/metadata.yaml deleted file mode 100644 index bfe4203e..00000000 --- a/src/monarch_ingest/ingests/ncbi/metadata.yaml +++ /dev/null @@ -1,11 +0,0 @@ -name: 'NCBI' - -dataset_description: - ingest_title: 'NCBI' - ingest_url: 'https://www.ncbi.nlm.nih.gov/gene/' - description: 'Gene integrates information from a wide range of species. A record may include nomenclature, Reference Sequences (RefSeqs), maps, pathways, variations, phenotypes, and links to genome-, phenotype-, and locus-specific resources worldwide.' - rights: 'http://www.ncbi.nlm.nih.gov/home/about/policies.shtml' - -source_files: - - 'gene.yaml' - From a2c272a1d8f9884f3e4018de2c3ea1d4d8d194d6 Mon Sep 17 00:00:00 2001 From: Kevin Schaper Date: Mon, 14 Oct 2024 16:07:18 -0700 Subject: [PATCH 3/3] remove ncbi gene tests --- tests/unit/ncbi/test_ncbi_gene.py | 56 ------------------------------- 1 file changed, 56 deletions(-) delete mode 100644 tests/unit/ncbi/test_ncbi_gene.py diff --git a/tests/unit/ncbi/test_ncbi_gene.py b/tests/unit/ncbi/test_ncbi_gene.py deleted file mode 100644 index 2f31ff75..00000000 --- a/tests/unit/ncbi/test_ncbi_gene.py +++ /dev/null @@ -1,56 +0,0 @@ -import pytest -from koza.utils.testing_utils import mock_koza # noqa: F401 - - -@pytest.fixture -def source_name(): - return "ncbi_gene" - - -@pytest.fixture -def script(): - return "./src/monarch_ingest/ingests/ncbi/gene.py" - - -@pytest.fixture -def gene_row(): - return { - "GeneID": "373854", - "Symbol": "TENM2", - "Full_name_from_nomenclature_authority": "teneurin transmembrane protein 2", - "description": "teneurin transmembrane protein 2", - "tax_id": "9031", - } - - -@pytest.fixture -def gene_entities(mock_koza, source_name, gene_row, script, taxon_label_map_cache, global_table): - row = gene_row - return mock_koza( - source_name, - row, - script, - map_cache=taxon_label_map_cache, - global_table=global_table, - ) - - -def test_gene_information_gene(gene_entities): - assert len(gene_entities) == 1 - gene = gene_entities[0] - assert gene - - -def test_gene_information_id(gene_entities): - gene = gene_entities[0] - assert gene.id == "NCBIGene:373854" - - -def test_gene_information_symbol(gene_entities): - gene = gene_entities[0] - assert gene.symbol == "TENM2" - - -def test_gene_information_description(gene_entities): - gene = gene_entities[0] - assert gene.description == "teneurin transmembrane protein 2"