Skip to content

Commit

Permalink
feat: derive taxon list and custom tags from spreadsheet (#159)
Browse files: browse the repository at this point in the history.
  • Loading branch information
hunterckx committed Nov 16, 2024
1 parent 6e36129 commit cd14339
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 34 deletions.
25 changes: 9 additions & 16 deletions files/build-files-from-ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,9 @@
import pandas as pd
import requests

TAXONOMY_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/dataset_report"
TAXA_URL = "https://docs.google.com/spreadsheets/d/1Gg9sw2Qw765tOx2To53XkTAn-RAMiBtqYrfItlLXXrc/gviz/tq?tqx=out:csv&sheet=Sheet1.csv"

TAXA = [
"Plasmodium falciparum",
"Plasmodium vivax",
"Plasmodium yoelii",
"Plasmodium vinckei",
"Culex pipiens",
"Anopheles gambiae",
"Toxoplasma gondii",
"Mycobacterium tuberculosis",
"Coccidioides posadasii",
"Coccidioides immitis"
]
TAXONOMY_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/dataset_report"

ASSEMBLIES_URL = "https://hgdownload.soe.ucsc.edu/hubs/BRC/assemblyList.json"

Expand Down Expand Up @@ -66,7 +55,11 @@ def get_genomes_df(tax_ids):
def build_files():
print("Building files")

organisms_df = get_organisms_df(TAXA)
taxa_df = pd.read_csv(TAXA_URL, keep_default_na=False)

organisms_source_df = get_organisms_df(list(taxa_df["Name"]))

organisms_df = organisms_source_df.merge(taxa_df[["TaxId", "CustomTags"]], how="left", left_on="taxonomyId", right_on="TaxId").drop(columns=["TaxId"])

organisms_df.to_csv(ORGANISMS_OUTPUT_PATH, index=False, sep="\t")

Expand All @@ -78,9 +71,9 @@ def build_files():
gen_bank_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="pairedAccession", right_on="genBank")
ref_seq_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="accession", right_on="refSeq")

result_df = gen_bank_merge_df.combine_first(ref_seq_merge_df)
genomes_df = gen_bank_merge_df.combine_first(ref_seq_merge_df)

result_df.to_csv(GENOMES_OUTPUT_PATH, index=False, sep="\t")
genomes_df.to_csv(GENOMES_OUTPUT_PATH, index=False, sep="\t")

print(f"Wrote to {GENOMES_OUTPUT_PATH}")

Expand Down
14 changes: 7 additions & 7 deletions files/source/genomes-from-ncbi.tsv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
taxon taxonomyId accession isRef level chromosomeCount length scaffoldCount scaffoldN50 scaffoldL50 coverage gcPercent annotationStatus pairedAccession ucscBrowser genBank refSeq
Mycobacterium tuberculosis H37Rv 83332 GCF_000195955.2 True Complete Genome 1.0 4411532 1 4411532 1 65.5 GCA_000195955.2
Plasmodium falciparum 3D7 36329 GCF_000002765.6 True Complete Genome 14.0 23292622 14 1687656 5 100.0x 19.5 Full annotation GCA_000002765.3 https://genome.ucsc.edu/h/GCF_000002765.5 GCA_000002765.3 GCF_000002765.5
Leishmania major strain Friedlin 347515 GCF_000002725.2 True Complete Genome 36.0 32855089 36 1091540 11 59.5 Full annotation GCA_000002725.2 https://genome.ucsc.edu/h/GCF_000002725.2 GCA_000002725.2 GCF_000002725.2
Plasmodium yoelii 5861 GCF_900002385.2 True Complete Genome 14.0 23043114 14 2046250 5 100.0x 21.5 Full annotation GCA_900002385.2 https://genome.ucsc.edu/h/GCF_900002385.2 GCA_900002385.2 GCF_900002385.2
Coccidioides posadasii str. Silveira 443226 GCF_018416015.2 True Complete Genome 9.0 28193268 9 8079863 2 475.0x 46.5 Full annotation GCA_018416015.2 https://genome.ucsc.edu/h/GCA_018416015.2 GCA_018416015.2 GCF_018416015.1
Plasmodium vinckei vinckei 54757 GCF_900681995.1 True Complete Genome 14.0 18338688 14 1692345 5 155.0x 23.0 Full annotation GCA_900681995.1 https://genome.ucsc.edu/h/GCF_900681995.1 GCA_900681995.1 GCF_900681995.1
Leishmania donovani 5661 GCF_000227135.1 True Chromosome 36.0 32444968 36 1024085 11 59.5 Full annotation GCA_000227135.2 https://genome.ucsc.edu/h/GCF_000227135.1 GCA_000227135.2 GCF_000227135.1
Toxoplasma gondii ME49 508771 GCF_000006565.2 True Chromosome 14.0 65633124 2276 4973582 6 26.5x 52.5 GCA_000006565.2 https://genome.ucsc.edu/h/GCF_000006565.2 GCA_000006565.2 GCF_000006565.2
Trypanosoma brucei brucei TREU927 185431 GCF_000002445.2 True Chromosome 11.0 26075494 12 2481190 4 46.5 Full annotation GCA_000002445.1 https://genome.ucsc.edu/h/GCF_000002445.2 GCA_000002445.1 GCF_000002445.2
Anopheles gambiae 7165 GCF_943734735.2 True Chromosome 3.0 264451381 190 99149756 2 54.0x 44.5 Full annotation GCA_943734735.2 https://genome.ucsc.edu/h/GCF_943734735.2 GCA_943734735.2 GCF_943734735.2
Plasmodium vivax 5855 GCF_000002415.2 True Chromosome 14.0 27007701 2747 1678596 6 42.5 Full annotation GCA_000002415.2 https://genome.ucsc.edu/h/GCF_000002415.2 GCA_000002415.2 GCF_000002415.2
Culex pipiens pallens 42434 GCF_016801865.2 True Chromosome 3.0 566339288 289 186194774 2 250.0x 37.0 Full annotation GCA_016801865.2
Leishmania braziliensis MHOM/BR/75/M2904 420245 GCF_000002845.2 True Chromosome 35.0 32068771 138 992961 11 58.0 GCA_000002845.2 https://genome.ucsc.edu/h/GCF_000002845.2 GCA_000002845.2 GCF_000002845.2
Trypanosoma cruzi 5693 GCF_000209065.1 True Scaffold 89937456 29495 88624 212 51.5 Full annotation GCA_000209065.1 https://genome.ucsc.edu/h/GCF_000209065.1 GCA_000209065.1 GCF_000209065.1
Coccidioides immitis RS 246410 GCF_000149335.2 True Scaffold 28947925 6 4323945 3 46.0 Full annotation GCA_000149335.2 https://genome.ucsc.edu/h/GCF_000149335.2 GCA_000149335.2 GCF_000149335.2
Mycobacterium tuberculosis H37Rv 83332 GCF_000277735.2 False Complete Genome 1.0 4411709 1 4411709 1 65.5 GCA_000277735.2
Severe acute respiratory syndrome coronavirus 2 2697049 GCF_009858895.2 False Complete Genome 1.0 29903 1 29903 1 38.0 GCA_009858895.3
Monkeypox virus 10244 GCF_000857045.1 False Complete Genome 1.0 196858 1 196858 1 33.0 GCA_000857045.1
Mycobacterium tuberculosis 1773 GCF_030566675.1 False Complete Genome 1.0 4516435 1 4516435 1 20.0x 65.5 GCA_030566675.1
Mycobacterium tuberculosis 1773 GCF_963525475.1 False Complete Genome 1.0 4469156 1 4469156 1 100.0x 65.5 GCA_963525475.1
Mycobacterium tuberculosis 1773 GCF_017901095.1 False Complete Genome 1.0 4459449 1 4459449 1 111.0x 65.5 GCA_017901095.1
Mycobacterium tuberculosis 1773 GCF_030566105.1 False Complete Genome 1.0 4459087 1 4459087 1 20.0x 65.0 GCA_030566105.1
Mycobacterium tuberculosis 1773 GCF_033124885.1 False Complete Genome 1.0 4456234 1 4456234 1 257.0x 65.5 GCA_033124885.1
Mycobacterium tuberculosis 1773 GCF_014899985.1 False Complete Genome 1.0 4450340 1 4450340 1 92.0x 65.5 GCA_014899985.1
Mycobacterium tuberculosis 1773 GCF_040208995.1 False Complete Genome 1.0 4448271 1 4448271 1 208.6x 65.5 GCA_040208995.1
Mycobacterium tuberculosis 1773 GCF_014899965.1 False Complete Genome 1.0 4447644 1 4447644 1 51.0x 65.5 GCA_014899965.1
Mycobacterium tuberculosis 1773 GCF_040208985.1 False Complete Genome 1.0 4445673 1 4445673 1 232.2x 65.5 GCA_040208985.1
29 changes: 18 additions & 11 deletions files/source/organisms-from-ncbi.tsv
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
taxon taxonomyId assemblyCount
Anopheles gambiae 7165 7
Coccidioides immitis 5501 5
Coccidioides posadasii 199306 13
Culex pipiens 7175 5
Mycobacterium tuberculosis 1773 7823
Plasmodium falciparum 5833 67
Plasmodium vinckei 5860 10
Plasmodium vivax 5855 19
Plasmodium yoelii 5861 15
Toxoplasma gondii 5811 29
taxon taxonomyId assemblyCount CustomTags
Anopheles gambiae 7165 7 VEuPathDb
Coccidioides immitis 5501 5
Coccidioides posadasii 199306 13
Culex pipiens 7175 5 VEuPathDb
Leishmania braziliensis 5660 11 VEuPathDb
Leishmania donovani 5661 12 VEuPathDb
Leishmania major 5664 7 VEuPathDb
Monkeypox virus 10244 6911 Virus
Mycobacterium tuberculosis 1773 7823 Bact
Plasmodium falciparum 5833 67 VEuPathDb
Plasmodium vinckei 5860 10 VEuPathDb
Plasmodium vivax 5855 19 VEuPathDb
Plasmodium yoelii 5861 15 VEuPathDb
Severe acute respiratory syndrome coronavirus 2 2697049 92 Virus
Toxoplasma gondii 5811 29 VEuPathDb
Trypanosoma brucei 5691 5 VEuPathDb
Trypanosoma cruzi 5693 45 VEuPathDb

0 comments on commit cd14339

Please sign in to comment.