Skip to content

Commit

Permalink
feat: add script to get genomes from ncbi api (#159)
Browse files Browse the repository at this point in the history
  • Loading branch information
hunterckx committed Nov 7, 2024
1 parent 9a6875c commit 02573d3
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 0 deletions.
63 changes: 63 additions & 0 deletions files/build-genomes-files-from-ncbi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from urllib.parse import quote as url_quote
import pandas as pd
import requests

# NCBI Datasets v2 endpoint that get_tax_ids() POSTs the taxon names to.
TAXONOMY_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/dataset_report"

# Taxon names whose genomes are collected by build_genomes_files().
TAXA = [
    "Plasmodium falciparum",
    "Plasmodium vivax",
    "Plasmodium yoelii",
    "Plasmodium vinckei",
    "Culex pipiens",
    "Anopheles gambiae",
    "Toxoplasma gondii",
    "Mycobacterium tuberculosis",
    "Coccidioides posadasii",
    "Coccidioides immitis",
]

# Destination of the generated tab-separated file.
OUTPUT_PATH = "files/source/genomes-from-ncbi.tsv"

def build_taxonomy_request_body(taxa):
    """Assemble the JSON payload sent to the taxonomy dataset_report endpoint.

    Args:
        taxa: The taxon names to look up; stored as-is (not copied) under
            the "taxons" key.

    Returns:
        A dict with the keys "taxons", "children" (always False) and
        "ranks" (always ["genus"]), in that order.
    """
    body = dict(taxons=taxa, children=False)
    body["ranks"] = ["genus"]
    return body

def get_tax_ids(taxa):
    """Look up taxonomy IDs for the given taxon names.

    POSTs the body built by build_taxonomy_request_body() to TAXONOMY_URL
    and collects the ``tax_id`` of every entry in the response's "reports".

    Args:
        taxa: Taxon names to resolve.

    Returns:
        A list with one ``tax_id`` per report, in response order.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If the service does not answer in time.
        KeyError: If the response has no "reports", or a report lacks
            "taxonomy"/"tax_id" (presumably the case for a name the
            service could not resolve -- TODO confirm).
    """
    response = requests.post(
        TAXONOMY_URL,
        json=build_taxonomy_request_body(taxa),
        # requests waits forever by default; fail instead of hanging.
        timeout=60,
    )
    # Surface HTTP errors directly rather than as a confusing KeyError or
    # JSON decoding failure further down.
    response.raise_for_status()
    taxonomy_info = response.json()
    return [report["taxonomy"]["tax_id"] for report in taxonomy_info["reports"]]

def build_genomes_url(tax_ids):
    """Build the genome dataset_report URL for the given taxonomy IDs.

    The IDs are joined with commas, percent-encoded with url_quote (so the
    commas become ``%2C``) and placed in the path. The query string carries
    a fixed set of filters: assembly_source=refseq, has_annotation=true,
    exclude_paired_reports=true, exclude_atypical=true, and assembly_level
    of scaffold, chromosome and complete_genome.

    Args:
        tax_ids: Iterable of taxonomy IDs; each is converted with str().

    Returns:
        The request URL as a string.
    """
    # Joined outside the f-string: reusing double quotes inside a
    # replacement field is a SyntaxError before Python 3.12.
    taxon_path = url_quote(",".join(str(tax_id) for tax_id in tax_ids))
    return (
        "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/"
        f"{taxon_path}/dataset_report"
        "?filters.assembly_source=refseq"
        "&filters.has_annotation=true"
        "&filters.exclude_paired_reports=true"
        "&filters.exclude_atypical=true"
        "&filters.assembly_level=scaffold"
        "&filters.assembly_level=chromosome"
        "&filters.assembly_level=complete_genome"
    )

def get_genome_row(genome_info):
    """Flatten one genome report into a single output row.

    Args:
        genome_info: A report dict with "organism", "accession",
            "assembly_info", "assembly_stats" and "annotation_info" entries.

    Returns:
        A dict keyed by output column name. "isRef" is True only when the
        report has a refseq_category containing "reference". The optional
        values (chromosomeCount, coverage, annotationStatus) are None when
        the report does not provide them.

    Raises:
        KeyError: If a required entry is missing from the report.
    """
    assembly = genome_info["assembly_info"]
    category = assembly.get("refseq_category")

    # Identity and assembly-level columns first; dict order is column order.
    row = {
        "taxon": genome_info["organism"]["organism_name"],
        "taxonomyId": genome_info["organism"]["tax_id"],
        "accession": genome_info["accession"],
        "isRef": category is not None and "reference" in category,
        "level": assembly["assembly_level"],
    }

    # Then the assembly statistics.
    stats = genome_info["assembly_stats"]
    row["chromosomeCount"] = stats.get("total_number_of_chromosomes")
    row["length"] = stats["total_sequence_length"]
    row["scaffoldCount"] = stats["number_of_scaffolds"]
    row["scaffoldN50"] = stats["scaffold_n50"]
    row["scaffoldL50"] = stats["scaffold_l50"]
    row["coverage"] = stats.get("genome_coverage")
    row["gcPercent"] = stats["gc_percent"]

    row["annotationStatus"] = genome_info["annotation_info"].get("status")
    return row

def _iter_genome_reports(tax_ids):
    """Yield every genome report for tax_ids, following result pagination."""
    url = build_genomes_url(tax_ids)
    # The endpoint returns results in pages (20 per page by default per the
    # NCBI Datasets docs); request the largest documented page size to keep
    # the number of round trips low.
    params = {"page_size": 1000}
    while True:
        # requests appends params to the query string already in the URL.
        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
        page = response.json()
        # "reports" is treated as optional so that a result set with no
        # matches yields nothing instead of raising KeyError.
        yield from page.get("reports", [])
        next_token = page.get("next_page_token")
        if not next_token:
            return
        params["page_token"] = next_token


def get_genomes_df(tax_ids):
    """Fetch all genome reports for the given taxonomy IDs as a DataFrame.

    Requests the URL from build_genomes_url(), reads every result page
    (not just the first one) and converts each report with
    get_genome_row().

    Args:
        tax_ids: Taxonomy IDs to query.

    Returns:
        A pandas DataFrame with one row per genome report; empty when the
        service returns no reports.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If the service does not answer in time.
        KeyError: If a report lacks an entry get_genome_row() requires.
    """
    return pd.DataFrame(
        data=[get_genome_row(report) for report in _iter_genome_reports(tax_ids)]
    )

def build_genomes_files():
    """Fetch genome metadata for TAXA and write it to OUTPUT_PATH as TSV.

    Resolves the names in TAXA to taxonomy IDs, loads the matching genome
    reports into a DataFrame and saves it tab-separated without the index
    column. Progress is reported on stdout.
    """
    print("Building files")

    tax_ids = get_tax_ids(TAXA)
    genomes = get_genomes_df(tax_ids)
    genomes.to_csv(OUTPUT_PATH, sep="\t", index=False)

    print(f"Wrote to {OUTPUT_PATH}")


# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    build_genomes_files()
21 changes: 21 additions & 0 deletions files/source/genomes-from-ncbi.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
taxon taxonomyId accession isRef level chromosomeCount length scaffoldCount scaffoldN50 scaffoldL50 coverage gcPercent annotationStatus
Mycobacterium tuberculosis H37Rv 83332 GCF_000195955.2 True Complete Genome 1.0 4411532 1 4411532 1 65.5
Plasmodium falciparum 3D7 36329 GCF_000002765.6 True Complete Genome 14.0 23292622 14 1687656 5 100.0x 19.5 Full annotation
Plasmodium yoelii 5861 GCF_900002385.2 True Complete Genome 14.0 23043114 14 2046250 5 100.0x 21.5 Full annotation
Coccidioides posadasii str. Silveira 443226 GCF_018416015.2 True Complete Genome 9.0 28193268 9 8079863 2 475.0x 46.5 Full annotation
Plasmodium vinckei vinckei 54757 GCF_900681995.1 True Complete Genome 14.0 18338688 14 1692345 5 155.0x 23.0 Full annotation
Toxoplasma gondii ME49 508771 GCF_000006565.2 True Chromosome 14.0 65633124 2276 4973582 6 26.5x 52.5
Anopheles gambiae 7165 GCF_943734735.2 True Chromosome 3.0 264451381 190 99149756 2 54.0x 44.5 Full annotation
Plasmodium vivax 5855 GCF_000002415.2 True Chromosome 14.0 27007701 2747 1678596 6 42.5 Full annotation
Culex pipiens pallens 42434 GCF_016801865.2 True Chromosome 3.0 566339288 289 186194774 2 250.0x 37.0 Full annotation
Coccidioides immitis RS 246410 GCF_000149335.2 True Scaffold 28947925 6 4323945 3 46.0 Full annotation
Mycobacterium tuberculosis H37Rv 83332 GCF_000277735.2 False Complete Genome 1.0 4411709 1 4411709 1 65.5
Mycobacterium tuberculosis 1773 GCF_030566675.1 False Complete Genome 1.0 4516435 1 4516435 1 20.0x 65.5
Mycobacterium tuberculosis 1773 GCF_963525475.1 False Complete Genome 1.0 4469156 1 4469156 1 100.0x 65.5
Mycobacterium tuberculosis 1773 GCF_017901095.1 False Complete Genome 1.0 4459449 1 4459449 1 111.0x 65.5
Mycobacterium tuberculosis 1773 GCF_030566105.1 False Complete Genome 1.0 4459087 1 4459087 1 20.0x 65.0
Mycobacterium tuberculosis 1773 GCF_033124885.1 False Complete Genome 1.0 4456234 1 4456234 1 257.0x 65.5
Mycobacterium tuberculosis 1773 GCF_014899985.1 False Complete Genome 1.0 4450340 1 4450340 1 92.0x 65.5
Mycobacterium tuberculosis 1773 GCF_040208995.1 False Complete Genome 1.0 4448271 1 4448271 1 208.6x 65.5
Mycobacterium tuberculosis 1773 GCF_014899965.1 False Complete Genome 1.0 4447644 1 4447644 1 51.0x 65.5
Mycobacterium tuberculosis 1773 GCF_040208985.1 False Complete Genome 1.0 4445673 1 4445673 1 232.2x 65.5

0 comments on commit 02573d3

Please sign in to comment.