-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add script to get genomes from ncbi api (#159)
- Loading branch information
Showing 2 changed files with 84 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from urllib.parse import quote as url_quote | ||
import pandas as pd | ||
import requests | ||
|
||
# NCBI Datasets v2 endpoint that the taxon names are posted to.
TAXONOMY_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/dataset_report"

# Organism names whose genomes are requested.
TAXA = [
    "Plasmodium falciparum",
    "Plasmodium vivax",
    "Plasmodium yoelii",
    "Plasmodium vinckei",
    "Culex pipiens",
    "Anopheles gambiae",
    "Toxoplasma gondii",
    "Mycobacterium tuberculosis",
    "Coccidioides posadasii",
    "Coccidioides immitis",
]

# Destination of the generated tab-separated file.
OUTPUT_PATH = "files/source/genomes-from-ncbi.tsv"
|
||
def build_taxonomy_request_body(taxa):
    """Build the JSON payload for the taxonomy dataset-report request.

    Args:
        taxa: Taxon names to look up; stored as-is under the "taxons" key.

    Returns:
        A dict with "taxons" set to ``taxa``, "children" set to False and
        "ranks" set to ``["genus"]``.
    """
    payload = dict(taxons=taxa, children=False)
    payload["ranks"] = ["genus"]
    return payload
|
||
def get_tax_ids(taxa):
    """Resolve taxon names to taxonomy IDs via the NCBI taxonomy endpoint.

    Posts the payload from ``build_taxonomy_request_body(taxa)`` to
    ``TAXONOMY_URL`` and collects the ``taxonomy.tax_id`` value of every
    entry in the response's "reports" list, in response order.

    Args:
        taxa: Taxon names to look up.

    Returns:
        A list with one tax ID per report returned by the service.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If the service does not answer within the timeout.
        KeyError: If the response or one of its reports lacks the expected
            keys.
    """
    response = requests.post(
        TAXONOMY_URL,
        json=build_taxonomy_request_body(taxa),
        # Without a timeout, requests can block forever on a stalled server.
        timeout=60,
    )
    # Fail on HTTP errors here instead of on a confusing KeyError or JSON
    # decode error further down.
    response.raise_for_status()
    reports = response.json()["reports"]
    return [report["taxonomy"]["tax_id"] for report in reports]
|
||
def build_genomes_url(tax_ids):
    """Build the NCBI genome dataset-report URL for the given taxonomy IDs.

    The IDs are converted to strings, joined with commas, percent-encoded
    and placed in the URL path. The query string carries a fixed set of
    filters: RefSeq source, annotated, paired reports and atypical
    assemblies excluded, and assembly level scaffold, chromosome or
    complete genome.

    Args:
        tax_ids: Iterable of taxonomy IDs (anything ``str()`` can render).

    Returns:
        The request URL as a string.
    """
    # Build the path segment outside the f-string: reusing the enclosing
    # quote character inside a replacement field is a SyntaxError before
    # Python 3.12.
    taxon_path = url_quote(",".join(str(tax_id) for tax_id in tax_ids))
    filters = "&".join(
        [
            "filters.assembly_source=refseq",
            "filters.has_annotation=true",
            "filters.exclude_paired_reports=true",
            "filters.exclude_atypical=true",
            "filters.assembly_level=scaffold",
            "filters.assembly_level=chromosome",
            "filters.assembly_level=complete_genome",
        ]
    )
    return (
        "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/"
        f"{taxon_path}/dataset_report?{filters}"
    )
|
||
def get_genome_row(genome_info):
    """Flatten one genome report into a row dict for the output table.

    Args:
        genome_info: A report dict with "organism", "accession",
            "assembly_info", "assembly_stats" and "annotation_info" entries.

    Returns:
        A dict keyed by the output column names. "isRef" is True only when
        the assembly has a "refseq_category" containing "reference".
        "chromosomeCount", "coverage" and "annotationStatus" are None when
        the report omits the corresponding field.

    Raises:
        KeyError: If a required section or field is absent from the report.
    """
    assembly = genome_info["assembly_info"]
    category = assembly.get("refseq_category")
    organism = genome_info["organism"]

    row = {
        "taxon": organism["organism_name"],
        "taxonomyId": organism["tax_id"],
        "accession": genome_info["accession"],
        "isRef": category is not None and "reference" in category,
        "level": assembly["assembly_level"],
    }

    stats = genome_info["assembly_stats"]
    row["chromosomeCount"] = stats.get("total_number_of_chromosomes")
    row["length"] = stats["total_sequence_length"]
    row["scaffoldCount"] = stats["number_of_scaffolds"]
    row["scaffoldN50"] = stats["scaffold_n50"]
    row["scaffoldL50"] = stats["scaffold_l50"]
    row["coverage"] = stats.get("genome_coverage")
    row["gcPercent"] = stats["gc_percent"]

    row["annotationStatus"] = genome_info["annotation_info"].get("status")
    return row
|
||
def get_genomes_df(tax_ids, page_size=1000):
    """Fetch all genome reports for the given tax IDs into a DataFrame.

    Requests the URL from ``build_genomes_url(tax_ids)`` and converts every
    entry of each response's "reports" list with ``get_genome_row``. The
    service returns results in pages, so the request is repeated with the
    "next_page_token" from each response until no token is returned.
    Reading only the first response would silently truncate the table.

    Args:
        tax_ids: Taxonomy IDs to request genomes for.
        page_size: Number of reports requested per page.

    Returns:
        A pandas DataFrame with one row per genome report. It is empty when
        the service returns no reports.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If the service does not answer within the timeout.
    """
    url = build_genomes_url(tax_ids)
    rows = []
    page_token = None
    while True:
        # requests appends these to the filters already in the URL's query.
        params = {"page_size": page_size}
        if page_token:
            params["page_token"] = page_token
        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
        page = response.json()
        # A response with no matches may omit "reports" entirely.
        rows.extend(get_genome_row(genome_info) for genome_info in page.get("reports", []))
        page_token = page.get("next_page_token")
        if not page_token:
            break
    return pd.DataFrame(data=rows)
|
||
def build_genomes_files():
    """Fetch genome reports for the taxa in TAXA and write them to OUTPUT_PATH.

    Resolves the taxon names to tax IDs, loads the genome table for those
    IDs and saves it as a tab-separated file, printing a progress message
    before and after.
    """
    print("Building files")

    tax_ids = get_tax_ids(TAXA)
    genomes = get_genomes_df(tax_ids)

    # Tab-separated output without the DataFrame index column.
    genomes.to_csv(OUTPUT_PATH, sep="\t", index=False)

    print(f"Wrote to {OUTPUT_PATH}")
|
||
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    build_genomes_files()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
taxon taxonomyId accession isRef level chromosomeCount length scaffoldCount scaffoldN50 scaffoldL50 coverage gcPercent annotationStatus | ||
Mycobacterium tuberculosis H37Rv 83332 GCF_000195955.2 True Complete Genome 1.0 4411532 1 4411532 1 65.5 | ||
Plasmodium falciparum 3D7 36329 GCF_000002765.6 True Complete Genome 14.0 23292622 14 1687656 5 100.0x 19.5 Full annotation | ||
Plasmodium yoelii 5861 GCF_900002385.2 True Complete Genome 14.0 23043114 14 2046250 5 100.0x 21.5 Full annotation | ||
Coccidioides posadasii str. Silveira 443226 GCF_018416015.2 True Complete Genome 9.0 28193268 9 8079863 2 475.0x 46.5 Full annotation | ||
Plasmodium vinckei vinckei 54757 GCF_900681995.1 True Complete Genome 14.0 18338688 14 1692345 5 155.0x 23.0 Full annotation | ||
Toxoplasma gondii ME49 508771 GCF_000006565.2 True Chromosome 14.0 65633124 2276 4973582 6 26.5x 52.5 | ||
Anopheles gambiae 7165 GCF_943734735.2 True Chromosome 3.0 264451381 190 99149756 2 54.0x 44.5 Full annotation | ||
Plasmodium vivax 5855 GCF_000002415.2 True Chromosome 14.0 27007701 2747 1678596 6 42.5 Full annotation | ||
Culex pipiens pallens 42434 GCF_016801865.2 True Chromosome 3.0 566339288 289 186194774 2 250.0x 37.0 Full annotation | ||
Coccidioides immitis RS 246410 GCF_000149335.2 True Scaffold 28947925 6 4323945 3 46.0 Full annotation | ||
Mycobacterium tuberculosis H37Rv 83332 GCF_000277735.2 False Complete Genome 1.0 4411709 1 4411709 1 65.5 | ||
Mycobacterium tuberculosis 1773 GCF_030566675.1 False Complete Genome 1.0 4516435 1 4516435 1 20.0x 65.5 | ||
Mycobacterium tuberculosis 1773 GCF_963525475.1 False Complete Genome 1.0 4469156 1 4469156 1 100.0x 65.5 | ||
Mycobacterium tuberculosis 1773 GCF_017901095.1 False Complete Genome 1.0 4459449 1 4459449 1 111.0x 65.5 | ||
Mycobacterium tuberculosis 1773 GCF_030566105.1 False Complete Genome 1.0 4459087 1 4459087 1 20.0x 65.0 | ||
Mycobacterium tuberculosis 1773 GCF_033124885.1 False Complete Genome 1.0 4456234 1 4456234 1 257.0x 65.5 | ||
Mycobacterium tuberculosis 1773 GCF_014899985.1 False Complete Genome 1.0 4450340 1 4450340 1 92.0x 65.5 | ||
Mycobacterium tuberculosis 1773 GCF_040208995.1 False Complete Genome 1.0 4448271 1 4448271 1 208.6x 65.5 | ||
Mycobacterium tuberculosis 1773 GCF_014899965.1 False Complete Genome 1.0 4447644 1 4447644 1 51.0x 65.5 | ||
Mycobacterium tuberculosis 1773 GCF_040208985.1 False Complete Genome 1.0 4445673 1 4445673 1 232.2x 65.5 |