-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
Showing
4 changed files
with
128 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
from urllib.parse import quote as url_quote | ||
import pandas as pd | ||
import requests | ||
|
||
# Google Sheets CSV export read by build_files(); the code reads its
# "Name", "TaxId" and "CustomTags" columns.
TAXA_URL = "https://docs.google.com/spreadsheets/d/1Gg9sw2Qw765tOx2To53XkTAn-RAMiBtqYrfItlLXXrc/gviz/tq?tqx=out:csv&sheet=Sheet1.csv" | ||
|
||
# NCBI Datasets v2 taxonomy report endpoint; get_organisms_df() POSTs to it.
TAXONOMY_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/dataset_report" | ||
|
||
# UCSC assembly list (JSON); build_files() reads the "ucscBrowser", "genBank"
# and "refSeq" fields of each entry under its "data" key.
ASSEMBLIES_URL = "https://hgdownload.soe.ucsc.edu/hubs/BRC/assemblyList.json" | ||
|
||
# Tab-separated output files written by build_files().
ORGANISMS_OUTPUT_PATH = "files/source/organisms-from-ncbi.tsv" | ||
GENOMES_OUTPUT_PATH = "files/source/genomes-from-ncbi.tsv" | ||
|
||
def build_taxonomy_request_body(taxa):
    """Assemble the JSON payload sent with the taxonomy report request.

    Args:
        taxa: Taxon identifiers; stored unchanged under the ``taxons`` key.

    Returns:
        A new dict with ``taxons`` set to ``taxa``, ``children`` set to
        ``False`` and ``ranks`` set to ``["genus"]``.
    """
    request_body = dict(taxons=taxa, children=False)
    request_body["ranks"] = ["genus"]
    return request_body
|
||
def get_organism_row(organism_taxonomy):
    """Flatten one taxonomy report entry into an organisms-table row.

    Args:
        organism_taxonomy: Mapping with ``current_scientific_name`` (itself a
            mapping holding ``name``), ``tax_id`` and, optionally, ``counts``:
            a list of ``{"type": ..., "count": ...}`` entries.

    Returns:
        A dict with keys ``taxon``, ``taxonomyId`` and ``assemblyCount``.
        ``assemblyCount`` is the count of the first ``COUNT_TYPE_ASSEMBLY``
        entry, or ``0`` when there is no such entry or no ``counts`` at all.

    Raises:
        KeyError: If ``current_scientific_name``/``name`` or ``tax_id`` is
            missing.
    """
    # Supply a default so a taxon without an assembly count yields 0 instead
    # of aborting the whole run with a bare StopIteration.
    assembly_count = next(
        (
            count["count"]
            for count in organism_taxonomy.get("counts", [])
            if count["type"] == "COUNT_TYPE_ASSEMBLY"
        ),
        0,
    )
    return {
        "taxon": organism_taxonomy["current_scientific_name"]["name"],
        "taxonomyId": organism_taxonomy["tax_id"],
        "assemblyCount": assembly_count,
    }
|
||
def get_organisms_df(taxa):
    """Fetch taxonomy reports for ``taxa`` and return them as a DataFrame.

    POSTs the body from ``build_taxonomy_request_body(taxa)`` to
    ``TAXONOMY_URL`` and converts each report's ``taxonomy`` entry with
    ``get_organism_row``. When a response carries ``next_page_token`` the
    request is repeated with that token as ``page_token`` so later pages are
    not silently dropped.

    Args:
        taxa: Taxon identifiers to look up.

    Returns:
        A ``pandas.DataFrame`` with one row per report, in response order.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        KeyError: If a response has no ``reports`` key.
    """
    base_body = build_taxonomy_request_body(taxa)
    request_body = base_body
    rows = []
    while True:
        response = requests.post(TAXONOMY_URL, json=request_body)
        # Fail with the HTTP status instead of a confusing JSON/KeyError later.
        response.raise_for_status()
        page = response.json()
        rows.extend(get_organism_row(organism_info["taxonomy"]) for organism_info in page["reports"])
        next_page_token = page.get("next_page_token")
        if not next_page_token:
            return pd.DataFrame(rows)
        # Build a fresh body per page so the original request stays untouched.
        request_body = {**base_body, "page_token": next_page_token}
|
||
def get_tax_ids(organisms_df):
    """Return the values of the ``taxonomyId`` column as a plain list.

    Args:
        organisms_df: Table (or mapping) with a ``taxonomyId`` column.

    Returns:
        A list holding the column's values in their original order.
    """
    tax_id_column = organisms_df["taxonomyId"]
    return [tax_id for tax_id in tax_id_column]
|
||
def build_genomes_url(tax_ids):
    """Build the genome dataset-report URL for the given taxonomy IDs.

    The IDs are stringified, joined with commas, percent-encoded with
    ``url_quote`` and placed in the path. The query string carries fixed
    filters: RefSeq source, annotated only, paired reports and atypical
    assemblies excluded, and assembly level scaffold, chromosome or complete
    genome.

    Args:
        tax_ids: Iterable of taxonomy IDs (any values accepted by ``str``).

    Returns:
        The request URL as a string.
    """
    # Join outside the f-string: reusing double quotes inside a double-quoted
    # f-string is a SyntaxError on Python versions before 3.12.
    taxon_path = url_quote(",".join(str(tax_id) for tax_id in tax_ids))
    return (
        f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{taxon_path}/dataset_report"
        "?filters.assembly_source=refseq"
        "&filters.has_annotation=true"
        "&filters.exclude_paired_reports=true"
        "&filters.exclude_atypical=true"
        "&filters.assembly_level=scaffold"
        "&filters.assembly_level=chromosome"
        "&filters.assembly_level=complete_genome"
    )
|
||
def get_genome_row(genome_info):
    """Flatten one genome report into a genomes-table row.

    Args:
        genome_info: Mapping with ``organism``, ``accession``,
            ``assembly_info``, ``assembly_stats``, ``annotation_info`` and
            ``paired_accession`` entries.

    Returns:
        A dict keyed by the output column names. ``isRef`` is ``True`` only
        when ``refseq_category`` equals ``"reference genome"``.
        ``chromosomeCount``, ``coverage`` and ``annotationStatus`` are
        ``None`` when the report omits the corresponding field.

    Raises:
        KeyError: If any required field is missing.
    """
    assembly_info = genome_info["assembly_info"]
    is_reference = assembly_info.get("refseq_category") == "reference genome"

    organism = genome_info["organism"]
    row = {
        "taxon": organism["organism_name"],
        "taxonomyId": organism["tax_id"],
        "accession": genome_info["accession"],
        "isRef": is_reference,
        "level": assembly_info["assembly_level"],
    }

    stats = genome_info["assembly_stats"]
    row["chromosomeCount"] = stats.get("total_number_of_chromosomes")
    row["length"] = stats["total_sequence_length"]
    row["scaffoldCount"] = stats["number_of_scaffolds"]
    row["scaffoldN50"] = stats["scaffold_n50"]
    row["scaffoldL50"] = stats["scaffold_l50"]
    row["coverage"] = stats.get("genome_coverage")
    row["gcPercent"] = stats["gc_percent"]

    row["annotationStatus"] = genome_info["annotation_info"].get("status")
    row["pairedAccession"] = genome_info["paired_accession"]
    return row
|
||
def get_genomes_df(tax_ids):
    """Fetch genome reports for ``tax_ids`` and return them as a DataFrame.

    Requests the URL from ``build_genomes_url(tax_ids)`` and converts every
    entry under ``reports`` with ``get_genome_row``. When a response carries
    ``next_page_token`` the request is repeated with that token as the
    ``page_token`` query parameter, so results beyond the first page are
    included rather than silently truncated.

    Args:
        tax_ids: Taxonomy IDs whose genomes should be listed.

    Returns:
        A ``pandas.DataFrame`` with one row per genome report, in response
        order across all pages.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        KeyError: If a response has no ``reports`` key.
    """
    url = build_genomes_url(tax_ids)
    rows = []
    page_token = None
    while True:
        # requests appends params to the query string already present in url.
        params = {"page_token": page_token} if page_token else None
        response = requests.get(url, params=params)
        # Fail with the HTTP status instead of a confusing JSON/KeyError later.
        response.raise_for_status()
        page = response.json()
        rows.extend(get_genome_row(genome_info) for genome_info in page["reports"])
        page_token = page.get("next_page_token")
        if not page_token:
            return pd.DataFrame(data=rows)
|
||
def build_files():
    """Download organism and genome data and write both TSV output files.

    Steps, in order:
      1. Read the taxa sheet from ``TAXA_URL``.
      2. Look up the sheet's ``Name`` values via ``get_organisms_df`` and
         left-join the sheet's ``CustomTags`` by taxonomy ID.
      3. Write the organisms table to ``ORGANISMS_OUTPUT_PATH``.
      4. Fetch genomes for those taxonomy IDs via ``get_genomes_df``.
      5. Left-join the assembly list from ``ASSEMBLIES_URL`` twice, by
         GenBank and by RefSeq accession, and combine the two results.
      6. Write the genomes table to ``GENOMES_OUTPUT_PATH``.

    Both files are tab-separated and written without the index column.
    """
    print("Building files")

    # keep_default_na=False stops pandas turning empty cells into NaN.
    taxa_sheet = pd.read_csv(TAXA_URL, keep_default_na=False)

    taxon_names = list(taxa_sheet["Name"])
    ncbi_organisms = get_organisms_df(taxon_names)

    # TaxId is only the join key; drop it once CustomTags is attached.
    tag_columns = taxa_sheet[["TaxId", "CustomTags"]]
    tagged_organisms = ncbi_organisms.merge(
        tag_columns, how="left", left_on="taxonomyId", right_on="TaxId"
    )
    organisms = tagged_organisms.drop(columns=["TaxId"])

    organisms.to_csv(ORGANISMS_OUTPUT_PATH, index=False, sep="\t")
    print(f"Wrote to {ORGANISMS_OUTPUT_PATH}")

    ncbi_genomes = get_genomes_df(get_tax_ids(organisms))

    assembly_records = requests.get(ASSEMBLIES_URL).json()["data"]
    ucsc_assemblies = pd.DataFrame(assembly_records)[["ucscBrowser", "genBank", "refSeq"]]

    # Match by GenBank accession first; rows left unmatched there are filled
    # from the RefSeq-accession match by combine_first.
    matched_by_genbank = ncbi_genomes.merge(
        ucsc_assemblies, how="left", left_on="pairedAccession", right_on="genBank"
    )
    matched_by_refseq = ncbi_genomes.merge(
        ucsc_assemblies, how="left", left_on="accession", right_on="refSeq"
    )
    genomes = matched_by_genbank.combine_first(matched_by_refseq)

    genomes.to_csv(GENOMES_OUTPUT_PATH, index=False, sep="\t")
    print(f"Wrote to {GENOMES_OUTPUT_PATH}")
|
||
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__": | ||
    build_files() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
taxon taxonomyId accession isRef level chromosomeCount length scaffoldCount scaffoldN50 scaffoldL50 coverage gcPercent annotationStatus pairedAccession ucscBrowser genBank refSeq | ||
Mycobacterium tuberculosis H37Rv 83332 GCF_000195955.2 True Complete Genome 1.0 4411532 1 4411532 1 65.5 GCA_000195955.2 | ||
Plasmodium falciparum 3D7 36329 GCF_000002765.6 True Complete Genome 14.0 23292622 14 1687656 5 100.0x 19.5 Full annotation GCA_000002765.3 https://genome.ucsc.edu/h/GCF_000002765.5 GCA_000002765.3 GCF_000002765.5 | ||
Leishmania major strain Friedlin 347515 GCF_000002725.2 True Complete Genome 36.0 32855089 36 1091540 11 59.5 Full annotation GCA_000002725.2 https://genome.ucsc.edu/h/GCF_000002725.2 GCA_000002725.2 GCF_000002725.2 | ||
Plasmodium yoelii 5861 GCF_900002385.2 True Complete Genome 14.0 23043114 14 2046250 5 100.0x 21.5 Full annotation GCA_900002385.2 https://genome.ucsc.edu/h/GCF_900002385.2 GCA_900002385.2 GCF_900002385.2 | ||
Coccidioides posadasii str. Silveira 443226 GCF_018416015.2 True Complete Genome 9.0 28193268 9 8079863 2 475.0x 46.5 Full annotation GCA_018416015.2 https://genome.ucsc.edu/h/GCA_018416015.2 GCA_018416015.2 GCF_018416015.1 | ||
Plasmodium vinckei vinckei 54757 GCF_900681995.1 True Complete Genome 14.0 18338688 14 1692345 5 155.0x 23.0 Full annotation GCA_900681995.1 https://genome.ucsc.edu/h/GCF_900681995.1 GCA_900681995.1 GCF_900681995.1 | ||
Leishmania donovani 5661 GCF_000227135.1 True Chromosome 36.0 32444968 36 1024085 11 59.5 Full annotation GCA_000227135.2 https://genome.ucsc.edu/h/GCF_000227135.1 GCA_000227135.2 GCF_000227135.1 | ||
Toxoplasma gondii ME49 508771 GCF_000006565.2 True Chromosome 14.0 65633124 2276 4973582 6 26.5x 52.5 GCA_000006565.2 https://genome.ucsc.edu/h/GCF_000006565.2 GCA_000006565.2 GCF_000006565.2 | ||
Trypanosoma brucei brucei TREU927 185431 GCF_000002445.2 True Chromosome 11.0 26075494 12 2481190 4 46.5 Full annotation GCA_000002445.1 https://genome.ucsc.edu/h/GCF_000002445.2 GCA_000002445.1 GCF_000002445.2 | ||
Anopheles gambiae 7165 GCF_943734735.2 True Chromosome 3.0 264451381 190 99149756 2 54.0x 44.5 Full annotation GCA_943734735.2 https://genome.ucsc.edu/h/GCF_943734735.2 GCA_943734735.2 GCF_943734735.2 | ||
Plasmodium vivax 5855 GCF_000002415.2 True Chromosome 14.0 27007701 2747 1678596 6 42.5 Full annotation GCA_000002415.2 https://genome.ucsc.edu/h/GCF_000002415.2 GCA_000002415.2 GCF_000002415.2 | ||
Culex pipiens pallens 42434 GCF_016801865.2 True Chromosome 3.0 566339288 289 186194774 2 250.0x 37.0 Full annotation GCA_016801865.2 | ||
Leishmania braziliensis MHOM/BR/75/M2904 420245 GCF_000002845.2 True Chromosome 35.0 32068771 138 992961 11 58.0 GCA_000002845.2 https://genome.ucsc.edu/h/GCF_000002845.2 GCA_000002845.2 GCF_000002845.2 | ||
Trypanosoma cruzi 5693 GCF_000209065.1 True Scaffold 89937456 29495 88624 212 51.5 Full annotation GCA_000209065.1 https://genome.ucsc.edu/h/GCF_000209065.1 GCA_000209065.1 GCF_000209065.1 | ||
Coccidioides immitis RS 246410 GCF_000149335.2 True Scaffold 28947925 6 4323945 3 46.0 Full annotation GCA_000149335.2 https://genome.ucsc.edu/h/GCF_000149335.2 GCA_000149335.2 GCF_000149335.2 | ||
Mycobacterium tuberculosis H37Rv 83332 GCF_000277735.2 False Complete Genome 1.0 4411709 1 4411709 1 65.5 GCA_000277735.2 | ||
Severe acute respiratory syndrome coronavirus 2 2697049 GCF_009858895.2 False Complete Genome 1.0 29903 1 29903 1 38.0 GCA_009858895.3 | ||
Monkeypox virus 10244 GCF_000857045.1 False Complete Genome 1.0 196858 1 196858 1 33.0 GCA_000857045.1 | ||
Mycobacterium tuberculosis 1773 GCF_030566675.1 False Complete Genome 1.0 4516435 1 4516435 1 20.0x 65.5 GCA_030566675.1 | ||
Mycobacterium tuberculosis 1773 GCF_963525475.1 False Complete Genome 1.0 4469156 1 4469156 1 100.0x 65.5 GCA_963525475.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
taxon taxonomyId assemblyCount CustomTags | ||
Anopheles gambiae 7165 7 VEuPathDb | ||
Coccidioides immitis 5501 5 | ||
Coccidioides posadasii 199306 13 | ||
Culex pipiens 7175 5 VEuPathDb | ||
Leishmania braziliensis 5660 11 VEuPathDb | ||
Leishmania donovani 5661 12 VEuPathDb | ||
Leishmania major 5664 7 VEuPathDb | ||
Monkeypox virus 10244 6911 Virus | ||
Mycobacterium tuberculosis 1773 7823 Bact | ||
Plasmodium falciparum 5833 67 VEuPathDb | ||
Plasmodium vinckei 5860 10 VEuPathDb | ||
Plasmodium vivax 5855 19 VEuPathDb | ||
Plasmodium yoelii 5861 15 VEuPathDb | ||
Severe acute respiratory syndrome coronavirus 2 2697049 92 Virus | ||
Toxoplasma gondii 5811 29 VEuPathDb | ||
Trypanosoma brucei 5691 5 VEuPathDb | ||
Trypanosoma cruzi 5693 45 VEuPathDb |