diff --git a/README.md b/README.md index d5d46c0..be5cc05 100644 --- a/README.md +++ b/README.md @@ -32,3 +32,11 @@ python3 ./files/build-genomes-files.py The environment can be deactivated by running `deactivate`, and re-activated by running `source ./venv/bin/activate` again. + +## Building data files from the NCBI API + +Using the Python environment described above, run the script: + +```shell +python3 ./files/build-files-from-ncbi.py +```
diff --git a/files/build-files-from-ncbi.py b/files/build-files-from-ncbi.py
new file mode 100644
index 0000000..8a0717b
--- /dev/null
+++ b/files/build-files-from-ncbi.py
@@ -0,0 +1,185 @@
+"""Build organism and genome source files from the NCBI Datasets API.
+
+Output paths are relative to the working directory, so run this script from
+the repository root.
+"""
+from urllib.parse import quote as url_quote
+
+import pandas as pd
+import requests
+
+TAXA_URL = "https://docs.google.com/spreadsheets/d/1Gg9sw2Qw765tOx2To53XkTAn-RAMiBtqYrfItlLXXrc/gviz/tq?tqx=out:csv&sheet=Sheet1.csv"
+
+TAXONOMY_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/dataset_report"
+
+ASSEMBLIES_URL = "https://hgdownload.soe.ucsc.edu/hubs/BRC/assemblyList.json"
+
+ORGANISMS_OUTPUT_PATH = "files/source/organisms-from-ncbi.tsv"
+GENOMES_OUTPUT_PATH = "files/source/genomes-from-ncbi.tsv"
+
+# Seconds to wait for any single HTTP response before giving up.
+REQUEST_TIMEOUT = 60
+
+# Reports requested per page. Without an explicit size NCBI returns only its
+# default page of 20 reports, silently truncating longer result sets.
+NCBI_PAGE_SIZE = 1000
+
+
+def fetch_json(url, body=None, params=None):
+    """Return the decoded JSON of a GET, or of a POST when `body` is given.
+
+    Raises requests.HTTPError on an error status instead of failing later on a
+    missing key.
+    """
+    if body is None:
+        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
+    else:
+        response = requests.post(url, json=body, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response.json()
+
+
+def get_reports(url, body=None):
+    """Return the reports from every page of an NCBI dataset report endpoint.
+
+    Sends `body` as a JSON POST when given; otherwise issues GET requests.
+    """
+    reports = []
+    page_token = None
+    while True:
+        page = {"page_size": NCBI_PAGE_SIZE}
+        if page_token:
+            page["page_token"] = page_token
+        if body is None:
+            data = fetch_json(url, params=page)
+        else:
+            data = fetch_json(url, body={**body, **page})
+        # Tolerate a response without "reports" (nothing matched the query).
+        reports.extend(data.get("reports", []))
+        page_token = data.get("next_page_token")
+        if not page_token:
+            return reports
+
+
+def build_taxonomy_request_body(taxa):
+    """Return the taxonomy dataset report request body for the given taxa."""
+    return {"taxons": taxa, "children": False, "ranks": ["genus"]}
+
+
+def get_organism_row(organism_taxonomy):
+    """Return the organisms-file row for one taxonomy report."""
+    assembly_count = next(
+        (
+            count["count"]
+            for count in organism_taxonomy.get("counts", [])
+            if count["type"] == "COUNT_TYPE_ASSEMBLY"
+        ),
+        0,  # Treat a missing assembly count entry as zero assemblies.
+    )
+    return {
+        "taxon": organism_taxonomy["current_scientific_name"]["name"],
+        "taxonomyId": organism_taxonomy["tax_id"],
+        "assemblyCount": assembly_count,
+    }
+
+
+def get_organisms_df(taxa):
+    """Return a DataFrame with one organism row per taxonomy report for `taxa`."""
+    reports = get_reports(TAXONOMY_URL, body=build_taxonomy_request_body(taxa))
+    return pd.DataFrame([get_organism_row(report["taxonomy"]) for report in reports])
+
+
+def get_tax_ids(organisms_df):
+    """Return the taxonomy IDs in `organisms_df` as a list."""
+    return list(organisms_df["taxonomyId"])
+
+
+def build_genomes_url(tax_ids):
+    """Return the genome dataset report URL for the given taxonomy IDs."""
+    # Joined outside the f-string: reusing the enclosing quote character inside
+    # a replacement field is a syntax error before Python 3.12.
+    taxa_path = url_quote(",".join(str(tax_id) for tax_id in tax_ids))
+    return (
+        f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{taxa_path}/dataset_report"
+        "?filters.assembly_source=refseq"
+        "&filters.has_annotation=true"
+        "&filters.exclude_paired_reports=true"
+        "&filters.exclude_atypical=true"
+        "&filters.assembly_level=scaffold"
+        "&filters.assembly_level=chromosome"
+        "&filters.assembly_level=complete_genome"
+    )
+
+
+def get_genome_row(genome_info):
+    """Return the genomes-file row for one genome dataset report."""
+    organism = genome_info["organism"]
+    assembly_info = genome_info["assembly_info"]
+    assembly_stats = genome_info["assembly_stats"]
+    return {
+        "taxon": organism["organism_name"],
+        "taxonomyId": organism["tax_id"],
+        "accession": genome_info["accession"],
+        "isRef": assembly_info.get("refseq_category") == "reference genome",
+        "level": assembly_info["assembly_level"],
+        "chromosomeCount": assembly_stats.get("total_number_of_chromosomes"),
+        "length": assembly_stats["total_sequence_length"],
+        "scaffoldCount": assembly_stats["number_of_scaffolds"],
+        "scaffoldN50": assembly_stats["scaffold_n50"],
+        "scaffoldL50": assembly_stats["scaffold_l50"],
+        "coverage": assembly_stats.get("genome_coverage"),
+        "gcPercent": assembly_stats["gc_percent"],
+        "annotationStatus": genome_info["annotation_info"].get("status"),
+        # Read with .get() like the other optional fields so that a report
+        # without a paired assembly does not abort the whole build.
+        "pairedAccession": genome_info.get("paired_accession"),
+    }
+
+
+def get_genomes_df(tax_ids):
+    """Return a DataFrame with one genome row per report for `tax_ids`."""
+    reports = get_reports(build_genomes_url(tax_ids))
+    return pd.DataFrame(data=[get_genome_row(genome_info) for genome_info in reports])
+
+
+def merge_assemblies(genomes_df, assemblies_df, genome_column, assembly_column):
+    """Left-merge assembly columns onto `genomes_df`, one output row per genome.
+
+    Rows of `assemblies_df` with a missing or repeated `assembly_column` value
+    are dropped first, so each merge yields exactly one row per genome in the
+    same order; `combine_first` aligns the two merge results by index.
+    """
+    lookup_df = assemblies_df.dropna(subset=[assembly_column]).drop_duplicates(subset=[assembly_column])
+    return genomes_df.merge(lookup_df, how="left", left_on=genome_column, right_on=assembly_column)
+
+
+def build_files():
+    """Fetch organism and genome data and write the two TSV source files."""
+    print("Building files")
+
+    taxa_df = pd.read_csv(TAXA_URL, keep_default_na=False)
+
+    organisms_source_df = get_organisms_df(list(taxa_df["Name"]))
+
+    organisms_df = organisms_source_df.merge(taxa_df[["TaxId", "CustomTags"]], how="left", left_on="taxonomyId", right_on="TaxId").drop(columns=["TaxId"])
+
+    organisms_df.to_csv(ORGANISMS_OUTPUT_PATH, index=False, sep="\t")
+
+    print(f"Wrote to {ORGANISMS_OUTPUT_PATH}")
+
+    genomes_source_df = get_genomes_df(get_tax_ids(organisms_df))
+    assemblies_df = pd.DataFrame(fetch_json(ASSEMBLIES_URL)["data"])[["ucscBrowser", "genBank", "refSeq"]]
+
+    gen_bank_merge_df = merge_assemblies(genomes_source_df, assemblies_df, "pairedAccession", "genBank")
+    ref_seq_merge_df = merge_assemblies(genomes_source_df, assemblies_df, "accession", "refSeq")
+
+    # Prefer the GenBank match; fill what it lacks from the RefSeq match.
+    genomes_df = gen_bank_merge_df.combine_first(ref_seq_merge_df)
+
+    genomes_df.to_csv(GENOMES_OUTPUT_PATH, index=False, sep="\t")
+
+    print(f"Wrote to {GENOMES_OUTPUT_PATH}")
+
+
+if __name__ == "__main__":
+    build_files()
diff --git a/files/source/genomes-from-ncbi.tsv b/files/source/genomes-from-ncbi.tsv new file mode 100644 index 0000000..8876bf3 --- /dev/null +++ b/files/source/genomes-from-ncbi.tsv @@ -0,0 +1,21 @@ +taxon taxonomyId accession isRef level chromosomeCount length scaffoldCount scaffoldN50 scaffoldL50 coverage gcPercent annotationStatus pairedAccession ucscBrowser genBank refSeq +Mycobacterium tuberculosis H37Rv 83332 GCF_000195955.2 True Complete Genome 1.0 4411532 1 4411532 1 65.5 GCA_000195955.2 +Plasmodium falciparum 3D7 36329 GCF_000002765.6 True Complete Genome 14.0 23292622 14 1687656 5 100.0x 19.5 Full annotation GCA_000002765.3 https://genome.ucsc.edu/h/GCF_000002765.5 GCA_000002765.3 GCF_000002765.5 +Leishmania major strain Friedlin 347515 GCF_000002725.2 True Complete Genome 36.0 32855089 36 1091540 11 59.5 Full annotation GCA_000002725.2 https://genome.ucsc.edu/h/GCF_000002725.2 GCA_000002725.2 GCF_000002725.2 +Plasmodium yoelii 5861 GCF_900002385.2 True Complete Genome 14.0 23043114 14 2046250 5 100.0x 21.5 Full annotation GCA_900002385.2 https://genome.ucsc.edu/h/GCF_900002385.2 GCA_900002385.2 GCF_900002385.2 +Coccidioides posadasii str.
Silveira 443226 GCF_018416015.2 True Complete Genome 9.0 28193268 9 8079863 2 475.0x 46.5 Full annotation GCA_018416015.2 https://genome.ucsc.edu/h/GCA_018416015.2 GCA_018416015.2 GCF_018416015.1 +Plasmodium vinckei vinckei 54757 GCF_900681995.1 True Complete Genome 14.0 18338688 14 1692345 5 155.0x 23.0 Full annotation GCA_900681995.1 https://genome.ucsc.edu/h/GCF_900681995.1 GCA_900681995.1 GCF_900681995.1 +Leishmania donovani 5661 GCF_000227135.1 True Chromosome 36.0 32444968 36 1024085 11 59.5 Full annotation GCA_000227135.2 https://genome.ucsc.edu/h/GCF_000227135.1 GCA_000227135.2 GCF_000227135.1 +Toxoplasma gondii ME49 508771 GCF_000006565.2 True Chromosome 14.0 65633124 2276 4973582 6 26.5x 52.5 GCA_000006565.2 https://genome.ucsc.edu/h/GCF_000006565.2 GCA_000006565.2 GCF_000006565.2 +Trypanosoma brucei brucei TREU927 185431 GCF_000002445.2 True Chromosome 11.0 26075494 12 2481190 4 46.5 Full annotation GCA_000002445.1 https://genome.ucsc.edu/h/GCF_000002445.2 GCA_000002445.1 GCF_000002445.2 +Anopheles gambiae 7165 GCF_943734735.2 True Chromosome 3.0 264451381 190 99149756 2 54.0x 44.5 Full annotation GCA_943734735.2 https://genome.ucsc.edu/h/GCF_943734735.2 GCA_943734735.2 GCF_943734735.2 +Plasmodium vivax 5855 GCF_000002415.2 True Chromosome 14.0 27007701 2747 1678596 6 42.5 Full annotation GCA_000002415.2 https://genome.ucsc.edu/h/GCF_000002415.2 GCA_000002415.2 GCF_000002415.2 +Culex pipiens pallens 42434 GCF_016801865.2 True Chromosome 3.0 566339288 289 186194774 2 250.0x 37.0 Full annotation GCA_016801865.2 +Leishmania braziliensis MHOM/BR/75/M2904 420245 GCF_000002845.2 True Chromosome 35.0 32068771 138 992961 11 58.0 GCA_000002845.2 https://genome.ucsc.edu/h/GCF_000002845.2 GCA_000002845.2 GCF_000002845.2 +Trypanosoma cruzi 5693 GCF_000209065.1 True Scaffold 89937456 29495 88624 212 51.5 Full annotation GCA_000209065.1 https://genome.ucsc.edu/h/GCF_000209065.1 GCA_000209065.1 GCF_000209065.1 +Coccidioides immitis RS 246410 GCF_000149335.2 True 
Scaffold 28947925 6 4323945 3 46.0 Full annotation GCA_000149335.2 https://genome.ucsc.edu/h/GCF_000149335.2 GCA_000149335.2 GCF_000149335.2 +Mycobacterium tuberculosis H37Rv 83332 GCF_000277735.2 False Complete Genome 1.0 4411709 1 4411709 1 65.5 GCA_000277735.2 +Severe acute respiratory syndrome coronavirus 2 2697049 GCF_009858895.2 False Complete Genome 1.0 29903 1 29903 1 38.0 GCA_009858895.3 +Monkeypox virus 10244 GCF_000857045.1 False Complete Genome 1.0 196858 1 196858 1 33.0 GCA_000857045.1 +Mycobacterium tuberculosis 1773 GCF_030566675.1 False Complete Genome 1.0 4516435 1 4516435 1 20.0x 65.5 GCA_030566675.1 +Mycobacterium tuberculosis 1773 GCF_963525475.1 False Complete Genome 1.0 4469156 1 4469156 1 100.0x 65.5 GCA_963525475.1 diff --git a/files/source/organisms-from-ncbi.tsv b/files/source/organisms-from-ncbi.tsv new file mode 100644 index 0000000..83237cd --- /dev/null +++ b/files/source/organisms-from-ncbi.tsv @@ -0,0 +1,18 @@ +taxon taxonomyId assemblyCount CustomTags +Anopheles gambiae 7165 7 VEuPathDb +Coccidioides immitis 5501 5 +Coccidioides posadasii 199306 13 +Culex pipiens 7175 5 VEuPathDb +Leishmania braziliensis 5660 11 VEuPathDb +Leishmania donovani 5661 12 VEuPathDb +Leishmania major 5664 7 VEuPathDb +Monkeypox virus 10244 6911 Virus +Mycobacterium tuberculosis 1773 7823 Bact +Plasmodium falciparum 5833 67 VEuPathDb +Plasmodium vinckei 5860 10 VEuPathDb +Plasmodium vivax 5855 19 VEuPathDb +Plasmodium yoelii 5861 15 VEuPathDb +Severe acute respiratory syndrome coronavirus 2 2697049 92 Virus +Toxoplasma gondii 5811 29 VEuPathDb +Trypanosoma brucei 5691 5 VEuPathDb +Trypanosoma cruzi 5693 45 VEuPathDb