From c7318714e24e177e3d691b8eae602aa5d11a349e Mon Sep 17 00:00:00 2001 From: hunterckx <118154470+hunterckx@users.noreply.github.com> Date: Wed, 6 Nov 2024 22:33:50 -0800 Subject: [PATCH] feat: export organism list from ncbi api (#159) --- ...-from-ncbi.py => build-files-from-ncbi.py} | 34 ++++++++++++++----- files/source/organisms-from-ncbi.tsv | 11 ++++++ 2 files changed, 36 insertions(+), 9 deletions(-) rename files/{build-genomes-files-from-ncbi.py => build-files-from-ncbi.py} (71%) create mode 100644 files/source/organisms-from-ncbi.tsv diff --git a/files/build-genomes-files-from-ncbi.py b/files/build-files-from-ncbi.py similarity index 71% rename from files/build-genomes-files-from-ncbi.py rename to files/build-files-from-ncbi.py index 90a7f8f..a34e732 100644 --- a/files/build-genomes-files-from-ncbi.py +++ b/files/build-files-from-ncbi.py @@ -19,14 +19,24 @@ ASSEMBLIES_URL = "https://hgdownload.soe.ucsc.edu/hubs/BRC/assemblyList.json" -OUTPUT_PATH = "files/source/genomes-from-ncbi.tsv" +ORGANISMS_OUTPUT_PATH = "files/source/organisms-from-ncbi.tsv" +GENOMES_OUTPUT_PATH = "files/source/genomes-from-ncbi.tsv" def build_taxonomy_request_body(taxa): return {"taxons": taxa, "children": False, "ranks": ["genus"]} -def get_tax_ids(taxa): - taxonomy_info = requests.post(TAXONOMY_URL, json=build_taxonomy_request_body(taxa)).json() - return [organism_info["taxonomy"]["tax_id"] for organism_info in taxonomy_info["reports"]] +def get_organism_row(organism_taxonomy): + return { + "taxon": organism_taxonomy["current_scientific_name"]["name"], + "taxonomyId": organism_taxonomy["tax_id"], + "assemblyCount": next(count["count"] for count in organism_taxonomy["counts"] if count["type"] == "COUNT_TYPE_ASSEMBLY"), + } + +def get_organisms_df(taxa): + return pd.DataFrame([get_organism_row(organism_info["taxonomy"]) for organism_info in requests.post(TAXONOMY_URL, json=build_taxonomy_request_body(taxa)).json()["reports"]]) + +def get_tax_ids(organisms_df): + return list(organisms_df["taxonomyId"]) def build_genomes_url(tax_ids): return f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{url_quote(",".join([str(id) for id in tax_ids]))}/dataset_report?filters.assembly_source=refseq&filters.has_annotation=true&filters.exclude_paired_reports=true&filters.exclude_atypical=true&filters.assembly_level=scaffold&filters.assembly_level=chromosome&filters.assembly_level=complete_genome" @@ -53,10 +63,16 @@ def get_genome_row(genome_info): def get_genomes_df(tax_ids): return pd.DataFrame(data=[get_genome_row(genome_info) for genome_info in requests.get(build_genomes_url(tax_ids)).json()["reports"]]) -def build_genomes_files(): +def build_files(): print("Building files") - genomes_source_df = get_genomes_df(get_tax_ids(TAXA)) + organisms_df = get_organisms_df(TAXA) + + organisms_df.to_csv(ORGANISMS_OUTPUT_PATH, index=False, sep="\t") + + print(f"Wrote to {ORGANISMS_OUTPUT_PATH}") + + genomes_source_df = get_genomes_df(get_tax_ids(organisms_df)) assemblies_df = pd.DataFrame(requests.get(ASSEMBLIES_URL).json()["data"])[["ucscBrowser", "genBank", "refSeq"]] gen_bank_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="pairedAccession", right_on="genBank") @@ -64,9 +80,9 @@ def build_genomes_files(): result_df = gen_bank_merge_df.combine_first(ref_seq_merge_df) - result_df.to_csv(OUTPUT_PATH, index=False, sep="\t") + result_df.to_csv(GENOMES_OUTPUT_PATH, index=False, sep="\t") - print(f"Wrote to {OUTPUT_PATH}") + print(f"Wrote to {GENOMES_OUTPUT_PATH}") if __name__ == "__main__": - build_genomes_files() + build_files() diff --git a/files/source/organisms-from-ncbi.tsv b/files/source/organisms-from-ncbi.tsv new file mode 100644 index 0000000..50c59ad --- /dev/null +++ b/files/source/organisms-from-ncbi.tsv @@ -0,0 +1,11 @@ +taxon taxonomyId assemblyCount +Anopheles gambiae 7165 7 +Coccidioides immitis 5501 5 +Coccidioides posadasii 199306 13 +Culex pipiens 7175 5 +Mycobacterium tuberculosis 1773 7822 +Plasmodium falciparum 5833 67 +Plasmodium vinckei 5860 10 +Plasmodium vivax 5855 19 +Plasmodium yoelii 5861 15 +Toxoplasma gondii 5811 29