From f241e075680b6c36cc637155b57a7219e4d3fa30 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Mon, 13 Nov 2023 15:16:14 -0800 Subject: [PATCH] Use genbank_accession column as ID column --- Snakefile | 40 +++++++- config/config_zika.yaml | 2 + config/dropped_strains.txt | 167 ++++++++++++++++--------------- example_data/metadata.tsv | 70 ++++++------- example_data/sequences.fasta | 68 ++++++------- scripts/set_final_strain_name.py | 38 +++++++ 6 files changed, 230 insertions(+), 155 deletions(-) create mode 100644 config/config_zika.yaml create mode 100644 scripts/set_final_strain_name.py diff --git a/Snakefile b/Snakefile index 3562078..0e04f3d 100644 --- a/Snakefile +++ b/Snakefile @@ -1,3 +1,6 @@ +if not config: + configfile: "config/config_zika.yaml" + rule all: input: auspice_json = "auspice/zika.json", @@ -59,12 +62,14 @@ rule filter: group_by = "country year month", sequences_per_group = 40, min_date = 2012, - min_length = 5385 + min_length = 5385, + strain_id = config.get("strain_id_field", "strain"), shell: """ augur filter \ --sequences {input.sequences} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --exclude {input.exclude} \ --output {output.sequences} \ --group-by {params.group_by} \ @@ -124,13 +129,15 @@ rule refine: params: coalescent = "opt", date_inference = "marginal", - clock_filter_iqd = 4 + clock_filter_iqd = 4, + strain_id = config.get("strain_id_field", "strain"), shell: """ augur refine \ --tree {input.tree} \ --alignment {input.alignment} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --output-tree {output.tree} \ --output-node-data {output.node_data} \ --timetree \ @@ -212,12 +219,16 @@ rule export: auspice_config = files.auspice_config, description = files.description output: - auspice_json = rules.all.input.auspice_json + auspice_json = "results/raw_zika.json", + root_sequence = "results/raw_zika_root-sequence.json", + params: + strain_id = config.get("strain_id_field", 
"strain"), shell: """ augur export v2 \ --tree {input.tree} \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \ --colors {input.colors} \ --auspice-config {input.auspice_config} \ @@ -226,6 +237,29 @@ rule export: --output {output.auspice_json} """ +rule final_strain_name: + input: + auspice_json="results/raw_zika.json", + metadata="data/metadata.tsv", + root_sequence="results/raw_zika_root-sequence.json", + output: + auspice_json="auspice/zika.json", + root_sequence="auspice/zika_root-sequence.json", + params: + strain_id=config.get("strain_id_field", "strain"), + display_strain_field=config.get("display_strain_field", "strain"), + shell: + """ + python3 scripts/set_final_strain_name.py \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --input-auspice-json {input.auspice_json} \ + --display-strain-name {params.display_strain_field} \ + --output {output.auspice_json} + + cp {input.root_sequence} {output.root_sequence} + """ + rule clean: """Removing directories: {params}""" params: diff --git a/config/config_zika.yaml b/config/config_zika.yaml new file mode 100644 index 0000000..5345584 --- /dev/null +++ b/config/config_zika.yaml @@ -0,0 +1,2 @@ +strain_id_field: "genbank_accession" +display_strain_field: "strain" \ No newline at end of file diff --git a/config/dropped_strains.txt b/config/dropped_strains.txt index 22b5878..746e1ba 100644 --- a/config/dropped_strains.txt +++ b/config/dropped_strains.txt @@ -1,86 +1,87 @@ -PF13/251013_18 # reference included in config/zika_reference.gb -AFMC_U # too basal -AFMC_S # too basal -Boracay/16423 # too basal -JMB_185 # too basal -PHL/2012/CPC_0740 # too basal +MG827392 +KX369547 # PF13/251013_18 # reference included in config/zika_reference.gb +KY553111 # AFMC_U # too basal +KY962729 # AFMC_S # too basal +KY120353 # Boracay/16423 # too basal +KU179098 # JMB_185 # too basal +KU681082 # 
PHL/2012/CPC_0740 # too basal VIE/Bra/2016 # too basal -Dominican_Republic/2016/PD2 # duplicate of other strain in dataset -GD01 # duplicate of other strain in dataset -GDZ16001 # duplicate of other strain in dataset -VEN/UF_2/2016 # duplicate of other strain in dataset -ZZ_1 # duplicate of other strain in dataset -VR10599/Pavia/2016 # export with unknown origin -34997/Pavia/2016 # export with unknown origin -COL/FLR_00001/2015 # duplicate of COL/FLR/2015 -COL/FLR_00002/2015 # duplicate of COL/FLR/2015 -COL/FLR_00003/2015 # duplicate of COL/FLR/2015 -COL/FLR_00004/2015 # duplicate of COL/FLR/2015 -COL/FLR_00005/2015 # duplicate of COL/FLR/2015 -COL/FLR_00006/2015 # duplicate of COL/FLR/2015 -COL/FLR_00007/2015 # duplicate of COL/FLR/2015 -COL/FLR_00008/2015 # duplicate of COL/FLR/2015 -COL/FLR_00009/2015 # duplicate of COL/FLR/2015 -COL/FLR_00010/2015 # duplicate of COL/FLR/2015 -COL/FLR_00011/2015 # duplicate of COL/FLR/2015 -COL/FLR_00012/2015 # duplicate of COL/FLR/2015 -COL/FLR_00013/2015 # duplicate of COL/FLR/2015 -COL/FLR_00014/2015 # duplicate of COL/FLR/2015 -COL/FLR_00015/2015 # duplicate of COL/FLR/2015 -COL/FLR_00016/2015 # duplicate of COL/FLR/2015 -COL/FLR_00017/2015 # duplicate of COL/FLR/2015 -COL/FLR_00018/2015 # duplicate of COL/FLR/2015 -COL/FLR_00019/2015 # duplicate of COL/FLR/2015 -COL/FLR_00020/2015 # duplicate of COL/FLR/2015 -COL/FLR_00021/2015 # duplicate of COL/FLR/2015 -COL/FLR_00022/2015 # duplicate of COL/FLR/2015 -COL/FLR_00023/2015 # duplicate of COL/FLR/2015 -COL/FLR_00024/2015 # duplicate of COL/FLR/2015 -COL/FLR_00025/2015 # duplicate of COL/FLR/2015 -COL/FLR_00026/2015 # duplicate of COL/FLR/2015 -COL/FLR_00034/2015 # duplicate of COL/FLR/2015 -COL/FLR_00035/2015 # duplicate of COL/FLR/2015 -COL/FLR_00036/2015 # duplicate of COL/FLR/2015 -COL/FLR_00038/2015 # duplicate of COL/FLR/2015 -COL/FLR_00040/2015 # duplicate of COL/FLR/2015 -COL/FLR_00041/2015 # duplicate of COL/FLR/2015 -COL/FLR_00042/2015 # duplicate of COL/FLR/2015 
-COL/PRV_00027/2015 # misdated -COL/PRV_00028/2015 # misdated -COL/PAN_00029/2015 # misdated -COL/PAN_00030/2015 # misdated -BRA/2016/FC_DQ12D1 # large indel -Brazil/2016/ZBRX8 # large indel -Brazil/2016/ZBRX11 # large indel -CX17 # large indel -MEX/2016/mex27 # large indel -MEX/2016/mex50 # large indel -SLV/2016/ElSalvador_1055 # large indel -USVI/20/2016 # large indel +KU853013 # Dominican_Republic/2016/PD2 # duplicate of other strain in dataset +KU740184 # GD01 # duplicate of other strain in dataset +KU761564 # GDZ16001 # duplicate of other strain in dataset +KX893855 # VEN/UF_2/2016 # duplicate of other strain in dataset +KY927808 # ZZ_1 # duplicate of other strain in dataset +KY003154 # VR10599/Pavia/2016 # export with unknown origin +KY003153 # 34997/Pavia/2016 # export with unknown origin +MF574552 # COL/FLR_00001/2015 # duplicate of COL/FLR/2015 +MF574559 # COL/FLR_00002/2015 # duplicate of COL/FLR/2015 +MF574560 # COL/FLR_00003/2015 # duplicate of COL/FLR/2015 +MF574561 # COL/FLR_00004/2015 # duplicate of COL/FLR/2015 +MF574571 # COL/FLR_00005/2015 # duplicate of COL/FLR/2015 +MF574555 # COL/FLR_00006/2015 # duplicate of COL/FLR/2015 +MF574557 # COL/FLR_00007/2015 # duplicate of COL/FLR/2015 +MF574562 # COL/FLR_00008/2015 # duplicate of COL/FLR/2015 +MF574572 # COL/FLR_00009/2015 # duplicate of COL/FLR/2015 +MF574570 # COL/FLR_00010/2015 # duplicate of COL/FLR/2015 +MF574565 # COL/FLR_00011/2015 # duplicate of COL/FLR/2015 +MF574568 # COL/FLR_00012/2015 # duplicate of COL/FLR/2015 +MF574558 # COL/FLR_00013/2015 # duplicate of COL/FLR/2015 +MF574576 # COL/FLR_00014/2015 # duplicate of COL/FLR/2015 +MF574567 # COL/FLR_00015/2015 # duplicate of COL/FLR/2015 +MF574575 # COL/FLR_00016/2015 # duplicate of COL/FLR/2015 +MF574553 # COL/FLR_00017/2015 # duplicate of COL/FLR/2015 +MF574573 # COL/FLR_00018/2015 # duplicate of COL/FLR/2015 +MF574574 # COL/FLR_00019/2015 # duplicate of COL/FLR/2015 +MF574577 # COL/FLR_00020/2015 # duplicate of COL/FLR/2015 +MF574556 # 
COL/FLR_00021/2015 # duplicate of COL/FLR/2015 +MF574554 # COL/FLR_00022/2015 # duplicate of COL/FLR/2015 +MF574566 # COL/FLR_00023/2015 # duplicate of COL/FLR/2015 +MF574569 # COL/FLR_00024/2015 # duplicate of COL/FLR/2015 +MF574563 # COL/FLR_00025/2015 # duplicate of COL/FLR/2015 +MF574564 # COL/FLR_00026/2015 # duplicate of COL/FLR/2015 +MF574581 # COL/FLR_00034/2015 # duplicate of COL/FLR/2015 +MF574588 # COL/FLR_00035/2015 # duplicate of COL/FLR/2015 +MF574582 # COL/FLR_00036/2015 # duplicate of COL/FLR/2015 +MF574586 # COL/FLR_00038/2015 # duplicate of COL/FLR/2015 +MF574584 # COL/FLR_00040/2015 # duplicate of COL/FLR/2015 +MF574583 # COL/FLR_00041/2015 # duplicate of COL/FLR/2015 +MF574580 # COL/FLR_00042/2015 # duplicate of COL/FLR/2015 +MF574579 # COL/PRV_00027/2015 # misdated +MF574578 # COL/PRV_00028/2015 # misdated +MF574585 # COL/PAN_00029/2015 # misdated +MF574587 # COL/PAN_00030/2015 # misdated +KY785436 # BRA/2016/FC_DQ12D1 # large indel +KY559010 # Brazil/2016/ZBRX8 # large indel +KY559011 # Brazil/2016/ZBRX11 # large indel +KX986761 # CX17 # large indel +MF801405 # MEX/2016/mex27 # large indel +MF801424 # MEX/2016/mex50 # large indel +MF801377 # SLV/2016/ElSalvador_1055 # large indel +VI20_12plex # USVI/20/2016 # large indel USVI/21/2016 # large indel -USVI/23/2016 # large indel -USVI/27/2016 # large indel -USVI/30/2016 # large indel -USVI/32/2016 # large indel -Thailand/1605aTw # excess divergence -VE_Ganxian # excess divergence -ZK_YN001 # excess divergence -Haiti/0029/2014 # contamination present -Haiti/0033/2014 # contamination present -Haiti/0036/2014 # contamination present -Haiti/0054/2014 # contamination present -Haiti/0074/2014 # contamination present -Haiti/0097/2014 # contamination present -mosquito/Haiti/1682/2016 # contamination present +VI23_12plex # USVI/23/2016 # large indel +VI27_1d # USVI/27/2016 # large indel +VI30_1d # USVI/30/2016 # large indel +VI32_12plex # USVI/32/2016 # large indel +KY126351 # Thailand/1605aTw # excess 
divergence +KU744693 # VE_Ganxian # excess divergence +KY328290 # ZK_YN001 # excess divergence +KY415986 # Haiti/0029/2014 # contamination present +KY415987 # Haiti/0033/2014 # contamination present +KY415990 # Haiti/0036/2014 # contamination present +KY415988 # Haiti/0054/2014 # contamination present +KY415989 # Haiti/0074/2014 # contamination present +KY415991 # Haiti/0097/2014 # contamination present +MF384325 # mosquito/Haiti/1682/2016 # contamination present ZF36_36S # contamination present -MR766 # lab strain -Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016 -Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59 -V15555 # highly diverged -DK # lab strain -DK23 # lab strain -rGZ02a/2018 # highly diverged -rGZ02p/2018 # highly diverged -V211784 # highly diverged -LMM/AG5643 -Faranah/18 +MK105975 # MR766 # lab strain +KX856011 # Aedes_sp/MEX_I_44/2016 # duplicate of Aedes_aegypti/MEX/MEX_I_44/2016 +MK028857 # Puerto_Rico/2015/PRVABC59 # duplicate of PRVABC59 +MN025403 # V15555 # highly diverged +MT505349 # DK # lab strain +MT505350 # DK23 # lab strain +MW680969 # rGZ02a/2018 # highly diverged +MW680970 # rGZ02p/2018 # highly diverged +OK054351 # V211784 # highly diverged +MT478034 # LMM/AG5643 +OL414716 # Faranah/18 diff --git a/example_data/metadata.tsv b/example_data/metadata.tsv index 9c30f2e..6e5345c 100644 --- a/example_data/metadata.tsv +++ b/example_data/metadata.tsv @@ -1,35 +1,35 @@ -strain virus accession date region country division city db segment authors url title journal paper_url -PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774 Direct Submission Submitted (29-APR-2016) J. 
Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ -COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ -PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 Phylogeny of Zika Virus in Western Hemisphere, 2015 Emerging Infect. Dis. 22 (5), 933-935 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27088323 -COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ -Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples Nat Protoc 12 (6), 1261-1276 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538739 -ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996 Direct Submission Submitted (18-MAY-2016) Center for Diseases Control and Prevention of Guangdong Province; National Institute of Viral Disease Control and Prevention, China https://www.ncbi.nlm.nih.gov/pubmed/ -VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400 Complete Genome Sequences of Identical Zika virus 
Isolates in a Nursing Mother and Her Infant Genome Announc 5 (17), e00231-17 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28450510 -DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734 -BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734 -DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734 -EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603 First Complete Genome Sequences of Zika Virus Isolated from Febrile Patient Sera in Ecuador Genome Announc 5 (8), e01673-16 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28232448 -HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734 -DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785484 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734 -DOM/2016/BB_0433 zika 
KY785441 2016-06-13 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785441 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734 -USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075935 Genomic epidemiology reveals multiple introductions of Zika virus into the United States Nature (2017) In press https://www.ncbi.nlm.nih.gov/pubmed/28538723 -SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241697 Outbreak of Zika virus infection in Singapore: an epidemiological, entomological, virological, and clinical analysis Lancet Infect Dis (2017) In press https://www.ncbi.nlm.nih.gov/pubmed/ -SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241744 Outbreak of Zika virus infection in Singapore: an epidemiological, entomological, virological, and clinical analysis Lancet Infect Dis (2017) In press https://www.ncbi.nlm.nih.gov/pubmed/ -SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241726 Outbreak of Zika virus infection in Singapore: an epidemiological, entomological, virological, and clinical analysis Lancet Infect Dis (2017) In press https://www.ncbi.nlm.nih.gov/pubmed/ -USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325473 Genomic epidemiology reveals multiple introductions of Zika virus into the United States Nature (2017) In press https://www.ncbi.nlm.nih.gov/pubmed/28538723 -Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome 
Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075937 Genomic epidemiology reveals multiple introductions of Zika virus into the United States Nature (2017) In press https://www.ncbi.nlm.nih.gov/pubmed/28538723 -SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241688 Outbreak of Zika virus infection in Singapore: an epidemiological, entomological, virological, and clinical analysis Lancet Infect Dis (2017) In press https://www.ncbi.nlm.nih.gov/pubmed/ -USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325478 Genomic epidemiology reveals multiple introductions of Zika virus into the United States Nature (2017) In press https://www.ncbi.nlm.nih.gov/pubmed/28538723 -COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574578 Direct Submission Submitted (30-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ -Thailand/1610acTw zika MF692778 2016-10-XX Southeast Asia Thailand Thailand Thailand genbank genome Lin et al https://www.ncbi.nlm.nih.gov/nuccore/MF692778 Imported Zika virus strains, Taiwan, 2016 Unpublished https://www.ncbi.nlm.nih.gov/pubmed/ -1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447509 How Did Zika Virus Emerge in the Pacific Islands and Latin America? MBio 7 (5), e01239-16 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27729507 -1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447519 How Did Zika Virus Emerge in the Pacific Islands and Latin America? 
MBio 7 (5), e01239-16 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27729507 -1_0181_PF zika KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447512 How Did Zika Virus Emerge in the Pacific Islands and Latin America? MBio 7 (5), e01239-16 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27729507 -Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558995 Epidemic establishment and cryptic transmission of Zika virus in Brazil and the Americas Unpublished https://www.ncbi.nlm.nih.gov/pubmed/ -Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558989 Establishment and cryptic transmission of Zika virus in Brazil and the Americas Nature 546 (7658), 406-410 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538727 -Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558991 Epidemic establishment and cryptic transmission of Zika virus in Brazil and the Americas Unpublished https://www.ncbi.nlm.nih.gov/pubmed/ -V8375 zika KU501217 2015-11-01 North America Guatemala Guatemala Guatemala genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501217 Phylogeny of Zika Virus in Western Hemisphere, 2015 Emerging Infect. Dis. 
22 (5), 933-935 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27088323 -Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al https://www.ncbi.nlm.nih.gov/nuccore/KX421195 Zika Virus Targets Different Primary Human Placental Cells, Suggesting Two Routes for Vertical Transmission Cell Host Microbe 20 (2), 155-166 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27443522 -Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558997 Epidemic establishment and cryptic transmission of Zika virus in Brazil and the Americas Unpublished https://www.ncbi.nlm.nih.gov/pubmed/ -SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al https://www.ncbi.nlm.nih.gov/nuccore/KX266255 Genetic and Biological Characterization for Zika Viruses Imported through Shenzhen Port Chin. Sci. Bull. 61 (22), 2463-2474 (2016) https://www.ncbi.nlm.nih.gov/pubmed/ +strain virus genbank_accession date region country division city db segment authors url +PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774 +COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 +PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 +COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 +Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 +ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American 
Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996 +VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400 +DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425 +BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433 +DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420 +EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603 +HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418 +DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785484 +DOM/2016/BB_0433 zika KY785441 2016-06-13 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785441 +USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075935 +SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241697 +SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241744 +SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho 
et al https://www.ncbi.nlm.nih.gov/nuccore/KY241726 +USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325473 +Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075937 +SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241688 +USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325478 +COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574578 +Thailand/1610acTw zika MF692778 2016-10-XX Southeast Asia Thailand Thailand Thailand genbank genome Lin et al https://www.ncbi.nlm.nih.gov/nuccore/MF692778 +1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447509 +1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447519 +1_0181_PF zika KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447512 +Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558995 +Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558989 +Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558991 +V8375 zika KU501217 2015-11-01 North America Guatemala 
Guatemala Guatemala genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501217 +Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al https://www.ncbi.nlm.nih.gov/nuccore/KX421195 +Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558997 +SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al https://www.ncbi.nlm.nih.gov/nuccore/KX266255 diff --git a/example_data/sequences.fasta b/example_data/sequences.fasta index 64facba..9203c90 100644 --- a/example_data/sequences.fasta +++ b/example_data/sequences.fasta @@ -1,4 +1,4 @@ ->PAN/CDC_259359_V1_V3/2015 +>KX156774 gaatttgaagcgaatgctaacaacagtatcaacaggttttattttggatttggaaacgag agtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgc taaaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggac @@ -179,7 +179,7 @@ gaccttccccacccttcaatctggggcctgaactggagatcagctgtggatctccagaag agggactagtggttagaggagaccccccggaaaacgcaaaacagcatattgacgctggga aagaccagagactccatgagtttccaccacgctggccgccaggcacagatcgccgaatag cggcggccggtgtggggaaatccatgggtct ->COL/FLR_00024/2015 +>MF574569 tcagactgcgacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttatt ttggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggatt ccggattgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaa @@ -358,7 +358,7 @@ agctgggaaaccaagcctatagtcaggccgagaacgccatggcacggaagaagccatgct gcctgtgagcccctcagaggacactgagtcaaaaaaccccacgcgcttggaggcgcagga tgggaaaagaaggtggcgaccttccccacccttcaatctggggcctgaactggagatcag ctgtggatctccagaagagggactagtggttagaggaga ->PRVABC59 +>KU501215 gttgttgatctgtgtgaatcagactgcgacagttcgagtttgaagcgaaagctagcaaca gtatcaacaggttttattttggatttggaaacgagagtttctggtcatgaaaaacccaaa aaagaaatccggaggattccggattgtcaatatgctaaaacgcggagtagcccgtgtgag @@ -537,7 +537,7 @@ tgtgacccccccaggagaagctgggaaaccaagcctatagtcaggccgagaacgccatgg 
cacggaagaagccatgctgcctgtgagcccctcagaggacactgagtcaaaaaaccccac gcgcttggaggcgcaggatgggaaaagaaggtggcgaccttccccacccttcaatctggg gcctgaactggagatcagctgtggatctccagaagagggactagtggttagagga ->COL/FLR_00008/2015 +>MF574562 tcagactgcgacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttatt ttggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggatt ccggattgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaa @@ -716,7 +716,7 @@ agctgggaaaccaagcctatagtcaggccgagaacgccatggcacggaagaagccatgct gcctgtgagcccctcagaggacactgagtcaaaaaaccccacgcgcttggaggcgcagga tgggaaaagaaggtggcgaccttccccacccttcaatctggggcctgaactggagatcag ctgtggatctccagaagagggactagtggttagaggaga ->Colombia/2016/ZC204Se +>KY317939 gacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttattttggatttg gaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgt caatatgctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgcc @@ -894,7 +894,7 @@ agtcagccacagcttggggaaagctgtgcagcctgtgacccccccaggagaagctgggaa accaagcctatagtcaggccgagaacgccatggcacggaagaagccatgctgcctgtgag cccctcagaggacactgagtcaaaaaaccccacgcgcttggaggcgcaggatgggaaaag aaggtggcgaccttccccacccttcaatctggggcctgaactggagat ->ZKC2/2016 +>KX253996 agttgttgatctgtgtgaatcagactgcgacagttcgagtttgaagcgaaagctagcaac agtatcaacaggttttattttggatttggaaacgagagtttctggtcatgaaaaacccaa aaaagaaatccggaggattccggattgtcaatatgctaaaacgcggagtagcccgtgtga @@ -1076,7 +1076,7 @@ ggcctgaactggagatcagctgtggatctccagaagagggactagtggttagaggagacc ccccggaaaacgcaaaacagcatattgacgctgggaaagaccagagactccatgagtttc caccacgctggccgccaggcacagatcgccgaatagcggcggccggtgtggggaaatcca tgggtct ->VEN/UF_1/2016 +>KX702400 agttgttactgttgctgactcagactgcgacagttcgagtttgaagcgaaagctagcaac agtatcaacaggttttattttggatttggaaacgagagtttctggtcatgaaaaacccaa aaaagaaatccggaggattccggattgtcaatatgctaaaacgcggagtagcccgtgtga @@ -1258,7 +1258,7 @@ ggcctgaactggagatcagctgtggatctccagaagagggactagtggttagaggagacc ccccggaaaacgcaaaacagcatattgacgctgggaaagaccagagactccatgagtttc caccacgctggccgccaggcacagatcgccgaatagcggcggccggtgtggggaaatcca tgggtctt ->DOM/2016/BB_0059 +>KY785425 
tggctgccatgctgagaataatcaatgctaggaaggagaagaagagacgaggcgcagata ctagtgtcggaattgttggcctcctgctgaccacagctatggcagcggaggtcactagac gtgggagtgcatactacatgtacttggacagaaacgatgctggggaggccatatctttcc @@ -1427,7 +1427,7 @@ ggtgtggatctctcatagggcacagaccgcgcaccacctgggctgagaacattaaaaaca cagtcaacatggtgcgcaggatcataggtgaggaagaaaagtacatggactacctatcca cccaagttcgctacttgggtgaagaagggtctacacctggagtgctgtaagcaccaatct taatgttgtcaggcc ->BRA/2016/FC_6706 +>KY785433 agtttgaagcgaaagctagcaacagtatcaacaggttttatttyggatttggaaacgaga gtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgct aaaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggact @@ -1601,7 +1601,7 @@ cattccctatttgggaaaaagggaagacttgtggtgtggatctctcatagggcacagacc gcgcaccacctgggctgagaacattaaaaacacagtcaacatggtgcgcaggatcatagg tgatgaagaaaagtacatggactacctatccacccaagttcgctacttgggtgaagaagg gtctacacctggagtgctgtaagcaccaatcttaatgttgtcaggc ->DOM/2016/BB_0183 +>KY785420 gtttgaagcgaaagctagcaacagtatcaacaggttttattttggatttggaaacgagag tttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgcta aaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggactt @@ -1780,7 +1780,7 @@ tagtcaggccgagaacgccatggcacggaagaagccatgctgcctgtgagcccctcagag gacactgagtcaaaaaaccccacgcgcttggaggcgcaggatgggaaaagaaggtggcga ccttccccacccttcaatctggggcctgaactggagatcagctgtggatccccagaagag g ->EcEs062_16 +>KX879603 agtagttgatctgtgtgaatcagactgcgacagttcgagtttgaagcgaaagctagcaac agtatcaacaggttttattttggatttggaaacgagagtttctggtcatgaaaaacccaa aaaagaaatccggaggattccggattgtcaatatgctaaaacgcggagtagcccgtgtga @@ -1962,7 +1962,7 @@ ggcctgaactggagatcagctgtggatctccagaagagggactagtggttagaggagacc ccccggaaaacgcaaaacagcatattgacgctgggaaagaccagagactccatgagtttc caccacgctggccgccaggcacagatcgccgaatagcggcggccggtgtggggaaatcca tgggagatcgga ->HND/2016/HU_ME59 +>KY785418 gtttgaagcgaaagctagcaacagtatcaacaggttttattttggatttggaaacgagag tttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgcta aaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggactt @@ -2136,7 +2136,7 @@ 
attccctatttgggaaaaagggaagacttgtggtgtggatctctcatagggcacagaccg cgcaccacctgggctgagaacattaaaaacacagtcaacatggtgcgcaggatcataggt gatgaagaaaagtacatggactacctatccacccaagttcgctacttgggtgaagaaggg tctacacctggagtgctgtaagcaccaatcttaatgttgtcaggc ->DOM/2016/MA_WGS16_011 +>KY785484 aagcgaaagctagcaacagtatcaacaggttttattttggatttggaaacgagagtttct ggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgctaaaacg cggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttctgct @@ -2314,7 +2314,7 @@ ggggaaagctgtgcagcctgtgacccccccaggagaagctgggaaaccaagcctatagtc aggccgagaacgccatggcacggaagaagccatgctgcctgtgagcccctcagaggacac tgagtcaaaaaaccccacgcgcttggaggcgcaggatgggaaaagaaggtggcgaccttc cccacccttcaatctggggcctgaactggggatcag ->DOM/2016/BB_0433 +>KY785441 tttgaagcgaaagctagcaacagtatcaacaggttttattttggatttggaaacgagagt ttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgctaa aacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttc @@ -2488,7 +2488,7 @@ ttccctatttgggaaaaagggaagacttgtggtgtggatctctcatagggcacagaccgc gcaccacctgggctgagaacattaaaaacacagtcaacatggtgcgcaggatcataggtg aggaagaaaagtacatggactacctatccacccaagttcgctacttgggtgaagaagggt ctacacctggagtgctgtaagcaccaatcctaatgttgtcaggcc ->USA/2016/FL022 +>KY075935 gcgacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttattttggatt tggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggatt gtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctg @@ -2662,7 +2662,7 @@ aaatggacagacattccctatttgggaaaaagggaagacttgtggtgtggatctctcata gggcacagaccgcgcaccacctgggctgagaacattaaaaacacagtcaacatggtgcgc aggatcataggtgaggaagaaaagtacatggactacctatccacccaagtccgctacttg ggtgaagaagggtctacacctggagtgctgtaagcaccaatctta ->SG_027 +>KY241697 ctgcgacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttattttgga tttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccgga ttgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggc @@ -2842,7 +2842,7 @@ tgagcccctcagaggacactgagtcaaaaaaccccacgcgcttggaggcgcaggatggga aaagaaggtggcgaccttccccacccttcaatctggggcctgaactggagatcagctgtg 
gatctccagaagagggactagtggttagaggagaccccccggaaaacgcaaaacagcata ttgacgctgggaaagaccagagactccatgagtttccaccacgctggccgccag ->SG_074 +>KY241744 gaatcagactgcgacagttcgagtttgaagcgaaagctagcaacagtatcaacaggtttt attttggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggagg attccggattgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggggctt @@ -3023,7 +3023,7 @@ ggatgggaaaagaaggtggcgaccttccccacccttcaatctggggcctgaactggagat cagctgtggatctccagaagagggactagtggttagaggagaccccccggaaaacgcaaa acagcatattgacgctgggaaagaccagagactccatgagtttccaccacgctggccgcc aggcacagatcgccgaatagcg ->SG_056 +>KY241726 gaatcagactgcgacagttcgagtttgaagcgaaagctagcaacagtatcaacaggtttt attttggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggagg attccggattgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggggctt @@ -3203,7 +3203,7 @@ gctgcctgtgagcccctcagaggacactgagtcaaaaaaccccacgcgcttggaggcgca ggatgggaaaagaaggtggcgaccttccccacccttcaatctggggcctgaactggagat cagctgtggatctccagaagagggactagtggttagaggagaccccccggaaaacgcaaa acagcatattgacgctgggaaagaccagagactccatgagtttccaccacgctggcc ->USA/2016/FLUR022 +>KY325473 gtgtgaatcagactgcgacagttcgagtttgaagcgaaagctagcaacagtatcaacagg ttttattttggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccg gaggattccggattgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggg @@ -3384,7 +3384,7 @@ cgcaggatgggaaaagaaggtggcgaccttccccacccttcaatctggggcctgaactgg agatcagctgtggatctccagaagagggactagtggttagaggagaccccccggaaaacg caaaacagcatattgacgctgggaaagaccagagactccatgagtttccaccacgctggc cgccaggcacagatcgccgaatagcggcggccggtgtggggaaatc ->Aedes_aegypti/USA/2016/FL05 +>KY075937 gacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttattttggatttg gaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgt caatatgctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgcc @@ -3562,7 +3562,7 @@ agtcagccacagcttggggaaagctgtgcagcctgtgacccccccaggagaagctgggaa accaagcctatagtcaggccgagaacgccatggcacggaagaagccatgctgcctgtgag cccctcagaggacactgagtcaaaaaaccccacgcgcttggaggcgcaggatgggaaaag aaggtggcgaccttccccacccttcaatctggggcctgaactggagat ->SG_018 +>KY241688 
atgnnnnnnnnnnnnnnnnnnnccggaggattccggattgtcaatatgctaaaacgcgga gtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttctgctgggt catgggcccatcaggatggtcttggcgattctagcctttttgaggttcacggcaatcaag @@ -3741,7 +3741,7 @@ tcaaaaaaccccacgcgcttggaggcgcaggatgggaaaagaaggtggcgaccttcccca cccttcaatctggggcctgaactggagatcagctgtggatctccagaagagggactagtg gttagaggagaccccccggaaaacgcaaaacagcatattgacgctgggaaagaccagaga ctccatgagtttccaccacgctggccgccaggcacagat ->USA/2016/FLWB042 +>KY325478 ctttgggggcttgaagaggctgccagccggacttctgctgggtcatgggcccatcaggat ggtcttggcgattctagcctttttgagattcacggcaatcaagccatcactgggtctcat caatagatggggttcagtggggaaaaaagaggctatggaaataataaagaagttcaagaa @@ -3916,7 +3916,7 @@ aatctcaatgttgtcaggcctgctagtcagccacagcttggggaaagctgtgcagcctgt gacccccccaggagaagctgggaaaccaagcctatagtcaggccgagaacgccatggcac ggaagaagccatgctgcctgtgagcccctcagaggacactgagtcaaaaaaccccacgcg cttggaggcgcaggnnnnnnaaagaag ->COL/PRV_00028/2015 +>MF574578 ttgaagcgaaagctagcaacagtatcaacaggttttattttggatttggaaacgagagtt tctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgctaaa acgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttct @@ -4095,7 +4095,7 @@ gtcaggccgagaacgccatggcacggaagaagccatgctgcctgtgagcccctcagagga cactgagtcaaaaaaccccacgcgcttggaggcgcaggatgggaaaagaaggtggcgacc ttccccacccttcaatctggggcctgaactggagatcagctgtggatctccagaagaggg actagtggttagaggaga ->Thailand/1610acTw +>MF692778 gcaacagtatcaacaggttttattttggatttggaaacgagagtttctggtcatgaaaaa cccaaaaaagaaatccggaggattccggattgtcaatatgctaaaacgcggagtagcccg tgtgagcccctttgggggcttgaagaggctgccagccggacttctgctgggccatgggcc @@ -4271,7 +4271,7 @@ ggactacctatccacccaagttcgctacttgggtgaagaagggtctacacctggagtgct gtaagcaccaatcttagtgttgtcaggcctgctagtcagccacagcttggggaaagctgt gcagcctgtgacccccccaggagaagctgggaaaccaagcccatagtcaggccgagaacg ccatggcacggaag ->1_0087_PF +>KX447509 agtatcaacaggttttattttggatttggaaacgagagtttctggtcatgaaaaacccaa aaaagaaatccggaggattccggattgtcaatatgctaaaacgcggagtagcccgtgtga gcccctttgggggcttgaagaggctgccagccggacttctgctgggtcatgggcccatca @@ -4449,7 +4449,7 @@ 
ctgtgacccccccaggagaagctgggaaaccaagcctatagtcaggccgagaacgccatg gcacggaagaagccatgctgcctgtgagcccctcagaggacactgagtcaaaaaacccca cgcgcttggaggcgcaggatgggaaaagaaggtggcgaccttccccacccttcaatctgg ggcctgaactggagatcagctgtggat ->1_0199_PF +>KX447519 actgcgacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttattttgg atttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccgg attgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaagagg @@ -4603,7 +4603,7 @@ tgggctctagtggacaaggaaagagagcaccacctgagaggagagtgccagagttgtgtg tacaacatgatgggaaaaagagaaaagaaacaaggggaatttggaaaggccaagggcagc cgcgccatctggtatatgtggctaggggctagatttctagagttcgaagcccttggattc ttgaacgaggatcactggatgg ->1_0181_PF +>KX447512 agtatcaacaggttttattttggatttggaaacgagagtttctggtcatgaaaaacccaa aaaagaaatccggaggattccggattgtcaatatgctaaaacgcggagtagcccgtgtga gcccctttgggggcttgaagaggctgccagccggacttctgctgggtcatgggcccatca @@ -4781,7 +4781,7 @@ ctgtgacccccccaggagaagctgggaaaccaagcctatagtcaggccgagaacgccatg gcacggaagaagccatgctgcctgtgagcccctcagaggacactgagtcaaaaaacccca cgcgcttggaggcgcaggatgggaaaagaaggtggcgaccttccccacccttcaatctgg ggcctgaactggagatcagctgtgga ->Brazil/2015/ZBRC301 +>KY558995 gatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccg gattgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaagag gctgccagccggacttctgctgggtcatgggcccatcaggatggtcttggcgattctagc @@ -4950,7 +4950,7 @@ gactgcttgcctagcaaaatcatatgcgcagatgtggcagctcctttatttccacagaan ggacctccgactgatggccaatgccatttgttcatctgtgccagttgactgggttccaac tgggagaactacctggtcaatccatggaaanggagaatggatgaccactgaagacatgct tg ->Brazil/2015/ZBRA105 +>KY558989 gatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccg gattgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaagag gctgccagccggacttctgctgggtcatgggcccatcaggatggtcttggcgattctagc @@ -5119,7 +5119,7 @@ gactgcttgcctagcaaaatcatatgcgcaaatgtggcagctcctttatttccacagaag ggacctccgactgatggccaatgccatttgttcatctgtgccagttgactgggttccaac tgggagaactacctggtcaatccatggaaagggagaatggatgaccactgaagacatgct tg ->Brazil/2016/ZBRC16 +>KY558991 
tgagaataatcaatgctaggaaggagaagaagagacgaggcgcagatactagtgtcggaa ttgttggcctcctgctgaccacagctatggcagcggaggtcactagacgtgggagtgcat actatatgtacttggacagaaacgatgctggggaggccatatcttttccaaccacattgg @@ -5272,7 +5272,7 @@ atgcagatgacactgctggctgggacacccgcatcagcaggtttgatctggagaatgaag ctctaatcaccaaccaaatggagagagggcacagggccttggcattggccataatcaagt acacataccaaaacaaagtggtaaaggtccttagaccagctgaaaaagggaaaacagtta tggacatcatttcgagacaagaccaaaggggg ->V8375 +>KU501217 atgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgctaaaacgcgga gtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttctgctgggt catgggcccatcaggatggtcttggcgattctagcctttttgagattcacggcaatcaag @@ -5445,7 +5445,7 @@ ttgggaaaaagggaagacttgtggtgtggatctctcatagggcacagaccgcgcaccacc tgggctgagaacattaaaaacacagtcaacatggtgcgcaggatcataggtgatgaagaa aagtacatggactacctatccacccaagttcgctacttgggtgaagaagggtctacacct ggagtgctgtaa ->Nica1_16 +>KX421195 tcgagtttgaagcgaaagctagcaacagtatcaacaggttttattttggatttggaaacg agagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatat gctaaaacgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccgg @@ -5624,7 +5624,7 @@ cctatagtcaggccgagaacgccatggcacggaagaagccatgctgcctgtgagcccctc agaggacactgagtcaaaaaaccccacgcgcttggaggcgcaggatgggaaaagaaggtg gcgaccttccccacccttcaatctggggcctgaactggagatcagctgtggatctccaga agagggactagtggttagaggag ->Brazil/2015/ZBRC303 +>KY558997 tgagaataatcaatgctaggaaggagaagaagagacgaggcacagatactagtgtcggaa ttgttggcctcctgctgaccacagctatggcagcggaggtcactagacgtgggagtgcat actatatgtacttggacagaaacgatgctggggaggccatatcttttccaaccacattgg @@ -5782,7 +5782,7 @@ agatgcaagacttgtggctgctgcggaggtcagagaaagtgaccaactggttgcagagca acggatgggataggctcaaacgaatggcagtcagtggagatgattgcgttgtgaagccaa ttgatgataggtttgcacatgccctcaggttcttgaatgatatgggaaaagttaggaagg acacacaagagtgg ->SMGC_1 +>KX266255 tctgtgtgaatcagactgcgacagttcgagtttgaagcgaaagctagcaacagtatcaac aggttttattttggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaat ccggaggattccggattgtcaatatgctaaaacgcggagtagcccgtgtgagcccctttg diff --git a/scripts/set_final_strain_name.py b/scripts/set_final_strain_name.py new 
# Patch metadata kept from the original diff header:
# file mode 100644 index 0000000..08ca935 --- /dev/null +++ b/scripts/set_final_strain_name.py
"""Swap the node names in an Auspice JSON for a display name taken from metadata.

The tree is built with one metadata column as the node identifier (for
example an accession); this script rewrites each node's ``name`` to the value
of another metadata column (for example the strain name) for display.
"""
import argparse
import json

import pandas as pd


def replace_name_recursive(node, lookup):
    """Rename ``node`` and all of its descendants in place.

    Every node whose ``name`` is a key of ``lookup`` gets the mapped value as
    its new name; other nodes are left untouched.

    The walk uses an explicit stack instead of Python recursion so that very
    deep (ladder-like) trees cannot hit the interpreter's recursion limit.
    The function name is kept for backward compatibility.

    Args:
        node: A tree node: a dict with a ``name`` key and an optional
            ``children`` list of nodes of the same shape.
        lookup: Mapping from current node name to replacement name.

    Raises:
        KeyError: If a visited node has no ``name`` key.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if current["name"] in lookup:
            current["name"] = lookup[current["name"]]
        # Each node is visited exactly once, so visiting order is irrelevant.
        pending.extend(current.get("children", ()))


def build_name_lookup(metadata, display_field):
    """Map each metadata ID (the DataFrame index) to its display name.

    Rows whose display value is missing (NaN/NA/None) or blank keep their own
    ID as the name. Falling back to the ID rather than to the column name
    keeps node names unique in the exported tree.

    Args:
        metadata: DataFrame indexed by the ID column.
        display_field: Name of the column holding the display name.

    Returns:
        dict mapping ID -> display name (or the ID itself when no usable
        display name exists).

    Raises:
        KeyError: If ``display_field`` is neither a column nor the index name.
    """
    if display_field not in metadata.columns:
        if display_field == metadata.index.name:
            # The display field *is* the ID column: names are already final.
            return {}
        raise KeyError(
            f"display field {display_field!r} not found in metadata columns: "
            f"{list(metadata.columns)}"
        )

    def _is_missing(value):
        # Missing values may arrive as NaN/NA or as empty strings, depending
        # on how the metadata was read, so treat both as "no display name".
        return pd.isna(value) or (isinstance(value, str) and not value.strip())

    return {
        strain_id: strain_id if _is_missing(display) else display
        for strain_id, display in metadata[display_field].items()
    }


def main(argv=None):
    """Parse command-line arguments and write the renamed Auspice JSON.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Swaps out the strain names in the Auspice JSON with the final strain name",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
    parser.add_argument('--metadata', type=str, required=True, help="input data")
    parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
    parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
    parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
    args = parser.parse_args(argv)

    # Imported here so the helpers above stay importable (and testable)
    # without augur installed.
    from augur.io import read_metadata

    # Only forward the ID columns when given, so that omitting the option
    # leaves read_metadata's own default in effect instead of passing None.
    read_kwargs = {}
    if args.metadata_id_columns:
        read_kwargs["id_columns"] = args.metadata_id_columns
    metadata = read_metadata(args.metadata, **read_kwargs)

    name_lookup = build_name_lookup(metadata, args.display_strain_name)

    with open(args.input_auspice_json, 'r', encoding="utf-8") as fh:
        data = json.load(fh)

    replace_name_recursive(data['tree'], name_lookup)

    with open(args.output, 'w', encoding="utf-8") as fh:
        json.dump(data, fh)


if __name__ == "__main__":
    main()