diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 15ff6a9..ddd301d 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -16,6 +16,8 @@ rule files: files = rules.files.params +include: "workflow/snakemake_rules/usvi.smk" + rule download: """Downloading sequences and metadata from data.nextstrain.org""" output: @@ -53,8 +55,8 @@ rule filter: - minimum genome length of {params.min_length} (50% of Zika virus genome) """ input: - sequences = "data/sequences.fasta", - metadata = "data/metadata.tsv", + sequences = "data/sequences_all.fasta", + metadata = "data/metadata_all.tsv", exclude = files.dropped_strains output: sequences = "results/filtered.fasta" @@ -122,7 +124,7 @@ rule refine: input: tree = "results/tree_raw.nwk", alignment = "results/aligned.fasta", - metadata = "data/metadata.tsv" + metadata = "data/metadata_all.tsv" output: tree = "results/tree.nwk", node_data = "results/branch_lengths.json" @@ -189,7 +191,7 @@ rule traits: """ input: tree = "results/tree.nwk", - metadata = "data/metadata.tsv" + metadata = "data/metadata_all.tsv" output: node_data = "results/traits.json", params: @@ -212,7 +214,7 @@ rule export: """Exporting data files for for auspice""" input: tree = "results/tree.nwk", - metadata = "data/metadata.tsv", + metadata = "data/metadata_all.tsv", branch_lengths = "results/branch_lengths.json", traits = "results/traits.json", nt_muts = "results/nt_muts.json", @@ -242,7 +244,7 @@ rule export: rule final_strain_name: input: auspice_json="results/raw_zika.json", - metadata="data/metadata.tsv", + metadata="data/metadata_all.tsv", root_sequence="results/raw_zika_root-sequence.json", output: auspice_json="auspice/zika.json", diff --git a/phylogenetic/config/config_zika.yaml b/phylogenetic/config/config_zika.yaml index 5345584..fa4e134 100644 --- a/phylogenetic/config/config_zika.yaml +++ b/phylogenetic/config/config_zika.yaml @@ -1,2 +1,2 @@ -strain_id_field: "genbank_accession" +strain_id_field: "accession" 
display_strain_field: "strain" \ No newline at end of file diff --git a/phylogenetic/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv index 6e5345c..3d39cf9 100644 --- a/phylogenetic/example_data/metadata.tsv +++ b/phylogenetic/example_data/metadata.tsv @@ -1,35 +1,35 @@ -strain virus genbank_accession date region country division city db segment authors url -PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774 -COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 -PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 -COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 -Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 -ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996 -VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400 -DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425 -BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433 -DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al 
https://www.ncbi.nlm.nih.gov/nuccore/KY785420 -EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603 -HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418 -DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785484 -DOM/2016/BB_0433 zika KY785441 2016-06-13 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785441 -USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075935 -SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241697 -SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241744 -SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241726 -USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325473 -Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075937 -SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241688 -USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325478 -COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia 
Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574578 -Thailand/1610acTw zika MF692778 2016-10-XX Southeast Asia Thailand Thailand Thailand genbank genome Lin et al https://www.ncbi.nlm.nih.gov/nuccore/MF692778 -1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447509 -1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447519 -1_0181_PF zika KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447512 -Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558995 -Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558989 -Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558991 -V8375 zika KU501217 2015-11-01 North America Guatemala Guatemala Guatemala genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501217 -Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al https://www.ncbi.nlm.nih.gov/nuccore/KX421195 -Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558997 -SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al https://www.ncbi.nlm.nih.gov/nuccore/KX266255 +strain virus genbank_accession date region country division city db segment authors +PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama 
Panama Panama genbank genome Shabman et al +COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al +PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al +COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al +Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al +ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al +VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al +DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al +BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al +DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al +EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al +HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al +DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al +DOM/2016/BB_0433 zika KY785441 2016-06-13 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al +USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al +SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al +SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al +SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore 
Singapore genbank genome Ho et al +USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al +Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome Grubaugh et al +SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al +USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al +COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al +Thailand/1610acTw zika MF692778 2016-10-XX Southeast Asia Thailand Thailand Thailand genbank genome Lin et al +1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al +1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al +1_0181_PF zika KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al +Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al +Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al +Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al +V8375 zika KU501217 2015-11-01 North America Guatemala Guatemala Guatemala genbank genome Lanciotti et al +Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al +Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al +SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al diff --git a/phylogenetic/example_data/metadata_usvi.tsv b/phylogenetic/example_data/metadata_usvi.tsv new file mode 100644 index 0000000..96d3d52 --- /dev/null +++ 
b/phylogenetic/example_data/metadata_usvi.tsv @@ -0,0 +1,2 @@ +genbank_accession genbank_accession_rev accession strain date region country division location length host release_date update_date sra_accessions authors institution url +USVI/37/2016 VI37 USVI/37/2016 2016-10-06 North America Usvi Saint Croix Saint Croix 10807 Homo sapiens Black et al FH https://github.com/blab/zika-usvi/ diff --git a/phylogenetic/example_data/sequences_usvi.fasta b/phylogenetic/example_data/sequences_usvi.fasta new file mode 100644 index 0000000..d677bfc --- /dev/null +++ b/phylogenetic/example_data/sequences_usvi.fasta @@ -0,0 +1,137 @@ +>VI37 +nnnnnnnnnnnnnnnnnnnnnnnnnnnngacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttattt +tggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgctaaa +acgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttctgctgggtcatgggcccatca +ggatggtcttggcgattctagcctttttgagattcacggcaatcaagccatcactgggcctcatcaatagatggggttca +gtggggaaaaaagaggctatggaaacaataaagaagttcaagaaagatctggctgccatgctgagaataatcaatgctag +gaaggagaagaagagacgaggcgcagatactagtgtcggaattgttggcctcctgctgaccacagctatggcagcggagg +tcactagacgtgggagtgcatactatatgtacttggacagaaacgatgctggggaggccatatcttttccaaccacattg +gggatgaataagtgttatatacagatcatggatcttggacacatgtgtgatgccaccatgagctatgaatgccctatgct +ggatgagggggtggaaccagatgacgtcgattgttggtgcaacacgacgtcaacttgggttgtgtacggaacctgccatc +acaaaaaaggtgaagcacggagatctagaagagctgtgacgctcccctcccattccaccaggaagctgcaaacgcggtcg +caaacctggttggaatcaagagaatacacaaagcacttgattagagtcgaaaattggatattcaggaaccctggcttcgc +gttagcagcagctgccatcgcttggcttttgggaagctcaacgagccaaaaagtcatatacttggtcatgatactgctga +ttgccccggcatacagcatcaggtgcataggagtcagcaatagggactttgtggaaggtatgtcaggtgggacttgggtt +gatgttgtcttggaacatggaggttgtgtcaccgtaatggcacaggacaaaccgactgtcgacatagagctggttacaac +aacagtcagcaacatggcggaggtaagatcctactgctatgaggcatcaatatcagacatggcttctgacagccgctgcc +caacacaaggtgaagcctaccttgacaagcaatcagacactcaatatgtctgcaaaagaacgttagtggacagaggctgg 
+ggaaatggatgtggactttttggcaaagggagcctggtgacatgcgctaagtttgcatgctccaagaaaatgaccgggaa +gagcatccagccagagaatctggagtaccggataatgctgtcagttcatggctcccagcacagtgggatgatcgttaatg +acacaggacatgaaactgatgagaatagagcgaaagttgagataacgcccaattcaccgagagccgaagccaccctgggg +ggttttggaagcctaggacttgattgtgaaccgaggacaggccttgacttttcagatttgtattacttgactatgaataa +caagcactggttggttcacaaggagtggttccacgacattccattaccttggcacgctggggcagacaccggaactccac +actggaacaacaaagaagcactggtagagttcaaggacgcacatgccaaaaggcaaactgtcgtggttctagggagtcaa +gaaggagcagttcacacggcccttgctggagctctggaggctgagatggatggtgcaaagggaaggctgtcctctggcca +cttgaaatgtcgcctgaaaatggataaacttagattgaagggcgtgtcatactccttgtgtactgcagcgttcacattca +ccaagatcccggctgaaacactgcacgggacagtcacagtggaggtacagtacgcagggacagatggaccttgcaaggtt +ccagctcagatggcggtggacatgcaaactctgaccccagttgggaggttgataaccgctaaccccgtaatcactgaaag +cactgagaactctaagatgatgctggaacttgatccaccatttggggactcttacattgtcataggagtcggggagaaga +agatcacccaccactggcacaggagtggcagcaccattggaaaagcatttgaagccactgtgagaggtgccaagagaatg +gcagtcttgggagacacagcctgggactttggatcagttggaggcgctctcaactcattgggcaagggcatccatcaaat +ttttggagcagctttcaaatcattgtttggaggaatgtcctggttctcacaaattctcattggaacgttgctgatgtggt +tgggtctgaacacaaagaatggatctatttcccttatgtgcttggccttagggggagtgttgatcttcttatccacagcc +gtctctgctgatgtggggtgctcggtggacttctcaaagaaggagacgagatgcggtacaggggtgttcgtctataacga +cgttgaagcctggagggacaggtacaagtaccatcctgactccccccgtagattggcagcagcagttaagcaagcctggg +aagatggtatctgcgggatctcctctgtttcaagaatggaaaacatcatgtggagatcagtagaaggggagctcaacgca +atcctggaagagaatggagttcaactgacggtcgttgtgggatctgtaaaaaaccccatgtggagaggtccacagagatt +gcccgtgcctgtgaacgagctgccccacggctggaaggcttgggggaaatcgtacttcgtcagagcagcaaagacaaata +acagctttgtcgtggatggtgacacactgaaggaatgcccactcaaacatagagcatggaacagctttcttgtggaggat +catgggttcggggtatttcacactagtgtctggctcaaggttagagaagattattcattagagtgtgatccagccgttat +tggaacagctgttaagggaaaggaggctgtacacagtgatctaggctactggattgagagtgagaagaatgacacatgga +ggctggagagggcccatctgatcgagatgaaaacatgtgaatggccaaagtcccacacattgtggacagatggaatagaa 
+gagagtgatctgatcatacccaagtctttagctgggccactcagccatcacaataccagagagggctacaggacccaaat +gaaagggccatggcacagtgaagagcttgaaattcggtttgaggaatgcccaggcactaaggtccacgtggaggaaacat +gtggaacaagaggaccatctctgagatcaaccactgcaagcggaagggtgatcgaggaatggtgctgcagggagtgcaca +atgcccccactgtcgttccgggctaaagatggctgttggtatggaatggagataaggcccaggaaagaaccagaaagcaa +cttagtaaggtcaatggtgactgcaggatcaactgatcacatggaccacttctcccttggagtgcttgtgatcctgctca +tggtgcaggaagggctgaagaagagaatgaccacaaagatcatcataagcacatcaatggcagtgctggtagctatgatc +ctgggaggattttcaatgagtgacctggctaagcttgcaattttgatgggtgccaccttcgcggaaatgaacactggagg +agatgtagctcatctggcgctgatagcggcattcaaagtcagaccagcgttgctggtatctttcatcttcagagctaatt +ggacaccccgtgaaagcatgctgctggccttggcctcgtgtcttttgcaaactgcgatctccgccttggaaggcgacctg +atggttctcatcaatggttttgctttggcctggttggcaatacgagcgatggttgttccacgcactgataacatcacctt +ggcaatcctggctgctctgacaccactggcccggggcacactgcttgtggcgtggagagcaggccttgctacttgcgggg +ggtttatgctcctctctctgaagggaaaaggcagtgtgaagaagaacttaccatttgtcatggccctgggactaaccgct +gtgaggctggtcgaccccatcaacgtggtgggactgctgttgctcacaaggagtgggaagcggagctggccccctagcga +agtactcacagctgttggcctgatatgcgcattggctggagggttcgccaaggcagatatagagatggctgggcccatgg +ccgcggtcggtctgctaattgtcagttacgtggtctcaggaaagagtgtggacatgtacattgaaagagcaggtgacatc +acatgggaaaaagatgcggaagtcactggaaacagtccccggctcgatgtggcgctagatgagagtggtgatttctccct +ggtggaggatgacggtccccccatgagagagatcatactcaaggtggtcctgatgaccatctgtggcatgaacccaatag +ccataccctttgcagctggagcgtggtacgtatacgtgaagactggaaaaaggagtggtgctctatgggatgtgcctgct +cccaaggaagtaaaaaagggggagaccacagatggagtgtacagagtaatgactcgtagactgctaggttcaacacaagt +tggagtgggagttatgcaagagggggtctttcacactatgtggcacgtcacaaaaggatccgcgctgagaagcggtgaag +ggagacttgatccatactggggagatgtcaagcaggatctggtgtcatactgtggtccatggaagctagatgccgcctgg +gatgggcacagcgaggtgcagctcttggccgtgccccccggagagagagcgaggaacatccagactctgcccggaatatt +taagacaaaggatggggacattggagcggttgcgctggattacccagcaggaacttcaggatctccaatcctagacaagt +gtgggagagtgataggactttatggcaatggggtcgtgatcaaaaacgggagttatgttagtgccatcacccaagggagg 
+agggaggaagagactcctgttgagtgcttcgagccctcgatgctgaagaagaagcagctaactgtcttagacttgcatcc +tggagctgggaaaaccaggagagttcttcctgaaatagtccgtgaagccataaaaacaagactccgtactgtgatcttag +ctccaaccagggttgtcgctgctgaaatggaggaggcccttagagggcttccagtgcgttatatgacaacagcagtcaat +gtcacccactctggaacagaaatcgtcgacttaatgtgccatgccaccttcacttcacgtctactacagccaatcagagt +ccccaactataatctgtatattatggatgaggcccacttcacagatccctcaagtatagcagcaagaggatacatttcaa +caagggttgagatgggcgaggcggctgccatcttcatgaccgccacgccaccaggaacccgtgacgcatttccggactcc +aactcaccaattatggacaccgaagtggaagtcccagagagagcctggagctcaggctttgattgggtgacggatcattc +tggaaaaacagtttggtttgttccaagcgtgaggaacggcaatgagatcgcagcttgtctgacaaaggctggaaaacggg +tcatacagctcagcagaaagacttttgagacagagttccagaaaacaaaacatcaagagtgggactttgtcgtgacaact +gacatttcagagatgggcgccaactttaaagctgaccgtgtcatagattccaggagatgcctaaagccggtcatacttga +tggcgagagagtcattctggctggacccatgcctgtcacacatgccagcgctgcccagaggagggggcgcataggcagga +atcccaacaaacctggagatgagtatctgtatggaggtgggtgcgcagagactgacgaagaccatgcacactggcttgaa +gcaagaatgctccttgacaatatttacctccaagatggcctcatagcctcgctctatcgacctgaggccgacaaagtagc +agccattgagggagagttcaagcttaggacggagcaaaggaagacctttgtggaactcatgaaaagaggagatcttcctg +tttggctggcctatcaggttgcatctgccggaataacctacacagatagaagatggtgctttgatggcacgaccaacaac +accataatggaagacagtgtgccggcagaggtgtggaccagacacggagagaaaagagtgctcaaaccgaggtggatgga +cgccagagtttgttcagatcatgcggccctgaagtcattcaaggagtttgccgctgggaaaagaggagcggcttttggag +tgatggaagccctgggaacactgccaggacacatgacnnagagattccaggaagcnattgacaacctcgctgtgctcatg +cgngcagagactggaagcaggccttacaaagccgcggcggcccaattgccggagaccctagagaccataatgcntttggg +gttgctgggaacagtctcgctgggaatcttcttcgtcttgatgaggaacaagggcatagggaagatgggctttggaatgg +tgactcttggggccagcgcatggctcatgtggctctcggaaattgagccagccagaattgcatgtgtcctcattgttgtg +ttcctattgctggtggtgctcatacctgagccagaaaagcaaagatctccccaggacaaccaaatggcaatcatcatcat +ggtagcagtaggtcttttgggcttgattaccgccaatgaactcggatggttggagagaacaaagagtgacctaagccatc +taatgggaaggagagaggagggggcaaccataggattctcaatggacattgacctgcggccagcctcagcttgggccatc 
+tatgctgccttgacaactttcattaccccagccgtccaacatgcagtgaccacctcatacaacaactactccttaatggc +gatggccacgcaagctggagtgttgtttggcatgggcaaagggatgccattctacgcatgggactttggagtcccgctgc +taatgataggttgctactcacaattaacacccctgaccctaatagtggccatcattttgctcgtggcgcactacatgtac +ttgatcccagggctgcaggcagcagctgcgcgtgctgcccagaagagaacggcagctggcatcatgaagaaccctgttgt +ggatggaatagtggtgactgacattgacacaatgacaattgacccccaagtggagaaaaagatgggacaggtgctactca +tagcagtggccgtctccagcgccatactgtcgcggaccgcctgggggtggggggaggctggggctctgatcacagccgca +acttccactttgtgggaaggctctccgaacaagtactggaactcctctacagccacttcactgtgtaacatttttagggg +aagttacttggctggagcttctctaatctacacagtaacaagaaacgctggcttggtcaagagacgtgggggtggaacag +gagagaccctgggagagaaatggaaggcccgcttgaaccagatgtcggccctggagttctactcctacaaaaagtcaggc +atcaccgaggtgtgcagagaagaggcccgccgcgccctcaaggacggtgtggcaacgggaggccatgctgtgtcccgagg +aagtgcaaagctgagatggttggtggagcggggatacctgcagccctatggaaaggtcattgatcttggatgtggcagag +ggggctggagttactacgccgccaccatccgcaaagttcaagaagtgaaaggatacacaaaaggaggccctggtcatgaa +gaacccgtgttggtgcaaagctatgggtggaacatagtccgtcttaagagtggggtggacgtctttcatatggcggctga +gccgtgtgacacgttgctgtgtgacataggtgagtcatcatctagtcctgaagtggaagaagcacggacgctcagagtcc +tctccatggtgggggattggcttgaaaaaagaccaggagccttttgtataaaagtgttgtgcccatacaccagcactatg +atggaaaccctggagcgactgcagcgtaggtatgggggaggactggtcagagtgccactctcccgcaactctacacatga +gatgtactgggtctctggagcgaaaagcaacaccataaaaagtgtgtccaccacgagccagctcctcttggggcgcatgg +acgggcctaggaggccagtgaaatatgaggaggatgtgaatctcggctctggcacgcgggctgtggtaagctgcgctgaa +gctcccaacatgaagatcattggtaaccgcattgaaaggatccgcagtgagcacgcggaaacgtggttctttgacgagaa +ccacccatataggacatgggcttaccatggaagctatgaggcccccacacaagggtcagcgtcctctctaataaacgggg +ttgtcaggctcctgtcaaaaccctgggatgtggtgactggagtcacaggaatagccatgaccgacaccacaccgtatggt +cagcaaagagttttcaaggaaaaagtggacactagggtgccagacccccaagaaggcactcgtcaggttatgagcatggt +ctcttcctggttgtggaaagagctaggcaaacacaaacggccacgagtctgcaccaaagaagagttcatcaacaaggttc +gtagcaatgcagcattaggggcaatatttgaggaggaaaaagagtggaagactgcagtggaagctgtgaacgatccaagg 
+ttctgggctctagtggacaaggaaagagagcaccacctgagaggagagtgccagagctgtgtgtacaacatgatgggaaa +aagagaaaagaaacaaggggaatttggaaaggccaagggcagccgcgccatctggtatatgtggctaggggctagatttc +tagagttcgaagcccttggattcttgaacgaggatcactggatggggagagagaactcaggaggtggtgttgaagggctg +ggattacaaagactcggatatgtcctagaagagatgagtcgtataccaggaggaaggatgtatgcagatgacactgctgg +ctgggacacccgcattagcaggtttgatctggagaatgaagctctaatcaccaaccaaatggagaaagggcacagggcct +tggcattggccataatcaagtacacataccaaaacaaagtggtaaaggtccttagaccagctgaaaaagggaaaacagtt +atggacattatttcgagacaagaccaaagggggagcggacaagttgtcacttacgctcttaacacatttaccaacctagt +ggtgcaactcattcggaatatggaggctgaggaagttctagagatgcaagacttgtggctgctgcggaggtcagagaaag +tgaccaactggttgcagagcaacggatgggataggctcaaacgaatggcagtcagtggagatgattgcgttgtgaagcca +attgatgataggtttgcacatgccctcaggttcttgaatgatatgggaaaagttaggaaggacacacaagagtggaaacc +ctcaactggatgggacaactgggaagaagttccgttttgctcccaccacttcaacaagctccatctcaaggacgggaggt +ccattgtggttccctgccgccaccaagatgaactgattggtcgggcccgcgtctctccaggggcgggatggagcatccgg +gagactgcttgcctagcaaaatcatatgcgcaaatgtggcagctcctttatttccacagaagggacctccgactgatggc +caatgccatttgttcatctgtgccagttgactgggttccaactgggagaactacctggtcaatccatggaaagggagaat +ggatgaccactgaagacatgcttgtggtgtggaacagagtgtggatnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnn diff --git a/phylogenetic/scripts/set_final_strain_name.py 
b/phylogenetic/scripts/set_final_strain_name.py
index c670f44..d104ca1 100644
--- a/phylogenetic/scripts/set_final_strain_name.py
+++ b/phylogenetic/scripts/set_final_strain_name.py
@@ -6,7 +6,6 @@ def replace_name_recursive(node, lookup, saveoldcolumn):
     if node["name"] in lookup:
         if saveoldcolumn == "accession":
             node["node_attrs"][saveoldcolumn] = node["name"]
-            node["node_attrs"]["url"] = "https://www.ncbi.nlm.nih.gov/nuccore/" + node["name"]
         elif saveoldcolumn == "genbank_accession":
             node["node_attrs"][saveoldcolumn] = {}
             node["node_attrs"][saveoldcolumn]["value"] = node["name"]
diff --git a/phylogenetic/workflow/snakemake_rules/usvi.smk b/phylogenetic/workflow/snakemake_rules/usvi.smk
new file mode 100644
index 0000000..5ae8b3d
--- /dev/null
+++ b/phylogenetic/workflow/snakemake_rules/usvi.smk
@@ -0,0 +1,60 @@
+rule download_usvi:
+    """Downloading USVI sequences and metadata from data.nextstrain.org"""
+    output:
+        sequences = "data/sequences_usvi.fasta.zst",
+        metadata = "data/metadata_usvi.tsv.zst"
+    params:
+        sequences_url = "https://data.nextstrain.org/files/zika/sequences_usvi.fasta.zst",
+        metadata_url = "https://data.nextstrain.org/files/zika/metadata_usvi.tsv.zst"
+    shell:
+        """
+        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
+        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
+        """
+
+rule decompress_usvi:
+    """Decompressing USVI sequences and metadata"""
+    input:
+        sequences = "data/sequences_usvi.fasta.zst",
+        metadata = "data/metadata_usvi.tsv.zst"
+    output:
+        sequences = "data/sequences_usvi.fasta",
+        metadata = "data/metadata_usvi.tsv"
+    shell:
+        """
+        zstd -d -c {input.sequences} > {output.sequences}
+        zstd -d -c {input.metadata} > {output.metadata}
+        """
+
+rule append_usvi:
+    """Appending USVI sequences and metadata to the main dataset
+
+    Sequences: the two FASTA files are concatenated.
+    Metadata: the main table gains a url column (NCBI nuccore link built from
+    genbank_accession) and an accession column (copy of genbank_accession),
+    then the USVI table is concatenated onto it with csvtk concat.
+    """
+    input:
+        sequences = "data/sequences.fasta",
+        metadata = "data/metadata.tsv",
+        usvi_sequences = "data/sequences_usvi.fasta",
+        usvi_metadata = "data/metadata_usvi.tsv"
+    output:
+        sequences = "data/sequences_all.fasta",
+        metadata = "data/metadata_all.tsv"
+    # awk 1 prints every input line with a trailing newline, so a main FASTA
+    # that lacks a final newline cannot fuse its last line with the next header.
+    shell:
+        """
+        awk 1 {input.sequences} {input.usvi_sequences} > {output.sequences}
+
+        csvtk mutate2 -tl \
+            -n url \
+            -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + $genbank_accession' \
+            {input.metadata} \
+        | csvtk mutate2 -tl \
+            -n accession \
+            -e '$genbank_accession' \
+        | csvtk concat -tl - {input.usvi_metadata} \
+        > {output.metadata}
+        """