Skip to content

Commit

Permalink
[do not merge] example of augur subsample
Browse files Browse the repository at this point in the history
  • Loading branch information
jameshadfield committed Feb 25, 2024
1 parent c710ca1 commit a14e3e4
Show file tree
Hide file tree
Showing 6 changed files with 234 additions and 16 deletions.
181 changes: 181 additions & 0 deletions phylogenetic/config/reference.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
>KX369547
GAATCAGACTGCGACAGTTCGAGTTTGAAGCGAAAGCTAGCAACAGTATCAACAGGTTTT
ATTTGGATTTGGAAACGAGAGTTTCTGGTCATGAAAAACCCAAAAAAGAAATCCGGAGGA
TTCCGGATTGTCAATATGCTAAAACGCGGAGTAGCCCGTGTGAGCCCCTTTGGGGGCTTG
AAGAGGCTGCCAGCCGGACTTCTGCTGGGTCATGGGCCCATCAGGATGGTCTTGGCGATT
CTAGCCTTTTTGAGATTCACGGCAATCAAGCCATCACTGGGTCTCATCAATAGATGGGGT
TCAGTGGGGAAAAAAGAGGCTATGGAAATAATAAAGAAGTTCAAGAAAGATCTGGCTGCC
ATGCTGAGAATAATCAATGCTAGGAAGGAGAAGAAGAGACGAGGCGCAGATACTAGTGTC
GGAATTGTTGGCCTCCTGCTGACCACAGCTATGGCAGCGGAGGTCACTAGACGTGGGAGT
GCATACTATATGTACTTGGACAGAAACGATGCTGGGGAGGCCATATCTTTTCCAACCACA
TTGGGGATGAATAAGTGTTATATACAGATCATGGATCTTGGACACATGTGTGATGCCACC
ATGAGCTATGAATGCCCTATGCTGGATGAGGGGGTGGAACCAGATGACGTCGATTGTTGG
TGCAACACGACGTCAACTTGGGTTGTGTACGGAACCTGCCATCACAAAAAAGGTGAAGCA
CGGAGATCTAGAAGAGCTGTGACGCTCCCCTCCCATTCCACTAGAAAGCTGCAAACGCGG
TCGCAAACCTGGTTGGAATCAAGAGAATACACAAAGCACTTGATTAGAGTCGAAAATTGG
ATATTCAGGAACCCTGGCCTCGCGTTAGCAGCAGCTGCCATCGCTTGGCTTTTGGGAAGC
TCAACGAGCCAAAAAGTCATATACTTGGTCATGATACTGCTGATTGCCCCGGCATACAGC
ATCAGGTGCATAGGAGTCAGCAATAGGGACTTTGTGGAAGGTATGTCAGGTGGGACTTGG
GTTGATGTTGTCTTGGAACATGGAGGTTGTGTCACCGTAATGGCACAGGACAAACCGACT
GTCGACATAGAGCTGGTTACAACAACAGTCAGCAACATGGCGGAGGTAAGATCCTACTGC
TATGAGGCATCAATATCAGACATGGCTTCGGACAGCCGCTGCCCAACACAAGGTGAAGCC
TACCTTGACAAGCAATCAGACACTCAATATGTCTGCAAAAGAACGTTAGTGGACAGAGGC
TGGGGAAATGGATGTGGACTTTTTGGCAAAGGGAGCCTGGTGACATGCGCTAAGTTTGCA
TGCTCCAAGAAAATGACCGGGAAGAGCATCCAGCCAGAGAATCTGGAGTACCGGATAATG
CTGTCAGTTCATGGCTCCCAGCACAGTGGGATGATCGTTAATGACACAGGACATGAAACT
GATGAGAATAGAGCGAAGGTTGAGATAACGCCCAATTCACCAAGAGCCGAAGCCACCCTG
GGGGGTTTTGGAAGCCTAGGACTTGATTGTGAACCGAGGACAGGCCTTGACTTTTCAGAT
TTGTATTACTTGACTATGAATAACAAGCACTGGTTGGTTCACAAGGAGTGGTTCCACGAC
ATTCCATTACCTTGGCACGCTGGGGCAGACACCGGAACTCCACACTGGAACAACAAAGAA
GCACTGGTAGAGTTCAAGGACGCACATGCCAAAAGGCAAACTGTCGTGGTTCTAGGGAGT
CAAGAAGGAGCAGTTCACACGGCCCTTGCTGGAGCTCTGGAGGCTGAGATGGATGGTGCA
AAGGGAAGGCTGTCCTCTGGCCACTTGAAATGTCGCCTGAAAATGGATAAACTTAGATTG
AAGGGCGTGTCATACTCCTTGTGTACCGCAGCGTTCACATTCACCAAGATCCCGGCTGAA
ACACTGCACGGGACAGTCACAGTGGAGGTACAGTACGCAGGGACAGATGGACCTTGCAAG
GTTCCAGCTCAGATGGCGGTGGACATGCAAACTCTGACCCCAGTTGGGAGGTTGATAACC
GCTAACCCCGTAATCACTGAAAGCACTGAGAACTCTAAGATGATGCTGGAACTTGATCCA
CCATTTGGGGACTCTTACATTGTCATAGGAGTCGGGGAGAAGAAGATCACCCACCACTGG
CACAGGAGTGGCAGCACCATTGGAAAAGCATTTGAAGCCACTGTGAGAGGTGCCAAGAGA
ATGGCAGTCTTGGGAGACACAGCCTGGGACTTTGGATCAGTTGGAGGCGCTCTCAACTCA
TTGGGCAAGGGCATCCATCAAATTTTTGGAGCAGCTTTCAAATCATTGTTTGGAGGAATG
TCCTGGTTCTCACAAATTCTCATTGGAACGTTGCTGATGTGGTTGGGTCTGAACACAAAG
AATGGATCTATTTCCCTTATGTGCTTGGCCTTAGGGGGAGTGTTGATCTTCTTATCCACA
GCCGTCTCTGCTGATGTGGGGTGCTCGGTGGACTTCTCAAAGAAGGAGACGAGATGCGGT
ACAGGGGTGTTCGTCTATAACGACGTTGAAGCCTGGAGGGACAGGTACAAGTACCATCCT
GACTCCCCCCGTAGATTGGCAGCAGCAGTCAAGCAAGCCTGGGAAGATGGTATCTGTGGG
ATCTCCTCTGTTTCAAGAATGGAAAACATCATGTGGAGATCAGTAGAAGGGGAGCTCAAC
GCAATCCTGGAAGAGAATGGAGTTCAACTGACGGTCGTTGTGGGATCTGTAAAAAACCCC
ATGTGGAGAGGTCCACAGAGATTGCCCGTGCCTGTGAACGAGCTGCCCCACGGCTGGAAG
GCTTGGGGGAAATCGTACTTCGTCAGAGCAGCAAAGACAAATAACAGCTTTGTCGTGGAT
GGTGACACACTGAAGGAATGCCCACTCGAACATAGAGCATGGAACAGCTTTCTTGTGGAG
GATCATGGGTTCGGGGTATTTCACACTAGTGTCTGGCTCAAGGTTAGAGAAGATTATTCA
TTAGAGTGTGATCCAGCCGTTATTGGAACAGCTGTTAAGGGAAAGGAGGCTGTACACAGT
GATCTAGGCTACTGGATTGAGAGTGAGAAGAATGACACATGGAGGCTGAAGAGGGCCCAT
CTGATCGAGATGAAAACATGTGAATGGCCAAAGTCCCACACATTGTGGACAGATGGAATA
GAAGAGAGTGATCTGATCATACCCAAGTCTTTAGCTGGGCCACTCAGCCATCACAATACC
AGAGAGGGCTACAGGACCCAAATGAAAGGGCCATGGCACAGTGAAGAGCTTGAAATTCGG
TTTGAGGAATGCCCAGGCACTAAGGTCCACGTGGAGGAAACATGTGGAACAAGAGGACCA
TCTCTGAGATCAACCACTGCAAGCGGAAGGGTGATCGAGGAATGGTGCTGCAGGGAGTGC
ACAATGCCCCCACTGTCGTTCCGGGCTAAAGATGGCTGTTGGTATGGAATGGAGATAAGG
CCCAGGAAAGAACCAGAAAGTAACTTAGTAAGGTCAATGGTGACTGCAGGATCAACTGAT
CACATGGATCACTTCTCCCTTGGAGTGCTTGTGATTCTGCTCATGGTGCAGGAAGGGCTG
AAGAAGAGAATGACCACAAAGATCATCATAAGCACATCAATGGCAGTGCTGGTAGCTATG
ATCCTGGGAGGATTTTCAATGAGTGACCTGGCTAAGCTTGCAATTTTGATGGGTGCCACC
TTCGCGGAAATGAACACTGGAGGAGATGTAGCTCATCTGGCGCTGATAGCGGCATTCAAA
GTCAGACCAGCGTTGCTGGTATCTTTCATCTTCAGAGCTAATTGGACACCCCGTGAAAGC
ATGCTGCTGGCCTTGGCCTCGTGTCTTTTGCAAACTGCGATCTCCGCCTTGGAAGGCGAC
CTGATGGTTCTCATCAATGGTTTTGCTTTGGCCTGGTTGGCAATACGAGCGATGGTTGTT
CCACGCACTGATAACATCACCTTGGCAATCCTGGCTGCTCTGACACCACTGGCCCGGGGC
ACACTGCTTGTGGCGTGGAGAGCAGGCCTTGCTACTTGCGGGGGGTTTATGCTCCTCTCT
CTGAAGGGAAAAGGCAGTGTGAAGAAGAACTTACCATTTGTCATGGCCCTGGGACTAACC
GCTGTGAGGCTGGTCGACCCCATCAACGTGGTGGGACTGCTGTTGCTCACAAGGAGTGGG
AAGCGGAGCTGGCCCCCTAGCGAAGTACTCACAGCTGTTGGCCTGATATGCGCATTGGCT
GGAGGGTTCGCCAAGGCAGATATAGAGATGGCTGGGCCCATGGCCGCGGTCGGTCTGCTA
ATTGTCAGTTACGTGGTCTCAGGAAAGAGTGTGGACATGTACATTGAAAGAGCAGGTGAC
ATCACATGGGAAAAAGATGCGGAAGTCACTGGAAACAGTCCCCGGCTCGATGTGGCGCTA
GATGAGAGTGGTGATTTCTCCCTGGTGGAGGATGACGGTCCCCCCATGAGAGAGATCATA
CTCAAGGTGGTCCTGATGACCATCTGTGGCATGAACCCAATAGCCATACCCTTTGCAGCT
GGAGCGTGGTACGTATACGTGAAGACTGGAAAAAGGAGTGGTGCTCTATGGGATGTGCCT
GCTCCCAAGGAAGTAAAAAAGGGGGAGACCACAGATGGAGTGTACAGAGTAATGACTCGT
AGACTGCTAGGTTCAACACAAGTTGGAGTGGGAGTTATGCAAGAGGGGGTCTTTCACACT
ATGTGGCACGTCACAAAAGGATCCGCGCTGAGAAGCGGTGAAGGGAGACTTGATCCATAC
TGGGGAGATGTCAAGCAGGATCTGGTGTCATACTGTGGTCCATGGAAGCTAGATGCCGCC
TGGGACGGGCACAGCGAGGTGCAGCTCTTGGCCGTGCCCCCCGGAGAGAGAGCGAGGAAC
ATCCAGACTCTGCCCGGAATATTTAAGACAAAGGATGGGGACATTGGAGCGGTTGCGCTG
GATTACCCAGCAGGAACTTCAGGATCTCCAATCCTAGACAAGTGTGGGAGAGTGATAGGA
CTTTATGGCAATGGGGTCGTGATCAAAAATGGGAGTTATGTTAGTGCCATCACCCAAGGG
AGGAGGGAGGAAGAGACTCCTGTTGAGTGCTTCGAGCCTTCGATGCTGAAGAAGAAGCAG
CTAACTGTCTTAGACTTGCATCCTGGAGCTGGGAAAACCAGGAGAGTTCTTCCTGAAATA
GTCCGTGAAGCCATAAAAACAAGACTCCGTACTGTGATCTTAGCTCCAACCAGGGTTGTC
GCTGCTGAAATGGAGGAAGCCCTTAGAGGGCTTCCAGTGCGTTATATGACAACAGCAGTC
AATGTCACCCACTCTGGAACAGAAATCGTCGACTTAATGTGCCATGCCACCTTCACTTCA
CGTCTACTACAGCCAATCAGAGTCCCCAACTATAATCTGTATATTATGGATGAGGCCCAC
TTCACAGATCCCTCAAGTATAGCAGCAAGAGGATACATTTCAACAAGGGTTGAGATGGGC
GAGGCGGCTGCCATCTTCATGACCGCCACGCCACCAGGAACCCGTGACGCATTTCCGGAC
TCCAACTCACCAATTATGGACACCGAAGTGGAAGTCCCAGAGAGAGCCTGGAGCTCAGGC
TTTGATTGGGTGACGGATCATTCTGGAAAAACAGTTTGGTTTGTTCCAAGCGTGAGGAAC
GGCAATGAGATCGCAGCTTGTCTGACAAAGGCTGGAAAACGGGTCATACAGCTCAGCAGA
AAGACTTTTGAGACAGAGTTCCAGAAAACAAAACATCAAGAGTGGGACTTTGTCGTGACA
ACTGACATTTCAGAGATGGGCGCCAACTTTAAAGCTGACCGTGTCATAGATTCCAGGAGA
TGCCTAAAGCCGGTCATACTTGATGGCGAGAGAGTCATTCTGGCTGGACCCATGCCTGTC
ACACATGCCAGCGCTGCCCAGAGGAGGGGGCGCATAGGCAGGAATCCCAACAAACCTGGA
GATGAGTATCTGTATGGAGGTGGGTGCGCAGAGACTGACGAAGACCATGCACACTGGCTT
GAAGCAAGAATGCTCCTTGACAATATTTACCTCCAAGATGGCCTCATAGCCTCGCTCTAT
CGACCTGAGGCCGACAAAGTAGCAGCCATTGAGGGAGAGTTCAAGCTTAGGACGGAGCAA
AGGAAGACCTTTGTGGAACTCATGAAAAGAGGAGATCTTCCTGTTTGGCTGGCCTATCAG
GTTGCATCTGCCGGAATAACCTACACAGATAGAAGATGGTGCTTTGATGGCACGACCAAC
AACACCATAATGGAAGACAGTGTGCCGGCAGAGGTGTGGACCAGACACGGAGAGAAAAGA
GTGCTCAAACCGAGGTGGATGGACGCCAGAGTTTGTTCAGATCATGCGGCCCTGAAGTCA
TTCAAGGAGTTTGCCGCTGGGAAAAGAGGAGCGGCTTTTGGAGTGATGGAAGCCCTGGGA
ACACTGCCAGGACACATGACAGAGAGATTCCAGGAAGCCATTGACAACCTCGCTGTGCTC
ATGCGGGCAGAGACTGGAAGCAGGCCTTACAAAGCCGCGGCGGCCCAATTGCCGGAGACC
CTAGAGACCATTATGCTTTTGGGGTTGCTGGGAACAGTCTCGCTGGGAATCTTTTTCGTC
TTGATGAGGAACAAGGGCATAGGGAAGATGGGCTTTGGAATGGTGACTCTTGGGGCCAGC
GCATGGCTCATGTGGCTCTCGGAAATTGAGCCAGCCAGAATTGCATGTGTCCTCATTGTT
GTGTTCCTATTGCTGGTGGTGCTCATACCTGAGCCAGAAAAGCAAAGATCTCCCCAGGAC
AACCAAATGGCAATCATCATCATGGTAGCAGTAGGTCTTCTGGGCTTGATTACCGCCAAT
GAACTCGGATGGTTGGAGAGAACAAAGAGTGACCTAAGCCATCTAATGGGAAGGAGAGAG
GAGGGGGCAACCATAGGATTCTCAATGGACATTGACCTGCGGCCAGCCTCAGCTTGGGCC
ATCTATGCTGCCTTGACAACTTTCATTACCCCAGCCGTCCAACATGCAGTGACCACTTCA
TACAACAACTACTCCTTAATGGCGATGGCCACGCAAGCTGGAGTGTTGTTTGGTATGGGC
AAAGGGATGCCATTCTACGCATGGGACTTTGGAGTCCCGCTGCTAATGATAGGTTGCTAC
TCACAATTAACACCCCTGACCCTAATAGTGGCCATCATTTTGCTCGTGGCGCACTACATG
TACTTGATCCCAGGGCTGCAGGCAGCAGCTGCGCGTGCTGCCCAGAAGAGAACGGCAGCT
GGCATCATGAAGAACCCTGTTGTGGATGGAATAGTGGTGACTGACATTGACACAATGACA
ATTGACCCCCAAGTGGAGAAAAAGATGGGACAGGTGCTACTCATAGCAGTAGCCGTCTCC
AGCGCCATACTGTCGCGGACCGCCTGGGGGTGGGGGGAGGCTGGGGCCCTGATCACAGCT
GCAACTTCCACTTTGTGGGAAGGCTCTCCGAACAAGTACTGGAACTCCTCTACAGCCACT
TCACTGTGTAACATTTTTAGGGGAAGTTACTTGGCTGGAGCTTCTCTAATCTACACAGTA
ACAAGAAACGCTGGCTTGGTCAAGAGACGTGGGGGTGGAACAGGAGAGACCCTGGGAGAG
AAATGGAAGGCCCGCTTGAACCAGATGTCGGCCCTGGAGTTCTACTCCTACAAAAAGTCA
GGCATCACCGAGGTGTGCAGAGAAGAGGCCCGCCGCGCCCTCAAGGACGGTGTGGCAACG
GGAGGCCATGCTGTGTCCCGAGGAAGTGCAAAGCTGAGATGGTTGGTGGAGCGGGGATAC
CTGCAGCCCTATGGAAAGGTCATTGATCTTGGATGTGGCAGAGGGGGCTGGAGTTACTAC
GCCGCCACCATCCGCAAAGTTCAAGAAGTGAAAGGATACACAAAAGGAGGCCCTGGTCAT
GAAGAACCCATGTTGGTGCAAAGCTATGGGTGGAACATAGTCCGTCTTAAGAGTGGGGTG
GACGTCTTTCATATGGCGGCTGAGCCGTGTGACACGTTGCTGTGTGACATAGGTGAGTCA
TCATCTAGTCCTGAAGTGGAAGAAGCACGGACGCTCAGAGTCCTCTCCATGGTGGGGGAT
TGGCTTGAAAAAAGACCAGGAGCCTTTTGTATAAAAGTGTTGTGCCCATACACCAGCACT
ATGATGGAAACCCTGGAGCGACTGCAGCGTAGGTATGGGGGAGGACTGGTCAGAGTGCCA
CTCTCCCGCAACTCTACACATGAGATGTACTGGGTCTCTGGAGCGAAAAGCAACACCATA
AAAAGTGTGTCCACCACGAGCCAGCTCCTCTTGGGGCGCATGGACGGGCCCAGGAGGCCA
GTGAAATATGAGGAGGATGTGAATCTCGGCTCTGGCACGCGGGCTGTGGTAAGCTGCGCT
GAAGCTCCCAACATGAAGATCATTGGTAACCGCATTGAAAGGATCCGCAGTGAGCACGCG
GAAACGTGGTTCTTTGACGAGAACCACCCATATAGGACATGGGCTTACCATGGAAGCTAT
GAGGCCCCCACACAAGGGTCAGCGTCCTCTCTAATAAACGGGGTTGTCAGGCTCCTGTCA
AAACCCTGGGATGTGGTGACTGGAGTCACAGGAATAGCCATGACCGACACCACACCGTAT
GGTCAGCAAAGAGTTTTCAAGGAAAAAGTGGACACTAGGGTGCCAGACCCCCAAGAAGGC
ACTCGTCAGGTTATGAGCATGGTCTCTTCCTGGTTGTGGAAAGAGCTAGGCAAACACAAA
CGGCCACGAGTCTGTACCAAAGAAGAGTTCATCAACAAGGTTCGTAGCAATGCAGCATTA
GGGGCAATATTTGAAGAGGAAAAAGAGTGGAAGACTGCAGTGGAAGCTGTGAACGATCCA
AGGTTCTGGGCTCTAGTGGACAAGGAAAGAGAGCACCACCTGAGAGGAGAGTGCCAGAGT
TGTGTGTACAACATGATGGGAAAAAGAGAAAAGAAACAAGGGGAATTTGGAAAGGCCAAG
GGCAGCCGCGCCATCTGGTATATGTGGCTAGGGGCTAGATTTCTAGAGTTCGAAGCCCTT
GGATTCTTGAACGAGGATCACTGGATGGGGAGAGAGAACTCAGGAGGTGGTGTTGAAGGG
CTGGGATTACAAAGACTCGGATATGTCCTAGAAGAGATGAGTCGCATACCAGGAGGAAGG
ATGTATGCAGATGACACTGCTGGCTGGGACACCCGCATCAGCAGGTTTGATCTGGAGAAT
GAAGCTCTAATCACCAACCAAATGGAGAAAGGGCACAGGGCCTTGGCATTGGCCATAATC
AAGTACACATACCAAAACAAAGTGGTAAAGGTCCTTAGACCAGCTGAAAAAGGGAAGACA
GTTATGGACATTATTTCGAGACAAGACCAAAGGGGGAGCGGACAAGTTGTCACTTACGCT
CTTAACACATTTACCAACCTAGTGGTGCAACTCATTCGGAATATGGAGGCTGAGGAAGTT
CTAGAGATGCAAGACTTGTGGCTGCTGCGGAGGTCAGAGAAAGTGACCAACTGGTTGCAG
AGCAACGGATGGGATAGGCTCAAACGAATGGCAGTCAGTGGAGATGATTGCGTTGTGAAG
CCAATTGATGATAGGTTTGCACATGCCCTCAGGTTCTTGAATGATATGGGAAAAGTTAGG
AAGGACACACAAGAGTGGAAACCCTCAACTGGATGGGACAACTGGGAAGAAGTTCCGTTT
TGCTCCCACCACTTCAACAAGCTCCATCTCAAGGACGGGAGGTCCATTGTGGTTCCCTGC
CGCCACCAAGATGAACTGATTGGCCGGGCCCGCGTCTCTCCAGGGGCGGGATGGAGCATC
CGGGAGACTGCTTGCCTAGCAAAATCATATGCGCAAATGTGGCAGCTCCTTTATTTCCAC
AGAAGGGACCTCCGACTGATGGCCAATGCCATTTGTTCATCTGTGCCAGTTGACTGGGTT
CCAACTGGGAGAACTACCTGGTCAATCCATGGAAAGGGAGAATGGATGACCACTGAAGAC
ATGCTTGTGGTGTGGAACAGAGTGTGGATTGAGGAGAACGACCACATGGAAGACAAGACC
CCAGTTACGAAATGGACAGACATTCCCTATTTGGGAAAAAGGGAAGACTTGTGGTGTGGA
TCTCTCATAGGGCACAGACCGCGCACCACCTGGGCTGAGAACATTAAAAACACAGTCAAC
ATGGTGCGCAGGATCATAGGTGATGAAGAAAAGTACATGGACTACCTATCCACCCAAGTT
CGCTACTTGGGTGAAGAAGGGTCTACACCTGGAGTGCTGTAAGCACCAATCTTAGTGTTG
TCAGGCCTGCTAGTCAGCCACAGCTTGGGGAAAGCTGTGCAGCCTGTGACCCCCCCAGGA
GAAGCTGGGAAACCAAGCCTATAGTCAGGCCGAGAACGCCATGGCACGGAAGAAGCCATG
CTGCCTGTGAGCCCCTCAGAGGACACTGAGTCAAAAAACCCCACGCGCTTGGAGGCGCAG
GATGGGAAAAGAAGGTGGCGACCTTCCCCACCCTTCAATCTGGGGCCTGAACTGGAGATC
AGCTGTGGATCTCCAGAAGAGGGACTAGTGGTTAGAGGAGACCCCCCGGAAAACGCAAAA
CAGCATATTGACGCTGGGAAAGACCAGAGACTCCATGAGTTTCCACCACGCTGGCCGCCA
GGCACAGATCGCCGAATAGCGGCGGCCGG
18 changes: 18 additions & 0 deletions phylogenetic/config/subsampling.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Subsampling scheme consumed by the `subsample` rule via `--config`.
# Three named samples are defined (focal, contextual, background) and the
# `output` list at the bottom names the samples that make up the final set.

# Focal sample: records matching the query country=='USA', capped at 50 sequences.
# NOTE(review): the option strings look like `augur filter` arguments — confirm
# that is how `augur subsample` interprets the `filter` key.
focal:
filter: >-
--query "country=='USA'"
--subsample-max-sequences 50
# Contextual sample: chosen by priorities of type `proximity`, with the `focal`
# sample as the focus and `num_per_focal` set to 5.
# NOTE(review): presumably this picks up to 5 nearest sequences per focal
# sequence — confirm against the `augur subsample` implementation.
contextual:
priorities:
type: proximity
focus: focal
num_per_focal: 5
# Background sample: grouped by country and month, capped at 100 sequences,
# with the `contextual` and `focal` samples listed under `exclude`.
background:
filter: "--group-by country month --subsample-max-sequences 100"
exclude:
- contextual
- focal
# Samples listed for the final output.
output:
- background
- contextual
- focal
4 changes: 2 additions & 2 deletions phylogenetic/rules/annotate_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ rule ancestral:
"""Reconstructing ancestral sequences and mutations"""
input:
tree = "results/tree.nwk",
alignment = "results/aligned.fasta"
alignment = "results/subsampled.fasta"
output:
node_data = "results/nt_muts.json"
params:
Expand Down Expand Up @@ -73,7 +73,7 @@ rule traits:
"""
input:
tree = "results/tree.nwk",
metadata = "data/metadata_all.tsv"
metadata = "results/subsampled.tsv"
output:
node_data = "results/traits.json",
params:
Expand Down
6 changes: 3 additions & 3 deletions phylogenetic/rules/construct_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ See Augur's usage docs for these commands for more details.
rule tree:
"""Building tree"""
input:
alignment = "results/aligned.fasta"
alignment = "results/subsampled.fasta"
output:
tree = "results/tree_raw.nwk"
shell:
Expand All @@ -42,8 +42,8 @@ rule refine:
"""
input:
tree = "results/tree_raw.nwk",
alignment = "results/aligned.fasta",
metadata = "data/metadata_all.tsv"
alignment = "results/subsampled.fasta",
metadata = "results/subsampled.tsv"
output:
tree = "results/tree.nwk",
node_data = "results/branch_lengths.json"
Expand Down
6 changes: 3 additions & 3 deletions phylogenetic/rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ rule export:
"""Exporting data files for auspice"""
input:
tree = "results/tree.nwk",
metadata = "data/metadata_all.tsv",
metadata = "results/subsampled.tsv",
branch_lengths = "results/branch_lengths.json",
traits = "results/traits.json",
nt_muts = "results/nt_muts.json",
Expand Down Expand Up @@ -59,7 +59,7 @@ rule export:
rule final_strain_name:
input:
auspice_json="results/raw_zika.json",
metadata="data/metadata_all.tsv",
metadata="results/subsampled.tsv",
root_sequence="results/raw_zika_root-sequence.json",
output:
auspice_json="auspice/zika.json",
Expand All @@ -69,7 +69,7 @@ rule final_strain_name:
display_strain_field=config.get("display_strain_field", "strain"),
shell:
"""
python3 scripts/set_final_strain_name.py \
python scripts/set_final_strain_name.py \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--input-auspice-json {input.auspice_json} \
Expand Down
35 changes: 27 additions & 8 deletions phylogenetic/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ rule decompress:
rule filter:
"""
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
- from {params.min_date} onwards
- excluding strains in {input.exclude}
- minimum genome length of {params.min_length} (50% of Zika virus genome)
Expand All @@ -62,10 +61,9 @@ rule filter:
metadata = "data/metadata_all.tsv",
exclude = "config/dropped_strains.txt",
output:
sequences = "results/filtered.fasta"
sequences = "results/filtered.fasta",
metadata = "results/filtered.tsv"
params:
group_by = "country year month",
sequences_per_group = 40,
min_date = 2012,
min_length = 5385,
strain_id = config.get("strain_id_field", "strain"),
Expand All @@ -76,9 +74,8 @@ rule filter:
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
--min-date {params.min_date} \
--min-length {params.min_length}
"""
Expand All @@ -101,4 +98,26 @@ rule align:
--output {output.alignment} \
--fill-gaps \
--remove-reference
"""
"""

rule subsample:
"""
Subsampling the aligned sequences and filtered metadata
according to the scheme defined in {input.config}
"""
input:
# Metadata written by the `filter` rule; sequences are the aligned FASTA.
metadata = "results/filtered.tsv",
sequences = "results/aligned.fasta",
reference = "config/reference.fasta",
# YAML file describing the samples to draw (passed to `--config`).
config = "config/subsampling.yaml",
output:
# These two files are the inputs of the downstream tree/refine/export rules.
metadata = "results/subsampled.tsv",
sequences = "results/subsampled.fasta",
params:
# NOTE(review): tmpdir is a param, not a declared output, so Snakemake does
# not track or clean up this directory — confirm this is intended.
tmpdir = "results/subsampling",
# Metadata ID column name(s); falls back to "strain" when the workflow
# config has no `strain_id_field` entry.
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur subsample \
--config {input.config} \
--metadata {input.metadata} --sequences {input.sequences} --reference {input.reference} \
--output-metadata {output.metadata} --output-sequences {output.sequences} \
--tmpdir {params.tmpdir} \
--metadata-id-columns {params.strain_id}
"""

0 comments on commit a14e3e4

Please sign in to comment.