Merge pull request #20 from nextstrain/add-N450-tree

Make tree for 450bp of the N gene ("N450")
nextstrain · Apr 1, 2024 · 9c1fea2 · 9c1fea2
2 parents 0855c99 + bf83a42
commit 9c1fea2
Show file tree

Hide file tree

Showing 13 changed files with 222 additions and 49 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,4 +1,5 @@
 # CHANGELOG
+* 1 April 2024: Create a tree using the 450 nucleotides encoding the carboxyl-terminal 150 amino acids of the nucleoprotein (N450), which is highly represented on NCBI for measles. [PR #20](https://github.com/nextstrain/measles/pull/20)
 * 15 March 2024: Connect ingest and phylogenetic workflows to follow the pathogen-repo-guide by uploading ingest output to S3, downloading ingest output from S3 to phylogenetic directory, using "accession" column as the ID column, and using a color scheme that matches the new region name format. [PR #19](https://github.com/nextstrain/measles/pull/19)
 * 1 March 2024: Add phylogenetic directory to follow the pathogen-repo-guide, and update the CI workflow to match the new file structure. [PR #18](https://github.com/nextstrain/measles/pull/18)
 * 14 February 2024: Add ingest directory from pathogen-repo-guide and make measles-specific modifications. [PR #10](https://github.com/nextstrain/measles/pull/10)

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
@@ -1,10 +1,13 @@
+genes = ['N450', 'genome']
+
 configfile: "defaults/config.yaml" 
 
 rule all:
     input:
-        auspice_json = "auspice/measles.json",
+        auspice_json = expand("auspice/measles_{gene}.json", gene=genes)
 
 include: "rules/prepare_sequences.smk"
+include: "rules/prepare_sequences_N450.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
 include: "rules/export.smk"

diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json
@@ -16,11 +16,6 @@
       "title": "Date",
       "type": "continuous"
     },
-    {
-      "key": "author",
-      "title": "Author",
-      "type": "categorical"
-    },
     {
       "key": "country",
       "title": "Country",
@@ -43,5 +38,8 @@
     "country",
     "region",
     "author"
+  ],
+  "metadata_columns": [
+    "author"
   ]
 }
diff --git a/phylogenetic/defaults/auspice_config_N450.json b/phylogenetic/defaults/auspice_config_N450.json
@@ -0,0 +1,45 @@
+{
+  "title": "Real-time tracking of measles virus evolution",
+  "maintainers": [
+    {"name": "Kim Andrews", "url": "https://bedford.io/team/kim-andrews/"},
+    {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
+  ],
+  "build_url": "https://github.com/nextstrain/measles",
+  "colorings": [
+    {
+      "key": "gt",
+      "title": "Genotype",
+      "type": "categorical"
+    },
+    {
+      "key": "num_date",
+      "title": "Date",
+      "type": "continuous"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "country",
+    "region"
+  ],
+  "display_defaults": {
+    "map_triplicate": true
+  },
+  "filters": [
+    "country",
+    "region",
+    "author"
+  ],
+  "metadata_columns": [
+    "author"
+  ]
+}
diff --git a/phylogenetic/defaults/colors.tsv b/phylogenetic/defaults/colors.tsv
@@ -4,23 +4,3 @@ region	Africa	#8ABB6A
 region	Europe	#BEBB48
 region	South America	#E29E39
 region	North America	#E2562B
-
-country	india	#511EA8
-country	china	#4333BE
-country	vietnam	#3F4ECB
-country	south korea	#4169CF
-country	japan	#4682C9
-country	australia	#4F96BB
-country	new zealand	#5AA5A8
-country	russia	#68AF92
-country	gambia	#78B77D
-country	sudan	#8BBB6A
-country	morocco	#9EBE59
-country	italy	#B3BD4D
-country	germany	#C5B945
-country	france	#D5B03F
-country	netherlands	#E0A23A
-country	united kingdom	#E68D36
-country	brazil	#E67231
-country	usa	#E1502A
-country	canada	#DC2F24
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
@@ -2,13 +2,21 @@ strain_id_field: "accession"
 files:
     exclude: "defaults/dropped_strains.txt"
     reference: "defaults/measles_reference.gb"
+    reference_N450: "defaults/measles_reference_N450.gb"
+    reference_N450_fasta: "defaults/measles_reference_N450.fasta"
     colors: "defaults/colors.tsv"
     auspice_config: "defaults/auspice_config.json"
+    auspice_config_N450: "defaults/auspice_config_N450.json"
 filter: 
     group_by: "country year month"
     sequences_per_group: 20
     min_date: 1950
     min_length: 5000
+filter_N450:
+    group_by: "country year"
+    subsample_max_sequences: 3000
+    min_date: 1950
+    min_length: 400
 refine:
     coalescent: "opt"
     date_inference: "marginal"

diff --git a/phylogenetic/defaults/measles_reference_N450.fasta b/phylogenetic/defaults/measles_reference_N450.fasta
@@ -0,0 +1,8 @@
+>lcl|NC_001498.1_cds_NP_056918.1_1 [gene=N] [locus_tag=MeVgp1] [db_xref=GeneID:1489804] [protein=nucleocapsid protein] [protein_id=NP_056918.1] [location=1233..1682] [gbkey=CDS]
+GTCAGTTCCACATTGGCATCCGAACTCGGTATCACTGCCGAGGATGCAAGGCTTGTTTCAGAGAT
+TGCAATGCATACTACTGAGGACAGGATCAGTAGAGCGGTCGGACCCAGACAAGCCCAAGTGTCATTTCTA
+CACGGTGATCAAAGTGAGAATGAGCTACCAGGATTGGGGGGCAAGGAAGATAGGAGGGTCAAACAGGGTC
+GGGGAGAAGCCAGGGAGAGCTACAGAGAAACCGGGTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCC
+TCCAACCAGCATGCCCCTAGACATTGACACTGCATCGGAGTCAGGCCAAGATCCGCAGGACAGTCGAAGG
+TCAGCTGACGCCCTGCTCAGGCTGCAAGCCATGGCAGGAATCTTGGAAGAACAAGGCTCAGACACGGACA
+CCCCTAGGGTATACAATGACAGAGATCTTCTAGAC
diff --git a/phylogenetic/defaults/measles_reference_N450.gb b/phylogenetic/defaults/measles_reference_N450.gb
@@ -0,0 +1,69 @@
+LOCUS       NC_001498                450 bp    cRNA    linear   VRL 13-AUG-2018
+DEFINITION  Measles virus, complete genome.
+ACCESSION   NC_001498 REGION: 1233..1682
+VERSION     NC_001498.1
+DBLINK      Project: 15025
+            BioProject: PRJNA485481
+KEYWORDS    RefSeq.
+SOURCE      Measles morbillivirus
+  ORGANISM  Measles morbillivirus
+            Viruses; Riboviria; Orthornavirae; Negarnaviricota;
+            Haploviricotina; Monjiviricetes; Mononegavirales; Paramyxoviridae;
+            Orthoparamyxovirinae; Morbillivirus; Morbillivirus hominis.
+REFERENCE   1  (sites)
+  AUTHORS   Rima,B.K. and Duprex,W.P.
+  TITLE     The measles virus replication cycle
+  JOURNAL   Curr. Top. Microbiol. Immunol. 329, 77-102 (2009)
+   PUBMED   19198563
+REFERENCE   2
+  AUTHORS   Takeuchi,K., Miyajima,N., Kobune,F. and Tashiro,M.
+  TITLE     Comparative nucleotide sequence analyses of the entire genomes of
+            B95a cell-isolated and vero cell-isolated measles viruses from the
+            same patient
+  JOURNAL   Virus Genes 20 (3), 253-257 (2000)
+   PUBMED   10949953
+REFERENCE   3  (bases 1 to 450)
+  CONSRTM   NCBI Genome Project
+  TITLE     Direct Submission
+  JOURNAL   Submitted (01-AUG-2000) National Center for Biotechnology
+            Information, NIH, Bethesda, MD 20894, USA
+REFERENCE   4  (bases 1 to 450)
+  AUTHORS   Takeuchi,K., Tanabayashi,K. and Tashiro,M.
+  TITLE     Direct Submission
+  JOURNAL   Submitted (10-JUL-1998) Kaoru Takeuchi, National Institute of
+            Infectious Diseases, Viral Disease and Vaccine Contorol; 4-7-1
+            Gakuen, Musashi-murayama, Tokyo 208-0011, Japan
+            (E-mail:[email protected], Tel:81-42-561-0771(ex.530),
+            Fax:81-42-567-5631)
+COMMENT     REVIEWED REFSEQ: This record has been curated by NCBI staff. The
+            reference sequence was derived from AB016162.
+            Sequence updated (21-Jul-1998)
+            Sequence updated (11-Dec-1998).
+            COMPLETENESS: full length.
+FEATURES             Location/Qualifiers
+     source          1..450
+                     /organism="Measles morbillivirus"
+                     /mol_type="viral cRNA"
+                     /strain="Ichinose-B95a"
+                     /db_xref="taxon:11234"
+     CDS             <1..>450
+                     /gene="N"
+                     /codon_start=1
+                     /product="nucleocapsid protein"
+                     /protein_id="NP_056918.1"
+                     /db_xref="GeneID:1489804"
+                     /translation="VSSTLASELGITAEDAR
+                     LVSEIAMHTTEDRISRAVGPRQAQVSFLHGDQSENELPGLGGKEDRRVKQGRGEARES
+                     YRETGSSRASDARAAHPPTSMPLDIDTASESGQDPQDSRRSADALLRLQAMAGILEEQ
+                     GSDTDTPRVYNDRDLLD"
+ORIGIN      
+        1 gtcagttcca cattggcatc cgaactcggt atcactgccg aggatgcaag gcttgtttca
+       61 gagattgcaa tgcatactac tgaggacagg atcagtagag cggtcggacc cagacaagcc
+      121 caagtgtcat ttctacacgg tgatcaaagt gagaatgagc taccaggatt ggggggcaag
+      181 gaagatagga gggtcaaaca gggtcgggga gaagccaggg agagctacag agaaaccggg
+      241 tccagcagag caagtgatgc gagagctgcc catcctccaa ccagcatgcc cctagacatt
+      301 gacactgcat cggagtcagg ccaagatccg caggacagtc gaaggtcagc tgacgccctg
+      361 ctcaggctgc aagccatggc aggaatcttg gaagaacaag gctcagacac ggacacccct
+      421 agggtataca atgacagaga tcttctagac
+//
+
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
@@ -8,10 +8,10 @@ See Augur's usage docs for these commands for more details.
 rule ancestral:
     """Reconstructing ancestral sequences and mutations"""
     input:
-        tree = "results/tree.nwk",
-        alignment = "results/aligned.fasta"
+        tree = "results/{gene}/tree.nwk",
+        alignment = "results/{gene}/aligned.fasta"
     output:
-        node_data = "results/nt_muts.json"
+        node_data = "results/{gene}/nt_muts.json"
     params:
         inference = config["ancestral"]["inference"]
     shell:
@@ -26,11 +26,11 @@ rule ancestral:
 rule translate:
     """Translating amino acid sequences"""
     input:
-        tree = "results/tree.nwk",
-        node_data = "results/nt_muts.json",
-        reference = config["files"]["reference"]
+        tree = "results/{gene}/tree.nwk",
+        node_data = "results/{gene}/nt_muts.json",
+        reference = lambda wildcard: "defaults/measles_reference.gb" if wildcard.gene in ["genome"] else "defaults/measles_reference_{gene}.gb"
     output:
-        node_data = "results/aa_muts.json"
+        node_data = "results/{gene}/aa_muts.json"
     shell:
         """
         augur translate \

diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
@@ -7,9 +7,9 @@ See Augur's usage docs for these commands for more details.
 rule tree:
     """Building tree"""
     input:
-        alignment = "results/aligned.fasta"
+        alignment = "results/{gene}/aligned.fasta"
     output:
-        tree = "results/tree_raw.nwk"
+        tree = "results/{gene}/tree_raw.nwk"
     shell:
         """
         augur tree \
@@ -26,12 +26,12 @@ rule refine:
       - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
     """
     input:
-        tree = "results/tree_raw.nwk",
-        alignment = "results/aligned.fasta",
+        tree = "results/{gene}/tree_raw.nwk",
+        alignment = "results/{gene}/aligned.fasta",
         metadata = "data/metadata.tsv"
     output:
-        tree = "results/tree.nwk",
-        node_data = "results/branch_lengths.json"
+        tree = "results/{gene}/tree.nwk",
+        node_data = "results/{gene}/branch_lengths.json"
     params:
         coalescent = config["refine"]["coalescent"],
         date_inference = config["refine"]["date_inference"],
@@ -50,6 +50,7 @@ rule refine:
             --coalescent {params.coalescent} \
             --date-confidence \
             --date-inference {params.date_inference} \
-            --clock-filter-iqd {params.clock_filter_iqd} 
+            --clock-filter-iqd {params.clock_filter_iqd} \
+            --stochastic-resolve
         """
 
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
@@ -8,15 +8,17 @@ See Augur's usage docs for these commands for more details.
 rule export:
     """Exporting data files for for auspice"""
     input:
-        tree = "results/tree.nwk",
+        tree = "results/{gene}/tree.nwk",
         metadata = "data/metadata.tsv",
-        branch_lengths = "results/branch_lengths.json",
-        nt_muts = "results/nt_muts.json",
-        aa_muts = "results/aa_muts.json",
+        branch_lengths = "results/{gene}/branch_lengths.json",
+        nt_muts = "results/{gene}/nt_muts.json",
+        aa_muts = "results/{gene}/aa_muts.json",
         colors = config["files"]["colors"],
-        auspice_config = config["files"]["auspice_config"]
+        auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json"
+
     output:
-        auspice_json = rules.all.input.auspice_json
+        auspice_json = "auspice/measles_{gene}.json",
+        root_sequence = "auspice/measles_{gene}_root-sequence.json"
     params:
         strain_id = config["strain_id_field"]
     shell:

diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
@@ -44,7 +44,7 @@ rule filter:
         metadata = "data/metadata.tsv",
         exclude = config["files"]["exclude"]
     output:
-        sequences = "results/filtered.fasta"
+        sequences = "results/genome/filtered.fasta"
     params:
         group_by = config["filter"]["group_by"],
         sequences_per_group = config["filter"]["sequences_per_group"],
@@ -71,10 +71,10 @@ rule align:
       - filling gaps with N
     """
     input:
-        sequences = "results/filtered.fasta",
+        sequences = "results/genome/filtered.fasta",
         reference = config["files"]["reference"]
     output:
-        alignment = "results/aligned.fasta"
+        alignment = "results/genome/aligned.fasta"
     shell:
         """
         augur align \

diff --git a/phylogenetic/rules/prepare_sequences_N450.smk b/phylogenetic/rules/prepare_sequences_N450.smk
@@ -0,0 +1,58 @@
+"""
+This part of the workflow prepares sequences for constructing the phylogenetic tree for 450bp of the N gene.
+
+See Augur's usage docs for these commands for more details.
+"""
+
+rule align_and_extract_N450:
+    input:
+        sequences = "data/sequences.fasta",
+        reference = config["files"]["reference_N450_fasta"]
+    output:
+        sequences = "results/N450/sequences.fasta"
+    params:
+        min_length = config['filter_N450']['min_length']
+    shell:
+        """
+        nextclade run \
+           -j 1 \
+           --input-ref {input.reference} \
+           --output-fasta {output.sequences} \
+           --min-seed-cover 0.01 \
+           --min-length {params.min_length} \
+           --silent \
+           {input.sequences}
+        """
+rule filter_N450:
+    """
+    Filtering to
+      - {params.sequences_per_group} sequence(s) per {params.group_by!s}
+      - excluding strains in {input.exclude}
+      - minimum genome length of {params.min_length}
+      - excluding strains with missing region, country or date metadata
+    """
+    input:
+        sequences = "results/N450/sequences.fasta",
+        metadata = "data/metadata.tsv",
+        exclude = config["files"]["exclude"]
+    output:
+        sequences = "results/N450/aligned.fasta"
+    params:
+        group_by = config['filter_N450']['group_by'],
+        subsample_max_sequences = config["filter_N450"]["subsample_max_sequences"],
+        min_date = config["filter_N450"]["min_date"],
+        min_length = config['filter_N450']['min_length'],
+        strain_id = config["strain_id_field"]
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --exclude {input.exclude} \
+            --output {output.sequences} \
+            --group-by {params.group_by} \
+            --subsample-max-sequences {params.subsample_max_sequences} \
+            --min-date {params.min_date} \
+            --min-length {params.min_length}
+        """