diff --git a/CHANGES.md b/CHANGES.md index 103b249..f563361 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,5 @@ # CHANGELOG +* 1 April 2024: Create a tree using the 450 nucleotides encoding the carboxyl-terminal 150 amino acids of the nucleoprotein (N450), which is highly represented on NCBI for measles. [PR #20](https://github.com/nextstrain/measles/pull/20) * 15 March 2024: Connect ingest and phylogenetic workflows to follow the pathogen-repo-guide by uploading ingest output to S3, downloading ingest output from S3 to phylogenetic directory, using "accession" column as the ID column, and using a color scheme that matches the new region name format. [PR #19](https://github.com/nextstrain/measles/pull/19) * 1 March 2024: Add phylogenetic directory to follow the pathogen-repo-guide, and update the CI workflow to match the new file structure. [PR #18](https://github.com/nextstrain/measles/pull/18) * 14 February 2024: Add ingest directory from pathogen-repo-guide and make measles-specific modifications. [PR #10](https://github.com/nextstrain/measles/pull/10) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index c1bbbd6..b4521ff 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,10 +1,13 @@ +genes = ['N450', 'genome'] + configfile: "defaults/config.yaml" rule all: input: - auspice_json = "auspice/measles.json", + auspice_json = expand("auspice/measles_{gene}.json", gene=genes) include: "rules/prepare_sequences.smk" +include: "rules/prepare_sequences_N450.smk" include: "rules/construct_phylogeny.smk" include: "rules/annotate_phylogeny.smk" include: "rules/export.smk" diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json index c010bb9..27f2484 100644 --- a/phylogenetic/defaults/auspice_config.json +++ b/phylogenetic/defaults/auspice_config.json @@ -16,11 +16,6 @@ "title": "Date", "type": "continuous" }, - { - "key": "author", - "title": "Author", - "type": "categorical" - }, { "key": "country", "title": "Country", @@ -43,5 +38,8 @@ "country", "region", "author" + ], + "metadata_columns": [ + "author" ] } diff --git a/phylogenetic/defaults/auspice_config_N450.json b/phylogenetic/defaults/auspice_config_N450.json new file mode 100644 index 0000000..5193bc0 --- /dev/null +++ b/phylogenetic/defaults/auspice_config_N450.json @@ -0,0 +1,45 @@ +{ + "title": "Real-time tracking of measles virus evolution", + "maintainers": [ + {"name": "Kim Andrews", "url": "https://bedford.io/team/kim-andrews/"}, + {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"} + ], + "build_url": "https://github.com/nextstrain/measles", + "colorings": [ + { + "key": "gt", + "title": "Genotype", + "type": "categorical" + }, + { + "key": "num_date", + "title": "Date", + "type": "continuous" + }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + } + ], + "geo_resolutions": [ + "country", + "region" + ], + "display_defaults": { + "map_triplicate": true + }, + "filters": [ + "country", + "region", + "author" + ], + "metadata_columns": [ + "author" + ] +} diff --git a/phylogenetic/defaults/colors.tsv b/phylogenetic/defaults/colors.tsv index ce960e3..55a9f8b 100644 --- a/phylogenetic/defaults/colors.tsv +++ b/phylogenetic/defaults/colors.tsv @@ -4,23 +4,3 @@ region Africa #8ABB6A region Europe #BEBB48 region South America #E29E39 region North America #E2562B - -country india #511EA8 -country china #4333BE -country vietnam #3F4ECB -country south korea #4169CF -country japan #4682C9 -country australia #4F96BB -country new zealand #5AA5A8 -country russia #68AF92 -country gambia #78B77D -country sudan #8BBB6A -country morocco #9EBE59 -country italy #B3BD4D -country germany #C5B945 -country france #D5B03F -country netherlands #E0A23A -country united kingdom #E68D36 -country brazil #E67231 -country usa #E1502A -country canada #DC2F24 diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index ac82fcc..035e5b3 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -2,13 +2,21 @@ strain_id_field: "accession" files: exclude: "defaults/dropped_strains.txt" reference: "defaults/measles_reference.gb" + reference_N450: "defaults/measles_reference_N450.gb" + reference_N450_fasta: "defaults/measles_reference_N450.fasta" colors: "defaults/colors.tsv" auspice_config: "defaults/auspice_config.json" + auspice_config_N450: "defaults/auspice_config_N450.json" filter: group_by: "country year month" sequences_per_group: 20 min_date: 1950 min_length: 5000 +filter_N450: + group_by: "country year" + subsample_max_sequences: 3000 + min_date: 1950 + min_length: 400 refine: coalescent: "opt" date_inference: "marginal" diff --git a/phylogenetic/defaults/measles_reference_N450.fasta b/phylogenetic/defaults/measles_reference_N450.fasta new file mode 100644 index 0000000..eab1954 --- /dev/null +++ b/phylogenetic/defaults/measles_reference_N450.fasta @@ -0,0 +1,8 @@ +>lcl|NC_001498.1_cds_NP_056918.1_1 [gene=N] [locus_tag=MeVgp1] [db_xref=GeneID:1489804] [protein=nucleocapsid protein] [protein_id=NP_056918.1] [location=1233..1682] [gbkey=CDS] +GTCAGTTCCACATTGGCATCCGAACTCGGTATCACTGCCGAGGATGCAAGGCTTGTTTCAGAGAT +TGCAATGCATACTACTGAGGACAGGATCAGTAGAGCGGTCGGACCCAGACAAGCCCAAGTGTCATTTCTA +CACGGTGATCAAAGTGAGAATGAGCTACCAGGATTGGGGGGCAAGGAAGATAGGAGGGTCAAACAGGGTC +GGGGAGAAGCCAGGGAGAGCTACAGAGAAACCGGGTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCC +TCCAACCAGCATGCCCCTAGACATTGACACTGCATCGGAGTCAGGCCAAGATCCGCAGGACAGTCGAAGG +TCAGCTGACGCCCTGCTCAGGCTGCAAGCCATGGCAGGAATCTTGGAAGAACAAGGCTCAGACACGGACA +CCCCTAGGGTATACAATGACAGAGATCTTCTAGAC diff --git a/phylogenetic/defaults/measles_reference_N450.gb b/phylogenetic/defaults/measles_reference_N450.gb new file mode 100644 index 0000000..277c96a --- /dev/null +++ b/phylogenetic/defaults/measles_reference_N450.gb @@ -0,0 +1,69 @@ +LOCUS NC_001498 450 bp cRNA linear VRL 13-AUG-2018 +DEFINITION Measles virus, complete genome. +ACCESSION NC_001498 REGION: 1233..1682 +VERSION NC_001498.1 +DBLINK Project: 15025 + BioProject: PRJNA485481 +KEYWORDS RefSeq. +SOURCE Measles morbillivirus + ORGANISM Measles morbillivirus + Viruses; Riboviria; Orthornavirae; Negarnaviricota; + Haploviricotina; Monjiviricetes; Mononegavirales; Paramyxoviridae; + Orthoparamyxovirinae; Morbillivirus; Morbillivirus hominis. +REFERENCE 1 (sites) + AUTHORS Rima,B.K. and Duprex,W.P. + TITLE The measles virus replication cycle + JOURNAL Curr. Top. Microbiol. Immunol. 329, 77-102 (2009) + PUBMED 19198563 +REFERENCE 2 + AUTHORS Takeuchi,K., Miyajima,N., Kobune,F. and Tashiro,M. + TITLE Comparative nucleotide sequence analyses of the entire genomes of + B95a cell-isolated and vero cell-isolated measles viruses from the + same patient + JOURNAL Virus Genes 20 (3), 253-257 (2000) + PUBMED 10949953 +REFERENCE 3 (bases 1 to 450) + CONSRTM NCBI Genome Project + TITLE Direct Submission + JOURNAL Submitted (01-AUG-2000) National Center for Biotechnology + Information, NIH, Bethesda, MD 20894, USA +REFERENCE 4 (bases 1 to 450) + AUTHORS Takeuchi,K., Tanabayashi,K. and Tashiro,M. + TITLE Direct Submission + JOURNAL Submitted (10-JUL-1998) Kaoru Takeuchi, National Institute of + Infectious Diseases, Viral Disease and Vaccine Contorol; 4-7-1 + Gakuen, Musashi-murayama, Tokyo 208-0011, Japan + (E-mail:ktake@nih.go.jp, Tel:81-42-561-0771(ex.530), + Fax:81-42-567-5631) +COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The + reference sequence was derived from AB016162. + Sequence updated (21-Jul-1998) + Sequence updated (11-Dec-1998). + COMPLETENESS: full length. +FEATURES Location/Qualifiers + source 1..450 + /organism="Measles morbillivirus" + /mol_type="viral cRNA" + /strain="Ichinose-B95a" + /db_xref="taxon:11234" + CDS <1..>450 + /gene="N" + /codon_start=1 + /product="nucleocapsid protein" + /protein_id="NP_056918.1" + /db_xref="GeneID:1489804" + /translation="VSSTLASELGITAEDAR + LVSEIAMHTTEDRISRAVGPRQAQVSFLHGDQSENELPGLGGKEDRRVKQGRGEARES + YRETGSSRASDARAAHPPTSMPLDIDTASESGQDPQDSRRSADALLRLQAMAGILEEQ + GSDTDTPRVYNDRDLLD" +ORIGIN + 1 gtcagttcca cattggcatc cgaactcggt atcactgccg aggatgcaag gcttgtttca + 61 gagattgcaa tgcatactac tgaggacagg atcagtagag cggtcggacc cagacaagcc + 121 caagtgtcat ttctacacgg tgatcaaagt gagaatgagc taccaggatt ggggggcaag + 181 gaagatagga gggtcaaaca gggtcgggga gaagccaggg agagctacag agaaaccggg + 241 tccagcagag caagtgatgc gagagctgcc catcctccaa ccagcatgcc cctagacatt + 301 gacactgcat cggagtcagg ccaagatccg caggacagtc gaaggtcagc tgacgccctg + 361 ctcaggctgc aagccatggc aggaatcttg gaagaacaag gctcagacac ggacacccct + 421 agggtataca atgacagaga tcttctagac +// + diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index 2f8eec4..61e94b8 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -8,10 +8,10 @@ See Augur's usage docs for these commands for more details. rule ancestral: """Reconstructing ancestral sequences and mutations""" input: - tree = "results/tree.nwk", - alignment = "results/aligned.fasta" + tree = "results/{gene}/tree.nwk", + alignment = "results/{gene}/aligned.fasta" output: - node_data = "results/nt_muts.json" + node_data = "results/{gene}/nt_muts.json" params: inference = config["ancestral"]["inference"] shell: @@ -26,11 +26,11 @@ rule ancestral: rule translate: """Translating amino acid sequences""" input: - tree = "results/tree.nwk", - node_data = "results/nt_muts.json", - reference = config["files"]["reference"] + tree = "results/{gene}/tree.nwk", + node_data = "results/{gene}/nt_muts.json", + reference = lambda wildcard: "defaults/measles_reference.gb" if wildcard.gene in ["genome"] else "defaults/measles_reference_{gene}.gb" output: - node_data = "results/aa_muts.json" + node_data = "results/{gene}/aa_muts.json" shell: """ augur translate \ diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index 3a5f6e8..e222800 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -7,9 +7,9 @@ See Augur's usage docs for these commands for more details. rule tree: """Building tree""" input: - alignment = "results/aligned.fasta" + alignment = "results/{gene}/aligned.fasta" output: - tree = "results/tree_raw.nwk" + tree = "results/{gene}/tree_raw.nwk" shell: """ augur tree \ @@ -26,12 +26,12 @@ rule refine: - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation """ input: - tree = "results/tree_raw.nwk", - alignment = "results/aligned.fasta", + tree = "results/{gene}/tree_raw.nwk", + alignment = "results/{gene}/aligned.fasta", metadata = "data/metadata.tsv" output: - tree = "results/tree.nwk", - node_data = "results/branch_lengths.json" + tree = "results/{gene}/tree.nwk", + node_data = "results/{gene}/branch_lengths.json" params: coalescent = config["refine"]["coalescent"], date_inference = config["refine"]["date_inference"], @@ -50,6 +50,7 @@ rule refine: --coalescent {params.coalescent} \ --date-confidence \ --date-inference {params.date_inference} \ - --clock-filter-iqd {params.clock_filter_iqd} + --clock-filter-iqd {params.clock_filter_iqd} \ + --stochastic-resolve """ \ No newline at end of file diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index c9ec2da..dcd1893 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -8,15 +8,17 @@ See Augur's usage docs for these commands for more details. rule export: """Exporting data files for for auspice""" input: - tree = "results/tree.nwk", + tree = "results/{gene}/tree.nwk", metadata = "data/metadata.tsv", - branch_lengths = "results/branch_lengths.json", - nt_muts = "results/nt_muts.json", - aa_muts = "results/aa_muts.json", + branch_lengths = "results/{gene}/branch_lengths.json", + nt_muts = "results/{gene}/nt_muts.json", + aa_muts = "results/{gene}/aa_muts.json", colors = config["files"]["colors"], - auspice_config = config["files"]["auspice_config"] + auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json" + output: - auspice_json = rules.all.input.auspice_json + auspice_json = "auspice/measles_{gene}.json", + root_sequence = "auspice/measles_{gene}_root-sequence.json" params: strain_id = config["strain_id_field"] shell: diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index af6c9c6..c333bb2 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -44,7 +44,7 @@ rule filter: metadata = "data/metadata.tsv", exclude = config["files"]["exclude"] output: - sequences = "results/filtered.fasta" + sequences = "results/genome/filtered.fasta" params: group_by = config["filter"]["group_by"], sequences_per_group = config["filter"]["sequences_per_group"], @@ -71,10 +71,10 @@ rule align: - filling gaps with N """ input: - sequences = "results/filtered.fasta", + sequences = "results/genome/filtered.fasta", reference = config["files"]["reference"] output: - alignment = "results/aligned.fasta" + alignment = "results/genome/aligned.fasta" shell: """ augur align \ diff --git a/phylogenetic/rules/prepare_sequences_N450.smk b/phylogenetic/rules/prepare_sequences_N450.smk new file mode 100644 index 0000000..68a8db0 --- /dev/null +++ b/phylogenetic/rules/prepare_sequences_N450.smk @@ -0,0 +1,58 @@ +""" +This part of the workflow prepares sequences for constructing the phylogenetic tree for 450bp of the N gene. + +See Augur's usage docs for these commands for more details. +""" + +rule align_and_extract_N450: + input: + sequences = "data/sequences.fasta", + reference = config["files"]["reference_N450_fasta"] + output: + sequences = "results/N450/sequences.fasta" + params: + min_length = config['filter_N450']['min_length'] + shell: + """ + nextclade run \ + -j 1 \ + --input-ref {input.reference} \ + --output-fasta {output.sequences} \ + --min-seed-cover 0.01 \ + --min-length {params.min_length} \ + --silent \ + {input.sequences} + """ +rule filter_N450: + """ + Filtering to + - {params.sequences_per_group} sequence(s) per {params.group_by!s} + - excluding strains in {input.exclude} + - minimum genome length of {params.min_length} + - excluding strains with missing region, country or date metadata + """ + input: + sequences = "results/N450/sequences.fasta", + metadata = "data/metadata.tsv", + exclude = config["files"]["exclude"] + output: + sequences = "results/N450/aligned.fasta" + params: + group_by = config['filter_N450']['group_by'], + subsample_max_sequences = config["filter_N450"]["subsample_max_sequences"], + min_date = config["filter_N450"]["min_date"], + min_length = config['filter_N450']['min_length'], + strain_id = config["strain_id_field"] + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --exclude {input.exclude} \ + --output {output.sequences} \ + --group-by {params.group_by} \ + --subsample-max-sequences {params.subsample_max_sequences} \ + --min-date {params.min_date} \ + --min-length {params.min_length} + """ \ No newline at end of file