diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 0e16b8bd9..472831243 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -71,7 +71,7 @@ files: mutational_fitness_distance_map: "defaults/mutational_fitness_distance_map.json" sites_to_mask: "defaults/sites_ignored_for_tree_topology.txt" -# Define genes to translate during alignment by nextalign. +# Define genes to translate during alignment by nextclade. genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] # Filter settings diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index ddb6334d8..8d5f61194 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ -5,6 +5,10 @@ We also use this change log to document new features that maintain backward comp ## New features since last version update +## v14 (23 October 2024) + +- 23 October 2024: Update workflow to use Nextclade v3. This includes the removal of unused mutation summary script and rules that expected Nextclade v2 outputs. Dropping the mutation summary rules removed the need for the full alignment rule `align` to produce the insertions and translations outputs, so they have been removed. The `build_align` rule no longer produces a separate `insertions.tsv` since insertions are now included in the `nextclade_qc.tsv`. [PR 1160](https://github.com/nextstrain/ncov/pull/1160) + - 2 October 2024: Include a new parameter for `clade_recency` under `colors`. This parameter is used to define which clades should receive a color from the standard rainbow palette. A value of `6M` will cause clades with strains in the tree sampled within the last 6 months to be colored and earlier strains to not receive a color (and be colored in a palette of grays by Auspice). This `clade_recency` parameter is used in `builds.yaml` in `nextstrain_profiles` to color clades according for the `1m`, `2m`, `6m` and `all-time` timepoints. If `clade_recency` is not supplied then all clades will be colored. [PR 1132](https://github.com/nextstrain/ncov/pull/1132) - 30 September 2024: Use population-based weighted sampling for `nextstrain_profiles`. This requires a minimum Augur version of 25.3.0. PRs [1106](https://github.com/nextstrain/ncov/pull/1106), [1150](https://github.com/nextstrain/ncov/pull/1150), [1151](https://github.com/nextstrain/ncov/pull/1151) diff --git a/docs/src/reference/workflow-config-file.rst b/docs/src/reference/workflow-config-file.rst index e317ab36b..60a54d823 100644 --- a/docs/src/reference/workflow-config-file.rst +++ b/docs/src/reference/workflow-config-file.rst @@ -431,7 +431,7 @@ genes ----- - type: array -- description: A list of genes for which ``nextalign`` should generate amino acid sequences during the alignment process. Gene names must match the names provided in the gene map from the ``annotation`` parameter. +- description: A list of genes for which ``nextclade`` should generate amino acid sequences during the alignment process. Gene names must match the names provided in the gene map from the ``annotation`` parameter. - default: ``["ORF1a", "ORF1b", "S", "ORF3a", "M", "N"]`` - used in rules: ``align``, ``build_align``, ``translate``, ``mutational_fitness`` @@ -513,17 +513,17 @@ alignment_reference ~~~~~~~~~~~~~~~~~~~ - type: string -- description: Path to a FASTA-formatted sequence to use for alignment with ``nextalign`` +- description: Path to a FASTA-formatted sequence to use for alignment with ``nextclade`` - default: ``defaults/reference_seq.fasta`` -- used in rules: ``align``, ``proximity_score`` (subsampling), ``build_align``, ``build_mutation_summary`` +- used in rules: ``align``, ``proximity_score`` (subsampling) annotation ~~~~~~~~~~ - type: string -- description: Path to a GFF-formatted annotation of gene coordinates (e.g., a “gene map”) for use by ``nextalign`` and mutation summaries. +- description: Path to a GFF-formatted annotation of gene coordinates (e.g., a “gene map”) for use by ``nextclade`` for codon-aware alignment. - default: ``defaults/annotation.gff`` -- used in rules: ``align``, ``build_align``, ``build_mutation_summary`` +- used in rules: ``align`` outgroup ~~~~~~~~ diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml index 1431850a7..d9d9c3968 100644 --- a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml @@ -18,7 +18,6 @@ slack_token: ~ slack_channel: "#ncov-gisaid-updates" genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] -use_nextalign: true include_hcov19_prefix: True files: diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index d80b7f521..03dce7edf 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -17,7 +17,6 @@ slack_token: ~ slack_channel: "#ncov-gisaid-updates" genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] -use_nextalign: true include_hcov19_prefix: True files: diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index 7c19b72eb..9d4e60aec 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -17,7 +17,6 @@ slack_token: ~ slack_channel: "#ncov-genbank-updates" genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] -use_nextalign: true include_hcov19_prefix: False files: diff --git a/scripts/mutation_summary.py b/scripts/mutation_summary.py deleted file mode 100644 index a1db4def4..000000000 --- a/scripts/mutation_summary.py +++ /dev/null @@ -1,106 +0,0 @@ -import argparse, os, glob -from augur.io import open_file -from Bio import SeqIO, SeqFeature, Seq -from Bio.SeqIO.FastaIO import SimpleFastaParser -import numpy as np -import pandas as pd - -def read_reference(fname, genemap): - try: - ref = str(SeqIO.read(fname, 'fasta').seq) - except: - with open(fname, 'r') as fh: - ref = "".join([x.strip() for x in fh]) - - translations = {} - with open(genemap, 'r') as fh: - for line in fh: - if line[0]=='#': - continue - entries = [x.strip() for x in line.strip().split('\t')] - start = int(entries[3]) - end = int(entries[4]) - strand = entries[6] - attributes = {x.split('=')[0]:'='.join(x.split('=')[1:]) for x in entries[8].split(';')} - if 'gene_name' in attributes: - name = attributes['gene_name'].strip('"') - else: - name = None - translation = Seq.translate(SeqFeature.SeqFeature(SeqFeature.FeatureLocation(start-1, end, strand=-1 if strand=='-' else 1)).extract(ref)) - translations[name] = str(translation) - - return {"nuc":ref, "translations":translations} - -def summarise_differences(ref, query, isAA): - """ - Summarise the differences between a provided reference and a query - (both of which are numpy arrays with dtype int8) - Returns a string of comma-seperated mutations - """ - # in int8: A = 65 T = 84 C = 67 G = 71 N = 78 - = 45 X = 88 - ambiguous = 88 if isAA else 78 # 'X' or 'N' - # replace all leading and trailing gaps with the ambiguous character - idx_not_gaps = np.where(query!=45)[0] # 45 is '-' (gap) - if idx_not_gaps.size: - query[0:idx_not_gaps[0]] = ambiguous - query[idx_not_gaps[-1]+1:len(query)] = ambiguous - else: - # the query is nothing but gaps! We don't report any mutations here - return "" - # sometimes the query length is longer than the reference. In this case we preserve previous behavior - # by discarding extra characters in the query - if query.size>ref.size: - query = query[0:ref.size] - # find indicies where the query differs from the reference, and is not ambiguous - changes = np.logical_and(ref!=query, query!=ambiguous).nonzero()[0] - # save these as a comma-seperated list of , where the base (position) is 1-based - return ",".join([f"{chr(ref[idx])}{idx+1}{chr(query[idx])}" for idx in changes]) - -def to_numpy_array(input_string): - return np.frombuffer(input_string.upper().encode('utf-8'), dtype=np.int8).copy() - -def to_mutations(aln_file, ref, aa=False): - res = {} - ref_array = to_numpy_array(ref) - with open_file(aln_file, 'r') as fh: - for name, seq in SimpleFastaParser(fh): - res[name] = summarise_differences(ref_array, to_numpy_array(seq), aa) - return res - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="transform nextalign output to sparse format", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument('--directory', type=str, required=True, help="directory with nextalign output") - parser.add_argument('--alignment', type=str, required=False, help="nucleotide alignment (if not part of default pattern)") - parser.add_argument('--insertions', type=str, required=False, help="insertions (if not part of default pattern)") - parser.add_argument('--basename', type=str, required=True, help="output pattern") - parser.add_argument('--reference', type=str, required=True, help="reference sequence") - parser.add_argument('--genes', nargs="+", required=True, help="list of gene names to summarize mutations for") - parser.add_argument('--genemap', type=str, required=True, help="annotation") - parser.add_argument('--output', type=str, required=True, help="output tsv file") - args = parser.parse_args() - - res = read_reference(args.reference, args.genemap) - ref = res['nuc'] - translations = res['translations'] - - nucleotide_alignment = args.alignment or os.path.join(args.directory, args.basename+'.aligned.fasta*') - insertions = os.path.join(args.directory, args.basename+'.insertions.csv') - - genes = set(args.genes) - gene_files = glob.glob(os.path.join(args.directory, args.basename+'.gene.*.fasta*')) - - compressed = {} - res = to_mutations(nucleotide_alignment, ref) - compressed = {'nucleotide_mutations': pd.DataFrame(res.values(), index=res.keys(), columns=['nucleotide'])} - for fname in gene_files: - # Find the gene name in the current gene file, since the filename may have multiple suffixes. - gene = (set(fname.split('.')) & genes).pop() - res = to_mutations(fname, translations[gene], aa=True) - compressed[gene] = pd.DataFrame(res.values(), index=res.keys(), columns=[gene]) - - res = pd.concat(compressed.values(), axis=1) - res.to_csv(args.output, sep='\t') diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml index 080c872f1..bf59f7072 100644 --- a/workflow/envs/nextstrain.yaml +++ b/workflow/envs/nextstrain.yaml @@ -7,8 +7,7 @@ dependencies: - augur=22.4.0 - epiweeks=2.1.2 - iqtree=2.2.0.3 - - nextalign=2.14.0 - - nextclade=2.14.0 + - nextclade=3.9.0 - pangolin=3.1.20 - pangolearn=2022.01.20 - python>=3.8* diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk index 723d00804..b9a8f19bd 100644 --- a/workflow/snakemake_rules/export_for_nextstrain.smk +++ b/workflow/snakemake_rules/export_for_nextstrain.smk @@ -93,38 +93,6 @@ rule export_all_regions: """ -rule mutation_summary: - message: "Summarizing {input.alignment}" - input: - alignment = rules.align.output.alignment, - insertions = rules.align.output.insertions, - translations = rules.align.output.translations, - reference = config["files"]["alignment_reference"], - genemap = config["files"]["annotation"] - output: - mutation_summary = "results/mutation_summary_{origin}.tsv.xz" - log: - "logs/mutation_summary_{origin}.txt" - benchmark: - "benchmarks/mutation_summary_{origin}.txt" - params: - outdir = "results/translations", - basename = "seqs_{origin}", - genes=config["genes"], - conda: config["conda_environment"] - shell: - r""" - python3 scripts/mutation_summary.py \ - --alignment {input.alignment} \ - --insertions {input.insertions} \ - --directory {params.outdir} \ - --basename {params.basename} \ - --reference {input.reference} \ - --genes {params.genes:q} \ - --genemap {input.genemap} \ - --output {output.mutation_summary} 2>&1 | tee {log} - """ - # # Rule for generating a per-build auspice config # diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 4b3f92f12..4e9ac00a4 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -65,15 +65,13 @@ rule align: """ input: sequences = lambda wildcards: _get_path_for_input("sequences", wildcards.origin), - genemap = config["files"]["annotation"], + # Annotation used for codon-aware nucleotide alignment + # + annotation = config["files"]["annotation"], reference = config["files"]["alignment_reference"] output: alignment = "results/aligned_{origin}.fasta.xz", - insertions = "results/insertions_{origin}.tsv", - translations = expand("results/translations/seqs_{{origin}}.gene.{gene}.fasta.xz", gene=config.get('genes', ['S'])) params: - output_translations = lambda w: f"results/translations/seqs_{w.origin}.gene.{{gene}}.fasta", - output_translations_toxz = "results/translations/seqs_{origin}.gene.*.fasta", strain_prefixes=config["strip_strain_prefixes"], # Strip the compression suffix for the intermediate output from the aligner. uncompressed_alignment=lambda wildcards, output: Path(output.alignment).with_suffix(""), @@ -92,15 +90,12 @@ rule align: --sequences {input.sequences} \ --strip-prefixes {params.strain_prefixes:q} \ --output /dev/stdout 2> {params.sanitize_log} \ - | nextalign run \ + | nextclade run \ --jobs={threads} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --output-translations {params.output_translations} \ - --output-fasta {params.uncompressed_alignment} \ - --output-insertions {output.insertions} > {log} 2>&1; + --input-ref {input.reference} \ + --input-annotation {input.annotation} \ + --output-fasta {params.uncompressed_alignment} > {log} 2>&1; xz -2 -T {threads} {params.uncompressed_alignment}; - xz -2 -T {threads} {params.output_translations_toxz} """ def _get_subsampling_settings(wildcards): @@ -466,8 +461,8 @@ rule prepare_nextclade: conda: config["conda_environment"] shell: r""" - nextclade2 --version - nextclade2 dataset get --name {params.name} --output-zip {output.nextclade_dataset} + nextclade --version + nextclade dataset get --name {params.name} --output-zip {output.nextclade_dataset} """ rule build_align: @@ -481,14 +476,13 @@ rule build_align: nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip", output: alignment = "results/{build_name}/aligned.fasta", - insertions = "results/{build_name}/insertions.tsv", nextclade_qc = 'results/{build_name}/nextclade_qc.tsv', translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S'])) params: outdir = "results/{build_name}/translations", strain_prefixes=config["strip_strain_prefixes"], sanitize_log="logs/sanitize_sequences_before_nextclade_{build_name}.txt", - output_translations = lambda w: f"results/{w.build_name}/translations/aligned.gene.{{gene}}.fasta" + output_translations = lambda w: f"results/{w.build_name}/translations/aligned.gene.{{cds}}.fasta" log: "logs/align_{build_name}.txt" benchmark: @@ -503,13 +497,12 @@ rule build_align: --sequences {input.sequences} \ --strip-prefixes {params.strain_prefixes:q} \ --output /dev/stdout 2> {params.sanitize_log} \ - | nextclade2 run \ + | nextclade run \ --jobs {threads} \ --input-dataset {input.nextclade_dataset} \ --output-tsv {output.nextclade_qc} \ --output-fasta {output.alignment} \ - --output-translations {params.output_translations} \ - --output-insertions {output.insertions} 2>&1 | tee {log} + --output-translations {params.output_translations} 2>&1 | tee {log} """ rule join_metadata_and_nextclade_qc: @@ -936,34 +929,6 @@ rule translate: --output {output.node_data} 2>&1 | tee {log} """ -rule build_mutation_summary: - message: "Summarizing {input.alignment}" - input: - alignment = rules.build_align.output.alignment, - insertions = rules.build_align.output.insertions, - translations = rules.build_align.output.translations, - reference = config["files"]["alignment_reference"], - genemap = config["files"]["annotation"] - output: - mutation_summary = "results/{build_name}/mutation_summary.tsv" - log: - "logs/mutation_summary_{build_name}.txt" - params: - outdir = "results/{build_name}/translations", - basename = "aligned" - conda: config["conda_environment"] - shell: - r""" - python3 scripts/mutation_summary.py \ - --alignment {input.alignment} \ - --insertions {input.insertions} \ - --directory {params.outdir} \ - --basename {params.basename} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --output {output.mutation_summary} 2>&1 | tee {log} - """ - rule distances: input: tree = rules.refine.output.tree,