From 8bab02b88ad3e15d77f3f09fd0678fca463a9a12 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 2 May 2024 16:18:05 -0700 Subject: [PATCH 1/9] Copy newreference script from RSV https://github.com/nextstrain/rsv/blob/a1788ce2c9c4375fb5a06d1426c64c45cf90225f/scripts/newreference.py fixup: fix comments to match behavior Co-authored-by: John SJ Anderson --- phylogenetic/bin/newreference.py | 53 ++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100755 phylogenetic/bin/newreference.py diff --git a/phylogenetic/bin/newreference.py b/phylogenetic/bin/newreference.py new file mode 100755 index 00000000..a89c8196 --- /dev/null +++ b/phylogenetic/bin/newreference.py @@ -0,0 +1,53 @@ +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation, Seq +import shutil +import argparse +import sys + +def new_reference(referencefile, outgenbank, outfasta, gene): + ref = SeqIO.read(referencefile, "genbank") + startofgene = None + endofgene = None + for feature in ref.features: + if feature.type == 'source': + ref_source_feature = feature + if feature.type =='gene' or feature.type == 'CDS': + a = list(feature.qualifiers.items())[0][-1][0] + if a == gene: + startofgene = int(list(feature.location)[0]) + endofgene = int(list(feature.location)[-1])+1 + + # If user provides a --gene 'some name' that is not found, error out as this may indicate that + # the gene name is misspelled or the user may be using the wrong GenBank file. + if(gene is not None and startofgene is None and endofgene is None): + print(f"ERROR: No '{gene}' was found under 'gene' or 'CDS' features in the GenBank file.", file=sys.stderr) + sys.exit(1) + + record = ref[startofgene:endofgene] + source_feature = SeqFeature(FeatureLocation(start=0, end=len(record)), type='source', + qualifiers=ref_source_feature.qualifiers) + record.features.append(source_feature) + + SeqIO.write(record, outgenbank, 'genbank') + SeqIO.write(record, outfasta, "fasta") + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="make new reference depending on whether the entire genome or only part is to be used for the tree", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--reference", required=True, help="GenBank file with reference sequences") + parser.add_argument("--output-fasta", required=True, help="GenBank new reference file") + parser.add_argument("--output-genbank", required=True, help="GenBank new reference file") + parser.add_argument("--gene", help="gene name or genome for entire genome") + args = parser.parse_args() + + if args.gene=='genome': + shutil.copy(args.reference, args.output_genbank) + SeqIO.write(SeqIO.read(args.reference, 'genbank'), args.output_fasta, 'fasta') + else: + new_reference(args.reference, args.output_genbank, args.output_fasta, args.gene) + From c725b31bf0adf5457eafaf0e67efaef0edac4333 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 2 May 2024 16:22:26 -0700 Subject: [PATCH 2/9] fixup: file permissions --- phylogenetic/bin/assign-colors.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 phylogenetic/bin/assign-colors.py diff --git a/phylogenetic/bin/assign-colors.py b/phylogenetic/bin/assign-colors.py old mode 100644 new mode 100755 From 31035919960ce7a48871fc2a19f9b7d3b8035fa2 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 2 May 2024 16:37:26 -0700 Subject: [PATCH 3/9] Use wildcard constraints for serotype-gene combinations Adds some wildcard constraints on serotype-gene combinations to avoid unchecked wildcard matching, such as having {serotype}.fasta match both "denv1_E.fasta" and "denv1.fasta". --- phylogenetic/Snakefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 82b5899c..dd8a56ea 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,6 +1,11 @@ configfile: "config/config_dengue.yaml" serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4'] +genes = ['genome', 'E'] + +wildcard_constraints: + serotype = "|".join(serotypes), + gene = "|".join(genes) rule all: input: From 9120d08510a9e6a5716909a72f228ecdd0cbb1e3 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 3 May 2024 16:24:50 -0700 Subject: [PATCH 4/9] Add genome postfix to reference files This is in preperation of having separate genome and gene (e.g. E, NS1) reference files. --- .../{reference_dengue_all.gb => reference_dengue_all_genome.gb} | 0 ...ference_dengue_denv1.gb => reference_dengue_denv1_genome.gb} | 0 ...ference_dengue_denv2.gb => reference_dengue_denv2_genome.gb} | 0 ...ference_dengue_denv3.gb => reference_dengue_denv3_genome.gb} | 0 ...ference_dengue_denv4.gb => reference_dengue_denv4_genome.gb} | 0 phylogenetic/rules/annotate_phylogeny.smk | 2 +- phylogenetic/rules/prepare_sequences.smk | 2 +- 7 files changed, 2 insertions(+), 2 deletions(-) rename phylogenetic/config/{reference_dengue_all.gb => reference_dengue_all_genome.gb} (100%) rename phylogenetic/config/{reference_dengue_denv1.gb => reference_dengue_denv1_genome.gb} (100%) rename phylogenetic/config/{reference_dengue_denv2.gb => reference_dengue_denv2_genome.gb} (100%) rename phylogenetic/config/{reference_dengue_denv3.gb => reference_dengue_denv3_genome.gb} (100%) rename phylogenetic/config/{reference_dengue_denv4.gb => reference_dengue_denv4_genome.gb} (100%) diff --git a/phylogenetic/config/reference_dengue_all.gb b/phylogenetic/config/reference_dengue_all_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_all.gb rename to phylogenetic/config/reference_dengue_all_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv1.gb b/phylogenetic/config/reference_dengue_denv1_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv1.gb rename to phylogenetic/config/reference_dengue_denv1_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv2.gb b/phylogenetic/config/reference_dengue_denv2_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv2.gb rename to phylogenetic/config/reference_dengue_denv2_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv3.gb b/phylogenetic/config/reference_dengue_denv3_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv3.gb rename to phylogenetic/config/reference_dengue_denv3_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv4.gb b/phylogenetic/config/reference_dengue_denv4_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv4.gb rename to phylogenetic/config/reference_dengue_denv4_genome.gb diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index b5280ede..9b801d46 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -44,7 +44,7 @@ rule translate: input: tree = "results/tree_{serotype}.nwk", node_data = "results/nt-muts_{serotype}.json", - reference = "config/reference_dengue_{serotype}.gb" + reference = "config/reference_dengue_{serotype}_genome.gb" output: node_data = "results/aa-muts_{serotype}.json" shell: diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 5cb14c75..6ef35f38 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -83,7 +83,7 @@ rule align: """ input: sequences = "results/filtered_{serotype}.fasta", - reference = "config/reference_dengue_{serotype}.gb" + reference = "config/reference_dengue_{serotype}_genome.gb" output: alignment = "results/aligned_{serotype}.fasta" shell: From c05d129a1629ae5b762a4c1d78b07feb3055491e Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Mon, 6 May 2024 11:39:13 -0700 Subject: [PATCH 5/9] Add results/genome subdirectory This is in preperation of nesting each gene's specific files in subdirectories (e.g. `results/E/tree.nwk`) as suggested in comment: * https://github.com/nextstrain/private/issues/102#issuecomment-1981727993 --- phylogenetic/rules/annotate_phylogeny.smk | 24 +++++++++++----------- phylogenetic/rules/construct_phylogeny.smk | 12 +++++------ phylogenetic/rules/export.smk | 20 +++++++++--------- phylogenetic/rules/prepare_sequences.smk | 6 +++--- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index 9b801d46..6e46bd4c 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -24,10 +24,10 @@ to the ones produced by Augur commands. rule ancestral: """Reconstructing ancestral sequences and mutations""" input: - tree = "results/tree_{serotype}.nwk", - alignment = "results/aligned_{serotype}.fasta" + tree = "results/genome/tree_{serotype}.nwk", + alignment = "results/genome/aligned_{serotype}.fasta" output: - node_data = "results/nt-muts_{serotype}.json" + node_data = "results/genome/nt-muts_{serotype}.json" params: inference = "joint" shell: @@ -42,11 +42,11 @@ rule ancestral: rule translate: """Translating amino acid sequences""" input: - tree = "results/tree_{serotype}.nwk", - node_data = "results/nt-muts_{serotype}.json", + tree = "results/genome/tree_{serotype}.nwk", + node_data = "results/genome/nt-muts_{serotype}.json", reference = "config/reference_dengue_{serotype}_genome.gb" output: - node_data = "results/aa-muts_{serotype}.json" + node_data = "results/genome/aa-muts_{serotype}.json" shell: """ augur translate \ @@ -62,10 +62,10 @@ rule traits: - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias """ input: - tree = "results/tree_{serotype}.nwk", + tree = "results/genome/tree_{serotype}.nwk", metadata = "data/metadata_{serotype}.tsv" output: - node_data = "results/traits_{serotype}.json", + node_data = "results/genome/traits_{serotype}.json", params: columns = lambda wildcards: config['traits']['traits_columns'][wildcards.serotype], sampling_bias_correction = config['traits']['sampling_bias_correction'], @@ -85,12 +85,12 @@ rule traits: rule clades: """Annotating serotypes / genotypes""" input: - tree = "results/tree_{serotype}.nwk", - nt_muts = "results/nt-muts_{serotype}.json", - aa_muts = "results/aa-muts_{serotype}.json", + tree = "results/genome/tree_{serotype}.nwk", + nt_muts = "results/genome/nt-muts_{serotype}.json", + aa_muts = "results/genome/aa-muts_{serotype}.json", clade_defs = lambda wildcards: config['clades']['clade_definitions'][wildcards.serotype], output: - clades = "results/clades_{serotype}.json" + clades = "results/genome/clades_{serotype}.json" shell: """ augur clades \ diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index b2f4a558..e0f6a28c 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -15,9 +15,9 @@ See Augur's usage docs for these commands for more details. rule tree: """Building tree""" input: - alignment = "results/aligned_{serotype}.fasta" + alignment = "results/genome/aligned_{serotype}.fasta" output: - tree = "results/tree-raw_{serotype}.nwk" + tree = "results/genome/tree-raw_{serotype}.nwk" shell: """ augur tree \ @@ -35,12 +35,12 @@ rule refine: - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation """ input: - tree = "results/tree-raw_{serotype}.nwk", - alignment = "results/aligned_{serotype}.fasta", + tree = "results/genome/tree-raw_{serotype}.nwk", + alignment = "results/genome/aligned_{serotype}.fasta", metadata = "data/metadata_{serotype}.tsv" output: - tree = "results/tree_{serotype}.nwk", - node_data = "results/branch-lengths_{serotype}.json", + tree = "results/genome/tree_{serotype}.nwk", + node_data = "results/genome/branch-lengths_{serotype}.json", params: coalescent = "const", date_inference = "marginal", diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index 859f49af..445a457a 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -40,7 +40,7 @@ rule colors: rule prepare_auspice_config: """Prepare the auspice config file for each serotypes""" output: - auspice_config="results/config/auspice_config_{serotype}.json", + auspice_config="results/config/genome/auspice_config_{serotype}.json", params: replace_clade_key="clade_membership", replace_clade_title=lambda wildcard: r"Serotype" if wildcard.serotype in ['all'] else r"DENV genotype", @@ -109,17 +109,17 @@ rule prepare_auspice_config: rule export: """Exporting data files for auspice""" input: - tree = "results/tree_{serotype}.nwk", + tree = "results/genome/tree_{serotype}.nwk", metadata = "data/metadata_{serotype}.tsv", - branch_lengths = "results/branch-lengths_{serotype}.json", - traits = "results/traits_{serotype}.json", - clades = "results/clades_{serotype}.json", - nt_muts = "results/nt-muts_{serotype}.json", - aa_muts = "results/aa-muts_{serotype}.json", - auspice_config = "results/config/auspice_config_{serotype}.json", + branch_lengths = "results/genome/branch-lengths_{serotype}.json", + traits = "results/genome/traits_{serotype}.json", + clades = "results/genome/clades_{serotype}.json", + nt_muts = "results/genome/nt-muts_{serotype}.json", + aa_muts = "results/genome/aa-muts_{serotype}.json", + auspice_config = "results/config/genome/auspice_config_{serotype}.json", colors = "results/colors_{serotype}.tsv", output: - auspice_json = "results/raw_dengue_{serotype}.json" + auspice_json = "results/genome/raw_dengue_{serotype}.json" params: strain_id = config.get("strain_id_field", "strain"), shell: @@ -137,7 +137,7 @@ rule export: rule final_strain_name: input: - auspice_json="results/raw_dengue_{serotype}.json", + auspice_json="results/genome/raw_dengue_{serotype}.json", metadata="data/metadata_{serotype}.tsv", output: auspice_json="auspice/dengue_{serotype}_genome.json" diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 6ef35f38..789e6dd5 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -56,7 +56,7 @@ rule filter: metadata = "data/metadata_{serotype}.tsv", exclude = config["filter"]["exclude"], output: - sequences = "results/filtered_{serotype}.fasta" + sequences = "results/genome/filtered_{serotype}.fasta" params: group_by = config['filter']['group_by'], sequences_per_group = lambda wildcards: config['filter']['sequences_per_group'][wildcards.serotype], @@ -82,10 +82,10 @@ rule align: - filling gaps with N """ input: - sequences = "results/filtered_{serotype}.fasta", + sequences = "results/genome/filtered_{serotype}.fasta", reference = "config/reference_dengue_{serotype}_genome.gb" output: - alignment = "results/aligned_{serotype}.fasta" + alignment = "results/genome/aligned_{serotype}.fasta" shell: """ augur align \ From 08855dc99b118d3af65bfca3682c6af24365643f Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Mon, 6 May 2024 11:50:09 -0700 Subject: [PATCH 6/9] Parameterize "genome" in the phylogenetic pipeline In prep of building "genome" and "E" intermediate and final files for the phylogenetic pipeline. --- phylogenetic/Snakefile | 2 +- phylogenetic/rules/annotate_phylogeny.smk | 24 +++++++++++----------- phylogenetic/rules/construct_phylogeny.smk | 12 +++++------ phylogenetic/rules/export.smk | 22 ++++++++++---------- phylogenetic/rules/prepare_sequences.smk | 6 +++--- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index dd8a56ea..bfa0aa75 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -9,7 +9,7 @@ wildcard_constraints: rule all: input: - auspice_json = expand("auspice/dengue_{serotype}_genome.json", serotype=serotypes) + auspice_json = expand("auspice/dengue_{serotype}_{gene}.json", serotype=serotypes, gene='genome'), include: "rules/prepare_sequences.smk" include: "rules/construct_phylogeny.smk" diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index 6e46bd4c..d5c02706 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -24,10 +24,10 @@ to the ones produced by Augur commands. rule ancestral: """Reconstructing ancestral sequences and mutations""" input: - tree = "results/genome/tree_{serotype}.nwk", - alignment = "results/genome/aligned_{serotype}.fasta" + tree = "results/{gene}/tree_{serotype}.nwk", + alignment = "results/{gene}/aligned_{serotype}.fasta" output: - node_data = "results/genome/nt-muts_{serotype}.json" + node_data = "results/{gene}/nt-muts_{serotype}.json" params: inference = "joint" shell: @@ -42,11 +42,11 @@ rule ancestral: rule translate: """Translating amino acid sequences""" input: - tree = "results/genome/tree_{serotype}.nwk", - node_data = "results/genome/nt-muts_{serotype}.json", + tree = "results/{gene}/tree_{serotype}.nwk", + node_data = "results/{gene}/nt-muts_{serotype}.json", reference = "config/reference_dengue_{serotype}_genome.gb" output: - node_data = "results/genome/aa-muts_{serotype}.json" + node_data = "results/{gene}/aa-muts_{serotype}.json" shell: """ augur translate \ @@ -62,10 +62,10 @@ rule traits: - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias """ input: - tree = "results/genome/tree_{serotype}.nwk", + tree = "results/{gene}/tree_{serotype}.nwk", metadata = "data/metadata_{serotype}.tsv" output: - node_data = "results/genome/traits_{serotype}.json", + node_data = "results/{gene}/traits_{serotype}.json", params: columns = lambda wildcards: config['traits']['traits_columns'][wildcards.serotype], sampling_bias_correction = config['traits']['sampling_bias_correction'], @@ -85,12 +85,12 @@ rule traits: rule clades: """Annotating serotypes / genotypes""" input: - tree = "results/genome/tree_{serotype}.nwk", - nt_muts = "results/genome/nt-muts_{serotype}.json", - aa_muts = "results/genome/aa-muts_{serotype}.json", + tree = "results/{gene}/tree_{serotype}.nwk", + nt_muts = "results/{gene}/nt-muts_{serotype}.json", + aa_muts = "results/{gene}/aa-muts_{serotype}.json", clade_defs = lambda wildcards: config['clades']['clade_definitions'][wildcards.serotype], output: - clades = "results/genome/clades_{serotype}.json" + clades = "results/{gene}/clades_{serotype}.json" shell: """ augur clades \ diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index e0f6a28c..636cfd46 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -15,9 +15,9 @@ See Augur's usage docs for these commands for more details. rule tree: """Building tree""" input: - alignment = "results/genome/aligned_{serotype}.fasta" + alignment = "results/{gene}/aligned_{serotype}.fasta" output: - tree = "results/genome/tree-raw_{serotype}.nwk" + tree = "results/{gene}/tree-raw_{serotype}.nwk" shell: """ augur tree \ @@ -35,12 +35,12 @@ rule refine: - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation """ input: - tree = "results/genome/tree-raw_{serotype}.nwk", - alignment = "results/genome/aligned_{serotype}.fasta", + tree = "results/{gene}/tree-raw_{serotype}.nwk", + alignment = "results/{gene}/aligned_{serotype}.fasta", metadata = "data/metadata_{serotype}.tsv" output: - tree = "results/genome/tree_{serotype}.nwk", - node_data = "results/genome/branch-lengths_{serotype}.json", + tree = "results/{gene}/tree_{serotype}.nwk", + node_data = "results/{gene}/branch-lengths_{serotype}.json", params: coalescent = "const", date_inference = "marginal", diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index 445a457a..f7d2ced2 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -40,7 +40,7 @@ rule colors: rule prepare_auspice_config: """Prepare the auspice config file for each serotypes""" output: - auspice_config="results/config/genome/auspice_config_{serotype}.json", + auspice_config="results/config/{gene}/auspice_config_{serotype}.json", params: replace_clade_key="clade_membership", replace_clade_title=lambda wildcard: r"Serotype" if wildcard.serotype in ['all'] else r"DENV genotype", @@ -109,17 +109,17 @@ rule prepare_auspice_config: rule export: """Exporting data files for auspice""" input: - tree = "results/genome/tree_{serotype}.nwk", + tree = "results/{gene}/tree_{serotype}.nwk", metadata = "data/metadata_{serotype}.tsv", - branch_lengths = "results/genome/branch-lengths_{serotype}.json", - traits = "results/genome/traits_{serotype}.json", - clades = "results/genome/clades_{serotype}.json", - nt_muts = "results/genome/nt-muts_{serotype}.json", - aa_muts = "results/genome/aa-muts_{serotype}.json", - auspice_config = "results/config/genome/auspice_config_{serotype}.json", + branch_lengths = "results/{gene}/branch-lengths_{serotype}.json", + traits = "results/{gene}/traits_{serotype}.json", + clades = "results/{gene}/clades_{serotype}.json", + nt_muts = "results/{gene}/nt-muts_{serotype}.json", + aa_muts = "results/{gene}/aa-muts_{serotype}.json", + auspice_config = "results/config/{gene}/auspice_config_{serotype}.json", colors = "results/colors_{serotype}.tsv", output: - auspice_json = "results/genome/raw_dengue_{serotype}.json" + auspice_json = "results/{gene}/raw_dengue_{serotype}.json" params: strain_id = config.get("strain_id_field", "strain"), shell: @@ -137,10 +137,10 @@ rule export: rule final_strain_name: input: - auspice_json="results/genome/raw_dengue_{serotype}.json", + auspice_json="results/{gene}/raw_dengue_{serotype}.json", metadata="data/metadata_{serotype}.tsv", output: - auspice_json="auspice/dengue_{serotype}_genome.json" + auspice_json="auspice/dengue_{serotype}_{gene}.json" params: strain_id=config.get("strain_id_field", "strain"), display_strain_field=config.get("display_strain_field", "strain"), diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 789e6dd5..2229bee1 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -56,7 +56,7 @@ rule filter: metadata = "data/metadata_{serotype}.tsv", exclude = config["filter"]["exclude"], output: - sequences = "results/genome/filtered_{serotype}.fasta" + sequences = "results/{gene}/filtered_{serotype}.fasta" params: group_by = config['filter']['group_by'], sequences_per_group = lambda wildcards: config['filter']['sequences_per_group'][wildcards.serotype], @@ -82,10 +82,10 @@ rule align: - filling gaps with N """ input: - sequences = "results/genome/filtered_{serotype}.fasta", + sequences = "results/{gene}/filtered_{serotype}.fasta", reference = "config/reference_dengue_{serotype}_genome.gb" output: - alignment = "results/genome/aligned_{serotype}.fasta" + alignment = "results/{gene}/aligned_{serotype}.fasta" shell: """ augur align \ From 1f8771143125441b5e3f7aa13b3c639abcb92f00 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Mon, 6 May 2024 14:47:48 -0700 Subject: [PATCH 7/9] drop dengue in reference filenames --- .../{reference_dengue_all_genome.gb => reference_all_genome.gb} | 0 ...ference_dengue_denv1_genome.gb => reference_denv1_genome.gb} | 0 ...ference_dengue_denv2_genome.gb => reference_denv2_genome.gb} | 0 ...ference_dengue_denv3_genome.gb => reference_denv3_genome.gb} | 0 ...ference_dengue_denv4_genome.gb => reference_denv4_genome.gb} | 0 phylogenetic/rules/annotate_phylogeny.smk | 2 +- phylogenetic/rules/prepare_sequences.smk | 2 +- 7 files changed, 2 insertions(+), 2 deletions(-) rename phylogenetic/config/{reference_dengue_all_genome.gb => reference_all_genome.gb} (100%) rename phylogenetic/config/{reference_dengue_denv1_genome.gb => reference_denv1_genome.gb} (100%) rename phylogenetic/config/{reference_dengue_denv2_genome.gb => reference_denv2_genome.gb} (100%) rename phylogenetic/config/{reference_dengue_denv3_genome.gb => reference_denv3_genome.gb} (100%) rename phylogenetic/config/{reference_dengue_denv4_genome.gb => reference_denv4_genome.gb} (100%) diff --git a/phylogenetic/config/reference_dengue_all_genome.gb b/phylogenetic/config/reference_all_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_all_genome.gb rename to phylogenetic/config/reference_all_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv1_genome.gb b/phylogenetic/config/reference_denv1_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv1_genome.gb rename to phylogenetic/config/reference_denv1_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv2_genome.gb b/phylogenetic/config/reference_denv2_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv2_genome.gb rename to phylogenetic/config/reference_denv2_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv3_genome.gb b/phylogenetic/config/reference_denv3_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv3_genome.gb rename to phylogenetic/config/reference_denv3_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv4_genome.gb b/phylogenetic/config/reference_denv4_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv4_genome.gb rename to phylogenetic/config/reference_denv4_genome.gb diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index d5c02706..0b2131bb 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -44,7 +44,7 @@ rule translate: input: tree = "results/{gene}/tree_{serotype}.nwk", node_data = "results/{gene}/nt-muts_{serotype}.json", - reference = "config/reference_dengue_{serotype}_genome.gb" + reference = "config/reference_{serotype}_genome.gb" output: node_data = "results/{gene}/aa-muts_{serotype}.json" shell: diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 2229bee1..6a0c2675 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -83,7 +83,7 @@ rule align: """ input: sequences = "results/{gene}/filtered_{serotype}.fasta", - reference = "config/reference_dengue_{serotype}_genome.gb" + reference = "config/reference_{serotype}_genome.gb" output: alignment = "results/{gene}/aligned_{serotype}.fasta" shell: From 08c7066193605bb196f7740aacddfd0044f82b91 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Mon, 6 May 2024 15:02:26 -0700 Subject: [PATCH 8/9] fixup: dengue2 reference genbank Move gene annotation to top of CDS to match other genbank files (denv1,3,4) --- phylogenetic/config/reference_denv2_genome.gb | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/phylogenetic/config/reference_denv2_genome.gb b/phylogenetic/config/reference_denv2_genome.gb index 710c6ec5..d5916490 100644 --- a/phylogenetic/config/reference_denv2_genome.gb +++ b/phylogenetic/config/reference_denv2_genome.gb @@ -49,13 +49,13 @@ FEATURES Location/Qualifiers /db_xref="GeneID:1494449" /gene="flavivirus polyprotein gene" CDS 97..438 - /db_xref="VBRC:35917" /gene="C" + /db_xref="VBRC:35917" /product="anchored capsid protein C" /protein_id="NP_739581.2" CDS 439..936 - /db_xref="VBRC:35919" /gene="M" + /db_xref="VBRC:35919" /product="membrane glycoprotein precursor M" /protein_id="NP_739582.2" CDS 439..711 @@ -64,50 +64,50 @@ FEATURES Location/Qualifiers /product="protein pr" /protein_id="YP_009164954.1" CDS 937..2421 - /db_xref="VBRC:35921" /gene="E" + /db_xref="VBRC:35921" /product="envelope protein E" /protein_id="NP_739583.2" CDS 2422..3477 - /db_xref="VBRC:35922" /gene="NS1" + /db_xref="VBRC:35922" /product="nonstructural protein NS1" /protein_id="NP_739584.2" CDS 3478..4131 - /db_xref="VBRC:35923" /gene="NS2A" + /db_xref="VBRC:35923" /product="nonstructural protein NS2A" /protein_id="NP_739585.2" CDS 4132..4521 - /db_xref="VBRC:35924" /gene="NS2B" + /db_xref="VBRC:35924" /product="nonstructural protein NS2B" /protein_id="NP_739586.2" CDS 4522..6375 - /db_xref="VBRC:35925" /gene="NS3" + /db_xref="VBRC:35925" /note="ATPase; component of capping enzyme (RNA thriphosphatase); protease; RNA-helicase" /product="nonstructural protein NS3" /protein_id="NP_739587.2" CDS 6376..6756 - /db_xref="VBRC:35926" /gene="NS4A" + /db_xref="VBRC:35926" /product="nonstructural protein NS4A" /protein_id="NP_739588.2" CDS 6757..6825 - /db_xref="VBRC:35927" /gene="2K" + /db_xref="VBRC:35927" /product="protein 2K" /protein_id="NP_739593.2" CDS 6826..7569 - /db_xref="VBRC:35928" /gene="NS4B" + /db_xref="VBRC:35928" /product="nonstructural protein NS4B" /protein_id="NP_739589.2" CDS 7570..10269 - /db_xref="VBRC:35929" /gene="NS5" + /db_xref="VBRC:35929" /note="methyltransferase component of capping enzyme; nonstructural protein NS5" /product="RNA-dependent RNA polymerase NS5" From f5b7bf61340814ed483f5718b6b1d69ac9faa907 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Mon, 6 May 2024 15:14:59 -0700 Subject: [PATCH 9/9] Add rule for generatating gene reference files This generates the reference_serotype_gene.gb and reference_serotype_gene.fasta files for each serotype. These files can then be subsequently used in augur align, augur translate, and optionally for nextclade align during the gene trees. --- phylogenetic/Snakefile | 1 + phylogenetic/rules/prepare_sequences_E.smk | 60 ++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 phylogenetic/rules/prepare_sequences_E.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index bfa0aa75..c763f009 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -12,6 +12,7 @@ rule all: auspice_json = expand("auspice/dengue_{serotype}_{gene}.json", serotype=serotypes, gene='genome'), include: "rules/prepare_sequences.smk" +include: "rules/prepare_sequences_E.smk" include: "rules/construct_phylogeny.smk" include: "rules/annotate_phylogeny.smk" include: "rules/export.smk" diff --git a/phylogenetic/rules/prepare_sequences_E.smk b/phylogenetic/rules/prepare_sequences_E.smk new file mode 100644 index 00000000..bf4f0b65 --- /dev/null +++ b/phylogenetic/rules/prepare_sequences_E.smk @@ -0,0 +1,60 @@ +""" +This part of the workflow prepares reference files and sequences for constructing the gene phylogenetic trees. +REQUIRED INPUTS: + reference = path to reference sequence or genbank + sequences = path to all sequences from which gene sequences can be extracted + +OUTPUTS: + gene_fasta = reference fasta for the gene (e.g. E gene) + gene_genbank = reference genbank for the gene (e.g. E gene) + sequences = sequences with gene sequences extracted and aligned to the reference gene sequence +This part of the workflow usually includes the following steps: + - newreference.py: Creates new gene genbank and gene reference FASTA from the whole genome reference genbank + - nextclade: Aligns sequences to the reference gene sequence and extracts the gene sequences to ensure the reference files are valid +See Nextclade or script usage docs for these commands for more details. +""" + +ruleorder: align_and_extract_E > decompress + +rule generate_E_reference_files: + """ + Generating reference files for the E gene + """ + input: + reference = "config/reference_{serotype}_genome.gb", + output: + fasta = "results/config/reference_{serotype}_E.fasta", + genbank = "results/config/reference_{serotype}_E.gb", + params: + gene = "E", + shell: + """ + python3 bin/newreference.py \ + --reference {input.reference} \ + --output-fasta {output.fasta} \ + --output-genbank {output.genbank} \ + --gene {params.gene} + """ + +rule align_and_extract_E: + """ + Cutting sequences to the length of the E gene reference sequence + """ + input: + sequences = "data/sequences_{serotype}.fasta", + reference = "results/config/reference_{serotype}_E.fasta" + output: + sequences = "results/E/sequences_{serotype}.fasta" + params: + min_length = 1000, + shell: + """ + nextclade run \ + -j 1 \ + --input-ref {input.reference} \ + --output-fasta {output.sequences} \ + --min-seed-cover 0.01 \ + --min-length {params.min_length} \ + --silent \ + {input.sequences} + """