diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 82b5899c..c763f009 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,12 +1,18 @@ configfile: "config/config_dengue.yaml" serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4'] +genes = ['genome', 'E'] + +wildcard_constraints: + serotype = "|".join(serotypes), + gene = "|".join(genes) rule all: input: - auspice_json = expand("auspice/dengue_{serotype}_genome.json", serotype=serotypes) + auspice_json = expand("auspice/dengue_{serotype}_{gene}.json", serotype=serotypes, gene='genome'), include: "rules/prepare_sequences.smk" +include: "rules/prepare_sequences_E.smk" include: "rules/construct_phylogeny.smk" include: "rules/annotate_phylogeny.smk" include: "rules/export.smk" diff --git a/phylogenetic/bin/assign-colors.py b/phylogenetic/bin/assign-colors.py old mode 100644 new mode 100755 diff --git a/phylogenetic/bin/newreference.py b/phylogenetic/bin/newreference.py new file mode 100755 index 00000000..a89c8196 --- /dev/null +++ b/phylogenetic/bin/newreference.py @@ -0,0 +1,53 @@ +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation, Seq +import shutil +import argparse +import sys + +def new_reference(referencefile, outgenbank, outfasta, gene): + ref = SeqIO.read(referencefile, "genbank") + startofgene = None + endofgene = None + for feature in ref.features: + if feature.type == 'source': + ref_source_feature = feature + if feature.type =='gene' or feature.type == 'CDS': + a = list(feature.qualifiers.items())[0][-1][0] + if a == gene: + startofgene = int(list(feature.location)[0]) + endofgene = int(list(feature.location)[-1])+1 + + # If user provides a --gene 'some name' that is not found, error out as this may indicate that + # the gene name is misspelled or the user may be using the wrong GenBank file. + if(gene is not None and startofgene is None and endofgene is None): + print(f"ERROR: No '{gene}' was found under 'gene' or 'CDS' features in the GenBank file.", file=sys.stderr) + sys.exit(1) + + record = ref[startofgene:endofgene] + source_feature = SeqFeature(FeatureLocation(start=0, end=len(record)), type='source', + qualifiers=ref_source_feature.qualifiers) + record.features.append(source_feature) + + SeqIO.write(record, outgenbank, 'genbank') + SeqIO.write(record, outfasta, "fasta") + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="make new reference depending on whether the entire genome or only part is to be used for the tree", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--reference", required=True, help="GenBank file with reference sequences") + parser.add_argument("--output-fasta", required=True, help="GenBank new reference file") + parser.add_argument("--output-genbank", required=True, help="GenBank new reference file") + parser.add_argument("--gene", help="gene name or genome for entire genome") + args = parser.parse_args() + + if args.gene=='genome': + shutil.copy(args.reference, args.output_genbank) + SeqIO.write(SeqIO.read(args.reference, 'genbank'), args.output_fasta, 'fasta') + else: + new_reference(args.reference, args.output_genbank, args.output_fasta, args.gene) + diff --git a/phylogenetic/config/reference_dengue_all.gb b/phylogenetic/config/reference_all_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_all.gb rename to phylogenetic/config/reference_all_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv1.gb b/phylogenetic/config/reference_denv1_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv1.gb rename to phylogenetic/config/reference_denv1_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv2.gb b/phylogenetic/config/reference_denv2_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv2.gb rename to phylogenetic/config/reference_denv2_genome.gb index 710c6ec5..d5916490 100644 --- a/phylogenetic/config/reference_dengue_denv2.gb +++ b/phylogenetic/config/reference_denv2_genome.gb @@ -49,13 +49,13 @@ FEATURES Location/Qualifiers /db_xref="GeneID:1494449" /gene="flavivirus polyprotein gene" CDS 97..438 - /db_xref="VBRC:35917" /gene="C" + /db_xref="VBRC:35917" /product="anchored capsid protein C" /protein_id="NP_739581.2" CDS 439..936 - /db_xref="VBRC:35919" /gene="M" + /db_xref="VBRC:35919" /product="membrane glycoprotein precursor M" /protein_id="NP_739582.2" CDS 439..711 @@ -64,50 +64,50 @@ FEATURES Location/Qualifiers /product="protein pr" /protein_id="YP_009164954.1" CDS 937..2421 - /db_xref="VBRC:35921" /gene="E" + /db_xref="VBRC:35921" /product="envelope protein E" /protein_id="NP_739583.2" CDS 2422..3477 - /db_xref="VBRC:35922" /gene="NS1" + /db_xref="VBRC:35922" /product="nonstructural protein NS1" /protein_id="NP_739584.2" CDS 3478..4131 - /db_xref="VBRC:35923" /gene="NS2A" + /db_xref="VBRC:35923" /product="nonstructural protein NS2A" /protein_id="NP_739585.2" CDS 4132..4521 - /db_xref="VBRC:35924" /gene="NS2B" + /db_xref="VBRC:35924" /product="nonstructural protein NS2B" /protein_id="NP_739586.2" CDS 4522..6375 - /db_xref="VBRC:35925" /gene="NS3" + /db_xref="VBRC:35925" /note="ATPase; component of capping enzyme (RNA thriphosphatase); protease; RNA-helicase" /product="nonstructural protein NS3" /protein_id="NP_739587.2" CDS 6376..6756 - /db_xref="VBRC:35926" /gene="NS4A" + /db_xref="VBRC:35926" /product="nonstructural protein NS4A" /protein_id="NP_739588.2" CDS 6757..6825 - /db_xref="VBRC:35927" /gene="2K" + /db_xref="VBRC:35927" /product="protein 2K" /protein_id="NP_739593.2" CDS 6826..7569 - /db_xref="VBRC:35928" /gene="NS4B" + /db_xref="VBRC:35928" /product="nonstructural protein NS4B" /protein_id="NP_739589.2" CDS 7570..10269 - /db_xref="VBRC:35929" /gene="NS5" + /db_xref="VBRC:35929" /note="methyltransferase component of capping enzyme; nonstructural protein NS5" /product="RNA-dependent RNA polymerase NS5" diff --git a/phylogenetic/config/reference_dengue_denv3.gb b/phylogenetic/config/reference_denv3_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv3.gb rename to phylogenetic/config/reference_denv3_genome.gb diff --git a/phylogenetic/config/reference_dengue_denv4.gb b/phylogenetic/config/reference_denv4_genome.gb similarity index 100% rename from phylogenetic/config/reference_dengue_denv4.gb rename to phylogenetic/config/reference_denv4_genome.gb diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index b5280ede..0b2131bb 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -24,10 +24,10 @@ to the ones produced by Augur commands. rule ancestral: """Reconstructing ancestral sequences and mutations""" input: - tree = "results/tree_{serotype}.nwk", - alignment = "results/aligned_{serotype}.fasta" + tree = "results/{gene}/tree_{serotype}.nwk", + alignment = "results/{gene}/aligned_{serotype}.fasta" output: - node_data = "results/nt-muts_{serotype}.json" + node_data = "results/{gene}/nt-muts_{serotype}.json" params: inference = "joint" shell: @@ -42,11 +42,11 @@ rule ancestral: rule translate: """Translating amino acid sequences""" input: - tree = "results/tree_{serotype}.nwk", - node_data = "results/nt-muts_{serotype}.json", - reference = "config/reference_dengue_{serotype}.gb" + tree = "results/{gene}/tree_{serotype}.nwk", + node_data = "results/{gene}/nt-muts_{serotype}.json", + reference = "config/reference_{serotype}_genome.gb" output: - node_data = "results/aa-muts_{serotype}.json" + node_data = "results/{gene}/aa-muts_{serotype}.json" shell: """ augur translate \ @@ -62,10 +62,10 @@ rule traits: - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias """ input: - tree = "results/tree_{serotype}.nwk", + tree = "results/{gene}/tree_{serotype}.nwk", metadata = "data/metadata_{serotype}.tsv" output: - node_data = "results/traits_{serotype}.json", + node_data = "results/{gene}/traits_{serotype}.json", params: columns = lambda wildcards: config['traits']['traits_columns'][wildcards.serotype], sampling_bias_correction = config['traits']['sampling_bias_correction'], @@ -85,12 +85,12 @@ rule traits: rule clades: """Annotating serotypes / genotypes""" input: - tree = "results/tree_{serotype}.nwk", - nt_muts = "results/nt-muts_{serotype}.json", - aa_muts = "results/aa-muts_{serotype}.json", + tree = "results/{gene}/tree_{serotype}.nwk", + nt_muts = "results/{gene}/nt-muts_{serotype}.json", + aa_muts = "results/{gene}/aa-muts_{serotype}.json", clade_defs = lambda wildcards: config['clades']['clade_definitions'][wildcards.serotype], output: - clades = "results/clades_{serotype}.json" + clades = "results/{gene}/clades_{serotype}.json" shell: """ augur clades \ diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index b2f4a558..636cfd46 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -15,9 +15,9 @@ See Augur's usage docs for these commands for more details. rule tree: """Building tree""" input: - alignment = "results/aligned_{serotype}.fasta" + alignment = "results/{gene}/aligned_{serotype}.fasta" output: - tree = "results/tree-raw_{serotype}.nwk" + tree = "results/{gene}/tree-raw_{serotype}.nwk" shell: """ augur tree \ @@ -35,12 +35,12 @@ rule refine: - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation """ input: - tree = "results/tree-raw_{serotype}.nwk", - alignment = "results/aligned_{serotype}.fasta", + tree = "results/{gene}/tree-raw_{serotype}.nwk", + alignment = "results/{gene}/aligned_{serotype}.fasta", metadata = "data/metadata_{serotype}.tsv" output: - tree = "results/tree_{serotype}.nwk", - node_data = "results/branch-lengths_{serotype}.json", + tree = "results/{gene}/tree_{serotype}.nwk", + node_data = "results/{gene}/branch-lengths_{serotype}.json", params: coalescent = "const", date_inference = "marginal", diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index 859f49af..f7d2ced2 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -40,7 +40,7 @@ rule colors: rule prepare_auspice_config: """Prepare the auspice config file for each serotypes""" output: - auspice_config="results/config/auspice_config_{serotype}.json", + auspice_config="results/config/{gene}/auspice_config_{serotype}.json", params: replace_clade_key="clade_membership", replace_clade_title=lambda wildcard: r"Serotype" if wildcard.serotype in ['all'] else r"DENV genotype", @@ -109,17 +109,17 @@ rule prepare_auspice_config: rule export: """Exporting data files for auspice""" input: - tree = "results/tree_{serotype}.nwk", + tree = "results/{gene}/tree_{serotype}.nwk", metadata = "data/metadata_{serotype}.tsv", - branch_lengths = "results/branch-lengths_{serotype}.json", - traits = "results/traits_{serotype}.json", - clades = "results/clades_{serotype}.json", - nt_muts = "results/nt-muts_{serotype}.json", - aa_muts = "results/aa-muts_{serotype}.json", - auspice_config = "results/config/auspice_config_{serotype}.json", + branch_lengths = "results/{gene}/branch-lengths_{serotype}.json", + traits = "results/{gene}/traits_{serotype}.json", + clades = "results/{gene}/clades_{serotype}.json", + nt_muts = "results/{gene}/nt-muts_{serotype}.json", + aa_muts = "results/{gene}/aa-muts_{serotype}.json", + auspice_config = "results/config/{gene}/auspice_config_{serotype}.json", colors = "results/colors_{serotype}.tsv", output: - auspice_json = "results/raw_dengue_{serotype}.json" + auspice_json = "results/{gene}/raw_dengue_{serotype}.json" params: strain_id = config.get("strain_id_field", "strain"), shell: @@ -137,10 +137,10 @@ rule export: rule final_strain_name: input: - auspice_json="results/raw_dengue_{serotype}.json", + auspice_json="results/{gene}/raw_dengue_{serotype}.json", metadata="data/metadata_{serotype}.tsv", output: - auspice_json="auspice/dengue_{serotype}_genome.json" + auspice_json="auspice/dengue_{serotype}_{gene}.json" params: strain_id=config.get("strain_id_field", "strain"), display_strain_field=config.get("display_strain_field", "strain"), diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 5cb14c75..6a0c2675 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -56,7 +56,7 @@ rule filter: metadata = "data/metadata_{serotype}.tsv", exclude = config["filter"]["exclude"], output: - sequences = "results/filtered_{serotype}.fasta" + sequences = "results/{gene}/filtered_{serotype}.fasta" params: group_by = config['filter']['group_by'], sequences_per_group = lambda wildcards: config['filter']['sequences_per_group'][wildcards.serotype], @@ -82,10 +82,10 @@ rule align: - filling gaps with N """ input: - sequences = "results/filtered_{serotype}.fasta", - reference = "config/reference_dengue_{serotype}.gb" + sequences = "results/{gene}/filtered_{serotype}.fasta", + reference = "config/reference_{serotype}_genome.gb" output: - alignment = "results/aligned_{serotype}.fasta" + alignment = "results/{gene}/aligned_{serotype}.fasta" shell: """ augur align \ diff --git a/phylogenetic/rules/prepare_sequences_E.smk b/phylogenetic/rules/prepare_sequences_E.smk new file mode 100644 index 00000000..bf4f0b65 --- /dev/null +++ b/phylogenetic/rules/prepare_sequences_E.smk @@ -0,0 +1,60 @@ +""" +This part of the workflow prepares reference files and sequences for constructing the gene phylogenetic trees. +REQUIRED INPUTS: + reference = path to reference sequence or genbank + sequences = path to all sequences from which gene sequences can be extracted + +OUTPUTS: + gene_fasta = reference fasta for the gene (e.g. E gene) + gene_genbank = reference genbank for the gene (e.g. E gene) + sequences = sequences with gene sequences extracted and aligned to the reference gene sequence +This part of the workflow usually includes the following steps: + - newreference.py: Creates new gene genbank and gene reference FASTA from the whole genome reference genbank + - nextclade: Aligns sequences to the reference gene sequence and extracts the gene sequences to ensure the reference files are valid +See Nextclade or script usage docs for these commands for more details. +""" + +ruleorder: align_and_extract_E > decompress + +rule generate_E_reference_files: + """ + Generating reference files for the E gene + """ + input: + reference = "config/reference_{serotype}_genome.gb", + output: + fasta = "results/config/reference_{serotype}_E.fasta", + genbank = "results/config/reference_{serotype}_E.gb", + params: + gene = "E", + shell: + """ + python3 bin/newreference.py \ + --reference {input.reference} \ + --output-fasta {output.fasta} \ + --output-genbank {output.genbank} \ + --gene {params.gene} + """ + +rule align_and_extract_E: + """ + Cutting sequences to the length of the E gene reference sequence + """ + input: + sequences = "data/sequences_{serotype}.fasta", + reference = "results/config/reference_{serotype}_E.fasta" + output: + sequences = "results/E/sequences_{serotype}.fasta" + params: + min_length = 1000, + shell: + """ + nextclade run \ + -j 1 \ + --input-ref {input.reference} \ + --output-fasta {output.sequences} \ + --min-seed-cover 0.01 \ + --min-length {params.min_length} \ + --silent \ + {input.sequences} + """