From 570daa60bf20c26f3477bdba8384f5b506abcd59 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 8 Mar 2024 11:41:00 -0800 Subject: [PATCH 1/2] Allows for CDS (as well as gene) features to generate a new gene reference The newreference.py script processes a GenBank file, producing new gene reference files (GenBank and FASTA) which are required for creating the Nextstrain gene tree views. As we intend to use this script as a template for other viruses (e.g. dengue, measles), it was necessary to allow the use of CDS features in the GenBank file, not just the gene. --- scripts/newreference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/newreference.py b/scripts/newreference.py index 0d7b9e5..c6b47ec 100644 --- a/scripts/newreference.py +++ b/scripts/newreference.py @@ -6,10 +6,12 @@ def new_reference(referencefile, outgenbank, outfasta, gene): ref = SeqIO.read(referencefile, "genbank") + startofgene = None + endofgene = None for feature in ref.features: if feature.type == 'source': ref_source_feature = feature - if feature.type =='gene': + if feature.type =='gene' or feature.type == 'CDS': a = list(feature.qualifiers.items())[0][-1][0] if a == gene: startofgene = int(list(feature.location)[0]) From 8cd6a1384301161117252f8e6f612f7332bd9416 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 8 Mar 2024 13:57:13 -0800 Subject: [PATCH 2/2] newreference: If gene is given but not found in genbank, error out Currently, if the --gene option is used and the gene name is not found, the script will use the entire genome which may cause some silent undesired behaviors. This commit changes that such that the script will error out if the gene is not found in the GenBank file as this indicates the gene name may be misspelled or the user may be using the wrong GenBank file. If the --gene option is not used, the script will continue to process the entire genome as expected. Co-authored-by: Jover Lee --- scripts/newreference.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/newreference.py b/scripts/newreference.py index c6b47ec..063dafb 100644 --- a/scripts/newreference.py +++ b/scripts/newreference.py @@ -3,6 +3,7 @@ from Bio.SeqFeature import SeqFeature, FeatureLocation, Seq import shutil import argparse +import sys def new_reference(referencefile, outgenbank, outfasta, gene): ref = SeqIO.read(referencefile, "genbank") @@ -17,6 +18,12 @@ def new_reference(referencefile, outgenbank, outfasta, gene): startofgene = int(list(feature.location)[0]) endofgene = int(list(feature.location)[-1])+1 + # If user provides a --gene 'some name' is not found, print a warning and use the entire genome. + # Otherwise do not print a warning. + if(gene is not None and startofgene is None and endofgene is None): + print(f"ERROR: No '{gene}' was found under 'gene' or 'CDS' features in the GenBank file.", file=sys.stderr) + sys.exit(1) + record = ref[startofgene:endofgene] source_feature = SeqFeature(FeatureLocation(start=0, end=len(record)), type='source', qualifiers=ref_source_feature.qualifiers)