From 13ce1a930cd8ade51fabafb03beb6fc41ac5c112 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 12 Mar 2024 15:41:15 -0700 Subject: [PATCH] Add start and end flags to newreference.py to allow for creating subgenic phylogenetic trees Adds "--start" and "--end" arguments to provide 0-based start and end positions respective to a "--gene" of interest. Since the GenBank sequences can contain extra sequences off the end of the polyprotein, a the start and end positions are relative to the gene of interest was deemed more stable. --- scripts/newreference.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/scripts/newreference.py b/scripts/newreference.py index 063dafb..c8deb5c 100644 --- a/scripts/newreference.py +++ b/scripts/newreference.py @@ -5,7 +5,7 @@ import argparse import sys -def new_reference(referencefile, outgenbank, outfasta, gene): +def new_reference(referencefile, outgenbank, outfasta, gene, start, end): ref = SeqIO.read(referencefile, "genbank") startofgene = None endofgene = None @@ -25,10 +25,20 @@ def new_reference(referencefile, outgenbank, outfasta, gene): sys.exit(1) record = ref[startofgene:endofgene] + # Allows for subgenic regions + if (gene is not None and start is not None and end is not None): + record = record[int(start):int(end)] + source_feature = SeqFeature(FeatureLocation(start=0, end=len(record)), type='source', qualifiers=ref_source_feature.qualifiers) record.features.append(source_feature) + # For subgenic regions, adds a CDS feature {gene}_{start}_{end} + if(gene is not None and start is not None and end is not None): + gene = gene + "_" + start + "_" + end + record.features.append(SeqFeature(FeatureLocation(start=0, end=len(record)), type='CDS', + qualifiers={'gene': gene})) + SeqIO.write(record, outgenbank, 'genbank') SeqIO.write(record, outfasta, "fasta") @@ -43,11 +53,19 @@ def new_reference(referencefile, outgenbank, outfasta, gene): parser.add_argument("--output-fasta", required=True, help="GenBank new reference file") parser.add_argument("--output-genbank", required=True, help="GenBank new reference file") parser.add_argument("--gene", help="gene name or genome for entire genome") + parser.add_argument("--start", help="Start 0-based position relative to the gene (requires --gene argument)") + parser.add_argument("--end", help="End 0-based position relative to the gene (requires --gene argument)") args = parser.parse_args() if args.gene=='genome': + # Check if start and end are specified here + if (args.start is not None and args.end is not None): + print(f"ERROR: --start '{args.start}' --end '{args.end}' is not supported for full genome.", file=sys.stderr) + sys.exit(1) + + shutil.copy(args.reference, args.output_genbank) SeqIO.write(SeqIO.read(args.reference, 'genbank'), args.output_fasta, 'fasta') else: - new_reference(args.reference, args.output_genbank, args.output_fasta, args.gene) + new_reference(args.reference, args.output_genbank, args.output_fasta, args.gene, args.start, args.end)