From e267f053af06b2ede9eefd4a62a21ba5f93efcbd Mon Sep 17 00:00:00 2001 From: Monica Poelchau Date: Mon, 16 Oct 2023 15:15:10 -0500 Subject: [PATCH] get protein_id from CDS --- gff3tool/bin/gff3_to_fasta.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gff3tool/bin/gff3_to_fasta.py b/gff3tool/bin/gff3_to_fasta.py index 499dff8..a059439 100755 --- a/gff3tool/bin/gff3_to_fasta.py +++ b/gff3tool/bin/gff3_to_fasta.py @@ -191,10 +191,11 @@ def splicer(gff, ftype, dline, stype, embedded_fasta=False): cname = child['attributes']['Name'] defline='>{0:s}'.format(cid) if stype == "pep": - if 'protein_id' in child['attributes']: - cid = child['attributes']['protein_id'] - else: - cid = re.sub(r'(.+-)(R)([a-zA-Z]+)', r'\1P\3', cid) + for grandchild in child['children']: #first try to get the CDS protein_id + if 'protein_id' in grandchild['attributes']: + cid = grandchild['attributes']['protein_id'] + + cid = re.sub(r'(.+-)(R)([a-zA-Z]+)', r'\1P\3', cid)#otherwise, if it has the -R[A-Z] format then modify that to -P[A-Z] defline = '>{0:s}'.format(cid) elif ftype[0] == 'CDS': defline='>{0:s}-CDS'.format(cid)