-
Notifications
You must be signed in to change notification settings - Fork 1
/
cliv2_gff2gtf.py
55 lines (42 loc) · 1.84 KB
/
cliv2_gff2gtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import sys
import re
# Any line containing "#", "contig" or "match" (which also covers
# "match_part") anywhere in its text is treated as GFF3-only bookkeeping and
# never reaches the GTF output.
_GFF_ONLY_LINE = re.compile("#.*|contig|match|match_part")


def _attribute_value(attributes, key):
    """Return the text following ``key=`` in a GFF3 attribute string, or None.

    The value ends at the next ";" or at the end of the string, so an
    attribute in final position without a trailing ";" is still found.
    """
    found = re.search(key + "=(.*?)(?:;|$)", attributes)
    return found.group(1) if found else None


def read_gff(filename):
    """Write the exon records of a GFF file to a GTF file.

    The output path is ``filename`` with every "gff" replaced by "gtf"; if
    the path contains no "gff", ".gtf" is appended instead so the input is
    never opened for writing.

    Only lines whose third column is "exon" are converted. A line is skipped
    when it matches the GFF3-only filter, has fewer than nine tab-separated
    columns, has no ``gene_id`` attribute, or has the gene_id
    "absent_entrezID". Exon lines that have a usable gene_id but lack
    ``transcript_id``, ``geneName`` or ``ID`` are skipped with a warning on
    stderr.

    Args:
        filename: path of the GFF file to read.

    Returns:
        None. The result is the GTF file written next to the input.
    """
    source_path = str(filename)
    outfile_name = source_path.replace("gff", "gtf")
    if outfile_name == source_path:
        # No "gff" in the path: writing to the same name would truncate the
        # input before a single line had been read.
        outfile_name = source_path + ".gtf"

    with open(filename, "r") as infile, open(outfile_name, "w") as outfile:
        for line_number, line in enumerate(infile, 1):
            if _GFF_ONLY_LINE.search(line):
                continue

            column = line.rstrip().split("\t")
            # Blank or truncated lines have no feature type / attribute column.
            if len(column) < 9 or column[2] != "exon":
                continue

            attributes = column[8]
            gen_id = _attribute_value(attributes, "gene_id")
            # "absent_entrezID" marks annotations that did not have blast hits.
            if gen_id is None or gen_id == "absent_entrezID":
                continue

            trans_id = _attribute_value(attributes, "transcript_id")
            gname = _attribute_value(attributes, "geneName")
            locus_id = _attribute_value(attributes, "ID")
            if trans_id is None or gname is None or locus_id is None:
                sys.stderr.write(
                    "{0}: line {1}: exon skipped, missing transcript_id, "
                    "geneName or ID\n".format(source_path, line_number)
                )
                continue

            # Columns 1-8 are copied unchanged; column 9 is rebuilt.
            first_cols = "\t".join(column[0:8])
            last_col = (
                'gene_id "' + gen_id + '"; '
                + 'transcript_id "' + trans_id + '"; gene_abbrev ' + gname
                + '; gnomeID ' + locus_id
            )
            # Stream each record instead of growing one large string.
            outfile.write(first_cols + "\t" + last_col + "\n")
if __name__ == '__main__':
    # The script needs the input GFF path as its first argument; without this
    # check a missing argument surfaces as a bare IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: {0} <input.gff>".format(sys.argv[0]))
    read_gff(sys.argv[1])