-
Notifications
You must be signed in to change notification settings - Fork 0
/
gtf2tbl.py
executable file
·304 lines (273 loc) · 10.1 KB
/
gtf2tbl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#!/usr/bin/env python3
import argparse
import operator
import re
import sys
# Convert GTF file to GenePred format.
#
# Rules for determining coordinates should be following the behaviour of
# Kent Tools gtfToGenePred:
#
# gtfToGenePred -genePredExt genes.gtf genes.gp
#
# - Use BED-like coordinates (zero-based half open intervals).
# - Take protein coding transcripts only.
# - If transcript does not have a CDS, set highest genomic coordinate
# of exon as start and end for cdsStart and cdsEnd.
# - Stop codon is included in cdsStart and cdsEnd. (CDS normally does
# not include stop codon.)
class Transcript:
@staticmethod
def _iterate_gtf(filename):
with open(filename, "r") as handle:
for line in handle:
if line.startswith("#") or not line.strip():
continue
(
seq_id,
source,
type,
start,
end,
score,
strand,
phase,
attributes_str,
) = line.rstrip().split("\t")
attributes = {}
for element in attributes_str.split(";"):
match = re.match(r'^\W*([A-Za-z_]+)\W+"([^"]+)"\W*$', element)
if match:
key, value = match.groups()
attributes[key] = value
gene_biotype = attributes.get("gene_biotype")
if gene_biotype and gene_biotype == "protein_coding":
# Make intervals half-open and zero-based (in GTF they are closed intervals and 1-based) ...
yield type, seq_id, int(start) - 1, int(end), strand, attributes
@staticmethod
def load_transcripts_from_file(gtf_filename):
transcript_id2Transcript = {}
for type, seq_id, start, end, strand, attributes in Transcript._iterate_gtf(
gtf_filename
):
if (
type == "transcript"
or type == "exon"
or type == "CDS"
or type == "stop_codon"
):
transcript_id = attributes["transcript_id"]
gene_id = attributes["gene_id"]
transcript = (
transcript_id2Transcript[transcript_id]
if transcript_id in transcript_id2Transcript
else Transcript(transcript_id, gene_id, seq_id, strand)
)
if type == "exon":
exon = (start, end)
transcript.exons.append(exon)
elif type == "CDS":
CDS = (start, end)
transcript.CDSs.append(CDS)
elif type == "stop_codon":
stop_codon = (start, end)
transcript.stop_codon = stop_codon
if transcript_id not in transcript_id2Transcript:
transcript_id2Transcript[transcript_id] = transcript
return transcript_id2Transcript.values()
def __init__(self, transcript_id, gene_id, chromosome, strand):
self.transcript_id = transcript_id
self.gene_id = gene_id
self.chromosome = chromosome
self.strand = strand
self.exons = []
self.CDSs = []
self.stop_codon = None
@property
def tx_start(self):
if not self.exon_count:
return None
return min(self.exon_starts)
@property
def tx_end(self):
if not self.exon_count:
return None
return max(self.exon_ends)
@property
def cds_start(self):
if not self.CDSs:
# Kent Tools gtfToGenePred seems to put CDS start always a the highest
# genomic coordinate of the transcript.
return self.tx_end # if self.strand == "-" else self.tx_start
cds_start = min(map(operator.itemgetter(0), self.CDSs))
if self.stop_codon and self.strand == "-":
# If transcript has an annotated stop codon, set start position as CDS
# start if transcript is on negative strand.
return self.stop_codon[0]
return cds_start
@property
def cds_end(self):
if not self.CDSs:
# Kent Tools gtfToGenePred seems to put CDS end always a the highest
# genomic coordinate of the transcript.
return self.tx_end # if self.strand == "-" else self.tx_start
cds_end = max(map(operator.itemgetter(1), self.CDSs))
if self.stop_codon and self.strand == "+":
# If transcript has an annotated stop codon, set end position as CDS
# end if transcript is on positive strand.
return self.stop_codon[1]
return cds_end
def __len__(self):
if not self.exon_count:
return 0
return self.tx_end - self.tx_start
@property
def exon_count(self):
return len(self.exons)
@property
def exon_starts(self):
if not self.exon_count:
return ()
return sorted(map(operator.itemgetter(0), self.exons))
@property
def exon_ends(self):
if not self.exon_count:
return ()
return sorted(map(operator.itemgetter(1), self.exons))
@property
def exon_frames(self):
"""
Get exon frames.
Get for each exon how many nucleotides it should get from the previous
exon to be in the open reading frame. If an exon is not in the CDS, "-1"
is used.
"""
if not self.exon_count:
return ()
exon_frames_list = []
left_over_codon_nucleotides = 0
if self.strand == "-":
# Negative strand.
for exon_start, exon_end in zip(
self.exon_starts[::-1], self.exon_ends[::-1]
):
if (
not self.CDSs
or (exon_start > self.cds_end)
or (exon_end < self.cds_start)
):
# Set exon frame for current exon to -1 if transcript does not
# have a CDS or exon is located outside of CDS.
exon_frames_list.append(-1)
else:
exon_frames_list.append(left_over_codon_nucleotides)
if exon_end > self.cds_end:
exon_end = self.cds_end
# Calculate number of nucleotides that are left over in the
# current exon and that will be used in the next exon to continue
# the open reading frame.
left_over_codon_nucleotides = (
exon_end + left_over_codon_nucleotides - exon_start
) % 3
return exon_frames_list[::-1]
else:
# Positive strand.
for exon_start, exon_end in zip(self.exon_starts, self.exon_ends):
if (
not self.CDSs
or (exon_end < self.cds_start)
or (exon_start > self.cds_end)
):
# Set exon frame for current exon to -1 if transcript does not
# have a CDS or exon is located outside of CDS.
exon_frames_list.append(-1)
else:
exon_frames_list.append(left_over_codon_nucleotides)
if exon_start < self.cds_start:
exon_start = self.cds_start
# Calculate number of nucleotides that are left over in the
# current exon and that will be used in the next exon to continue
# the open reading frame.
left_over_codon_nucleotides = (
exon_end + left_over_codon_nucleotides - exon_start
) % 3
return exon_frames_list
def main():
parser = argparse.ArgumentParser(
description="Convert a GTF file to a GenePred file."
)
parser.add_argument(
"-i",
"--gtf",
dest="gtf_filename",
action="store",
type=str,
required=True,
help="GTF input filename.",
)
parser.add_argument(
"-o",
"--gp",
dest="genepred_filename",
action="store",
type=str,
required=True,
help="GenePred (gene prediction) output filename.",
)
args = parser.parse_args()
print("Loading GTF into memory ...", file=sys.stderr)
transcripts = Transcript.load_transcripts_from_file(args.gtf_filename)
print("Conversion to GenePred format ...", file=sys.stderr)
with open(args.genepred_filename, "w") as fh_genepred:
print(
"#bin",
"name",
"chrom",
"strand",
"txStart",
"txEnd",
"cdsStart",
"cdsEnd",
"exonCount",
"exonStarts",
"exonEnds",
"score",
"name2",
"cdsStartStat",
"cdsEndStat",
"exonFrames",
sep="\t",
file=fh_genepred,
)
for tx in transcripts:
if len(tx) == 0:
print(
f"{tx.transcript_id:s} is an empty transcript.",
file=sys.stderr,
)
continue
exon_starts = ",".join(map(str, tx.exon_starts)) + ","
exon_ends = ",".join(map(str, tx.exon_ends)) + ","
exon_frames = ",".join(map(str, tx.exon_frames)) + ","
print(
"NA",
tx.transcript_id,
tx.chromosome,
tx.strand,
tx.tx_start,
tx.tx_end,
tx.cds_start,
tx.cds_end,
tx.exon_count,
exon_starts,
exon_ends,
"NA",
tx.gene_id,
"NA",
"NA",
exon_frames,
sep="\t",
file=fh_genepred,
)
if __name__ == "__main__":
main()