# panTE.py — builds a pan-TE library from per-genome EarlGrey TE annotation runs
# (web-scrape page chrome and gutter line numbers removed)
import argparse
import os
import re
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import shutil
import subprocess
#TODO removing IDX files and pre-pan file before running the script
#Define global vars
# Expected per-genome input file suffixes: the EarlGrey consensus library,
# the RepeatMasker alignment table, and the genome fasta. Inputs are looked
# up as <path>/<prefix>.<suffix> for every prefix in the genome list.
suffixes = ['EarlGrey.families.strained', 'EarlGrey.RM.out', 'fna']
#Example run
#python3 panTE.py -p /home/eduardo/Documentos/panTE/ -l genome.list -c 3 -d 20 -i 10 -e 10 -v 0.8
def parse_arguments():
    """Build the CLI and return the parsed argument namespace."""
    cli = argparse.ArgumentParser(description='Generates a panTE file from genome-specific runs of TE annotation')
    # Input locations.
    cli.add_argument('-l', '--prefix_list', default='genome.list',
                     help='Text file with a list of genome file prefixes. Default is "genome.list".')
    cli.add_argument('-p', '--path', required=True,
                     help='Path to input files.')
    # Full-length TE selection thresholds.
    cli.add_argument('-c', '--fl_copy', default=3, type=int, help='Number of copies of the TE family in the genome. Default is 3.')
    cli.add_argument('-s', '--strict', action='store_true', default=False,
                     help='Use strict parameters for full length TE identification. Boolean. Default is False.')
    cli.add_argument('-d', '--div', default=20, type=float, help='Maximum divergence allowed. Default is 20.')
    cli.add_argument('-i', '--ins', default=10, type=float, help='Maximum insertion allowed. Default is 10.')
    cli.add_argument('-e', '--dele', default=10, type=float, help='Maximum deletion allowed. Default is 10.')
    cli.add_argument('-v', '--cov', default=0.8, type=float, help='Minimum coverage allowed. Default is 0.8.')
    # Nested-sequence removal (BLAST) parameters.
    cli.add_argument('--iter', default=1, type=int, help='Number of iterations to detect nested sequences. Default is 1.')
    cli.add_argument('--minhsplen', default=80, type=int, help='Minimum HSP length. Default is 80 (bp).')
    cli.add_argument('--minhspident', default=80, type=int, help='Minimum HSP identity. Default is 80 (%).')
    cli.add_argument('--minlen', default=80, type=int, help='Minimum length of the cleaned sequence to retain. Default is 80 (bp).')
    cli.add_argument('--minident', default=80, type=int, help='Minimum identity of the cleaned sequence to retain. Default is 80 (%).')
    return cli.parse_args()
def read_identifiers(file):
    """Read genome file prefixes, one per line, from *file*.

    Blank lines are skipped — previously a trailing newline produced an
    empty prefix, which later generated bogus "<path>/.<suffix>" lookups
    in find_expected_files(). Returns [] if the file does not exist.
    """
    try:
        with open(file, 'r') as f:
            return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f'File {file} not found.')
        return []
def find_expected_files(path, suffixes, identifiers):
    """Verify that every <path>/<identifier>.<suffix> input file exists.

    Prints each missing path so the user sees everything that is wrong in
    one run. Returns True when all files are present, False otherwise
    (the original implementation fell through and returned None).
    """
    all_found = True
    for identifier in identifiers:
        for suffix in suffixes:
            file_path = f'{path}/{identifier}.{suffix}'
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                all_found = False
    return all_found
def get_flTE(path,genomeFilePrefixes,strict,max_div,max_ins,max_del,min_cov,fl_copy,iteration,minhsplen,minhspident,minlen):
    """Identify full-length TE families per genome and export them.

    For each genome prefix, parses <prefix>.EarlGrey.RM.out (RepeatMasker
    table), counts near-full-length insertions per TE family, and writes
    families seen more than *fl_copy* times to <prefix>.flTE.list (TSV map)
    and <prefix>.flTE.fa (sequences pulled from the EarlGrey library, renamed
    TE_<counter>#<class>). Refactored from find_flTE.pl in the EDTA package.

    Note: iteration/minhsplen/minhspident/minlen are not used here; they are
    kept so the call signature stays stable with the rest of the pipeline.
    """
    for genomeFilePrefix in genomeFilePrefixes:
        count_TE_identifiers = {}
        out_flTE = f'{path}/{genomeFilePrefix}.flTE.list'
        outfa_flTE = f'{path}/{genomeFilePrefix}.flTE.fa'
        filePath = f'{path}/{genomeFilePrefix}.EarlGrey.RM.out'
        teSequenceFile = f'{path}/{genomeFilePrefix}.EarlGrey.families.strained'
        print(filePath)
        with open(filePath, 'r') as f:
            for line in f:
                # RepeatMasker wraps the "left" coordinates in parentheses.
                line = re.sub(r'[\(\)]+', '', line)
                # Skip empty / whitespace-only lines.
                if re.match(r'^\s*$', line):
                    continue
                columns = line.split()
                if len(columns) < 14:
                    continue
                # Skip the two header lines of the .out table.
                if columns[0] == 'SW' or columns[0] == 'score':
                    continue
                SW, div, del_, ins = int(columns[0]), float(columns[1]), float(columns[2]), float(columns[3])
                chr_, start, end, strand = columns[4], int(columns[5]), int(columns[6]), columns[8]
                #TODO:Check the case for the complement strand
                if strand == '+':
                    id_, type_, TEs, TEe, TEleft = columns[9], columns[10], int(columns[11]), int(columns[12]), int(columns[13])
                else:
                    # On the complement strand the consensus coordinates are
                    # reported in reverse order: (left) end begin.
                    id_, type_, TEleft, TEe, TEs = columns[9], columns[10], int(columns[11]), int(columns[12]), int(columns[13])
                # Simple repeats / low complexity are not TEs.
                if type_ == "Simple_repeat" or type_ == "Low_complexity":
                    continue
                # Skip unless SW is a number.
                if not re.match(r'[0-9]+', str(SW)):
                    continue
                TEidClassFam = f'{id_.lower()}#{type_}'
                if strict:
                    # Strict mode: only perfect (0 div/ins/del), complete
                    # (consensus start 1, nothing left) copies count.
                    if div == 0 and ins == 0 and del_ == 0:
                        if TEs == 1 and TEleft == 0:
                            count_TE_identifiers[TEidClassFam] = count_TE_identifiers.get(TEidClassFam, 0) + 1
                else:
                    # Relaxed mode: apply divergence/insertion/deletion limits.
                    if div <= max_div and ins <= max_ins and del_ <= max_del:
                        full_len = TEe + TEleft
                        length = TEe - TEs + 1
                        # Keep copies covering at least min_cov of the consensus.
                        if length / (full_len + 1) >= min_cov:
                            count_TE_identifiers[TEidClassFam] = count_TE_identifiers.get(TEidClassFam, 0) + 1
        print(f"Writing full length TEs that appear more than {fl_copy} times in the genome. Outfiles: {out_flTE} and {outfa_flTE}")
        # Index the TE family fasta for random access by ID.
        TE_fasta = SeqIO.index(teSequenceFile, "fasta")
        countTEs = 0
        with open(out_flTE, 'w') as o, open(outfa_flTE, "w") as ofa:
            print(f'TE_OriID\tTE_NewID\tNumberCopiesInGenome', file=o)
            for TE, copies in count_TE_identifiers.items():
                if copies > fl_copy:
                    if TE in TE_fasta:
                        countTEs += 1
                        # New identifier: running counter zero-padded to 8 digits.
                        oldID, teClass = TE.split('#')
                        newID = f'TE_{countTEs:08}#{teClass}'
                        # BUGFIX: SeqIO.index() parses a fresh SeqRecord on every
                        # lookup, so mutating TE_fasta[TE].id and then writing
                        # TE_fasta[TE] discarded the rename and wrote the original
                        # ID. Bind the record once, rename it, write that object.
                        record = TE_fasta[TE]
                        record.id = newID
                        record.description = ''
                        SeqIO.write(record, ofa, "fasta")
                        print(f'{TE}\t{newID}\t{copies}', file=o)
                    else:
                        print(f"TE {TE} not found in the TE fasta file")
    # Final output per genome: *.flTE.fa
def remove_nested_sequences(path, iteration, minhsplen, minhspident, minlen):
    """Detect TE sequences nested inside other TEs via all-vs-all BLAST.

    Per iteration: copies pre_panTE.flTE.fa to panTE.flTE.iter<N>.fa, builds
    a BLAST nucleotide db from it, BLASTs every sequence against the db, and
    for each subject masks out the HSP spans hit by other queries. Subjects
    whose unmasked remainder is longer than *minlen* but shorter than the
    original are recorded in keep_TEs / touched_TEs.

    NOTE(review): keep_TEs is accumulated but never written to disk — the
    writer was disabled in the original (see TODO at the end); the visible
    effects are the intermediate fasta/db/BLAST-result files.
    """
    keep_TEs = {}
    touched_TEs = {}
    flTE = f'{path}/pre_panTE.flTE.fa'
    print(f'REMOVING NESTED SEQUENCES {flTE}\t{iteration}')
    for it in range(0, int(iteration)):  # renamed from `iter` (shadowed builtin)
        fileiter = f'{path}/panTE.flTE.iter{it}.fa'
        fileiteridx = f'{path}/panTE.flTE.iter{it}.fa.idx'
        shutil.copy(flTE, fileiter)
        #TODO: Fix this shoudl not copy the original file
        # Index the fasta (SQLite-backed) for random sequence retrieval.
        fasta_dict = SeqIO.index_db(fileiteridx, fileiter, "fasta")
        makeblastdb = ['makeblastdb', '-in', fileiter, '-dbtype', 'nucl']
        try:
            subprocess.run(makeblastdb, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            print(f"makeblastdb ran successfully on file {fileiter}")
        except subprocess.CalledProcessError as e:
            print(f"Error running makeblastdb: {e}")
            print(e.stderr.decode())
        for sequence_id in fasta_dict.keys():
            # File-system-safe name: strip the "#class" suffix from the TE ID.
            sequence_id2 = ''
            if "#" in sequence_id:
                sequence_id2 = sequence_id.split("#")[0]
            # Write the query sequence to a temporary file for BLAST input.
            temp_fasta = f"{path}/temp_{sequence_id2}.fasta"
            with open(temp_fasta, "w") as temp_file:
                SeqIO.write(fasta_dict[sequence_id], temp_file, "fasta")
            output_blast_file = os.path.join(path, f"{sequence_id2}_blast_result.txt")
            blast_command = [
                'blastn',
                '-query', temp_fasta,
                '-db', fileiter,
                '-out', output_blast_file,
                '-outfmt', '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send qlen slen',
                '-evalue', '1e-5',
                '-word_size', '7',
                '-dust', 'no'
            ]
            try:
                subprocess.run(blast_command, check=True)
                print(f"BLAST completed for sequence {sequence_id}. Results saved to {output_blast_file}")
            except subprocess.CalledProcessError as e:
                print(f"Error running BLAST for sequence {sequence_id}: {e}")
            os.remove(temp_fasta)
            # Collect HSPs per subject from the tabular BLAST output.
            with open(output_blast_file, "r") as blast_output:
                hsps = {}
                for line in blast_output:
                    columns = line.strip().split("\t")
                    qseqid = columns[0]
                    sseqid = columns[1]
                    pident = float(columns[2])
                    length = int(columns[3])
                    sstart = int(columns[8])
                    send = int(columns[9])
                    # A subject already trimmed in this run is left alone.
                    if sseqid in touched_TEs:
                        continue
                    # Ignore self-hits.
                    if qseqid == sseqid:
                        continue
                    # Short or low-identity hits do not count as nesting.
                    if length <= minhsplen:
                        keep_TEs[qseqid] = fasta_dict[qseqid].seq
                        continue
                    if pident <= minhspident:
                        keep_TEs[qseqid] = fasta_dict[qseqid].seq
                        continue
                    # Normalize minus-strand subject coordinates.
                    if sstart > send:
                        sstart, send = send, sstart
                    hsps.setdefault(sseqid, []).append([sstart, send])
                for subject in hsps:
                    # BUGFIX: mask ALL HSPs into one copy of the subject.
                    # The original re-fetched a fresh sequence per HSP (so only
                    # one mask survived) and masked with sstart/send left over
                    # from the last parsed BLAST line instead of the HSP's own
                    # coordinates (ssstart/ssend).
                    subjectseq = list(fasta_dict[subject].seq)
                    for ssstart, ssend in hsps[subject]:
                        # BLAST coordinates are 1-based inclusive.
                        subjectseq[ssstart - 1:ssend] = ['R'] * ((ssend - ssstart) + 1)
                    # Drop the masked positions to get the unnested remainder.
                    subjectseq_str = ''.join(subjectseq).replace('R', '')
                    if minlen < len(subjectseq_str) < len(fasta_dict[subject].seq):
                        keep_TEs[subject] = subjectseq_str
                        touched_TEs[subject] = 1
        #TODO: write keep_TEs to panTE.flTE.iter{it+1}.fa (writer was disabled
        # in the original implementation and is intentionally left off here).
def join_and_rename(path,genomeFilePrefixes):
    """Concatenate per-genome flTE fastas into pre_panTE.flTE.fa with new IDs.

    Sequences are renamed TE_<counter>#<class> (counter zero-padded, unique
    across all genomes) and the old→new ID mapping is written per genome to
    <prefix>.flTE.mapids. The record description carries the genome prefix.
    """
    countTEs = 0
    out_flTE = f'{path}/pre_panTE.flTE.fa'
    # BUGFIX: the output was only ever opened in append mode, so re-running
    # the pipeline duplicated every sequence (see TODO at top of file).
    # Truncate any leftover file from a previous run before appending.
    open(out_flTE, 'w').close()
    for genomeFilePrefix in genomeFilePrefixes:
        in_flTE = f'{path}/{genomeFilePrefix}.flTE.fa'
        mapids_flTE = f'{path}/{genomeFilePrefix}.flTE.mapids'
        with open(mapids_flTE, 'w') as map_file, open(in_flTE, 'r') as f, open(out_flTE, 'a') as o:
            for record in SeqIO.parse(f, "fasta"):
                countTEs += 1
                # New identifier: running counter zero-padded to 8 digits.
                oldID, teClass = f'{record.id}'.split('#')
                newID = f'TE_{countTEs:08}#{teClass}'
                map_file.write(f"{oldID}\t{teClass}\t{newID}\n")
                newrecord = SeqRecord(
                    record.seq,
                    id=newID,
                    description=genomeFilePrefix
                )
                SeqIO.write(newrecord, o, "fasta")
def main():
    """Entry point: validate environment and inputs, then run the pipeline."""
    # Parse the command-line arguments.
    args = parse_arguments()
    # BLAST+ is required later by remove_nested_sequences(); abort early if
    # it is missing. BUGFIX: the original only printed the warning and then
    # carried on, crashing much later inside the BLAST subprocess calls.
    blast_path = shutil.which("makeblastdb")
    if not blast_path:
        print(f"makeblastdb is not found in the PATH. Please install BLAST+, before continuing.")
        return
    # Read the genome prefixes from the list file.
    genomeFilePrefixes = read_identifiers(args.prefix_list)
    if not genomeFilePrefixes:
        print("Nenhum identificador encontrado.")
        return
    # Verify all expected input files exist before doing any work.
    found_files = find_expected_files(args.path, suffixes, genomeFilePrefixes)
    if found_files:
        print("Arquivos correspondentes encontrados.")
        get_flTE(args.path, genomeFilePrefixes, args.strict, args.div, args.ins, args.dele, args.cov,
                 args.fl_copy, args.iter, args.minhsplen, args.minhspident, args.minlen)
        join_and_rename(args.path, genomeFilePrefixes)
        remove_nested_sequences(args.path, args.iter, args.minhsplen, args.minhspident, args.minlen)
    else:
        print("Some files are missing. Check your input.")
if __name__ == '__main__':
    main()