-
Notifications
You must be signed in to change notification settings - Fork 3
/
VPOT_conf.py
403 lines (399 loc) · 16.4 KB
/
VPOT_conf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
###########################################################################################################
# #
###########################################################################################################
#
import sys, re, glob, os, subprocess, time #
import numpy as np #
from shutil import copyfile #
#
tab='\t' #
nl='\n' #
#
###########################################################################################################
# Define global variables
##########################################################################################################
#
def init(): ##
global VPOT_option #
VPOT_option=0 #
global input_file #
input_file="" #
global output_dir #
output_dir="" #
global final_output_file #
final_output_file="" #
global ufunction #
ufunction="" #
global parameter_file #
parameter_file="" #
global gene_list #
gene_list="" #
global Population #
Population = ["ExAC","1000g"] #
global Exonic #
Exonic = ["ExonicFunc","VARIANT_TYPE"] #
global Inheritance_model #
Inheritance_model = ["DN","AR","AD","CH"] #
global chromosome_list #
chromosome_list = ["0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y","M","MT"] #
global MAFval #
MAFval="0.01" #
global PF #
PF="PF" #
global PD #
PD="PD" #
global VT #
VT="VT" #
global GN #
GN="GN" #
global VS #
VS="VS" #
global QC #
QC="QC" #
global QC_PASS #
QC_PASS=False #
global startlen #
startlen=0 #
global PD_type #
PD_type=2 #
global PD_low #
PD_low=3 #
global PD_mid #
PD_mid=5 #
global PD_high #
PD_high=7 #
global Maxval # allow for max of 28 characters values for an alpha predictor.
Maxval=28 # allow for max of 28 characters values for an alpha predictor.
global CV #
CV="CV" #
global Non_alt_GT_types #
Non_alt_GT_types = ["0","."] #
#
global Maxcoverage # read coverage of sample must be equal or greater then this to included
Maxcoverage=0 # read coverage of sample must be equal or greater then this to included
#Maxcoverage=10 # read coverage of sample must be equal or greater then this to included
global Hete_Balance # Allele balance for heteozygous call, must be equal or greater then this to accept as hete, else is homozygous ref
Hete_Balance=0 # Allele balance for heteozygous call, must be equal or greater then this to accept as hete, else is homozygous ref
global Genotype_Quality # Genotype quality score must be equal or greater then this to include
Genotype_Quality=0 # Genotype quality score must be equal or greater then this to include
global VariantScoreThreshold # Variant score threshold - do not include variant in VPOL less than or equal to this value
VariantScoreThreshold=0 # Variant score threshold - do not include variant in VPOL less than or equal to this value
global VariantPercentageThreshold # Variant percentage threshold - do not include variant in VPOL below or equal to this percentage value
VariantPercentageThreshold=0 # Variant percentage threshold - do not include variant in VPOL below or equal to this percentage value
global sample_coverage # VCF format codes to look out for
global sample_coverage_loc # location of VCF format codes for sample
sample_coverage = ["GT","NR","NV","DP","AD","GQ"] # VCF format codes to look out for
sample_coverage_loc = [-1,-1,-1,-1,-1,-1] # location of VCF format codes for sample
# sample_coverage = ["GT","NR","NV","DP","AD"] # VCF format codes to look out for
# sample_coverage_loc = [-1,-1,-1,-1,-1] # location of VCF format codes for sample
#
global GT_val #
global NR_val #
global NV_val #
global DP_val #
global AD_val #
global GQ_val #
GT_val=0 #
NR_val=1 #
NV_val=2 #
DP_val=3 #
AD_val=4 #
GQ_val=5 #
#
global pop_array #
global pred_array #
global PF_array #
global PD_array #
global VT_array #
global GN_value #
global GENE_NAME #
global Gene_ref #
global Sample_ids #
pop_array=[] #
pred_array=[] #
PF_array=[] #
PD_array=[] #
VT_array=[] #
GN_value="" #
GENE_NAME="GENE_NAME" #
Gene_ref="NONE" #
Sample_ids=[] #
#
global input_type_gVCF #
input_type_gVCF=False #
global txt_start #
global txt_end#
txt_start=-1 #
txt_end=-1 #
global sample_loc #
global sample_ID #
global INFO_loc #
global FORMAT_loc #
sample_loc=-1 #
sample_ID="" #
INFO_loc=-1 #
FORMAT_loc=-1 #
#GT_loc=-1 #
global header_ln #
global blank_variant_ln #
global inh_model #
inh_model="NONE" #
header_ln="" #
blank_variant_ln="" #
#
global temp_output_file #
global working_file1 #
global working_file2 #
global working_file3 #
global working_file4 #
global full_file1 #
global full_file2 #
global sort_file1 #
global sort_file2 #
global sort_file3 #
temp_output_file="" #
working_file1="" #
working_file2="" #
working_file3="" #
working_file4="" #
full_file1="" #
full_file2="" #
sort_file1="" #
sort_file2="" #
sort_file3="" #
# output files
###########################################################################################################
#
###########################################################################################################
# determine if input field is a numeric field #
###########################################################################################################
def is_number(s):
try:
float(s)
# print "float :",s #
return True
except ValueError :
# print "not float :",s #
return False
#
###########################################################################################################
#
###########################################################################################################
def update_existing_variants(work_file,value): #
#
# print "update_existing_variants(): " #
##
with open(work_file,'r',encoding="utf-8") as samples_file, open(full_file2,'w',encoding="utf-8") as final_file : #
#ed outline1 = header_ln+nl #
# print "header line )",outline1 #
# print "test blank new variant line )",outline2 #
# print "blank variant line splited )",variant_line #
#ed final_file.write(outline1) # write new header line with new sample ID to the new combined samples output file
#
for line2 in samples_file: # work each line of current samples variant file
# print "Line in combined samples file: ",line2 #
line2_array=re.split('\t|\n|\r',line2) # split into file location and sample id
if (line2_array[0] != "#CHROM"): # not header line
if (value == "1"): # 1st file then no need for any addition
outline3='\t'.join(line2_array[:-1])+nl #
else : # not first - then just do the zero
outline3='\t'.join(line2_array[:-1])+tab+value+nl #
# print "new variant : ", outline3 #
final_file.write(outline3) #
#
#
#
###########################################################################################################
#
###########################################################################################################
def incorporate_this_src_into_full_file(): #
#
# print "incorporate_this_src_into_full_file():" #
# working_file1 = new sample's variants
# full_file1 = current combined samples' variants
# full_file2 = the new combined samples' variants file - will contain new sample's variants
#
new_sample_not_finished=True #
current_sample_not_finished=True #
#
# COMMAND="sort -V -k1,5 "+working_file1+" | grep -v \"CHROM\" > "+sort_file1 #
# COMMAND="sort -V -k1,5 "+working_file1+" > "+sort_file1 #
# COMMAND="sort -V -u -k1,5 "+working_file1+" > "+sort_file1 #
# COMMAND="sort -u -k1,2V -k3,5 "+working_file1+" > "+sort_file1 #
COMMAND="sort -u -k1,1V -k2,2g -k4,5 "+working_file1+" > "+sort_file1 #
subprocess.call(COMMAND, shell=True) #
copyfile(sort_file1,working_file1) # copy back
# sample_output_file=output_dir+"sample_"+sample_ID+"_"+suffix+".txt" #
# copyfile(sort_file1,sample_output_file) # copy back
#
# COMMAND="sort -V -k1,5 "+full_file1+" | grep -v \"CHROM\" > "+sort_file2 #
# COMMAND="sort -V -k1,5 "+full_file1+" > "+sort_file2 #
# COMMAND="sort -k1,2V -k3,5 "+full_file1+" > "+sort_file2 #
COMMAND="sort -k1,1V -k2,2g -k4,5 "+full_file1+" > "+sort_file2 #
subprocess.call(COMMAND, shell=True) #
copyfile(sort_file2,full_file1) # copy back
#
# print working_file1 #
with open(working_file1,'r',encoding="utf-8") as new_sample, open(full_file1,'r',encoding="utf-8") as current_sample, open(full_file2,'w',encoding="utf-8") as final_file : #
#ed outline1 = header_ln+nl #
variant_line=re.split('\t|\n',blank_variant_ln) # split up the blank variant line
#ed final_file.write(outline1) # write new header line with new sample ID to the new combined samples output file
#
S1=new_sample.readline() #
C1=current_sample.readline() #
# print "S1:", S1 #
# print "C1:", C1 #
while (new_sample_not_finished and current_sample_not_finished): # while we are working the two files
line1_array=re.split('\t|\n|\r',S1) # split the variant line
line2_array=re.split('\t|\n|\r',C1) # split the variant line
new_sample_len=len(line1_array) #
current_sample_len=len(line2_array) #
#
# print ("new variant - S1 : ",line1_array[:5]) #
# print ("variant already in file C1 : ",line2_array[:5]) # same variant
if (len(line1_array[0]) > 3): # have chr prefix
wrk_chr=line1_array[0]
new_chr=wrk_chr[3:] #
wrk_chr=line2_array[0] #
existing_chr=wrk_chr[3:] #
else:
new_chr=line1_array[0] #
existing_chr=line2_array[0] #
if (new_chr == "X"):
new_chr=23
elif (new_chr == "Y"):
new_chr=24
elif ((new_chr == "M") or (new_chr == "MT")):
new_chr=25
else:
new_chr=int(new_chr)
#
if (existing_chr == "X"):
existing_chr=23
elif (existing_chr == "Y"):
existing_chr=24
elif ((existing_chr == "M") or (existing_chr == "MT")):
existing_chr=25
else:
existing_chr=int(existing_chr)
#
# print ("new chr - ",str(new_chr))
# print ("existing chr - ",str(existing_chr))
#
# if (line1_array[:5] == line2_array[:5]): # same variant
# if (line1_array[0] == line2_array[0]): # same chr
if (new_chr == existing_chr): # same chr
if (int(line1_array[1]) == int(line2_array[1])) : # same POS (left flank)
if (line1_array[3] == line2_array[3]): # same REF
if (line1_array[4] == line2_array[4]): # same ALT - so the same variants
# outline3='\t'.join(line2_array[:-1])+tab+"1"+nl #
# print ("this sample genotype : ",line1_array[new_sample_len-2])
outline3='\t'.join(line2_array[:-1])+tab+line1_array[new_sample_len-2]+nl # add in the new sample to the existing variant detail
final_file.write(outline3) #
# print ("adding new sample to current variant 1 -",outline3) #
S1=new_sample.readline() #
C1=current_sample.readline() #
elif (line1_array[4] < line2_array[4]): # not same variant and new variant is before current variant
# new_sample_len=len(line1_array) #
variant_line[:new_sample_len-2]=line1_array[:new_sample_len-2] # one less to exclude the genotype and \n
# variant_line[:new_sample_len-1]=line1_array[:new_sample_len-1] # one less to exclude the genotype and \n
outline3='\t'.join(variant_line[:-1])+tab+line1_array[new_sample_len-2]+nl # replace the last 0 with a 1
# outline3='\t'.join(variant_line[:-1])+tab+"1"+nl # replace the last 0 with a 1
final_file.write(outline3) #
# print ("saving new variant with new sample 1 -",outline3) #
S1=new_sample.readline() #
else: # (line1_array[4] > line2_array[4]): # this variant is after current
outline3='\t'.join(line2_array[:-1])+tab+"0"+nl # add a 0 for current variant for sample
final_file.write(outline3) #
# print ("saving current variant with a 0 sample 1 -",outline3) #
C1=current_sample.readline() #
elif (line1_array[3] < line2_array[3]): # different ref - earlier
# new_sample_len=len(line1_array) #
variant_line[:new_sample_len-2]=line1_array[:new_sample_len-2] # one less to exclude the genotype and \n
# variant_line[:new_sample_len-1]=line1_array[:new_sample_len-1] # one less to exclude the \n
# outline3='\t'.join(variant_line[:-1])+tab+"1"+nl # replace the last 0 with a 1
outline3='\t'.join(variant_line[:-1])+tab+line1_array[new_sample_len-2]+nl # add in the new sample
final_file.write(outline3) #
# print ("saving new variant and new sample 2 -",outline3) #
S1=new_sample.readline() #
else: # (line1_array[3] > line2_array[3]) # saved variant ref >
outline3='\t'.join(line2_array[:-1])+tab+"0"+nl #
final_file.write(outline3) #
# print ("saving current variant already in file with 0 for new sample 2 -",outline3) #
C1=current_sample.readline() #
elif (int(line1_array[1]) < int(line2_array[1])) : # earlier position
# new_sample_len=len(line1_array) #
variant_line[:new_sample_len-2]=line1_array[:new_sample_len-2] # one less to exclude the genotype and \n
# variant_line[:new_sample_len-1]=line1_array[:new_sample_len-1] # one less to exclude the \n
outline3='\t'.join(variant_line[:-1])+tab+line1_array[new_sample_len-2]+nl # add the sample genotype
# outline3='\t'.join(variant_line[:-1])+tab+"1"+nl # replace the last 0 with a 1
final_file.write(outline3) #
# print ("saving new variant and new sample 3-",outline3) #
S1=new_sample.readline() #
else: # (int(line1_array[1]) > int(line2_array[1])) : #
outline3='\t'.join(line2_array[:-1])+tab+"0"+nl #
final_file.write(outline3) #
# print ("saving current variant already in file with 0 for new sample 3 -",outline3) #
C1=current_sample.readline() #
# elif (line1_array[0] < line2_array[0]): # different chromosome
elif (new_chr < existing_chr): # different chromosome
# print ("new < exist")
# new_sample_len=len(line1_array) #
variant_line[:new_sample_len-2]=line1_array[:new_sample_len-2] # one less to exclude the genotype and \n
# variant_line[:new_sample_len-1]=line1_array[:new_sample_len-1] # one less to exclude the \n
outline3='\t'.join(variant_line[:-1])+tab+line1_array[new_sample_len-2]+nl # add the sample genotype
# outline3='\t'.join(variant_line[:-1])+tab+"1"+nl # replace the last 0 with a 1
final_file.write(outline3) #
# print ("saving new variant and new sample 4 -",outline3) #
S1=new_sample.readline() #
else: # (line1_array[0] > line2_array[0]): #
# print ("new > exist")
outline3='\t'.join(line2_array[:-1])+tab+"0"+nl #
final_file.write(outline3) #
# print ("saving current variant already in file with 0 for new sample 5 -",outline3) #
C1=current_sample.readline() #
#
if not S1 : # no more lines in new sample
new_sample_not_finished=False #
# print ("new sample file finished") #
#
if not C1 : # no more lines in old combined
current_sample_not_finished=False #
# print ("current sample file finished") #
#
while (new_sample_not_finished): # there are still line to work
line1_array=re.split('\t|\n|\r',S1) # split the variant line
# print "sample file left", line1_array #
new_sample_len=len(line1_array) #
variant_line[:new_sample_len-2]=line1_array[:new_sample_len-2] # one less to exclude the genotype and \n
# variant_line[:new_sample_len-1]=line1_array[:new_sample_len-1] # one less to exclude the \n
outline3='\t'.join(variant_line[:-1])+tab+line1_array[new_sample_len-2]+nl # add the sample genotype
# outline3='\t'.join(variant_line[:-1])+tab+"1"+nl # replace the last 0 with a 1
final_file.write(outline3) #
# print ("saving new variant and new sample 6 -",outline3) #
S1=new_sample.readline() #
if not S1 : # no more lines in new sample
new_sample_not_finished=False #
#
while (current_sample_not_finished): # there are still line to work
line2_array=re.split('\t|\n|\r',C1) # split the variant line
# print "curr file left", line2_array #
outline3='\t'.join(line2_array[:-1])+tab+"0"+nl #
final_file.write(outline3) #
# print ("saving current variant already in file with 0 for new sample 6 -",outline3) #
C1=current_sample.readline() #
if not C1 : # no more lines in new sample
current_sample_not_finished=False #
# all done - now sort the file
#
# COMMAND="grep \"CHROM\" "+full_file2+" > "+sort_file1 # save the header line
# subprocess.call(COMMAND, shell=True) #
#
# COMMAND="sort -V -k1,5 "+full_file2+" > "+sort_file1 #
# COMMAND="sort -k1,2V -k3,5 "+full_file2+" > "+sort_file1 #
COMMAND="sort -k1,1V -k2,2g -k4,5 "+full_file2+" > "+sort_file1 #
subprocess.call(COMMAND, shell=True) #
#
copyfile(sort_file1,full_file2) # copy back
#
##################################################################