-
Notifications
You must be signed in to change notification settings - Fork 0
/
QuickVariants_Indelfilter.py
121 lines (118 loc) · 6.4 KB
/
QuickVariants_Indelfilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import glob
import os
import statistics
from datetime import datetime
import argparse
############################################ Arguments and declarations ##############################################
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
required = parser.add_argument_group('required arguments')
optional = parser.add_argument_group('optional arguments')
required.add_argument("-i",
help="path to vcf files generated by QuickVariants",
type=str, default='test_vcf/',
metavar='a vcf_folder containing *.vcf files')
# optional
optional.add_argument("-af","-allele-frequency",
help="minimum allele frequency to be considered a point mutation (default = 0.8)",
type=float, default=.9,
metavar='minimum allele frequency')
optional.add_argument("-rd","-read-depth",
help="minimum read depth to be considered a point mutation (default = 6)",
type=float, default=3,
metavar='minimum read depth')
################################################## Definition ########################################################
args = parser.parse_args()
################################################### Set up ########################################################
min_maf_for_call = args.af
min_cov = args.rd
extension_mafcutoff = min_maf_for_call
extension_mafcutoff -= 0.1
extension_mafcutoff = max(extension_mafcutoff,0.01)
################################################### Function ########################################################
def filter_indel(vcf):
Qualified_indel = []
insertion = ['', 0,'','',[],[]]#CHR POS REF ALT SUBdepth Totaldepth
deletion = ['', [0],'','',[],[]]#CHR POS REF ALT SUBdepth Totaldepth
try:
f1 = open(vcf + '.snp', 'r')
except IOError:
os.system('grep \';\' %s > %s.snp' % (vcf, vcf))
for lines in open(vcf + '.snp', 'r'):
lines_set = lines.split('\n')[0].split('\t')
CHR, POS = lines_set[:2]
if '-' in lines:
POS = int(POS)
INDELset = lines_set[3].split(',')
# using only middle depth
Depthset = [i for i in lines_set[5].split(';')]
totaldepth = sum([float(j.split(',')[0]) + float(j.split(',')[1]) for j in Depthset])
REF = lines_set[2]
if totaldepth >= min_cov:
for i in range(1, len(Depthset)):
subdepth = sum([float(j) for j in Depthset[i].split(',')])
INDEL = INDELset[i-1]
if subdepth >= min_cov and subdepth / totaldepth >= extension_mafcutoff:
# qualified indel
if POS < 0:
POS = -POS
# insertion
if POS != insertion[1]:
# diff insertion
# sum the previous insertion
if len(insertion[3]) > 1: # insertion >= 2bp
insertion[4] = statistics.mean(insertion[4]) # mean of subdepth
insertion[5] = statistics.mean(insertion[5]) # mean of depth
Qualified_indel.append(
'\t'.join([str(x) for x in insertion]) + '\n')
# start of a new insertion, stringent cutoff
if subdepth / totaldepth >= min_maf_for_call:
insertion = [CHR, POS, '', INDEL, [subdepth], [totaldepth]]
else:
insertion = ['', 0, '', '', [], []]
else:
insertion[3] += INDEL
insertion[4].append(subdepth)
insertion[5].append(totaldepth)
elif INDEL == '-':
# deletion
if CHR != deletion[0] or abs(int(POS) - int(deletion[1][-1])) > 10:
# diff deletion
# sum the previous deletion
if len(deletion[2]) > 1: # deletion >= 2bp
deletion[4] = statistics.mean(deletion[4]) # mean of subdepth
deletion[5] = statistics.mean(deletion[5]) # mean of depth
deletion[1] = deletion[1][0] # use the first POS
Qualified_indel.append(
'\t'.join([str(x) for x in deletion]) + '\n')
# start of a new deletion, stringent cutoff
if subdepth / totaldepth >= min_maf_for_call:
deletion = [CHR,[POS], REF, '', [subdepth], [totaldepth]]
else:
deletion = ['', [0], '', '', [], []]
else:
# same deletion
deletion[1].append(POS)
deletion[2] += REF
deletion[4].append(subdepth)
deletion[5].append(totaldepth)
# sum the last deletion
if len(deletion[2]) > 1: # deletion >= 2bp
deletion[4] = statistics.mean(deletion[4]) # mean of subdepth
deletion[5] = statistics.mean(deletion[5]) # mean of depth
deletion[1] = deletion[1][0] # use the first POS
Qualified_indel.append(
'\t'.join([str(x) for x in deletion]) + '\n')
# sum the last insertion
if len(insertion[3]) > 1: # insertion >= 2bp
insertion[4] = statistics.mean(insertion[4]) # mean of subdepth
insertion[5] = statistics.mean(insertion[5]) # mean of depth
Qualified_indel.append(
'\t'.join([str(x) for x in insertion]) + '\n')
f1 = open(vcf + '.indel.txt', 'w')
f1.write('CHR\tPOS\tREF\tINDEL\tDepth\tTotal_depth\n')
f1.write(''.join(Qualified_indel))
f1.close()
################################################### Main ########################################################
allvcf = glob.glob('%s/*.vcf'%(args.i))
for vcf in allvcf:
filter_indel(vcf)