forked from Kennedy-Lab-UW/Duplex-Sequencing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CountMuts.py
executable file
·271 lines (229 loc) · 13.7 KB
/
CountMuts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#!/usr/bin/env python
"""
Count Muts Unique
by Brendan Kohrn and Mike Schmitt
Version 1.41
August 20, 2014
Modified from count-muts.py and count-muts-unique.py
Edited by Brendan Kohrn to fix a problem with 0 depth where no 0 depth should be and to allow n-length indels, and to merge the two conditions into one program.
This script pulls out the mutation frequencies from a pileup file given as stdin, or can take an imput file using the -i option, and writes to stdout, or can take an output file name.
Sites with less than MINDEPTH, or clonalities outside of the range MIN_CLONALITY-MAX_CLONALITY, are excluded from analysis.
If -u is specified, this program counts each mutation exactly once (i.e. clonal expansions are counted as a single mutation)
Usage:
cat seq.pileup | CountMuts.py [-h] [-d MINDEPTH] [-C MAX_CLONALITY] [-c MIN_CLONALITY] [-n N_CUTOFF] [-s START] [-e END] [-u] > outfile.countmuts
optional arguments:
-h, --help show this help message and exit
-d MINDEPTH, --depth MINDEPTH
Minimum depth for counting mutations at a site
(default = 20)
-C MAX_CLONALITY, --max_clonality MAX_CLONALITY
Cutoff of mutant reads for scoring a clonal mutation
(default = 0.3)
-c MIN_CLONALITY, --min_clonality MIN_CLONALITY
Cutoff of mutant reads for scoring a clonal mutation
(default = 0)
-n N_CUTOFF, --n_cutoff N_CUTOFF
Maximum fraction of N's allowed to score a position
(default = 0.05)
-s START, --start START
Position at which to start scoring for mutations
(default = 0)
-e END, --end END Position at which to stop scoring for mutations. If
set to 0, no position filtering will be performed
(default = 0)
-u, --unique run countMutsUnique instead of countMuts
"""
from __future__ import print_function
from argparse import ArgumentParser
import sys
import re
from math import sqrt
def Wilson(positive, total) :
if total == 0:
return 0
freq = float(positive)/float(total)
z = 1.96 #1.96 = 95%
phat = float(positive) / total
positiveCI = (phat + z*z/(2*total) + z * sqrt((phat*(1-phat)+z*z/(4*total))/total))/(1+z*z/total)
negativeCI = (phat + z*z/(2*total) - z * sqrt((phat*(1-phat)+z*z/(4*total))/total))/(1+z*z/total)
return (phat, positiveCI , negativeCI )
def CountMutations(o, f, fOut):
depths = []
Aseq = 0
AtoT = 0
AtoC = 0
AtoG = 0
Tseq = 0
TtoA = 0
TtoC = 0
TtoG = 0
Cseq = 0
CtoA = 0
CtoT = 0
CtoG = 0
Gseq = 0
GtoA = 0
GtoT = 0
GtoC = 0
ins = {0:0}
dels = {0:0}
#mpFile = open("testMPfile.mutpos", "w") #ADDED
#mpFirst = True #ADDED
for line in f:
linebins = line.split()
#convert sequence information to uppercase
linebins[4] = linebins[4].replace('t','T')
linebins[4] = linebins[4].replace('c','C')
linebins[4] = linebins[4].replace('g','G')
linebins[4] = linebins[4].replace('a','A')
linebins[4] = linebins[4].replace('n','N')
#count depth
depth = int(linebins[3]) - linebins[4].count('N')
#count and remove insertions
newIns = map(int, re.findall(r'\+\d+', linebins[4]))
if o.unique:
newIns = list(set(newIns))
for length in newIns:
if length not in ins:
ins[length] = 1
else:
ins[length] += 1
rmStr = r'\+' + str(length) + "."*length
linebins[4] = re.sub(rmStr, '', linebins[4])
#count and remove deletions
newDels = map(str, re.findall(r'-\d+', linebins[4]))
if o.unique:
newDels = list(set(newDels))
for length in newDels:
length = int(length[1:])
if length not in dels:
dels[length] = 1
else:
dels[length] += 1
rmStr = r'-' + str(length) + "."*length
linebins[4] = re.sub(rmStr, '', linebins[4])
#skip sites that fall outside of specified start/end ranges, that have insufficient depth or that have clonal mutations or excess frequency of N's:
if o.end !=0 and int(linebins[1]) < o.start:
pass
elif o.end !=0 and int(linebins[1]) > o.end:
pass
elif (float(linebins[4].count('N'))/(float(depth) + float(linebins[4].count('N')))) > o.n_cutoff:
pass
elif depth < o.mindepth:
pass
elif (float(max(linebins[4].count('T'),linebins[4].count('C'),linebins[4].count('G'),linebins[4].count('A'), (max(newIns.count(n) for n in list(set(newIns))) if newIns != [] else 0), (max(newDels.count(m) for m in list(set(newDels))) if newDels != [] else 0))) / float(depth)) > o.max_clonality:
pass
elif (float(max(linebins[4].count('T'),linebins[4].count('C'),linebins[4].count('G'),linebins[4].count('A'), (max(newIns.count(n) for n in list(set(newIns))) if newIns != [] else 0), (max(newDels.count(m) for m in list(set(newDels))) if newDels != [] else 0))) / float(depth)) < o.min_clonality:
pass
else:
#remove N entries
#linebins[4] = linebins[4].replace('N','')
#remove start line and end line markers
linebins[4] = re.sub('\$','',linebins[4])
linebins[4] = re.sub('\^.','',linebins[4])
#count point mutations
if linebins[2] == 'A':
Aseq += depth
if linebins[4].count('T') > 0: AtoT += (1 if o.unique else linebins[4].count('T'))
if linebins[4].count('C') > 0: AtoC += (1 if o.unique else linebins[4].count('C'))
if linebins[4].count('G') > 0: AtoG += (1 if o.unique else linebins[4].count('G'))
elif linebins[2] == 'T':
Tseq += depth
if linebins[4].count('A') > 0: TtoA += (1 if o.unique else linebins[4].count('A'))
if linebins[4].count('C') > 0: TtoC += (1 if o.unique else linebins[4].count('C'))
if linebins[4].count('G') > 0: TtoG += (1 if o.unique else linebins[4].count('G'))
elif linebins[2] == 'C':
Cseq += depth
if linebins[4].count('A') > 0: CtoA += (1 if o.unique else linebins[4].count('A'))
if linebins[4].count('T') > 0: CtoT += (1 if o.unique else linebins[4].count('T'))
if linebins[4].count('G') > 0: CtoG += (1 if o.unique else linebins[4].count('G'))
elif linebins[2] == 'G':
Gseq += depth
if linebins[4].count('A') > 0: GtoA += (1 if o.unique else linebins[4].count('A'))
if linebins[4].count('T') > 0: GtoT += (1 if o.unique else linebins[4].count('T'))
if linebins[4].count('C') > 0: GtoC += (1 if o.unique else linebins[4].count('C'))
#if mpFirst: #ADDED
#mpFirst = False #ADDED
#else: #ADDED
#mpFile.write("\n") #ADDED
#mpFile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (linebins[0],linebins[2], linebins[1], depth, linebins[4].count('T') + linebins[4].count('C') + linebins[4].count('G') + linebins[4].count('A'), linebins[4].count('T'), linebins[4].count('C'), linebins[4].count('G'), linebins[4].count('A'), len(newIns), len(newDels), linebins[4].count('N'))) #ADDED
totalseq = Aseq + Tseq + Cseq + Gseq
totalptmut = AtoT + AtoC + AtoG + TtoA + TtoC + TtoG + CtoA + CtoT + CtoG + GtoA + GtoT + GtoC
totalindel = sum(ins) + sum(dels)
totalins = sum(ins[n] for n in ins.keys())
totaldels = sum(dels[n] for n in dels.keys())
#mpFile.close() #ADDED
print("\nMinimum depth: %s" % o.mindepth, file = fOut)
print("Clonality: %s - %s" % (o.min_clonality, o.max_clonality), file = fOut)
if o.end != 0:
print('Position: %s - %s' % (o.start, o.end), file = fOut)
if o.unique:
print('Unique Counts', file = fOut)
print("\nA's sequenced: %s" % Aseq, file = fOut)
print("Mutation type\t#\tFrequency\t95% positive CI\t95% negative CI", file = fOut)
print(("A to T:\t%s" % AtoT) + ('\t%.2e\t%.2e\t%.2e' % Wilson(AtoT, max(Aseq, 1))), file = fOut) #Output is in the form: Mutation type, number of times mutation is oberseved, frequency, 95% positive CI, 95% negative CI (Confidence Intervals are based on the Wilson Confidence Interval)
print(("A to C:\t%s" % AtoC) + ('\t%.2e\t%.2e\t%.2e' % Wilson(AtoC, max(Aseq, 1))), file = fOut)
print(("A to G:\t%s" % AtoG) + ('\t%.2e\t%.2e\t%.2e' % Wilson(AtoG, max(Aseq, 1))), file = fOut)
print("\nT's sequenced: %s" % Tseq, file = fOut)
print("Mutation type\t#\tFrequency\t95% positive CI\t95% negative CI", file = fOut)
print(("T to A:\t%s" % TtoA) + ('\t%.2e\t%.2e\t%.2e' % Wilson(TtoA, max(Tseq, 1))), file = fOut)
print(("T to C:\t%s" % TtoC) + ('\t%.2e\t%.2e\t%.2e' % Wilson(TtoC, max(Tseq, 1))), file = fOut)
print(("T to G:\t%s" % TtoG) + ('\t%.2e\t%.2e\t%.2e' % Wilson(TtoG, max(Tseq, 1))), file = fOut)
print("\nC's sequenced: %s" % Cseq, file = fOut)
print("Mutation type\t#\tFrequency\t95% positive CI\t95% negative CI", file = fOut)
print(("C to A:\t%s" % CtoA) + ('\t%.2e\t%.2e\t%.2e' % Wilson(CtoA, max(Cseq, 1))), file = fOut)
print(("C to T:\t%s" % CtoT) + ('\t%.2e\t%.2e\t%.2e' % Wilson(CtoT, max(Cseq, 1))), file = fOut)
print(("C to G:\t%s" % CtoG) + ('\t%.2e\t%.2e\t%.2e' % Wilson(CtoG, max(Cseq, 1))), file = fOut)
print("\nG's sequenced: %s" % Gseq, file = fOut)
print("Mutation type\t#\tFrequency\t95% positive CI\t95% negative CI", file = fOut)
print(("G to A:\t%s" % GtoA) + ('\t%.2e\t%.2e\t%.2e' % Wilson(GtoA, max(Gseq, 1))), file = fOut)
print(("G to T:\t%s" % GtoT) + ('\t%.2e\t%.2e\t%.2e' % Wilson(GtoT, max(Gseq, 1))), file = fOut)
print(("G to C:\t%s" % GtoC) + ('\t%.2e\t%.2e\t%.2e' % Wilson(GtoC, max(Gseq, 1))), file = fOut)
print("\nTotal nucleotides sequenced: %s" % totalseq, file = fOut)
print("Total point mutations: %s" % totalptmut, file = fOut)
print("\tFrequency\t95% positive CI\t95% negative CI", file = fOut)
print('Overall point mutation frequency:\t%.2e\t%.2e\t%.2e\n' % Wilson(totalptmut, max(totalseq, 1)), file = fOut)
insKeys = sorted(ins.items(), key=lambda x: x[0])
print("Mutation type\t#\tFrequency\t95% positive CI\t95% negative CI", file = fOut)
for n in insKeys:
print(('+%s insertions: %s' % (n[0], n[1])) + ('\t%.2e\t%.2e\t%.2e' % Wilson(n[1], max(totalseq,1))), file = fOut)
if dels != {}:
print('', file = fOut)
delsKeys = sorted(dels.items(), key=lambda x: x[0])
for n in delsKeys:
print(('-%s deletions: %s' % (n[0], n[1])) + ('\t%.2e\t%.2e\t%.2e' % Wilson(n[1], max(totalseq,1))), file = fOut)
print("\nTotal insertion events: %s" % totalins, file = fOut)
print("\tFrequency\t95% positive CI\t95% negative CI", file = fOut)
print("Overall insert frequency:\t%.2e\t%.2e\t%.2e" % Wilson(totalins, max(totalseq, 1)), file = fOut)
print("\nTotal deletion events: %s" % totaldels, file = fOut)
print("\tFrequency\t95% positive CI\t95% negative CI", file = fOut)
print("Overall deletion frequency:\t%.2e\t%.2e\t%.2e" % Wilson(totaldels, max(totalseq, 1)), file = fOut)
def main():
parser = ArgumentParser()
parser.add_argument('-i', '--infile', action ='store', dest = 'inFile', help = 'An imput file. If None, defaults to stdin. [None]', default = None)
parser.add_argument('-o', '--outfile', action = 'store', dest = 'outFile', help = 'A filename for the output file. If None, outputs to stdout. [None]', default = None)
parser.add_argument("-d", "--depth", action="store", type=int, dest="mindepth",
help="Minimum depth for counting mutations at a site [%(default)s]", default=20)
parser.add_argument("-c", "--min_clonality", action="store", type=float, dest="min_clonality",
help="Cutoff of mutant reads for scoring a clonal mutation [%(default)s]", default=0)
parser.add_argument("-C", "--max_clonality", action="store", type=float, dest="max_clonality",
help="Cutoff of mutant reads for scoring a clonal mutation [%(default)s]", default=0.3)
parser.add_argument("-n", "--n_cutoff", action="store", type=float, dest="n_cutoff",
help="Maximum fraction of N's allowed to score a position [%(default)s]", default=0.05)
parser.add_argument("-s", "--start", action="store", type=int, dest="start",
help="Position at which to start scoring for mutations [%(default)s]", default=0)
parser.add_argument("-e", "--end", action="store", type=int, dest="end",
help="Position at which to stop scoring for mutations. If set to 0, no position filtering will be performed [%(default)s]", default=0)
parser.add_argument('-u', '--unique', action='store_true', dest='unique', help='Run countMutsUnique instead of countMuts')
o = parser.parse_args()
if o.inFile != None:
f = open(o.inFile, 'r')
else:
f = sys.stdin
if o.outFile != None:
fOut = open(o.outFile, 'w')
else:
fOut = sys.stdout
CountMutations(o, f, fOut)
if __name__ == "__main__":
main()