-
Notifications
You must be signed in to change notification settings - Fork 0
/
QuickVariants_Pointmutationfilter.py
86 lines (84 loc) · 4.14 KB
/
QuickVariants_Pointmutationfilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import glob
import os
from datetime import datetime
import argparse
import pandas as pd
############################################ Arguments and declarations ##############################################
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
required = parser.add_argument_group('required arguments')
optional = parser.add_argument_group('optional arguments')
required.add_argument("-i",
help="path to vcf files generated by QuickVariants",
type=str, default='test_vcf/',
metavar='a vcf_folder containing *.vcf files')
# optional
optional.add_argument("-af","-allele-frequency",
help="minimum allele frequency to be considered a point mutation (default = 0.9)",
type=float, default=.9,
metavar='minimum allele frequency')
optional.add_argument("-rd","-read-depth",
help="minimum read depth to be considered a point mutation (default = 3)",
type=float, default=3,
metavar='minimum read depth')
################################################## Definition ########################################################
args = parser.parse_args()
################################################### Set up ########################################################
min_maf_for_call = args.af
min_cov = args.rd
################################################### Function ########################################################
def filter_snp(depthall, depthsnp):
MAF = depthsnp / depthall
if MAF >= min_maf_for_call and depthsnp >= min_cov:
return True
return False
def filter_point_mutation(vcf):
try:
f1 = open(vcf + '.point_mutation.txt', 'r')
except IOError:
# grep all snps
try:
f1 = open(vcf + '.snp', 'r')
except IOError:
os.system('grep \';\' %s > %s.snp' % (vcf, vcf))
snpoutput = ['CHR\tPOS\tREF\tALT\tDPall\tDPsnp\n']
# load each line of snps
snpline = 0
SNP = pd.read_csv(vcf + '.snp',sep='\t',header=None)
SNP.columns = ['CHR','POS','REF','ALT','DP','DETAILS-MIDDLE','DETAILS-ENDS','SUPPORT']
SNP = SNP[~SNP['REF'].isin(['N'])]
SNPtrue = []
totallines = SNP.shape[0]
for indexline in range(0,totallines):
snpline += 1
CHR,POS,REF,ALT,DP,middleDP,endDP = list(SNP.iloc[indexline,:7])
if POS > 0:
# not an indel, next POS not a start of an indel
ALT = ALT.split(',')
withsnp = False
DP = float(DP)
middleDP = [x.split(',') for x in middleDP.split(';')]
endDP = [x.split(',') for x in endDP.split(';')]
if DP > 0:
for i in range(0,len(ALT)):
# each potential ALT
if ALT[i] != '-':
# not indel
# using both middle and end depth
depthsnp = sum([float(x) for x in middleDP[i + 1]]) + sum([float(x) for x in endDP[i + 1]]) # skip REF
if filter_snp(DP, depthsnp):
# a qualified snp
withsnp = True
snpoutput.append('%s\t%s\t%s\t%s\t%s\t%s\n'%(CHR,POS,REF,ALT[i],DP,depthsnp))
if withsnp:
SNPtrue.append(indexline)
if snpline % 1000000 == 0:
print(datetime.now(), 'processed %s lines' % (snpline))
# # output qualified point mutations
# SNP.iloc[SNPtrue,:].to_csv(vcf + '.point_mutation.vcf',sep='\t',index = None)
f1 = open(vcf + '.point_mutation.txt', 'w')
f1.write(''.join(snpoutput))
f1.close()
################################################### Main ########################################################
allvcf = glob.glob('%s/*.vcf'%(args.i))
for vcf in allvcf:
filter_point_mutation(vcf)