-
Notifications
You must be signed in to change notification settings - Fork 0
/
vcf_to_arbitrary_allele_freq.py
223 lines (153 loc) · 5.91 KB
/
vcf_to_arbitrary_allele_freq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/python
# python 2.7
# 21 Dec 2016
# Mathias Scharmann
# Command line outline
# usage example
# python
# Inputs
# Outputs
#module load python/2.7
import os
import sys
#######
# checks if file exists and breaks script if not
def extant_file(x):
"""
'Type' for argparse - checks that file exists but does not open.
"""
if not os.path.exists(x):
print "Error: {0} does not exist".format(x)
exit()
x = str(x)
return x
# checks for non-UNIX linebreaks
def linebreak_check(x):
if "\r" in open(x, "rb").readline():
print "Error: classic mac (CR) or DOS (CRLF) linebreaks in {0}".format(x)
exit()
# parses command line arguments
def get_commandline_arguments ():
import os
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--popmap", required=True, dest="popmapfile", type=extant_file, help="name and path of the popmap file: 1st column barcode/sample name separated by tab from second column indicating the population", metavar="FILE")
parser.add_argument("--vcf", required=True, dest="vcf", type=extant_file, help="vcf genotypes", metavar="FILE")
parser.add_argument("--max_missing_ingr", required=True, help="site-filter: maximum proportion of individuals with missing genotype per population to include a site, float value")
parser.add_argument("--out", required=True, help="name and path of output file", metavar="FILENAME")
args = parser.parse_args()
print args
linebreak_check(args.popmapfile)
return args
########
def check_congruence (popmapfile, vcf):
popmapsamples = []
with open(popmapfile, "r") as INFILE:
for line in INFILE:
fields = line.strip("\n").split("\t")
popmapsamples.append(fields[0])
popmapsamples = set(popmapsamples)
with open(vcf, "r") as INFILE:
for line in INFILE:
if line.startswith("##"):
continue
if line.startswith("#"):
header_line = line.lstrip("#").strip("\n").split("\t")
break
vcf_samples = header_line[9:]
for samp in popmapsamples:
if samp not in vcf_samples:
print "vcf does not contain all popmap samples"
exit()
print "all samples in popmap are in .vcf, good to go!"
#######
def read_popmap (popmapfile):
popdict = {}
with open(popmapfile,"r") as INFILE:
for line in INFILE:
fields = line.strip("\n").split("\t")
try:
popdict[fields[1]].append( fields[0] )
except KeyError:
popdict[fields[1]] = [ fields[0] ]
return popdict
def vcf_to_arbitrary_allele_freqs (vcf, popdict, max_missing_ingr, output_file):
presence_treshold_ingr = 1.0 - max_missing_ingr
with open(vcf, "r") as INFILE:
for line in INFILE:
if line.startswith("##"):
continue
if line.startswith("#"):
header_line = line.lstrip("#").strip("\n").split("\t")
break
samples = header_line[9:]
# E3379_L96 43 . ATCG ATTG,ATCA,GTCA,GTCG 4179.3 . AB=BLABLA 0/0:1:
popdict_vcf_idxes = {}
for k, v in popdict.items():
popdict_vcf_idxes[ k ] = [ header_line.index(x) for x in v ]
ingroup_pops = [ x for x in popdict.keys() ]
pop_order = popdict.keys()
output_lines = [pop_order]
with open(output_file, "w") as OUTF:
OUTF.write( "\n".join(["\t".join(x) for x in output_lines ]) + "\n" )
# now start the main business of walking through the vcf:
print "counting allele frequencies, polarisation is arbitrary (NOT necessarily minor or major.)"
allele_freq_dict = {k : [] for k in pop_order }
with open(vcf, "r") as INFILE:
cnt = 0
for line in INFILE:
if line.startswith("#"):
continue
cnt += 1
c = cnt / 100000.0
if (c).is_integer():
print str(cnt)
write_ouput_chunk ( allele_freq_dict, pop_order, output_file )
# and clean memory by replacing the dict:
allele_freq_dict = {k : [] for k in pop_order }
fields = line.strip("\n").split("\t")
## first check that site occurs in the outgroup and fixed in the outgroup:
# a = [ fields[x].split(":")[0].split("/") for x in popdict_vcf_idxes[outgr] ]
# b = [x for genotype in a for x in genotype if x != "."]
# pop_n = float( len( popdict_vcf_idxes[outgr] ) )*2
# if len(b) >= presence_treshold_outgr * pop_n:
# if len(set(b)) == 1:
# we have a site that survives max_missing AND is fixed in the outgroup
# now check the other populations:
# print fields
for pop in ingroup_pops:
a = [ fields[x].split(":")[0].split("/") for x in popdict_vcf_idxes[pop] ]
b = [x for genotype in a for x in genotype if x != "."]
pop_n = float( len( popdict_vcf_idxes[pop] ) )*2
# print pop_n
if len(b) >= presence_treshold_ingr * pop_n:
p = str( b.count( "1" )/float(len(b)) ) # OK we just count the allele "1", it does not matter
else:
p = "-"
allele_freq_dict[pop].append( p )
# final write:
write_ouput_chunk ( allele_freq_dict, pop_order, output_file )
return cnt
def write_ouput_chunk ( allele_freq_dict, pop_order, output_file ):
"""
better to write the output from time to time rather than to clog up the memory with GB!
"""
# clean allele_freq_dict from sites that are not present in at least four populations:
print "filtering sites in allele frequency table for presence in >= 4 populations"
output_lines = []
for i in range( len( allele_freq_dict[allele_freq_dict.keys()[0]] ) ):
column = [ allele_freq_dict[pop][i] for pop in pop_order ]
if len( [ x for x in column if x != "-" ] ) >= 4:
output_lines.append( column ) # now its a row!
with open(output_file, "a") as OUTF:
OUTF.write( "\n".join(["\t".join(x) for x in output_lines ]) + "\n" )
######################## MAIN
args = get_commandline_arguments ()
check_congruence (args.popmapfile, args.vcf)
popdict = read_popmap (args.popmapfile)
max_missing_ingr = float( args.max_missing_ingr )
input_sites_cnt = vcf_to_arbitrary_allele_freqs (args.vcf, popdict, max_missing_ingr, args.out)
## check length of output_file:
output_lines = sum(1 for line in open( args.out )) -1
print "retained sites: {0} out of {1}".format( output_lines, input_sites_cnt )
print "Done!"