-
Notifications
You must be signed in to change notification settings - Fork 0
/
indiv_phylip_to_pop_phylip.v2.py
120 lines (91 loc) · 3.29 KB
/
indiv_phylip_to_pop_phylip.v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/local/bin/python
# Python 2.7.6
#
# 2 Mar 2014
# Mathias Scharmann
import sys
revcombs = {'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'K': 'GT', 'M': 'AC', 'N': 'ACGT', 'S': 'CG', 'R': 'AG', 'W': 'AT', 'V': 'ACG', 'Y': 'CT', 'A' : 'A', 'C':'C', 'T':'T', 'G':'G', 'n':'-', '-':'-'}
def get_IUPAC_amb_code (nucs):
# nucs must be a "".joined string of the sorted list of nucleotides
# the IUPAC ambiguity codes:
# the keys in this IUPAC ambiguity code dictionary are produced by:
# "".join(sorted(list("NUCLEOTIDES"))) -> consistent lookup possible without additional loops!
combs = {'AC':'M', 'GT':'K', 'CG':'S', 'AT':'W', 'AG':'R', 'CT':'Y',
'ACG':'V', 'CGT':'B', 'AGT':'D', 'ACT':'H', 'ACGT':'N',
'A' : 'A', 'C':'C', 'T':'T', 'G':'G', 'n':'-', '-':'-', 'N':'N'}
out_code = combs[nucs]
return out_code
def read_phylip(INFILE, samples):
indict = {}
with open(INFILE, "r") as infile:
infile.readline() # remove header
for line in infile:
fields = line.strip("\n").split(" ")
if fields[0] in samples: # discard unwanted samples straight away
indict[fields[0]] = fields[1]
return indict
def read_pop_info (INFILE):
popdict = {}
with open(INFILE, "r") as infile:
for line in infile:
fields = line.strip("\n").split("\t")
if fields[1] in popdict.keys():
popdict[fields[1]].append(fields[0])
else:
popdict[fields[1]] = [fields[0]]
print popdict
return popdict
def make_pop_level_seqs (indict, popdict):
# sort samples in indict into their populations:
binned_dict = {}
for sample, seq in indict.items():
for pop, samples in popdict.items():
if sample in samples:
try:
binned_dict[pop].append(seq)
except KeyError:
binned_dict[pop] = [seq]
# make consensus sequence out of individual sequences in the binned_dict:
consensus_dict = {}
for pop, seqs in binned_dict.items():
outseq = []
print "working on pop {0}".format(pop)
for column_index in range(0, len(seqs[0])): # to loop over all columns
aligned_column = [ seq[column_index] for seq in seqs ]
resolved = [ revcombs[x] for x in aligned_column ]
aligned_column = list("".join(resolved))
nucs = [x for x in aligned_column if x != '-'] # remove all the n -
if len(nucs) == 0:
outnuc = "-"
else:
outnuc = get_IUPAC_amb_code( "".join(sorted(list(set(nucs)))) )
print column_index, "\r",
outseq.append(outnuc)
outseqstring = "".join(outseq)
consensus_dict[pop] = outseqstring
return consensus_dict
def write_phylip (concat_dict, outfile):
with open(outfile, "w") as OUTFILE:
ntaxa = len(concat_dict.keys())
len_align = len(concat_dict[concat_dict.keys()[0]]) # gets value of first element in dictionary -> this OK since all seqs have same length
header = str(ntaxa) + " " + str(len_align) + "\n"
OUTFILE.write(header)
for sample, seq in concat_dict.items():
out_line = sample + " " + seq + "\n"
OUTFILE.write(out_line)
OUTFILE.close()
################################## MAIN
def main(argv=None):
if argv is None:
argv = sys.argv[1:]
popdict = read_pop_info(argv[1])
samples = []
for x in popdict.values():
samples += x
print samples
indict = read_phylip(argv[0], samples)
consensus_dict = make_pop_level_seqs (indict, popdict)
write_phylip (consensus_dict, argv[2])
print "Done!"
if __name__ == "__main__":
main(sys.argv[1:])