-
Notifications
You must be signed in to change notification settings - Fork 26
/
run_seqtools.py
executable file
·85 lines (72 loc) · 4.34 KB
/
run_seqtools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#! /usr/bin/env python
'''
Created on Aug 8, 2013
@author: smirarab
'''
import sys
from pasta.alignment import CompactAlignment
import argparse
from copy import copy
from math import ceil
import statistics
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Manipulate Alignments')
parser.add_argument('-infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
help='name of the input file (default: standard input)')
parser.add_argument('-outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
help='name of the output file (default: standard output)')
parser.add_argument('-informat', nargs='?', default="FASTA", choices = ["COMPACT3", "FASTA"],
help='format of the input file (default: FASTA)')
parser.add_argument('-outformat', nargs='?', default="FASTA", choices = ["COMPACT3", "FASTA", "PHYLIP"],
help='format of the output file (default: FASTA)')
parser.add_argument('-masksites', metavar='N', type=int,
help='sites with less than N non-gap characters will be masked out')
parser.add_argument('-filterfragments', metavar='N', type=int,
help='sequences with less than N non-gap sequences will be removed')
parser.add_argument('-masksitesp', metavar='r', type=float,
help='sites with less than r portion non-gap characters will be masked out')
parser.add_argument('-filterfragmentsp', metavar='r', type=float,
help='sequences with less than r portion non-gap sequences will be removed')
parser.add_argument('-filterfragmentsrelp', metavar='r', type=float,
help='sequences with non-gap sequence length less than r * median length will be removed')
parser.add_argument('-rename', metavar='MappingFile', type=argparse.FileType('r'),
help='Rename sequences, according to the mapping file generated by PASTA')
args = parser.parse_args()
alg = CompactAlignment()
alg.read_file_object(args.infile,args.informat)
if args.masksites or args.masksitesp:
if args.masksitesp and ( args.masksitesp > 1 or args.masksitesp < 0 ):
raise ValueError("-masksitesp %f is not between 0 and 1" %args.masksitesp)
r = args.masksites if args.masksites else ceil(args.masksitesp * len(alg))
sys.stderr.write("will remove sites with less than %d non-gaps among %d \n" %(r,len(alg)))
sys.stderr.write("Length went from %d " %(alg.sequence_length()))
alg.mask_gapy_sites(r)
sys.stderr.write("to %d \n" %(alg.sequence_length()))
if args.filterfragments or args.filterfragmentsp or args.filterfragmentsrelp:
if args.filterfragmentsp and ( args.filterfragmentsp > 1 or args.filterfragmentsp < 0 ):
raise ValueError("-filterfragmentsp %f is not between 0 and 1" %args.filterfragmentsp)
if args.filterfragmentsrelp and ( args.filterfragmentsrelp > 1 or args.filterfragmentsrelp < 0 ):
raise ValueError("-filterfragmentsrelp %f is not between 0 and 1" %args.filterfragmentsrelp)
if args.filterfragmentsrelp:
r = statistics.median((len(v.seq) for v in alg.values()))
sys.stderr.write("Median seq length is %d\n" %r)
r *= args.filterfragmentsrelp
else:
r = args.filterfragments if args.filterfragments else ceil(args.filterfragmentsp * alg.sequence_length())
sys.stderr.write("will remove sequences with less than %d non-gaps among %d \n" %(r,alg.sequence_length()))
sys.stderr.write("Number of sequences went from %d " %(len(alg)))
rem = []
for k, v in alg.items():
if len(v.seq) < r:
rem.append(k)
for k in rem:
alg.pop(k, None)
sys.stderr.write("to %d \n" %(len(alg)))
if args.rename:
lines=[x for x in args.rename.readlines() if x!="\n"]
namemap=dict(list(zip((x.strip() for i,x in enumerate(lines) if i%2 == 0),
(x.strip() for i,x in enumerate(lines) if i%2 == 1))))
names = copy(list(alg.keys()))
for k in names:
alg[namemap[k]] = alg.pop(k)
alg.write(args.outfile, args.outformat)