-
Notifications
You must be signed in to change notification settings - Fork 1
/
samplePairs.py
82 lines (69 loc) · 3.45 KB
/
samplePairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
'''
This script samples the first base of every non-overlapping genomic window defined across consecutive human genomic
bases that align to mouse, converting aligning pairs of human and mouse bases into aligning pairs of human and mouse
regions.
The main output file (ending with .gz) has 18 columns:
- Columns 1-4 (chrom, start, end, nucleotide) correspond to the first human base of the 50-bp window
- Columns 5-8 correspond to the aligning mouse base of the start of the 50-bp block
- Column 9 is the pair index of the start of the 50-bp block
- Columns 10-13 correspond to the human of the end of the 50-bp block
- Columns 14-17 correspond to the aligning mouse base of the end of the 50-bp block
- Coummn 18 is the pair index of the end of the 50-bp block
We also generate separate output files for human regions (file ending with .h.gz) and mouse regions (file ending with
.m.gz)
'''
import gzip,argparse,os
def main():
description = 'Sample the first base of every non-overlapping genomic window defined across consecutive human \
genomic bases that align to mouse'
epilog = '# Example: python source/samplePairs.py -i position/hg19.mm10.basepair.gz -o position/hg19.mm10.50bp'
parser = argparse.ArgumentParser(prog='python source/samplePairs.py',
description=description,
epilog=epilog)
g1 = parser.add_argument_group('required arguments')
g1.add_argument('-i','--input-filename',
help='path to output filename from aggregateAligningBases containing aligning pairs of human and mouse bases',
required=True,type=str)
parser.add_argument('-b','--bin-size',
help='size (bp) of the non-overlapping genomic window (default: 50)',
type=int,default=50)
g1.add_argument('-o','--output-prefix',
help='prefix for output files',
required=True,type=str)
args = parser.parse_args()
with gzip.open(args.input_filename) as fin, gzip.open(args.output_prefix+'.gz','wb') as fout:
# Write the very first line
line = fin.readline().strip()
fout.write(line+b'\t')
l = line.split()
hg_chrom = l[0].decode('utf-8')
hg_pos = int(l[1])
i = 1
prev_chrom = hg_chrom
prev_pos = hg_pos
prev_line = line
for line in fin: # Iterate
line = line.strip()
l = line.split()
hg_chrom = l[0].decode('utf-8')
hg_pos = int(l[1])
# Current base is contiguous to the previous base
if prev_chrom == hg_chrom and hg_pos == prev_pos + 1:
if i%args.bin_size==0:
fout.write(prev_line+b'\n')
fout.write(line+b'\t')
i += 1
# Current base is a start of new block
else:
fout.write(prev_line+b'\n')
fout.write(line+b'\t')
i = 1
# Store current data
prev_chrom = hg_chrom
prev_pos = hg_pos
prev_line = line
fout.write(line)
# Write separate files for human and mouse regions
os.system("gzip -cd %s.gz | awk -v OFS='\t' '{print $1,$2,$3,$9}' | gzip > %s.h.gz" % (args.output_prefix,args.output_prefix))
os.system("gzip -cd %s.gz | awk -v OFS='\t' '{print $5,$6,$7,$9}' | sort -k1,1 -k2,2n | gzip > %s.m.gz" % (args.output_prefix,args.output_prefix))
main()