-
Notifications
You must be signed in to change notification settings - Fork 0
/
DeduperSR.py
executable file
·194 lines (145 loc) · 6.12 KB
/
DeduperSR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env python
# Version 0.1
# This is a genome referenced-based deduplication pseudocode.
# Removes all PCR duplicates in a given sorted SAM file by Chromosome number then UMI.
# Remove all supplemental and unmapped reads.
# This program for single-reads only.
import argparse
import re
# UMI set
umi_set = set()
# set to store unique read info: {UMI, chrom, pos, strand}
umipos_set = set()
# dictionary to keep track of current record
# {"UMI":None, "Chrom":None, "Pos":None, "Strand":None}
record_dict = {}
# unique reads per chromosome
count_dict = {"header":0, "uniq":0, "incorrect":0, "dups":0, "removed": 0}
# keep track of which chromosome we're checking.
curr_chrom = ""
prev_chrom = ""
def get_args():
parser = argparse.ArgumentParser(description="")
parser.add_argument("-f", "--input", required=True, type=str, help="absolute filepath to sorted .sam file")
parser.add_argument("-o", "--output", required=True, type=str, help="absolute filepath to output sam file")
parser.add_argument("-u", "--umi", required=True, type=str, help="absolute filepath to umi txt file")
return(parser.parse_args())
args = get_args()
input_file = args.input
out_file = args.output
umi_file = args.umi
# Create UMI set with given UMI text file
def create_UMI_set():
with open(umi_file, "r") as f:
line_count = 0
for line in f:
line_count += 1
if line_count > 2 :
umi_set.add(line.strip())
# umi_set.add("ACT") # for test files
# umi_set.add("AAA") # for test files
# parse bitwise flag
def check_flag(flag):
# check if read is unmapped, or supplementary alignment
# if yes, skip
if ((flag & 4) == 4 or (flag & 2064) == 2064 or (flag & 2048) == 2048) :
count_dict["removed"] += 1
return "SKIP"
# check if sequence is reverse
# if yes, add "-" to current search list
if ((flag & 16) == 16):
record_dict["Strand"] = "-"
else:
record_dict["Strand"] = "+"
return ""
# parse sequence record for QNAME, bitwise flag, chromosome, position,
# and cigar string
# return "SKIP" to stop searching if UMI doesn't exist
def parse_sam(line):
# Grab UMI in QNAME of sequence line (col 0)
# Check if UMI is in UMI set
# if yes, add to current search list
# otherwise, ignore line and return "SKIP"
# umi = re.search("[ATGC]{3}", line[0])[0] # for test files
umi = line[0].split(":")[-1]
if umi in umi_set:
record_dict["UMI"] = umi
else:
count_dict["incorrect"] +=1
return "SKIP"
# Grab bitwise flag (col 1)
flag = check_flag(int(line[1]))
if flag == "SKIP":
return "SKIP"
# chromosome RNAME (col 2)
# add chromosome num to current search list
record_dict["Chrom"]= line[2]
prev_chrom = curr_chrom
# add left-most position (col 3) to current search list
record_dict["Pos"] = int(line[3])
# return cigar string (col 5)
return line[5]
# parse input cigar string
# store alignment match, soft clipping, skipped regions, and deletion variables
# update position if needed
def parse_cigar(cigar):
# if current strand is positive, check for soft clipping on 5'
# adjust left-most position if needed
if record_dict["Strand"] == "+":
left_soft = [int(i) for i in re.findall("[0-9]+(?=S[0-9]+)", cigar)]
record_dict["Pos"] -= sum(left_soft)
# if strand is reverse, adjust for:
# alignment match(es), deletions, right soft clippings, skipped regions
# position = position + alignment match + skipped region + right alignment match
# + right soft clipping + deletion - 1 (left adjusted, inclusive) ** didn't adjust for inclusive.
if record_dict["Strand"]== "-":
record_dict["Pos"] += sum([int(i) for i in re.findall("[0-9]+(?=M)", cigar)]) # align
record_dict["Pos"] += sum([int(i) for i in re.findall("[0-9]+(?=D)", cigar)]) # deletion
record_dict["Pos"] += sum([int(i) for i in re.findall("(?<=[A-Z]{1})[0-9]+(?=S)", cigar)]) # right soft clip
record_dict["Pos"] += sum([int(i) for i in re.findall("[0-9]+(?=N)", cigar)]) # skipped
# write out counter for headers, unique reads, duplicates, and incorrect UMIs found
# write out counter for each unique read found per chromosome
def write_counters():
with open("deduper_report.txt", "w") as f1:
for key in count_dict:
print(f"{key}:\t{count_dict[key]}\tfound", file=f1)
# Read sorted input .sam file (will be sorted by chromosome then by UMI).
# Open write file.
with open(input_file, "r") as f, \
open(out_file, "w") as out:
create_UMI_set()
for line in f:
# If current line is a header, write out "@" headers to .sam file
# else save line to write out when unique record is found
# update count dictionary with number of header lines
if line.strip().startswith("@") == True:
print(f"{line.strip()}", file=out)
count_dict["header"] += 1
continue
record = line.strip().split("\t")[0:9]
cigar = parse_sam(record)
# if UMI in sam record doesn't exist, stop search and continue to next record
# otherwise, continue parse through cigar string
if cigar == "SKIP":
continue
parse_cigar(cigar)
# add UMI, chromosome, updated position, and strand type to dictionary as key
# add sequence record as value and true for unique found
# write out record to sam file if unique
# update count dictionary of unique or duplicates
if tuple(record_dict.values()) not in umipos_set:
umipos_set.add(tuple(record_dict.values()))
print(f"{line.strip()}", file=out)
count_dict["uniq"] +=1
else:
# duplicate found
# update duplicate counter
count_dict["dups"] +=1
# done searching for duplicates for this chromosome
if prev_chrom != curr_chrom:
#umipos_set = {}
umipos_set.clear()
record.clear()
# done searching for all duplicates
# write out counters
write_counters()