forked from JunyueC/sci-RNA-seq3_pipeline
-
Notifications
You must be signed in to change notification settings - Fork 1
/
rm_dup_barcode_UMI_no_mismatch.py
78 lines (67 loc) · 2.54 KB
/
rm_dup_barcode_UMI_no_mismatch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
'''
This script accept a input sorted sam file, a output sam file, and a mismatch rate, then it will remove
duplicates based on the barcode + UMI (edit distance <= 1), and chromatin and start site, at the same
time, it will output the duplication number for each read, and generate the histogram plot for the read
per duplication number
'''
# using python3 updated 2024-03-12 used ChatGPT to convert so hopefully they did a good job
import sys
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
sys.setrecursionlimit(1000000)
def rm_dup_samfile(samfile, output_file, mismatch):
with open(samfile) as f1, open(output_file, 'w') as f2, open(output_file+'.csv', 'w') as f3:
pre_barcode = set()
pre_line = []
unique_id = []
pre_chrom = 0
pre_site = 0
dup = False
mismatch = int(mismatch)
pre_dup_num = 0
cur_dup_num = 0
for line in f1:
if line[0] == '@':
f2.write(line)
else:
name = (((line.split('\t'))[0]).split(','))
barcode_UMI = name[0] + name[1]
chrom_num = (line.split('\t'))[2]
start_site = (line.split('\t'))[3]
if start_site == pre_site and chrom_num == pre_chrom:
dup = False
if barcode_UMI not in pre_barcode:
pre_dup_num = cur_dup_num
cur_dup_num = 1
f2.write(line)
pre_barcode.add(barcode_UMI)
f3.write('%d\n' % pre_dup_num)
else:
cur_dup_num += 1
else:
pre_dup_num = cur_dup_num
cur_dup_num = 1
f2.write(line)
pre_chrom = chrom_num
pre_site = start_site
pre_barcode = set()
pre_barcode.add(barcode_UMI)
if pre_dup_num != 0:
f3.write("%d\n" % pre_dup_num)
'''
#plot the histogram for the read duplication number
dups = (pd.read_csv(output_file+'.csv', header=None))[0]
fig = plt.figure()
plt.hist(dups, bins=100)
plt.xlabel("Duplication number")
plt.ylabel("Read number")
fig.savefig(output_file + '.png')
'''
if __name__ == "__main__":
samfile = sys.argv[1]
output_file = sys.argv[2]
mismatch = sys.argv[3]
rm_dup_samfile(samfile, output_file, mismatch)