forked from hemberg-lab/scRNA.seq.course
-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_Flexible_FullTranscript_Demultiplexing.pl
154 lines (141 loc) · 4.06 KB
/
1_Flexible_FullTranscript_Demultiplexing.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
use strict;
use warnings;
# Input a pair of sequencing FastQ files
# gunzip, read, write out smaller broken down files in a format suitable for submitting job array, and re-gzip each one in turn
# Breakdown by lane & cellID
# Keep order.
# This should work equally well for single-end reads and can take any number of files as arguments.
# TESTED
if (@ARGV != 7) {
print STDERR "perl 1_Flexible_FullTranscript_Demultiplexing.pl read1.fq read2.fq b_pos b_len index mismatch prefix\n";
print STDERR "
read1.fq : barcode containing read
read2.fq : non-barcode containg read
b_pos : position of cell-barcode in the read. [\"start\" or \"end\"]
b_len : length of cell-barcode (bp)
index : file contain a single column of expected barcodes
mismatch : maximum number of permitted mismatches (recommend 2)
prefix : prefix for output fq files.\n";
exit(1);
}
my $infile1 = $ARGV[0];
my $infile2 = $ARGV[1];
my $barcode_pos = $ARGV[2];
my $barcode_len = $ARGV[3];
my $barcode_index_file = $ARGV[4];
my $MAXmismatch = $ARGV[5];
my $OUTprefix = $ARGV[6];
if ($OUTprefix =~ /^(.+)\/[^\/]$/) {
if ($1 ne ".") {
system("mkdir -p $1");
}
}
my %CellBarcodes = ();
my %ofhs1 = ();
my %ofhs2 = ();
open(my $ifh, $barcode_index_file) or die "Cannot open $barcode_index_file\n";
my $index=1;
while (<$ifh>) {
chomp;
$CellBarcodes{$_} = $index;
open(my $fh1, '>', "$OUTprefix\_$_\_read1.fq") or die $!;
$ofhs1{$index} = $fh1;
open(my $fh2, '>', "$OUTprefix\_$_\_read2.fq") or die $!;
$ofhs2{$index} = $fh2;
$index++;
} close($ifh);
my $NotProperBarcodes = 0;
my $NotPossibleCell = 0;
my $AmbiguousCell = 0;
my $ExactMatch = 0;
my $Mismatch = 0;
my $total_reads = 0;
my $OutputReads = 0;
open(my $ifh1, $infile1) or die $!;
open(my $ifh2, $infile2) or die $!;
while(<$ifh1>) {
my $file1line=$_;
my $file2line = <$ifh2>;
if ($file1line =~ /^@/) {
# Ensure matching pair of reads
my @thing1 = split(/\s+/, $file1line);
my @thing2 = split(/\s+/, $file2line);
my $readname = $thing1[0];
#if ($readname ne $thing2[0]) {die "file1 & file2 readnames don't match!\n";}
my $barcode_read = <$ifh1>;
chomp $barcode_read;
my $read2 = <$ifh2>;
chomp $read2;
$total_reads++;
<$ifh1>; <$ifh2>;
my $file1qual = <$ifh1>;
chomp $file1qual;
my $file2qual = <$ifh2>;
chomp $file2qual;
my $CellID = "";
if ($barcode_pos eq "start") {
$CellID = substr($barcode_read, 0, $barcode_len, "");
substr($file1qual, 0, $barcode_len, "");
} else {
$CellID = substr($barcode_read, -$barcode_len, $barcode_len, "");
substr($file1qual, -$barcode_len, $barcode_len, "");
}
my $mismatches = 0;
if (!exists($CellBarcodes{$CellID})) {
my @matches = ();
my %close = ();
foreach my $expected_barcode (keys(%CellBarcodes)) {
my $count = ( $expected_barcode ^ $CellID ) =~ tr/\0//;
if ($count >= length($expected_barcode)-$MAXmismatch) {
$close{$expected_barcode} = $count;
}
}
if (scalar(keys(%close)) > 0) {
my $max = my_max(values(%close));
$mismatches = length($CellID) - $max;
foreach my $code (keys(%close)) {
if ($close{$code} == $max) {
push(@matches, $code);
}
}
}
if (scalar(@matches) == 1) {
$CellID = $matches[0];
$Mismatch++;
} elsif (scalar(@matches) > 1) {
$AmbiguousCell++;
next;
} else {
$NotPossibleCell++;
next;
}
} else {
$ExactMatch++;
}
# print the read
my $handle1 = $ofhs1{$CellBarcodes{$CellID}};
my $handle2 = $ofhs2{$CellBarcodes{$CellID}};
print $handle1 "$readname\n$barcode_read\n+\n$file1qual\n";
print $handle2 "$readname\n$read2\n+\n$file2qual\n";
$OutputReads++;
} else {next;}
}
print STDERR "
Doesn't match any cell: $NotPossibleCell
Ambiguous: $AmbiguousCell
Exact Matches: $ExactMatch
Contain Mismatches: $Mismatch
Input Reads: $total_reads
Output Reads: $OutputReads\n";
close($ifh1);
close($ifh2);
foreach my $ofh1 (keys(%ofhs1)) {close($ofhs1{$ofh1});}
foreach my $ofh2 (keys(%ofhs2)) {close($ofhs2{$ofh2});}
sub my_max {
if (scalar(@_) == 1) {return($_[0])};
my $max = shift;
foreach my $ele (@_) {
if ($ele > $max) {$max = $ele;}
}
return($max);
}