-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_sam_cel384v2.pl
executable file
·342 lines (283 loc) · 12 KB
/
process_sam_cel384v2.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
#!/usr/bin/perl -w
## script to demultiplex CELSeq2 single-cell RNA seq data, do the bookkeeping and convert reads to txpt counts
## original writtten by Dominic Grün and Lennart Kester. Heavily modified by Philip Lijnzaad <plijnzaad@gmail.çom>.
use strict;
use Carp;
use File::Basename;
use FileHandle;
use Getopt::Long;
use tools;
use mismatch;
my $version=getversion($0);
warn "Running $0, version $version\nwith arguments:\n @ARGV\n";
our ($barfile, $umi_len, $cbc_len, $allow_mm, $prefix, $ref, $help);
my $usage = "
Usage: $0 --barcodefile barcodes.csv --umi_len UMILENGTH --cbc_len CBCLENGTH [--prefix name ] [ -allow_mm=1 ] file.bam [ file2.bam ...]
Arguments:
--barcodefile FILE File with cell bar codes (format: id \\t sequence)
--umi_len N Length of the UMIs
file.bam ... Name(s) of the bam file(s), typically from several lanes. If
more than one they are assumed to have the same header.
Options:
--help This message
--allow_mm N How many mismatches to allow in the cell barcodes (default: 0)
--prefix name Prefix for the four output files: NAME.coutt.csv, NAME.coutb.csv, NAME.coutc.csv and NAME.sout
Default is the name of the first bam file without extension and lane number.
--ref name Name of the reference genome (only for logging)
";
### There used to be code to deal with UMIs containing N's, but they are extremely rare and not worth rescuing given
### the complexity of it. They were removed after version v15 (14 march 2017, commit 8fcc58d11a20b0f2cbdc978edfa3742c4e32a900)
die $usage unless GetOptions('barcodefile=s'=> \$barfile,
'umi_len=i'=> \$umi_len,
'cbc_len=i'=> \$cbc_len,
'allow_mm=i'=> \$allow_mm,
'prefix=s' => \$prefix,
'ref=s' => \$ref,
'help|h' => \$help);
my @bams=@ARGV;
if ( $help || !(@bams && $barfile && $umi_len && $cbc_len) ) {
die $usage;
}
my $maxumis = 4 ** $umi_len;
my $barcodes_mixedcase = mismatch::readbarcodes($barfile); ## eg. $h->{'AGCGtT') => 'M3'
my $barcodes = mismatch::mixedcase2upper($barcodes_mixedcase); ## e.g. $h->{'AGCGTT') => 'M3'
sub byletterandnumber { # sort the barcodes by their ids (which may contain prefixes)
my ($aa,$bb) = ($a,$b);
$aa=$barcodes->{$aa};
$bb=$barcodes->{$bb};
my($re)= qr/([A-Za-z_]*)([0-9]+)/;
my ($Sa, $Na) = ($aa =~ $re);
my ($Sb, $Nb) = ($bb =~ $re);
($Sa cmp $Sb) || ($Na <=> $Nb);
}
my @cbcs = sort byletterandnumber (keys %$barcodes); # e.g. 'AGCGTT'
my @wells = map { $barcodes->{$_} } @cbcs; # e.g. 'B6'
my $mismatch_REs=undef;
if ($allow_mm) {
$mismatch_REs = mismatch::convert2mismatchREs(barcodes=>$barcodes_mixedcase, allowed_mismatches =>$allow_mm);
}
$barcodes_mixedcase=undef; # not used in remainder, delete to avoid confusion
my $nreads = 0; # everything
my $ninvalidUMI=0;
my $nantisense=0;
my $nunisense=0;
my $nvalid=0;
my $nignored=0;
## mismatched cell barcodes:
my $nmmCBC=0;
my $nmapped_mmCBC=0;
my $nrescued_mmCBC=0;
my $nmapped=0;
my $nunimapped=0;
my $nrefgenes=0;
my $nERCCs=0;
my $tc = {};
# read through sam file create a hash with all genes and cells and extract mapped reads into the hash
my $samtools = "samtools"; # alternative: sambamba, might be faster
if (!$prefix) {
$prefix=$bams[0];
$prefix =~ s/\.bam//;
$prefix =~ s/[-_,.]L0*[1-4][-_,.]//g if @bams > 1;
warn "using prefix $prefix\n";
}
for my $f (@bams) {
die "$f: $!" unless $f;
}
my $cmd = "($samtools view -H $bams[0]; " . join("; ", map { "samtools view $_" } @bams) . ")";
open(IN,"$cmd |") || die "$cmd: $!";
my $saturation = "$prefix-saturation.txt";
open(SATURATION, "> $saturation") || die "$saturation: $!";
my @headers=qw(reads nmapped nvalid genes umis txpts);
print SATURATION "#" . join("\t", @headers) . "\n";
print SATURATION join("\t", (1) x int(@headers) )."\n";
## well saturation: open three files
my $wellsat_prefix = "$prefix-wellsat";
my $wellsat_files={};
for my $type ( qw(genes umis) ) {
my $file="$wellsat_prefix-$type.txt";
my $fh = FileHandle->new("> $file") || die "$file: $!";
$wellsat_files->{$type}=$fh;
print $fh "reads\t" . join("\t", @wells) . "\n";
}
my $sample_every = 10_000;
my $genes_seen={}; # cumulative
my $umis_seen={}; # cumulative
my $wellwise_seen={}; # keys are those of %$wellsat_files
sub umi_correction {
my($n, $maxumis)=@_;
return 'NA' if $maxumis <= 1;
$n= $n - 0.5 if $n >= $maxumis;
sprintf("%.2f", -$maxumis*log(1 - ($n/$maxumis)));
}
HEADERLINE:
while(<IN> ) {
## loop is shifted one readline so final printing of SATURATION is not skipped
if ( /^\@SQ/ ){
my ($tag, $name, @rest)=split("\t", $_);
if ($name =~ /^ERCC-0/) { $nERCCs++; } else { $nrefgenes++; }
next HEADERLINE;
}
if ( /^\@PG/ ){
warn "$0: found: $_"; # PL:should check if it contains bwa
next HEADERLINE;
}
last HEADERLINE unless /^@/;
}
READ:
while(1) {
chomp $_;
my @r1 = split("\t",$_);
my ($QNAME,$FLAG,$RNAME,$POS,$MAPQ,$CIGAR,$MRNM,$MPOS,$ISIZE,$SEQ,$QUAL,@rest)=@r1;
my ($cbc,$umi);
my(@parts)=split(':', $QNAME);
for my $tag ( @parts ) {
$cbc= $1 if $tag =~ /cbc=([A-Z]+)/i;
$umi= $1 if $tag =~ /umi=([A-Z]+)/i;
}
die "$0: could not find cbc= or umi= in id $QNAME of the files @bams " unless $cbc && $umi;
if ($umi =~ /N/i) { # very rare (< 0.1%), not worth rescuing. See v15 for code that did.
$ninvalidUMI++;
$nreads++;
next READ;
}
my $X0 = 0;
my $dum = 'NA';
foreach my $el (@rest){
# ($dum,$dum,$NM) = split(":",$el) if ($el =~ /^NM\:/); # NM: number of mismatches
# ($dum,$dum,$XA) = split(":",$el) if ($el =~ /^XA\:/); # XA: number of alternative hits (chr,pos,CIGAR,NM;)+
($dum,$dum,$X0) = split(":",$el) if ($el =~ /^X0\:/); # X0: number of best hits (bwa-specific!)
}
$nmapped += ($X0 > 0); # $X0 is number of (equally) optimal locations to which the read maps (one was chosen at random)
$nunimapped += ($X0 == 1);
my $antisense= !!($FLAG & 16);
$nantisense += $antisense;
$nunisense += ( ($X0 == 1) && !$antisense);
if (! exists $barcodes->{$cbc} && $allow_mm) {
$cbc=mismatch::rescue($cbc, $mismatch_REs); # gives back the barcode without mismatches (if it can be found)
$nrescued_mmCBC += defined($cbc);
}
## count only reads with valid barcode, uniquely mapping in the sense orientation:
if ($cbc && exists $barcodes->{$cbc}) {
if ($X0 == 1 && ! $antisense){
$nvalid++;
$tc->{$RNAME}{$cbc}{$umi}++;
if ($RNAME !~ /^ERCC-/ ) {
$genes_seen->{$RNAME}++ ;
$umis_seen->{$RNAME.$umi}++;
$wellwise_seen->{'genes'}{$cbc}{$RNAME}++;
$wellwise_seen->{'umis'}{$cbc}{$RNAME.$umi}++;
}
} else {
$nignored++;
$tc->{'#IGNORED'}{$cbc}{$umi} ++;
## keep track of some (non-exclusive!) subsets of this
$tc->{'#unmapped'}{$cbc}{$umi} += ($X0 == 0 );
$tc->{'#multimapped'}{$cbc}{$umi} += ($X0 > 1 );
$tc->{'#antisense'}{$cbc}{$umi} += $antisense; # (may overlap with multimappers)
}
} else {
$nmmCBC++;
$nmapped_mmCBC += ($X0 > 0);
}
$nreads++;
} continue {
if ($nreads % $sample_every == 0 || eof(IN) ) {
my $g=int(keys(%$genes_seen)); # includes those
my $u=int(keys(%$umis_seen)); # with unknown CBC
print SATURATION join("\t",
($nreads,
$nmapped, # includes non-unique and/or antisense and/or invalid CBC
$nvalid, # only the valid reads (those used in downstream analysis)
$g,
$u,
umi_correction($u,$maxumis*$g))) . "\n";
## note: for the wellwise counts, only print $nreads; get the rest from the
## overall saturation counts
my $fh;
my @genecounts = map { int keys %{$wellwise_seen->{'genes'}{$_}} } @cbcs;
$fh=$wellsat_files->{'genes'};
print $fh "$nreads\t" . join("\t", @genecounts) . "\n";
my @umicounts = map { int keys %{$wellwise_seen->{'umis'}{$_}} } @cbcs;
$fh=$wellsat_files->{'umis'};
print $fh "$nreads\t" . join("\t", @umicounts) . "\n";
}
warn int($nreads/1_000_000) . " million reads processed\n" if $nreads % 1_000_000 == 0;
last READ if eof(IN);
$_ = <IN>;
} # READ
close(IN) || die "$cmd: $!";
close(SATURATION) || die "$saturation: $!";
foreach my $type (keys %$wellsat_files ) {
my $fh= $wellsat_files->{$type};
$fh->close() || die "Well saturation file for type $type :$!";
}
my $coutt = "$prefix.coutt.csv";
my $coutb = "$prefix.coutb.csv";
my $coutc = "$prefix.coutc.csv";
my $sout = "$prefix.sout";
open(OUTT, "> $coutt") || die "$coutt: $!";
open(OUTB, "> $coutb") || die "$coutb: $!";
open(OUTC, "> $coutc") || die "$coutc: $!";
open (SOUT, "> $sout") || die "$sout: $!";
print OUTT "GENEID\t".join("\t", @wells)."\n";
print OUTB "GENEID\t".join("\t", @wells)."\n";
print OUTC "GENEID\t".join("\t", @wells)."\n";
## gather read counts, umi counts and transcript counts
my $trc = 0;
my $nsaturated_umis=0;
GENE:
foreach my $gene (sort keys %$tc) {
print OUTB $gene;
print OUTT $gene;
print OUTC $gene;
WELL:
foreach my $cbc (@cbcs) {
my $n = 0; # distinct UMIs for this gene+well
my $rc = 0; # total reads for this gene+well
my $umihash=$tc->{$gene}{$cbc};
my @umis = keys %{$umihash};
UMI:
foreach my $umi (@umis) {
my $reads=$tc->{$gene}{$cbc}{$umi};
$n += ($reads > 0);
$rc += $reads; # total valid (=uniquely sense-mapped) reads for this gene+cell
} # UMI
$trc += $rc unless $gene =~ /^#/;
$nsaturated_umis += ($n == $maxumis);
$n = $n - 0.5 if ($n == $maxumis); # saturation correction
my $txpts = $n; # used only for '#IGNORED' etc. @@@fix this
$txpts = -log(1 - ($n/$maxumis)) * $maxumis unless ($gene =~ /^#/ ); # binomial correction
print OUTB "\t$n";
print OUTC "\t$rc";
print OUTT "\t$txpts";
} # WELL
print OUTT "\n";
print OUTB "\n";
print OUTC "\n";
} # GENE
sub stat_format {
my($part, $total)=@_;
sprintf("%s / %s = %.1f %%\n", commafy($part), commafy($total), 100*$part/$total);
}
$ref = "unknown" unless $ref;
print SOUT "reference transcriptome: $ref\n";
print SOUT " contains $nERCCs ERCCs and " . commafy($nrefgenes) . " genes\n";
print SOUT "invalid UMI (all other numbers refer to valid UMIs): " , stat_format($ninvalidUMI, $nreads);
print SOUT "number of mapped reads: " , stat_format($nmapped, $nreads);
print SOUT "uniquely mapping reads: ", stat_format($nunimapped, $nreads);
print SOUT "antisense mapping reads: ", stat_format($nantisense, $nreads);
print SOUT "uniquely sense mapping reads: ", stat_format($nunisense, $nreads);
print SOUT "uniquely sense mapping with valid CBC: " , stat_format($trc, $nreads);
warn "*** trc ($trc) should be == nvalid ($nvalid) *** \n" if $trc != $nvalid;
print SOUT "rescued mismatched CBC: " , stat_format($nrescued_mmCBC, $nreads);
print SOUT "unknown CBC: " , stat_format($nmmCBC, $nreads );
print SOUT "mapped read, but unknown CBC: " , stat_format($nmapped_mmCBC, $nunisense);
print SOUT "total reads = unique&valid + ignored + unknown CBC + invalidUMI:\n"
. sprintf("%d = %d + %d + %d + %d\n", $nreads,$trc, $nignored, $nmmCBC, $ninvalidUMI);
$nreads /= 100;
print SOUT "%% unique&valid + ignored + mmCBC + invalidUMI:\n"
. sprintf("100%% = %.1f + %.1f + %.1f + %.1f\n",
$trc/$nreads, $nignored/$nreads, $nmmCBC/$nreads, $ninvalidUMI/$nreads);
## note: should be slightly less than 100%: we're threw out the invalid UMIs
print SOUT "well+gene combinations that used up all umis: $nsaturated_umis\n";
close SOUT;