-
Notifications
You must be signed in to change notification settings - Fork 1
/
mismatch.pm
513 lines (453 loc) · 16.8 KB
/
mismatch.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
## Package to handle demultiplexing with mismatches, also offering possibility to disallow mismatches in
## certain positions (useful when barcode library was wrongly designed).
## Written by <[email protected]>
### Usage: see demultiplex-{fastq,sam}.pl
package mismatch;
use strict;
use Math::Combinatorics qw(combine);
use Regexp::Optimizer;
use File::Basename;
use FileHandle;
sub readbarcodes {
    ## Utility function to read a barcode file. Returns a hash ref mapping
    ## the barcode sequence (as written in the file) to its barcode id,
    ## e.g. { 'AGCGtT' => 'M3' }.
    ##
    ## Lower case letters (used for disallowing mismatches at specific
    ## positions) are kept as-is in the keys, so such keys will not match
    ## actual (uppercase) barcodes; use mixedcase2upper() for that.
    ##
    ## Expected file format: one barcode per line, barcode id and sequence
    ## (3-12 nt, all of the same length) separated by whitespace. Anything
    ## after a '#' is a comment; empty and whitespace-only lines are skipped.
    ##
    ## Dies on unreadable files, malformed lines, non-uniform barcode
    ## lengths and duplicate ids or (case-insensitively) duplicate sequences.
    my ($file) = @_;
    my $barcodeids         = {};
    my $barcodes_mixedcase = {};
    my $uppercase_codes    = {};
    my $len;
    my $nlowercase = 0;
    my $regexp     = '^[ACGT]{3,12}$';    # (3,12 are a wild guess at sanity)

    open(my $fh, '<', $file) or die "Barcode '$file': $!";
  LINE:
    while (my $line = <$fh>) {
        $line =~ s/[\n\r]*$//;
        $line =~ s/#.*//;
        ## skip lines that are empty or contain only whitespace (e.g. an
        ## indented comment)
        next LINE unless $line =~ /\S/;
        my ($barcodeid, $code) = split(' ', $line);    # e.g. 'G7 \t CCAACAAT'
        die "Expected a barcode id and a sequence, file $file, line $."
            unless defined $code;
        my $l = length($code);
        if (defined($len) && $l != $len) {
            die "Barcode $code has length $l, expected uniform length $len, file $file line $.";
        }
        $len = $l;
        die "Barcode $code does not match /$regexp/i, file $file, line $."
            unless $code =~ /$regexp/i;
        $nlowercase++ if $code =~ /[a-z]/;
        die "Barcode id '$barcodeid' not unique" if $barcodeids->{$barcodeid}++;
        die "Barcode '$code' not unique"          if $uppercase_codes->{"\U$code"}++;
        $barcodes_mixedcase->{$code} = $barcodeid;
    }    # while LINE
    close($fh);
    warn "Found $nlowercase barcodes containing lower case letters. These will be uppercased and will not be allowed to mismatch at the lowercased positions"
        if $nlowercase;
    return $barcodes_mixedcase;
}    # readbarcodes
sub mixedcase2upper {
    ## Given the mixed-case barcode hash (sequence => id, the form used to
    ## build the mismatch regular expressions), return a new hash ref with
    ## the same values but with every key uppercased.
    my ($mixed) = @_;
    my %upper = map { (uc($_), $mixed->{$_}) } keys %$mixed;
    return \%upper;
}
sub convert2mismatchREs {
    ## Takes a hash with barcodes (e.g. { 'AGCGtT' => 'M3' }) and the allowed
    ## number of mismatches; returns, per barcode, one regular expression
    ## representing all allowed mismatched forms of that barcode,
    ## e.g. { 'AGCGTT' => REGEXP(0x25a7788) }.
    ##
    ## Arguments (hash ref or key/value list):
    ##   barcodes           - hash ref keyed by (possibly mixed-case) barcode
    ##   allowed_mismatches - number of mismatches the regexps should allow
    ##
    ## The keys of the returned hash are always uppercased, as this is needed
    ## for mapping to the output file. In the regexps, lowercase letters (if
    ## any) are uppercased and those positions are not allowed to mismatch.
    my $args = ref $_[0] eq 'HASH' ? shift : {@_};
    my $o      = Regexp::Optimizer->new;
    my $mm_REs = {};
    for my $code (keys %{ $args->{barcodes} }) {
        ## empty if allowed_mismatches==0
        my @res = _getmismatch_REs($code, $args->{allowed_mismatches});
        ## Alternation has the lowest precedence in a regexp, so the
        ## alternatives must be grouped; otherwise '^' would only apply to the
        ## first alternative and '$' only to the last one.
        my $r = '^(?:' . join("|", @res) . ')$';
        $r = $o->optimize(qr/$r/);
        $mm_REs->{"\U$code"} = $r;    # just one big regexp. Note the uppercasing
    }    # for $code
    return $mm_REs;
}    # convert2mismatchREs
sub get_mismatchREs {
    ### Build an array ref holding, for each number of mismatches from 1 up
    ### to max_mismatches, the barcode => regexp hash produced by
    ### convert2mismatchREs. The entries are meant to be tried in order of
    ### increasing number of mismatches. Index 0 ('no mismatches') is
    ### deliberately left undefined to avoid confusion.
    ###
    ### Arguments (hash ref or key/value list): barcodes, max_mismatches.
    ### Returns undef when max_mismatches is 0.
    my $args           = ref $_[0] eq 'HASH' ? shift : {@_};
    my $barcodes       = $args->{barcodes};
    my $max_mismatches = $args->{max_mismatches};
    return undef if ($max_mismatches == 0);

    my @res_per_level;
    $#res_per_level = $max_mismatches;    # pre-size; slot 0 stays undef
    for my $n_mm (1 .. $max_mismatches) {
        ## eg. $h->{'AGCGTT'} => REGEXP(0x25a7788)
        $res_per_level[$n_mm] = mismatch::convert2mismatchREs(
            barcodes           => $barcodes,
            allowed_mismatches => $n_mm,
        );
    }
    return \@res_per_level;
}    # get_mismatchREs
sub rescue {
    ### Try to map a barcode that was read with mismatches back to a known
    ### barcode. $mm_REs maps each real barcode to its mismatch regexp.
    ### Returns the first real barcode (not its ID!) whose regexp matches
    ### $foundcode, or undef if none does.
    my ($foundcode, $mm_REs) = @_;
    for my $candidate (keys %$mm_REs) {
        next unless $foundcode =~ $mm_REs->{$candidate};
        return $candidate;
    }
    return undef;
}    # rescue
sub safe_rescue {
    ### Like rescue(), but returns the list of all real barcodes whose
    ### mismatch regexp matches $foundcode (possibly empty), so that the
    ### caller can detect ambiguous rescues.
    my ($foundcode, $mm_REs) = @_;
    return grep { $foundcode =~ $mm_REs->{$_} } keys %$mm_REs;
}    # safe_rescue
sub format_mm {
    ## Given a real barcode ($orig) and the one that was found ($mm, with
    ## mismatches), return the found barcode with every letter that differs
    ## from the real barcode at the same position lowercased.
    my ($orig, $mm) = @_;
    my @expected = split(//, $orig);
    my @observed = split(//, $mm);
    return join(
        "",
        map { $expected[$_] eq $observed[$_] ? $observed[$_] : lc($observed[$_]) }
            0 .. $#expected
    );
}
sub _getmismatch_REs {
    ## For one barcode, build the list of regexp strings that allow up to
    ## $max_mm mismatches. Each string is the (uppercased) barcode with some
    ## positions replaced by '.' (regexp for "any character"). Positions that
    ## are lowercase in $code are fixed: any pattern that would put a '.'
    ## there is left out. Returns an empty list if $max_mm is false.
    my ($code, $max_mm) = @_;
    return () unless $max_mm;

    ## mark the fixed positions with '!'
    my @fixed;
    if ($code =~ /[a-z]/) {
        (my $marked = $code) =~ s/[a-z]/!/g;
        @fixed = split(//, $marked);
        $code  = uc($code);
    }

    my @letters = split(//, $code);
    my @patterns;
    for my $n_mm (1 .. $max_mm) {
        ## combine (Math::Combinatorics) returns all unique combinations of
        ## $n_mm mismatch positions out of the barcode positions (usually
        ## just 1 position, since max_mm is usually 1)
      COMB:
        foreach my $positions (combine($n_mm, 0 .. $#letters)) {
            my @wild = @letters;
            @wild[@$positions] = ('.') x @$positions;
            ## skip patterns that would allow a mismatch at a fixed position
            next COMB
                if grep { $fixed[$_] eq '!' && $wild[$_] eq '.' } 0 .. $#fixed;
            push(@patterns, join("", @wild));
        }
    }
    return @patterns;
}    # _getmismatch_REs
sub hammingdist {
    ## honestly stolen from http://www.perlmonks.org/?node_id=500244
    ## String-XOR the two arguments: positions holding the same character
    ## XOR to a NUL byte. Count those NULs with tr and subtract the count
    ## from the length of the first argument.
    my $xored  = $_[0] ^ $_[1];
    my $n_same = ($xored =~ tr/\0//);
    return length($_[0]) - $n_same;
}
sub getversion {
    ## usage: my $version = getversion($0);
    ##
    ## Locates the given script with `which` and asks git (run from the
    ## script's directory) for the current branch and `git describe` output.
    ## Returns "<branch>_<version>", where version is 'UNKNOWN' if git
    ## describe yields nothing. Returns "NOT_UNDER_VERSION_CONTROL" if the
    ## script cannot be located or is not tracked by git.
    my ($path) = @_;

    ## single-quote a string for safe interpolation into a shell command
    my $quote = sub {
        my ($s) = @_;
        $s = '' unless defined $s;
        $s =~ s/'/'\\''/g;
        return "'$s'";
    };

    my $qpath = $quote->($path);
    my ($fullpath) = `which $qpath 2>/dev/null`;
    ## `which` prints nothing if the script is not found; fileparse() would
    ## croak on an undefined path
    return "NOT_UNDER_VERSION_CONTROL"
        unless defined $fullpath && $fullpath =~ /\S/;
    chomp($fullpath);

    my ($script, $dir) = fileparse($fullpath);
    my ($qscript, $qdir) = map { $quote->($_) } ($script, $dir);

    my $ls = `cd $qdir 2>/dev/null && git ls-files $qscript 2>/dev/null`;
    $ls = '' unless defined $ls;
    chomp($ls);
    return "NOT_UNDER_VERSION_CONTROL" if ($ls ne $script);

    my $branch = `cd $qdir 2>/dev/null && git rev-parse --abbrev-ref HEAD`;
    $branch = '' unless defined $branch;
    chomp($branch);

    my $version = `cd $qdir 2>/dev/null && git describe --match 'v[0-9]*' --tags --dirty --always 2> /dev/null`;
    $version = '' unless defined $version;
    chomp($version);
    $version =~ s/-(\d+)-g([a-f0-9]+)/-$1-$2/;    ## trash the silly 'g' prefix, only confuses things
    $version = 'UNKNOWN' unless $version;
    return $branch . '_' . $version;
}    # getversion
sub commafy {
    ## Insert commas to separate powers of 1000 in the integer part of a
    ## number, e.g. 1234567 => '1,234,567'. A leading sign is preserved and
    ## digits after a decimal point are left untouched. Returns the
    ## formatted string ('' for undef).
    my ($i) = @_;
    my $text = defined $i ? "$i" : '';
    ## repeatedly split off the last three digits of the leading digit run
    1 while $text =~ s/^([-+]?\d+)(\d{3})/$1,$2/;
    return $text;
}
sub demultiplex {
    ## Read records from $input, determine the barcode of each record and
    ## print the record to the output filehandle of the matching library.
    ##
    ## Arguments (hash ref or key/value list):
    ##   type         - 'fastq' or 'bam' (for 'bam', sam text with headers is
    ##                  read from $input)
    ##   input        - filehandle to read from
    ##   outputs      - hash ref: library (or group) name => output
    ##                  filehandle; must include 'UNKNOWN'
    ##   barcodes     - hash ref: uppercase barcode => library id
    ##   mismatch_REs - array ref as returned by get_mismatchREs, or undef
    ##                  to disallow mismatches
    ##   groups       - optional hash ref: library id => group name
    ##   barcode_re   - (bam only) regexp capturing the barcode in $1 from
    ##                  one ':'-separated part of the QNAME
    ##
    ## Returns a hash ref with keys nexact, nrescued (array ref indexed by
    ## number of mismatches, undef if nothing was rescued), nunknown,
    ## statsperbarcode and statspermm.
    my $args = ref $_[0] eq 'HASH' ? shift : {@_};
    my ($type, $input, $outputs, $barcodes, $mismatch_REs, $groups, $barcode_re) =
        map { $args->{$_} } qw(type input outputs barcodes mismatch_REs groups barcode_re);
    die "unknown type '$type', must be fastq or bam" if ($type ne 'fastq' && $type ne 'bam');
    warn "*** This code will not work on FASTQ files produced by Casava < 1.8\n"
        . "*** (does not warn about it yet)\n";

    my ($nexact, $nunknown) = (0, 0);
    my ($nrescued, $statsperbarcode, $statspermm);
    my ($nrefseqs, $warned) = (0, 0);    # only used for bam
    foreach my $code (keys %$barcodes) {
        $statsperbarcode->{$code} = [];
        $statspermm->{$code}      = {};
    }
    my $filehandles = $outputs;

    ## Stop as soon as there is nothing left to read. Testing for EOF only
    ## after a record has been processed would turn empty input (or, for
    ## bam, header-only input) into a bogus record.
  RECORD:
    while (defined(my $record = <$input>)) {
        my $foundcode;
        ## ($foundcode and $record are the only two variables needed)
        if ($type eq 'fastq') {
            ### e.g.: ^@NS500413:172:HVFHWBGXX:1:11101:4639:1062 1:N:0:CCGTCCAT$
            ### this code will NOT work for FASTQ file produced by Casava < 1.8
            ### see https://en.wikipedia.org/wiki/FASTQ_format descriptions used
            ### in FASTQ files from NCBI/EBI
            $foundcode = (split(':', $record))[-1];
            $foundcode =~ s/[\n\r]*$//;
            $record .= <$input>;    # sequence line
            $record .= <$input>;    # '+'
            $record .= <$input>;    # quality line
        }
        else {
            ### sam file, header line:
            if ($record =~ /^\@/) {    # header line, needed by all files
                for my $lib (keys %$filehandles) {
                    $filehandles->{$lib}->print($record);
                }
                $nrefseqs += ($record =~ /^\@SQ/);
                next RECORD;
            }
            ### @@@FIX: at this point we should insert add a @PG record to the bam headers ...
            if ($nrefseqs == 0 && !$warned++) {
                warn "*** expected to find reference sequences in the sam headers (the \@SQ records)\n";
                warn "*** be sure to use output from samtools -h\n";
            }
            ## else: sam file, read line:
            ## e.g. ^NS500413:188:H3M3WBGXY:1:11101:10124:1906:cbc=TACCTGTC:umi=TTCGAC \t 0 \t GLUL__chr1 \t 3255 \t 25 \t 76M \t
            my ($qname) = split("\t", $record);    # only the first column is needed
            for my $part (split(":", $qname)) {
                $foundcode = $1 if $part =~ $barcode_re;
            }
            die "could not find barcode in QNAME '$qname', expected /$barcode_re/, line $." unless $foundcode;
        }

        ### at this point we need and have just $foundcode and $record
        my $lib = $barcodes->{$foundcode};    # majority of cases
        if ($lib) {
            $nexact++;
            $statsperbarcode->{$foundcode}->[0]++;
        }
        else {
            ## try the regexps in order of increasing number of mismatches
            my ($correction, $nmismatches);
            if ($mismatch_REs) {
              TRY:
                for ($nmismatches = 1; $nmismatches < @$mismatch_REs; $nmismatches++) {
                    $correction = mismatch::rescue($foundcode, $mismatch_REs->[$nmismatches]);
                    last TRY if $correction;
                }
            }
            if ($correction) {
                $lib = $barcodes->{$correction};
                $nrescued->[$nmismatches]++;
                $statsperbarcode->{$correction}->[$nmismatches]++;
                $statspermm->{$correction}->{$foundcode}++;
            }
            else {
                $nunknown++;
                $lib = 'UNKNOWN';
            }
        }
        $lib = $groups->{$lib} if $groups;
        $lib = 'UNKNOWN' unless $lib;
        $filehandles->{$lib}->print($record);
    }    # RECORD

    return {
        nexact          => $nexact,
        nrescued        => $nrescued,
        nunknown        => $nunknown,
        statsperbarcode => $statsperbarcode,
        statspermm      => $statspermm,
    };
}    # sub demultiplex
sub open_infile {
    ## Not in use: dies unconditionally on its first statement. The code
    ## below it is unreachable; it would open $file for reading (through
    ## zcat when the name contains '.gz') and return a FileHandle.
    die "not used nor tested";
    my ($file) = @_;
    my $fh     = FileHandle->new();
    my $opened =
          $file =~ /\.gz/
        ? $fh->open("zcat $file | ", "r")
        : $fh->open("< $file");
    die "'$file': $!" unless $opened;
    return $fh;
}
sub close_infile {
## Placeholder counterpart of open_infile: takes no part in any processing
## and always dies when called.
die "not used nor tested";
}
sub open_outfiles {
    ## Open one output pipe per library and return a hash ref mapping
    ## library name => FileHandle.
    ##
    ## Arguments (hash ref or key/value list):
    ##   outdir - optional output directory; must exist and be writable
    ##   prefix - optional string prepended to each file name
    ##   type   - anything matching /fastq/ writes <lib>.fastq.gz through
    ##            gzip; 'bam' writes <lib>.bam through samtools view
    ##   files  - array ref of library names
    ##
    ## Dies on an unusable output directory, an unknown type or a failure
    ## to start the pipe.
    ## NOTE(review): file names are interpolated unquoted into a shell
    ## command, so names containing spaces or shell metacharacters will
    ## misbehave.
    my $args = ref $_[0] eq 'HASH' ? shift : {@_};
    my ($outdir, $prefix, $type, $files) = @{$args}{qw(outdir prefix type files)};
    my @libs = @$files;
    my %fh_of;
    die "Output directory $outdir: $!" if ($outdir && !(-d $outdir && -w $outdir));
    for my $lib (@libs) {
        my ($extension, $command);
        if ($type =~ /fastq/) {
            ($extension, $command) = ('fastq.gz', "| gzip -n > ");
        }
        elsif ($type eq 'bam') {
            ($extension, $command) = ('bam', " | samtools view - -h -b > ");
        }
        else {
            die "open_outfiles: unknown type '$type' requested";
        }
        my $name = "$lib.$extension";
        $name = "$prefix$name"  if $prefix;
        $name = "$outdir/$name" if $outdir;
        my $fh = FileHandle->new($command . $name);
        die "library $lib, file $name: $!" unless $fh;
        warn "Creating/overwriting file $name ...\n";
        $fh_of{$lib} = $fh;
    }
    return \%fh_of;
}    # open_outfiles
sub close_outfiles {
    ## Close all output filehandles in the hash ref returned by
    ## open_outfiles (library name => filehandle). Libraries are closed in
    ## sorted order so that failures are reported deterministically. Dies
    ## on the first handle that fails to close, reporting $! and $? (for
    ## output pipes a failed close can mean the child process failed).
    my ($fhs) = @_;
    for my $lib (sort keys %$fhs) {
        $fhs->{$lib}->close()
            or die "could not close (or open?) demultiplexed output file for library $lib"
            . " (error: '$!', exit status: $?); investigate";
    }
}
sub read_groups {
    ## Return a hash ref mapping barcode id to group.
    ## Expects, per line, a whitespace-separated barcode id and group.
    ## Anything after a '#' is a comment; lines without two fields are
    ## skipped. Dies if the file cannot be opened or if a barcode id occurs
    ## more than once.
    my ($file) = @_;
    open(my $fh, '<', $file) or die "$0: $file: $!";
    my $groups = {};
    while (my $line = <$fh>) {
        $line =~ s/#.*//;
        $line =~ s/[\r\n]*$//;
        next unless $line =~ /\S+\s+\S+/;
        my ($barcode, $group) = split(' ', $line);
        ## test for existence, not truth: a group may be called '0'
        die "barcode $barcode not unique in group file $file, line $.,"
            if exists $groups->{$barcode};
        $groups->{$barcode} = $group;
    }
    close($fh);
    return $groups;
}    # sub read_groups
sub print_statsperbarcode {
    ## Write two tab-separated report files into the current directory:
    ##   counts-overall.txt     - per barcode: exact and per-mismatch-level
    ##                            counts, followed by the same as percentages
    ##   counts-permismatch.txt - per barcode: each mismatched barcode that
    ##                            was rescued to it, with its count
    ##
    ## Arguments (hash ref or key/value list):
    ##   stats          - hash ref: barcode => array ref of counts indexed
    ##                    by number of mismatches (0 = exact)
    ##   mismatches     - hash ref: barcode => { found barcode => count }
    ##   max_mismatches - highest number of mismatches to report
    ##   barcodes       - hash ref: barcode => id
    ##
    ## Barcodes without any reads get 0.0 for all percentages.
    my $args = ref $_[0] eq 'HASH' ? shift : {@_};
    my ($stats, $mismatches, $max_mismatches, $barcodes) =
        map { $args->{$_} } qw(stats mismatches max_mismatches barcodes);
    my @levels = 1 .. ($max_mismatches || 0);

    ## overall stats:
    my $file = "counts-overall.txt";
    warn "Creating file $file ...\n";
    open(my $out, '>', $file) or die "$file: $!";
    my $plain = "exact\t" . join("\t", map { "${_}mm" } @levels);
    my $perc  = "%exact\t" . join("\t", map { "%" . $_ . "mm" } @levels);
    print {$out} "#id\tcode\t$plain\t$perc\n";
    foreach my $code (sort keys %$barcodes) {
        my $id     = $barcodes->{$code};
        my $nexact = $stats->{$code}->[0] || 0;
        my @nmm    = map { $stats->{$code}->[$_] || 0 } @levels;
        my $total  = $nexact;
        $total += $_ for @nmm;
        ## avoid dividing by zero for barcodes that were never seen
        my $percent = sub { my ($n) = @_; return $total ? 100 * $n / $total : 0; };

        print {$out} "$id\t$code\t$nexact\t";
        print {$out} mismatch::commafy($_) . "\t" for @nmm;
        ## now percentages:
        printf {$out} "%.1f\t", $percent->($nexact);
        printf {$out} "%4.1f\t", $percent->($_) for @nmm;
        print {$out} "\n";
    }
    close($out) or die "$file: $!";

    ### now per mismatch
    $file = "counts-permismatch.txt";
    warn "Creating file $file ...\n";
    open($out, '>', $file) or die "$file: $!";
    print {$out} "#id\tbarcode\tmismatched barcodes with counts\n";
    foreach my $code (sort keys %$barcodes) {
        my $id = $barcodes->{$code};
        print {$out} "$id\t$code\t";
        my $mms = $mismatches->{$code} || {};
        for my $m (sort keys %$mms) {
            print {$out} format_mm($code, $m) . ": $mms->{$m}\t";
        }
        print {$out} "\n";
    }
    close($out) or die "$file: $!";
}    # sub print_statsperbarcode
sub _open_fh {
    ## Return a filehandle to print to.
    ##   - false argument: a new FileHandle on stdout
    ##   - a filehandle (FileHandle or other IO::Handle object, or a plain
    ##     glob reference as produced by open(my $fh, ...)): returned as is
    ##   - any other true scalar: taken as a file name, opened for appending
    ## Dies with "expected FileHandle" on any other kind of reference, and
    ## with the file name and $! if the file cannot be opened.
    my ($fh) = @_;
    if (!$fh) {
        ## $fh= FileHandle->new_from_fd(1, ">"); # stdout
        return FileHandle->new("> -");    # stdout
    }
    if (ref $fh) {
        die "expected FileHandle"
            unless ref $fh eq 'GLOB' || UNIVERSAL::isa($fh, 'IO::Handle');
        return $fh;
    }
    ## keep the file name in its own variable so that it is still available
    ## for the error message if opening fails
    my $path = $fh;
    warn "appending to file $path ... ";
    my $opened = FileHandle->new($path, '>>') or die "$path: $!";
    return $opened;
}    # _open_fh
sub invert_hash {
    ## go from $h->{CCTGCA}=> 'K13' to $h->{K13}=>'CCTGCA'
    ## Returns a new hash ref; dies if a value occurs more than once.
    my ($hash) = @_;
    my $new = {};
    ## Iterate with keys() rather than each(): each() continues from
    ## wherever the hash's iterator was left, so entries could be silently
    ## skipped if the caller (or an earlier aborted loop) had advanced it.
    for my $key (keys %$hash) {
        my $val = $hash->{$key};
        die "cannot invert hash: val $val (key $key) is not unique "
            if exists $new->{$val};
        $new->{$val} = $key;
    }
    return $new;
}    # invert_hash
sub byletterandnumber {
    ## usage: @sorted = sort { mismatch::byletterandnumber($a,$b) } @unsorted
    ##
    ## Compare two ids such as 'A2' and 'A10': first by the letter prefix
    ## (string comparison), then by the number that follows it (numeric
    ## comparison). If that does not decide (e.g. ids without any digits,
    ## or 'A1' versus 'A01'), fall back to comparing the complete ids as
    ## strings, so that the resulting order is deterministic.
    my ($aa, $bb) = @_;
    my $re = qr/([A-Za-z_]*)([0-9]+)/;
    my ($Sa, $Na) = ($aa =~ $re);
    my ($Sb, $Nb) = ($bb =~ $re);
    ## ids without digits do not match: treat as empty prefix, number 0
    for ($Sa, $Sb) { $_ = '' unless defined $_; }
    for ($Na, $Nb) { $_ = 0  unless defined $_; }
    return ($Sa cmp $Sb) || ($Na <=> $Nb) || ($aa cmp $bb);
}    # byletterandnumber
sub print_barcode_readgroups {
    # Print one @RG header line per barcode id (ID:<id>, DS:<barcode>),
    # ordered with byletterandnumber, followed by an @RG line with ID:UNK
    # for unknown barcodes.
    # $barcodes maps barcode => id. $fh selects the destination:
    #   a string:     that file is opened and appended to
    #   a filehandle: printed there
    #   undef:        printed to stdout
    my ($barcodes, $fh) = @_;
    my $cbc_of = invert_hash($barcodes);
    my @ids    = sort { byletterandnumber($a, $b) } keys %$cbc_of;
    $fh = _open_fh($fh);
    for my $id (@ids) {
        print {$fh} "\@RG\tID:$id\tDS:$cbc_of->{$id}\n";
    }
    print {$fh} "\@RG\tID:UNK\tDS:unknown\n";
}    # print_barcode_readgroups
1;