-
Notifications
You must be signed in to change notification settings - Fork 1
/
calculate_linkage_disequilibrium.pl
executable file
·827 lines (713 loc) · 42.9 KB
/
calculate_linkage_disequilibrium.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
#!/usr/bin/env perl
use warnings;
$| = 1;
#Add use lib statement to assume there is a directory at the same level as bin in which the script is run, called 'lib'
use FindBin;
use lib "$FindBin::Bin/../lib";
use lib "$FindBin::Bin";
use strict;
use FileHandle;
use aomisc;
use Cwd;
use diagnostics;
use Getopt::Long;
use Data::Dumper;
use File::Basename;
use File::Temp;
use Statistics::R;
use Parallel::Loops;
# Andrew J. Oler, PhD
# Computational Biology Section
# Bioinformatics and Computational Biosciences Branch (BCBB)
# OCICB/OSMO/OD/NIAID/NIH
# Bethesda, MD 20892
#
#
#This package is free software; you can redistribute it and/or modify
#it under the terms of the GPL (either version 1, or at your option,
#any later version) or the Artistic License 2.0.
#Print out the options
if (@ARGV){ print STDERR "Arguments: ", join " ", @ARGV, "\n"; }
my $save;
my $files;
my $verbose;
my $output;
my $gzip;
my $prefix;
my $nofilter;
my $variant_threshold = 0.001;
my $cpu = 2;
my $force;
my $label = "Sample";
my $group_id = "Group";
my $table_only;
GetOptions('save=s' => \$save,
'output=s' => \$output,
'verbose' => \$verbose,
'files=s' => \$files,
'gzip' => \$gzip,
'prefix=s' => \$prefix,
'nofilter' => \$nofilter,
'variant_threshold=s' => \$variant_threshold,
'cpu|p=s' => \$cpu,
'force' => \$force,
'label|labels=s' => \$label,
'group_id=s' => \$group_id,
'table_only' => \$table_only,
);
#-----------------------------------------------------------------------------
#----------------------------------- MAIN ------------------------------------
#-----------------------------------------------------------------------------
my $usage = "
calculate_linkage_disequilibrium.pl Takes a file of variants and sequences and checks for
linkage between each pair of variants. Variants are queried at the level of nucleotide,
codon, and amino acid. Linkage is computed with fisher's exact test and is reported as
p-value, FDR-adjusted p-value, and Odds ratio (OR). This script requires R on the PATH.
Usage: calculate_linkage_disequilibrium.pl <variants.xls> <cleanreads.txt> [<cleanpeptides.txt>]
variants.xls, cleanreads.txt, and cleanpeptides.txt are output files from
convert_reads_to_amino_acid.pl.
variants.xls and cleanreads.txt are required; cleanpeptides.txt is recommended as well.
e.g.,
Convert_reads.variants.xls
## Variants above threshold frequency 0.005
#Type gene position consensus variant count coverage frequency filter
codon AAM75158.1 159 AGN AGT 424 19003 0.0223 CONSENSUS_N,
codon AAM75158.1 164 TTG NTG 1838 19003 0.0967 VARIANT_N,
codon AAM75158.1 166 TGG TNG 135 19003 0.0071 VARIANT_N,
codon AAM75158.1 167 CTG CTN 108 19003 0.0057 VARIANT_N,
codon AAM75158.1 176 GAG AAG 127 19003 0.0067 PASS
head *cleanreads.txt
24002 431
#readID:gene:positionForFirstNucleotideBase cleanRead
Y3YRC_ATAAGCTT_3|AAM75158.1|158 AGCAGNTTTTACAGAAATTTGCTATGGCTGACGAAGAAGGAGAGCTCATACCCAGAGCTGAAAAATTCTTATGTGAACAAAAAAAGGAAAGAAGNCCTTGTACTGTGGGGTATTCATCACCCGCCTAACAGTAA
Y3YRC_TAGAGCTT_6|AAM75158.1|158 AGCAGNTTTTACAGAAATTTGCTATGGCTGACGAAGAAGGAGAGCTCATACCCAGAGCTGAAAAATTCTTATGTGAACAAAAAAAGGAAAGAAGNCCTTGTACTGTGGGGTATTCATCACCCGCCTAACAGTAA
Y3YRC_GGTAATCG_2|AAM75158.1|158 AGCAGNTTTTACAGAAATNTGCTATGGCTGACGAAGAAGGAGAGCTCATACCCAGAGCTGAAAAATTCTTATGTNAACAAAAAAAGGAAAGAAGNCCTTGTACTGTGGGGTATTCATCACCCGCCTAACAGTAA
head *cleanpeptides.txt
24002 143
#readID:gene:positionForFirstAminoAcid cleanPeptide
Y3YRC_ATAAGCTT_3|AAM75158.1|472 SXFYRNLLWLTKKESSYPELKNSYVNKKRKEXLVLWGIHHPPNS
Y3YRC_TAGAGCTT_6|AAM75158.1|472 SXFYRNLLWLTKKESSYPELKNSYVNKKRKEXLVLWGIHHPPNS
Y3YRC_GGTAATCG_2|AAM75158.1|472 SXFYRNXLWLTKKESSYPELKNSYVNKKRKEXLVLWGIHHPPNS
Example output:
cat Calculate_linkage_disequilibrium_report.xls
#group sample type gene comparison pos1 c1 v1 v1freq pos2 c2 v2 v2freq c1c2 c1v2 v1c2 v1v2 p-value OR FDR
Group Sample nuc AAM75158.1 526:G:A:556:A:G 526 G A 0.006468859 556 A G 0.015343491 18329 255 89 32 8.62E-31 25.82045 8.62E-31
Group Sample codon AAM75158.1 176:GAG:AAG:186:AGG:GGG 176 GAG AAG 0.006420546 186 AGG GGG 0.015355805 18315 255 88 32 6.59E-31 26.09373 6.59E-31
Group Sample aa AAM75158.1 176:E:K:186:R:G 176 E K 0.006419859 186 R G 0.015354162 18317 255 88 32 6.57E-31 26.09658 6.57E-31
group is the Group name, specified by --group_id
sample is the label for the sample, specified by --label
type is the type of variant being considered, either nuc, codon, or aa.
gene is the gene name from the variants.xls input file that the variant corresponds to
comparison is a colon-delimited combination of pos1:c1:v1:pos2:c2:v2 that can be used as a unique identifier
c1 and v1 refer to consensus and variant, respectively, in the first position
c2 and v2 refer to consensus and variant, respectively, in the second position
v1freq and v2freq are the variant frequencies from the variants.xls input file
pos1 and pos2 refer to positions (either amino acid/codon or nucleotide) of the first and second variants, respectively
c1c2 is the count of reads that have consensus at both positions
c1v2 is the count of reads that have consensus at the first position and variant at the second position
v1c1 is the count of reads that have variant at the first position and consensus at the second position
v1v2 is the count of reads that have variant at both positions
OR is the odds ratio. In this case, it is the likelihood that the sequence at the first position
affects the sequence at the second position.
OR > 1 indicates that variants for both positions or consensus sequences for both positions are often found together
OR < 1 indicates that having a variant at one position and a consensus at the other is the common trend.
OR = 1 indicates that the two positions are not linked.
* Note that the OR should not be considered meaningful unless the FDR value shows that the relationship is significant (e.g., < 0.05).
p-value is the result from the fisher's exact test
FDR is the adjusted p-value from the Benjamini & Hochberg method
Replicates
If multiple replicates for a sample are available, it is recommended that these be
processed together. The data for different replicates is not merged, but information for
all replicates is used to include all variants that pass the threshold in at least one
replicate. (The data can be merged by feeding the output of this script to
combine_linkage_values.pl .) To process replicates together, provide a comma-delimited
list of each argument type. If you are inputting files for replicates, the --group_id and
--label parameters are recommended. For example:
calculate_linkage_disequilibrium.pl --group_id Immunized --label Sample1,Sample2,Sample3 variants1.xls,variants2.xls,variants3.xls Sample1.cleanreads.txt,Sample2.cleanreads.txt,Sample3.cleanreads.txt Sample1.cleanpeptides.txt,Sample2.cleanpeptides.txt,Sample3.cleanpeptides.txt
OPTIONS:
--save Directory in which to save files. Default = pwd. If folder doesn't exist,
it will be created.
--prefix Prefix for output files. '<--group_id value>.<label[i]>' will be added to
the prefix.
--variant_threshold Minimum threshold to consider for a variant. Default is 0.001.
-p/--cpu Number of CPUs to use for parallel processing. Default = 2. Speed-up is
~0.5x per cpu. E.g., for 8 cpu, I get ~4x speedup compared to using 1 cpu.
8-12 cpus is optimal (or number of physical cores) and time estimate is most
accurate in that range as well.
--label Single label for sample or a comma-delimited list of labels if
replicates are used. Default is 'Sample'. Default when replicates are detected
is a list of numbers such as '1,2,3', depending on the number of replicates
detected.
--group_id Group id. A name to give to this sample (or samples) to use for the Group
column. Useful for naming output files that were processed together, especially
if you are going to combine multiple files afterwards. Default is 'Group'.
--table_only Don't calculate p-values, just print out the table of counts.
--force Force the analysis to run, even if expected runtime is > 24h. Default is
to only run if low range of expected is <= 24h.
--nofilter Compare all variants in variants.xls file. Default is to only consider
variants that have PASS status.
";
# To do
# Maybe add another option to specify which filters to allow (comma-delimited)
# Add threads to
# Compute quasi-cliques with DENSE. (or Cocain?). Add description of requirements (DENSE/mgqce) to usage.
# Add fdr threshold option for quasi-clique variants. Default = 0.05
# Add part to count the number of pairs with fdr <= 0.05 while printing.
# Redo time estimate after changes on 2017-02-26.
# Change log
# 2013-06-05
# Added $variant_threshold option. This way I can run this script multiple times on the same variant file with different frequency thresholds, rather than running convert_reads_to_amino_acid every time I want a different threshold.
# 2013-09-23
# Changed get_reads subroutine to use '|' as delimiter for cleanpeptides.txt and cleanreads.txt input files to separate readID, gene, position.
# Changed get_reads sub to skip PHYLIP header lines: next if (m/^\s+\d+\s+\d+/);
# 2013-09-27
# Added option --cpu and made a requirement for Parallel::Loops module. This now allows parallel processing for calculating p-values in R. Speedup is # cpu/2 (e.g., for 8 cpus, I get about 4-fold speedup).
# Added --force
# 2014-01-07
# Added -p to input to --cpu.
# Modified read_variants to adjust the count to remove comparisons of alternative alleles of the same position and also modified linkage_disequilibrium sub to make sure that if two variants have the same position that they won't be compared to one another.
# Adjust time estimation to account for the number of reads in the file.
# Ran read file size from 1000 - 9000 (every 1000) and -p from 2 to 20 (every 2) on niaid-1-11 for datapoints. Used variants file with only codons, using threshold 0.005, which gave 9 variants (36 comparisons). Computed time_per_comparison = real_time (from "time" command) * p / 36.
# Also ran comparison with the same variant file and read file sizes but -p 1 to 4 (every 1) on laptop.
# Equation from niaid-1-11 test: Y = 0.0700371 + number_reads * 0.0001235 + p * 0.1297 (Very good fit)
# Equation from laptop test: Y = -0.0520766 + number_reads * 0.0001522 + p * 0.2966441 (Very good fit)
# Equation for merged data: Y = 0.3099219 + number_reads * 0.0001317 + p * 0.1115561
# 2014-01-10
# Changed file output name to use dots '.' instead of underscores.
# 2014-04-02
# Modified get_reads() sub to also get the read_count from the id (to accomodate files that have been uniqued with fastx_collapser) and then modified the linkage_disequilibrium sub to use that count in incrementing the genotypes, e.g., $AB += $read_count;
# 2014-06-12
# Modified script to store the frequency of the variants (called by convert_reads_to_amino_acid.pl) and include that in the output report as well.
# 2014-09-06
# Modified to make it work with replicates. Added options --label --group_id, --table_only.
# Now arguments can either be regular or comma-delimited lists if replicates are provided.
# Added "group", "sample", and "comparison" columns to the output file
# Added get_common_variant_list subroutine to get a list of variants that pass the threshold in at least one of the replicates (if replicates are provided).
# ("Process samples in groups of replicates so that I consider all variants that pass the threshold in at least one replicate." )
# Modified linkage_disequilibrium subroutine to take the $variants as well (for frequency?)
# 2014-11-15
# Fixed time calculation in (and put it in get_common_variant_list) to work for multiple-file input with set of variants that are above threshold in at least one of the files. Adds up separate times based on the size of the cleanreads.txt files and then gives a total estimate.
# Fixed labels to default to 1, 2, 3, etc. if no --labels option is given and there are multiple files
# Add labels for group and Sample ids
# Changed default prefix to '<--group_id value>.<label[i]>' for multiple file input.
# Fix loop to write a separate output file for each input file if multiple file input.
# Updated example output in $usage
# Modify the parallelization to be more efficient by sending a pair of variants to compare to a thread, instead of setting up threads for only the second variant. (And change the time estimation calculation to match)
# Changed the structure of the $variants hashref in get_common_variant_list to be all hashes HoHoHoH (?) instead of HoHoAoH. It had an array in there before because I wanted to walk through each of the variants that were present and do the comparisons that way. But now, I need to do comparisons for some variants that are NOT present in a sample's variants.xls file. It will look up the c1v1, c1v2, c2v1, c2v2 counts from the cleanreads.txt and cleanpeptides.txt files anyway. (That is in the case where it was above threshold in only one of the replicates and was not found in one of the other replicates.) Now I just need to look up the frequency from that data structure. Maybe it's not necessary to keep it around....
# Changed get_common_variant_list to make an AoA of all comparisons to be performed. This way, each comparison can be separate
# Updated equation. I couldn't quite remember what I used to fit the graph previously, but I used Excel this time, threshold of 0.004, which had 20 variants (187 codons). Fit to equation Time_per_comparison = A + B * read_count + C * cpu. Allow A, B, and C change. Make column of values for "predicted_time_per_comparison" with that formula. Then another column for diff^2 (Time_per_comparison-predicted_time_per_comparison)^2. Make sum of the diff^2 column and set that to minimize. For the test, I did p = 2 to 20 (by 2) and read_counts from 1000 to 9000 (by 1000) like last time.
# Equation from niaid-1-11 test: Y = 0.54984528 + number_reads * 8.6017*10^-6 + p * 8.96667*10^-9
# Plotted results and found that optimal speed-up is found with -p 8-12.
# 2017-02-26
# Updated linkage_disequilibrium sub to compute p-values outside of the Parallel Loop in order to avoid errors with checking version of R in Statistics::R
# From To do: "Fix bug - not carrying over frequency from input variants.xls file." I checked and it seems to be fine.
# 2017-02-28
# Change prefix to be group.id even for single-sample, unless prefix is provided by user.
unless ($ARGV[1]){ print STDERR "$usage\n"; exit; }
unless ($ARGV[2]){ print STDERR "No peptide file found. Will only look at nucleotide and codon variants.\n"; }
my $start_time = time;
my @variants_files = split(/,/, $ARGV[0]);
my @clean_reads_files = split(/,/, $ARGV[1]);
my @clean_peptides_files = split(/,/, $ARGV[2]) if $ARGV[2];
my @labels = split(/,/, $label);
### Check for replicates and fix the --label default if so (see $usage)
# Throw an error if number of variants files is different than number of cleanreads files or cleanpeptide files. Required arguments
my $num_var_files = scalar(@variants_files);
my $num_clean_reads_files = scalar(@clean_reads_files);
my $num_clean_peptide_files = scalar(@clean_peptides_files) if $ARGV[2];
my $warned = 0;
if ($num_clean_reads_files != $num_var_files){
print STDERR "Number of cleanreads.txt files doesn't match the number of variants files!\n";
$warned++;
}
elsif($ARGV[2] && ($num_clean_peptide_files != $num_var_files) ){
print STDERR "Number of cleanpeptides.txt files doesn't match the number of variants files!\n";
$warned++;
}
exit if ($warned);
# Be more lenient with labels because not required argument
if ($label =~ m/^Sample$/ && $num_var_files > 1){
# User didn't supply labels
# Set new defaults for @labels
@labels = (1..$num_var_files);
}
# Get list of variants passing threshold in at least one file.
# If some are found, an estimate of the total processing time will be computed.
# If that value is > 24 hours, the script will die unless --force is also specified
# If no variants passing threshold are found, this sub will also kill the script.
my ($variants_to_consider, $comparisons) = get_common_variant_list(\@variants_files); # Get a list/hash of variants that pass the threshold in at least one replicate. Hashref, $variants->{$type}->{$gene}->{$position}->{$variant_combined_name}++;
# print "Variants to consider: \n", Dumper($variants_to_consider), Dumper($comparisons); exit;
my $save_dir = $save || Cwd::cwd();
unless (-d $save_dir){ mkdir($save_dir) or warn "$!\n" }
warn "save directory: $save_dir\n";
# Walk through each input variant file and
for (my $i = 0; $i < @variants_files; $i++){
print STDERR "Processing file $variants_files[$i]...\n\t";
my $filebase;
if ($prefix){
$filebase = $prefix . "." . $group_id . "." . $labels[$i];
}
else {
$filebase = $group_id . "." . $labels[$i];
}
my $linkage_disequil = $save_dir . "/" . $filebase . ".linkage.minfreq".$variant_threshold.".xls"; # was $prefix . "_linkage_disequilibrium_report_minfreq".$variant_threshold.".xls";
$linkage_disequil =~ s/_linkage.linkage/_linkage/;
my $linkage_disequil_fh = open_to_write("$linkage_disequil");
print $linkage_disequil_fh "#group\tsample\ttype\tgene\tcomparison\tpos1\tc1\tv1\tv1freq\tpos2\tc2\tv2\tv2freq\tc1c2\tc1v2\tv1c2\tv1v2\tp-value\tOR\tFDR\n";
my $variants = read_variants($variants_files[$i]); # HoHoAoH; first keys, type, i.e., 'nuc', 'codon', 'aa'; second keys gene; array of variants; hash with keys 'frequency', 'position', 'variant', 'consensus'
# print Dumper($variants); # exit;
# Now read the nucleotide file *cleanreads.txt into memory and get linkage for variants of type 'nuc' and 'codon'
my $clean_reads = get_reads($clean_reads_files[$i]); # print Dumper($clean_reads); exit;
my $stats = linkage_disequilibrium($clean_reads,$variants,'nuc',$labels[$i],$linkage_disequil_fh); # print Dumper($stats); exit;
$stats = linkage_disequilibrium($clean_reads,$variants,'codon',$labels[$i],$linkage_disequil_fh); # print Dumper($stats); exit; # I'll probably need a special step for codon
# Flush the variable to free up memory
undef($clean_reads);
# Now, if a cleanpeptide.txt file is available, read it into memory
exit unless $ARGV[2];
$clean_reads = get_reads($clean_peptides_files[$i]);
$stats = linkage_disequilibrium($clean_reads,$variants,'aa',$labels[$i],$linkage_disequil_fh); # Calculates pairwise linkage between variants.
close($linkage_disequil_fh);
&elapsed($start_time, "\tElapsed", $verbose);
}
&elapsed($start_time, 'Total', $verbose);
#-----------------------------------------------------------------------------
#---------------------------------- SUBS -------------------------------------
#-----------------------------------------------------------------------------
sub get_common_variant_list {
# Looks in the input file(s) to see which variants are above the threshold.
# If a variant is above the threshold frequency in at least one of the input files,
# it is retained in the list. This will be all of the variants considered for the
# comparison. This list will be used to analyze each of the input files. Therefore
# the time to analyze them can be calculated for one file and then multiplied by the
# number of input files.
# Also enumerates every pair of variants to be compared in an AoA.
my $files = shift;
my @files = @$files; # list of variant files
my $variants; # Hashref of variants above the threshold in at least one of the replicates. Format: $variants->{$type}->{$gene}->{$variant_combined_name}++;
# Analyze each file separately
for (my $i = 0; $i < @files; $i++){
my $file = $files[$i];
if ($verbose){ print STDERR "Reading file to get list of common variants: $file\n"; }
my $readfh = open_to_read($file);
while(<$readfh>){
# #Type gene position consensus variant count coverage frequency filter
# codon AAM75158.1 159 AGN AGT 424 19003 0.0223 CONSENSUS_N,
chomp;
next if (m/^#/);
my ($type,$gene,$position,$consensus,$variant,$count,$coverage,$freq,$filter) = split(/\t/);
unless($nofilter){
next unless ($filter eq "PASS");
}
next unless ($freq > $variant_threshold);
my $variant_combined_name = join ":", $position, $consensus, $variant; # position:consensus:variant
$variants->{$type}->{$gene}->{$position}->{$variant_combined_name}++;
}
close ($readfh);
}
# Did any variants pass the threshold in at least one file? If not, then no need to continue.
unless ($variants){
print STDERR "No variants passing threshold.\n";
exit;
}
if ($verbose){ print "variants\n", Dumper($variants); }
# Estimate the time to run for each file (partly based on the size of the cleanreads.txt files)
# Initialize variables for time estimation
my $total_time = 0;
my $total_comparisons = 0;
my $counts; # hashref of counts by type
my $variant_positions; # hashref of counts by type and position. I don't want to compare linkage for variants that have the same position, so I need to count the number of comparisons that would be done with variants of the same position and subtract that number from the total comparisons for each type.
foreach my $type (keys %$variants){
foreach my $gene (keys %{$variants->{$type}}){
foreach my $position (keys %{$variants->{$type}->{$gene}}){
my $num_variants_at_position = scalar(keys %{$variants->{$type}->{$gene}->{$position}});
# Increment the type and count the number of variants at a single position
$counts->{$type} += $num_variants_at_position;
$variant_positions->{$type}->{$position} += $num_variants_at_position;
}
}
}
if ($verbose){ print "var positions\n", Dumper($variant_positions); print "counts\n", Dumper($counts); }
# Estimate time for each file by passing cleanreads for each file (and $counts, $variant_positions, which is the same for all files)
for (my $i = 0; $i < @files; $i++){
my $file = $files[$i];
my $file_num = $i + 1;
print STDERR "\nFile $file_num:\t$file\n";
my ($time_this_file,$num_comparisons_this_file) = calculate_time_for_file($counts, $variant_positions, $clean_reads_files[$i]); # $counts, $variant_positions will be same for all files, but need to pass the cleanreads file so it knows how many reads are in this file.
$total_time += $time_this_file;
$total_comparisons += $num_comparisons_this_file;
}
# Now check the total time it will take for all files to see if it is less than 24 hours.
my $calculated_time_str = hours_and_minutes($total_time);
print STDERR "\n$total_comparisons total comparisons in all files. Estimated to take about $calculated_time_str with $cpu cpu(s).\n\n";
unless ($force){
if ($total_time > 86400){ # 24 hours = 86400 seconds
print STDERR "Too much time. Please choose a different variant_threshold.\n";
exit;
}
}
# Now enumerate the comparisons that will be done for each variant type
my $comparisons; # HoHoAoA, first key type; second key gene; array of 2-way comparisons
my $enumerated_comparisons;
foreach my $type (keys %$variants){ # Format: $variants->{$type}->{$gene}->{$position}->{$variant_combined_name}++;
foreach my $gene (keys %{$variants->{$type}}){
my @positions = sort {$a <=> $b} keys %{$variants->{$type}->{$gene}};
# print STDERR "type: $type positions: ", "@positions", "\n";
for (my $i = 0; $i < (@positions - 1); $i++){ # First variant position
# print STDERR "i:$i\n";
foreach my $first_var (keys %{$variants->{$type}->{$gene}->{$positions[$i]}}){ # Walk through all of the variants of the first position
# print STDERR "\t\tfirst: $positions[$i] $first_var\n";
for (my $j = $i + 1; $j < @positions; $j++){ # Second variant position. start with one position higher than the first variant position
# print STDERR "\tj:$j\n";
foreach my $second_var (keys %{$variants->{$type}->{$gene}->{$positions[$j]}}){ # walk through all of the variants of the second position.
# print STDERR "\t\t\tsecond: $positions[$j] $second_var\n";
push @{$comparisons->{$type}->{$gene}}, [ $first_var, $second_var ];
$enumerated_comparisons->{$type}++;
}
}
}
}
}
}
my $total_enumerated_comparisons = total(values %$enumerated_comparisons) * scalar(@files);
if ($verbose){ print STDERR "Enumerated comparison count: $total_enumerated_comparisons\n", Dumper($enumerated_comparisons); }
unless ($total_enumerated_comparisons == $total_comparisons){
print STDERR "Number of comparisons don't match: $total_enumerated_comparisons (enumerated) vs. $total_comparisons (calculated)\n";
}
return ($variants,$comparisons); # Maybe not necessary to return $variants... it's used in read_variants right now to only save a variant to the hash if it's in this list of variants that pass the threshold. Maybe not necessary; could save all, since it will be flushed.
}
#-----------------------------------------------------------------------------
sub calculate_time_for_file {
# Takes a hashref of counts (with type of variant as key and number of variants as value),
# calculates the total number of comparisons to be performed, then adjusts the counts
# to remove self-comparisons (because they are not performed), then applies a formula
# to get an estimate for time based on previous tests.
# In the future, it might be a good idea to give an estimate based on the current performance,
# e.g., as it is computing, get a rate of seconds per comparison and use the number of
# comparisons done and number remaining to get an estimate of time remaining and an
# adjusted total time as it is going (like in GATK).
# It might be simpler to just get rid of this sub altogether and use the enumerated_comparisons above for the calculation....
my ($counts, $variant_positions, $clean_reads_file) = @_;
my %counts = %$counts;
# Calculate the total number of comparisons by type (including self-comparisons, i.e., comparisons of variants at the same position)
my %comparisons;
foreach my $type (keys %counts){
my $k = 2;
if ($counts{$type} > 1){
$comparisons{$type} = nCk($counts{$type}, $k);
}
else {
$comparisons{$type} = 0;
}
}
# Adjust the number of comparisons by type (in %comparisons) by deducting the comparisons between variants of the same position (because these will not be performed)
foreach my $type (keys %$variant_positions){
foreach my $pos (keys %{$variant_positions->{$type}}){
my $this_count = $variant_positions->{$type}->{$pos};
if ($this_count > 1){
my $nCk = nCk($this_count,2); # Count the number of self-comparisons that would be done for the variants at this position
$comparisons{$type} -= $nCk;
if ($verbose){ print STDERR "Removing $nCk comparison(s) for $type\n"; }
}
}
}
if ($verbose){ print Dumper(\%comparisons); }
my @grthn2; # Array of variant types that have at least two positions.
my $total_comparisons = 0;
my $calculated_time = 0;
if ($variant_positions){
print STDERR "Variants passing threshold:\n";
foreach my $type (sort keys %$variant_positions){
my $num_variants = $counts{$type};
my $num_positions = scalar (keys %{$variant_positions->{$type}});
my $num_comparisons = $comparisons{$type};
print STDERR "$type $num_variants\tat $num_positions positions\t($num_comparisons comparisons)\n";
push (@grthn2, $type) if ($num_positions >= 2);
}
if (@grthn2){
print STDERR "Will compare linkage at the level of " . join " ", @grthn2;
print STDERR "\n";
unless($table_only){ # No need to give a time estimate if we aren't going to compute p-values.
# Calculate time based on number of comparisons (range from ~15-25 seconds per comparison -- that probably depends a lot on the number of reads in the cleanreads.txt file...)
my $read_count = `cat $clean_reads_file | wc -l` - 2;
if($verbose){ print STDERR "read count: $read_count\n"; }
foreach my $type (keys %comparisons){
my $this_comparisons = $comparisons{$type};
$total_comparisons += $this_comparisons;
my $n = $counts{$type};
my $effective_cpus = range([$n, $cpu], 'min'); # Will be different depending on the number of variants of this type (i.e., if you ask for 8 cpus, but there are only variants at 5 positions, you can only run 5 comparisons at a time -- 5 "effective" cpus). ** Note that when I've changed the script to run each comparison in parallel (instead of each position in parallel), I'll need to change this estimation method. It will be faster.
# my $this_time = $this_comparisons * (0.3099219 + $read_count * 0.0001317 + $effective_cpus * 0.1115561) / $effective_cpus; # The equation calculates the estimated user time that will be used; need to divide by the number of cpus to get the wall time. From 2014-01-07
# my $this_time = $this_comparisons * (0.54984528 + $read_count * 8.6017e-6 + $effective_cpus * 8.96667e-9) / $effective_cpus; # From 2014-11-15 New equation using Excel Solver for all datapoints.
my $this_time = $this_comparisons * (0.31 + $read_count * 4e-5) / $effective_cpus; # From 2014-11-15 just using all data from p 8, p 10, p 12 -> linear regression for x = read_count, y = time_per_comparison. R^2 value 0.97566
$calculated_time += $this_time;
if($verbose){ print "effective_cpus: $effective_cpus\n"; }
if($verbose){ print "Time for $type: $this_time\tRead counts: $read_count\n"; }
}
}
}
else {
print STDERR "Not enough variants to make any comparisons.\n";
exit;
}
}
else {
print STDERR "No variants passing threshold.\n";
exit;
}
if($verbose){ print "Time for this file: $calculated_time\n"; }
return ($calculated_time, $total_comparisons);
# my ($high_range, $low_range) = (0,0);
# $high_range += $nCk * 25 / $effective_cpus;
# $low_range += $nCk * 8 / $effective_cpus;
# my ($high_range_str, $low_range_str) = ( hours_and_minutes($high_range), hours_and_minutes($low_range) );
# print STDERR "$comparison_count total comparisons. May take between $low_range_str and $high_range_str to complete with $cpu cpus.\n";
}
#-----------------------------------------------------------------------------
sub nCk {
my ($n,$k) = @_;
my $nCk = factorial($n)/(factorial($k)*factorial($n-$k)); # n!/(k!*(n-k)!)
return $nCk;
}
#-----------------------------------------------------------------------------
sub factorial {
# From http://www.catonmat.net/blog/perl-one-liners-explained-part-three/
use bignum; # Changed from bigint to bignum so I can do float-point math with the results.
my $num = shift;
my $f = 1;
$f *= $_ for 1..$num;
return $f;
}
#-----------------------------------------------------------------------------
sub enumerate_comparisons {
# Takes an arrayref of variants of format "position:consensus:variant"
# Returns an AoA of all comparisons to perform. Each sub-array will have two values
my $variants = shift;
my $comparisons; # AoA to return
for (my $i = 0; $i < @$variants; $i++){ # First variant
for (my $j = $i + 1; $j < (@$variants); $j++){
push @{$comparisons}, [ $variants->[$i], $variants->[$j] ];
}
}
return $comparisons;
}
#-----------------------------------------------------------------------------
sub read_variants {
my $file = shift;
my $variants; # HoHoA to store the variants being read from the file.
my $readfh = open_to_read($file);
while(<$readfh>){
# #Type position consensus variant count coverage frequency filter
# codon AAM75158.1 159 AGN AGT 424 19003 0.0223 CONSENSUS_N,
chomp;
next if (m/^#/);
my ($type,$gene,$position,$consensus,$variant,$count,$coverage,$freq,$filter) = split(/\t/);
unless($nofilter){
next unless ($filter eq "PASS");
}
my $variant_combined_name = join ":", $position, $consensus, $variant; # position:consensus:variant
# print STDERR "$variant_combined_name\n";
next unless (exists($variants_to_consider->{$type}->{$gene}->{$position}->{$variant_combined_name})); # Check to see that it was above threshold in at least one of the replicates
# print STDERR "\tpass\n";
# Store the variant info in a hash
my %hash = (
'position' => $position,
'variant' => $variant,
'consensus' => $consensus,
'frequency' => $freq,
);
# Add variant to the array.
# push @{$variants->{$type}->{$gene}}, \%hash;
# Add variant to the hash.
$variants->{$type}->{$gene}->{$variant_combined_name} = \%hash;
}
close ($readfh);
return $variants;
}
#-----------------------------------------------------------------------------
sub get_reads {
my $file = shift;
my $reads; # Save the reads in an AoH. Hashes will have keys 'gene', 'pos', 'seq'
my $readfh = open_to_read($file);
while(<$readfh>){
# #readID:gene:positionForFirstNucleotideBase cleanRead
# Y3YRC:ATAAGCTT:3|AAM75158.1|158 AGCAGNTTTTACAGAAATTTGCTATGGCTGACGAAGAAGGAGAGCTCATACCCAGAGCTGAAAAATTCTTATGTGAACAAAAAAAGGAAAGAAGNCCTTGTACTGTGGGGTATTCATCACCCGCCTAACAGTAA
# #readID:gene:positionForFirstAminoAcid cleanPeptide
# Y3YRC:ATAAGCTT:3|AAM75158.1|472 SXFYRNLLWLTKKESSYPELKNSYVNKKRKEXLVLWGIHHPPNS
# OR
# #readID|gene|positionForFirstNucleotideBase cleanRead
# 1-42|Seq12_093009_HA_cds|382 TTCGAAAGATTCAAAATATTTCCCAAAGAAAGCTCATGGCCCGACCACAACACAAACGGAGTAACGGCAGCATGCTCCCATGAGGGGAAAAACAGTTTTTACAGAAATTTGCTATGGCTGACGAAGAAGGAGAGCTCATACCCAGAGCTGAAAAATTCTTATGTGAACAAAAAAAGGAAAGAAGTCCTTGTACTGTGGGGTATTCATCNNNNNNNTAACAGTAAGGAACAACAGAATCTCTATCAGAATGAAAATGCTTATGTCTCTGTAGTGACTTCAAATTATAACAGGAGATTTACCCCGGAAATAGCAGAAAGACCCAAAGTAAAAGGTCAAGCTGGGAGGATGAACTATTACTGGACCTTGCTAAAACCCGGAGACACAATAATATTTGAGGCAAATGGAAATCTAATAGCACCAATGTATGCTTTC
# #readID|gene|positionForFirstAminoAcid cleanPeptide
# 1-42|Seq12_093009_HA_cds|128 FERFKIFPKESSWPDHNTNGVTAACSHEGKNSFYRNLLWLTKKESSYPELKNSYVNKKRKEVLVLWGIHXXXNSKEQQNLYQNENAYVSVVTSNYNRRFTPEIAERPKVKGQAGRMNYYWTLLKPGDTIIFEANGNLIAPMYAF
chomp;
next if (m/^#/);
next if (m/^\s+\d+\s+\d+/); # First line of file, prepended as a PHYLIP header
my ($name,$read) = split(/\s+/);
my @name = split(/\|/,$name); # Was split(/_/,$name);
my $position = pop(@name);
my $gene = pop(@name);
# Get sequence count from id. If the id has format number - number, then it will be assumed the second number is the read count.
my $id = shift(@name);
my ($num,$seq_count) = (1,1);
if ($id =~ m/^\d+-\d+$/){
($num,$seq_count) = split(/-/, $id);
}
my %hash = (
'gene' => $gene,
'position' => $position,
'seq' => $read,
'count' => $seq_count,
);
push @$reads, \%hash;
}
close ($readfh);
return $reads;
}
#-----------------------------------------------------------------------------
sub linkage_disequilibrium {
my ($reads, $variants, $type, $label, $linkage_disequil_fh) = @_;
# $reads is a AoH where hashes have keys 'gene', 'pos', 'seq'
# $type will decide how to split up the read/sequence (either into single characters or multiple
print STDERR "Processing variants of type '$type'\n";
my $this_variants = $variants->{$type}; #HoAoH, where first key is gene, and keys of deepest hash are 'position', 'variant', 'consensus'
my $this_variants_to_consider = $variants_to_consider->{$type}; # Format of $variants_to_consider : $variants->{$type}->{$gene}->{$variant_combined_name}++;
my $maxProcs = $cpu;
my $pl = Parallel::Loops->new($maxProcs);
# print Dumper($this_variants); # exit;
my @stats; # AoA to save data. Need to save it all first so I can then calculate the FDR from the p-values.
$pl->share(\@stats); # Make the array shared between fork children
foreach my $gene (keys %$this_variants_to_consider){
# Walk through each variant doing all pairwise comparisons
my @variants = sort keys %{$this_variants_to_consider->{$gene}};
my $count = 0;
my $comparison = ""; # initialize. Single array of two elements.
$pl->while( sub {$count++; $comparison = $comparisons->{$type}->{$gene}->[$count - 1]; }, sub { # was $pl->foreach( \@primerIDs, sub {
my @variants_to_compare = @$comparison;
# if($verbose){ print Dumper(\@variants_to_compare); }
my ($first_pos,$first_cons,$first_var) = split(/:/, $comparison->[0]);
my ($second_pos,$second_cons,$second_var) = split(/:/, $comparison->[1]);
my ($first_var_freq,$second_var_freq) = (0,0); # Default freq if not found in the $this_variants hash
$first_var_freq = $this_variants->{$gene}->{$comparison->[0]}->{frequency} if (exists($this_variants->{$gene}->{$comparison->[0]}));
$second_var_freq = $this_variants->{$gene}->{$comparison->[1]}->{frequency} if (exists($this_variants->{$gene}->{$comparison->[1]}));
next if ($first_pos == $second_pos); # This shouldn't ever happen, but just in case.
# Now walk through the $reads AoH and count up the variants at each of these positions
my $AB = 0; # consensus at both positions. A is first position, B is second position.
my $Ab = 0; # consensus at first position, variant at second position
my $aB = 0; # variant at first position, consensus at second position
my $ab = 0; # variant at both positions
foreach my $read (@$reads){
# print calculate_linkage_disequilibrium.pl Convert_reads_variants_minfreq0_codon.xls($read); print "gene: $gene\n";
next unless ($gene eq $read->{'gene'}); # Only consider a read if it was mapped to the gene that the variant came from, otherwise the positions won't be accurate.
my ($read_pos,$read_seq,$read_count) = ($read->{'position'},uc($read->{'seq'}),$read->{'count'});
if ($type eq 'codon'){
$read_pos = ($read_pos + 2) / 3; # Convert nucleotide position to amino acid/codon position. (Add 2 before dividing by 3 because the position refers to the first base of the codon, so dividing that by 3 will not give a whole number.)
}
# print "read: $read->{'gene'}, $read_pos, $read_seq\n";
my @seq;
if ($type =~ m/nuc|aa/){
@seq = split(/|/,$read_seq); # nuc or aa, split into single characters
}
else {
@seq = unpack("(a3)*", $read_seq); # codon, split into triplets
}
# print join " ", @seq, "\n";
my $first_pos_in_read = $first_pos - $read_pos;
my $second_pos_in_read = $second_pos - $read_pos;
# print "positions in read: $first_pos_in_read, $second_pos_in_read\n";
next unless($seq[$first_pos_in_read] && $seq[$second_pos_in_read]);
unless ($seq[$first_pos_in_read] && $first_cons && $seq[$second_pos_in_read] && $second_cons && $first_var && $second_var){
print STDERR "One of the positions or variants is not right:\n";
print STDERR join ", ", $first_pos_in_read, $seq[$first_pos_in_read], $second_pos_in_read, $seq[$second_pos_in_read], $first_cons, $second_cons, $first_var, $second_var; print STDERR "\n$read_seq\n"; exit;
}
if ( ($seq[$first_pos_in_read] eq $first_cons) && ($seq[$second_pos_in_read] eq $second_cons) ){
$AB += $read_count;
}
elsif( ($seq[$first_pos_in_read] eq $first_cons) && ($seq[$second_pos_in_read] eq $second_var) ){
$Ab += $read_count;
}
elsif( ($seq[$first_pos_in_read] eq $first_var) && ($seq[$second_pos_in_read] eq $second_cons) ){
$aB += $read_count;
}
elsif( ($seq[$first_pos_in_read] eq $first_var) && ($seq[$second_pos_in_read] eq $second_var) ){
$ab += $read_count;
}
else {
# This is possible to happen, I suppose. Doesn't match these variants or consensus. Don't count it.
my $first_pos_in_read_1_based = $first_pos_in_read + 1;
my $second_pos_in_read_1_based = $second_pos_in_read + 1;
print STDERR "Either variants ($first_pos: $first_var, $second_pos: $second_var) or consensus ($first_pos: $first_cons, $second_pos: $second_cons) doesn't match, so not counting. Actual is $first_pos ($first_pos_in_read_1_based in read): $seq[$first_pos_in_read], $second_pos ($second_pos_in_read_1_based in read): $seq[$second_pos_in_read]. Probably a read with a variant that was below your threshold or else a position with more than 2 variants. Or, you have multiple samples and this variant is present in one sample but not the other.\n", Dumper($read) if ($verbose);
}
}
# print "AB: $AB, Ab: $Ab, aB: $aB, ab: $ab\n";
my $comparison_combined_name = join ":", $first_pos, $first_cons, $first_var, $second_pos, $second_cons, $second_var; # position:consensus:variant
#group sample type gene comparison pos1 c1 v1 v1freq pos2 c2 v2 v2freq c1c2 c1v2 v1c2 v1v2 p-value OR FDR
# (p-value OR FDR) columns will be added outside of the parallel loop
my @array = ( $group_id, $label, $type, $gene, $comparison_combined_name, $first_pos, $first_cons, $first_var, $first_var_freq, $second_pos, $second_cons, $second_var, $second_var_freq, $AB, $Ab, $aB, $ab );
push @stats, \@array;
if ($count % 100 == 0){
print STDERR "Processed $count comparisons. ";
&elapsed($start_time, 'Elapsed', 1); # $verbose
}
}); # End of parallel loop
}
my $stats = \@stats;
# Compute p-values from contingency table counts
my $R = Statistics::R->new(); # http://search.cpan.org/~fangly/Statistics-R-0.34/lib/Statistics/R.pm
my ($p,$OR);
for (my $i = 0; $i < @$stats; $i++){
if ($table_only){
# Don't compute p-values
($p,$OR) = ("NA","NA");
}
else {
# Now get the p-value using Fisher's exact test in R
my ($AB,$Ab,$aB,$ab) = @{$stats->[$i]}[-4..-1];
$R->set('x', [$AB,$Ab,$aB,$ab]);
$R->run(q`y = matrix(x,nrow=2)`);
$R->run(q`temp = fisher.test(y)`);
$R->run(q`p = temp$p.value`);
$R->run(q`or = temp$estimate`);
$OR = $R->get('or');
$p = $R->get('p');
}
push @{$stats->[$i]}, ($p,$OR);
}
# Now get the FDR-adjusted p-value using Benjamini & Hochberg method
# print Dumper($stats);
my @pvalues;
foreach (@$stats){
push @pvalues, $_->[-2];
}
# print join " ", @pvalues; print "\n";
if (scalar(@pvalues)>1){
$R->set('x',\@pvalues);
$R->run(q`y=p.adjust(x,method="BH",n=length(x))`); # z$adj.p.value = p.adjust(z$p.value, method="BH",n = sum(!is.na(z$p.value)))
my $fdr_array = $R->get('y');
# print join " ", @$fdr_array; print "\n";
die "Error in calculating FDR adjusted p-values: different number of p-values and FDR values\n", join " ", @pvalues, "\n", @$fdr_array, "\n" unless (scalar(@$fdr_array) == scalar(@$stats));
for (my $i=0; $i < @$fdr_array; $i++){
push @{$stats->[$i]}, $fdr_array->[$i];
}
}
else {
# There is only one item in @$stats array
# Put the same value for FDR as for p-value.
push @{$stats->[0]}, $stats->[0]->[-2];
}
# Print out the results
foreach (@$stats){
if (scalar(@{$_}) > 1){
print $linkage_disequil_fh join "\t", @{$_};
print $linkage_disequil_fh "\n";
}
}
return $stats; # Not really necessary since this variants have already been printed, but might be nice to return this value for debugging purposes.
}
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------