-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_outlier_rates_from_iqtree_files.pl
134 lines (91 loc) · 3.12 KB
/
find_outlier_rates_from_iqtree_files.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env perl
###########################################################################################################################################################
# Extract rates for partitions from iqtree output file. Calculate median quartile 1 and 3 for collection of rate values
# Remove fasta files with outlier rates from working directory
###########################################################################################################################################################
use strict;
use warnings;
my %hash_of_rates=();
my %hash_of_ids=();
#Extract rates
my @fasta_files=glob "*.fas";
chomp @fasta_files;
my $rates_file=shift @ARGV;
my $ids_file=shift @ARGV;
open my $fh, '<', $rates_file or die"Could not open file \"$rates_file\":$!\n";
open my $fh_2, '<', $ids_file or die"Could not open file \"$ids_file\":$!\n";
#read rate file from iqtree
while(my $line=<$fh>){
chomp $line;
my @elements=split (" ", $line);
$elements[0]=~s/ //g;
$elements[2]=~s/ //g;
$hash_of_rates{$elements[0]}=$elements[2];
}
close $fh;
#read ids
while(my $line=<$fh_2>){
chomp $line;
my @elements=split (" ", $line);
$elements[0]=~s/ //g;
$elements[1]=~s/ //g;
$hash_of_ids{$elements[0]}=$elements[1];
}
close $fh_2;
my @values=values %hash_of_rates;
my @sorted_values=sort { $a <=> $b } @values;
my $no=scalar @sorted_values;
my @lower_half;
my @upper_half;
#Calculate median and quartiles of rate values for the dataset
my $median;
my $Q1;
my $Q3;
if (@sorted_values%2==0) {
$median=($sorted_values[(@sorted_values/2)-1]+$sorted_values[(@sorted_values/2)])/2;
#calculate Q1, Q3 for even number of rates
for(my $i=0; $i<=(@sorted_values-1); ++$i){
if ($i<=((@sorted_values/2)-1)){
push (@lower_half, $sorted_values[$i])
}
else{
push (@upper_half, $sorted_values[$i])
}
}
$Q1= $lower_half[(@lower_half/2)+0.5];
$Q3= $upper_half[(@upper_half/2)+0.5];
}
else{
$median= $sorted_values[(@sorted_values/2)+0.5];
#print "$median\n";
#calculate Q1, Q3 for odd number of rates
for(my $i=0; $i<=(@sorted_values-1); ++$i){
if ($i<=((@sorted_values/2)-1)){
push (@lower_half, $sorted_values[$i])
}
else{
push (@upper_half, $sorted_values[$i])
}
}
#calculate Q1, Q3 for odd number of rates
$Q1=$median=($lower_half[(@lower_half/2)-1]+$lower_half[@lower_half/2])/2;
$Q3=($upper_half[(@upper_half/2)-1]+$upper_half[@upper_half/2])/2;
}
print scalar @lower_half, "\n";
print scalar @upper_half, "\n";
print "@lower_half", "\n";
print "@upper_half", "\n";
print "$Q1", "\n";
print "$Q3", "\n";
my $outlier_threshold2=$Q3+(($Q3-$Q1)*1.5);
my $outlier_threshold1=$Q1-(($Q3-$Q1)*1.5);
print "$outlier_threshold1", "\n";
print "$outlier_threshold2", "\n";
my %hash_new;
foreach (sort (keys %hash_of_rates)){
if (exists ($hash_of_ids{$_})) {
next unless -f "$hash_of_ids{$_}.fas";
system"rm $hash_of_ids{$_}.fas" if $hash_of_rates{$_}>$outlier_threshold2;
system"rm $hash_of_ids{$_}.fas" if $hash_of_rates{$_}<$outlier_threshold1;
}
}