-
Notifications
You must be signed in to change notification settings - Fork 0
/
aggreagated_analysis.py
130 lines (106 loc) · 5.07 KB
/
aggreagated_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This script aggregates the results of multiple model analyses. This can be useful when there are multiple splits
of train and test data and the models coming from the different train splits are to be compared, or when evaluating
multiple models of the same target based on different sets of known active and inactive compounds.
The script takes a directory as its input and finds all model analyses (*.am) in it. From each file, the top N
(parameter of the script) features with the highest likelihood ratios are taken. The output reports in how many
models a given feature value was found among the top N most important features. Since Bayescreen does binning,
we track ranges of values, not the values themselves. However, in each model the ranges can differ slightly due to
different min-max normalization. Two ranges are thus deemed identical if they intersect. Each feature value/range
is formatted as follows:
feature_name(range_of_values): average_likelihood
"""
import argparse
import common
# Module metadata.
__author__ = "David Hoksza"
__email__ = "[email protected]"
__license__ = 'X11'
# File extension (without the leading dot) of the model analysis files; it is used to build the
# "*.<suffix>" search pattern and appears in the command line help text.
model_suffix = "am"
def get_overlap(a, b):
    """
    Computes the length of the intersection of two intervals.

    The result is meant to be used in a boolean context: it is positive (truthy) when the intervals
    share a stretch of non-zero length and 0 (falsy) otherwise. Intervals that merely touch in a
    single point therefore count as not intersecting.

    :param a: Two-element sequence (lower bound, upper bound) representing an interval
    :param b: Two-element sequence (lower bound, upper bound) representing an interval
    :return: Length of the common part of the two intervals, or 0 if there is none
    """
    upper = min(a[1], b[1])
    lower = max(a[0], b[0])
    span = upper - lower
    # A non-positive span means the intervals are disjoint or only touch.
    return span if span > 0 else 0
def read_model(fm, n=None):
    """
    Reads in a model analysis in the format generated by the analyze_model script.

    Lines are skipped until one containing "Features values importance" is encountered. From that
    line on, every line that splits into exactly four comma-separated fields is treated as a feature
    record: field 0 is the likelihood ratio, field 1 the feature name and field 3 the interval written
    as "(low;high)". Reading stops as soon as the requested number of records has been collected.

    :param fm: Iterable of text lines, e.g. an opened model analysis file
    :param n: Maximum number of feature records to read. If None, the value of the command line
              option stored in the module-level ``args.n`` is used (this requires the module to be
              run as a script).
    :return: List of dictionaries with the keys "name", "ratio", "interval" ([low, high]) and "cnt"
    """
    if n is None:
        # Backward-compatible default: fall back to the command line option parsed in __main__.
        n = args.n

    features = []
    in_section = False
    for line in fm:
        if "Features values importance" in line:
            in_section = True
        if not in_section:
            continue

        s_line = line.split(",")
        if len(s_line) != 4:
            # Not a feature record (e.g. the section header itself).
            continue

        features.append({
            "name": s_line[1],
            "ratio": common.to_float(s_line[0]),
            "interval": [common.to_float(x) for x in s_line[3].strip(" \n").strip("()").split(";")],
            # Every freshly read feature value has been seen in exactly one model so far.
            "cnt": 1
        })
        if len(features) >= n:
            break
    return features
def merge_features(features, aux_features):
    """
    Merges the feature values of one model into the list of feature values accumulated so far.

    Each value from aux_features is compared with every value already present in features. When the
    names are equal and the intervals intersect, the accumulated value absorbs the new one: its
    interval is widened to cover both, the likelihood ratio is added to its sum and its counter is
    increased. A value that intersects with nothing is appended as a new entry.

    :param features: Accumulated feature values (dicts with "name", "interval", "ratio", "cnt");
                     the list and its items are modified in place
    :param aux_features: Feature values (same dict layout) to be merged in
    :return: The features list passed in, after merging
    """
    for candidate in aux_features:
        hits = 0
        # Appending happens only after this scan, so the scan never sees the candidate itself.
        for known in features:
            if candidate["name"] != known["name"]:
                continue
            if not get_overlap(candidate["interval"], known["interval"]):
                continue
            low = min(candidate["interval"][0], known["interval"][0])
            high = max(candidate["interval"][1], known["interval"][1])
            known["interval"] = [low, high]
            known["ratio"] += candidate["ratio"]
            known["cnt"] += 1
            hits += 1
        if hits == 0:
            features.append(candidate)
    return features
def analyze_models(dir):
    """
    Merges the top N feature values of all model analyses found in a directory and prints a report.

    The feature values are grouped by the number of times they were merged (their "cnt"), groups with
    the highest count are printed first, and within a group the values are ordered by their summed
    likelihood ratio. Each value is printed as "name(low;high): average_ratio". Finally a one-line
    "Compressed" summary of all values is printed.

    :param dir: Directory to scan for files with the model suffix
    :return: None, the report is written to the standard output
    """
    merged = []
    pattern = "*.{}".format(model_suffix)
    for path in common.find_files_recursively(dir, pattern):
        with common.open_file(path) as handle:
            merged = merge_features(merged, read_model(handle))

    # Group the feature values by the number of merges they took part in.
    by_count = {}
    for feature in merged:
        by_count.setdefault(feature["cnt"], []).append(feature)

    compressed = ""
    for count in sorted(by_count, reverse=True):
        print(count)
        ranked = sorted(by_count[count], key=lambda item: item["ratio"], reverse=True)
        for feature in ranked:
            low, high = feature["interval"][0], feature["interval"][1]
            # "ratio" holds the sum over all merged values, hence the division by the counter.
            mean_ratio = feature["ratio"] / feature["cnt"]
            print("{}({};{}): {}".format(feature["name"], low, high, mean_ratio))
            compressed = "{}, {}-{} ({} - {:.2f})".format(
                compressed, feature["name"], round(high - low, 1), count, mean_ratio)
        print("")

    print("Compressed:")
    # The accumulated string starts with a separator, which is stripped here.
    print(compressed.strip(", "))
if __name__ == '__main__':
    # Command line entry point. The parsed options are kept in the module-level name `args`
    # because read_model() looks up args.n.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-d", "--directory",
        help="Directory with the analyzed models (.{} suffix).".format(model_suffix),
        required=True,
    )
    cli.add_argument(
        "-n",
        help="Top n features to be considered in each model",
        default=20,
        type=int,
    )
    args = cli.parse_args()
    analyze_models(args.directory)