utils.py
import pandas as pd
import numpy as np
import os
import shutil
from datetime import datetime
import matplotlib.pyplot as plt
from evaluate_training import find_threshold, signal_llp_efficiencies, bkg_falsePositives, \
make_multi_roc_curve, plot_prediction_histograms
from keras.utils import np_utils


def create_directories(model_name, file_name):
"""Creates directories to store model plots + Keras files and returns directory name."""
# Append time/date to directory name
creation_time = str(datetime.now().strftime('%Y-%m-%d_%H:%M:%S/'))
dir_name = model_name + file_name + "_" + creation_time
# Create directories
os.makedirs("plots/" + dir_name)
print("Directory plots/" + dir_name + " created!")
os.makedirs("keras_outputs/" + dir_name)
print("Directory keras_outputs/" + dir_name + " created!")
return dir_name


def load_dataset(file_name):
"""Loads .pkl file, does some pre-processing and returns Pandas DataFrame"""
# Load dataset
df = pd.read_pickle(file_name)
# Replace infs with nans
df = df.replace([np.inf, -np.inf], np.nan)
# Replace nans with 0
df = df.fillna(0)
# Delete some 'virtual' variables only needed for pre-processing
del df['track_sign']
del df['clus_sign']
# Delete track_vertex vars in tracks
vertex_delete = [col for col in df if col.startswith("nn_track_vertex_x")]
vertex_delete += [col for col in df if col.startswith("nn_track_vertex_y")]
vertex_delete += [col for col in df if col.startswith("nn_track_vertex_z")]
for item in vertex_delete:
del df[item]
# Print sizes of inputs for signal, qcd, and bib
print("\nLength of Signal is: " + str(df[df.label == 1].shape[0]))
print("Length of QCD is: " + str(df[df.label == 0].shape[0]))
print("Length of BIB is: " + str(df[df.label == 2].shape[0]))
return df


def process_kfold_run(roc_results, acc_results, model_names, model_file_names, name_list, seed):
"""Generates ROC AUC and accuracy plots from KFold run and saves results to a .txt file"""
print("\nPlotting KFold Cross Validation results...\n")
creation_time = str(datetime.now().strftime('%m-%d_%H:%M'))
kfold_dir = "kfold_" + creation_time
# create directory for kfold plots
os.makedirs("plots/" + kfold_dir)
# move model files to kfold directory
for f in model_file_names:
shutil.move("plots/" + f, "plots/" + kfold_dir + "/" + f)
# plot roc auc scores
fig = plt.figure()
fig.suptitle('Model Comparison with ROC AUC metric')
ax = fig.add_subplot(111)
ax.yaxis.grid(True)
plt.boxplot(roc_results)
ax.set_xticklabels(model_names)
fig.savefig("plots/" + kfold_dir + "/kfold_cv_roc.pdf", format="pdf", transparent=True)
# plot accuracy scores
fig = plt.figure()
fig.suptitle('Model Comparison with accuracy metric')
ax = fig.add_subplot(111)
ax.yaxis.grid(True)
plt.boxplot(acc_results)
ax.set_xticklabels(model_names)
fig.savefig("plots/" + kfold_dir + "/kfold_cv_acc.pdf", format="pdf", transparent=True)
# save results to file
f = open("plots/" + kfold_dir + "/kfold_data.txt", "w+")
f.write("File name list\n")
f.write(str(name_list))
f.write("\nModel list\n")
f.write(str(model_names))
f.write("\nROC AUC data\n")
f.write(str(roc_results))
f.write("\nAccuracy data\n")
f.write(str(acc_results))
f.write("\nSeed\n")
f.write(str(seed))
f.close()


def process_grid_search_run(roc_results, acc_results, model_file_names, lr_values, reg_values, filters_cnn_constit, filters_cnn_track, filters_cnn_MSeg):
"""Aggregates model files and metrics built during grid search"""
print("\nProcessing grid search results...\n")
creation_time = str(datetime.now().strftime('%m-%d_%H:%M'))
gridSearch_dir = "gridSearch_" + creation_time
print("\nSuccessfully trained %.0f models\n" % len(model_file_names))
    # ensure the grid search output directory exists before moving the model files into it
    os.makedirs("plots/" + gridSearch_dir, exist_ok=True)
    for f in model_file_names:
        shutil.move("plots/" + f, "plots/" + gridSearch_dir + "/" + f)
    # rank the models by ROC AUC: argsort of the negated scores gives a best-first
    # ordering, and argsort of that ordering gives each model's rank (0 = best)
order = np.argsort(-1 * np.asarray(roc_results))
ranks = order.argsort()
    # rebuild the hyper-parameter configuration of each model, assuming the same
    # nesting order (learning rate -> regularization -> filter set) used during training
lr = []
reg = []
constit = []
track = []
mseg = []
for l in lr_values:
for r in reg_values:
for i in range(len(filters_cnn_track)):
lr.append(l)
reg.append(r)
constit.append(filters_cnn_constit[i][-1])
track.append(filters_cnn_track[i][-1])
mseg.append(filters_cnn_MSeg[i][-1])
# put data into a pandas DataFrame
columns = ['learning_rate', 'regularization', 'cnn_final_layer_constit', 'cnn_final_layer_track', 'cnn_final_layer_MSeg', 'roc_score', 'acc_score', 'rank']
data_list = [lr, reg, constit, track, mseg, roc_results, acc_results, ranks]
data = {}
for i in range(len(columns)):
data[columns[i]] = data_list[i]
df = pd.DataFrame(data, columns=columns)
# save df to file
df.to_csv("plots/" + gridSearch_dir + "/df.csv", index=False)
# print out correlation matrix
print(df.corr())
# save results to file
f = open("plots/" + gridSearch_dir + "/gridSearch_data.txt", "w+")
f.write("Model list\n")
f.write(str(model_file_names))
f.write("\n\nROC AUC data\n")
f.write(str(roc_results))
f.write("\n\nAccuracy data\n")
f.write(str(acc_results))
f.write("\n\nRank\n")
f.write(str(ranks))
f.write("\n\nLearning values\n")
f.write(str(lr_values))
f.write("\n\nRegularization values\n")
f.write(str(reg_values))
f.write("\n\nFilters constit\n")
f.write(str(filters_cnn_constit))
f.write("\n\nFilters track\n")
f.write(str(filters_cnn_track))
f.write("\n\nFilters MSeg\n")
f.write(str(filters_cnn_MSeg))
f.close()


def evaluate_model(model, dir_name, X_test, y_test, weights_test, Z_test, mcWeights_test, n_folds):
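    """Evaluates a trained model on the test set.

    Computes the test accuracy, plots prediction histograms and a ROC curve
    family at a fixed BIB efficiency, appends the metrics to
    training_details.txt, and returns (roc_auc, test_acc).
    """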
    # evaluate the model using the Keras API
acc_index = model.metrics_names.index('main_output_categorical_accuracy')
# model.evaluate expects target data to be the same shape/format as model.fit
y_eval = np_utils.to_categorical(y_test)
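    # the network has 5 outputs (main output + 4 LSTM monitoring outputs), all trained against the same labels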
y_eval = [y_eval, y_eval, y_eval, y_eval, y_eval]
# get accuracy of model on test set
test_acc = model.evaluate(X_test, y_eval, verbose=1, sample_weight=weights_test)[acc_index]
# make predictions
prediction = model.predict(X_test, verbose=0) # currently expects X to be a list of length 4 (model has 4 inputs)
prediction = prediction[0] # model currently has 5 outputs (1 main output, 4 outputs for monitoring LSTMs)
# Sum of MC weights
bib_weight = np.sum(mcWeights_test[y_test == 2])
sig_weight = np.sum(mcWeights_test[y_test == 1])
qcd_weight = np.sum(mcWeights_test[y_test == 0])
bib_weight_length = len(mcWeights_test[y_test == 2])
sig_weight_length = len(mcWeights_test[y_test == 1])
qcd_weight_length = len(mcWeights_test[y_test == 0])
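    # Rescale the MC weights per class so that each class's weights sum to its
    # number of events (i.e. the average weight within every class becomes 1)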
mcWeights_test[y_test == 0] *= qcd_weight_length / qcd_weight
mcWeights_test[y_test == 2] *= bib_weight_length / bib_weight
mcWeights_test[y_test == 1] *= sig_weight_length / sig_weight
destination = "plots/" + dir_name + "/"
plot_prediction_histograms(destination, prediction, y_test, mcWeights_test)
# This will be the BIB efficiency to aim for when making ROC curve
threshold = 1 - 0.0316
    # third_label: the class held at a fixed efficiency for this ROC 'family'; the other two classes define the curve itself
third_label = 2
# We'll be writing the stats to training_details.txt
f = open(destination + "training_details.txt", "a")
if n_folds:
f.write("\nKFold iteration # %s" % str(n_folds))
f.write("\nEvaluation metrics\n")
    # Find the score threshold at which the required percentage of the third_label class is correctly predicted
test_threshold, leftovers = find_threshold(prediction, y_test, threshold * 100, third_label)
# Make ROC curve of leftovers, those not tagged by above function
bkg_eff, tag_eff, roc_auc = make_multi_roc_curve(prediction, y_test, mcWeights_test, test_threshold, third_label,
leftovers)
# Write AUC to training_details.txt
f.write("Threshold: %s, ROC AUC: %s\n" % (str(-threshold + 1), str(roc_auc)))
f.write("Accuracy: %s\n" % str(test_acc))
print("AUC: " + str(roc_auc))
# Make ROC curve
plt.plot(tag_eff, bkg_eff, label=f"BIB Eff: {threshold :.3f}" + f", AUC: {roc_auc:.3f}")
plt.xlabel("LLP Tagging Efficiency")
axes = plt.gca()
axes.set_xlim([0, 1])
# Finish and plot ROC curve family
plt.legend()
plt.yscale('log', nonposy='clip')
signal_test = prediction[y_test == 1]
qcd_test = prediction[y_test == 0]
print(signal_test[0:100].shape)
print("Length of Signal: " + str(len(signal_test)) + ", length of signal with weight 1: " + str(
len(signal_test[signal_test[:, 1] < 0.1])))
print("Length of QCD: " + str(len(qcd_test)) + ", length of qcd with weight 1: " + str(
len(qcd_test[qcd_test[:, 1] < 0.1])))
if third_label == 2:
plt.ylabel("QCD Rejection")
plt.savefig(destination + "roc_curve_atlas_rej_bib" + ".pdf", format='pdf', transparent=True)
if third_label == 0:
plt.ylabel("BIB Rejection")
plt.savefig(destination + "roc_curve_atlas_rej_qcd" + ".pdf", format='pdf', transparent=True)
plt.clf()
# Make plots of signal efficiency vs mH, mS
signal_llp_efficiencies(prediction, y_test, Z_test, destination, f)
bkg_falsePositives(prediction, y_test, Z_test, destination, f)
f.close()
return roc_auc, test_acc
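

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): shows how the helpers above might
    # be chained. The .pkl path and model name below are hypothetical placeholders,
    # not files that ship with this repository.
    example_pkl = "processed_jets.pkl"  # hypothetical pre-processed input file
    example_model = "example_model_"    # hypothetical model name prefix

    run_dir = create_directories(example_model, os.path.splitext(os.path.basename(example_pkl))[0])
    jets_df = load_dataset(example_pkl)
    print("Loaded " + str(jets_df.shape[0]) + " jets; outputs will go to plots/" + run_dir)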