# updater.py (forked from amazon-science/bias-bounties)

from model import *
from sklearn import metrics
import numpy as np  # np.nanargmin is used in find_next_problem_node


def simple_update(curr_model, h_t, g_t):
"""
Updates the curr_model model object to incorporate the (g_t,h_t).
Inputs:
curr_model: Decision List object that is to be updated
h_t: new model that performs better than curr_model on points for which g_t returns 1
g_t: function from X -> {0,1} which returns 1 if x is in identified group and 0 else.
Return: None
"""
new_node = DecisionlistNode(predicate=g_t, leaf=h_t)
curr_model.prepend(new_node)
return None
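
# Illustrative call (a sketch with hypothetical inputs; neither `retrained_model` nor the
# "age" feature comes from this repository):
#
#   g_t = lambda row: int(row["age"] < 30)   # group indicator applied row-wise to X
#   h_t = retrained_model                    # any model exposing a per-row .predict
#   simple_update(curr_model, h_t, g_t)      # prepends the (g_t, h_t) node to curr_model
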
def measure_group_errors(model, X, y):
"""
Helper function that measures the group errors of groups defined in model over test data X with true
labels y
Inputs:
model: DecisionList or PointerDecisionList object
X: n x m dataframe of test data
y: dataframe of n true labels (or optimal predictions) of points in X
"""
indices = [X.apply(g, axis=1) == 1 for g in model.predicates]
xs = [X[i] for i in indices]
ys = [y[i] for i in indices]
group_errors = []
for i in range(len(model.predicates)):
pred_ys = xs[i].apply(model.predict, axis=1)
group_errors.append(metrics.zero_one_loss(ys[i], pred_ys))
return group_errors
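
# Example shape (hypothetical numbers): for a model with predicates [everyone, group_A],
# measure_group_errors might return [0.12, 0.31], i.e. the zero-one loss on all of X and
# on the group_A rows, ordered like model.predicates. A runnable smoke test with a
# stand-in model appears under __main__ at the bottom of this file.
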
def measure_all_group_errors(model, groups, X, y):
"""
Function to measure group errors over any predefined groups for algorithm analysis
"""
indices = [X.apply(g, axis=1) == 1 for g in groups]
xs = [X[i] for i in indices]
ys = [y[i] for i in indices]
group_errors = []
for i in range(len(groups)):
pred_ys = xs[i].apply(model.predict, axis=1)
group_errors.append(metrics.zero_one_loss(ys[i], pred_ys))
return group_errors


def get_group_weights(model, X):
"""
    Helper function that returns the list of group weights in the dataset X
"""
indices = [X.apply(g, axis=1) == 1 for g in model.predicates]
xs = [X[i] for i in indices]
weights = [len(xs[i]) / float(len(X)) for i in range(len(xs))]
return weights
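
# Example (hypothetical counts): if the first predicate matches 150 of the 200 rows in X,
# get_group_weights returns [0.75, ...]; each entry is the fraction of X covered by the
# corresponding group in model.predicates.
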
def find_next_problem_node(curr_model, new_errors):
    # We need to search through the errors on all past rounds to find, for each node, the round with the best
    # performance for its group, and return the update index corresponding to that best round. If there are no
    # problem indices, the returned model index is -1.
model_index = -1
adjusted_node_index = -1
for node_index in curr_model.update_node_indices_tracking_rejects:
# if we've made it to the most current update, stop
if node_index > curr_model.num_rounds:
break
# determine the round where the node had lowest error
indices_min_round = np.nanargmin(curr_model.test_errors[:curr_model.num_rounds, node_index])
# grab the value of the minimum error
min_val = curr_model.test_errors[indices_min_round, node_index]
# figure out the index of the actual update, not including rejects
adjusted_model_index = sum(curr_model.track_rejects[:indices_min_round])
# and adjust the node index to not include rejects
adjusted_node_index = sum(curr_model.track_rejects[:node_index])
        # compare the most recent update's error to the previous minimum
if min_val < new_errors[adjusted_node_index]:
            # if the most recent update was worse, record the index of the best round, accounting for rejected updates
model_index = adjusted_model_index
# also grab the adjusted node index (which might be different)
adjusted_node_index = sum(curr_model.track_rejects[:node_index])
break
return [adjusted_node_index, model_index]
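
# Sketch of the reject bookkeeping used above (hypothetical mask values): track_rejects is
# a 0/1 mask over rounds, e.g. [1, 0, 1] means rounds 0 and 2 produced accepted updates
# while round 1 was rejected. A prefix sum such as sum(track_rejects[:k]) converts a round
# index k (which counts rejected rounds) into an index over accepted updates only; this is
# how find_next_problem_node computes adjusted_model_index and adjusted_node_index.
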
def iterative_update(curr_model, h_t, g_t, train_X, train_y, test_X, test_y, group_name, all_groups, group_indicators,
vis=True):
"""
Updates the curr_model to incorporate (g_t, h_t) in a way that preserves group error
monotonicity over the sample data X with labels y
Inputs:
curr_model: PointerDecisionList object that is to be updated
h_t: new model that performs better than curr_model on points for which g_t returns 1
g_t: function from X -> {0,1} which returns 1 if x is in identified group and 0 else.
Return: None
"""
# add a round to the round tracker
curr_model.num_rounds += 1
curr_model.track_rejects.append(1) # mask to keep track of where the rejects are in the test errors.
curr_model.update_node_indices_tracking_rejects.append(
        curr_model.num_rounds)  # tracking where the update is relative to the rejected rounds
# run simple update
new_node = PointerDecisionListNode(predicate=g_t, leaf=h_t, pred_name=group_name)
curr_model.prepend(new_node)
# measure new group errors and compare to old
new_errors = measure_group_errors(curr_model, test_X, test_y)
initial_new_errors = new_errors
    # repeatedly check for problem nodes and repair them until none remain
[problem_node_index, problem_node_model_index] = find_next_problem_node(curr_model, new_errors)
problem_node_tracking = []
# initialProblemNode = problemNode
while True:
# if there were no problem nodes found, break
if problem_node_model_index == -1:
break
# otherwise, append a repair node to the PDL
else:
# add node to tracker so we can visualize PDL
problem_node_tracking.append([curr_model.pred_names[problem_node_index], problem_node_model_index])
# build a node that points to that model
new_node = PointerDecisionListNode(predicate=curr_model.predicates[problem_node_index],
catch_node=True,
right_main_node=curr_model.update_nodes[problem_node_model_index])
# prepend that node to the model
curr_model.prepend(new_node)
            # the group errors of the model will change with the new node prepended, so re-measure them
new_errors = measure_group_errors(curr_model, test_X, test_y)
# check for further/new problem nodes
[problem_node_index, problem_node_model_index] = find_next_problem_node(curr_model, new_errors)
if new_errors is None:
        curr_model.pop()  # remove the new model from the head of the PDL
return "Could not calculate all group errors and cannot update"
# now that all of the updates have happened, add the final node of the update to the model
curr_model.update_nodes.append(new_node)
curr_model.train_errors[curr_model.num_rounds] = measure_all_group_errors(curr_model, all_groups, train_X, train_y)
curr_model.test_errors[curr_model.num_rounds] = measure_all_group_errors(curr_model, all_groups, test_X, test_y)
if vis:
print("Running iterative update for group: " + group_name)
print("Model groups prior to update: ")
print(curr_model.pred_names[:-1])
print("Group errors on test set prior to update (over all groups)")
print(curr_model.test_errors[curr_model.num_rounds - 1])
print("Group errors after new group has been prepended to PDL:")
print(initial_new_errors)
print("Group Weights:")
print(get_group_weights(curr_model, test_X))
if len(problem_node_tracking) > 0:
print("Repaired Nodes:")
print(problem_node_tracking)
else:
print("No repairs needed.")
print("Group errors of every group, even those not yet introduced, on test set after repairs:")
print(["Total"] + group_indicators)
print(curr_model.test_errors[curr_model.num_rounds])
return [curr_model.train_errors, curr_model.test_errors]
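

if __name__ == "__main__":
    # Minimal smoke test for measure_group_errors and get_group_weights, assuming only the
    # interface this module itself relies on (a .predicates list of row predicates and a
    # per-row .predict). The stub below is hypothetical and merely stands in for the real
    # DecisionList / PointerDecisionList classes defined in model.py.
    import pandas as pd

    rng = np.random.default_rng(0)
    X = pd.DataFrame({
        "age": rng.integers(18, 80, size=200),
        "income": rng.integers(20, 200, size=200),
    })
    y = (X["income"] > 100).astype(int)  # synthetic "true" labels

    class _StubModel:
        # Two groups: everyone, and rows with age < 30.
        predicates = [lambda row: 1, lambda row: int(row["age"] < 30)]

        def predict(self, row):
            # Deliberately imperfect rule so the measured group errors are non-zero.
            return int(row["income"] > 110)

    stub = _StubModel()
    print("group errors:", measure_group_errors(stub, X, y))
    print("group weights:", get_group_weights(stub, X))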