# Copyright 2016 Mandiant, A FireEye Company
# Authors: Brian Jones
# License: Apache 2.0
''' Utility functions for "Relational Learning with TensorFlow" tutorial '''
import numpy as np
import pandas as pd


def df_to_idx_array(df):
'''Converts a Pandas DataFrame containing relationship triples
into a numpy index array.
Args:
df: Pandas DataFrame with columns 'head', 'rel', and 'tail'. These
columns must be Categorical. See make_categorical().
Returns:
A (N x 3) numpy integer index array built from the column Categorical
codes.
'''
    idx_array = np.zeros((len(df), 3), dtype=int)
idx_array[:,0] = df['head'].cat.codes
idx_array[:,1] = df['rel'].cat.codes
idx_array[:,2] = df['tail'].cat.codes
return idx_array


def make_categorical(df, field_sets):
'''Make DataFrame columns Categorical so that they can be converted to
index arrays for feeding into TensorFlow models.
Args:
df: Pandas DataFrame with columns 'head', 'rel', and 'tail'
        field_sets: A tuple containing the item category sets: (head_set,
rel_set, tail_set). Note that head_set and tail_set can
be the same if the model embeds all entities into a common
space.
Returns:
A new Pandas DataFrame where the 'head', 'rel', and 'tail' columns have
        been made Categorical using the supplied field_sets.
'''
head_set, rel_set, tail_set = field_sets
result = pd.DataFrame()
result['head'] = pd.Categorical(df['head'].values, categories=head_set)
result['rel'] = pd.Categorical(df['rel'].values, categories=rel_set)
result['tail'] = pd.Categorical(df['tail'].values, categories=tail_set)
if 'truth_flag' in df:
result['truth_flag'] = df['truth_flag']
return result, df_to_idx_array(result)
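

# A minimal usage sketch (not part of the original tutorial; the entity and
# relation names below are invented for illustration). make_categorical()
# pins the category vocabularies so that .cat.codes stay consistent across
# data splits, and df_to_idx_array() turns those codes into an (N x 3) array.
def _example_make_categorical():
    triples = pd.DataFrame([('cat', 'is_a', 'animal'),
                            ('dog', 'is_a', 'animal')],
                           columns=['head', 'rel', 'tail'])
    field_sets = (['cat', 'dog'], ['is_a'], ['animal'])
    cat_df, idx_array = make_categorical(triples, field_sets)
    # idx_array is [[0, 0, 0], [1, 0, 0]]: each code indexes into field_sets
    return cat_df, idx_array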


def corrupt(triple, field_replacements, forbidden_set,
rng, fields=[0,2], max_tries=1000):
''' Produces a corrupted negative triple for the supplied positive triple
using rejection sampling. Only a single field (from one in the fields
argument) is changed.
Args:
triple: A tuple or list with 3 entries: (head, rel, tail)
field_replacements: A tuple of array-like: (head entities, relationships,
tail entities), each containing the (unique) items to use as
replacements for the corruption
forbidden_set: A set of triples (typically all known true triples)
that we should not accidentally create when generating corrupted
negatives.
rng: Numpy RandomState object
fields: The fields that can be replaced in the triple. Default is
[0,2] which corresponds to the head and tail entries. [0,1,2]
would randomly replace any of the three entries.
max_tries: The maximum number of random corruption attempts before
giving up and throwing an exception. A corruption attempt can fail
if the sampled negative is a triple found in forbidden_set.
Returns:
A corrupted tuple (head, rel, tail) where one entry is different
than the triple passed in.
'''
collision = False
for _ in range(max_tries):
field = rng.choice(fields)
replacements = field_replacements[field]
corrupted = list(triple)
corrupted[field] = replacements[rng.randint(len(replacements))]
collision = (tuple(corrupted) in forbidden_set)
if not collision:
break
if collision:
raise Exception('Failed to sample a corruption for {} after {} tries'.format(triple, max_tries))
return corrupted
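

# A minimal sketch of corrupt() (not part of the original tutorial; the
# triples are invented). One field of a true triple is resampled until the
# result is not in the forbidden set.
def _example_corrupt():
    rng = np.random.RandomState(0)
    known_true = {('cat', 'is_a', 'animal'), ('dog', 'is_a', 'animal')}
    replacements = (['cat', 'dog', 'rock'], [], ['animal', 'mineral'])
    negative = corrupt(('cat', 'is_a', 'animal'), replacements, known_true, rng)
    # e.g. ['rock', 'is_a', 'animal'] or ['cat', 'is_a', 'mineral'],
    # but never a triple from known_true
    return negative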


def create_tf_pairs(true_df, all_true_df, rng):
    '''Creates a DataFrame with contrastive positive/negative pairs, given
    true triples to contrast and a set of "all known" true triples used to
    avoid accidentally sampling a negative from that set.
Args:
true_df: Pandas DataFrame containing true triples to contrast.
It must contain columns 'head', 'rel', and 'tail'. One
random negative will be created for each.
all_true_df: Pandas DataFrame containing "all known" true triples.
            This will be used to avoid randomly generating negatives
that happen to be true but were not in true_df.
rng: A Numpy RandomState object
Returns:
A new Pandas DataFrame with alternating pos/neg pairs. If true_df
contains rows [p1, p2, ..., pN], then this will contain 2N rows in the
form [p1, n1, p2, n2, ..., pN, nN].
'''
all_true_tuples = set(all_true_df.itertuples(index=False))
replacements = (list(set(true_df['head'])), [], list(set(true_df['tail'])))
result = []
for triple in true_df.itertuples(index=False):
corruption = corrupt(triple, replacements, all_true_tuples, rng)
result.append(triple)
result.append(corruption)
result = pd.DataFrame(result, columns=['head', 'rel', 'tail'])
result['truth_flag'] = np.tile([True, False], len(true_df))
return result
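

# A minimal sketch of create_tf_pairs() (example triples invented here). Each
# true row is followed by one sampled corruption, and truth_flag alternates.
def _example_create_tf_pairs():
    rng = np.random.RandomState(0)
    true_df = pd.DataFrame([('cat', 'is_a', 'animal'),
                            ('dog', 'chases', 'cat')],
                           columns=['head', 'rel', 'tail'])
    pairs = create_tf_pairs(true_df, true_df, rng)
    # pairs has 4 rows in the order [pos, neg, pos, neg] with a boolean
    # 'truth_flag' column of [True, False, True, False]
    return pairs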


def threshold_and_eval(test_df, test_scores, val_df, val_scores):
''' Test set evaluation protocol from:
Socher, Richard, et al. "Reasoning with neural tensor networks for
knowledge base completion." Advances in Neural Information Processing
Systems. 2013.
Finds model output thresholds using val_df to create a binary
classifier, and then measures classification accuracy on the test
set scores using these thresholds. A different threshold is found
for each relationship type. All Dataframes must have a 'rel' column.
Args:
test_df: Pandas DataFrame containing the test triples
test_scores: A numpy array of test set scores, one for each triple
in test_df
val_df: A Pandas DataFrame containing the validation triples
        val_scores: A numpy array of validation set scores, one for each triple
in val_df
Returns:
A tuple containing (accuracy, test_predictions, test_scores, threshold_map)
accuracy: the overall classification accuracy on the test set
test_predictions: True/False output for test set
test_scores: Test set scores
threshold_map: A dict containing the per-relationship thresholds
found on the validation set, e.g. {'_has_part': 0.562}
'''
def find_thresh(df, scores):
''' find threshold that maximizes accuracy on validation set '''
sorted_scores = sorted(scores)
best_score, best_thresh = -np.inf, -np.inf
for i in range(len(sorted_scores)-1):
thresh = (sorted_scores[i] + sorted_scores[i+1]) / 2.0
predictions = (scores > thresh)
correct = np.sum(predictions == df['truth_flag'])
if correct >= best_score:
best_score, best_thresh = correct, thresh
return best_thresh
threshold_map = {}
for relationship in set(val_df['rel']):
mask = np.array(val_df['rel'] == relationship)
threshold_map[relationship] = find_thresh(val_df.loc[mask], val_scores[mask])
test_entry_thresholds = np.array([threshold_map[r] for r in test_df['rel']])
test_predictions = (test_scores > test_entry_thresholds)
accuracy = np.sum(test_predictions == test_df['truth_flag']) / len(test_predictions)
return accuracy, test_predictions, test_scores, threshold_map
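

# A minimal sketch of the thresholding protocol with hand-picked scores (the
# triples and score values are invented for illustration, not model output).
def _example_threshold_and_eval():
    cols = ['head', 'rel', 'tail', 'truth_flag']
    val_df = pd.DataFrame([('cat', 'is_a', 'animal', True),
                           ('cat', 'is_a', 'mineral', False),
                           ('dog', 'is_a', 'animal', True),
                           ('dog', 'is_a', 'plant', False)], columns=cols)
    test_df = pd.DataFrame([('fox', 'is_a', 'animal', True),
                            ('fox', 'is_a', 'plant', False)], columns=cols)
    val_scores = np.array([0.9, 0.1, 0.8, 0.2])
    test_scores = np.array([0.7, 0.3])
    accuracy, predictions, _, thresholds = threshold_and_eval(
        test_df, test_scores, val_df, val_scores)
    # the validation scores are split by a threshold of 0.5 for 'is_a',
    # so accuracy == 1.0 and predictions == [True, False]
    return accuracy, predictions, thresholds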


def model_threshold_and_eval(model, test_df, val_df):
''' See threshold_and_eval(). This is the same except that the supplied
model will be used to generate the test_scores and val_scores.
Args:
model: A trained relational learning model whose predict() will be
called on index arrays generated from test_df and val_df
test_df: Pandas DataFrame containing the test triples
val_df: A Pandas DataFrame containing the validation triples
Returns:
A tuple containing (accuracy, test_predictions, test_scores, threshold_map)
accuracy: the overall classification accuracy on the test set
test_predictions: True/False output for test set
test_scores: Test set scores
threshold_map: A dict containing the per-relationship thresholds
found on the validation set, e.g. {'_has_part': 0.562}
'''
val_scores = model.predict(df_to_idx_array(val_df))
test_scores = model.predict(df_to_idx_array(test_df))
return threshold_and_eval(test_df, test_scores, val_df, val_scores)
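

# Illustrative sketch of the model interface these helpers expect (not part of
# the original tutorial): any object whose predict() maps an (N x 3) integer
# index array to one score per row can be passed to model_threshold_and_eval()
# or model_pair_ranking_accuracy().
class _ConstantScoreModel(object):
    def __init__(self, score=0.0):
        self.score = score

    def predict(self, idx_array):
        # return the same score for every triple; a real model would score
        # each (head, rel, tail) index row
        return np.full(len(idx_array), self.score)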


def pair_ranking_accuracy(model_output):
''' Pair ranking accuracy. This only works when model_output comes from
alternating positive/negative pairs: [pos,neg,pos,neg,...,pos,neg]
Returns:
The fraction of pairs for which the positive example is scored higher
than the negative example
'''
output_pairs = np.reshape(model_output, [-1,2])
correct = np.sum(output_pairs[:,0] > output_pairs[:,1])
return float(correct) / len(output_pairs)
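

# A tiny sketch of pair_ranking_accuracy() on hand-picked scores (values
# invented): the first pos/neg pair is ordered correctly, the second is not.
def _example_pair_ranking_accuracy():
    model_output = np.array([2.0, 1.0, 0.3, 0.7])
    return pair_ranking_accuracy(model_output)  # 0.5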


def model_pair_ranking_accuracy(model, data):
''' See pair_ranking_accuracy(), this simply calls model.predict(data) to
generate model_output
Returns:
The fraction of pairs for which the positive example is scored higher
than the negative example
'''
return pair_ranking_accuracy(model.predict(data))


class ContrastiveTrainingProvider(object):
''' Provides mini-batches for stochastic gradient descent by augmenting
a set of positive training triples with random contrastive negative samples.
Args:
train: A 2D numpy array with positive training triples in its rows
batch_pos_cnt: Number of positive examples to use in each mini-batch
separate_head_tail: If True, head and tail corruptions are sampled
from entity sets limited to those found in the respective location.
If False, head and tail replacements are sampled from the set of
all entities, regardless of location.
rng: (optional) A NumPy RandomState object
TODO: Allow a variable number of negative examples per positive. Right
now this class always provides a single negative per positive, generating
pairs: [pos, neg, pos, neg, ...]
'''

    def __init__(self, train, batch_pos_cnt=50,
separate_head_tail=False, rng=None):
self.train = train
self.batch_pos_cnt = batch_pos_cnt
self.separate_head_tail = separate_head_tail
if rng is None:
rng = np.random.RandomState()
self.rng = rng
self.num_examples = len(train)
self.epochs_completed = 0
self.index_in_epoch = 0
# store set of training tuples for quickly checking negatives
self.triples_set = set(tuple(t) for t in train)
# replacement entities
if separate_head_tail:
head_replacements = list(set(train[:,0]))
tail_replacements = list(set(train[:,2]))
else:
all_entities = set(train[:,0]).union(train[:,2])
head_replacements = tail_replacements = list(all_entities)
self.field_replacements = [head_replacements,
list(set(train[:,1])),
tail_replacements]
self._shuffle_data()

    def _shuffle_data(self):
self.rng.shuffle(self.train)

    def next_batch(self):
'''
Returns:
A tuple (batch_triples, batch_labels):
batch_triples: Bx3 numpy array of triples, where B=2*batch_pos_cnt
batch_labels: numpy array with 0/1 labels for each row in
batch_triples
        Each positive is followed by a contrasting negative, so batch_labels
will alternate: [1, 0, 1, 0, ..., 1, 0]
'''
start = self.index_in_epoch
self.index_in_epoch += self.batch_pos_cnt
if self.index_in_epoch > self.num_examples:
# Finished epoch, shuffle data
self.epochs_completed += 1
self.index_in_epoch = self.batch_pos_cnt
start = 0
self._shuffle_data()
end = self.index_in_epoch
batch_triples = []
batch_labels = []
for positive in self.train[start:end]:
batch_triples.append(positive)
batch_labels.append(1.0)
negative = corrupt(positive, self.field_replacements, self.triples_set, self.rng)
batch_triples.append(negative)
batch_labels.append(0.0)
batch_triples = np.vstack(batch_triples)
batch_labels = np.array(batch_labels)
return batch_triples, batch_labels
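

# A minimal usage sketch (not part of the original tutorial; the integer ids
# are invented). The provider expects rows of (head, rel, tail) indices such
# as the arrays produced by df_to_idx_array().
def _example_contrastive_training_provider():
    train = np.array([[0, 0, 1],
                      [2, 0, 1],
                      [3, 1, 0],
                      [1, 1, 2]])
    provider = ContrastiveTrainingProvider(train, batch_pos_cnt=2,
                                           rng=np.random.RandomState(0))
    batch_triples, batch_labels = provider.next_batch()
    # batch_triples is a 4x3 array and batch_labels alternates [1., 0., 1., 0.]
    return batch_triples, batch_labels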