-
Notifications
You must be signed in to change notification settings - Fork 2
/
train.py
365 lines (300 loc) · 18.9 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
import argparse
import time
from datetime import datetime
import numpy as np
import tensorflow as tf
from cleverhans.model import CallableModelWrapper
import attacks as ae
import data
import models
import regularizers
import utils
# Maps a model family name to the class in models.py that builds it.
nn_dict = dict(
    fc=models.MLP,
    cnn_lenet_small=models.LeNetSmall,
)
# Perturbation radius (epsilon) handed to the PGD attack, indexed first by the
# norm p of the threat model (1, 2 or np.inf) and then by the dataset name.
eps_dict = {
    2: {'mnist': 0.3, 'fmnist': 0.3, 'gts': 0.2, 'cifar10': 0.1},
    np.inf: {'mnist': 0.1, 'fmnist': 0.1, 'gts': 4 / 255, 'cifar10': 2 / 255},
    # This key used to be misspelled 'fmniat', so looking up fmnist for p=1 raised a KeyError.
    1: {'mnist': 1.0, 'fmnist': 1.0, 'gts': 3.0, 'cifar10': 2.0},
}
def adv_train(x, y):
    """Swap the leading part of the batch `x` for PGD adversarial examples.

    The number of replaced examples is int(hps.ae_frac * hps.batch_size); when
    that is zero, `x` is returned untouched. Otherwise the first examples are
    attacked via ae.pgd_attack (using the module-level `cleverhans_model`) and
    concatenated with the remaining clean examples along axis 0.
    """
    # NOTE(review): the count is derived from hps.batch_size, not from the size
    # of `x` - confirm this is intended when `x` is only a per-GPU slice.
    n_adv = int(hps.ae_frac * hps.batch_size)
    if n_adv != 0:
        x_adv = ae.pgd_attack(x[:n_adv], y[:n_adv], cleverhans_model, hps.p,
                              eps_dict[hps.p][hps.dataset], hps.pgd_n_iter)
        x_clean = x[n_adv:]
        return tf.concat([x_adv, x_clean], axis=0)
    return x
def forward_pass_cleverhans(x):
    """Cleverhans compatibility shim: return the last output of the module-level `model`'s net on `x`."""
    return model.net(x)[-1]
def eval_in_batches(x_in, y_in, sess, tensors, batch_iter):
    """Average the values of `tensors` over every batch produced by `batch_iter`.

    Each batch is run with is_train fed as False, i.e. this is meant for
    evaluation only. Apart from the inputs, the feed values (hps.lr, n_rb,
    n_db, frac_reg) are read from module-level variables.

    Returns a list with one entry per tensor: the unweighted mean of its
    per-batch values.
    """
    sums = [0] * len(tensors)
    batches_seen = 0
    for batches_seen, (xb, yb) in enumerate(batch_iter, start=1):
        feed = {x_in: xb, y_in: yb, lr_tf: hps.lr,
                n_rb_tf: n_rb, n_db_tf: n_db, frac_reg_tf: frac_reg,
                is_train: False}
        fetched = sess.run(tensors, feed_dict=feed)
        sums = [acc + val for acc, val in zip(sums, fetched)]
    return [acc / batches_seen for acc in sums]
# Command-line interface: every hyperparameter of a run is defined here.
parser = argparse.ArgumentParser(description='Define hyperparameters.')
parser.add_argument('--gpus', nargs='+', type=int, default=[0],
                    help='GPU indices. Multi-gpu training is supported.')
parser.add_argument('--gpu_memory', type=float, default=0.0,
                    help='GPU memory fraction to use')
parser.add_argument('--exp_name', type=str, default='test',
                    help='Name of the experiment, which is used to save the results/metrics/model in a certain folder.')
parser.add_argument('--dataset', type=str, default='mnist', help='mnist, cifar10, fmnist, gts, svhn')
parser.add_argument('--nn_type', type=str, default='fc1', help='NN type: fc1, fc10, cnn_lenet_small')
parser.add_argument('--n_epochs', type=int, default=100, help='Number of epochs.')
parser.add_argument('--restore', action='store_true',
                    help='Restore the model by exp_name+dataset and combination of lambda and gammas')
parser.add_argument('--data_augm', action='store_true',
                    help='Data augmentation: rotation, mirroring (not for mnist and gts), gauss noise.')
# The accepted values are exactly the keys the script later maps to a numeric norm;
# validating here turns a late KeyError into an immediate usage error.
parser.add_argument('--p', type=str, default='2', choices=['1', '2', 'inf', 'univ'],
                    help='P-norm: 1, 2, inf, univ (note: as strings)')

# Parameters for MMR-l_2 and MMR-l_inf
parser.add_argument('--reg', type=str, default='full', help='full, cheap')
parser.add_argument('--lmbd', type=float, default=1.0, help='Lambda')
parser.add_argument('--gamma_rb', type=float, default=1.0, help='Gamma for region boundaries')
parser.add_argument('--gamma_db', type=float, default=1.0, help='Gamma decision boundaries')
parser.add_argument('--ae_frac', type=float, default=0.0, help='Fraction of AE in a batch [0..1].')

# Parameters for MMR-Universal
# Raw strings: '\i' is not a valid escape sequence and triggers a warning in non-raw literals.
parser.add_argument('--gamma_l1', type=float, default=1.0, help='Gamma_1')
parser.add_argument('--gamma_linf', type=float, default=0.1, help=r'Gamma_\infty')
parser.add_argument('--lmbd_l1', type=float, default=1.0, help='Lambda_1')
parser.add_argument('--lmbd_linf', type=float, default=1.0, help=r'Lambda_\infty')
parser.add_argument('--hyp_start', type=float, default=0.2,
                    help='percentage of hyperplanes pushed at the beginning')
parser.add_argument('--hyp_end', type=float, default=0.05,
                    help='percentage of hyperplanes pushed at the end')
parser.add_argument('--lr', type=float, default=-1, help='learning rate')
# ---- Resolve the run configuration from the command line and load the data. ----
hps = parser.parse_args()  # returns a Namespace object, new fields can be set like hps.abc = 10
hps.batch_size = 128

# --lr defaults to -1, i.e. "not given"; only then fall back to the built-in defaults.
# Previously the parsed value was overwritten unconditionally, so the flag had no effect.
if hps.lr <= 0:
    hps.lr = 0.001
    # mnist and fmnist with L2-MMR require a smaller learning rate
    if hps.p == '2' and hps.dataset in ['mnist', 'fmnist'] and hps.lmbd > 0.0:
        if hps.ae_frac == 0.0:
            hps.lr = 0.0001
        else:  # if AT
            hps.lr = 0.00005
    elif hps.p == 'univ':
        hps.lr = 5e-4 if hps.dataset in ['mnist', 'fmnist'] else 1e-3

# The description string is built while hps.p is still the raw command-line string.
hps_str = utils.create_hps_str(hps)
hps.seed = 1

# From here on hps.p is numeric (or the marker 'univ').
hps.p = {'1': 1, '2': 2, 'inf': np.inf, 'univ': 'univ'}[hps.p]
hps.q = {1: np.inf, 2: 2, np.inf: 1, 'univ': 'univ'}[hps.p]  # q norm is used in the denominator of MMR

if hps.p == np.inf and hps.gamma_db >= 1.0:  # supports 2 ways of specifying gammas: in [0..1] and in [1..255]
    # to make it easier to specify gammas via cmd
    hps.gamma_rb, hps.gamma_db = hps.gamma_rb / 255, hps.gamma_db / 255
if hps.p == 'univ' and hps.gamma_linf >= 1.0:
    hps.gamma_linf /= 255.0

if hps.p == 'univ':
    hps.ae_frac = 0.0
else:
    # For a single-norm run the command-line values of hyp_start / hyp_end are replaced.
    hps.hyp_start = 0.1
    hps.hyp_end = 0.02

hps.pgd_n_iter = 40 if hps.dataset in ['mnist', 'fmnist'] else 7  # as in Madry et al

if hps.gpu_memory == 0.0:  # if it wasn't set
    if 'cnn' in hps.nn_type:
        hps.gpu_memory = 0.85
    elif hps.dataset in ['mnist', 'fmnist'] or hps.lmbd == 0.0:
        hps.gpu_memory = 0.15
    else:
        hps.gpu_memory = 0.25

cur_timestamp = str(datetime.now())[:-7]  # to get rid of milliseconds
log = utils.Logger()
log.add('The script started on GPUs {} with hyperparameters: {} at {}'.format(hps.gpus, hps_str, cur_timestamp))

x_train, x_test, y_train, y_test = data.get_dataset(hps.dataset)  # e.g. x_train of mnist has (60000, 28, 28, 1) shape
n_test_ex = x_test.shape[0]
n_train_ex, hps.height, hps.width, hps.n_col = x_train.shape
hps.n_in, hps.n_out = hps.height * hps.width * hps.n_col, y_train.shape[1]

if hps.data_augm:
    n_pad = 2  # n_pad pixels from very side (increases the height and width by 2*n_pad)
    hps.height_pad, hps.width_pad = hps.height + 2 * n_pad, hps.width + 2 * n_pad
    x_train, x_test = data.zero_pad(x_train, n_pad), data.zero_pad(x_test, n_pad)
    hps.random_crop, hps.fl_mirroring, hps.gauss_noise_flag = True, True, False
    hps.fl_rotations, hps.max_rotate_angle = False, np.pi / 20

model_type = hps.nn_type if 'cnn' in hps.nn_type else 'fc'  # is used to select the correct model from models.py
hps.n_hs = utils.get_hidden_units(hps.nn_type)

# Number of batches per pass, by integer division of the dataset size by the batch size.
n_batches_train = n_train_ex // hps.batch_size
n_batches_test = n_test_ex // hps.batch_size
n_eval_pgd = 200
n_eval_pgd_final = 10000
if hps.p != 'univ':
    eps_pgd = eps_dict[hps.p][hps.dataset]
# ---- Build the multi-tower training graph: one tower per entry of hps.gpus. ----
graph = tf.Graph()
with graph.as_default(), tf.device('/gpu:0'):
    # Inputs are fed with the padded spatial size when data augmentation is enabled.
    if hps.data_augm:
        x_in = tf.placeholder(tf.float32, [None, hps.height_pad, hps.width_pad, hps.n_col])
    else:
        x_in = tf.placeholder(tf.float32, [None, hps.height, hps.width, hps.n_col])
    y_in = tf.placeholder(tf.float32, [None, hps.n_out])
    is_train = tf.placeholder(tf.bool, name='is_training')
    lr_tf = tf.placeholder(tf.float32, shape=[])
    frac_reg_tf = tf.placeholder(tf.float32, shape=[])  # from 0 to 1 - how strong is the regularizer
    n_rb_tf = tf.placeholder(tf.int32, shape=[])
    n_db_tf = tf.placeholder(tf.int32, shape=[])

    if hps.data_augm:  # Data augmentation is implemented inside the TF comp. graph
        x_tf = tf.cond(is_train, lambda: data.augment_train(x_in, hps),
                       lambda: data.augment_test(x_in, hps))
    else:
        x_tf = tf.identity(x_in)

    optimizer = tf.train.AdamOptimizer(lr_tf, beta1=0.9, beta2=0.999, epsilon=1e-08)

    tower_grads = []
    hps.batch_size_gpu = hps.batch_size // len(hps.gpus)
    losses_plain, losses_reg, regs_rb, regs_db, err_rates = [], [], [], [], []
    regs_rb_l1, regs_rb_linf, regs_db_l1, regs_db_linf = [], [], [], []
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(len(hps.gpus)):
            with tf.device('/gpu:%d' % i), tf.name_scope('tower_%d' % i) as scope:
                # Every tower works on its own contiguous slice of the fed batch.
                id_from, id_to = i * hps.batch_size_gpu, i * hps.batch_size_gpu + hps.batch_size_gpu
                x, y = x_tf[id_from:id_to], y_in[id_from:id_to]
                log.add('Batch on GPU {}: from {} to {}, tensor {}'.format(i, id_from, id_to, x))

                model = nn_dict[model_type](is_train, hps)
                cleverhans_model = CallableModelWrapper(forward_pass_cleverhans, 'logits')

                # adv. training with PGD attack
                x = tf.cond(is_train, lambda: adv_train(x, y), lambda: x)

                y_list = model.net(x)
                ce_loss_per_example = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_list[-1])
                loss_plain_tower = tf.reduce_mean(ce_loss_per_example)

                n_rb_f, n_db_f = tf.cast(n_rb_tf, tf.float32), tf.cast(n_db_tf, tf.float32)

                # Per-norm MMR-Universal terms, read below as (rb_l1, rb_linf, db_l1, db_linf).
                # Only the CNN 'univ' branch produces them.
                reg_details = None
                if hps.lmbd > 0:  # for computational efficiency
                    # Regularizer components
                    if 'fc' in hps.nn_type:
                        rb_term, db_term = regularizers.mmr_fc(y_list, y, model.W, hps.n_in, hps.n_hs, hps.n_out,
                                                               n_rb_tf, n_db_tf, hps.gamma_rb, hps.gamma_db,
                                                               hps.batch_size_gpu, hps.q)
                        rb_reg_part = frac_reg_tf * hps.lmbd * rb_term / n_rb_f
                        db_reg_part = frac_reg_tf * hps.lmbd * db_term / n_db_f
                    elif hps.p in [2, np.inf]:
                        rb_term, db_term = regularizers.mmr_cnn(y_list, x, y, model, n_rb_tf, n_db_tf, hps.gamma_rb,
                                                                hps.gamma_db, hps.batch_size_gpu, hps.q)
                        rb_reg_part = frac_reg_tf * hps.lmbd * rb_term / n_rb_f
                        db_reg_part = frac_reg_tf * hps.lmbd * db_term / n_db_f
                    elif hps.p == 'univ':
                        reg_details = regularizers.mmr_cnn(y_list, x, y, model, n_rb_tf, n_db_tf,
                                                           [hps.gamma_l1, hps.gamma_linf],
                                                           [hps.gamma_l1, hps.gamma_linf],
                                                           hps.batch_size_gpu, hps.q)
                        rb_reg_part = frac_reg_tf * (hps.lmbd_l1 * reg_details[0]
                                                     + hps.lmbd_linf * reg_details[1]) / n_rb_f
                        db_reg_part = frac_reg_tf * (hps.lmbd_l1 * reg_details[2]
                                                     + hps.lmbd_linf * reg_details[3]) / n_db_f
                    else:
                        # Without this branch the regularizer parts stayed undefined and the
                        # script died a few lines later with an unrelated-looking NameError.
                        raise ValueError('MMR for CNNs supports p in [2, inf, univ], got p={}'.format(hps.p))
                else:
                    rb_reg_part, db_reg_part = tf.constant(0.0), tf.constant(0.0)
                reg_rb_tower, reg_db_tower = tf.reduce_mean(rb_reg_part), tf.reduce_mean(db_reg_part)

                if hps.p == 'univ':
                    if reg_details is None:
                        # lmbd == 0 or an fc net: no per-norm terms were computed. Log zeros
                        # instead of failing with a NameError on the undefined reg_details.
                        reg_details = [tf.constant(0.0)] * 4
                    reg_rb_l1 = tf.reduce_mean(reg_details[0] / n_rb_f)
                    reg_rb_linf = tf.reduce_mean(reg_details[1] / n_rb_f)
                    reg_db_l1 = tf.reduce_mean(reg_details[2] / n_db_f)
                    reg_db_linf = tf.reduce_mean(reg_details[3] / n_db_f)

                    regs_rb_l1.append(reg_rb_l1)
                    regs_rb_linf.append(reg_rb_linf)
                    regs_db_l1.append(reg_db_l1)
                    regs_db_linf.append(reg_db_linf)

                loss_reg_tower = loss_plain_tower + reg_rb_tower + reg_db_tower

                # Error rate
                incorrect_prediction = tf.not_equal(tf.argmax(y_list[-1], 1), tf.argmax(y, 1))
                err_rate_tower = tf.reduce_mean(tf.cast(incorrect_prediction, tf.float32))

                # Calculate the gradients for the batch of data on this tower.
                grads_vars_in_tower = optimizer.compute_gradients(loss_reg_tower)

                # Keep track of the gradients across all towers.
                tower_grads.append(grads_vars_in_tower)

                losses_plain.append(loss_plain_tower)
                losses_reg.append(loss_reg_tower)
                regs_rb.append(reg_rb_tower)
                regs_db.append(reg_db_tower)
                err_rates.append(err_rate_tower)

                # Reuse variables for the next tower.
                tf.get_variable_scope().reuse_variables()

    # Metrics are the plain average of the per-tower values.
    loss_plain = tf.reduce_mean(tf.stack(losses_plain))
    loss_reg = tf.reduce_mean(tf.stack(losses_reg))
    reg_rb = tf.reduce_mean(tf.stack(regs_rb))
    reg_db = tf.reduce_mean(tf.stack(regs_db))
    err_rate = tf.reduce_mean(tf.stack(err_rates))
    if hps.p == 'univ':
        reg_rb_l1_2 = tf.reduce_mean(tf.stack(regs_rb_l1))
        reg_rb_linf_2 = tf.reduce_mean(tf.stack(regs_rb_linf))
        reg_db_l1_2 = tf.reduce_mean(tf.stack(regs_db_l1))
        reg_db_linf_2 = tf.reduce_mean(tf.stack(regs_db_linf))

    grads_vars = utils.average_gradients(tower_grads)
    train_step_loss_reg = optimizer.apply_gradients(grads_vars, name='train_step')

    # Separate forward pass for the Cleverhans wrapper (for PGD attack); it goes through
    # the `model` object left over from the last tower of the loop above.
    logits_all_gpus = forward_pass_cleverhans(x_tf)

    # Model saver
    saver = tf.train.Saver()

# GPU settings
gpu_options = tf.GPUOptions(visible_device_list=str(hps.gpus)[1:-1], per_process_gpu_memory_fraction=hps.gpu_memory)
config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
# ---- Run the training: one pass over the training set per epoch, then a test evaluation. ----
with tf.Session(graph=graph, config=config) as sess:
    with graph.as_default(), tf.device('/gpu:0'):
        if hps.p != 'univ':
            pgd_ae_tensor = ae.pgd_attack(x_tf, y_in, cleverhans_model, hps.p, eps_dict[hps.p][hps.dataset],
                                          hps.pgd_n_iter)
        sess.run(tf.global_variables_initializer())  # run 'init' op
        epoch_start, epoch_end = 0, hps.n_epochs

        log.add('Session started with hyperparameters: {} \n'.format(hps_str))
        time_start = time.time()

        # Everything below is the same for all epochs, so it is computed once.
        is_univ = hps.p == 'univ'
        epoch_start_reduced_lr = 0.9
        save_model_every_n_epochs = 10
        frac_start, frac_end = hps.hyp_start, hps.hyp_end  # share of linear region hyperplanes at the start / end
        n_db = hps.n_out  # the number of decision boundary hyperplanes is always the same (the number of classes)
        n_total_hidden_units = utils.get_n_total_hidden_units(hps.nn_type, hps.n_hs, hps.height)
        n_rb_start, n_rb_end = int(frac_start * n_total_hidden_units), int(frac_end * n_total_hidden_units)

        tensors_to_eval = [err_rate, loss_plain, reg_rb, reg_db]
        # One fetch list for every norm. Before, p == 1 matched neither of the two
        # norm-specific sess.run branches, so no training step was executed and the
        # loop stopped with a NameError on the missing results.
        fetches = [train_step_loss_reg] + tensors_to_eval
        if is_univ:
            fetches = fetches + [reg_rb_l1_2, reg_rb_linf_2, reg_db_l1_2, reg_db_linf_2]

        for epoch in range(1, epoch_end + 1):
            # The learning rate is divided by 10 for the last 10% of the epochs.
            lr_actual = hps.lr / 10 if epoch >= epoch_start_reduced_lr * hps.n_epochs else hps.lr
            frac_reg = min(epoch / 10.0, 1.0)  # from 0 to 1 linearly over the first 10 epochs

            # Linear schedule from n_rb_start to n_rb_end. n_rb_tf is an int32 placeholder,
            # so the truncation is made explicit rather than left to the feed conversion.
            n_rb = int((n_rb_end - n_rb_start) / hps.n_epochs * epoch + n_rb_start)

            err_rate_train, loss_plain_train, reg_rb_train, reg_db_train = 0, 0, 0, 0
            reg_rb_l1_val_2, reg_rb_linf_val_2, reg_db_l1_val_2, reg_db_linf_val_2 = 0.0, 0.0, 0.0, 0.0
            for batch_x, batch_y in data.get_batch_iterator(x_train, y_train, hps.batch_size, shuffle=True,
                                                            n_batches=n_batches_train):
                feed = {x_in: batch_x, y_in: batch_y, lr_tf: lr_actual,
                        n_rb_tf: n_rb, n_db_tf: n_db,
                        frac_reg_tf: frac_reg, is_train: True}
                fetched = sess.run(fetches, feed_dict=feed)

                # fetched[0] is the result of the train op; the metrics follow it.
                err_rate_train_val, loss_plain_train_val, reg_rb_val, reg_db_val = fetched[1:5]
                err_rate_train += err_rate_train_val / n_batches_train
                loss_plain_train += loss_plain_train_val / n_batches_train
                reg_rb_train += reg_rb_val / n_batches_train
                reg_db_train += reg_db_val / n_batches_train
                if is_univ:
                    reg_rb_l1_val, reg_rb_linf_val, reg_db_l1_val, reg_db_linf_val = fetched[5:]
                    reg_rb_l1_val_2 += reg_rb_l1_val / n_batches_train
                    reg_rb_linf_val_2 += reg_rb_linf_val / n_batches_train
                    reg_db_l1_val_2 += reg_db_l1_val / n_batches_train
                    reg_db_linf_val_2 += reg_db_linf_val / n_batches_train
            log.add('Epoch {}: epoch training is done, {:.2f} sec elapsed'.format(epoch, time.time() - time_start))

            test_data_iter = data.get_batch_iterator(x_test, y_test, hps.batch_size, n_batches=n_batches_test)
            err_rate_test, loss_plain_test, reg_rb_test, reg_db_test = eval_in_batches(
                x_in, y_in, sess, tensors_to_eval, test_data_iter)
            reg_train, reg_test = reg_rb_train + reg_db_train, reg_rb_test + reg_db_test
            log.add('Epoch {}: train/test eval is done, {:.2f} sec elapsed'.format(epoch, time.time() - time_start))

            if is_univ:
                str_test = 'test_err {:.3%} test_losspl {:.6f} '.format(
                    err_rate_test, loss_plain_test)
                str_train = 'train_err {:.3%} train_losspl {:.6f} '.format(
                    err_rate_train, loss_plain_train)
                log.add('Epoch {:d} '.format(epoch) + str_test + str_train)
                str_reg = 'rb_l1: {:.5}, rb_linf: {:.5}, db_l1: {:.5}, db_linf: {:.5}'.format(
                    reg_rb_l1_val_2, reg_rb_linf_val_2, reg_db_l1_val_2, reg_db_linf_val_2)
                log.add('Epoch {:d}: '.format(epoch) + str_reg)
            else:
                # Every single-norm run (including p == 1, which used to log nothing).
                str_test = 'test_err {:.3%} test_losspl {:.6f} test_reg {:.5f} '.format(
                    err_rate_test, loss_plain_test, reg_test)
                str_train = 'train_err {:.3%} train_losspl {:.6f} train_reg {:.5f} '.format(
                    err_rate_train, loss_plain_train, reg_train)
                log.add('Epoch {:d} '.format(epoch) + str_test + str_train)

            # Checkpoint every save_model_every_n_epochs epochs and always after the last one.
            if (epoch % save_model_every_n_epochs == 0 and epoch != epoch_start) or epoch == epoch_end:
                mat_path, bounds_path = utils.save_results(sess, saver, model.W + model.b, cur_timestamp, hps_str,
                                                           hps, log, epoch)

        log.add('Worker done in {:.2f} min ({})\n\n'.format((time.time() - time_start) / 60, hps_str))