diff --git a/autoencoder/autoencoder_models/Autoencoder.py b/autoencoder/autoencoder_models/Autoencoder.py index e6d61fec35..4dadc579f2 100644 --- a/autoencoder/autoencoder_models/Autoencoder.py +++ b/autoencoder/autoencoder_models/Autoencoder.py @@ -18,7 +18,7 @@ def __init__(self, n_input, n_hidden, transfer_function=tf.nn.softplus, optimize self.reconstruction = tf.add(tf.matmul(self.hidden, self.weights['w2']), self.weights['b2']) # cost - self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.sub(self.reconstruction, self.x), 2.0)) + self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.x), 2.0)) self.optimizer = optimizer.minimize(self.cost) init = tf.global_variables_initializer() diff --git a/autoencoder/autoencoder_models/DenoisingAutoencoder.py b/autoencoder/autoencoder_models/DenoisingAutoencoder.py index 05c57cfb82..3622a6d330 100644 --- a/autoencoder/autoencoder_models/DenoisingAutoencoder.py +++ b/autoencoder/autoencoder_models/DenoisingAutoencoder.py @@ -22,7 +22,7 @@ def __init__(self, n_input, n_hidden, transfer_function = tf.nn.softplus, optimi self.reconstruction = tf.add(tf.matmul(self.hidden, self.weights['w2']), self.weights['b2']) # cost - self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.sub(self.reconstruction, self.x), 2.0)) + self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.x), 2.0)) self.optimizer = optimizer.minimize(self.cost) init = tf.global_variables_initializer() @@ -89,7 +89,7 @@ def __init__(self, n_input, n_hidden, transfer_function = tf.nn.softplus, optimi self.reconstruction = tf.add(tf.matmul(self.hidden, self.weights['w2']), self.weights['b2']) # cost - self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.sub(self.reconstruction, self.x), 2.0)) + self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.x), 2.0)) self.optimizer = optimizer.minimize(self.cost) init = tf.global_variables_initializer() diff --git a/autoencoder/autoencoder_models/VariationalAutoencoder.py b/autoencoder/autoencoder_models/VariationalAutoencoder.py index 05e9f4ed9a..77d4a12a88 100644 --- a/autoencoder/autoencoder_models/VariationalAutoencoder.py +++ b/autoencoder/autoencoder_models/VariationalAutoencoder.py @@ -17,13 +17,13 @@ def __init__(self, n_input, n_hidden, optimizer = tf.train.AdamOptimizer()): self.z_log_sigma_sq = tf.add(tf.matmul(self.x, self.weights['log_sigma_w1']), self.weights['log_sigma_b1']) # sample from gaussian distribution - eps = tf.random_normal(tf.pack([tf.shape(self.x)[0], self.n_hidden]), 0, 1, dtype = tf.float32) - self.z = tf.add(self.z_mean, tf.mul(tf.sqrt(tf.exp(self.z_log_sigma_sq)), eps)) + eps = tf.random_normal(tf.stack([tf.shape(self.x)[0], self.n_hidden]), 0, 1, dtype = tf.float32) + self.z = tf.add(self.z_mean, tf.multiply(tf.sqrt(tf.exp(self.z_log_sigma_sq)), eps)) self.reconstruction = tf.add(tf.matmul(self.z, self.weights['w2']), self.weights['b2']) # cost - reconstr_loss = 0.5 * tf.reduce_sum(tf.pow(tf.sub(self.reconstruction, self.x), 2.0)) + reconstr_loss = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.x), 2.0)) latent_loss = -0.5 * tf.reduce_sum(1 + self.z_log_sigma_sq - tf.square(self.z_mean) - tf.exp(self.z_log_sigma_sq), 1) diff --git a/compression/decoder.py b/compression/decoder.py old mode 100755 new mode 100644 diff --git a/compression/encoder.py b/compression/encoder.py old mode 100755 new mode 100644 diff --git a/compression/msssim.py b/compression/msssim.py old mode 100755 new mode 100644 diff --git a/differential_privacy/dp_sgd/dp_mnist/dp_mnist.py 
b/differential_privacy/dp_sgd/dp_mnist/dp_mnist.py index 6e9a627491..6c2cc49b51 100644 --- a/differential_privacy/dp_sgd/dp_mnist/dp_mnist.py +++ b/differential_privacy/dp_sgd/dp_mnist/dp_mnist.py @@ -273,7 +273,7 @@ def Train(mnist_train_file, mnist_test_file, network_parameters, num_steps, images, network_parameters) cost = tf.nn.softmax_cross_entropy_with_logits( - logits, tf.one_hot(labels, 10)) + logits=logits, labels=tf.one_hot(labels, 10)) # The actual cost is the average across the examples. cost = tf.reduce_sum(cost, [0]) / batch_size @@ -343,7 +343,7 @@ def Train(mnist_train_file, mnist_test_file, network_parameters, num_steps, # We need to maintain the intialization sequence. for v in tf.trainable_variables(): - sess.run(tf.initialize_variables([v])) + sess.run(tf.variables_initializer([v])) sess.run(tf.global_variables_initializer()) sess.run(init_ops) diff --git a/differential_privacy/dp_sgd/dp_optimizer/utils.py b/differential_privacy/dp_sgd/dp_optimizer/utils.py index f751b7a518..0d449b60c8 100644 --- a/differential_privacy/dp_sgd/dp_optimizer/utils.py +++ b/differential_privacy/dp_sgd/dp_optimizer/utils.py @@ -236,7 +236,7 @@ def BatchClipByL2norm(t, upper_bound, name=None): with tf.op_scope([t, upper_bound], name, "batch_clip_by_l2norm") as name: saved_shape = tf.shape(t) batch_size = tf.slice(saved_shape, [0], [1]) - t2 = tf.reshape(t, tf.concat(0, [batch_size, [-1]])) + t2 = tf.reshape(t, tf.concat(axis=0, values=[batch_size, [-1]])) upper_bound_inv = tf.fill(tf.slice(saved_shape, [0], [1]), tf.constant(1.0/upper_bound)) # Add a small number to avoid divide by 0 @@ -266,7 +266,7 @@ def SoftThreshold(t, threshold_ratio, name=None): assert threshold_ratio >= 0 with tf.op_scope([t, threshold_ratio], name, "soft_thresholding") as name: saved_shape = tf.shape(t) - t2 = tf.reshape(t, tf.concat(0, [tf.slice(saved_shape, [0], [1]), -1])) + t2 = tf.reshape(t, tf.concat(axis=0, values=[tf.slice(saved_shape, [0], [1]), [-1]])) t_abs = tf.abs(t2) t_x = tf.sign(t2) * tf.nn.relu(t_abs - (tf.reduce_mean(t_abs, [0], diff --git a/differential_privacy/dp_sgd/per_example_gradients/per_example_gradients.py b/differential_privacy/dp_sgd/per_example_gradients/per_example_gradients.py index 4931e2751b..82b3ae2da2 100644 --- a/differential_privacy/dp_sgd/per_example_gradients/per_example_gradients.py +++ b/differential_privacy/dp_sgd/per_example_gradients/per_example_gradients.py @@ -189,7 +189,7 @@ def __call__(self, x, z_grads): z_grads, = z_grads x_expanded = tf.expand_dims(x, 2) z_grads_expanded = tf.expand_dims(z_grads, 1) - return tf.mul(x_expanded, z_grads_expanded) + return tf.multiply(x_expanded, z_grads_expanded) pxg_registry.Register("MatMul", MatMulPXG) @@ -245,7 +245,7 @@ def _PxConv2DBuilder(self, input_, w, strides, padding): num_x = int(conv_x.get_shape()[0]) assert num_x == 1, num_x assert len(conv_px) == batch_size - conv = tf.concat(0, conv_px) + conv = tf.concat(axis=0, values=conv_px) assert int(conv.get_shape()[0]) == batch_size return conv, w_px @@ -274,7 +274,7 @@ def __call__(self, w, z_grads): self.colocate_gradients_with_ops, gate_gradients=self.gate_gradients) - return tf.pack(gradients_list) + return tf.stack(gradients_list) pxg_registry.Register("Conv2D", Conv2DPXG) diff --git a/differential_privacy/multiple_teachers/deep_cnn.py b/differential_privacy/multiple_teachers/deep_cnn.py index afc46eec1e..1b4bc7b10e 100644 --- a/differential_privacy/multiple_teachers/deep_cnn.py +++ b/differential_privacy/multiple_teachers/deep_cnn.py @@ -75,7 +75,7 @@ def 
_variable_with_weight_decay(name, shape, stddev, wd): var = _variable_on_cpu(name, shape, tf.truncated_normal_initializer(stddev=stddev)) if wd is not None: - weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss') + weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) return var @@ -398,7 +398,7 @@ def train_op_fun(total_loss, global_step): decay_steps, LEARNING_RATE_DECAY_FACTOR, staircase=True) - tf.scalar_summary('learning_rate', lr) + tf.summary.scalar('learning_rate', lr) # Generate moving averages of all losses and associated summaries. loss_averages_op = moving_av(total_loss) @@ -413,7 +413,7 @@ def train_op_fun(total_loss, global_step): # Add histograms for trainable variables. for var in tf.trainable_variables(): - tf.histogram_summary(var.op.name, var) + tf.summary.histogram(var.op.name, var) # Track the moving averages of all trainable variables. variable_averages = tf.train.ExponentialMovingAverage( @@ -485,7 +485,7 @@ def train(images, labels, ckpt_path, dropout=False): train_op = train_op_fun(loss, global_step) # Create a saver. - saver = tf.train.Saver(tf.all_variables()) + saver = tf.train.Saver(tf.global_variables()) print("Graph constructed and saver created") diff --git a/differential_privacy/privacy_accountant/tf/accountant.py b/differential_privacy/privacy_accountant/tf/accountant.py index e1aab7c5cb..51285dbdd7 100644 --- a/differential_privacy/privacy_accountant/tf/accountant.py +++ b/differential_privacy/privacy_accountant/tf/accountant.py @@ -361,12 +361,12 @@ def _differential_moments(self, sigma, s, t): exponents = tf.constant([j * (j + 1.0 - 2.0 * s) / (2.0 * sigma * sigma) for j in range(t + 1)], dtype=tf.float64) # x[i, j] = binomial[i, j] * signs[i, j] = (i choose j) * (-1)^{i-j} - x = tf.mul(binomial, signs) + x = tf.multiply(binomial, signs) # y[i, j] = x[i, j] * exp(exponents[j]) # = (i choose j) * (-1)^{i-j} * exp(j(j-1)/(2 sigma^2)) # Note: this computation is done by broadcasting pointwise multiplication # between [t+1, t+1] tensor and [t+1] tensor. - y = tf.mul(x, tf.exp(exponents)) + y = tf.multiply(x, tf.exp(exponents)) # z[i] = sum_j y[i, j] # = sum_j (i choose j) * (-1)^{i-j} * exp(j(j-1)/(2 sigma^2)) z = tf.reduce_sum(y, 1) diff --git a/im2txt/im2txt/show_and_tell_model.py b/im2txt/im2txt/show_and_tell_model.py index 1292ea3e6e..41b62d6509 100644 --- a/im2txt/im2txt/show_and_tell_model.py +++ b/im2txt/im2txt/show_and_tell_model.py @@ -264,7 +264,7 @@ def build_model(self): if self.mode == "inference": # In inference mode, use concatenated states for convenient feeding and # fetching. - tf.concat(initial_state, 1, name="initial_state") + tf.concat(axis=1, values=initial_state, name="initial_state") # Placeholder for feeding a batch of concatenated states. state_feed = tf.placeholder(dtype=tf.float32, @@ -274,11 +274,11 @@ def build_model(self): # Run a single LSTM step. lstm_outputs, state_tuple = lstm_cell( - inputs=tf.squeeze(self.seq_embeddings, squeeze_dims=[1]), + inputs=tf.squeeze(self.seq_embeddings, axis=[1]), state=state_tuple) # Concatentate the resulting state. - tf.concat(state_tuple, 1, name="state") + tf.concat(axis=1, values=state_tuple, name="state") else: # Run the batch of sequence embeddings through the LSTM. 
sequence_length = tf.reduce_sum(self.input_mask, 1) diff --git a/inception/inception/data/preprocess_imagenet_validation_data.py b/inception/inception/data/preprocess_imagenet_validation_data.py old mode 100755 new mode 100644 diff --git a/inception/inception/data/process_bounding_boxes.py b/inception/inception/data/process_bounding_boxes.py old mode 100755 new mode 100644 diff --git a/inception/inception/image_processing.py b/inception/inception/image_processing.py index 6d8b992ea6..4e3a57f0b2 100644 --- a/inception/inception/image_processing.py +++ b/inception/inception/image_processing.py @@ -221,7 +221,7 @@ def distort_image(image, height, width, bbox, thread_id=0, scope=None): if not thread_id: image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), bbox) - tf.image_summary('image_with_bounding_boxes', image_with_box) + tf.summary.image('image_with_bounding_boxes', image_with_box) # A large fraction of image datasets contain a human-annotated bounding # box delineating the region of the image containing the object of interest. @@ -242,7 +242,7 @@ def distort_image(image, height, width, bbox, thread_id=0, scope=None): if not thread_id: image_with_distorted_box = tf.image.draw_bounding_boxes( tf.expand_dims(image, 0), distort_bbox) - tf.image_summary('images_with_distorted_bounding_box', + tf.summary.image('images_with_distorted_bounding_box', image_with_distorted_box) # Crop the image to the specified bounding box. @@ -259,7 +259,7 @@ def distort_image(image, height, width, bbox, thread_id=0, scope=None): # the third dimension. distorted_image.set_shape([height, width, 3]) if not thread_id: - tf.image_summary('cropped_resized_image', + tf.summary.image('cropped_resized_image', tf.expand_dims(distorted_image, 0)) # Randomly flip the image horizontally. @@ -269,7 +269,7 @@ def distort_image(image, height, width, bbox, thread_id=0, scope=None): distorted_image = distort_color(distorted_image, thread_id) if not thread_id: - tf.image_summary('final_distorted_image', + tf.summary.image('final_distorted_image', tf.expand_dims(distorted_image, 0)) return distorted_image @@ -328,8 +328,8 @@ def image_preprocessing(image_buffer, bbox, train, thread_id=0): image = eval_image(image, height, width) # Finally, rescale to [-1,1] instead of [0, 1) - image = tf.sub(image, 0.5) - image = tf.mul(image, 2.0) + image = tf.subtract(image, 0.5) + image = tf.multiply(image, 2.0) return image @@ -394,7 +394,7 @@ def parse_example_proto(example_serialized): ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) # Note that we impose an ordering of (y, x) just to make life difficult. - bbox = tf.concat(0, [ymin, xmin, ymax, xmax]) + bbox = tf.concat(axis=0, values=[ymin, xmin, ymax, xmax]) # Force the variable number of bounding boxes into the shape # [1, num_boxes, coords]. @@ -505,6 +505,6 @@ def batch_inputs(dataset, batch_size, train, num_preprocess_threads=None, images = tf.reshape(images, shape=[batch_size, height, width, depth]) # Display the training images in the visualizer. 
- tf.image_summary('images', images) + tf.summary.image('images', images) return images, tf.reshape(label_index_batch, [batch_size]) diff --git a/inception/inception/inception_distributed_train.py b/inception/inception/inception_distributed_train.py index 0cfbd97ee8..75e39211ab 100644 --- a/inception/inception/inception_distributed_train.py +++ b/inception/inception/inception_distributed_train.py @@ -133,7 +133,7 @@ def train(target, dataset, cluster_spec): FLAGS.learning_rate_decay_factor, staircase=True) # Add a summary to track the learning rate. - tf.scalar_summary('learning_rate', lr) + tf.summary.scalar('learning_rate', lr) # Create an optimizer that performs gradient descent. opt = tf.train.RMSPropOptimizer(lr, @@ -171,8 +171,8 @@ def train(target, dataset, cluster_spec): loss_name = l.op.name # Name each loss as '(raw)' and name the moving average version of the # loss as the original loss name. - tf.scalar_summary(loss_name + ' (raw)', l) - tf.scalar_summary(loss_name, loss_averages.average(l)) + tf.summary.scalar(loss_name + ' (raw)', l) + tf.summary.scalar(loss_name, loss_averages.average(l)) # Add dependency to compute loss_averages. with tf.control_dependencies([loss_averages_op]): @@ -191,7 +191,7 @@ def train(target, dataset, cluster_spec): # Add histograms for model variables. for var in variables_to_average: - tf.histogram_summary(var.op.name, var) + tf.summary.histogram(var.op.name, var) # Create synchronous replica optimizer. opt = tf.train.SyncReplicasOptimizer( @@ -215,7 +215,7 @@ def train(target, dataset, cluster_spec): # Add histograms for gradients. for grad, var in grads: if grad is not None: - tf.histogram_summary(var.op.name + '/gradients', grad) + tf.summary.histogram(var.op.name + '/gradients', grad) apply_gradients_op = opt.apply_gradients(grads, global_step=global_step) @@ -233,7 +233,7 @@ def train(target, dataset, cluster_spec): saver = tf.train.Saver() # Build the summary operation based on the TF collection of Summaries. - summary_op = tf.merge_all_summaries() + summary_op = tf.summary.merge_all() # Build an initialization operation to run below. init_op = tf.global_variables_initializer() diff --git a/inception/inception/inception_eval.py b/inception/inception/inception_eval.py index b91b2f9f05..a61e9f4258 100644 --- a/inception/inception/inception_eval.py +++ b/inception/inception/inception_eval.py @@ -158,10 +158,10 @@ def evaluate(dataset): saver = tf.train.Saver(variables_to_restore) # Build the summary operation based on the TF collection of Summaries. - summary_op = tf.merge_all_summaries() + summary_op = tf.summary.merge_all() graph_def = tf.get_default_graph().as_graph_def() - summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, + summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, graph_def=graph_def) while True: diff --git a/inception/inception/inception_model.py b/inception/inception/inception_model.py index b15615dd8b..fedae13ae7 100644 --- a/inception/inception/inception_model.py +++ b/inception/inception/inception_model.py @@ -115,7 +115,7 @@ def loss(logits, labels, batch_size=None): # shape [FLAGS.batch_size, num_classes]. 
sparse_labels = tf.reshape(labels, [batch_size, 1]) indices = tf.reshape(tf.range(batch_size), [batch_size, 1]) - concated = tf.concat(1, [indices, sparse_labels]) + concated = tf.concat(axis=1, values=[indices, sparse_labels]) num_classes = logits[0].get_shape()[-1].value dense_labels = tf.sparse_to_dense(concated, [batch_size, num_classes], @@ -147,8 +147,8 @@ def _activation_summary(x): # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training # session. This helps the clarity of presentation on tensorboard. tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name) - tf.contrib.deprecated.histogram_summary(tensor_name + '/activations', x) - tf.contrib.deprecated.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x)) + tf.summary.histogram(tensor_name + '/activations', x) + tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x)) def _activation_summaries(endpoints): diff --git a/inception/inception/inception_train.py b/inception/inception/inception_train.py index 3794184d2e..886cc8a1ae 100644 --- a/inception/inception/inception_train.py +++ b/inception/inception/inception_train.py @@ -132,8 +132,8 @@ def _tower_loss(images, labels, num_classes, scope, reuse_variables=None): loss_name = re.sub('%s_[0-9]*/' % inception.TOWER_NAME, '', l.op.name) # Name each loss as '(raw)' and name the moving average version of the loss # as the original loss name. - tf.scalar_summary(loss_name +' (raw)', l) - tf.scalar_summary(loss_name, loss_averages.average(l)) + tf.summary.scalar(loss_name +' (raw)', l) + tf.summary.scalar(loss_name, loss_averages.average(l)) with tf.control_dependencies([loss_averages_op]): total_loss = tf.identity(total_loss) @@ -166,7 +166,7 @@ def _average_gradients(tower_grads): grads.append(expanded_g) # Average over the 'tower' dimension. - grad = tf.concat(0, grads) + grad = tf.concat(axis=0, values=grads) grad = tf.reduce_mean(grad, 0) # Keep in mind that the Variables are redundant because they are shared @@ -223,8 +223,8 @@ def train(dataset): num_classes = dataset.num_classes() + 1 # Split the batch of images and labels for towers. - images_splits = tf.split(0, FLAGS.num_gpus, images) - labels_splits = tf.split(0, FLAGS.num_gpus, labels) + images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images) + labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels) # Calculate the gradients for each model tower. tower_grads = [] @@ -268,20 +268,20 @@ def train(dataset): summaries.extend(input_summaries) # Add a summary to track the learning rate. - summaries.append(tf.scalar_summary('learning_rate', lr)) + summaries.append(tf.summary.scalar('learning_rate', lr)) # Add histograms for gradients. for grad, var in grads: if grad is not None: summaries.append( - tf.histogram_summary(var.op.name + '/gradients', grad)) + tf.summary.histogram(var.op.name + '/gradients', grad)) # Apply the gradients to adjust the shared variables. apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Add histograms for trainable variables. for var in tf.trainable_variables(): - summaries.append(tf.histogram_summary(var.op.name, var)) + summaries.append(tf.summary.histogram(var.op.name, var)) # Track the moving averages of all trainable variables. # Note that we maintain a "double-average" of the BatchNormalization @@ -301,10 +301,10 @@ def train(dataset): batchnorm_updates_op) # Create a saver. 
- saver = tf.train.Saver(tf.all_variables()) + saver = tf.train.Saver(tf.global_variables()) # Build the summary operation from the last tower summaries. - summary_op = tf.merge_summary(summaries) + summary_op = tf.summary.merge(summaries) # Build an initialization operation to run below. init = tf.global_variables_initializer() @@ -329,7 +329,7 @@ def train(dataset): # Start the queue runners. tf.train.start_queue_runners(sess=sess) - summary_writer = tf.train.SummaryWriter( + summary_writer = tf.summary.FileWriter( FLAGS.train_dir, graph_def=sess.graph.as_graph_def(add_shapes=True)) diff --git a/inception/inception/slim/inception_model.py b/inception/inception/slim/inception_model.py index e42a7be75b..00a5351302 100644 --- a/inception/inception/slim/inception_model.py +++ b/inception/inception/slim/inception_model.py @@ -122,7 +122,7 @@ def inception_v3(inputs, with tf.variable_scope('branch_pool'): branch_pool = ops.avg_pool(net, [3, 3]) branch_pool = ops.conv2d(branch_pool, 32, [1, 1]) - net = tf.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch1x1, branch5x5, branch3x3dbl, branch_pool]) end_points['mixed_35x35x256a'] = net # mixed_1: 35 x 35 x 288. with tf.variable_scope('mixed_35x35x288a'): @@ -138,7 +138,7 @@ def inception_v3(inputs, with tf.variable_scope('branch_pool'): branch_pool = ops.avg_pool(net, [3, 3]) branch_pool = ops.conv2d(branch_pool, 64, [1, 1]) - net = tf.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch1x1, branch5x5, branch3x3dbl, branch_pool]) end_points['mixed_35x35x288a'] = net # mixed_2: 35 x 35 x 288. with tf.variable_scope('mixed_35x35x288b'): @@ -154,7 +154,7 @@ def inception_v3(inputs, with tf.variable_scope('branch_pool'): branch_pool = ops.avg_pool(net, [3, 3]) branch_pool = ops.conv2d(branch_pool, 64, [1, 1]) - net = tf.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch1x1, branch5x5, branch3x3dbl, branch_pool]) end_points['mixed_35x35x288b'] = net # mixed_3: 17 x 17 x 768. with tf.variable_scope('mixed_17x17x768a'): @@ -167,7 +167,7 @@ def inception_v3(inputs, stride=2, padding='VALID') with tf.variable_scope('branch_pool'): branch_pool = ops.max_pool(net, [3, 3], stride=2, padding='VALID') - net = tf.concat([branch3x3, branch3x3dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch3x3, branch3x3dbl, branch_pool]) end_points['mixed_17x17x768a'] = net # mixed4: 17 x 17 x 768. with tf.variable_scope('mixed_17x17x768b'): @@ -186,7 +186,7 @@ def inception_v3(inputs, with tf.variable_scope('branch_pool'): branch_pool = ops.avg_pool(net, [3, 3]) branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool]) end_points['mixed_17x17x768b'] = net # mixed_5: 17 x 17 x 768. with tf.variable_scope('mixed_17x17x768c'): @@ -205,7 +205,7 @@ def inception_v3(inputs, with tf.variable_scope('branch_pool'): branch_pool = ops.avg_pool(net, [3, 3]) branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool]) end_points['mixed_17x17x768c'] = net # mixed_6: 17 x 17 x 768. 
with tf.variable_scope('mixed_17x17x768d'): @@ -224,7 +224,7 @@ def inception_v3(inputs, with tf.variable_scope('branch_pool'): branch_pool = ops.avg_pool(net, [3, 3]) branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool]) end_points['mixed_17x17x768d'] = net # mixed_7: 17 x 17 x 768. with tf.variable_scope('mixed_17x17x768e'): @@ -243,7 +243,7 @@ def inception_v3(inputs, with tf.variable_scope('branch_pool'): branch_pool = ops.avg_pool(net, [3, 3]) branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool]) end_points['mixed_17x17x768e'] = net # Auxiliary Head logits aux_logits = tf.identity(end_points['mixed_17x17x768e']) @@ -276,7 +276,7 @@ def inception_v3(inputs, stride=2, padding='VALID') with tf.variable_scope('branch_pool'): branch_pool = ops.max_pool(net, [3, 3], stride=2, padding='VALID') - net = tf.concat([branch3x3, branch7x7x3, branch_pool], 3) + net = tf.concat(axis=3, values=[branch3x3, branch7x7x3, branch_pool]) end_points['mixed_17x17x1280a'] = net # mixed_9: 8 x 8 x 2048. with tf.variable_scope('mixed_8x8x2048a'): @@ -284,17 +284,17 @@ def inception_v3(inputs, branch1x1 = ops.conv2d(net, 320, [1, 1]) with tf.variable_scope('branch3x3'): branch3x3 = ops.conv2d(net, 384, [1, 1]) - branch3x3 = tf.concat([ops.conv2d(branch3x3, 384, [1, 3]), - ops.conv2d(branch3x3, 384, [3, 1])], 3) + branch3x3 = tf.concat(axis=3, values=[ops.conv2d(branch3x3, 384, [1, 3]), + ops.conv2d(branch3x3, 384, [3, 1])]) with tf.variable_scope('branch3x3dbl'): branch3x3dbl = ops.conv2d(net, 448, [1, 1]) branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3]) - branch3x3dbl = tf.concat([ops.conv2d(branch3x3dbl, 384, [1, 3]), - ops.conv2d(branch3x3dbl, 384, [3, 1])], 3) + branch3x3dbl = tf.concat(axis=3, values=[ops.conv2d(branch3x3dbl, 384, [1, 3]), + ops.conv2d(branch3x3dbl, 384, [3, 1])]) with tf.variable_scope('branch_pool'): branch_pool = ops.avg_pool(net, [3, 3]) branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch1x1, branch3x3, branch3x3dbl, branch_pool]) end_points['mixed_8x8x2048a'] = net # mixed_10: 8 x 8 x 2048. 
with tf.variable_scope('mixed_8x8x2048b'): @@ -302,17 +302,17 @@ def inception_v3(inputs, branch1x1 = ops.conv2d(net, 320, [1, 1]) with tf.variable_scope('branch3x3'): branch3x3 = ops.conv2d(net, 384, [1, 1]) - branch3x3 = tf.concat([ops.conv2d(branch3x3, 384, [1, 3]), - ops.conv2d(branch3x3, 384, [3, 1])], 3) + branch3x3 = tf.concat(axis=3, values=[ops.conv2d(branch3x3, 384, [1, 3]), + ops.conv2d(branch3x3, 384, [3, 1])]) with tf.variable_scope('branch3x3dbl'): branch3x3dbl = ops.conv2d(net, 448, [1, 1]) branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3]) - branch3x3dbl = tf.concat([ops.conv2d(branch3x3dbl, 384, [1, 3]), - ops.conv2d(branch3x3dbl, 384, [3, 1])], 3) + branch3x3dbl = tf.concat(axis=3, values=[ops.conv2d(branch3x3dbl, 384, [1, 3]), + ops.conv2d(branch3x3dbl, 384, [3, 1])]) with tf.variable_scope('branch_pool'): branch_pool = ops.avg_pool(net, [3, 3]) branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], 3) + net = tf.concat(axis=3, values=[branch1x1, branch3x3, branch3x3dbl, branch_pool]) end_points['mixed_8x8x2048b'] = net # Final pooling and prediction with tf.variable_scope('logits'): diff --git a/inception/inception/slim/ops.py b/inception/inception/slim/ops.py index badc64a3bb..7c225f612b 100644 --- a/inception/inception/slim/ops.py +++ b/inception/inception/slim/ops.py @@ -331,9 +331,9 @@ def one_hot_encoding(labels, num_classes, scope=None): batch_size = labels.get_shape()[0] indices = tf.expand_dims(tf.range(0, batch_size), 1) labels = tf.cast(tf.expand_dims(labels, 1), indices.dtype) - concated = tf.concat([indices, labels], 1) + concated = tf.concat(axis=1, values=[indices, labels]) onehot_labels = tf.sparse_to_dense( - concated, tf.pack([batch_size, num_classes]), 1.0, 0.0) + concated, tf.stack([batch_size, num_classes]), 1.0, 0.0) onehot_labels.set_shape([batch_size, num_classes]) return onehot_labels diff --git a/inception/inception/slim/variables.py b/inception/inception/slim/variables.py index 03f2c83e27..1d967b79e9 100644 --- a/inception/inception/slim/variables.py +++ b/inception/inception/slim/variables.py @@ -240,7 +240,7 @@ def global_step(device=''): # Get the device for the variable. 
with tf.device(variable_device(device, 'global_step')): return tf.get_variable('global_step', shape=[], dtype=tf.int64, - initializer=tf.zeros_initializer, + initializer=tf.zeros_initializer(), trainable=False, collections=collections) diff --git a/namignizer/model.py b/namignizer/model.py index 5435798a64..72c5c5ecb6 100644 --- a/namignizer/model.py +++ b/namignizer/model.py @@ -64,7 +64,7 @@ def __init__(self, is_training, config): (cell_output, state) = cell(inputs[:, time_step, :], state) outputs.append(cell_output) - output = tf.reshape(tf.concat(outputs, 1), [-1, size]) + output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, size]) softmax_w = tf.get_variable("softmax_w", [size, vocab_size]) softmax_b = tf.get_variable("softmax_b", [vocab_size]) logits = tf.matmul(output, softmax_w) + softmax_b diff --git a/neural_gpu/neural_gpu.py b/neural_gpu/neural_gpu.py index ecf85f508e..786a588a84 100644 --- a/neural_gpu/neural_gpu.py +++ b/neural_gpu/neural_gpu.py @@ -36,7 +36,7 @@ def conv_linear(args, kw, kh, nin, nout, rate, do_bias, bias_start, prefix): if len(args) == 1: arg = args[0] else: - arg = tf.concat(args, 3) + arg = tf.concat(axis=3, values=args) res = tf.nn.convolution(arg, k, dilation_rate=(rate, 1), padding="SAME") if not do_bias: return res with tf.device("/cpu:0"): @@ -71,14 +71,14 @@ def place_at14(decided, selected, it): """Place selected at it-th coordinate of decided, dim=1 of 4.""" slice1 = decided[:, :it, :, :] slice2 = decided[:, it + 1:, :, :] - return tf.concat([slice1, selected, slice2], 1) + return tf.concat(axis=1, values=[slice1, selected, slice2]) def place_at13(decided, selected, it): """Place selected at it-th coordinate of decided, dim=1 of 3.""" slice1 = decided[:, :it, :] slice2 = decided[:, it + 1:, :] - return tf.concat([slice1, selected, slice2], 1) + return tf.concat(axis=1, values=[slice1, selected, slice2]) def tanh_cutoff(x, cutoff): @@ -211,7 +211,7 @@ def reorder_beam(beam_size, batch_size, beam_val, output, is_first, # beam_val is [batch_size x beam_size]; let b = batch_size * beam_size # decided is len x b x a x b # output is b x out_size; step is b x len x a x b; - outputs = tf.split(tf.nn.log_softmax(output), beam_size, 0) + outputs = tf.split(axis=0, num_or_size_splits=beam_size, value=tf.nn.log_softmax(output)) all_beam_vals, all_beam_idx = [], [] beam_range = 1 if is_first else beam_size for i in xrange(beam_range): @@ -221,9 +221,9 @@ def reorder_beam(beam_size, batch_size, beam_val, output, is_first, cur_beam_val], "GREPO", summarize=8) all_beam_vals.append(top_out + tf.expand_dims(cur_beam_val, 1)) all_beam_idx.append(top_out_idx) - all_beam_idx = tf.reshape(tf.transpose(tf.concat(all_beam_idx, 1), [1, 0]), + all_beam_idx = tf.reshape(tf.transpose(tf.concat(axis=1, values=all_beam_idx), [1, 0]), [-1]) - top_beam, top_beam_idx = tf.nn.top_k(tf.concat(all_beam_vals, 1), k=beam_size) + top_beam, top_beam_idx = tf.nn.top_k(tf.concat(axis=1, values=all_beam_vals), k=beam_size) top_beam_idx = tf.Print(top_beam_idx, [top_beam, top_beam_idx], "GREP", summarize=8) reordered = [[] for _ in xrange(len(tensors_to_reorder) + 1)] @@ -236,8 +236,8 @@ def reorder_beam(beam_size, batch_size, beam_val, output, is_first, reordered[0].append(tf.gather(output, which_beam)) for i, t in enumerate(tensors_to_reorder): reordered[i + 1].append(tf.gather(t, which_beam)) - new_tensors = [tf.concat(t, 0) for t in reordered] - top_out_idx = tf.concat(top_out_idx, 0) + new_tensors = [tf.concat(axis=0, values=t) for t in reordered] + top_out_idx = 
tf.concat(axis=0, values=top_out_idx) return (top_beam, new_tensors[0], top_out_idx, new_tensors[1:]) @@ -266,9 +266,9 @@ def __init__(self, nmaps, vec_size, niclass, noclass, dropout, self.input = tf.placeholder(tf.int32, name="inp") self.target = tf.placeholder(tf.int32, name="tgt") self.prev_step = tf.placeholder(tf.float32, name="prev_step") - gpu_input = tf.split(self.input, num_gpus, 0) - gpu_target = tf.split(self.target, num_gpus, 0) - gpu_prev_step = tf.split(self.prev_step, num_gpus, 0) + gpu_input = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.input) + gpu_target = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.target) + gpu_prev_step = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.prev_step) batch_size = tf.shape(gpu_input[0])[0] if backward: @@ -410,7 +410,7 @@ def dec_step(step, it, it_int, decided, output_ta, tgts, out_write = output_ta.write(it, output_l[:batch_size, :, :, :]) output = tf.gather(target_emb_weights, out) output = tf.reshape(output, [-1, 1, nmaps]) - output = tf.concat([output] * height, 1) + output = tf.concat(axis=1, values=[output] * height) tgt = tgts[it, :, :, :] selected = tf.cond(tf.less(tf.random_uniform([]), self.sampling), lambda: output, lambda: tgt) @@ -419,7 +419,7 @@ def dec_step(step, it, it_int, decided, output_ta, tgts, out_idx = place_at13( out_idx, tf.reshape(out, [beam_size * batch_size, 1, 1]), it) if mem_size > 0: - mem = tf.concat([mem] * height, 2) + mem = tf.concat(axis=2, values=[mem] * height) dec_write = place_at14(dec_write, mem, it_incr) return (step, dec_write, out_write, mloss + mem_loss, nupd_in + nupd, out_idx, beam_cost) @@ -459,7 +459,7 @@ def dec_step(step, it, it_int, decided, output_ta, tgts, gpu_targets_tn) embedded_targets_tn = tf.transpose( embedded_targets_tn, [2, 0, 1, 3]) # len x b x 1 x nmaps - embedded_targets_tn = tf.concat([embedded_targets_tn] * height, 2) + embedded_targets_tn = tf.concat(axis=2, values=[embedded_targets_tn] * height) # First image comes from start by applying convolution and adding 0s. start = tf.transpose(start, [0, 2, 1, 3]) # Now b x len x h x vec_s @@ -505,7 +505,7 @@ def decoder_loop_fn((state, prev_cell_out, _), (cell_inp, cur_tgt)): attn_res = attention_query(attn_q, tf.get_variable( "attn_v", [height * nmaps], initializer=tf.random_uniform_initializer(-0.1, 0.1))) - concatenated = tf.reshape(tf.concat([cell_inp, attn_res], 1), + concatenated = tf.reshape(tf.concat(axis=1, values=[cell_inp, attn_res]), [batch_size, 2 * height * nmaps]) cell_inp = tf.layers.dense( concatenated, height * nmaps, name="attn_merge") @@ -519,14 +519,14 @@ def decoder_loop_fn((state, prev_cell_out, _), (cell_inp, cur_tgt)): res = tf.gather(target_emb_weights, res) res *= tf.expand_dims(mask[:, 0], 1) output = tf.layers.dense( - tf.concat([output, res], 1), height * nmaps, name="rnnmem") + tf.concat(axis=1, values=[output, res]), height * nmaps, name="rnnmem") return new_state, output, mem_loss # pylint: enable=cell-var-from-loop gpu_targets = tf.squeeze(gpu_target[gpu], [1]) # b x len gpu_tgt_trans = tf.transpose(gpu_targets, [1, 0]) dec_zero = tf.zeros([batch_size, 1], dtype=tf.int32) - dec_inp = tf.concat([dec_zero, gpu_targets], 1) + dec_inp = tf.concat(axis=1, values=[dec_zero, gpu_targets]) dec_inp = dec_inp[:, :length] embedded_dec_inp = tf.gather(target_emb_weights, dec_inp) embedded_dec_inp_proj = tf.layers.dense( @@ -573,9 +573,9 @@ def enc_step_lambda(i, step): height, vec_size]) # Prepare for beam search. 
- tgts = tf.concat([embedded_targets_tn] * beam_size, 1) + tgts = tf.concat(axis=1, values=[embedded_targets_tn] * beam_size) beam_cost = tf.zeros([batch_size, beam_size]) - step = tf.concat([step] * beam_size, 0) + step = tf.concat(axis=0, values=[step] * beam_size) # First step hard-coded. step, decided_t, output_ta, mem_loss, nupd, oi, bc = dec_step( step, 0, 0, decided_t, output_ta, tgts, 0.0, 0, out_idx, @@ -654,7 +654,7 @@ def step_lambda(i, step, dec_t, out_ta, ml, nu, oi, bc): % (gpu, time.time() - start_time)) self.updates = [] - self.after_enc_step = tf.concat(self.after_enc_step, 0) # Concat GPUs. + self.after_enc_step = tf.concat(axis=0, values=self.after_enc_step) # Concat GPUs. if backward: tf.get_variable_scope()._reuse = False tf.get_variable_scope().set_caching_device(None) @@ -667,10 +667,10 @@ def step_lambda(i, step, dec_t, out_ta, ml, nu, oi, bc): self.losses = [gpu_avg([gpu_losses[g][i] for g in xrange(num_gpus)]) for i in xrange(len(gpu_losses[0]))] - self.out_idx = tf.concat(gpu_out_idx, 0) + self.out_idx = tf.concat(axis=0, values=gpu_out_idx) self.grad_norms = [gpu_avg([gpu_grad_norms[g][i] for g in xrange(num_gpus)]) for i in xrange(len(gpu_grad_norms[0]))] - self.outputs = [tf.concat([gpu_outputs[g] for g in xrange(num_gpus)], 1)] + self.outputs = [tf.concat(axis=1, values=[gpu_outputs[g] for g in xrange(num_gpus)])] self.quantize_op = quantize_weights_op(512, 8) if backward: self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10) diff --git a/neural_programmer/data_utils.py b/neural_programmer/data_utils.py old mode 100755 new mode 100644 diff --git a/neural_programmer/model.py b/neural_programmer/model.py old mode 100755 new mode 100644 index 8c06c4f1f5..59e9a92163 --- a/neural_programmer/model.py +++ b/neural_programmer/model.py @@ -121,14 +121,14 @@ def LSTM_question_embedding(self, sentence, sentence_length): if (self.utility.FLAGS.rnn_dropout > 0.0): question_hidden = question_hidden * rnn_dropout_mask hidden_vectors.append(tf.expand_dims(question_hidden, 0)) - hidden_vectors = tf.concat(0, hidden_vectors) + hidden_vectors = tf.concat(axis=0, values=hidden_vectors) return question_hidden, hidden_vectors def history_recurrent_step(self, curr_hprev, hprev): #A single RNN step for controller or history RNN return tf.tanh( tf.matmul( - tf.concat(1, [hprev, curr_hprev]), self.params[ + tf.concat(axis=1, values=[hprev, curr_hprev]), self.params[ "history_recurrent"])) + self.params["history_recurrent_bias"] def question_number_softmax(self, hidden_vectors): @@ -150,13 +150,13 @@ def compute_ans(op_embedding, comparison): tf.expand_dims( tf.transpose(self.batch_ordinal_question_one), 2 ), [1, 1, self.utility.FLAGS.embedding_dims]), 0)))) - question_number_softmax = tf.nn.softmax(tf.concat(1, [first, second])) + question_number_softmax = tf.nn.softmax(tf.concat(axis=1, values=[first, second])) if (self.mode == "test"): cond = tf.equal(question_number_softmax, tf.reshape( tf.reduce_max(question_number_softmax, 1), [self.batch_size, 1])) - question_number_softmax = tf.select( + question_number_softmax = tf.where( cond, tf.fill(tf.shape(question_number_softmax), 1.0), tf.fill(tf.shape(question_number_softmax), 0.0)) @@ -164,7 +164,7 @@ def compute_ans(op_embedding, comparison): self.data_type) ans = tf.reshape( tf.reduce_sum(question_number_softmax * tf.concat( - 1, [self.batch_question_number, self.batch_question_number_one]), + axis=1, values=[self.batch_question_number, self.batch_question_number_one]), 1), [self.batch_size, 1]) return ans @@ -225,7 
+225,7 @@ def compute_column_softmax(self, column_controller_vector, time_step): column_controller_vector = nn_utils.apply_dropout( column_controller_vector, self.utility.FLAGS.dropout, self.mode) self.full_column_hidden_vectors = tf.concat( - 1, [self.column_hidden_vectors, self.word_column_hidden_vectors]) + axis=1, values=[self.column_hidden_vectors, self.word_column_hidden_vectors]) self.full_column_hidden_vectors += self.summary_text_entry_embeddings self.full_column_hidden_vectors = nn_utils.apply_dropout( self.full_column_hidden_vectors, self.utility.FLAGS.dropout, self.mode) @@ -258,7 +258,7 @@ def compute_first_or_last(self, select, first=True): temp_ans.append(curr_prob) else: temp_ans.append(tf.zeros_like(curr_prob)) - temp_ans = tf.transpose(tf.concat(0, temp_ans)) + temp_ans = tf.transpose(tf.concat(axis=0, values=temp_ans)) answer += temp_ans return answer @@ -266,7 +266,7 @@ def make_hard_softmax(self, softmax): #converts soft selection to hard selection. used at test time cond = tf.equal( softmax, tf.reshape(tf.reduce_max(softmax, 1), [self.batch_size, 1])) - softmax = tf.select( + softmax = tf.where( cond, tf.fill(tf.shape(softmax), 1.0), tf.fill(tf.shape(softmax), 0.0)) softmax = tf.cast(softmax, self.data_type) return softmax @@ -297,7 +297,7 @@ def compute_max_or_min(self, select, maxi=True): curr_prob = curr_prob * tf.expand_dims((1 - sum_prob), 2) curr_prob = curr_prob * tf.expand_dims( tf.cast((1 - sum_prob) > 0.0, self.data_type), 2) - answer = tf.select(select_mask, curr_prob, answer) + answer = tf.where(select_mask, curr_prob, answer) sum_prob += tf.reduce_sum(curr_prob, 2) return answer @@ -335,11 +335,11 @@ def perform_operations(self, softmax, full_column_softmax, select, 1) #BS * max_elements select_min = tf.reduce_sum(init_min * select_full_column_softmax, 1) #BS * max_elements - select_prev = tf.concat(1, [ + select_prev = tf.concat(axis=1, values=[ tf.slice(select, [0, 1], [self.batch_size, self.max_elements - 1]), tf.cast(tf.zeros([self.batch_size, 1]), self.data_type) ]) - select_next = tf.concat(1, [ + select_next = tf.concat(axis=1, values=[ tf.cast(tf.zeros([self.batch_size, 1]), self.data_type), tf.slice( select, [0, 0], [self.batch_size, self.max_elements - 1]) ]) @@ -352,11 +352,11 @@ def perform_operations(self, softmax, full_column_softmax, select, length_content = 1 length_select = 13 length_print = 1 - values = tf.concat(1, [count]) + values = tf.concat(axis=1, values=[count]) softmax_content = tf.slice(softmax, [0, 0], [self.batch_size, length_content]) #compute scalar output - output = tf.reduce_sum(tf.mul(softmax_content, values), 1) + output = tf.reduce_sum(tf.multiply(softmax_content, values), 1) #compute lookup answer softmax_print = tf.slice(softmax, [0, length_content + length_select], [self.batch_size, length_print]) @@ -384,7 +384,7 @@ def perform_operations(self, softmax, full_column_softmax, select, ] select = tf.reduce_sum( tf.tile(tf.expand_dims(softmax_select, 2), [1, 1, self.max_elements]) * - tf.concat(1, select_lists), 1) + tf.concat(axis=1, values=select_lists), 1) select = select * self.select_whole_mask return output, select @@ -396,11 +396,11 @@ def one_pass(self, select, question_embedding, hidden_vectors, hprev, self.batch_question_attention_mask) #batch_size * embedding_dims controller_vector = tf.nn.relu( tf.matmul(hprev, self.params["controller_prev"]) + tf.matmul( - tf.concat(1, [question_embedding, attention_vector]), self.params[ + tf.concat(axis=1, values=[question_embedding, attention_vector]), self.params[ 
"controller"])) column_controller_vector = tf.nn.relu( tf.matmul(hprev, self.params["column_controller_prev"]) + tf.matmul( - tf.concat(1, [question_embedding, attention_vector]), self.params[ + tf.concat(axis=1, values=[question_embedding, attention_vector]), self.params[ "column_controller"])) controller_vector = nn_utils.apply_dropout( controller_vector, self.utility.FLAGS.dropout, self.mode) @@ -413,7 +413,7 @@ def one_pass(self, select, question_embedding, hidden_vectors, hprev, tf.matmul(tf.transpose(self.params_unit), tf.transpose(softmax))) column_controller_vector = tf.nn.relu( tf.matmul( - tf.concat(1, [ + tf.concat(axis=1, values=[ column_controller_vector, weighted_op_representation ]), self.params["break_conditional"])) full_column_softmax = self.compute_column_softmax(column_controller_vector, @@ -429,7 +429,7 @@ def one_pass(self, select, question_embedding, hidden_vectors, hprev, def compute_lookup_error(self, val): #computes lookup error. cond = tf.equal(self.batch_print_answer, val) - inter = tf.select( + inter = tf.where( cond, self.init_print_error, tf.tile( tf.reshape(tf.constant(1e10, self.data_type), [1, 1, 1]), [ @@ -450,12 +450,12 @@ def soft_min(self, x, y): def error_computation(self): #computes the error of each example in a batch - math_error = 0.5 * tf.square(tf.sub(self.scalar_output, self.batch_answer)) + math_error = 0.5 * tf.square(tf.subtract(self.scalar_output, self.batch_answer)) #scale math error math_error = math_error / self.rows math_error = tf.minimum(math_error, self.utility.FLAGS.max_math_error * tf.ones(tf.shape(math_error), self.data_type)) - self.init_print_error = tf.select( + self.init_print_error = tf.where( self.batch_gold_select, -1 * tf.log(self.batch_lookup_answer + 1e-300 + self.invert_select_full_mask), -1 * tf.log(1 - self.batch_lookup_answer)) * self.select_full_mask @@ -466,24 +466,24 @@ def error_computation(self): print_error += self.compute_lookup_error(val + 0.0) print_error = print_error * self.utility.FLAGS.print_cost / self.num_entries if (self.mode == "train"): - error = tf.select( + error = tf.where( tf.logical_and( tf.not_equal(self.batch_answer, 0.0), tf.not_equal( tf.reduce_sum(tf.reduce_sum(self.batch_print_answer, 1), 1), 0.0)), self.soft_min(math_error, print_error), - tf.select( + tf.where( tf.not_equal(self.batch_answer, 0.0), math_error, print_error)) else: - error = tf.select( + error = tf.where( tf.logical_and( tf.equal(self.scalar_output, 0.0), tf.equal( tf.reduce_sum(tf.reduce_sum(self.batch_lookup_answer, 1), 1), 0.0)), tf.ones_like(math_error), - tf.select( + tf.where( tf.equal(self.scalar_output, 0.0), print_error, math_error)) return error @@ -558,7 +558,7 @@ def batch_process(self): input_col = tf.reduce_sum( tf.expand_dims(soft_column_softmax, 2) * self.full_column_hidden_vectors, 1) - history_input = tf.concat(1, [input_op, input_col]) + history_input = tf.concat(axis=1, values=[input_op, input_col]) history_input = nn_utils.apply_dropout( history_input, self.utility.FLAGS.dropout, self.mode) hprev = self.history_recurrent_step(history_input, hprev) @@ -567,7 +567,7 @@ def batch_process(self): self.scalar_output = output error = self.error_computation() cond = tf.less(error, 0.0001, name="cond") - correct_add = tf.select( + correct_add = tf.where( cond, tf.fill(tf.shape(cond), 1.0), tf.fill(tf.shape(cond), 0.0)) correct = tf.reduce_sum(correct_add) error = error / batch_size @@ -579,11 +579,11 @@ def compute_error(self): #Sets mask variables and performs batch processing self.batch_gold_select = 
self.batch_print_answer > 0.0 self.full_column_mask = tf.concat( - 1, [self.batch_number_column_mask, self.batch_word_column_mask]) + axis=1, values=[self.batch_number_column_mask, self.batch_word_column_mask]) self.full_processed_column = tf.concat( - 1, - [self.batch_processed_number_column, self.batch_processed_word_column]) - self.full_processed_sorted_index_column = tf.concat(1, [ + axis=1, + values=[self.batch_processed_number_column, self.batch_processed_word_column]) + self.full_processed_sorted_index_column = tf.concat(axis=1, values=[ self.batch_processed_sorted_index_number_column, self.batch_processed_sorted_index_word_column ]) @@ -603,7 +603,7 @@ def compute_error(self): tf.equal(self.batch_word_column_entry_mask, self.utility.dummy_token_id)), self.data_type) self.select_full_mask = tf.concat( - 1, [self.select_mask, self.select_word_mask]) + axis=1, values=[self.select_mask, self.select_word_mask]) self.select_whole_mask = tf.maximum( tf.reshape( tf.slice(self.select_mask, [0, 0, 0], @@ -614,7 +614,7 @@ def compute_error(self): [self.batch_size, 1, self.max_elements]), [self.batch_size, self.max_elements])) self.invert_select_full_mask = tf.cast( - tf.concat(1, [ + tf.concat(axis=1, values=[ tf.equal(self.batch_number_column, self.utility.FLAGS.pad_int), tf.equal(self.batch_word_column_entry_mask, self.utility.dummy_token_id) diff --git a/neural_programmer/neural_programmer.py b/neural_programmer/neural_programmer.py old mode 100755 new mode 100644 diff --git a/neural_programmer/nn_utils.py b/neural_programmer/nn_utils.py old mode 100755 new mode 100644 diff --git a/neural_programmer/parameters.py b/neural_programmer/parameters.py old mode 100755 new mode 100644 diff --git a/neural_programmer/wiki_data.py b/neural_programmer/wiki_data.py old mode 100755 new mode 100644 diff --git a/next_frame_prediction/cross_conv/model.py b/next_frame_prediction/cross_conv/model.py index d8d32392bc..927382fd2a 100644 --- a/next_frame_prediction/cross_conv/model.py +++ b/next_frame_prediction/cross_conv/model.py @@ -65,7 +65,7 @@ def Build(self): diff = diff * 2.0 - self.params['scale'] diff_output = self.diff_output * 2.0 - self.params['scale'] concat_image = tf.concat( - 1, [image, image + diff_output, image + diff, diff_output]) + axis=1, values=[image, image + diff_output, image + diff, diff_output]) tf.summary.image('origin_predict_expect_predictdiff', concat_image) self.summary_op = tf.summary.merge_all() return self.loss @@ -113,7 +113,7 @@ def _BuildMotionKernel(self): assert shape[1] == shape[2] and shape[1] == 128 batch_size = shape[0] - net = tf.concat(3, [image, diff]) + net = tf.concat(axis=3, values=[image, diff]) with tf.variable_scope('motion_encoder'): with slim.arg_scope([slim.conv2d], padding='VALID'): net = slim.conv2d(net, 96, [5, 5], stride=1) @@ -128,7 +128,7 @@ def _BuildMotionKernel(self): z = tf.reshape(net, shape=[batch_size, -1]) self.z_mean, self.z_stddev_log = tf.split( - split_dim=1, num_split=2, value=z) + axis=1, num_or_size_splits=2, value=z) self.z_stddev = tf.exp(self.z_stddev_log) epsilon = tf.random_normal( @@ -174,7 +174,7 @@ def _CrossConvHelper(self, encoded_image, kernel): def _CrossConv(self, encoded_images): """Apply the motion kernel on the encoded_images.""" cross_conved_images = [] - kernels = tf.split(split_dim=3, num_split=4, value=self.kernel) + kernels = tf.split(axis=3, num_or_size_splits=4, value=self.kernel) for (i, encoded_image) in enumerate(encoded_images): with tf.variable_scope('cross_conv_%d' % i): kernel = kernels[i] @@ -187,7 
+187,7 @@ def _CrossConv(self, encoded_images): for j in xrange(len(encoded_image)): conved_image.append(self._CrossConvHelper( encoded_image[j], kernel[j])) - cross_conved_images.append(tf.concat(0, conved_image)) + cross_conved_images.append(tf.concat(axis=0, values=conved_image)) sys.stderr.write('cross_conved shape: %s\n' % cross_conved_images[-1].get_shape()) return cross_conved_images @@ -224,7 +224,7 @@ def _BuildImageDecoder(self, cross_conved_images): nets.append(self._Deconv( cross_conved_image, 64, kernel_size=3, stride=stride)) - net = tf.concat(3, nets) + net = tf.concat(axis=3, values=nets) net = slim.conv2d(net, 128, [9, 9], padding='SAME', stride=1) net = slim.conv2d(net, 128, [1, 1], padding='SAME', stride=1) net = slim.conv2d(net, 3, [1, 1], padding='SAME', stride=1) diff --git a/next_frame_prediction/cross_conv/reader.py b/next_frame_prediction/cross_conv/reader.py index 58d69747b6..cd3cd22047 100644 --- a/next_frame_prediction/cross_conv/reader.py +++ b/next_frame_prediction/cross_conv/reader.py @@ -42,7 +42,7 @@ def SequenceToImageAndDiff(images): for i in xrange(0, len(resized_images)-1): diffs.append(resized_images[i+1] - resized_images[i]) image_diff_list.append( - (tf.concat(0, resized_images[:-1]), tf.concat(0, diffs))) + (tf.concat(axis=0, values=resized_images[:-1]), tf.concat(axis=0, values=diffs))) return image_diff_list diff --git a/real_nvp/real_nvp_multiscale_dataset.py b/real_nvp/real_nvp_multiscale_dataset.py index 8587261f9b..d2263156ca 100644 --- a/real_nvp/real_nvp_multiscale_dataset.py +++ b/real_nvp/real_nvp_multiscale_dataset.py @@ -332,7 +332,7 @@ def masked_conv_aff_coupling(input_, mask_in, dim, name, residual_blocks=residual_blocks, bottleneck=bottleneck, skip=skip) mask = tf.mod(mask_channel + mask, 2) - res = tf.split(res, 2, 3) + res = tf.split(axis=3, num_or_size_splits=2, value=res) shift, log_rescaling = res[-2], res[-1] scale = variable_on_cpu( "rescaling_scale", [], @@ -486,9 +486,9 @@ def conv_ch_aff_coupling(input_, dim, name, scope.reuse_variables() if change_bottom: - input_, canvas = tf.split(input_, 2, 3) + input_, canvas = tf.split(axis=3, num_or_size_splits=2, value=input_) else: - canvas, input_ = tf.split(input_, 2, 3) + canvas, input_ = tf.split(axis=3, num_or_size_splits=2, value=input_) shape = input_.get_shape().as_list() batch_size = shape[0] height = shape[1] @@ -509,7 +509,7 @@ def conv_ch_aff_coupling(input_, dim, name, train=train, weight_norm=weight_norm, residual_blocks=residual_blocks, bottleneck=bottleneck, skip=skip) - shift, log_rescaling = tf.split(res, 2, 3) + shift, log_rescaling = tf.split(axis=3, num_or_size_splits=2, value=res) scale = variable_on_cpu( "scale", [], tf.constant_initializer(1.)) @@ -570,9 +570,9 @@ def conv_ch_add_coupling(input_, dim, name, scope.reuse_variables() if change_bottom: - input_, canvas = tf.split(input_, 2, 3) + input_, canvas = tf.split(axis=3, num_or_size_splits=2, value=input_) else: - canvas, input_ = tf.split(input_, 2, 3) + canvas, input_ = tf.split(axis=3, num_or_size_splits=2, value=input_) shape = input_.get_shape().as_list() channels = shape[3] res = input_ @@ -736,8 +736,8 @@ def rec_masked_conv_coupling(input_, hps, scale_idx, n_scale, log_diff_1 = log_diff[:, :, :, :channels] log_diff_2 = log_diff[:, :, :, channels:] else: - res_1, res_2 = tf.split(res, 2, 3) - log_diff_1, log_diff_2 = tf.split(log_diff, 2, 3) + res_1, res_2 = tf.split(axis=3, num_or_size_splits=2, value=res) + log_diff_1, log_diff_2 = tf.split(axis=3, num_or_size_splits=2, value=log_diff) res_1, 
inc_log_diff = rec_masked_conv_coupling( input_=res_1, hps=hps, scale_idx=scale_idx + 1, n_scale=n_scale, use_batch_norm=use_batch_norm, weight_norm=weight_norm, @@ -798,8 +798,8 @@ def rec_masked_deconv_coupling(input_, hps, scale_idx, n_scale, log_diff_1 = log_diff[:, :, :, :channels] log_diff_2 = log_diff[:, :, :, channels:] else: - res_1, res_2 = tf.split(res, 2, 3) - log_diff_1, log_diff_2 = tf.split(log_diff, 2, 3) + res_1, res_2 = tf.split(axis=3, num_or_size_splits=2, value=res) + log_diff_1, log_diff_2 = tf.split(axis=3, num_or_size_splits=2, value=log_diff) res_1, log_diff_1 = rec_masked_deconv_coupling( input_=res_1, hps=hps, scale_idx=scale_idx + 1, n_scale=n_scale, @@ -1305,7 +1305,7 @@ def __init__(self, hps, sampling=False): z_lost = z_complete for scale_idx in xrange(hps.n_scale - 1): z_lost = squeeze_2x2_ordered(z_lost) - z_lost, _ = tf.split(z_lost, 2, 3) + z_lost, _ = tf.split(axis=3, num_or_size_splits=2, value=z_lost) z_compressed = z_lost z_noisy = z_lost for _ in xrange(scale_idx + 1): diff --git a/real_nvp/real_nvp_utils.py b/real_nvp/real_nvp_utils.py index 203ca35ec4..004ef62ca6 100644 --- a/real_nvp/real_nvp_utils.py +++ b/real_nvp/real_nvp_utils.py @@ -99,8 +99,8 @@ def conv_layer(input_, filter_size[1] - input_.get_shape().as_list()[2], input_.get_shape().as_list()[3] ]) - res = tf.concat(1, [pad_1, res]) - res = tf.concat(2, [pad_2, res]) + res = tf.concat(axis=1, values=[pad_1, res]) + res = tf.concat(axis=2, values=[pad_2, res]) res = tf.nn.conv2d( input=res, filter=weights, @@ -139,8 +139,8 @@ def depool_2x2(input_, stride=2): channels = shape[3] res = tf.reshape(input_, [batch_size, height, 1, width, 1, channels]) res = tf.concat( - 2, [res, tf.zeros([batch_size, height, stride - 1, width, 1, channels])]) - res = tf.concat(4, [ + axis=2, values=[res, tf.zeros([batch_size, height, stride - 1, width, 1, channels])]) + res = tf.concat(axis=4, values=[ res, tf.zeros([batch_size, height, stride, width, stride - 1, channels]) ]) res = tf.reshape(res, [batch_size, stride * height, stride * width, channels]) @@ -158,11 +158,11 @@ def batch_random_flip(input_): height = shape[1] width = shape[2] channels = shape[3] - res = tf.split(0, batch_size, input_) + res = tf.split(axis=0, num_or_size_splits=batch_size, value=input_) res = [elem[0, :, :, :] for elem in res] res = [tf.image.random_flip_left_right(elem) for elem in res] res = [tf.reshape(elem, [1, height, width, channels]) for elem in res] - res = tf.concat(0, res) + res = tf.concat(axis=0, values=res) return res @@ -175,7 +175,7 @@ def as_one_hot(input_, n_indices): n_elem = numpy.prod(shape) indices = tf.range(n_elem) indices = tf.cast(indices, tf.int64) - indices_input = tf.concat(0, [indices, tf.reshape(input_, [-1])]) + indices_input = tf.concat(axis=0, values=[indices, tf.reshape(input_, [-1])]) indices_input = tf.reshape(indices_input, [2, -1]) indices_input = tf.transpose(indices_input) res = tf.sparse_to_dense( diff --git a/slim/deployment/model_deploy.py b/slim/deployment/model_deploy.py index 8e56e24c7a..1ad69a469a 100644 --- a/slim/deployment/model_deploy.py +++ b/slim/deployment/model_deploy.py @@ -232,10 +232,8 @@ def _gather_clone_loss(clone, num_clones, regularization_losses): sum_loss = tf.add_n(all_losses) # Add the summaries out of the clone device block. 
if clone_loss is not None: - tf.scalar_summary(clone.scope + '/clone_loss', clone_loss, - name='clone_loss') + tf.summary.scalar(clone.scope + '/clone_loss', clone_loss) if regularization_loss is not None: - tf.scalar_summary('regularization_loss', regularization_loss, - name='regularization_loss') + tf.summary.scalar('regularization_loss', regularization_loss) return sum_loss @@ -404,12 +404,11 @@ def deploy(config, if total_loss is not None: # Add total_loss to summary. - summaries.add(tf.scalar_summary('total_loss', total_loss, - name='total_loss')) + summaries.add(tf.summary.scalar('total_loss', total_loss)) if summaries: # Merge all summaries together. - summary_op = tf.merge_summary(list(summaries), name='summary_op') + summary_op = tf.summary.merge(list(summaries), name='summary_op') else: summary_op = None @@ -467,9 +467,9 @@ def _add_gradients_summaries(grads_and_vars): grad_values = grad.values else: grad_values = grad - summaries.append(tf.histogram_summary(var.op.name + ':gradient', + summaries.append(tf.summary.histogram(var.op.name + ':gradient', grad_values)) - summaries.append(tf.histogram_summary(var.op.name + ':gradient_norm', + summaries.append(tf.summary.histogram(var.op.name + ':gradient_norm', tf.global_norm([grad_values]))) else: tf.logging.info('Var %s has no gradient', var.op.name) diff --git a/slim/eval_image_classifier.py b/slim/eval_image_classifier.py index e5b923e1d0..f4d3f6adbb 100644 --- a/slim/eval_image_classifier.py +++ b/slim/eval_image_classifier.py @@ -160,7 +160,7 @@ def main(_): # Print the summaries to screen. for name, value in names_to_values.iteritems(): summary_name = 'eval/%s' % name - op = tf.scalar_summary(summary_name, value, collections=[]) + op = tf.summary.scalar(summary_name, value, collections=[]) op = tf.Print(op, [value], summary_name) tf.add_to_collection(tf.GraphKeys.SUMMARIES, op) diff --git a/slim/nets/alexnet.py b/slim/nets/alexnet.py index a6b93de054..4e7e563cd1 100644 --- a/slim/nets/alexnet.py +++ b/slim/nets/alexnet.py @@ -113,7 +113,7 @@ def alexnet_v2(inputs, net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, - biases_initializer=tf.zeros_initializer, + biases_initializer=tf.zeros_initializer(), scope='fc8') # Convert end_points_collection into a end_point dict. 
diff --git a/slim/nets/cifarnet.py b/slim/nets/cifarnet.py index 371a9cbf28..44ca0fed2d 100644 --- a/slim/nets/cifarnet.py +++ b/slim/nets/cifarnet.py @@ -77,7 +77,7 @@ def cifarnet(images, num_classes=10, is_training=False, net = slim.fully_connected(net, 192, scope='fc4') end_points['fc4'] = net logits = slim.fully_connected(net, num_classes, - biases_initializer=tf.zeros_initializer, + biases_initializer=tf.zeros_initializer(), weights_initializer=trunc_normal(1/192.0), weights_regularizer=None, activation_fn=None, diff --git a/slim/nets/inception_v1.py b/slim/nets/inception_v1.py index 8f644796e7..e1ed3147f8 100644 --- a/slim/nets/inception_v1.py +++ b/slim/nets/inception_v1.py @@ -93,7 +93,7 @@ def inception_v1_base(inputs, with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 32, [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if final_endpoint == end_point: return net, end_points @@ -110,7 +110,7 @@ def inception_v1_base(inputs, with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if final_endpoint == end_point: return net, end_points @@ -132,7 +132,7 @@ def inception_v1_base(inputs, with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if final_endpoint == end_point: return net, end_points @@ -149,7 +149,7 @@ def inception_v1_base(inputs, with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if final_endpoint == end_point: return net, end_points @@ -166,7 +166,7 @@ def inception_v1_base(inputs, with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if final_endpoint == end_point: return net, end_points @@ -183,7 +183,7 @@ def inception_v1_base(inputs, with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if final_endpoint == end_point: return net, end_points @@ -200,7 +200,7 @@ def inception_v1_base(inputs, with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1') - net = 
tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if final_endpoint == end_point: return net, end_points @@ -222,7 +222,7 @@ def inception_v1_base(inputs, with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if final_endpoint == end_point: return net, end_points @@ -239,7 +239,7 @@ def inception_v1_base(inputs, with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if final_endpoint == end_point: return net, end_points raise ValueError('Unknown final endpoint %s' % final_endpoint) diff --git a/slim/nets/inception_v2.py b/slim/nets/inception_v2.py index 6c9f10098b..2d75ba97a0 100644 --- a/slim/nets/inception_v2.py +++ b/slim/nets/inception_v2.py @@ -145,7 +145,7 @@ def inception_v2_base(inputs, branch_3, depth(32), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 28 x 28 x 256 @@ -175,7 +175,7 @@ def inception_v2_base(inputs, branch_3, depth(64), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 28 x 28 x 320 @@ -200,7 +200,7 @@ def inception_v2_base(inputs, with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d( net, [3, 3], stride=2, scope='MaxPool_1a_3x3') - net = tf.concat(3, [branch_0, branch_1, branch_2]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 14 x 14 x 576 @@ -230,7 +230,7 @@ def inception_v2_base(inputs, branch_3, depth(128), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 14 x 14 x 576 @@ -260,7 +260,7 @@ def inception_v2_base(inputs, branch_3, depth(128), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 14 x 14 x 576 @@ -290,7 +290,7 @@ def inception_v2_base(inputs, branch_3, depth(96), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: 
return net, end_points @@ -321,7 +321,7 @@ def inception_v2_base(inputs, branch_3, depth(96), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 14 x 14 x 576 @@ -346,7 +346,7 @@ def inception_v2_base(inputs, with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d(net, [3, 3], stride=2, scope='MaxPool_1a_3x3') - net = tf.concat(3, [branch_0, branch_1, branch_2]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 7 x 7 x 1024 @@ -376,7 +376,7 @@ def inception_v2_base(inputs, branch_3, depth(128), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points @@ -407,7 +407,7 @@ def inception_v2_base(inputs, branch_3, depth(128), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points raise ValueError('Unknown final endpoint %s' % final_endpoint) diff --git a/slim/nets/inception_v3.py b/slim/nets/inception_v3.py index 5c5f96519e..2e5e5f3e2e 100644 --- a/slim/nets/inception_v3.py +++ b/slim/nets/inception_v3.py @@ -158,7 +158,7 @@ def inception_v3_base(inputs, branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, depth(32), [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points @@ -182,7 +182,7 @@ def inception_v3_base(inputs, branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points @@ -205,7 +205,7 @@ def inception_v3_base(inputs, branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points @@ -224,7 +224,7 @@ def inception_v3_base(inputs, with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='MaxPool_1a_3x3') - net = tf.concat(3, [branch_0, branch_1, branch_2]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points @@ -253,7 +253,7 @@ def inception_v3_base(inputs, branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, 
[branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points @@ -282,7 +282,7 @@ def inception_v3_base(inputs, branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # mixed_6: 17 x 17 x 768. @@ -310,7 +310,7 @@ def inception_v3_base(inputs, branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points @@ -339,7 +339,7 @@ def inception_v3_base(inputs, branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points @@ -361,7 +361,7 @@ def inception_v3_base(inputs, with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='MaxPool_1a_3x3') - net = tf.concat(3, [branch_0, branch_1, branch_2]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # mixed_9: 8 x 8 x 2048. 
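The Inception hunks here all apply the same mechanical rule: TF 1.0 swapped `tf.concat` from `(concat_dim, values)` to `(values, axis)`, so the safest rewrite names both arguments explicitly. A small self-contained sketch of the pattern, with illustrative tensors rather than code from this diff:

import tensorflow as tf

branch_a = tf.ones([1, 2, 2, 3])
branch_b = tf.ones([1, 2, 2, 5])
# Pre-1.0 form:  net = tf.concat(3, [branch_a, branch_b])
# TF 1.0 keyword form: the integer goes to axis, the tensor list to values.
net = tf.concat(axis=3, values=[branch_a, branch_b])  # shape [1, 2, 2, 8]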
@@ -371,21 +371,21 @@ def inception_v3_base(inputs, branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1') - branch_1 = tf.concat(3, [ + branch_1 = tf.concat(axis=3, values=[ slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'), slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0b_3x1')]) with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1') branch_2 = slim.conv2d( branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3') - branch_2 = tf.concat(3, [ + branch_2 = tf.concat(axis=3, values=[ slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'), slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')]) with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d( branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points @@ -396,21 +396,21 @@ def inception_v3_base(inputs, branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1') - branch_1 = tf.concat(3, [ + branch_1 = tf.concat(axis=3, values=[ slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'), slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0c_3x1')]) with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1') branch_2 = slim.conv2d( branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3') - branch_2 = tf.concat(3, [ + branch_2 = tf.concat(axis=3, values=[ slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'), slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')]) with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d( branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') - net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points raise ValueError('Unknown final endpoint %s' % final_endpoint) diff --git a/slim/nets/inception_v4.py b/slim/nets/inception_v4.py index 0c581f7c41..b706d5b71c 100644 --- a/slim/nets/inception_v4.py +++ b/slim/nets/inception_v4.py @@ -49,7 +49,7 @@ def block_inception_a(inputs, scope=None, reuse=None): with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(inputs, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 96, [1, 1], scope='Conv2d_0b_1x1') - return tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + return tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) def block_reduction_a(inputs, scope=None, reuse=None): @@ -69,7 +69,7 @@ def block_reduction_a(inputs, scope=None, reuse=None): with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d(inputs, [3, 3], stride=2, padding='VALID', scope='MaxPool_1a_3x3') - return tf.concat(3, [branch_0, branch_1, branch_2]) + return tf.concat(axis=3, values=[branch_0, branch_1, branch_2]) def block_inception_b(inputs, scope=None, reuse=None): @@ -93,7 +93,7 @@ def block_inception_b(inputs, scope=None, reuse=None): with 
tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(inputs, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1') - return tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + return tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) def block_reduction_b(inputs, scope=None, reuse=None): @@ -115,7 +115,7 @@ def block_reduction_b(inputs, scope=None, reuse=None): with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d(inputs, [3, 3], stride=2, padding='VALID', scope='MaxPool_1a_3x3') - return tf.concat(3, [branch_0, branch_1, branch_2]) + return tf.concat(axis=3, values=[branch_0, branch_1, branch_2]) def block_inception_c(inputs, scope=None, reuse=None): @@ -128,20 +128,20 @@ def block_inception_c(inputs, scope=None, reuse=None): branch_0 = slim.conv2d(inputs, 256, [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(inputs, 384, [1, 1], scope='Conv2d_0a_1x1') - branch_1 = tf.concat(3, [ + branch_1 = tf.concat(axis=3, values=[ slim.conv2d(branch_1, 256, [1, 3], scope='Conv2d_0b_1x3'), slim.conv2d(branch_1, 256, [3, 1], scope='Conv2d_0c_3x1')]) with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d(inputs, 384, [1, 1], scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, 448, [3, 1], scope='Conv2d_0b_3x1') branch_2 = slim.conv2d(branch_2, 512, [1, 3], scope='Conv2d_0c_1x3') - branch_2 = tf.concat(3, [ + branch_2 = tf.concat(axis=3, values=[ slim.conv2d(branch_2, 256, [1, 3], scope='Conv2d_0d_1x3'), slim.conv2d(branch_2, 256, [3, 1], scope='Conv2d_0e_3x1')]) with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(inputs, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 256, [1, 1], scope='Conv2d_0b_1x1') - return tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + return tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3]) def inception_v4_base(inputs, final_endpoint='Mixed_7d', scope=None): @@ -192,7 +192,7 @@ def add_and_check_final(name, net): with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(net, 96, [3, 3], stride=2, padding='VALID', scope='Conv2d_0a_3x3') - net = tf.concat(3, [branch_0, branch_1]) + net = tf.concat(axis=3, values=[branch_0, branch_1]) if add_and_check_final('Mixed_3a', net): return net, end_points # 73 x 73 x 160 @@ -207,7 +207,7 @@ def add_and_check_final(name, net): branch_1 = slim.conv2d(branch_1, 64, [7, 1], scope='Conv2d_0c_7x1') branch_1 = slim.conv2d(branch_1, 96, [3, 3], padding='VALID', scope='Conv2d_1a_3x3') - net = tf.concat(3, [branch_0, branch_1]) + net = tf.concat(axis=3, values=[branch_0, branch_1]) if add_and_check_final('Mixed_4a', net): return net, end_points # 71 x 71 x 192 @@ -218,7 +218,7 @@ def add_and_check_final(name, net): with tf.variable_scope('Branch_1'): branch_1 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='MaxPool_1a_3x3') - net = tf.concat(3, [branch_0, branch_1]) + net = tf.concat(axis=3, values=[branch_0, branch_1]) if add_and_check_final('Mixed_5a', net): return net, end_points # 35 x 35 x 384 diff --git a/slim/nets/overfeat.py b/slim/nets/overfeat.py index 0c8f45ce02..64a542523a 100644 --- a/slim/nets/overfeat.py +++ b/slim/nets/overfeat.py @@ -41,7 +41,7 @@ def overfeat_arg_scope(weight_decay=0.0005): with slim.arg_scope([slim.conv2d, slim.fully_connected], activation_fn=tf.nn.relu, weights_regularizer=slim.l2_regularizer(weight_decay), - biases_initializer=tf.zeros_initializer): + biases_initializer=tf.zeros_initializer()): with 
slim.arg_scope([slim.conv2d], padding='SAME'): with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc: return arg_sc @@ -107,7 +107,7 @@ def overfeat(inputs, net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, - biases_initializer=tf.zeros_initializer, + biases_initializer=tf.zeros_initializer(), scope='fc8') # Convert end_points_collection into a end_point dict. end_points = slim.utils.convert_collection_to_dict(end_points_collection) diff --git a/slim/nets/vgg.py b/slim/nets/vgg.py index c9a66e1bd3..7de2806220 100644 --- a/slim/nets/vgg.py +++ b/slim/nets/vgg.py @@ -58,7 +58,7 @@ def vgg_arg_scope(weight_decay=0.0005): with slim.arg_scope([slim.conv2d, slim.fully_connected], activation_fn=tf.nn.relu, weights_regularizer=slim.l2_regularizer(weight_decay), - biases_initializer=tf.zeros_initializer): + biases_initializer=tf.zeros_initializer()): with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc: return arg_sc diff --git a/slim/preprocessing/cifarnet_preprocessing.py b/slim/preprocessing/cifarnet_preprocessing.py index 1057d46288..195a5c7d00 100644 --- a/slim/preprocessing/cifarnet_preprocessing.py +++ b/slim/preprocessing/cifarnet_preprocessing.py @@ -45,7 +45,7 @@ def preprocess_for_train(image, Returns: A preprocessed image. """ - tf.image_summary('image', tf.expand_dims(image, 0)) + tf.summary.image('image', tf.expand_dims(image, 0)) # Transform the image to floats. image = tf.to_float(image) @@ -58,7 +58,7 @@ def preprocess_for_train(image, # Randomly flip the image horizontally. distorted_image = tf.image.random_flip_left_right(distorted_image) - tf.image_summary('distorted_image', tf.expand_dims(distorted_image, 0)) + tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0)) # Because these operations are not commutative, consider randomizing # the order their operation. @@ -67,7 +67,7 @@ def preprocess_for_train(image, distorted_image = tf.image.random_contrast(distorted_image, lower=0.2, upper=1.8) # Subtract off the mean and divide by the variance of the pixels. - return tf.image.per_image_whitening(distorted_image) + return tf.image.per_image_standardization(distorted_image) def preprocess_for_eval(image, output_height, output_width): @@ -81,7 +81,7 @@ def preprocess_for_eval(image, output_height, output_width): Returns: A preprocessed image. """ - tf.image_summary('image', tf.expand_dims(image, 0)) + tf.summary.image('image', tf.expand_dims(image, 0)) # Transform the image to floats. image = tf.to_float(image) @@ -89,10 +89,10 @@ def preprocess_for_eval(image, output_height, output_width): resized_image = tf.image.resize_image_with_crop_or_pad(image, output_width, output_height) - tf.image_summary('resized_image', tf.expand_dims(resized_image, 0)) + tf.summary.image('resized_image', tf.expand_dims(resized_image, 0)) # Subtract off the mean and divide by the variance of the pixels. - return tf.image.per_image_whitening(resized_image) + return tf.image.per_image_standardization(resized_image) def preprocess_image(image, output_height, output_width, is_training=False): diff --git a/slim/preprocessing/inception_preprocessing.py b/slim/preprocessing/inception_preprocessing.py index 133264b654..ca3eba0baa 100644 --- a/slim/preprocessing/inception_preprocessing.py +++ b/slim/preprocessing/inception_preprocessing.py @@ -192,7 +192,7 @@ def preprocess_for_train(image, height, width, bbox, # the coordinates are ordered [ymin, xmin, ymax, xmax]. 
image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), bbox) - tf.image_summary('image_with_bounding_boxes', image_with_box) + tf.summary.image('image_with_bounding_boxes', image_with_box) distorted_image, distorted_bbox = distorted_bounding_box_crop(image, bbox) # Restore the shape since the dynamic slice based upon the bbox_size loses @@ -200,7 +200,7 @@ def preprocess_for_train(image, height, width, bbox, distorted_image.set_shape([None, None, 3]) image_with_distorted_box = tf.image.draw_bounding_boxes( tf.expand_dims(image, 0), distorted_bbox) - tf.image_summary('images_with_distorted_bounding_box', + tf.summary.image('images_with_distorted_bounding_box', image_with_distorted_box) # This resizing operation may distort the images because the aspect @@ -215,7 +215,7 @@ def preprocess_for_train(image, height, width, bbox, lambda x, method: tf.image.resize_images(x, [height, width], method=method), num_cases=num_resize_cases) - tf.image_summary('cropped_resized_image', + tf.summary.image('cropped_resized_image', tf.expand_dims(distorted_image, 0)) # Randomly flip the image horizontally. @@ -227,10 +227,10 @@ def preprocess_for_train(image, height, width, bbox, lambda x, ordering: distort_color(x, ordering, fast_mode), num_cases=4) - tf.image_summary('final_distorted_image', + tf.summary.image('final_distorted_image', tf.expand_dims(distorted_image, 0)) - distorted_image = tf.sub(distorted_image, 0.5) - distorted_image = tf.mul(distorted_image, 2.0) + distorted_image = tf.subtract(distorted_image, 0.5) + distorted_image = tf.multiply(distorted_image, 2.0) return distorted_image @@ -270,8 +270,8 @@ def preprocess_for_eval(image, height, width, image = tf.image.resize_bilinear(image, [height, width], align_corners=False) image = tf.squeeze(image, [0]) - image = tf.sub(image, 0.5) - image = tf.mul(image, 2.0) + image = tf.subtract(image, 0.5) + image = tf.multiply(image, 2.0) return image diff --git a/slim/preprocessing/lenet_preprocessing.py b/slim/preprocessing/lenet_preprocessing.py index 22c352a291..ac5e71af88 100644 --- a/slim/preprocessing/lenet_preprocessing.py +++ b/slim/preprocessing/lenet_preprocessing.py @@ -39,6 +39,6 @@ def preprocess_image(image, output_height, output_width, is_training): image = tf.to_float(image) image = tf.image.resize_image_with_crop_or_pad( image, output_width, output_height) - image = tf.sub(image, 128.0) + image = tf.subtract(image, 128.0) image = tf.div(image, 128.0) return image diff --git a/slim/preprocessing/vgg_preprocessing.py b/slim/preprocessing/vgg_preprocessing.py index 672c7408e1..1900cae220 100644 --- a/slim/preprocessing/vgg_preprocessing.py +++ b/slim/preprocessing/vgg_preprocessing.py @@ -73,7 +73,7 @@ def _crop(image, offset_height, offset_width, crop_height, crop_width): ['Rank of image must be equal to 3.']) cropped_shape = control_flow_ops.with_dependencies( [rank_assertion], - tf.pack([crop_height, crop_width, original_shape[2]])) + tf.stack([crop_height, crop_width, original_shape[2]])) size_assertion = tf.Assert( tf.logical_and( @@ -81,7 +81,7 @@ def _crop(image, offset_height, offset_width, crop_height, crop_width): tf.greater_equal(original_shape[1], crop_width)), ['Crop size greater than the image size.']) - offsets = tf.to_int32(tf.pack([offset_height, offset_width, 0])) + offsets = tf.to_int32(tf.stack([offset_height, offset_width, 0])) # Use tf.slice instead of crop_to_bounding box as it accepts tensors to # define the crop size. 
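The `tf.pack` to `tf.stack` rename in `_crop` above is purely cosmetic: the op still joins a list of rank-R tensors into one rank-(R+1) tensor, which is what makes it convenient for building shape vectors out of scalar tensors. A minimal sketch with made-up dimensions, not code from this diff:

import tensorflow as tf

crop_height = tf.constant(224)
crop_width = tf.constant(224)
# Formerly tf.pack; stacks scalar tensors into the rank-1 shape [224, 224, 3].
cropped_shape = tf.stack([crop_height, crop_width, 3])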
@@ -227,10 +227,10 @@ def _mean_image_subtraction(image, means): if len(means) != num_channels: raise ValueError('len(means) must match the number of channels') - channels = tf.split(2, num_channels, image) + channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image) for i in range(num_channels): channels[i] -= means[i] - return tf.concat(2, channels) + return tf.concat(axis=2, values=channels) def _smallest_size_at_least(height, width, smallest_side): diff --git a/slim/train_image_classifier.py b/slim/train_image_classifier.py index 0e95c60206..b2509e503b 100644 --- a/slim/train_image_classifier.py +++ b/slim/train_image_classifier.py @@ -316,8 +316,8 @@ def _configure_optimizer(learning_rate): def _add_variables_summaries(learning_rate): summaries = [] for variable in slim.get_model_variables(): - summaries.append(tf.histogram_summary(variable.op.name, variable)) - summaries.append(tf.scalar_summary('training/Learning Rate', learning_rate)) + summaries.append(tf.summary.histogram(variable.op.name, variable)) + summaries.append(tf.summary.scalar('training/Learning Rate', learning_rate)) return summaries @@ -489,17 +489,17 @@ def clone_fn(batch_queue): end_points = clones[0].outputs for end_point in end_points: x = end_points[end_point] - summaries.add(tf.histogram_summary('activations/' + end_point, x)) - summaries.add(tf.scalar_summary('sparsity/' + end_point, + summaries.add(tf.summary.histogram('activations/' + end_point, x)) + summaries.add(tf.summary.scalar('sparsity/' + end_point, tf.nn.zero_fraction(x))) # Add summaries for losses. for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope): - summaries.add(tf.scalar_summary('losses/%s' % loss.op.name, loss)) + summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss)) # Add summaries for variables. for variable in slim.get_model_variables(): - summaries.add(tf.histogram_summary(variable.op.name, variable)) + summaries.add(tf.summary.histogram(variable.op.name, variable)) ################################# # Configure the moving averages # @@ -517,7 +517,7 @@ def clone_fn(batch_queue): with tf.device(deploy_config.optimizer_device()): learning_rate = _configure_learning_rate(dataset.num_samples, global_step) optimizer = _configure_optimizer(learning_rate) - summaries.add(tf.scalar_summary('learning_rate', learning_rate, - name='learning_rate')) + summaries.add(tf.summary.scalar('learning_rate', learning_rate)) if FLAGS.sync_replicas: @@ -543,7 +543,7 @@ def clone_fn(batch_queue): optimizer, var_list=variables_to_train) # Add total_loss to summary. - summaries.add(tf.scalar_summary('total_loss', total_loss, - name='total_loss')) + summaries.add(tf.summary.scalar('total_loss', total_loss)) # Create gradient updates. @@ -561,7 +561,7 @@ def clone_fn(batch_queue): first_clone_scope)) # Merge all summaries together.
- summary_op = tf.merge_summary(list(summaries), name='summary_op') + summary_op = tf.summary.merge(list(summaries), name='summary_op') ########################### diff --git a/street/python/nn_ops.py b/street/python/nn_ops.py index 8f41e1f914..20c3b50285 100644 --- a/street/python/nn_ops.py +++ b/street/python/nn_ops.py @@ -92,7 +92,7 @@ def rnn_helper(inp, elif direction == "backward": out = backward else: - out = tf.concat(2, [forward, backward]) + out = tf.concat(axis=2, values=[forward, backward]) return out @@ -183,7 +183,7 @@ def lstm_layer(inp, with tf.variable_scope(name): if backward: if length is None: - inp = tf.reverse(inp, [False, True, False]) + inp = tf.reverse(inp, [1]) else: inp = tf.reverse_sequence(inp, length, 1, 0) @@ -217,14 +217,14 @@ def lstm_layer(inp, batch_size = shapes.tensor_dim(inp, dim=0) num_frames = shapes.tensor_dim(inp, dim=1) - prev = tf.reshape(inp, tf.pack([batch_size * num_frames, num_prev])) + prev = tf.reshape(inp, tf.stack([batch_size * num_frames, num_prev])) if use_native_weights: with tf.variable_scope("LSTMCell"): b = tf.get_variable( "B", shape=[4 * num_nodes], - initializer=tf.zeros_initializer, + initializer=tf.zeros_initializer(), dtype=tf.float32) biases = tf.identity(b, name="biases") else: @@ -236,17 +236,17 @@ def lstm_layer(inp, biases, name="biases_reg")) prev = tf.nn.xw_plus_b(prev, w_i_m, biases) - prev = tf.reshape(prev, tf.pack([batch_size, num_frames, 4, num_nodes])) + prev = tf.reshape(prev, tf.stack([batch_size, num_frames, 4, num_nodes])) if state is None: - state = tf.fill(tf.pack([batch_size, num_nodes]), 0.0) + state = tf.fill(tf.stack([batch_size, num_nodes]), 0.0) if memory is None: - memory = tf.fill(tf.pack([batch_size, num_nodes]), 0.0) + memory = tf.fill(tf.stack([batch_size, num_nodes]), 0.0) out, _, mem = rnn.variable_lstm(prev, state, memory, w_m_m, clip=clip) if backward: if length is None: - out = tf.reverse(out, [False, True, False]) + out = tf.reverse(out, [1]) else: out = tf.reverse_sequence(out, length, 1, 0) diff --git a/street/python/vgsl_input.py b/street/python/vgsl_input.py index 17372e29ae..e4495c680a 100644 --- a/street/python/vgsl_input.py +++ b/street/python/vgsl_input.py @@ -79,7 +79,7 @@ def ImageInput(input_pattern, num_threads, shape, using_ctc, reader=None): # Give the images a nice name as well. images = tf.identity(images, name='Images') - tf.image_summary('Images', images) + tf.summary.image('Images', images) return images, heights, widths, labels, sparse_labels, truths @@ -145,6 +145,6 @@ def _ImageProcessing(image_buffer, shape): image = tf.image.decode_png(image_buffer, channels=shape.depth) image.set_shape([shape.height, shape.width, shape.depth]) image = tf.cast(image, tf.float32) - image = tf.sub(image, 128.0) - image = tf.mul(image, 1 / 100.0) + image = tf.subtract(image, 128.0) + image = tf.multiply(image, 1 / 100.0) return image diff --git a/street/python/vgsl_model.py b/street/python/vgsl_model.py index 52a1d57d93..d621549330 100644 --- a/street/python/vgsl_model.py +++ b/street/python/vgsl_model.py @@ -147,7 +147,7 @@ def Eval(train_dir, sequence_error=None) with tf.Graph().as_default(): model = InitNetwork(eval_data, model_str, 'eval', reader=reader) - sw = tf.train.SummaryWriter(eval_dir) + sw = tf.summary.FileWriter(eval_dir) while True: sess = tf.Session('') @@ -369,7 +369,7 @@ def _AddOutputs(self, prev_layer, out_dims, out_func, num_classes): if self.mode == 'train': # Setup loss for training. 
self.loss = self._AddLossFunction(logits, height_in, out_dims, out_func) - tf.scalar_summary('loss', self.loss, name='loss') + tf.summary.scalar('loss', self.loss) elif out_dims == 0: # Be sure the labels match the output, even in eval mode. self.labels = tf.slice(self.labels, [0, 0], [-1, 1]) @@ -484,7 +484,7 @@ def _AddOptimizer(self, optimizer_type): opt = tf.train.AdamOptimizer(learning_rate=learn_rate_dec) else: raise ValueError('Invalid optimizer type: ' + optimizer_type) - tf.scalar_summary('learn_rate', learn_rate_dec, name='lr_summ') + tf.summary.scalar('learn_rate', learn_rate_dec) self.train_op = opt.minimize( self.loss, global_step=self.global_step, name='train') diff --git a/street/python/vgslspecs.py b/street/python/vgslspecs.py index 1e08552f7e..2c96d77b2d 100644 --- a/street/python/vgslspecs.py +++ b/street/python/vgslspecs.py @@ -149,7 +149,7 @@ def GetLengths(self, dim=2, factor=1): else: lengths = tf.ones_like(lengths) if factor != 1: - lengths = tf.mul(lengths, tf.cast(factor, tf.float32)) + lengths = tf.multiply(lengths, tf.cast(factor, tf.float32)) return tf.cast(lengths, tf.int32) def BuildFromString(self, prev_layer, index): @@ -235,7 +235,7 @@ def AddParallel(self, prev_layer, index): final_factors = self.reduction_factors if index == len(self.model_str): raise ValueError('Missing ) at end of parallel!' + self.model_str) - return tf.concat(num_dims - 1, layers), index + 1 + return tf.concat(axis=num_dims - 1, values=layers), index + 1 def AddConvLayer(self, prev_layer, index): """Add a single standard convolutional layer. @@ -342,7 +342,7 @@ def AddReShape(self, prev_layer, index): factor1 = tf.cast(self.reduction_factors[i], tf.float32) factor2 = tf.cast(prev_shape[i], tf.float32) divisor = tf.cast(result_shape[i], tf.float32) - self.reduction_factors[i] = tf.div(tf.mul(factor1, factor2), divisor) + self.reduction_factors[i] = tf.div(tf.multiply(factor1, factor2), divisor) return layer, m.end() def AddFCLayer(self, prev_layer, index): @@ -401,7 +401,7 @@ def AddLSTMLayer(self, prev_layer, index): name + '_forward') back = self._LSTMLayer(prev_layer, 'backward', dim, True, depth, name + '_reverse') - return tf.concat(3, [fwd, back], name=name + '_concat'), m.end() + return tf.concat(axis=3, values=[fwd, back], name=name + '_concat'), m.end() if direction == 'f': direction = 'forward' elif direction == 'r': diff --git a/swivel/glove_to_shards.py b/swivel/glove_to_shards.py old mode 100755 new mode 100644 diff --git a/swivel/nearest.py b/swivel/nearest.py old mode 100755 new mode 100644 diff --git a/swivel/prep.py b/swivel/prep.py old mode 100755 new mode 100644 diff --git a/swivel/swivel.py b/swivel/swivel.py old mode 100755 new mode 100644 index 45311834c0..f964cba42c --- a/swivel/swivel.py +++ b/swivel/swivel.py @@ -135,8 +135,8 @@ def count_matrix_input(filenames, submatrix_rows, submatrix_cols): sparse_local_col = features['sparse_local_col'].values sparse_count = features['sparse_value'].values - sparse_indices = tf.concat([tf.expand_dims(sparse_local_row, 1), - tf.expand_dims(sparse_local_col, 1)], 1) + sparse_indices = tf.concat(axis=1, values=[tf.expand_dims(sparse_local_row, 1), + tf.expand_dims(sparse_local_col, 1)]) count = tf.sparse_to_dense(sparse_indices, [submatrix_rows, submatrix_cols], sparse_count) diff --git a/swivel/text2bin.py b/swivel/text2bin.py old mode 100755 new mode 100644 diff --git a/swivel/wordsim.py b/swivel/wordsim.py old mode 100755 new mode 100644 diff --git a/syntaxnet/syntaxnet/graph_builder.py
b/syntaxnet/syntaxnet/graph_builder.py index 54bb50858d..f3126207a3 100644 --- a/syntaxnet/syntaxnet/graph_builder.py +++ b/syntaxnet/syntaxnet/graph_builder.py @@ -69,7 +69,7 @@ def EmbeddingLookupFeatures(params, sparse_features, allow_weights): if allow_weights: # Multiply by weights, reshaping to allow broadcast. - broadcast_weights_shape = tf.concat([tf.shape(weights), [1]], 0) + broadcast_weights_shape = tf.concat(axis=0, values=[tf.shape(weights), [1]]) embeddings *= tf.reshape(weights, broadcast_weights_shape) # Sum embeddings by index. @@ -330,7 +330,7 @@ def _BuildNetwork(self, feature_endpoints, return_average=False): i, return_average=return_average)) - last_layer = tf.concat(embeddings, 1) + last_layer = tf.concat(axis=1, values=embeddings) last_layer_size = self.embedding_size # Create ReLU layers. diff --git a/textsum/seq2seq_attention.py b/textsum/seq2seq_attention.py index f50eac10cb..33d1b4fed0 100644 --- a/textsum/seq2seq_attention.py +++ b/textsum/seq2seq_attention.py @@ -86,7 +86,7 @@ def _Train(model, data_batcher): saver = tf.train.Saver() # Train dir is different from log_root to avoid summary directory # conflict with Supervisor. - summary_writer = tf.train.SummaryWriter(FLAGS.train_dir) + summary_writer = tf.summary.FileWriter(FLAGS.train_dir) sv = tf.train.Supervisor(logdir=FLAGS.log_root, is_chief=True, saver=saver, @@ -119,7 +119,7 @@ def _Eval(model, data_batcher, vocab=None): """Runs model eval.""" model.build_graph() saver = tf.train.Saver() - summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir) + summary_writer = tf.summary.FileWriter(FLAGS.eval_dir) sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) running_avg_loss = 0 step = 0 diff --git a/textsum/seq2seq_attention_model.py b/textsum/seq2seq_attention_model.py index 10f3c6635a..ddcfb69bec 100644 --- a/textsum/seq2seq_attention_model.py +++ b/textsum/seq2seq_attention_model.py @@ -139,10 +139,10 @@ def _add_seq2seq(self): vsize = self._vocab.NumIds() with tf.variable_scope('seq2seq'): - encoder_inputs = tf.unpack(tf.transpose(self._articles)) - decoder_inputs = tf.unpack(tf.transpose(self._abstracts)) - targets = tf.unpack(tf.transpose(self._targets)) - loss_weights = tf.unpack(tf.transpose(self._loss_weights)) + encoder_inputs = tf.unstack(tf.transpose(self._articles)) + decoder_inputs = tf.unstack(tf.transpose(self._abstracts)) + targets = tf.unstack(tf.transpose(self._targets)) + loss_weights = tf.unstack(tf.transpose(self._loss_weights)) article_lens = self._article_lens # Embedding shared by the input and outputs. @@ -195,7 +195,7 @@ def _add_seq2seq(self): encoder_outputs = [tf.reshape(x, [hps.batch_size, 1, 2*hps.num_hidden]) for x in encoder_outputs] - self._enc_top_states = tf.concat(1, encoder_outputs) + self._enc_top_states = tf.concat(axis=1, values=encoder_outputs) self._dec_in_state = fw_state # During decoding, follow up _dec_in_state are fed from beam_search. # dec_out_state are stored by beam_search for next step feeding.
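The syntaxnet and swivel hunks are where a purely positional swap goes wrong: in TF 1.0, `tf.concat` expects the tensor list in `values` and the integer dimension in `axis`, and `tf.split` likewise expects the tensor in `value`, so the keyword form must not mirror the old positional order. A round-trip sketch of the corrected mapping, using illustrative tensors:

import tensorflow as tf

x = tf.ones([4, 6])
# The tensor goes to value, the dimension to axis, never the other way round.
parts = tf.split(value=x, num_or_size_splits=3, axis=1)  # three [4, 2] tensors
merged = tf.concat(values=parts, axis=1)                 # back to [4, 6]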
@@ -218,7 +218,7 @@ def _add_seq2seq(self): best_outputs = [tf.argmax(x, 1) for x in model_outputs] tf.logging.info('best_outputs%s', best_outputs[0].get_shape()) self._outputs = tf.concat( - 1, [tf.reshape(x, [hps.batch_size, 1]) for x in best_outputs]) + axis=1, values=[tf.reshape(x, [hps.batch_size, 1]) for x in best_outputs]) self._topk_log_probs, self._topk_ids = tf.nn.top_k( tf.log(tf.nn.softmax(model_outputs[-1])), hps.batch_size*2) @@ -236,7 +236,7 @@ def sampled_loss_func(inputs, labels): else: self._loss = tf.nn.seq2seq.sequence_loss( model_outputs, targets, loss_weights) - tf.scalar_summary('loss', tf.minimum(12.0, self._loss)) + tf.summary.scalar('loss', tf.minimum(12.0, self._loss)) def _add_train_op(self): """Sets self._train_op, op to run for training.""" @@ -250,9 +250,9 @@ def _add_train_op(self): with tf.device(self._get_gpu(self._num_gpus-1)): grads, global_norm = tf.clip_by_global_norm( tf.gradients(self._loss, tvars), hps.max_grad_norm) - tf.scalar_summary('global_norm', global_norm) + tf.summary.scalar('global_norm', global_norm) optimizer = tf.train.GradientDescentOptimizer(self._lr_rate) - tf.scalar_summary('learning rate', self._lr_rate) + tf.summary.scalar('learning rate', self._lr_rate) self._train_op = optimizer.apply_gradients( zip(grads, tvars), global_step=self.global_step, name='train_step') @@ -296,4 +296,4 @@ def build_graph(self): self.global_step = tf.Variable(0, name='global_step', trainable=False) if self._hps.mode == 'train': self._add_train_op() - self._summaries = tf.merge_all_summaries() + self._summaries = tf.summary.merge_all() diff --git a/textsum/seq2seq_lib.py b/textsum/seq2seq_lib.py index 6b7dfc9657..3f242febd4 100644 --- a/textsum/seq2seq_lib.py +++ b/textsum/seq2seq_lib.py @@ -127,7 +127,7 @@ def linear(args, output_size, bias, bias_start=0.0, scope=None): if len(args) == 1: res = tf.matmul(args[0], matrix) else: - res = tf.matmul(tf.concat(1, args), matrix) + res = tf.matmul(tf.concat(axis=1, values=args), matrix) if not bias: return res bias_term = tf.get_variable( diff --git a/transformer/spatial_transformer.py b/transformer/spatial_transformer.py index a049bb4342..51e14efd4c 100644 --- a/transformer/spatial_transformer.py +++ b/transformer/spatial_transformer.py @@ -53,7 +53,7 @@ def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs): def _repeat(x, n_repeats): with tf.variable_scope('_repeat'): rep = tf.transpose( - tf.expand_dims(tf.ones(shape=tf.pack([n_repeats, ])), 1), [1, 0]) + tf.expand_dims(tf.ones(shape=tf.stack([n_repeats, ])), 1), [1, 0]) rep = tf.cast(rep, 'int32') x = tf.matmul(tf.reshape(x, (-1, 1)), rep) return tf.reshape(x, [-1]) @@ -102,7 +102,7 @@ def _interpolate(im, x, y, out_size): # use indices to lookup pixels in the flat image and restore # channels dim - im_flat = tf.reshape(im, tf.pack([-1, channels])) + im_flat = tf.reshape(im, tf.stack([-1, channels])) im_flat = tf.cast(im_flat, 'float32') Ia = tf.gather(im_flat, idx_a) Ib = tf.gather(im_flat, idx_b) @@ -128,16 +128,16 @@ def _meshgrid(height, width): # np.linspace(-1, 1, height)) # ones = np.ones(np.prod(x_t.shape)) # grid = np.vstack([x_t.flatten(), y_t.flatten(), ones]) - x_t = tf.matmul(tf.ones(shape=tf.pack([height, 1])), + x_t = tf.matmul(tf.ones(shape=tf.stack([height, 1])), tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width), 1), [1, 0])) y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1), - tf.ones(shape=tf.pack([1, width]))) + tf.ones(shape=tf.stack([1, width]))) x_t_flat = tf.reshape(x_t, (1, -1)) y_t_flat = 
tf.reshape(y_t, (1, -1)) ones = tf.ones_like(x_t_flat) - grid = tf.concat(0, [x_t_flat, y_t_flat, ones]) + grid = tf.concat(axis=0, values=[x_t_flat, y_t_flat, ones]) return grid def _transform(theta, input_dim, out_size): @@ -157,11 +157,11 @@ def _transform(theta, input_dim, out_size): grid = _meshgrid(out_height, out_width) grid = tf.expand_dims(grid, 0) grid = tf.reshape(grid, [-1]) - grid = tf.tile(grid, tf.pack([num_batch])) - grid = tf.reshape(grid, tf.pack([num_batch, 3, -1])) + grid = tf.tile(grid, tf.stack([num_batch])) + grid = tf.reshape(grid, tf.stack([num_batch, 3, -1])) # Transform A x (x_t, y_t, 1)^T -> (x_s, y_s) - T_g = tf.batch_matmul(theta, grid) + T_g = tf.matmul(theta, grid) x_s = tf.slice(T_g, [0, 0, 0], [-1, 1, -1]) y_s = tf.slice(T_g, [0, 1, 0], [-1, 1, -1]) x_s_flat = tf.reshape(x_s, [-1]) @@ -172,7 +172,7 @@ def _transform(theta, input_dim, out_size): out_size) output = tf.reshape( - input_transformed, tf.pack([num_batch, out_height, out_width, num_channels])) + input_transformed, tf.stack([num_batch, out_height, out_width, num_channels])) return output with tf.variable_scope(name): diff --git a/tutorials/embedding/word2vec.py b/tutorials/embedding/word2vec.py index db656b4a0b..055a4c97e6 100644 --- a/tutorials/embedding/word2vec.py +++ b/tutorials/embedding/word2vec.py @@ -246,7 +246,7 @@ def forward(self, examples, labels): sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids) # True logits: [batch_size, 1] - true_logits = tf.reduce_sum(tf.mul(example_emb, true_w), 1) + true_b + true_logits = tf.reduce_sum(tf.multiply(example_emb, true_w), 1) + true_b # Sampled logits: [batch_size, num_sampled] # We replicate sampled noise labels for all examples in the batch diff --git a/tutorials/image/cifar10/cifar10_multi_gpu_train.py b/tutorials/image/cifar10/cifar10_multi_gpu_train.py index e51b551a61..8fa67a00eb 100644 --- a/tutorials/image/cifar10/cifar10_multi_gpu_train.py +++ b/tutorials/image/cifar10/cifar10_multi_gpu_train.py @@ -124,7 +124,7 @@ def average_gradients(tower_grads): grads.append(expanded_g) # Average over the 'tower' dimension. - grad = tf.concat(grads, 0) + grad = tf.concat(axis=0, values=grads) grad = tf.reduce_mean(grad, 0) # Keep in mind that the Variables are redundant because they are shared diff --git a/tutorials/rnn/ptb/ptb_word_lm.py b/tutorials/rnn/ptb/ptb_word_lm.py index 0989df7c6b..6341791491 100644 --- a/tutorials/rnn/ptb/ptb_word_lm.py +++ b/tutorials/rnn/ptb/ptb_word_lm.py @@ -146,7 +146,7 @@ def attn_cell(): (cell_output, state) = cell(inputs[:, time_step, :], state) outputs.append(cell_output) - output = tf.reshape(tf.concat(outputs, 1), [-1, size]) + output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, size]) softmax_w = tf.get_variable( "softmax_w", [size, vocab_size], dtype=data_type()) softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) diff --git a/video_prediction/lstm_ops.py b/video_prediction/lstm_ops.py index da7b4d2463..d8afe56bef 100644 --- a/video_prediction/lstm_ops.py +++ b/video_prediction/lstm_ops.py @@ -23,7 +23,7 @@ def init_state(inputs, state_shape, - state_initializer=tf.zeros_initializer, + state_initializer=tf.zeros_initializer(), dtype=tf.float32): """Helper function to create an initial state given inputs.
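The `tf.batch_matmul` to `tf.matmul` change in the spatial transformer works because TF 1.0 folded batching into `tf.matmul`: given inputs of rank greater than 2, it multiplies the two trailing dimensions and maps over the leading ones. A minimal sketch with illustrative shapes, not code from this diff:

import tensorflow as tf

theta = tf.ones([8, 2, 3])    # a batch of 2x3 affine matrices
grid = tf.ones([8, 3, 100])   # a batch of homogeneous coordinate grids
T_g = tf.matmul(theta, grid)  # batched matmul, result shape [8, 2, 100]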
@@ -45,7 +45,7 @@ def init_state(inputs, batch_size = 0 initial_state = state_initializer( - tf.pack([batch_size] + state_shape), + tf.stack([batch_size] + state_shape), dtype=dtype) initial_state.set_shape([inferred_batch_size] + state_shape) @@ -89,8 +89,8 @@ def basic_conv_lstm_cell(inputs, reuse=reuse): inputs.get_shape().assert_has_rank(4) state.get_shape().assert_has_rank(4) - c, h = tf.split(3, 2, state) - inputs_h = tf.concat(3, [inputs, h]) + c, h = tf.split(axis=3, num_or_size_splits=2, value=state) + inputs_h = tf.concat(axis=3, values=[inputs, h]) # Parameters of gates are concatenated into one conv for efficiency. i_j_f_o = layers.conv2d(inputs_h, 4 * num_channels, [filter_size, filter_size], @@ -99,12 +99,12 @@ def basic_conv_lstm_cell(inputs, scope='Gates') # i = input_gate, j = new_input, f = forget_gate, o = output_gate - i, j, f, o = tf.split(3, 4, i_j_f_o) + i, j, f, o = tf.split(axis=3, num_or_size_splits=4, value=i_j_f_o) new_c = c * tf.sigmoid(f + forget_bias) + tf.sigmoid(i) * tf.tanh(j) new_h = tf.tanh(new_c) * tf.sigmoid(o) - return new_h, tf.concat(3, [new_c, new_h]) + return new_h, tf.concat(axis=3, values=[new_c, new_h]) diff --git a/video_prediction/prediction_input.py b/video_prediction/prediction_input.py index b2c6eb9e31..e35b9daed3 100644 --- a/video_prediction/prediction_input.py +++ b/video_prediction/prediction_input.py @@ -97,11 +97,11 @@ def build_tfrecord_input(training=True): action = tf.reshape(features[action_name], shape=[1, STATE_DIM]) action_seq.append(action) - image_seq = tf.concat(0, image_seq) + image_seq = tf.concat(axis=0, values=image_seq) if FLAGS.use_state: - state_seq = tf.concat(0, state_seq) - action_seq = tf.concat(0, action_seq) + state_seq = tf.concat(axis=0, values=state_seq) + action_seq = tf.concat(axis=0, values=action_seq) [image_batch, action_batch, state_batch] = tf.train.batch( [image_seq, action_seq, state_seq], FLAGS.batch_size, diff --git a/video_prediction/prediction_model.py b/video_prediction/prediction_model.py index bad8800388..4ebbcbd2e5 100644 --- a/video_prediction/prediction_model.py +++ b/video_prediction/prediction_model.py @@ -109,7 +109,7 @@ def construct_model(images, prev_image = image # Predicted state is always fed back in - state_action = tf.concat(1, [action, current_state]) + state_action = tf.concat(axis=1, values=[action, current_state]) enc0 = slim.layers.conv2d( prev_image, @@ -144,7 +144,7 @@ def construct_model(images, smear = tf.tile( smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1]) if use_state: - enc2 = tf.concat(3, [enc2, smear]) + enc2 = tf.concat(axis=3, values=[enc2, smear]) enc3 = slim.layers.conv2d( enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope='conv4') @@ -158,7 +158,7 @@ def construct_model(images, enc4, lstm_state6, lstm_size[5], scope='state6') # 16x16 hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7') # Skip connection. - hidden6 = tf.concat(3, [hidden6, enc1]) # both 16x16 + hidden6 = tf.concat(axis=3, values=[hidden6, enc1]) # both 16x16 enc5 = slim.layers.conv2d_transpose( hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2') @@ -167,7 +167,7 @@ def construct_model(images, hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8') # Skip connection. 
- hidden7 = tf.concat(3, [hidden7, enc0]) # both 32x32 + hidden7 = tf.concat(axis=3, values=[hidden7, enc0]) # both 32x32 enc6 = slim.layers.conv2d_transpose( hidden7, @@ -207,7 +207,7 @@ def construct_model(images, masks = tf.reshape( tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [int(batch_size), int(img_height), int(img_width), num_masks + 1]) - mask_list = tf.split(3, num_masks + 1, masks) + mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks) output = mask_list[0] * prev_image for layer, mask in zip(transformed, mask_list[1:]): output += layer * mask @@ -277,8 +277,8 @@ def cdna_transformation(prev_image, cdna_input, num_masks, color_channels): cdna_kerns /= norm_factor cdna_kerns = tf.tile(cdna_kerns, [1, 1, 1, color_channels, 1]) - cdna_kerns = tf.split(0, batch_size, cdna_kerns) - prev_images = tf.split(0, batch_size, prev_image) + cdna_kerns = tf.split(axis=0, num_or_size_splits=batch_size, value=cdna_kerns) + prev_images = tf.split(axis=0, num_or_size_splits=batch_size, value=prev_image) # Transform image. transformed = [] @@ -288,8 +288,8 @@ def cdna_transformation(prev_image, cdna_input, num_masks, color_channels): kernel = tf.expand_dims(kernel, -1) transformed.append( tf.nn.depthwise_conv2d(preimg, kernel, [1, 1, 1, 1], 'SAME')) - transformed = tf.concat(0, transformed) - transformed = tf.split(3, num_masks, transformed) + transformed = tf.concat(axis=0, values=transformed) + transformed = tf.split(axis=3, num_or_size_splits=num_masks, value=transformed) return transformed @@ -314,7 +314,7 @@ def dna_transformation(prev_image, dna_input): tf.expand_dims( tf.slice(prev_image_pad, [0, xkern, ykern, 0], [-1, image_height, image_width, -1]), [3])) - inputs = tf.concat(3, inputs) + inputs = tf.concat(axis=3, values=inputs) # Normalize channels to 1. kernel = tf.nn.relu(dna_input - RELU_SHIFT) + RELU_SHIFT diff --git a/video_prediction/prediction_train.py b/video_prediction/prediction_train.py index 4c5ccd57cb..872849ec62 100644 --- a/video_prediction/prediction_train.py +++ b/video_prediction/prediction_train.py @@ -113,11 +113,11 @@ def __init__(self, summaries = [] # Split into timesteps. 
- actions = tf.split(1, actions.get_shape()[1], actions) + actions = tf.split(axis=1, num_or_size_splits=actions.get_shape()[1], value=actions) actions = [tf.squeeze(act) for act in actions] - states = tf.split(1, states.get_shape()[1], states) + states = tf.split(axis=1, num_or_size_splits=states.get_shape()[1], value=states) states = [tf.squeeze(st) for st in states] - images = tf.split(1, images.get_shape()[1], images) + images = tf.split(axis=1, num_or_size_splits=images.get_shape()[1], value=images) images = [tf.squeeze(img) for img in images] if reuse_scope is None: @@ -157,8 +157,8 @@ def __init__(self, psnr_i = peak_signal_to_noise_ratio(x, gx) psnr_all += psnr_i summaries.append( - tf.scalar_summary(prefix + '_recon_cost' + str(i), recon_cost)) - summaries.append(tf.scalar_summary(prefix + '_psnr' + str(i), psnr_i)) + tf.summary.scalar(prefix + '_recon_cost' + str(i), recon_cost)) + summaries.append(tf.summary.scalar(prefix + '_psnr' + str(i), psnr_i)) loss += recon_cost for i, state, gen_state in zip( @@ -166,19 +166,19 @@ def __init__(self, gen_states[FLAGS.context_frames - 1:]): state_cost = mean_squared_error(state, gen_state) * 1e-4 summaries.append( - tf.scalar_summary(prefix + '_state_cost' + str(i), state_cost)) + tf.summary.scalar(prefix + '_state_cost' + str(i), state_cost)) loss += state_cost - summaries.append(tf.scalar_summary(prefix + '_psnr_all', psnr_all)) + summaries.append(tf.summary.scalar(prefix + '_psnr_all', psnr_all)) self.psnr_all = psnr_all self.loss = loss = loss / np.float32(len(images) - FLAGS.context_frames) - summaries.append(tf.scalar_summary(prefix + '_loss', loss)) + summaries.append(tf.summary.scalar(prefix + '_loss', loss)) self.lr = tf.placeholder_with_default(FLAGS.learning_rate, ()) self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) - self.summ_op = tf.merge_summary(summaries) + self.summ_op = tf.summary.merge(summaries) def main(unused_argv): @@ -200,7 +200,7 @@ def main(unused_argv): # Make training session. sess = tf.InteractiveSession() - summary_writer = tf.train.SummaryWriter( + summary_writer = tf.summary.FileWriter( FLAGS.event_log_dir, graph=sess.graph, flush_secs=10) if FLAGS.pretrained_model:
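Taken together, the summary rewrites in this last file follow one recipe: `tf.scalar_summary` becomes `tf.summary.scalar`, `tf.merge_summary` becomes `tf.summary.merge`, and `tf.train.SummaryWriter` becomes `tf.summary.FileWriter`. A minimal end-to-end sketch of the renamed API, with an illustrative log directory:

import tensorflow as tf

loss = tf.constant(0.25)
loss_summ = tf.summary.scalar('loss', loss)  # formerly tf.scalar_summary
summ_op = tf.summary.merge([loss_summ])      # formerly tf.merge_summary
writer = tf.summary.FileWriter('/tmp/logs')  # formerly tf.train.SummaryWriter
with tf.Session() as sess:
    writer.add_summary(sess.run(summ_op), global_step=0)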