From 082e65c9342683e6dcb6df4e9922c99f0f966e90 Mon Sep 17 00:00:00 2001 From: Toby Boyd Date: Thu, 8 Jun 2017 10:08:13 -0700 Subject: [PATCH 1/5] Input pipeline on CPU. 1700 images/sec to 8000 on GTX 1080 --- tutorials/image/cifar10/cifar10_multi_gpu_train.py | 13 +++++++++---- tutorials/image/cifar10/cifar10_train.py | 5 ++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tutorials/image/cifar10/cifar10_multi_gpu_train.py b/tutorials/image/cifar10/cifar10_multi_gpu_train.py index 16033eeffd8..e97e1f8fae1 100644 --- a/tutorials/image/cifar10/cifar10_multi_gpu_train.py +++ b/tutorials/image/cifar10/cifar10_multi_gpu_train.py @@ -62,7 +62,7 @@ """Whether to log device placement.""") -def tower_loss(scope): +def tower_loss(scope, images, labels): """Calculate the total loss on a single tower running the CIFAR model. Args: @@ -71,8 +71,7 @@ def tower_loss(scope): Returns: Tensor of shape [] containing the total loss for a batch of data """ - # Get images and labels for CIFAR-10. - images, labels = cifar10.distorted_inputs() + # Build inference Graph. logits = cifar10.inference(images) @@ -160,6 +159,12 @@ def train(): # Create an optimizer that performs gradient descent. opt = tf.train.GradientDescentOptimizer(lr) + # Get images and labels for CIFAR-10. + # Force input pipeline to CPU:0 to avoid opertaios sometimes ending up on GPU + # and resulting in a slow down. + with tf.device('/CPU:0'): + images, labels = cifar10.distorted_inputs() + # Calculate the gradients for each model tower. tower_grads = [] with tf.variable_scope(tf.get_variable_scope()): @@ -169,7 +174,7 @@ def train(): # Calculate the loss for one tower of the CIFAR model. This function # constructs the entire CIFAR model but shares the variables across # all towers. - loss = tower_loss(scope) + loss = tower_loss(scope, images, labels) # Reuse variables for the next tower. 
tf.get_variable_scope().reuse_variables() diff --git a/tutorials/image/cifar10/cifar10_train.py b/tutorials/image/cifar10/cifar10_train.py index fec64ec2272..da01d500149 100644 --- a/tutorials/image/cifar10/cifar10_train.py +++ b/tutorials/image/cifar10/cifar10_train.py @@ -62,7 +62,10 @@ def train(): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for CIFAR-10. - images, labels = cifar10.distorted_inputs() + # Force input pipeline to CPU:0 to avoid opertaios sometimes ending up + # on GPU and resulting in a slow down. + with tf.device('/CPU:0'): + images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. From 3909e4bdff25c952713a08b4ecc31fff1fdf2cb4 Mon Sep 17 00:00:00 2001 From: Toby Boyd Date: Thu, 8 Jun 2017 10:15:06 -0700 Subject: [PATCH 2/5] pydoc update to match method signature --- tutorials/image/cifar10/cifar10_multi_gpu_train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tutorials/image/cifar10/cifar10_multi_gpu_train.py b/tutorials/image/cifar10/cifar10_multi_gpu_train.py index e97e1f8fae1..05d92cc27ec 100644 --- a/tutorials/image/cifar10/cifar10_multi_gpu_train.py +++ b/tutorials/image/cifar10/cifar10_multi_gpu_train.py @@ -67,6 +67,8 @@ def tower_loss(scope, images, labels): Args: scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0' + images: Images. 4D tensor of [batch_size, height, width, 3] size. + labels: Labels. 1D tensor of [batch_size] size. 
Returns: Tensor of shape [] containing the total loss for a batch of data From c3e2ae5ec1b0164ddd3895680c249d0adb1f11a8 Mon Sep 17 00:00:00 2001 From: Toby Boyd Date: Thu, 8 Jun 2017 13:49:27 -0700 Subject: [PATCH 3/5] Fixed typos and redundant with CPU:0 --- tutorials/image/cifar10/cifar10_multi_gpu_train.py | 5 +---- tutorials/image/cifar10/cifar10_train.py | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tutorials/image/cifar10/cifar10_multi_gpu_train.py b/tutorials/image/cifar10/cifar10_multi_gpu_train.py index 05d92cc27ec..9f269cc04ab 100644 --- a/tutorials/image/cifar10/cifar10_multi_gpu_train.py +++ b/tutorials/image/cifar10/cifar10_multi_gpu_train.py @@ -162,10 +162,7 @@ def train(): opt = tf.train.GradientDescentOptimizer(lr) # Get images and labels for CIFAR-10. - # Force input pipeline to CPU:0 to avoid opertaios sometimes ending up on GPU - # and resulting in a slow down. - with tf.device('/CPU:0'): - images, labels = cifar10.distorted_inputs() + images, labels = cifar10.distorted_inputs() # Calculate the gradients for each model tower. tower_grads = [] diff --git a/tutorials/image/cifar10/cifar10_train.py b/tutorials/image/cifar10/cifar10_train.py index da01d500149..e3243527926 100644 --- a/tutorials/image/cifar10/cifar10_train.py +++ b/tutorials/image/cifar10/cifar10_train.py @@ -62,8 +62,8 @@ def train(): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for CIFAR-10. - # Force input pipeline to CPU:0 to avoid opertaios sometimes ending up - # on GPU and resulting in a slow down. + # Force input pipeline to CPU:0 to avoid operations sometimes ending up on + # GPU and resulting in a slow down. 
with tf.device('/CPU:0'): images, labels = cifar10.distorted_inputs() From 9e8fd6d90c84df1f7444b055dcc3b653f6b7e14c Mon Sep 17 00:00:00 2001 From: Toby Boyd Date: Thu, 8 Jun 2017 15:05:06 -0700 Subject: [PATCH 4/5] Fixed typo and multi-gpu processing same batch on each gpu --- tutorials/image/cifar10/cifar10_multi_gpu_train.py | 6 +++++- tutorials/image/cifar10/cifar10_train.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tutorials/image/cifar10/cifar10_multi_gpu_train.py b/tutorials/image/cifar10/cifar10_multi_gpu_train.py index 9f269cc04ab..bc90711d7c9 100644 --- a/tutorials/image/cifar10/cifar10_multi_gpu_train.py +++ b/tutorials/image/cifar10/cifar10_multi_gpu_train.py @@ -138,6 +138,7 @@ def average_gradients(tower_grads): def train(): + print(FLAGS.batch_size) """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the @@ -163,13 +164,16 @@ def train(): # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() - + batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue( + [images, labels], capacity=2 * FLAGS.num_gpus) # Calculate the gradients for each model tower. tower_grads = [] with tf.variable_scope(tf.get_variable_scope()): for i in xrange(FLAGS.num_gpus): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope: + # Dequeues one batch for the GPU + images, labels = batch_queue.dequeue() # Calculate the loss for one tower of the CIFAR model. This function # constructs the entire CIFAR model but shares the variables across # all towers. diff --git a/tutorials/image/cifar10/cifar10_train.py b/tutorials/image/cifar10/cifar10_train.py index e3243527926..cc1dc0d1489 100644 --- a/tutorials/image/cifar10/cifar10_train.py +++ b/tutorials/image/cifar10/cifar10_train.py @@ -64,7 +64,7 @@ def train(): # Get images and labels for CIFAR-10. 
# Force input pipeline to CPU:0 to avoid operations sometimes ending up on # GPU and resulting in a slow down. - with tf.device('/CPU:0'): + with tf.device('/cpu:0'): images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the From b5acc005968d37495f0e7d83d2dd2ef3d3674211 Mon Sep 17 00:00:00 2001 From: Neal Wu Date: Thu, 8 Jun 2017 16:44:02 -0700 Subject: [PATCH 5/5] Code cleanup --- tutorials/image/cifar10/cifar10_multi_gpu_train.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tutorials/image/cifar10/cifar10_multi_gpu_train.py b/tutorials/image/cifar10/cifar10_multi_gpu_train.py index bc90711d7c9..fb15faca260 100644 --- a/tutorials/image/cifar10/cifar10_multi_gpu_train.py +++ b/tutorials/image/cifar10/cifar10_multi_gpu_train.py @@ -67,14 +67,13 @@ def tower_loss(scope, images, labels): Args: scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0' - images: Images. 4D tensor of [batch_size, height, width, 3] size. - labels: Labels. 1D tensor of [batch_size] size. + images: Images. 4D tensor of shape [batch_size, height, width, 3]. + labels: Labels. 1D tensor of shape [batch_size]. Returns: Tensor of shape [] containing the total loss for a batch of data """ - # Build inference Graph. logits = cifar10.inference(images) @@ -138,7 +137,6 @@ def average_gradients(tower_grads): def train(): - print(FLAGS.batch_size) """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the