diff --git a/tutorials/rnn/ptb/ptb_word_lm.py b/tutorials/rnn/ptb/ptb_word_lm.py
index 7430f2e43d..fccbd41255 100644
--- a/tutorials/rnn/ptb/ptb_word_lm.py
+++ b/tutorials/rnn/ptb/ptb_word_lm.py
@@ -162,11 +162,21 @@ def attn_cell():
         "softmax_w", [size, vocab_size], dtype=data_type())
     softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
     logits = tf.matmul(output, softmax_w) + softmax_b
-    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
-        [logits],
-        [tf.reshape(input_.targets, [-1])],
-        [tf.ones([batch_size * num_steps], dtype=data_type())])
-    self._cost = cost = tf.reduce_sum(loss) / batch_size
+
+    # Reshape logits to be 3-D tensor for sequence loss
+    logits = tf.reshape(logits, [batch_size, num_steps, vocab_size])
+
+    # use the contrib sequence loss and average over the batches
+    loss = tf.contrib.seq2seq.sequence_loss(
+        logits,
+        input_.targets,
+        tf.ones([batch_size, num_steps], dtype=data_type()),
+        average_across_timesteps=False,
+        average_across_batch=True
+    )
+
+    # update the cost variables
+    self._cost = cost = tf.reduce_sum(loss)
     self._final_state = state
 
     if not is_training:
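For context, here is a minimal standalone sketch of how `tf.contrib.seq2seq.sequence_loss` behaves with the arguments the diff introduces (3-D logits, per-position weights of 1, averaging over the batch but not over timesteps). It assumes TensorFlow 1.x with `tf.contrib` available; the toy `batch_size`, `num_steps`, and `vocab_size` values and the random data are illustrative only, not part of the tutorial.

```python
# Sketch only: toy shapes and random data, assuming TensorFlow 1.x.
import numpy as np
import tensorflow as tf

batch_size, num_steps, vocab_size = 4, 5, 10

# logits must be [batch_size, num_steps, vocab_size];
# targets must be integer ids of shape [batch_size, num_steps].
logits = tf.constant(
    np.random.randn(batch_size, num_steps, vocab_size), dtype=tf.float32)
targets = tf.constant(
    np.random.randint(vocab_size, size=(batch_size, num_steps)), dtype=tf.int32)

# Weight every position equally; average over the batch but keep a
# per-timestep loss vector of shape [num_steps].
loss = tf.contrib.seq2seq.sequence_loss(
    logits,
    targets,
    tf.ones([batch_size, num_steps], dtype=tf.float32),
    average_across_timesteps=False,
    average_across_batch=True)

# Summing the per-timestep losses mirrors the cost computed in the diff.
cost = tf.reduce_sum(loss)

with tf.Session() as sess:
  per_step_loss, total_cost = sess.run([loss, cost])
  print(per_step_loss.shape)  # (num_steps,)
  print(total_cost)
```

Because the batch averaging now happens inside `sequence_loss`, the explicit `/ batch_size` from the old `sequence_loss_by_example` path is no longer needed, which is why the new cost is simply `tf.reduce_sum(loss)`.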