# research/global_objectives/util.py

# Copyright 2018 The TensorFlow Global Objectives Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains utility functions for the global objectives library."""

# Dependency imports
import tensorflow as tf


def weighted_sigmoid_cross_entropy_with_logits(labels,
                                               logits,
                                               positive_weights=1.0,
                                               negative_weights=1.0,
                                               name=None):
  """Computes a weighted sigmoid cross entropy given `logits`.

  Measures the weighted probability error in discrete classification tasks in
  which classes are independent and not mutually exclusive.  For instance, one
  could perform multilabel classification where a picture can contain both an
  elephant and a dog at the same time. The class weight multiplies the
  different types of errors.
  For brevity, let `x = logits`, `z = labels`, `c = positive_weights`,
  `d = negative_weights`. The weighted logistic loss is

  ```
  c * z * -log(sigmoid(x)) + d * (1 - z) * -log(1 - sigmoid(x))
  = c * z * -log(1 / (1 + exp(-x))) - d * (1 - z) * log(exp(-x) / (1 + exp(-x)))
  = c * z * log(1 + exp(-x)) + d * (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
  = c * z * log(1 + exp(-x)) + d * (1 - z) * (x + log(1 + exp(-x)))
  = (1 - z) * x * d + (d * (1 - z) + c * z) * log(1 + exp(-x))
  = -d * x * z + d * x + (d - d * z + c * z) * log(1 + exp(-x))
  ```

  To ensure stability and avoid overflow, the implementation uses the identity
      log(1 + exp(-x)) = max(0, -x) + log(1 + exp(-abs(x)))
  and the result is computed as

    ```
    = -d * x * z + d * x
      + (d - d * z + c * z) * (max(0, -x) + log1p(exp(-abs(x))))
    ```

  Note that the loss is NOT an upper bound on the 0-1 loss, unless it is divided
  by log(2).

  Args:
    labels: A `Tensor` of type `float32` or `float64`. `labels` can be a 2D
      tensor with shape [batch_size, num_labels] or a 3D tensor with shape
      [batch_size, num_labels, K].
    logits: A `Tensor` of the same type and shape as `labels`. If `logits` has
      shape [batch_size, num_labels, K], the loss is computed separately on each
      slice [:, :, k] of `logits`.
    positive_weights: A `Tensor` that holds the weights applied to the positive
      (`z`) term of the loss and has the following semantics according to its
      shape:
        scalar - A global positive weight.
        1D tensor - must be of size K, a weight for each 'attempt'
        2D tensor - of size [num_labels, K'] where K' is either K or 1.
      The `positive_weights` will be expanded to the left to match the
      dimensions of logits and labels.
    negative_weights: A `Tensor` that holds the weights applied to the negative
      (`1 - z`) term of the loss, with shape semantics identical to
      `positive_weights`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
      weighted logistic losses.
  """
  with tf.name_scope(
      name,
      'weighted_logistic_loss',
      [logits, labels, positive_weights, negative_weights]) as name:
    labels, logits, positive_weights, negative_weights = prepare_loss_args(
        labels, logits, positive_weights, negative_weights)

    # Overflow-safe form of log(1 + exp(-x)). log1p is used for the second
    # summand because exp(-|x|) can be far smaller than 1, in which case
    # 1 + exp(-|x|) would round to exactly 1 and the term would be lost.
    softplus_term = tf.add(tf.maximum(-logits, 0.0),
                           tf.log1p(tf.exp(-tf.abs(logits))))
    # Coefficient of the softplus term: d - d * z + c * z.
    weight_dependent_factor = (
        negative_weights + (positive_weights - negative_weights) * labels)
    # Linear part d * x - d * x * z plus the weighted softplus part.
    return (negative_weights * (logits - labels * logits) +
            weight_dependent_factor * softplus_term)


def weighted_hinge_loss(labels,
                        logits,
                        positive_weights=1.0,
                        negative_weights=1.0,
                        name=None):
  """Returns the componentwise weighted hinge loss of `logits` vs `labels`.

  Intended for multi-label classification where the labels are independent and
  not mutually exclusive; compare
  `weighted_sigmoid_cross_entropy_with_logits`. With `x = logits` and
  `z = labels` the result is

      positive_weights * z * max(1 - x, 0)
      + negative_weights * (1 - z) * max(1 + x, 0)

  Args:
    labels: A `Tensor` of type `float32` or `float64`. Each entry must be
      either 0 or 1. `labels` can be a 2D tensor with shape
      [batch_size, num_labels] or a 3D tensor with shape
      [batch_size, num_labels, K].
    logits: A `Tensor` of the same type and shape as `labels`. If `logits` has
      shape [batch_size, num_labels, K], the loss is computed separately on each
      slice [:, :, k] of `logits`.
    positive_weights: A `Tensor` holding the weights of the positive-label
      term. Its shape is interpreted as follows:
        scalar - A global positive weight.
        1D tensor - must be of size K, a weight for each 'attempt'
        2D tensor - of size [num_labels, K'] where K' is either K or 1.
      The `positive_weights` will be expanded to the left to match the
      dimensions of logits and labels.
    negative_weights: A `Tensor` holding the weights of the negative-label
      term, with shape semantics identical to `positive_weights`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
      weighted hinge loss.
  """
  scope_inputs = [logits, labels, positive_weights, negative_weights]
  with tf.name_scope(name, 'weighted_hinge_loss', scope_inputs):
    labels, logits, positive_weights, negative_weights = prepare_loss_args(
        labels, logits, positive_weights, negative_weights)

    # Positive examples are penalized when the logit falls below +1.
    weighted_positives = positive_weights * labels
    positive_margin = tf.maximum(1.0 - logits, 0)
    positive_loss = weighted_positives * positive_margin

    # Negative examples are penalized when the logit rises above -1.
    weighted_negatives = negative_weights * (1.0 - labels)
    negative_margin = tf.maximum(1.0 + logits, 0)
    negative_loss = weighted_negatives * negative_margin

    return positive_loss + negative_loss


def weighted_surrogate_loss(labels,
                            logits,
                            surrogate_type='xent',
                            positive_weights=1.0,
                            negative_weights=1.0,
                            name=None):
  """Dispatches to the weighted cross-entropy or the weighted hinge loss.

  `surrogate_type='xent'` selects
  `weighted_sigmoid_cross_entropy_with_logits`; `surrogate_type='hinge'`
  selects `weighted_hinge_loss`.

  Args:
    labels: A `Tensor` of type `float32` or `float64`. Each entry must be
      between 0 and 1. `labels` can be a 2D tensor with shape
      [batch_size, num_labels] or a 3D tensor with shape
      [batch_size, num_labels, K].
    logits: A `Tensor` of the same type and shape as `labels`. If `logits` has
      shape [batch_size, num_labels, K], each slice [:, :, k] represents an
      'attempt' to predict `labels` and the loss is computed per slice.
    surrogate_type: A string that determines which loss to return, supports
      'xent' for cross-entropy and 'hinge' for hinge loss.
    positive_weights: A `Tensor` holding the weights of the positive-label
      term. Its shape is interpreted as follows:
        scalar - A global positive weight.
        1D tensor - must be of size K, a weight for each 'attempt'
        2D tensor - of size [num_labels, K'] where K' is either K or 1.
      The `positive_weights` will be expanded to the left to match the
      dimensions of logits and labels.
    negative_weights: A `Tensor` holding the weights of the negative-label
      term, with shape semantics identical to `positive_weights`.
    name: A name for the operation (optional).

  Returns:
    The weighted loss.

  Raises:
    ValueError: If value of `surrogate_type` is not supported.
  """
  scope_inputs = [
      logits, labels, surrogate_type, positive_weights, negative_weights]
  with tf.name_scope(name, 'weighted_loss', scope_inputs) as name:
    # Pick the implementation first, then make a single call below.
    if surrogate_type == 'xent':
      loss_fn = weighted_sigmoid_cross_entropy_with_logits
    elif surrogate_type == 'hinge':
      loss_fn = weighted_hinge_loss
    else:
      raise ValueError('surrogate_type %s not supported.' % surrogate_type)

    # The scope name captured above is handed to the selected loss function.
    return loss_fn(
        logits=logits,
        labels=labels,
        positive_weights=positive_weights,
        negative_weights=negative_weights,
        name=name)


def expand_outer(tensor, rank):
  """Prepends size-1 dimensions to `tensor` until it has `rank` dimensions.

  For example, with rank = 3 and a tensor of shape [3, 4], the returned tensor
  has shape [1, 3, 4]. A tensor that already has `rank` dimensions is returned
  unchanged.

  Args:
    tensor: The tensor to expand.
    rank: The target number of dimensions.

  Returns:
    The expanded tensor.

  Raises:
    ValueError: If rank of `tensor` is unknown, or if `rank` is smaller than
      the rank of `tensor`.
  """
  static_shape = tensor.get_shape()
  if static_shape.ndims is None:
    raise ValueError('tensor dimension must be known.')

  current_rank = len(static_shape)
  if current_rank > rank:
    raise ValueError(
        '`rank` must be at least the current tensor dimension: (%s vs %s).' %
        (rank, current_rank))

  # Each expand_dims adds exactly one leading axis, so count down the gap.
  missing_dims = rank - current_rank
  while missing_dims > 0:
    tensor = tf.expand_dims(tensor, 0)
    missing_dims -= 1
  return tensor


def build_label_priors(labels,
                       weights=None,
                       positive_pseudocount=1.0,
                       negative_pseudocount=1.0,
                       variables_collections=None):
  """Builds an op that accumulates and returns per-label prior estimates.

  For each label, the prior is estimated as
      (P + sum_i w_i y_i) / (P + N + sum_i w_i),
  where y_i is the ith label, w_i is the ith weight, P is a pseudo-count of
  positive labels, and N is a pseudo-count of negative labels. The index i
  ranges over all labels observed during all evaluations of the returned op.

  Two non-trainable variables, `weighted_label_counts` and `weight_sum`, hold
  the numerator and denominator. The returned op is the quotient of their
  `assign_add` updates, so every evaluation folds the current batch into both
  accumulators.

  Args:
    labels: A `Tensor` with shape [batch_size, num_labels]. Entries should be
      in [0, 1].
    weights: Coefficients representing the weight of each label. Must be either
      a Tensor of shape [batch_size, num_labels] or `None`, in which case each
      weight is treated as 1.0.
    positive_pseudocount: Number of positive labels used to initialize the label
      priors.
    negative_pseudocount: Number of negative labels used to initialize the label
      priors.
    variables_collections: Optional list of collections for created variables.

  Returns:
    label_priors: An op to update the weighted label_priors. Gives the
      current value of the label priors when evaluated.
  """
  dtype = labels.dtype.base_dtype
  num_labels = get_num_labels(labels)

  if weights is None:
    weights = tf.ones_like(labels)

  # Partitioning is switched off while these variables are created because they
  # are updated with assign, which is not available for partitioned variables.
  # The previous partitioner is restored no matter what happens in between.
  variable_scope = tf.get_variable_scope()
  saved_partitioner = variable_scope.partitioner
  try:
    variable_scope.set_partitioner(None)

    # Numerator accumulator: starts at P for every label.
    initial_counts = [positive_pseudocount] * num_labels
    weighted_label_counts = tf.contrib.framework.model_variable(
        name='weighted_label_counts',
        shape=[num_labels],
        dtype=dtype,
        initializer=tf.constant_initializer(initial_counts, dtype=dtype),
        collections=variables_collections,
        trainable=False)
    batch_weighted_labels = tf.reduce_sum(weights * labels, 0)
    updated_counts = weighted_label_counts.assign_add(batch_weighted_labels)

    # Denominator accumulator: starts at P + N for every label.
    initial_weight_sum = (
        [positive_pseudocount + negative_pseudocount] * num_labels)
    weight_sum = tf.contrib.framework.model_variable(
        name='weight_sum',
        shape=[num_labels],
        dtype=dtype,
        initializer=tf.constant_initializer(initial_weight_sum, dtype=dtype),
        collections=variables_collections,
        trainable=False)
    batch_weights = tf.reduce_sum(weights, 0)
    updated_weight_sum = weight_sum.assign_add(batch_weights)
  finally:
    variable_scope.set_partitioner(saved_partitioner)

  return tf.div(updated_counts, updated_weight_sum)


def convert_and_cast(value, name, dtype):
  """Converts `value` to a tensor and casts the result to `dtype`.

  Args:
    value: An object whose type has a registered Tensor conversion function,
        e.g. python numerical type or numpy array.
    name: Name to use for the new Tensor, if one is created.
    dtype: Element type of the returned tensor.

  Returns:
    A tensor.
  """
  as_tensor = tf.convert_to_tensor(value, name=name)
  return tf.cast(as_tensor, dtype=dtype)


def prepare_loss_args(labels, logits, positive_weights, negative_weights):
  """Normalizes the arguments shared by the weighted loss functions.

  `logits` is converted to a tensor and the other three arguments are converted
  and cast to its dtype. When `labels` is 2D and `logits` is 3D, `labels` gains
  a trailing dimension. Both weight tensors are expanded on the left until they
  have as many dimensions as `logits`.

  Args:
    labels: Labels of the loss function.
    logits: Logits of the loss function.
    positive_weights: Weight on the positive examples.
    negative_weights: Weight on the negative examples.

  Returns:
    Converted labels, logits, positive_weights, negative_weights.
  """
  logits = tf.convert_to_tensor(logits, name='logits')
  labels = convert_and_cast(labels, 'labels', logits.dtype)

  labels_are_2d = len(labels.get_shape()) == 2
  if labels_are_2d and len(logits.get_shape()) == 3:
    # Add a trailing axis so the labels line up with every [:, :, k] slice.
    labels = tf.expand_dims(labels, [2])

  def _as_weight_tensor(weights, tensor_name):
    """Casts `weights` to the logits dtype and left-pads its rank."""
    weights = convert_and_cast(weights, tensor_name, logits.dtype)
    return expand_outer(weights, logits.get_shape().ndims)

  positive_weights = _as_weight_tensor(positive_weights, 'positive_weights')
  negative_weights = _as_weight_tensor(negative_weights, 'negative_weights')
  return labels, logits, positive_weights, negative_weights


def get_num_labels(labels_or_logits):
  """Returns the number of labels inferred from labels_or_logits.

  Args:
    labels_or_logits: A `Tensor` (or any object exposing `get_shape()`) whose
      static rank is known.

  Returns:
    1 if the static rank is at most 1; otherwise the static size of dimension
    1, which is `None` when that size is not statically known.

  Raises:
    ValueError: If the rank of `labels_or_logits` is unknown.
  """
  shape = labels_or_logits.get_shape()
  # An unknown rank reports `ndims` as None; comparing None with an int is a
  # TypeError on Python 3, so fail with an explicit message instead.
  if shape.ndims is None:
    raise ValueError('labels_or_logits rank must be known.')
  if shape.ndims <= 1:
    return 1
  num_labels = shape[1]
  # Shape entries may be `Dimension`-like objects carrying `.value` or already
  # plain ints / None; accept both.
  return getattr(num_labels, 'value', num_labels)