# official/vision/image_classification/dataset_factory.py

# Lint as: python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dataset utilities for vision tasks using TFDS and tf.data.Dataset."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function

import dataclasses
from dataclasses import dataclass
import os
from typing import Any, List, Optional, Tuple, Mapping, Union

from absl import logging
import tensorflow as tf
import tensorflow_datasets as tfds

from official.modeling.hyperparams import base_config
from official.vision.image_classification import augment
from official.vision.image_classification import preprocessing


# Registry of the available image augmenters, keyed by the string accepted as
# `AugmentConfig.name`. Values are the augmenter classes (not instances).
AUGMENTERS = dict(
    autoaugment=augment.AutoAugment,
    randaugment=augment.RandAugment,
)


@dataclass
class AugmentConfig(base_config.Config):
  """Configuration for image augmenters.

  Attributes:
    name: The name of the image augmentation to use. Possible options are
      None (default), 'autoaugment', or 'randaugment'.
    params: Any parameters used to initialize the augmenter. They are passed
      to the augmenter's constructor as keyword arguments.
  """
  name: Optional[str] = None
  params: Optional[Mapping[str, Any]] = None

  def build(self) -> Optional[augment.ImageAugment]:
    """Build the augmenter using this config.

    Returns:
      An instance of the augmenter class registered under `name` in
      `AUGMENTERS`, constructed with `params` as keyword arguments, or `None`
      when `name` is not a registered augmenter (this includes the default
      `name=None`, i.e. no augmentation).
    """
    augmenter_cls = AUGMENTERS.get(self.name)
    if augmenter_cls is None:
      if self.name is not None:
        # An unrecognized name would otherwise silently disable augmentation.
        logging.warning(
            'Unknown augmenter name %r; no augmentation will be applied. '
            'Supported names: %s', self.name, sorted(AUGMENTERS))
      return None

    # `params` may be left unset, in which case the augmenter's own defaults
    # are used.
    return augmenter_cls(**(self.params or {}))


@dataclass
class DatasetConfig(base_config.Config):
  """The base configuration for building datasets.

  Attributes:
    name: The name of the Dataset. Usually should correspond to a TFDS dataset.
    data_dir: The path where the dataset files are stored, if available.
    filenames: Optional list of strings representing the TFRecord names.
    builder: The builder type used to load the dataset. Value should be one of
      'tfds' (load using TFDS), 'records' (load from TFRecords), or 'synthetic'
      (generate dummy synthetic data without reading from files).
    split: The split of the dataset. Usually 'train', 'validation', or 'test'.
    image_size: The size of the image in the dataset. This assumes that
      `width` == `height`. Set to 'infer' to infer the image size from TFDS
      info. This requires `name` to be a registered dataset in TFDS.
    num_classes: The number of classes given by the dataset. Set to 'infer'
      to infer the number of classes from TFDS info. This requires `name` to
      be a registered dataset in TFDS.
    num_channels: The number of channels given by the dataset. Set to 'infer'
      to infer the number of channels from TFDS info. This requires `name` to
      be a registered dataset in TFDS.
    num_examples: The number of examples given by the dataset. Set to 'infer'
      to infer the number of examples from TFDS info. This requires `name` to
      be a registered dataset in TFDS.
    batch_size: The base batch size for the dataset.
    use_per_replica_batch_size: Whether to scale the batch size based on
      available resources. If set to `True`, the dataset builder will return
      batch_size multiplied by `num_devices`, the number of device replicas
      (e.g., the number of GPUs or TPU cores). This setting should be `True` if
      the strategy argument is passed to `build()` and `num_devices > 1`.
    num_devices: The number of replica devices to use. This should be set by
      `strategy.num_replicas_in_sync` when using a distribution strategy.
    dtype: The desired dtype of the dataset. This will be set during
      preprocessing.
    one_hot: Whether to apply one hot encoding. Set to `True` to be able to use
      label smoothing.
    augmenter: The augmenter config to use. No augmentation is used by default.
      Every config instance gets its own `AugmentConfig` object.
    download: Whether to download data using TFDS.
    shuffle_buffer_size: The buffer size used for shuffling training data.
    file_shuffle_buffer_size: The buffer size used for shuffling raw training
      files.
    skip_decoding: Whether to skip image decoding when loading from TFDS.
    cache: Whether to cache the dataset examples. Can be used to avoid
      re-reading from disk on the second epoch. Requires significant memory
      overhead.
    tf_data_service: The URI of a tf.data service to offload preprocessing onto
      during training. The URI should be in the format "protocol://address",
      e.g. "grpc://tf-data-service:5050".
    mean_subtract: Whether or not to apply mean subtraction to the dataset.
    standardize: Whether or not to apply standardization to the dataset.
  """
  name: Optional[str] = None
  data_dir: Optional[str] = None
  filenames: Optional[List[str]] = None
  builder: str = 'tfds'
  split: str = 'train'
  image_size: Union[int, str] = 'infer'
  num_classes: Union[int, str] = 'infer'
  num_channels: Union[int, str] = 'infer'
  num_examples: Union[int, str] = 'infer'
  batch_size: int = 128
  use_per_replica_batch_size: bool = True
  num_devices: int = 1
  dtype: str = 'float32'
  one_hot: bool = True
  # A default_factory is required here: a plain `AugmentConfig()` default would
  # be a single mutable object shared by every config instance, and newer
  # Python versions reject such unhashable dataclass defaults outright.
  augmenter: AugmentConfig = dataclasses.field(default_factory=AugmentConfig)
  download: bool = False
  shuffle_buffer_size: int = 10000
  file_shuffle_buffer_size: int = 1024
  skip_decoding: bool = True
  cache: bool = False
  tf_data_service: Optional[str] = None
  mean_subtract: bool = False
  standardize: bool = False

  @property
  def has_data(self):
    """Whether this dataset has any data source associated with it.

    Returns:
      A truthy value if any of `name`, `data_dir`, or `filenames` is set, and
      a falsy value otherwise. Note that the value is the first truthy field
      itself (or the last falsy one), not necessarily a `bool`.
    """
    return self.name or self.data_dir or self.filenames


@dataclass
class ImageNetConfig(DatasetConfig):
  """The base ImageNet dataset config.

  Overrides the `DatasetConfig` defaults for `name`, `builder`, `image_size`
  and `batch_size`; all other fields keep their inherited defaults.
  """
  name: str = 'imagenet2012'
  # Note: for large datasets like ImageNet, using records is faster than tfds
  builder: str = 'records'
  image_size: int = 224
  batch_size: int = 128


@dataclass
class Cifar10Config(DatasetConfig):
  """The base CIFAR-10 dataset config.

  Overrides the `DatasetConfig` defaults for `name`, `image_size`,
  `batch_size`, `download` and `cache`; all other fields (including the
  'tfds' builder) keep their inherited defaults.
  """
  name: str = 'cifar10'
  # NOTE(review): presumably images are resized to this size during
  # preprocessing rather than this being the stored image size -- confirm.
  image_size: int = 224
  batch_size: int = 128
  download: bool = True
  cache: bool = True


class DatasetBuilder:
  """An object for building datasets.

  Allows building various pipelines fetching examples, preprocessing, etc.
  Maintains additional state information calculated from the dataset, i.e.,
  training set split, batch size, and number of steps (batches).

  Attributes:
    config: The `DatasetConfig` used by this builder, with any constructor
      overrides applied.
    builder_info: The cached TFDS dataset info, populated lazily by `info`.
    augmenter: The augmenter built from `config.augmenter`, or `None`.
    input_context: The `tf.distribute.InputContext` passed to the most recent
      `_build()` call, or `None`.
  """

  def __init__(self, config: DatasetConfig, **overrides: Any):
    """Initialize the builder from the config.

    Args:
      config: The base dataset configuration.
      **overrides: Config fields to replace on `config` (via
        `config.replace`) for this builder.
    """
    self.config = config.replace(**overrides)
    self.builder_info = None
    # Normally set by `_build()`. Defined up front so that `load_tfds()` and
    # `pipeline()` can also be called directly without an AttributeError.
    self.input_context = None

    if self.config.augmenter is not None:
      logging.info('Using augmentation: %s', self.config.augmenter.name)
      self.augmenter = self.config.augmenter.build()
    else:
      self.augmenter = None

  @property
  def is_training(self) -> bool:
    """Whether this is the training set."""
    return self.config.split == 'train'

  @property
  def batch_size(self) -> int:
    """The batch size, multiplied by the number of replicas (if configured)."""
    if self.config.use_per_replica_batch_size:
      return self.config.batch_size * self.config.num_devices
    else:
      return self.config.batch_size

  @property
  def global_batch_size(self) -> int:
    """The global batch size across all replicas."""
    return self.batch_size

  @property
  def local_batch_size(self) -> int:
    """The base unscaled (per-replica) batch size."""
    if self.config.use_per_replica_batch_size:
      return self.config.batch_size
    else:
      # The configured batch size is global here, so split it across devices.
      return self.config.batch_size // self.config.num_devices

  @property
  def num_steps(self) -> int:
    """The number of steps (batches) to exhaust this dataset."""
    # Always divide by the global batch size to get the correct # of steps
    return self.num_examples // self.global_batch_size

  @property
  def dtype(self) -> tf.dtypes.DType:
    """Converts the config's dtype string to a tf dtype.

    Returns:
      The `tf.dtypes.DType` corresponding to the config's dtype string.

    Raises:
      ValueError: If the config's dtype is not supported.
    """
    dtype_map = {
        'float32': tf.float32,
        'bfloat16': tf.bfloat16,
        'float16': tf.float16,
        'fp32': tf.float32,
        'bf16': tf.bfloat16,
    }
    try:
      return dtype_map[self.config.dtype]
    except (KeyError, TypeError):
      # KeyError: unknown dtype string. TypeError: an unhashable value, which
      # can never be a valid key. Anything else should propagate untouched.
      raise ValueError('Invalid DType provided. Supported types: {}'.format(
          list(dtype_map)))

  @property
  def image_size(self) -> int:
    """The size of each image (can be inferred from the dataset)."""
    if self.config.image_size == 'infer':
      return self.info.features['image'].shape[0]
    else:
      return int(self.config.image_size)

  @property
  def num_channels(self) -> int:
    """The number of image channels (can be inferred from the dataset)."""
    if self.config.num_channels == 'infer':
      return self.info.features['image'].shape[-1]
    else:
      return int(self.config.num_channels)

  @property
  def num_examples(self) -> int:
    """The number of examples (can be inferred from the dataset)."""
    if self.config.num_examples == 'infer':
      return self.info.splits[self.config.split].num_examples
    else:
      return int(self.config.num_examples)

  @property
  def num_classes(self) -> int:
    """The number of classes (can be inferred from the dataset)."""
    if self.config.num_classes == 'infer':
      return self.info.features['label'].num_classes
    else:
      return int(self.config.num_classes)

  @property
  def info(self) -> tfds.core.DatasetInfo:
    """The TFDS dataset info, if available.

    The info is looked up from TFDS by `config.name` on first access and cached
    in `builder_info` afterwards.
    """
    if self.builder_info is None:
      self.builder_info = tfds.builder(self.config.name).info
    return self.builder_info

  def build(self, strategy: Optional[tf.distribute.Strategy] = None
            ) -> tf.data.Dataset:
    """Construct a dataset end-to-end and return it using an optional strategy.

    Args:
      strategy: a strategy that, if passed, will distribute the dataset
        according to that strategy. If passed and `num_devices > 1`,
        `use_per_replica_batch_size` must be set to `True`.

    Returns:
      A TensorFlow dataset outputting batched images and labels. When a
      strategy is passed, this is whatever the strategy's
      distribute-from-function method returns for `_build`.
    """
    if strategy is None:
      return self._build()

    if strategy.num_replicas_in_sync != self.config.num_devices:
      logging.warning('Passed a strategy with %d devices, but expected '
                      '%d devices.',
                      strategy.num_replicas_in_sync,
                      self.config.num_devices)

    # Prefer the non-experimental API where the installed TensorFlow provides
    # it, and fall back to the older experimental name otherwise.
    distribute_fn = getattr(
        strategy, 'distribute_datasets_from_function', None)
    if distribute_fn is None:
      distribute_fn = strategy.experimental_distribute_datasets_from_function
    return distribute_fn(self._build)

  def _build(self,
             input_context: Optional[tf.distribute.InputContext] = None
             ) -> tf.data.Dataset:
    """Construct a dataset end-to-end and return it.

    Args:
      input_context: An optional context provided by `tf.distribute` for
        cross-replica training.

    Returns:
      A TensorFlow dataset outputting batched images and labels.

    Raises:
      ValueError: If `config.builder` is not a known builder type.
    """
    builders = {
        'tfds': self.load_tfds,
        'records': self.load_records,
        'synthetic': self.load_synthetic,
    }

    builder = builders.get(self.config.builder, None)

    if builder is None:
      raise ValueError('Unknown builder type {}'.format(self.config.builder))

    # Stored on the instance because the loaders and `pipeline()` read it.
    self.input_context = input_context
    dataset = builder()
    dataset = self.pipeline(dataset)

    return dataset

  def load_tfds(self) -> tf.data.Dataset:
    """Return a dataset loading files from TFDS."""

    logging.info('Using TFDS to load data.')

    builder = tfds.builder(self.config.name,
                           data_dir=self.config.data_dir)

    if self.config.download:
      builder.download_and_prepare()

    decoders = {}

    if self.config.skip_decoding:
      # Leave images encoded; they are handed to `preprocess()` as-is.
      decoders['image'] = tfds.decode.SkipDecoding()

    # The input context is passed to TFDS here; `pipeline()` does not shard
    # datasets produced by the 'tfds' builder again.
    read_config = tfds.ReadConfig(
        interleave_cycle_length=10,
        interleave_block_length=1,
        input_context=self.input_context)

    dataset = builder.as_dataset(
        split=self.config.split,
        as_supervised=True,
        shuffle_files=True,
        decoders=decoders,
        read_config=read_config)

    return dataset

  def load_records(self) -> tf.data.Dataset:
    """Return a dataset of TFRecord file names.

    The file names come from `config.filenames` if set; otherwise every file
    in `config.data_dir` whose name starts with the split name is listed. The
    records themselves are read later, in `pipeline()`.

    Returns:
      A `tf.data.Dataset` of file names.

    Raises:
      ValueError: If neither `config.filenames` nor `config.data_dir` is set.
    """
    logging.info('Using TFRecords to load data.')
    if self.config.filenames is None:
      if self.config.data_dir is None:
        raise ValueError('Dataset must specify a path for the data files.')

      file_pattern = os.path.join(self.config.data_dir,
                                  '{}*'.format(self.config.split))
      dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
    else:
      dataset = tf.data.Dataset.from_tensor_slices(self.config.filenames)

    return dataset

  def load_synthetic(self) -> tf.data.Dataset:
    """Return a dataset generating dummy synthetic data.

    Returns:
      An endlessly repeating `tf.data.Dataset` of all-zero
      `[image_size, image_size, num_channels]` images and all-zero `[1]`
      int32 labels.
    """
    logging.info('Generating a synthetic dataset.')

    def generate_data(_):
      image = tf.zeros([self.image_size, self.image_size, self.num_channels],
                       dtype=self.dtype)
      label = tf.zeros([1], dtype=tf.int32)
      return image, label

    dataset = tf.data.Dataset.range(1)
    dataset = dataset.repeat()
    dataset = dataset.map(generate_data,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset

  def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: The `tf.data.Dataset` returned by one of the `load_*` methods
        (file names for the 'records' builder, examples otherwise).

    Returns:
      A TensorFlow dataset outputting batched images and labels.

    Raises:
      ValueError: If an input context is set with more than one device while
        `use_per_replica_batch_size` is disabled, or if `tf_data_service` is
        configured but this TensorFlow version has no
        `tf.data.experimental.service`.
    """
    if (self.config.builder != 'tfds' and self.input_context
        and self.input_context.num_input_pipelines > 1):
      dataset = dataset.shard(self.input_context.num_input_pipelines,
                              self.input_context.input_pipeline_id)
      logging.info('Sharding the dataset: input_pipeline_id=%d '
                   'num_input_pipelines=%d',
                   self.input_context.input_pipeline_id,
                   self.input_context.num_input_pipelines)

    if self.is_training and self.config.builder == 'records':
      # Shuffle the input files. `shuffle` returns a new dataset, so the
      # result must be assigned back for the shuffle to take effect.
      dataset = dataset.shuffle(
          buffer_size=self.config.file_shuffle_buffer_size)

    if self.is_training and not self.config.cache:
      dataset = dataset.repeat()

    if self.config.builder == 'records':
      # Read the data from disk in parallel
      dataset = dataset.interleave(
          tf.data.TFRecordDataset,
          cycle_length=10,
          block_length=1,
          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self.config.cache:
      dataset = dataset.cache()

    if self.is_training:
      dataset = dataset.shuffle(self.config.shuffle_buffer_size)
      dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel
    if self.config.builder == 'records':
      preprocess = self.parse_record
    else:
      preprocess = self.preprocess
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self.input_context and self.config.num_devices > 1:
      if not self.config.use_per_replica_batch_size:
        raise ValueError(
            'The builder does not support a global batch size with more than '
            'one replica. Got {} replicas. Please set a '
            '`per_replica_batch_size` and enable '
            '`use_per_replica_batch_size=True`.'.format(
                self.config.num_devices))

      # The batch size of the dataset will be multiplied by the number of
      # replicas automatically when strategy.distribute_datasets_from_function
      # is called, so we use local batch size here.
      dataset = dataset.batch(self.local_batch_size,
                              drop_remainder=self.is_training)
    else:
      dataset = dataset.batch(self.global_batch_size,
                              drop_remainder=self.is_training)

    # Prefetch overlaps in-feed with training
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    if self.config.tf_data_service:
      if not hasattr(tf.data.experimental, 'service'):
        raise ValueError('The tf_data_service flag requires Tensorflow version '
                         '>= 2.3.0, but the version is {}'.format(
                             tf.__version__))
      dataset = dataset.apply(
          tf.data.experimental.service.distribute(
              processing_mode='parallel_epochs',
              service=self.config.tf_data_service,
              job_name='resnet_train'))
      dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset

  def parse_record(self, record: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Parse an ImageNet record from a serialized string Tensor.

    Args:
      record: A scalar string tensor holding one serialized example.

    Returns:
      The `(image, label)` pair produced by `preprocess()` from the record's
      encoded image bytes and its class label shifted down by one.
    """
    keys_to_features = {
        'image/encoded':
            tf.io.FixedLenFeature((), tf.string, ''),
        'image/format':
            tf.io.FixedLenFeature((), tf.string, 'jpeg'),
        'image/class/label':
            tf.io.FixedLenFeature([], tf.int64, -1),
        'image/class/text':
            tf.io.FixedLenFeature([], tf.string, ''),
        'image/object/bbox/xmin':
            tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin':
            tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax':
            tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax':
            tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/class/label':
            tf.io.VarLenFeature(dtype=tf.int64),
    }

    parsed = tf.io.parse_single_example(record, keys_to_features)

    label = tf.reshape(parsed['image/class/label'], shape=[1])

    # Subtract one so that labels are in [0, 1000)
    label -= 1

    image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
    image, label = self.preprocess(image_bytes, label)

    return image, label

  def preprocess(self, image: tf.Tensor, label: tf.Tensor
                ) -> Tuple[tf.Tensor, tf.Tensor]:
    """Apply image preprocessing and augmentation to the image and label.

    Args:
      image: The image tensor handed to the `preprocessing` module (encoded
        bytes when called from `parse_record()`).
      label: The label tensor for the image.

    Returns:
      A tuple of the preprocessed image and the label cast to int32; the label
      is one-hot encoded with shape `[num_classes]` when `config.one_hot` is
      set.
    """
    if self.is_training:
      # The augmenter is only applied on the training path.
      image = preprocessing.preprocess_for_train(
          image,
          image_size=self.image_size,
          mean_subtract=self.config.mean_subtract,
          standardize=self.config.standardize,
          dtype=self.dtype,
          augmenter=self.augmenter)
    else:
      image = preprocessing.preprocess_for_eval(
          image,
          image_size=self.image_size,
          num_channels=self.num_channels,
          mean_subtract=self.config.mean_subtract,
          standardize=self.config.standardize,
          dtype=self.dtype)

    label = tf.cast(label, tf.int32)
    if self.config.one_hot:
      label = tf.one_hot(label, self.num_classes)
      # Flatten the one-hot label to a single `[num_classes]` vector.
      label = tf.reshape(label, [self.num_classes])

    return image, label

  @classmethod
  def from_params(cls, *args, **kwargs):
    """Construct a dataset builder from a default config and any overrides.

    Args:
      *args: Positional arguments forwarded to `DatasetConfig.from_args`.
      **kwargs: Keyword arguments forwarded to `DatasetConfig.from_args`.

    Returns:
      A new builder wrapping the resulting `DatasetConfig`.
    """
    config = DatasetConfig.from_args(*args, **kwargs)
    return cls(config)