diff --git a/README.md b/README.md index efe14740..80ed678f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,6 @@ - **August 2023 (image compression)** - [Released PyTorch implementation of MS-ILLM](https://github.com/facebookresearch/NeuralCompression/tree/main/projects/illm) - **April 2023 (video compression)** - [Released PyTorch implementation of VCT](https://github.com/facebookresearch/NeuralCompression/tree/main/projects/torch_vct) - **November 2022 (image compression)** - [Released Bits-Back coding with diffusion models](https://github.com/facebookresearch/NeuralCompression/tree/main/projects/bits_back_diffusion)! -- **July 2021 (image compression)** - [Released implemenation of Scale Hyperprior](https://github.com/facebookresearch/NeuralCompression/tree/main/projects/scale_hyperprior_lightning) - **July 2021 (video compression)** - [Released implementation of DVC](https://github.com/facebookresearch/NeuralCompression/tree/main/projects/deep_video_compression) ## About @@ -80,7 +79,6 @@ The 2-tier structure enables rapid iteration and reproduction via code in - `deep_video_compression` [DVC (Lu et al., 2019)](https://openaccess.thecvf.com/content_CVPR_2019/html/Lu_DVC_An_End-To-End_Deep_Video_Compression_Framework_CVPR_2019_paper.html), might soon be deprecated - `illm` [MS-ILLM (Muckley et al., 2023)](https://proceedings.mlr.press/v202/muckley23a.html) - `jax_entropy_coders` - implementations of arithmetic coding and ANS in JAX - - `scale_hyperprior_lightning` [Scale Hyperprior (Balle et al., 2018)](https://arxiv.org/abs/1802.01436), might soon be deprecated - `torch_vct` [VCT (Mentzer, et al.,)](https://proceedings.neurips.cc/paper_files/paper/2022/hash/54dcf25318f9de5a7a01f0a4125c541e-Abstract-Conference.html) ## Tutorial Notebooks diff --git a/projects/scale_hyperprior_lightning/README.md b/projects/scale_hyperprior_lightning/README.md deleted file mode 100644 index 18adb624..00000000 --- a/projects/scale_hyperprior_lightning/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# Scale Hyperprior Training in PyTorch Lightning - -This project trains the -[scale hyperprior model](https://arxiv.org/pdf/1802.01436.pdf) -on the [Vimeo-90k septuplet](http://toflow.csail.mit.edu/) dataset. - -The project uses -[PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/latest/) -as a training framework and [Hydra](https://hydra.cc/) for configuration. - -## Installation - -After installing the `neuralcompression` package following the -[top-level README](https://github.com/facebookresearch/NeuralCompression/README.md) -instructions, install this project's additional dependencies with: - -```bash -pip install -r requirements.txt -``` - -## Training the Model - -The config options for this model are documented in `config/base.yaml`. -The only parameter that must be specified by the user is the path to -vimeo dataset - this can be done by modifying the config file itself or by -passing arguments on the command line (see the -[Hydra documentation](https://hydra.cc/docs/intro#basic-example) for details). -Training a model locally using default hyperparameters can be run with: - -```bash -python train.py data.data_dir=/path/to/vimeo -``` - -### Cluster Training - -Using Hydra's -[submitit plugin](https://hydra.cc/docs/next/plugins/submitit_launcher/), -you can also launch training jobs on SLURM clusters. -This can be configured by passing `+mode=submitit_single_node` or -`+mode=submitit_multi_node` as command line arguments. -The `--multirun/-m` flag must also be passed to load the plugin. -For example, to train a model on 2 nodes, each with 3 gpus, run: - -```bash -python train.py -m data.data_dir=/path/to/vimeo +mode=submitit_multi_node ngpu=3 trainer.num_nodes=2 -``` - -## Testing - -After following the installation instructions above, this project's tests can -be run with: - -```bash -pytest tests/ -```` diff --git a/projects/scale_hyperprior_lightning/config/base.yaml b/projects/scale_hyperprior_lightning/config/base.yaml deleted file mode 100644 index 7959333d..00000000 --- a/projects/scale_hyperprior_lightning/config/base.yaml +++ /dev/null @@ -1,53 +0,0 @@ -save_dir: "." -resume_training: True # If True, resumes from the last checkpoint if one exists. - -# If overwrite and resume_training are both False, the script will throw an error -# if checkpoints already exist in the save_dir. -overwrite: False -ngpu: 1 - - -model: # See ScaleHyperprior for parameter details. - network_channels: 128 - compression_channels: 192 - - -training_loop: - distortion_lambda: 1e-2 - learning_rate: 1e-4 - aux_learning_rate: 1e-3 - - -data: - data_dir: ??? - num_workers: 4 - patch_size: [256, 256] - train_batch_size: 8 - val_batch_size: 8 - - -save_model: # Passed to PyTorch Lightning's ModelCheckpoint callback. - dirpath: ${save_dir} - save_top_k: 1 - monitor: "val_loss" - save_last: True - - -hydra: # So hydra will put your config info in the same dir as your checkpoints - run: - dir: ${save_dir} - sweep: - dir: ${save_dir} - - -loggers: - - _target_: pytorch_lightning.loggers.WandbLogger - save_dir: ${save_dir} - - -# These flags are passed to the PyTorch Lightning Trainer - add -# any extra customization here! -trainer: - max_steps: 1000000 # 1M - gpus: ${ngpu} - accelerator: ddp diff --git a/projects/scale_hyperprior_lightning/config/mode/submitit_multi_node.yaml b/projects/scale_hyperprior_lightning/config/mode/submitit_multi_node.yaml deleted file mode 100644 index e0ec4f2d..00000000 --- a/projects/scale_hyperprior_lightning/config/mode/submitit_multi_node.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# @package _global_ -defaults: - - override /hydra/launcher: submitit_slurm - -trainer: - num_nodes: 1 - -hydra: - launcher: - gpus_per_node: ${ngpu} - tasks_per_node: ${ngpu} - nodes: ${trainer.num_nodes} diff --git a/projects/scale_hyperprior_lightning/config/mode/submitit_single_node.yaml b/projects/scale_hyperprior_lightning/config/mode/submitit_single_node.yaml deleted file mode 100644 index e75ac157..00000000 --- a/projects/scale_hyperprior_lightning/config/mode/submitit_single_node.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# @package _global_ -defaults: - - override /hydra/launcher: submitit_slurm - -hydra: - launcher: - gpus_per_node: ${ngpu} - tasks_per_node: ${ngpu} diff --git a/projects/scale_hyperprior_lightning/requirements.txt b/projects/scale_hyperprior_lightning/requirements.txt deleted file mode 100644 index 3244bfe6..00000000 --- a/projects/scale_hyperprior_lightning/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pytorch-lightning==1.3.3 -hydra-core==1.1.0rc1 -hydra-submitit-launcher==1.1.5.dev2 diff --git a/projects/scale_hyperprior_lightning/scale_hyperprior.py b/projects/scale_hyperprior_lightning/scale_hyperprior.py deleted file mode 100644 index 4def626a..00000000 --- a/projects/scale_hyperprior_lightning/scale_hyperprior.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from typing import List, Sequence, Tuple - -import torch.nn.functional as F -import torch.optim as optim -from pytorch_lightning import LightningModule -from torch import Tensor - -from neuralcompression.models import ScaleHyperprior - - -class ScaleHyperpriorLightning(LightningModule): - """ - Model and training loop for the scale hyperprior model. - - Combines a pre-defined scale hyperprior model with its training loop - for use with PyTorch Lightning. - - Args: - model: the ScaleHyperprior model to train. - distortion_lambda: A scaling factor for the distortion term - of the loss. - learning_rate: passed to the main network optimizer (i.e. the one that - adjusts the analysis and synthesis parameters). - aux_learning_rate: passed to the optimizer that learns the quantiles - used to build the CDF table for the entropy codder. - """ - - def __init__( - self, - model: ScaleHyperprior, - distortion_lambda: float = 1e-2, - learning_rate: float = 1e-3, - aux_learning_rate: float = 1e-3, - ): - super().__init__() - - self.model = model - self.learning_rate = learning_rate - self.aux_learning_rate = aux_learning_rate - self.distortion_lambda = distortion_lambda - - def forward(self, images): - return self.model(images) - - def rate_distortion_loss( - self, - reconstruction: Tensor, - latent_likelihoods: Tensor, - hyper_latent_likelihoods: Tensor, - original: Tensor, - ): - num_images, _, height, width = original.shape - num_pixels = num_images * height * width - - bits = ( - latent_likelihoods.log().sum() + hyper_latent_likelihoods.log().sum() - ) / -math.log(2) - bpp_loss = bits / num_pixels - - distortion_loss = F.mse_loss(reconstruction, original) - combined_loss = self.distortion_lambda * 255**2 * distortion_loss + bpp_loss - - return bpp_loss, distortion_loss, combined_loss - - def update(self, force=True): - return self.model.update(force=force) - - def compress( - self, images: Tensor - ) -> Tuple[List[str], List[str], Sequence[int], Sequence[int], Sequence[int]]: - return self.model.compress(images) - - def decompress( - self, - y_strings: List[str], - z_strings: List[str], - image_shape: Sequence[int], - y_shape: Sequence[int], - z_shape: Sequence[int], - ): - return self.model.decompress( - y_strings, z_strings, image_shape, y_shape, z_shape - ) - - def training_step(self, batch, batch_idx, optimizer_idx): - if optimizer_idx not in [0, 1]: - raise ValueError( - f"Received unexpected optimizer index {optimizer_idx}" - " - should be 0 or 1" - ) - - if optimizer_idx == 0: - x_hat, y_likelihoods, z_likelihoods = self(batch) - bpp_loss, distortion_loss, combined_loss = self.rate_distortion_loss( - x_hat, y_likelihoods, z_likelihoods, batch - ) - self.log_dict( - { - "bpp_loss": bpp_loss.item(), - "distortion_loss": distortion_loss.item(), - "loss": combined_loss.item(), - }, - sync_dist=True, - ) - return combined_loss - - else: - # This is the loss for learning the quantiles of the - # distribution for the hyperprior. - quantile_loss = self.model.quantile_loss() - self.log("quantile_loss", quantile_loss.item(), sync_dist=True) - return quantile_loss - - def validation_step(self, batch, batch_idx): - x_hat, y_likelihoods, z_likelihoods = self(batch) - bpp_loss, distortion_loss, combined_loss = self.rate_distortion_loss( - x_hat, y_likelihoods, z_likelihoods, batch - ) - - self.log_dict( - { - "val_loss": combined_loss.item(), - "val_distortion_loss": distortion_loss.item(), - "val_bpp_loss": bpp_loss.item(), - }, - sync_dist=True, - ) - - def configure_optimizers(self): - model_param_dict, quantile_param_dict = self.model.collect_parameters() - - optimizer = optim.Adam( - model_param_dict.values(), - lr=self.learning_rate, - ) - aux_optimizer = optim.Adam( - quantile_param_dict.values(), - lr=self.aux_learning_rate, - ) - - lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min") - - return ( - { - "optimizer": optimizer, - "lr_scheduler": {"scheduler": lr_scheduler, "monitor": "val_loss"}, - }, - {"optimizer": aux_optimizer}, - ) diff --git a/projects/scale_hyperprior_lightning/tests/test_scale_hyperprior_lightning.py b/projects/scale_hyperprior_lightning/tests/test_scale_hyperprior_lightning.py deleted file mode 100644 index 8edfe116..00000000 --- a/projects/scale_hyperprior_lightning/tests/test_scale_hyperprior_lightning.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import pytest -import torch -from pytorch_lightning import Trainer -from torch.utils.data import DataLoader - -from neuralcompression.models import ScaleHyperprior -from projects.scale_hyperprior_lightning.scale_hyperprior import ( - ScaleHyperpriorLightning, -) - - -@pytest.mark.parametrize( - "network_channels,compression_channels,img_size,batch_size", - [(32, 64, 128, 1), (32, 64, 128, 4)], -) -def test_hyperprior_training( - network_channels, compression_channels, img_size, batch_size -): - # Tests the training and validation loop of the PTL lightning module. - # - # Tests that the scale hyperprior's PyTorch LightningModule (which - # is responsible for the training and validation loop logic, logging, - # etc.) can complete several training and validation steps without - # crashing. - - train_ds = [torch.randn(3, img_size, img_size) for _ in range(10)] - val_ds = [torch.randn(3, img_size, img_size) for _ in range(10)] - - train_dl = DataLoader(train_ds, batch_size=batch_size) - val_dl = DataLoader(val_ds, batch_size=batch_size) - - module = ScaleHyperprior( - network_channels=network_channels, compression_channels=compression_channels - ) - - lightning_module = ScaleHyperpriorLightning(module) - - trainer = Trainer(fast_dev_run=3) - trainer.fit(lightning_module, train_dataloader=train_dl, val_dataloaders=val_dl) diff --git a/projects/scale_hyperprior_lightning/train.py b/projects/scale_hyperprior_lightning/train.py deleted file mode 100644 index 36eb0332..00000000 --- a/projects/scale_hyperprior_lightning/train.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from pathlib import Path - -import hydra -from omegaconf import DictConfig -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint -from scale_hyperprior import ScaleHyperpriorLightning -from vimeo import Vimeo90kSeptupletLightning - -from neuralcompression.models import ScaleHyperprior - - -@hydra.main(config_path="config", config_name="base") -def main(cfg: DictConfig): - save_dir: Path = Path(hydra.utils.get_original_cwd()) / cfg.save_dir - - if ( - not cfg.overwrite - and not cfg.resume_training - and len(list(save_dir.glob("*.ckpt"))) > 0 - ): - raise RuntimeError( - "Checkpoints detected in save directory: set resume_training=True" - " to restore trainer state from these checkpoints, or set overwrite=True" - " to ignore them." - ) - - save_dir.mkdir(exist_ok=True, parents=True) - last_checkpoint = save_dir / "last.ckpt" - - model = ScaleHyperprior(**cfg.model) - lightning_model = ScaleHyperpriorLightning(model, **cfg.training_loop) - - data = Vimeo90kSeptupletLightning(**cfg.data, pin_memory=cfg.ngpu != 0) - - loggers = [hydra.utils.instantiate(logger_cfg) for logger_cfg in cfg.loggers] - trainer = Trainer( - **cfg.trainer, - logger=loggers, - callbacks=[ - LearningRateMonitor(), - ModelCheckpoint(**cfg.save_model), - ], - resume_from_checkpoint=last_checkpoint - if last_checkpoint.exists() and cfg.resume_training - else None, - ) - - trainer.fit(lightning_model, datamodule=data) - - -if __name__ == "__main__": - main() diff --git a/projects/scale_hyperprior_lightning/vimeo.py b/projects/scale_hyperprior_lightning/vimeo.py deleted file mode 100644 index 4a25fefd..00000000 --- a/projects/scale_hyperprior_lightning/vimeo.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import List, Optional, Sequence, Union - -from pytorch_lightning import LightningDataModule -from torch.utils.data import DataLoader -from torchvision import transforms - -from neuralcompression.data import Vimeo90kSeptuplet - - -class Vimeo90kSeptupletLightning(LightningDataModule): - """ - PyTorch Lightning data module version of ``Vimeo90kSeptuplet``. - - Args: - data_dir: root directory of Vimeo dataset. - train_batch_size: the batch size to use during training. - val_batch_size: the batch size to use during validation. - patch_size: the size of the crop to take from the original images. - num_workers: the number of parallel workers to create to load data - items (see PyTorch's Dataloader documentation for more details). - pin_memory: whether prepared items should be loaded into pinned memory - or not. This improves performance on GPUs. - """ - - def __init__( - self, - data_dir: str, - train_batch_size: int = 8, - val_batch_size: int = 8, - patch_size: Union[int, Sequence[int]] = (256, 256), - num_workers: int = 0, - pin_memory: bool = False, - ): - super().__init__() - - self.data_dir = data_dir - self.train_batch_size = train_batch_size - self.val_batch_size = val_batch_size - self.patch_size = patch_size - self.num_workers = num_workers - self.pin_memory = pin_memory - - def setup(self, stage: Optional[str] = None) -> None: - train_transforms = transforms.Compose( - [transforms.RandomCrop(self.patch_size), transforms.ToTensor()] - ) - - val_transforms = transforms.Compose( - [transforms.CenterCrop(self.patch_size), transforms.ToTensor()] - ) - - self.train_dataset = Vimeo90kSeptuplet( - self.data_dir, - pil_transform=train_transforms, - split="train", - ) - - self.val_dataset = Vimeo90kSeptuplet( - self.data_dir, - pil_transform=val_transforms, - split="test", - ) - - def train_dataloader(self) -> DataLoader: - return DataLoader( - self.train_dataset, - batch_size=self.train_batch_size, - num_workers=self.num_workers, - shuffle=True, - pin_memory=self.pin_memory, - ) - - def val_dataloader(self) -> Union[DataLoader, List[DataLoader]]: - return DataLoader( - self.val_dataset, - batch_size=self.val_batch_size, - num_workers=self.num_workers, - shuffle=False, - pin_memory=self.pin_memory, - )