Igor lig 4447 w mse benchmark #1474

Open · wants to merge 13 commits into base: master
2 changes: 2 additions & 0 deletions benchmarks/imagenet/resnet50/main.py
@@ -17,6 +17,7 @@
import tico
import torch
import vicreg
import wmse
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import (
DeviceStatsMonitor,
@@ -64,6 +65,7 @@
"swav": {"model": swav.SwAV, "transform": swav.transform},
"tico": {"model": tico.TiCo, "transform": tico.transform},
"vicreg": {"model": vicreg.VICReg, "transform": vicreg.transform},
"wmse": {"model": wmse.WMSE, "transform": wmse.transform},
}


109 changes: 109 additions & 0 deletions benchmarks/imagenet/resnet50/wmse.py
@@ -0,0 +1,109 @@
import math
from typing import List, Tuple

import torch
from pytorch_lightning import LightningModule
from torch import Tensor
from torch.nn import Identity
from torchvision.models import resnet50

from lightly.loss.wmse_loss import WMSELoss
from lightly.models.modules import WMSEProjectionHead
from lightly.models.utils import get_weight_decay_parameters
from lightly.transforms import WMSETransform
from lightly.utils.benchmarking import OnlineLinearClassifier
from lightly.utils.lars import LARS
from lightly.utils.scheduler import CosineWarmupScheduler


class WMSE(LightningModule):
def __init__(self, batch_size_per_device: int, num_classes: int) -> None:
super().__init__()
self.save_hyperparameters()
self.batch_size_per_device = batch_size_per_device

resnet = resnet50()
resnet.fc = Identity() # Ignore classification head
self.backbone = resnet

# we use a projection head with output dimension 64
# and w_size of 128 to support a batch size of 256
self.projection_head = WMSEProjectionHead(output_dim=64)
Contributor:
I think the output dimension is wrong here. From the paper:

Finally, we use an embedding size of 64 for CIFAR-10 and CIFAR-100, and an embedding of size 128 for STL-10 and Tiny ImageNet. For ImageNet-100 we use a configuration similar to the Tiny ImageNet experiments, and 240 epochs of training. Finally, in the ImageNet experiments (Tab. 3), we use the implementation and the hyperparameter configuration of (Chen et al., 2020b) (same number of layers in the projection head, etc.) based on their open-source implementation, the only difference being the learning rate and the loss function (respectively, 0.075 and the contrastive loss in (Chen et al., 2020b) vs. 0.1 and Eq. 6 in W-MSE 4).

So they're using a SimCLR2 projection head.

Contributor:

And most likely the embedding dim is the same as the one for SimCLR2.
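
If that reading is right, the head could reuse the existing SimCLRProjectionHead with SimCLR-v2-style settings. A minimal sketch, assuming a 3-layer MLP with 2048 hidden units and a 128-dimensional output; these exact values are an assumption based on the quoted paper text, not a decision from this review.

from lightly.models.modules import SimCLRProjectionHead

# Hypothetical ImageNet configuration implied by the review comments above.
projection_head = SimCLRProjectionHead(
    input_dim=2048,   # ResNet-50 feature dimension
    hidden_dim=2048,
    output_dim=128,   # assumed to match the SimCLR v2 embedding size
    num_layers=3,     # SimCLR v2 uses a 3-layer MLP head
)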


self.criterion_WMSE4loss = WMSELoss(
w_size=128, embedding_dim=64, num_samples=4, gather_distributed=True
Contributor:

For ImageNet they probably use w_size=256:

For CIFAR-10 and CIFAR-100, the slicing sub-batch size is 128, for Tiny ImageNet and STL-10, it is 256.

)
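
If the reviewer is right about the slicing sub-batch size, the loss would instead be constructed roughly as follows; w_size=256 and embedding_dim=128 come from the quoted paper text and are assumptions, not settings decided in this PR.

from lightly.loss.wmse_loss import WMSELoss

criterion = WMSELoss(
    embedding_dim=128,        # assumed to match the projection head output above
    w_size=256,               # slicing sub-batch size suggested for ImageNet-scale runs
    num_samples=4,            # W-MSE 4 variant, as in this benchmark
    gather_distributed=True,  # gather embeddings across devices before whitening
)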

self.online_classifier = OnlineLinearClassifier(num_classes=num_classes)

def forward(self, x: Tensor) -> Tensor:
return self.backbone(x)

def training_step(
self, batch: Tuple[List[Tensor], Tensor, List[str]], batch_idx: int
) -> Tensor:
views, targets = batch[0], batch[1]
features = self.forward(torch.cat(views)).flatten(start_dim=1)
z = self.projection_head(features)
loss = self.criterion_WMSE4loss(z)
self.log(
"train_loss", loss, prog_bar=True, sync_dist=True, batch_size=len(targets)
)

cls_loss, cls_log = self.online_classifier.training_step(
(features.detach(), targets.repeat(len(views))), batch_idx
)
self.log_dict(cls_log, sync_dist=True, batch_size=len(targets))
return loss + cls_loss

def validation_step(
self, batch: Tuple[Tensor, Tensor, List[str]], batch_idx: int
) -> Tensor:
images, targets = batch[0], batch[1]
features = self.forward(images).flatten(start_dim=1)
cls_loss, cls_log = self.online_classifier.validation_step(
(features.detach(), targets), batch_idx
)
self.log_dict(cls_log, prog_bar=True, sync_dist=True, batch_size=len(targets))
return cls_loss

def configure_optimizers(self):
# Don't use weight decay for batch norm, bias parameters, and classification
# head to improve performance.
params, params_no_weight_decay = get_weight_decay_parameters(
[self.backbone, self.projection_head]
)
optimizer = LARS(
[
{"name": "wmse", "params": params},
{
"name": "wmse_no_weight_decay",
"params": params_no_weight_decay,
"weight_decay": 0.0,
},
{
"name": "online_classifier",
"params": self.online_classifier.parameters(),
"weight_decay": 0.0,
},
],
lr=0.1 * math.sqrt(self.batch_size_per_device * self.trainer.world_size),
Contributor:

The denominator is missing here.

momentum=0.9,
weight_decay=1e-6,
)
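
On the learning-rate comment above: the PR scales the base rate by the square root of the global batch size without any reference denominator. Purely as illustration, the two scaling conventions commonly used with LARS look roughly like the sketch below; the reference batch size of 256 is an assumption, not a value agreed on in this review.

import math

batch_size_per_device, world_size = 256, 8  # hypothetical values for illustration
global_batch_size = batch_size_per_device * world_size

# square-root scaling relative to a reference batch size
lr_sqrt = 0.1 * math.sqrt(global_batch_size / 256)
# linear scaling relative to the same reference batch size
lr_linear = 0.1 * global_batch_size / 256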
scheduler = {
"scheduler": CosineWarmupScheduler(
optimizer=optimizer,
warmup_epochs=int(
self.trainer.estimated_stepping_batches
/ self.trainer.max_epochs
* 10
),
max_epochs=int(self.trainer.estimated_stepping_batches),
),
"interval": "step",
}
return [optimizer], [scheduler]


transform = WMSETransform()
37 changes: 34 additions & 3 deletions lightly/loss/wmse_loss.py
@@ -3,8 +3,10 @@
from typing import Callable

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from lightly.utils.dist import gather


def norm_mse_loss(x0: torch.Tensor, x1: torch.Tensor) -> torch.Tensor:
@@ -59,10 +61,18 @@

f_cov_shrinked = (1 - self.eps) * f_cov + self.eps * eye

# remember the dtype of f_cov_shrinked and temporarily convert to full precision
# to support Cholesky decomposition
f_cov_shrinked_type = f_cov_shrinked.dtype
Contributor:

This looks super duper hacky. Why is it necessary?

Contributor Author:

Yes, as written in the comment. The original code is not using half precision.

f_cov_shrinked = f_cov_shrinked.to(torch.float32)

inv_sqrt = torch.linalg.solve_triangular(
torch.linalg.cholesky(f_cov_shrinked), eye, upper=False
)

# convert back to original type
inv_sqrt = inv_sqrt.to(f_cov_shrinked_type)

inv_sqrt = inv_sqrt.contiguous().view(
self.num_features, self.num_features, 1, 1
)
@@ -117,6 +127,7 @@
w_size: int = 256,
loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = norm_mse_loss,
num_samples: int = 2,
gather_distributed: bool = False,
):
"""Parameters as described in [0]

@@ -137,6 +148,9 @@
Loss function to use for the whitening.
num_samples:
Number of samples generated by the transforms for each image.
gather_distributed:
If True then the embeddings from all GPUs are gathered before the loss calculation.


"""
@@ -147,15 +161,22 @@
eps=eps,
track_running_stats=track_running_stats,
)
if gather_distributed and not dist.is_available():
raise ValueError(

"gather_distributed is True but torch.distributed is not available. "
"Please set gather_distributed=False or install a torch version with "
"distributed support."
)
if embedding_dim * 2 > w_size:
raise ValueError(
"w_size should be at least twice the size of embedding_dim to avoid instabiliy"
f"w_size is {w_size} but it should be at least twice the size of embedding_dim which is {embedding_dim} to avoid instabiliy"
)
self.w_iter = w_iter
self.w_size = w_size
self.loss_f = loss_fn
self.num_samples = num_samples
self.num_pairs = num_samples * (num_samples - 1) // 2
self.gather_distributed = gather_distributed

def forward(self, input: torch.Tensor) -> torch.Tensor:
"""Calculates the W-MSE loss.
@@ -173,13 +194,23 @@
ValueError:
If the batch size is smaller than w_size.
"""
# gather all batches
if self.gather_distributed and dist.is_initialized():
world_size = dist.get_world_size()
if world_size > 1:
input = torch.cat(gather(input), dim=0)

Contributor:

Are you sure this is correct? Intuitively I think there could be problems because now every device computes the exact same loss, right?

Contributor Author:

I removed it but I will add it again. That seems the easiest and most proper way to support multi-GPU training. I'll make sure we divide the loss by the number of devices to make runs more comparable between different multi-GPU setups.
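
The adjustment described here (divide the gathered loss by the number of devices so that different multi-GPU setups stay comparable) could look roughly like the helper below. scale_gathered_loss is a hypothetical name and its exact placement in the loss is an assumption, not code from this PR.

import torch
import torch.distributed as dist


def scale_gathered_loss(loss: torch.Tensor) -> torch.Tensor:
    # With gathered embeddings every device computes the loss over the full
    # global batch; dividing by the world size keeps the reported loss on the
    # same scale as a single-device run, which is the comparability the
    # author describes.
    if dist.is_available() and dist.is_initialized():
        return loss / dist.get_world_size()
    return loss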


if input.shape[0] % self.num_samples != 0:
raise RuntimeError("input batch size must be divisible by num_samples")
raise RuntimeError(
f"input batch size is {input.shape[0]} but must be divisible by num_samples which is {self.num_samples}"
)

bs = input.shape[0] // self.num_samples

if bs < self.w_size:
raise ValueError("batch size must be greater than or equal to w_size")
raise ValueError(

f"batch size is {bs} but must be greater than or equal to w_size which is {self.w_size}"
)
loss = torch.tensor(0.0, device=input.device, requires_grad=True)

for _ in range(self.w_iter):
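For reference, the loss is consumed the same way as in the benchmark file above. A minimal standalone sketch with random data and small dimensions chosen only for illustration:

import torch

from lightly.loss.wmse_loss import WMSELoss

# 4 views per image (W-MSE 4) and 256 images give 1024 embeddings of dimension 64.
criterion = WMSELoss(embedding_dim=64, w_size=128, num_samples=4)
z = torch.randn(4 * 256, 64)  # concatenated projections, one block of 256 per view
loss = criterion(z)
print(loss)
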
1 change: 1 addition & 0 deletions lightly/models/modules/__init__.py
@@ -25,6 +25,7 @@
SMoGPrototypes,
SwaVProjectionHead,
SwaVPrototypes,
WMSEProjectionHead,
)
from lightly.models.modules.nn_memory_bank import NNMemoryBankModule

21 changes: 21 additions & 0 deletions lightly/models/modules/heads.py
@@ -699,6 +699,27 @@
)


class WMSEProjectionHead(SimCLRProjectionHead):
Contributor:

I don't think we need this. We should be able to use SimCLR instead.

Contributor Author:

@guarin, we should make sure things are consistent. I'm not sure what we agreed on. AFAIK, the same goes for the transforms.

Contributor:

Aren't the default values different?

In any case, I prefer if all components of the WMSE model are called WMSESomething. Mixing components from different models is always confusing and it makes the components harder to discover in the code. If two models have the same head then we can just subclass from the first model and update the docstring.

"""Projection head used for W-MSE.

Uses the same projection head as SimCLR.[0]

[0]: 2021, W-MSE, https://arxiv.org/pdf/2007.06346.pdf
"""

def __init__(
self,
input_dim: int = 2048,
hidden_dim: int = 2048,
output_dim: int = 128,
num_layers: int = 2,
batch_norm: bool = True,
):
super(WMSEProjectionHead, self).__init__(

Contributor:

Suggested change:
- super(WMSEProjectionHead, self).__init__(
+ super().__init__(

In general the class should not be passed to the super method.

input_dim, hidden_dim, output_dim, num_layers, batch_norm
)


class VICRegProjectionHead(ProjectionHead):
"""Projection head used for VICReg.
