Add support for mlflow #77

Open
wants to merge 305 commits into main
Changes from 250 commits
Commits (305)
6685e94
bugfixes
sadamov May 28, 2024
6423fdf
pre_commits
sadamov May 28, 2024
59c4947
Merge remote-tracking branch 'origin/main' into feature_dataset_yaml
sadamov May 31, 2024
4e457ed
config.py is ready for danra
sadamov May 31, 2024
adc592f
streamlined multi-zarr workflow
sadamov Jun 1, 2024
a7bea6b
xarray zarr based data normalization
sadamov Jun 1, 2024
1f7cbe8
adjusted pre-processing scripts to new data config workflow
sadamov Jun 2, 2024
e328152
plotting update with latest get_xy() function
sadamov Jun 2, 2024
cb85cda
making data config more modular
sadamov Jun 2, 2024
eb8c6fb
removing boundaries for now
sadamov Jun 2, 2024
0cfbb33
small updates
sadamov Jun 2, 2024
59d0c8a
improved stats and units retrieval
sadamov Jun 2, 2024
2f6a87a
add GPU-based runner on cirun.io
leifdenby Jun 3, 2024
668dd81
improved zarr-based normalization
sadamov Jun 3, 2024
143cf2a
pdm install with cpu torch
leifdenby Jun 3, 2024
b760915
ensure exec in pdm venv
leifdenby Jun 3, 2024
7797cef
ensure exec in pdm venv
leifdenby Jun 3, 2024
e689650
check version #2
leifdenby Jun 3, 2024
fb8ef23
check version no 3
leifdenby Jun 3, 2024
51b0a0b
check versions
leifdenby Jun 3, 2024
374d032
merge main
sadamov Jun 3, 2024
8fa3ca7
Introduced datetime forcing calculation as seperate script
sadamov Jun 3, 2024
a748903
Fixed order of y and x dims to adhere to #52
sadamov Jun 3, 2024
70425ee
fix for pip install
leifdenby Jun 3, 2024
60110f6
switch cirun instance type
leifdenby Jun 3, 2024
6fff3fc
install py39 on cirun runner
leifdenby Jun 3, 2024
74b4a10
cleanup: boundary_mask, zarr-opening, utils
sadamov Jun 4, 2024
0a041d1
Merge remote-tracking branch 'origin/main' into feature_dataset_yaml
sadamov Jun 4, 2024
8054e9e
change ami image to gpu
leifdenby Jun 4, 2024
39fbf3a
Merge remote-tracking branch 'upstream/main' into maint/deps-in-pypro…
leifdenby Jun 4, 2024
97aeb2e
use cheaper gpu instance
leifdenby Jun 4, 2024
425123c
adapted tests for zarr-analysis data
sadamov Jun 4, 2024
4dcf671
Readme adapted for yaml zarr analysis workflow
sadamov Jun 4, 2024
6d384f0
samller bugfixes and improvements
sadamov Jun 4, 2024
12ff4f2
Added fixed data config file for testing on Danra
sadamov Jun 4, 2024
03f7769
reducing runtime of tests with smaller sample
sadamov Jun 4, 2024
26f069c
download danra data for test and example (streaming not possible)
sadamov Jun 6, 2024
1f1cbcc
bugfixes after real-life testcase
sadamov Jun 6, 2024
b369306
Merge remote-tracking branch 'origin/main' into feature_dataset_yaml
sadamov Jun 6, 2024
0cdc361
organize .zarr in /data
sadamov Jun 6, 2024
23ca7b3
cleanup
sadamov Jun 6, 2024
81422f1
linter
sadamov Jun 6, 2024
124541b
static dataset doesn't have time dim
sadamov Jun 7, 2024
6140fdb
making two complex functions more modular
sadamov Jun 7, 2024
db6a912
chunk dataset by time
sadamov Jun 8, 2024
1aaa8dc
create list first for performance
sadamov Jun 8, 2024
81856b2
converting to_array is very slow
sadamov Jun 8, 2024
b3da818
allow for forcings to not be normalized
sadamov Jun 8, 2024
7ee5398
allow non_normalized_vars to be null
sadamov Jun 8, 2024
4782103
fixed coastlines using new xy_extent function
sadamov Jun 8, 2024
e0ffc5b
Some projections return inverted axes (rotatedPole)
sadamov Jun 9, 2024
c1f43b7
Docstrings added
sadamov Jun 13, 2024
21fd929
wip
leifdenby Jun 26, 2024
c52f98e
npy mllam nearly done
leifdenby Jul 6, 2024
80f3639
minor adjustment
leifdenby Jul 7, 2024
048f8c6
Merge branch 'main' of https://github.com/mllam/neural-lam into maint…
leifdenby Jul 11, 2024
5aaa239
add pooch and tweak pip cicd testing
leifdenby Jul 11, 2024
66c3b03
combine cicd tests with caching
leifdenby Jul 11, 2024
8566b8f
linting
leifdenby Jul 11, 2024
29bd9e5
add pyg dep
leifdenby Jul 11, 2024
bc7f028
set cirun aws region to frankfurt
leifdenby Jul 11, 2024
2070166
adapt image
leifdenby Jul 11, 2024
e4e86e5
set image
leifdenby Jul 11, 2024
1fba8fe
try different image
leifdenby Jul 11, 2024
02b77cf
add pooch to cicd
leifdenby Jul 11, 2024
b481929
add pdm gpu test
leifdenby Jul 16, 2024
bcec472
start work on readme
leifdenby Jul 16, 2024
c5beec9
Merge branch 'maint/deps-in-pyproject-toml' into datastore
leifdenby Jul 16, 2024
e89facc
Merge branch 'main' into maint/refactor-as-package
leifdenby Jul 16, 2024
0b5687a
Merge branch 'main' of https://github.com/mllam/neural-lam into maint…
leifdenby Jul 16, 2024
095fdbc
turn meps testdata download into pytest fixture
leifdenby Jul 16, 2024
49e9bfe
adapt README for package
leifdenby Jul 16, 2024
12cc02b
remove pdm cicd test (will be in separate PR)
leifdenby Jul 16, 2024
b47f50b
remove pdm in gitignore
leifdenby Jul 16, 2024
90d99ca
remove pdm and pyproject files (will be sep PR)
leifdenby Jul 16, 2024
a91eaaa
add pyproject.toml from main
leifdenby Jul 16, 2024
5508cea
clean out tests
leifdenby Jul 16, 2024
5c623c3
fix linting
leifdenby Jul 16, 2024
08ec168
add cli entrypoints import test
leifdenby Jul 16, 2024
d9cf7ba
Merge branch 'maint/refactor-as-package' into datastore
leifdenby Jul 16, 2024
3954f04
tweak cicd pytest execution
leifdenby Jul 16, 2024
f99fdce
Merge branch 'maint/refactor-as-package' into datastore
leifdenby Jul 16, 2024
db9d96f
Update tests/test_mllam_dataset.py
leifdenby Jul 17, 2024
3c864b2
grid-shape ok
leifdenby Jul 17, 2024
1f54b0e
get_vars_names and units
leifdenby Jul 17, 2024
9b88160
get_vars_names and units 2
leifdenby Jul 17, 2024
a9fdad5
test for stats
leifdenby Jul 23, 2024
555154f
get_dataarray test
leifdenby Jul 24, 2024
8b8a77e
get_dataarray test
leifdenby Jul 24, 2024
41f11cd
boundary_mask
leifdenby Jul 24, 2024
a17de0f
get_xy
leifdenby Jul 24, 2024
0a38a7d
remove TrainingSample dataclass
leifdenby Jul 24, 2024
f65f6b5
test for WeatherDataset.__getitem__
leifdenby Jul 24, 2024
a35100e
test for graph creation
leifdenby Jul 24, 2024
cfb0618
more graph creation tests
leifdenby Jul 24, 2024
8698719
check for consistency of num features across splits
leifdenby Jul 24, 2024
3381404
test for single batch from mllam through model
leifdenby Jul 24, 2024
2a6796c
Add init files to expose classes in editable package
joeloskarsson Jul 24, 2024
8f4e0e0
Linting
joeloskarsson Jul 24, 2024
e657abb
working training_step with datastores!
Jul 25, 2024
effc99b
remove superfluous tests
Jul 25, 2024
a047026
fix for dataset length
Jul 25, 2024
d2c62ed
step length should be int
Jul 25, 2024
58f5d99
step length should be int
Jul 25, 2024
64d43a6
training working with mllam datastore!
Jul 25, 2024
07444f8
adapt neural_lam.train_model for datastores
Jul 25, 2024
d1b6fc1
fixes for npy
Jul 25, 2024
6fe19ac
npyfiles datastore complete
leifdenby Jul 26, 2024
fe65a4d
cleanup for datastore examples
leifdenby Jul 26, 2024
e533794
training on ohm with danra!
Jul 26, 2024
640ac05
use mllam-data-prep v0.2.0
Aug 5, 2024
0f16f13
remove py3.12 from pre-commit
Aug 5, 2024
724548e
cleanup
Aug 8, 2024
a1b2037
all tests passing!
Aug 12, 2024
e35958f
use mllam-data-prep v0.3.0
Aug 12, 2024
8b92318
delete requirements.txt
Aug 13, 2024
658836a
remove .DS_Store
Aug 13, 2024
421efed
use tmate in gpu pdm cicd
Aug 13, 2024
05f1e9f
remove requirements
Aug 13, 2024
3afe0e4
update pdm gpu cicd setup to pdm venv on nvme drive
Aug 13, 2024
f3d028b
don't try to use pdm venv in-project
Aug 13, 2024
2c35662
remove tmate
Aug 13, 2024
5f30255
update README with install instructions
Aug 14, 2024
b2b5631
changelog
Aug 14, 2024
c8ae829
update ci/cd badges to include gpu + gpu
Aug 14, 2024
e7cf2c0
Merge pull request #1 from mllam/package_inits
leifdenby Aug 14, 2024
0b72e9d
add pyproject-flake8 to precommit config
Aug 14, 2024
190d1de
use Flake8-pyproject instead
Aug 14, 2024
791af0a
update README
Aug 14, 2024
58fab84
Merge branch 'maint/deps-in-pyproject-toml' into feat/datastores
Aug 14, 2024
dbe2e6d
Merge branch 'maint/refactor-as-package' into maint/deps-in-pyproject…
Aug 14, 2024
eac6e35
Merge branch 'maint/deps-in-pyproject-toml' into feat/datastores
Aug 14, 2024
799d55e
linting fixes
Aug 14, 2024
57bbb81
train only 1 epoch in cicd and print to stdout
Aug 14, 2024
a955cee
log datastore config
Aug 14, 2024
0a79c74
cleanup doctrings
Aug 15, 2024
9f3c014
Merge branch 'maint/refactor-as-package' into datastore
leifdenby Aug 19, 2024
41364a8
Merge branch 'main' of https://github.com/mllam/neural-lam into maint…
leifdenby Aug 19, 2024
3422298
update changelog
leifdenby Aug 19, 2024
689ef69
move dev deps optional dependencies group
leifdenby Aug 20, 2024
9a0d538
update cicd tests to install dev deps
leifdenby Aug 20, 2024
bddfcaf
update readme with new dev deps group
leifdenby Aug 20, 2024
b96cfdc
quote the skip step the install readme
leifdenby Aug 20, 2024
2600dee
remove unused files
leifdenby Aug 20, 2024
65a8074
Merge branch 'feat/datastores' of https://github.com/leifdenby/neural…
leifdenby Aug 20, 2024
6adf6cc
revert to line length of 80
leifdenby Aug 20, 2024
46b37f8
revert docstring formatting changes
leifdenby Aug 20, 2024
3cd0f8b
pin numpy to <2.0.0
leifdenby Aug 20, 2024
826270a
Merge branch 'maint/deps-in-pyproject-toml' into feat/datastores
leifdenby Aug 20, 2024
4ba22ea
Merge branch 'main' into feat/datastores
leifdenby Aug 20, 2024
1f661c6
fix flake8 linting errors
leifdenby Aug 20, 2024
4838872
Update neural_lam/weather_dataset.py
leifdenby Sep 8, 2024
b59e7e5
Update neural_lam/datastore/multizarr/create_normalization_stats.py
leifdenby Sep 8, 2024
75b1fe7
Update neural_lam/datastore/npyfiles/store.py
leifdenby Sep 8, 2024
7e736cb
Update neural_lam/datastore/npyfiles/store.py
leifdenby Sep 8, 2024
613a7e2
Update neural_lam/datastore/npyfiles/store.py
leifdenby Sep 8, 2024
65e199b
Update tests/test_training.py
leifdenby Sep 8, 2024
4435e26
Update tests/test_datasets.py
leifdenby Sep 8, 2024
4693408
Update README.md
leifdenby Sep 8, 2024
2dfed2c
update README
leifdenby Sep 10, 2024
c3d033d
Merge branch 'main' of https://github.com/mllam/neural-lam into feat/…
leifdenby Sep 10, 2024
4a70268
Merge branch 'feat/datastores' of https://github.com/leifdenby/neural…
leifdenby Sep 10, 2024
66c663f
column_water -> open_water_fraction
leifdenby Sep 10, 2024
11a7978
fix linting
leifdenby Sep 10, 2024
a41c314
static data same for all splits
leifdenby Sep 10, 2024
6f1efd6
forcing_window_size from args
leifdenby Sep 10, 2024
bacb9ec
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
4a9db4e
only use first ensemble member in datastores
leifdenby Sep 10, 2024
4fc2448
Merge branch 'feat/datastores' of https://github.com/leifdenby/neural…
leifdenby Sep 10, 2024
bcaa919
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
90bc594
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
5bda935
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
8e7931d
remove all multizarr functionality
leifdenby Sep 10, 2024
6998683
cleanup and test fixes for recent changes
leifdenby Sep 10, 2024
c415008
Merge branch 'feat/datastores' of https://github.com/leifdenby/neural…
leifdenby Sep 10, 2024
735d324
fix linting
leifdenby Sep 10, 2024
5f2d919
remove multizar example files
leifdenby Sep 10, 2024
5263d2c
normalization -> standardization
leifdenby Sep 10, 2024
ba1bec3
fix import for tests
leifdenby Sep 10, 2024
d04d15e
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
743d7a1
fix coord issues and add datastore example plotting cli
leifdenby Sep 12, 2024
ac10d7d
add lru_cache to get_xy_extent
leifdenby Sep 12, 2024
bf8172a
MLLAMDatastore -> MDPDatastore
leifdenby Sep 12, 2024
90ca400
missed renames for MDPDatastore
leifdenby Sep 12, 2024
154139d
update graph plot for datastores
leifdenby Sep 12, 2024
50ee0b0
use relative import
leifdenby Sep 12, 2024
7dfd570
add long_names and refactor npyfiles create weights
leifdenby Sep 12, 2024
2b45b5a
Update neural_lam/weather_dataset.py
leifdenby Sep 23, 2024
aee0b1c
Update neural_lam/weather_dataset.py
leifdenby Sep 23, 2024
8453c2b
Update neural_lam/models/ar_model.py
leifdenby Sep 27, 2024
7f32557
Update neural_lam/weather_dataset.py
leifdenby Sep 27, 2024
67998b8
read projection from datastore config extra section
leifdenby Sep 27, 2024
ac7e46a
NpyFilesDatastore -> NpyFilesDatastoreMEPS
leifdenby Sep 27, 2024
b7bf506
revert tp training with 1 AR step by default
leifdenby Sep 27, 2024
5df2ecf
add missing kwarg to BaseHiGraphModel.__init__
leifdenby Sep 27, 2024
d4d438f
add missing kwarg to HiLAM.__init__
leifdenby Sep 27, 2024
1889771
add missing kwarg to HiLAMParallel
leifdenby Sep 27, 2024
2c3bbde
check that for enough forecast steps given ar_steps
leifdenby Sep 27, 2024
f0a151b
remove numpy<2.0.0 version cap
leifdenby Sep 27, 2024
f3566b0
tweak print statement working in mdp
Oct 1, 2024
dba94b3
fix missed removed argument from cli
Oct 1, 2024
bca1482
remove wandb config log comment, we log now
Oct 1, 2024
fc973c4
ensure loading from checkpoint during train possible
Oct 1, 2024
9fcf06e
get step_length from datastore in plot_error_map
leifdenby Oct 1, 2024
2bbe666
remove step_legnth attr in ARModel
leifdenby Oct 1, 2024
b41ed2f
remove unused obs_mask arg for vis.plot_prediction
leifdenby Oct 1, 2024
7e46194
ensure no reference to multizarr "data_config"
leifdenby Oct 1, 2024
b57bc7a
introduce neural-lam config
leifdenby Oct 2, 2024
2b30715
include meps neural-lam config example
leifdenby Oct 2, 2024
8e7b2e6
fix extra space typo in BaseDatastore
leifdenby Oct 2, 2024
e0300fb
add check and print of train/test/val split in MDPDatastore
leifdenby Oct 2, 2024
a921e35
add experimental mlflow server support
leifdenby Oct 2, 2024
0f30259
more fixes for mlflow logging support
leifdenby Oct 3, 2024
3fbe2d0
Make wandb work again with pytorch_lightning.logger
khintz Oct 3, 2024
e0284a8
upload of artifact to mlflow works, but instantiates a new experiment
khintz Oct 4, 2024
7eed79b
make mlflow use same experiment run id as pl.logger.MLFlowLogger
khintz Oct 7, 2024
27408f2
logger artifact working for both wandb and mlflow
khintz Oct 7, 2024
e61a9e7
support mlflow system metrics logging
khintz Oct 7, 2024
b53bab5
support model logging for mlflow
khintz Oct 7, 2024
de27e9a
log model
khintz Oct 7, 2024
89d8cde
test system metrics
khintz Nov 13, 2024
54c7ca7
make mlflow work also for eval mode
khintz Nov 15, 2024
a47de0c
dummy prints to identify workflow
khintz Nov 21, 2024
10a4494
update mlflow on eval mode
khintz Nov 21, 2024
427a4b1
Merge branch 'main' into feat/mlflow
khintz Nov 21, 2024
78e874d
inspect plot routines
khintz Nov 25, 2024
5904cbe
identified issue, cleanup next
leifdenby Nov 25, 2024
efe0302
use xarray plot only
leifdenby Nov 26, 2024
a489c2e
don't reraise
leifdenby Nov 26, 2024
242d08b
remove debug plot
leifdenby Nov 26, 2024
c1f706c
remove extent calc used in diagnosing issue
leifdenby Nov 26, 2024
88ec9dc
Test order of dimension in eval plots
khintz Nov 28, 2024
d367cdb
Merge branch 'fix/eval-vis-plots' into feat/mlflow
khintz Nov 28, 2024
90f8918
fix tensors on cpu and plot time index
khintz Nov 28, 2024
53f0ea4
restore tests/test_datasets.py
khintz Nov 29, 2024
cfc249f
cleaning up with focus on linting
khintz Nov 29, 2024
b218c8b
update tests
khintz Nov 29, 2024
1f1aed8
use correct data module for input example
khintz Dec 4, 2024
f3abd47
Merge branch 'main' into feat/mlflow
khintz Dec 4, 2024
47932b5
clean log model function
khintz Dec 9, 2024
98dc5c4
Merge branch 'main' into feat/mlflow
khintz Dec 9, 2024
64971ae
revert bad merge
khintz Dec 9, 2024
010f716
remove unused init for datastore
khintz Dec 9, 2024
2620bd1
set logger url
khintz Dec 9, 2024
75a39e6
change type of default logger_url in config
khintz Dec 9, 2024
9d27a4c
linting
khintz Dec 9, 2024
8f42cd1
fix log_image issue in tests
khintz Dec 9, 2024
b5ebe6f
add entry to changelog
khintz Dec 9, 2024
ae69f3f
remove artifacts from earlier merging/rebase
khintz Dec 11, 2024
6e16035
catch error when aws credentials not set
khintz Dec 11, 2024
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -13,6 +13,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
[\#66](https://github.com/mllam/neural-lam/pull/66)
@leifdenby @sadamov

- Add support for MLFlow logging and metrics tracking. [\#77](https://github.com/mllam/neural-lam/pull/77)
@khintz

### Fixed

- Fix wandb environment variable disabling wandb during tests. Now correctly uses WANDB_MODE=disabled. [\#94](https://github.com/mllam/neural-lam/pull/94) @joeloskarsson
3 changes: 3 additions & 0 deletions neural_lam/config.py
Expand Up @@ -86,6 +86,9 @@ class TrainingConfig:
ManualStateFeatureWeighting, UniformFeatureWeighting
] = dataclasses.field(default_factory=UniformFeatureWeighting)

logger: str = "wandb"
logger_url: str = ""

Comment on lines +89 to +91 (review comment from a collaborator):
What do we actually want the config to contain, versus what should be command-line arguments? I would have thought that the choice of logger would be an argparse flag, similar to the plotting choices. My reasoning is that logging/plotting does not affect the end product (the trained model), whereas all the current options in the config do. But we are not really consistent with this divide either, as there are plenty of argparse options that currently change the model training.
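For illustration, a minimal sketch of the argparse-based alternative raised above; the flag names and defaults are hypothetical and not part of this PR:

```python
# Hypothetical sketch of selecting the logger via argparse instead of the config.
# Flag names and defaults are illustrative only.
from argparse import ArgumentParser

parser = ArgumentParser(description="Train neural-lam")
parser.add_argument(
    "--logger",
    type=str,
    default="wandb",
    choices=["wandb", "mlflow"],
    help="Experiment tracker to use (default: wandb)",
)
parser.add_argument(
    "--logger-url",
    type=str,
    default="",
    help="Tracking URI of the MLflow server (only used with --logger mlflow)",
)

# Example invocation
args = parser.parse_args(["--logger", "mlflow", "--logger-url", "http://localhost:5000"])
```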


@dataclasses.dataclass
class NeuralLAMConfig(dataclass_wizard.JSONWizard, dataclass_wizard.YAMLWizard):
3 changes: 2 additions & 1 deletion neural_lam/datastore/plot_example.py
Expand Up @@ -186,4 +186,5 @@ def _parse_dict(arg_str):
selection=selection,
index_selection=index_selection,
)
plt.show()
# plt.show()
plt.savefig("plot_example.png")
71 changes: 50 additions & 21 deletions neural_lam/models/ar_model.py
Expand Up @@ -7,7 +7,6 @@
import numpy as np
import pytorch_lightning as pl
import torch
import wandb
import xarray as xr

# Local
Expand Down Expand Up @@ -96,6 +95,7 @@ def __init__(
# Store constant per-variable std.-dev. weighting
# NOTE that this is the inverse of the multiplicative weighting
# in wMSE/wMAE
# TODO: Do we need param_weights for this?
self.register_buffer(
"per_var_std",
self.diff_std / torch.sqrt(self.feature_weights),
Expand Down Expand Up @@ -262,7 +262,7 @@ def unroll_prediction(self, init_states, forcing_features, true_states):
pred_std_list, dim=1
) # (B, pred_steps, num_grid_nodes, d_f)
else:
pred_std = self.per_var_std # (d_f,)
pred_std = self.diff_std # (d_f,)

return prediction, pred_std

Expand Down Expand Up @@ -539,14 +539,18 @@ def plot_examples(self, batch, n_examples, split, prediction=None):

example_i = self.plotted_examples

wandb.log(
{
f"{var_name}_example_{example_i}": wandb.Image(fig)
for var_name, fig in zip(
self._datastore.get_vars_names("state"), var_figs
)
}
)
for var_name, fig in zip(
self._datastore.get_vars_names("state"), var_figs
):

if isinstance(self.logger, pl.loggers.WandbLogger):
key = f"{var_name}_example_{example_i}"
else:
key = f"{var_name}_example"

if hasattr(self.logger, "log_image"):
self.logger.log_image(key=key, images=[fig], step=t_i)

plt.close(
"all"
) # Close all figs for this time step, saves memory
Expand All @@ -555,13 +559,15 @@ def plot_examples(self, batch, n_examples, split, prediction=None):
torch.save(
pred_slice.cpu(),
os.path.join(
wandb.run.dir, f"example_pred_{self.plotted_examples}.pt"
self.logger.save_dir,
f"example_pred_{self.plotted_examples}.pt",
),
)
torch.save(
target_slice.cpu(),
os.path.join(
wandb.run.dir, f"example_target_{self.plotted_examples}.pt"
self.logger.save_dir,
f"example_target_{self.plotted_examples}.pt",
),
)

Expand All @@ -582,16 +588,16 @@ def create_metric_log_dict(self, metric_tensor, prefix, metric_name):
datastore=self._datastore,
)
full_log_name = f"{prefix}_{metric_name}"
log_dict[full_log_name] = wandb.Image(metric_fig)
log_dict[full_log_name] = metric_fig

if prefix == "test":
# Save pdf
metric_fig.savefig(
os.path.join(wandb.run.dir, f"{full_log_name}.pdf")
os.path.join(self.logger.save_dir, f"{full_log_name}.pdf")
)
# Save errors also as csv
np.savetxt(
os.path.join(wandb.run.dir, f"{full_log_name}.csv"),
os.path.join(self.logger.save_dir, f"{full_log_name}.csv"),
metric_tensor.cpu().numpy(),
delimiter=",",
)
Expand Down Expand Up @@ -639,8 +645,25 @@ def aggregate_and_plot_metrics(self, metrics_dict, prefix):
)
)

# Ensure that log_dict has structure for
# logging as dict(str, plt.Figure)
assert all(
isinstance(key, str) and isinstance(value, plt.Figure)
for key, value in log_dict.items()
)

if self.trainer.is_global_zero and not self.trainer.sanity_checking:
wandb.log(log_dict) # Log all

current_epoch = self.trainer.current_epoch

for key, figure in log_dict.items():
# For other loggers than wandb, add epoch to key
if not isinstance(self.logger, pl.loggers.WandbLogger):
key = f"{key}-{current_epoch}"

if hasattr(self.logger, "log_image"):
self.logger.log_image(key=key, images=[figure])

plt.close("all") # Close all figs

def on_test_epoch_end(self):
Expand Down Expand Up @@ -672,9 +695,13 @@ def on_test_epoch_end(self):
)
]

# log all to same wandb key, sequentially
for fig in loss_map_figs:
wandb.log({"test_loss": wandb.Image(fig)})
# log all to same key, sequentially
for i, fig in enumerate(loss_map_figs):
key = "test_loss"
if not isinstance(self.logger, pl.loggers.WandbLogger):
key = f"{key}_{i}"
if hasattr(self.logger, "log_image"):
self.logger.log_image(key=key, images=[fig])

# also make without title and save as pdf
pdf_loss_map_figs = [
Expand All @@ -683,14 +710,16 @@ def on_test_epoch_end(self):
)
for loss_map in mean_spatial_loss
]
pdf_loss_maps_dir = os.path.join(wandb.run.dir, "spatial_loss_maps")
pdf_loss_maps_dir = os.path.join(
self.logger.save_dir, "spatial_loss_maps"
)
os.makedirs(pdf_loss_maps_dir, exist_ok=True)
for t_i, fig in zip(self.args.val_steps_to_log, pdf_loss_map_figs):
fig.savefig(os.path.join(pdf_loss_maps_dir, f"loss_t{t_i}.pdf"))
# save mean spatial loss as .pt file also
torch.save(
mean_spatial_loss.cpu(),
os.path.join(wandb.run.dir, "mean_spatial_loss.pt"),
os.path.join(self.logger.save_dir, "mean_spatial_loss.pt"),
)

self.spatial_loss_maps.clear()
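The pattern running through the ar_model.py changes above is that direct wandb calls are replaced by duck-typed calls on self.logger: figures are logged via log_image when the logger provides it, and files are written under self.logger.save_dir. A minimal sketch of the logger surface this assumes follows; the class below is illustrative, not code from this PR — the real implementations are pl.loggers.WandbLogger and the CustomMLFlowLogger introduced further down.

```python
# Illustrative sketch of the minimal logger interface ar_model.py now relies on:
# a `save_dir` to write files into and a `log_image(key, images, step=None)` hook.
import os
from typing import List, Optional

import matplotlib.pyplot as plt


class MinimalImageLogger:
    @property
    def save_dir(self) -> str:
        # Directory where PDFs, CSVs and .pt artifacts are written
        return "outputs"

    def log_image(
        self, key: str, images: List[plt.Figure], step: Optional[int] = None
    ) -> None:
        # Save the first figure under the given key; a real logger would
        # upload it to the tracking backend instead.
        os.makedirs(self.save_dir, exist_ok=True)
        suffix = f"_{step}" if step is not None else ""
        images[0].savefig(os.path.join(self.save_dir, f"{key}{suffix}.png"))
```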
145 changes: 135 additions & 10 deletions neural_lam/train_model.py
Expand Up @@ -5,10 +5,15 @@
from argparse import ArgumentParser

# Third-party
import mlflow

# for logging the model:
import mlflow.pytorch
import pytorch_lightning as pl
import torch
from lightning_fabric.utilities import seed
from loguru import logger
from mlflow.models import infer_signature

# Local
from . import utils
Expand All @@ -23,6 +28,110 @@
}


class CustomMLFlowLogger(pl.loggers.MLFlowLogger):
"""
Custom MLFlow logger that adds functionality not present in the default pl.loggers.MLFlowLogger
"""

def __init__(self, experiment_name, tracking_uri):
super().__init__(
experiment_name=experiment_name, tracking_uri=tracking_uri
)
mlflow.start_run(run_id=self.run_id, log_system_metrics=True)
mlflow.log_param("run_id", self.run_id)

@property
def save_dir(self):
"""
Returns the directory where the MLFlow artifacts are saved
"""
return "mlruns"

def log_image(self, key, images, step=None):
"""
Log a matplotlib figure as an image to MLFlow

key: str
Key to log the image under
images: list
List of matplotlib figures to log
step: Union[int, None]
Step to log the image under. If None, logs under the key directly
"""
# Third-party
from PIL import Image

if step is not None:
key = f"{key}_{step}"

# Need to save the image to a temporary file, then log that file
# mlflow.log_image should do this automatically, but is buggy
temporary_image = f"{key}.png"
images[0].savefig(temporary_image)

img = Image.open(temporary_image)
mlflow.log_image(img, f"{key}.png")

def log_model(self, data_module, model):
input_example = self.create_input_example(data_module)

with torch.no_grad():
model_output = model.common_step(input_example)[
0
] # common_step returns tuple (prediction, target, pred_std, _)

log_model_input_example = {
name: tensor.cpu().numpy()
for name, tensor in zip(
["init_states", "target_states", "forcing", "target_times"],
input_example,
)
}

signature = infer_signature(
log_model_input_example, model_output.cpu().numpy()
)

mlflow.pytorch.log_model(
model,
"model",
signature=signature,
)

def create_input_example(self, data_module):

if data_module.val_dataset is None:
data_module.setup(stage="fit")

data_loader = data_module.train_dataloader()
batch_sample = next(iter(data_loader))
return batch_sample
Comment on lines +81 to +114 (review comment from the PR author):
log_model, and thereby also create_input_example, is not used, so they can be removed. However, it can be used to log a model if one wishes to do so, albeit with an input example that is not validated.
I vote for removing this and revisiting it in another PR if needed.
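If the method were kept, a minimal usage sketch (using the training_logger, data_module and model objects that main() constructs further down; illustrative only, not part of this PR's training flow):

```python
# Illustrative sketch only: log_model/create_input_example are unused in this PR.
# If kept, the trained model could be logged once fitting has finished, e.g.:
if isinstance(training_logger, CustomMLFlowLogger):
    # Uses a training batch as the (unvalidated) input example, as noted above.
    training_logger.log_model(data_module, model)
```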



def _setup_training_logger(config, datastore, args, run_name):
if config.training.logger == "wandb":
logger = pl.loggers.WandbLogger(
project=args.wandb_project,
name=run_name,
config=dict(training=vars(args), datastore=datastore._config),
)
elif config.training.logger == "mlflow":
url = config.training.logger_url
if not url:
raise ValueError(
"MLFlow logger requires a URL to the MLFlow server"
)
logger = CustomMLFlowLogger(
experiment_name=args.wandb_project,
tracking_uri=url,
)
logger.log_hyperparams(
dict(training=vars(args), datastore=datastore._config)
)

return logger


@logger.catch
def main(input_args=None):
"""Main function for training and evaluating models."""
Expand Down Expand Up @@ -163,6 +272,12 @@ def main(input_args=None):
help="Number of example predictions to plot during evaluation "
"(default: 1)",
)
parser.add_argument(
"--save_predictions",
action="store_true",
help="If predictions should be saved to disk as a zarr dataset "
"(default: false)",
)

# Logger Settings
parser.add_argument(
Expand Down Expand Up @@ -261,24 +376,30 @@ def main(input_args=None):
f"{prefix}{args.model}-{args.processor_layers}x{args.hidden_dim}-"
f"{time.strftime('%m_%d_%H')}-{random_run_id:04d}"
)

training_logger = _setup_training_logger(
config=config, datastore=datastore, args=args, run_name=run_name
)

checkpoint_callback = pl.callbacks.ModelCheckpoint(
dirpath=f"saved_models/{run_name}",
filename="min_val_loss",
monitor="val_mean_loss",
mode="min",
save_last=True,
)
logger = pl.loggers.WandbLogger(
project=args.wandb_project,
name=run_name,
config=dict(training=vars(args), datastore=datastore._config),
)
trainer = pl.Trainer(
max_epochs=args.epochs,
deterministic=True,
strategy="ddp",
devices=4,
# devices=[1,2],
# devices=[0, 1, 2],
# strategy="auto",
# devices=1, # For eval mode
# num_nodes=1, # For eval mode
accelerator=device_name,
logger=logger,
logger=training_logger,
log_every_n_steps=1,
callbacks=[checkpoint_callback],
check_val_every_n_epoch=args.val_interval,
Expand All @@ -287,11 +408,15 @@ def main(input_args=None):

# Only init once, on rank 0 only
if trainer.global_rank == 0:
utils.init_wandb_metrics(
logger, val_steps=args.val_steps_to_log
) # Do after wandb.init
utils.init_training_logger_metrics(
training_logger, val_steps=args.val_steps_to_log
) # Do after initializing logger
if args.eval:
trainer.test(model=model, datamodule=data_module, ckpt_path=args.load)
trainer.test(
model=model,
datamodule=data_module,
ckpt_path=args.load,
)
else:
trainer.fit(model=model, datamodule=data_module, ckpt_path=args.load)
