From f83c7724fbb9aff1c8d16365ca9a468e3425e7c5 Mon Sep 17 00:00:00 2001
From: Muhammed Shuaibi <45150244+mshuaibii@users.noreply.github.com>
Date: Tue, 9 Jan 2024 09:52:51 -0800
Subject: [PATCH] minor fixes (#615)

---
 configs/is2re/example.yml          | 131 -------------------
 configs/s2ef/example.yml           | 197 -----------------------------
 ocpmodels/trainers/base_trainer.py |   1 -
 ocpmodels/trainers/ocp_trainer.py  |   4 +
 4 files changed, 4 insertions(+), 329 deletions(-)
 delete mode 100644 configs/is2re/example.yml
 delete mode 100644 configs/s2ef/example.yml

diff --git a/configs/is2re/example.yml b/configs/is2re/example.yml
deleted file mode 100644
index 549bbe8c6..000000000
--- a/configs/is2re/example.yml
+++ /dev/null
@@ -1,131 +0,0 @@
-# Example config for training models for IS2RE.
-
-trainer: energy # 'energy' or 'forces'
-
-task:
-  # The code currently supports 'lmdb' and 'oc22_lmdb' for both IS2RE and S2EF.
-  #
-  # To train models on adsorption energy (as in OC20), use `lmdb`.
-  # To train models on total DFT energy, use `oc22_lmdb`.
-  #
-  # 'single_point_lmdb' and 'trajectory_lmdb' can be used for backward
-  # compatibility: 'single_point_lmdb' was for training IS2RE models, and
-  # 'trajectory_lmdb' was for training S2EF models.
-  # To train an OC20 model on total energy, use 'oc22_lmdb'.
-  dataset: lmdb # 'lmdb' or 'oc22_lmdb'
-  # This is an optional parameter specifying the val metric to watch for
-  # improvement to decide when to save checkpoints.
-  # By default, this is:
-  # 'energy_force_within_threshold' for S2EF,
-  # 'energy_mae' for IS2RE,
-  # 'average_distance_within_threshold' for IS2RS.
-  primary_metric: energy_mae
-  # This argument controls checkpoint loading. By default it is True and the
-  # checkpoint is loaded as-is. If False, the checkpoint may be loaded
-  # partially without raising errors.
-  strict_load: True # True or False
-
-dataset:
-  train:
-    # Path to training set LMDB
-    src: data/is2re/all/train/data.lmdb
-    # If we want to normalize each target value, i.e. subtract the mean and
-    # divide by the standard deviation, then the 'target_mean' and
-    # 'target_std' statistics need to be specified here for the train split.
-    normalize_labels: True # True or False
-    # These stats are for OC20 IS2RE.
-    target_mean: -1.525913953781128
-    target_std: 2.279365062713623
-    # If we want to train OC20 on total energy, a path to OC20 reference
-    # energies `oc20_ref` must be specified to unreference existing OC20 data,
-    # and train_on_oc20_total_energies must be set to True. Download it at
-    # https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/oc20_ref.pkl
-    # OC22 defaults to total energy, so these flags are not necessary there.
-    train_on_oc20_total_energies: False # True or False
-    oc20_ref: None # path to oc20_ref
-    # If we want to train on total energies and use a linear reference
-    # normalization scheme, we must specify the path to the per-element
-    # coefficients in a `.npz` format.
-    lin_ref: False
-  val:
-    # Path to val set LMDB
-    src: data/is2re/all/val_id/data.lmdb
-    # If we want to run validation with the OC20 total energy val set,
-    # `oc20_ref` must be specified and train_on_oc20_total_energies set to
-    # True. OC22 defaults to total energy, so these flags are not necessary.
-    train_on_oc20_total_energies: False # True or False
-    oc20_ref: None # path to oc20_ref
-  test:
-    # Path to test set LMDB
-    src: data/is2re/all/test_id/data.lmdb
-
-logger: tensorboard # 'wandb' or 'tensorboard'
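As an aside on 'normalize_labels': the 'target_mean' and 'target_std' values above are simply the mean and standard deviation of the training energies. A minimal sketch of how such statistics could be computed, assuming a dataset whose samples expose the relaxed energy as a `y_relaxed` attribute (the attribute and helper names are illustrative, not the exact ocpmodels API):

import numpy as np

def compute_target_stats(dataset):
    # Gather all training targets and take their mean / standard deviation.
    energies = np.array([sample.y_relaxed for sample in dataset])
    return float(energies.mean()), float(energies.std())

# target_mean, target_std = compute_target_stats(train_dataset)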
-
-model:
-  name: gemnet_t
-  # Model attributes go here, e.g. no. of layers, no. of hidden channels,
-  # embedding functions, cutoff radius, no. of neighbors, etc.
-  # This list of params will look different depending on the model.
-  #
-  # 'otf_graph' specifies whether graph edges should be computed on the fly
-  # or whether they already exist in the preprocessed LMDBs. If unsure, set
-  # it to True.
-  otf_graph: True # True or False
-  # All models in OCP can be used to predict just energies, or both energies
-  # and forces. For IS2RE, we don't need forces, so 'regress_forces' is False.
-  regress_forces: False # True or False
-
-optim:
-  # Batch size per GPU for training.
-  # Note that the effective batch size will be 'batch_size' x no. of GPUs.
-  batch_size: 8
-  # Batch size per GPU for evaluation.
-  # Note that the effective batch size will be 'eval_batch_size' x no. of GPUs.
-  eval_batch_size: 8
-  # No. of subprocesses to use for dataloading, passed as an arg to
-  # https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader.
-  num_workers: 2
-  # After how many updates to run evaluation on val during training.
-  # If unspecified, defaults to 1 epoch.
-  eval_every: 5000
-  # Loss function to use for energies. Defaults to 'mae'.
-  loss_energy: mae # 'mae' or 'mse'
-  # Optimizer to use from torch.optim.
-  # Default is https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html.
-  optimizer: AdamW
-  # Learning rate. Passed as an `lr` argument when initializing the optimizer.
-  lr_initial: 1.e-4
-  # Additional args needed to initialize the optimizer.
-  optimizer_params:
-    amsgrad: True
-  # Weight decay to use. Passed as an argument when initializing the optimizer.
-  weight_decay: 0
-  # Learning rate scheduler. Should work for any scheduler specified in
-  # torch.optim.lr_scheduler: https://pytorch.org/docs/stable/optim.html
-  # as long as the relevant args are specified here.
-  #
-  # For example, for ReduceLROnPlateau, we specify `mode`, `factor`, `patience`.
-  # https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html
-  #
-  # Note that if task.primary_metric specified earlier in the config is a metric
-  # where higher is better (e.g. 'energy_force_within_threshold' or
-  # 'average_distance_within_threshold'), `mode` should be 'max' since we'd
-  # want to step the LR when the metric has stopped increasing. Vice versa
-  # for energy_mae, forces_mae, or loss.
-  #
-  # If you don't want to use a scheduler, set it to 'Null' (yes, type that
-  # out). This is for legacy reasons. If the scheduler is unspecified, it
-  # defaults to 'LambdaLR': warming up the learning rate to 'lr_initial' and
-  # then stepping it at a pre-defined set of steps. See the DimeNet++ config
-  # for how to do this.
-  scheduler: ReduceLROnPlateau
-  mode: min
-  factor: 0.8
-  patience: 3
-  # No. of epochs to train for.
-  max_epochs: 100
-  # Exponential moving average of parameters. 'ema_decay' is the decay factor.
-  ema_decay: 0.999
-  # Max norm of gradients for clipping. Uses torch.nn.utils.clip_grad_norm_.
-  clip_grad_norm: 10
-
-slurm:
-  constraint: "rtx_6000"
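Before the S2EF config, a note on the legacy default scheduler described in the optim comments above. A minimal PyTorch sketch of that 'LambdaLR' behavior (warm up to 'lr_initial', then step at a pre-defined set of steps); the warmup length, decay factor, and milestone values are illustrative, not taken from any shipped config:

import bisect
import torch

model = torch.nn.Linear(8, 1)  # stand-in model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, amsgrad=True)

warmup_steps, warmup_factor = 1000, 0.2       # illustrative values
lr_gamma, lr_milestones = 0.1, [50000, 80000]  # illustrative values

def lr_lambda(step: int) -> float:
    if step < warmup_steps:
        # Ramp linearly from warmup_factor * lr_initial up to lr_initial.
        alpha = step / warmup_steps
        return warmup_factor * (1.0 - alpha) + alpha
    # After warmup, multiply the LR by lr_gamma at each milestone passed.
    return lr_gamma ** bisect.bisect(lr_milestones, step)

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)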
diff --git a/configs/s2ef/example.yml b/configs/s2ef/example.yml
deleted file mode 100644
index b792f2dfc..000000000
--- a/configs/s2ef/example.yml
+++ /dev/null
@@ -1,197 +0,0 @@
-# Example config for training models for S2EF.
-
-trainer: forces # 'energy' or 'forces'
-
-task:
-  # The code currently supports 'lmdb' and 'oc22_lmdb' for both IS2RE and S2EF.
-  #
-  # To train models on adsorption energy (as in OC20), use `lmdb`.
-  # To train models on total DFT energy, use `oc22_lmdb`.
-  #
-  # 'single_point_lmdb' and 'trajectory_lmdb' can be used for backward
-  # compatibility: 'single_point_lmdb' was for training IS2RE models, and
-  # 'trajectory_lmdb' was for training S2EF models.
-  # To train an OC20 model on total energy, use 'oc22_lmdb'.
-  dataset: lmdb # 'lmdb' or 'oc22_lmdb'
-  # This is an optional parameter specifying the val metric to watch for
-  # improvement to decide when to save checkpoints.
-  # By default, this is:
-  # 'energy_force_within_threshold' for S2EF,
-  # 'energy_mae' for IS2RE,
-  # 'average_distance_within_threshold' for IS2RS.
-  primary_metric: forces_mae
-  # OC20 systems had slab atoms fixed when running DFT calculations; surface
-  # and adsorbate atoms were free to move. This info is available for each
-  # structure in the released LMDBs.
-  # These args specify whether to train/eval forces on only the free atoms
-  # or on all atoms.
-  train_on_free_atoms: True # True or False
-  eval_on_free_atoms: True # True or False
-  # By default, OC20 S2EF predictions are written in float16 to reduce file
-  # size, and OC22 S2EF predictions are written in float32.
-  # If training on total energy, use float32.
-  prediction_dtype: float16 # 'float16' or 'float32'
-  # This argument controls checkpoint loading. By default it is True and the
-  # checkpoint is loaded as-is. If False, the checkpoint may be loaded
-  # partially without raising errors.
-  strict_load: True # True or False
-  # The following args in the 'task' tree are for running relaxations with an
-  # S2EF model during training (as additional validation) or testing.
-  # Totally optional if you're only looking to train an S2EF model.
-  #
-  # Whether to evaluate val relaxations when training S2EF models on the
-  # energy_mae and average_distance_within_threshold metrics.
-  eval_relaxations: False # True or False
-  # No. of batches to run relaxations on. Defaults to the full 'relax_dataset'.
-  num_relaxation_batches: 5
-  # Max no. of steps to run relaxations for.
-  relaxation_steps: 300
-  # Whether to save out the positions.
-  write_pos: True # True or False
-  # Path to initial structures to run relaxations on. Same as the IS2RE set.
-  relax_dataset:
-    src: data/is2re/all/test_id/data.lmdb
-    # To shard a dataset into smaller subsets, define the desired
-    # 'total_shards' and the 'shard' a particular process should see.
-    total_shards: 1 # int (optional)
-    shard: 0 # int (optional)
-  relax_opt:
-    name: lbfgs
-    maxstep: 0.04
-    memory: 50
-    damping: 1.0
-    alpha: 70.0
-    # Directory to save out trajectories (.traj files) in.
-    traj_dir: path/to/traj/directory
-  # Whether to save out the full trajectory or just the initial+final frames.
-  save_full_traj: True # True or False
-  # When set to True, uses "deterministic" CUDA scatter ops, if available,
-  # i.e. given the same input, they lead to the same results. Default is
-  # False since this can be significantly slower.
-  set_deterministic_scatter: False # True or False
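To make 'train_on_free_atoms' and 'eval_on_free_atoms' concrete: OC20 LMDB entries carry a per-atom mask of the atoms held fixed during DFT, and force training/evaluation is restricted to the unconstrained atoms. A minimal sketch of that masking, assuming the mask is available as a 0/1 `fixed` tensor; the helper itself is illustrative, not the trainer's actual code path:

import torch

def free_atom_force_mae(pred_forces, true_forces, fixed):
    # 'fixed' is 1 for atoms constrained during DFT, 0 for free atoms;
    # compute the force error only over the free atoms.
    free_mask = fixed == 0
    return torch.nn.functional.l1_loss(
        pred_forces[free_mask], true_forces[free_mask]
    )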
-
-dataset:
-  train:
-    # Directory containing training set LMDBs
-    src: data/s2ef/all/train/
-    # If we want to normalize each target value, i.e. subtract the mean and
-    # divide by the standard deviation, then the 'target_mean' and
-    # 'target_std' statistics for energies, and the 'grad_target_mean' and
-    # 'grad_target_std' statistics for forces, need to be specified here for
-    # the train split.
-    normalize_labels: True
-    # These stats are for OC20 S2EF.
-    target_mean: -0.7554450631141663
-    target_std: 2.887317180633545
-    grad_target_mean: 0.0
-    grad_target_std: 2.887317180633545
-
-    # If we want to train OC20 on total energy, a path to OC20 reference
-    # energies `oc20_ref` must be specified to unreference existing OC20 data,
-    # and train_on_oc20_total_energies must be set to True. Download it at
-    # https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/oc20_ref.pkl
-    # OC22 defaults to total energy, so these flags are not necessary there.
-    train_on_oc20_total_energies: False # True or False
-    oc20_ref: None # path to oc20_ref
-    # If we want to train on total energies and use a linear reference
-    # normalization scheme, we must specify the path to the per-element
-    # coefficients in a `.npz` format.
-    lin_ref: False # True or False
-  val:
-    # Directory containing val set LMDBs
-    src: data/s2ef/all/val_id/
-    # If we want to run validation with the OC20 total energy val set,
-    # `oc20_ref` must be specified and train_on_oc20_total_energies set to
-    # True. OC22 defaults to total energy, so these flags are not necessary.
-    train_on_oc20_total_energies: False # True or False
-    oc20_ref: None # path to oc20_ref
-  test:
-    # Directory containing test set LMDBs
-    src: data/s2ef/all/test_id/
-
-logger: tensorboard # 'wandb' or 'tensorboard'
-
-model:
-  name: gemnet_t
-  # Model attributes go here, e.g. no. of layers, no. of hidden channels,
-  # embedding functions, cutoff radius, no. of neighbors, etc.
-  # This list of params will look different depending on the model.
-  #
-  # 'otf_graph' specifies whether graph edges should be computed on the fly
-  # or whether they already exist in the preprocessed LMDBs. If unsure, set
-  # it to True.
-  otf_graph: True # True or False
-  # All models in OCP can be used to predict just energies, or both energies
-  # and forces. For S2EF, we need both, so 'regress_forces' is True.
-  regress_forces: True # True or False
-  # Whether forces are predicted directly via an independent network (when
-  # set to True), or as negative gradients of the energy w.r.t. positions
-  # (when set to False).
-  direct_forces: True
-
-optim:
-  # Batch size per GPU for training.
-  # Note that the effective batch size will be 'batch_size' x no. of GPUs.
-  batch_size: 8
-  # Batch size per GPU for evaluation.
-  # Note that the effective batch size will be 'eval_batch_size' x no. of GPUs.
-  eval_batch_size: 8
-  # Whether to load balance across GPUs based on no. of 'atoms' or 'neighbors'.
-  load_balancing: atoms # 'atoms' or 'neighbors'
-  # No. of subprocesses to use for dataloading, passed as an arg to
-  # https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader.
-  num_workers: 2
-  # After how many updates to run evaluation on val during training.
-  # If unspecified, defaults to 1 epoch.
-  eval_every: 5000
-  # Loss function to use for energies. Defaults to 'mae'.
-  loss_energy: mae # 'mae' or 'mse'
-  # Loss function to use for forces. Defaults to 'mae'.
-  #
-  # 'l2mae' has been working well for us with a force-to-energy coefficient
-  # ratio of 100:1.
-  #
-  # When training on raw DFT energies, 'atomwisel2' might be a better default,
-  # with a force-to-energy coefficient ratio of 1:1. 'atomwisel2' scales the
-  # L2 force loss by the no. of atoms in the structure.
-  loss_force: l2mae # 'mae' or 'mse' or 'l2mae' or 'atomwisel2'
-  # Coefficient to use for the energy loss.
-  energy_coefficient: 1
-  # Coefficient to use for the force loss.
-  force_coefficient: 100
-  # Optimizer to use from torch.optim.
-  # Default is https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html.
-  optimizer: AdamW
-  # Learning rate. Passed as an `lr` argument when initializing the optimizer.
-  lr_initial: 1.e-4
-  # Additional args needed to initialize the optimizer.
-  optimizer_params:
-    amsgrad: True
-  # Weight decay to use. Passed as an argument when initializing the optimizer.
-  weight_decay: 0
-  # Learning rate scheduler. Should work for any scheduler specified in
-  # torch.optim.lr_scheduler: https://pytorch.org/docs/stable/optim.html
-  # as long as the relevant args are specified here.
-  #
-  # For example, for ReduceLROnPlateau, we specify `mode`, `factor`, `patience`.
-  # https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html
-  #
-  # Note that if task.primary_metric specified earlier in the config is a metric
-  # where higher is better (e.g. 'energy_force_within_threshold' or
-  # 'average_distance_within_threshold'), `mode` should be 'max' since we'd
-  # want to step the LR when the metric has stopped increasing. Vice versa
-  # for energy_mae, forces_mae, or loss.
-  #
-  # If you don't want to use a scheduler, set it to 'Null' (yes, type that
-  # out). This is for legacy reasons. If the scheduler is unspecified, it
-  # defaults to 'LambdaLR': warming up the learning rate to 'lr_initial' and
-  # then stepping it at a pre-defined set of steps. See the DimeNet++ config
-  # for how to do this.
-  scheduler: ReduceLROnPlateau
-  mode: min
-  factor: 0.8
-  patience: 3
-  # No. of epochs to train for.
-  max_epochs: 100
-  # Exponential moving average of parameters. 'ema_decay' is the decay factor.
-  ema_decay: 0.999
-  # Max norm of gradients for clipping. Uses torch.nn.utils.clip_grad_norm_.
-  clip_grad_norm: 10
-
-slurm:
-  constraint: "rtx_6000"
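For the 'atomwisel2' option mentioned under 'loss_force' above, a minimal sketch of an atom-wise L2 force loss in that spirit: each atom's L2 force error is weighted by the number of atoms in its parent structure. Here `natoms` (atoms per structure) and `batch_idx` (atom-to-structure index, as in torch_geometric batches) are assumed inputs, and the exact weighting in ocpmodels may differ:

import torch

def atomwise_l2_force_loss(pred_forces, true_forces, natoms, batch_idx):
    # Per-atom L2 norm of the force error.
    per_atom_err = torch.linalg.norm(pred_forces - true_forces, dim=-1)
    # Scale every atom's error by the size of the structure it belongs to.
    weights = natoms[batch_idx]
    return (weights * per_atom_err).mean()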
diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py
index 6f5d6c972..c2a72ada0 100644
--- a/ocpmodels/trainers/base_trainer.py
+++ b/ocpmodels/trainers/base_trainer.py
@@ -437,7 +437,6 @@ def load_model(self) -> None:
         if self.logger is not None:
             self.logger.watch(self.model)
 
-        self.model.to(self.device)
         if distutils.initialized() and not self.config["noddp"]:
             self.model = DistributedDataParallel(
                 self.model, device_ids=[self.device]
diff --git a/ocpmodels/trainers/ocp_trainer.py b/ocpmodels/trainers/ocp_trainer.py
index 812f68fa9..26c92bf0a 100644
--- a/ocpmodels/trainers/ocp_trainer.py
+++ b/ocpmodels/trainers/ocp_trainer.py
@@ -41,8 +41,11 @@ class OCPTrainer(BaseTrainer):
     Args:
         task (dict): Task configuration.
         model (dict): Model configuration.
+        outputs (dict): Output property configuration.
         dataset (dict): Dataset configuration. The dataset needs to be
             a SinglePointLMDB dataset.
         optimizer (dict): Optimizer configuration.
+        loss_fns (dict): Loss function configuration.
+        eval_metrics (dict): Evaluation metrics configuration.
         identifier (str): Experiment identifier that is appended to log directory.
         run_dir (str, optional): Path to the run directory where logs are to be saved.
             (default: :obj:`None`)
@@ -60,6 +63,7 @@ class OCPTrainer(BaseTrainer):
             (default: :obj:`False`)
         slurm (dict): Slurm configuration. Currently just for keeping track.
             (default: :obj:`{}`)
+        noddp (bool, optional): Run model without DDP.
     """
 
     def __init__(