diff --git a/configs/is2re/example.yml b/configs/is2re/example.yml deleted file mode 100644 index 549bbe8c6..000000000 --- a/configs/is2re/example.yml +++ /dev/null @@ -1,131 +0,0 @@ -# Example config for training models for IS2RE. - -trainer: energy # 'energy' or 'forces' - -task: - # The code currently supports 'lmdb' and 'oc22_lmdb' for both IS2RE and S2EF. - # - # To train models on adsorption energy (as in OC20), use `lmdb`. - # To train models on total DFT energy, use `oc22_lmdb`. - # - # Can use 'single_point_lmdb' or 'trajectory_lmdb' for backward compatibility. - # 'single_point_lmdb' was for training IS2RE models, and 'trajectory_lmdb' was - # for training S2EF models. - # To train an oc20 model on total energy use 'oc22_lmdb' - dataset: lmdb # 'lmdb' or 'oc22_lmdb' - # This is an optional parameter specifying the val metric to watch for - # improvement to decide when to save checkpoints. - # By default, this is: - # 'energy_force_within_threshold' for S2EF, - # 'energy_mae' for IS2RE, - # 'average_distance_within_threshold' for IS2RS. - primary_metric: energy_mae - # This is an argument used for checkpoint loading. By default it is True and loads - # checkpoint as it is. If False, it could partially load the checkpoint without giving - # any errors - strict_load: True # True or False - -dataset: - train: - # Path to training set LMDB - src: data/is2re/all/train/data.lmdb - # If we want to normalize each target value, i.e. subtract the mean and - # divide by standard deviation, then those 'target_mean' and 'target_std' - # statistics need to be specified here for the train split. - normalize_labels: True # True or False - # These stats are for OC20 IS2RE. - target_mean: -1.525913953781128 - target_std: 2.279365062713623 - # If we want to train OC20 on total energy, a path to OC20 reference - # energies `oc20_ref` must be specified to unreference existing OC20 data. - # download at https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/oc20_ref.pkl - # Also, train_on_oc20_total_energies must be set to True - # OC22 defaults to total energy, so these flags are not necessary. - train_on_oc20_total_energies: False # True or False - oc20_ref: None # path to oc20_ref - # If we want to train on total energies and use a linear reference - # normalization scheme, we must specify the path to the per-element - # coefficients in a `.npz` format. - lin_ref: False - val: - # Path to val set LMDB - src: data/is2re/all/val_id/data.lmdb - # If we want to run validation with OC20 total energy val set, `oc20_ref` must be specified and - # train_on_oc20_total_energies set to True - # OC22 defaults to total energy, so these flags are not necessary. - train_on_oc20_total_energies: False # True or False - oc20_ref: None # path to oc20_ref - test: - # Path to test set LMDB - src: data/is2re/all/test_id/data.lmdb - -logger: tensorboard # 'wandb' or 'tensorboard' - -model: - name: gemnet_t - # Model attributes go here, e.g. no. of layers, no. of hidden channels, - # embedding functions, cutoff radius, no. of neighbors, etc. - # This list of params will look different depending on the model. - # - # 'otf_graph' specifies whether graph edges should be computed on the fly - # or they already exist in the preprocessed LMDBs. If unsure, set it to True. - otf_graph: True # True or False - # All models in OCP can be used to predict just energies, or both energies and - # forces. For IS2RE, we don't need forces, so 'regress_forces' is False. - regress_forces: False # True or False - -optim: - # Batch size per GPU for training. - # Note that effective batch size will be 'batch_size' x no. of GPUs. - batch_size: 8 - # Batch size per GPU for evaluation. - # Note that effective batch size will be 'eval_batch_size' x no. of GPUs. - eval_batch_size: 8 - # No. of subprocesses to use for dataloading, pass as an arg to - # https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader. - num_workers: 2 - # After how many updates to run evaluation on val during training. - # If unspecified, defaults to 1 epoch. - eval_every: 5000 - # Loss function to use for energies. Defaults to 'mae'. - loss_energy: mae # 'mae' or 'mse' - # Optimizer to use from torch.optim. - # Default is https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html. - optimizer: AdamW - # Learning rate. Passed as an `lr` argument when initializing the optimizer. - lr_initial: 1.e-4 - # Additional args needed to initialize the optimizer. - optimizer_params: - amsgrad: True - # Weight decay to use. Passed as an argument when initializing the optimizer. - weight_decay: 0 - # Learning rate scheduler. Should work for any scheduler specified in - # in torch.optim.lr_scheduler: https://pytorch.org/docs/stable/optim.html - # as long as the relevant args are specified here. - # - # For example, for ReduceLROnPlateau, we specify `mode`, `factor`, `patience`. - # https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html - # - # Note that if task.primary_metric specified earlier in the config is a metric - # where higher is better (e.g. 'energy_force_within_threshold' or - # 'average_distance_within_threshold'), `mode` should be 'max' since we'd want - # to step LR when the metric has stopped increasing. Vice versa for energy_mae - # or forces_mae or loss. - # - # If you don't want to use a scheduler, set it to 'Null' (yes type that out). - # This is for legacy reasons. If scheduler is unspecified, it defaults to - # 'LambdaLR': warming up the learning rate to 'lr_initial' and then stepping - # it at pre-defined set of steps. See the DimeNet++ config for how to do this. - scheduler: ReduceLROnPlateau - mode: min - factor: 0.8 - patience: 3 - # No. of epochs to train for. - max_epochs: 100 - # Exponential moving average of parameters. 'ema_decay' is the decay factor. - ema_decay: 0.999 - # Max norm of gradients for clipping. Uses torch.nn.utils.clip_grad_norm_. - clip_grad_norm: 10 - -slurm: - constraint: "rtx_6000" diff --git a/configs/s2ef/example.yml b/configs/s2ef/example.yml deleted file mode 100644 index b792f2dfc..000000000 --- a/configs/s2ef/example.yml +++ /dev/null @@ -1,197 +0,0 @@ -# Example config for training models for S2EF. - -trainer: forces # 'energy' or 'forces' - -task: - # The code currently supports 'lmdb' and 'oc22_lmdb' for both IS2RE and S2EF. - # - # To train models on adsorption energy (as in OC20), use `lmdb`. - # To train models on total DFT energy, use `oc22_lmdb`. - # - # Can use 'single_point_lmdb' or 'trajectory_lmdb' for backward compatibility. - # 'single_point_lmdb' was for training IS2RE models, and 'trajectory_lmdb' was - # for training S2EF models. - # To train an oc20 model on total energy use 'oc22_lmdb' - dataset: lmdb # 'lmdb' or 'oc22_lmdb' - # This is an optional parameter specifying the val metric to watch for - # improvement to decide when to save checkpoints. - # By default, this is: - # 'energy_force_within_threshold' for S2EF, - # 'energy_mae' for IS2RE, - # 'average_distance_within_threshold' for IS2RS. - primary_metric: forces_mae - # OC20 systems had slab atoms fixed when running DFT calculations. Surface and - # adsorbate atoms were free to move. This info is available for each structure - # in the released LMDBs. - # These args specify whether to train/eval forces on only free atoms or all. - train_on_free_atoms: True # True or False - eval_on_free_atoms: True # True or False - # By default OC20 s2ef predictions are written in float16 to reduce file size - # By default OC22 s2ef predictions are written in float32 - # If training on total energy use float32 - prediction_dtype: float16 # 'float16' or 'float32' - # This is an argument used for checkpoint loading. By default it is True and loads - # checkpoint as it is. If False, it could partially load the checkpoint without giving - # any errors - strict_load: True # True or False - # The following args in the 'task' tree are for running relaxations with an - # S2EF model during training (as additional validation) or testing. - # Totally optional if you're only looking to train an S2EF model. - # - # Whether to evaluate val relaxations when training S2EF models on the - # energy_mae and average_distance_within_threshold metrics. - eval_relaxations: False # True or False - # No. of batches to run relaxations on. Defaults to the full 'relax_dataset'. - num_relaxation_batches: 5 - # Max no. of steps to run relaxations for. - relaxation_steps: 300 - # Whether to save out the positions. - write_pos: True # True or False - # Path to initial structures to run relaxations on. Same as the IS2RE set. - relax_dataset: - src: data/is2re/all/test_id/data.lmdb - # To shard a dataset into smaller subsets, define the total_shards desired - # and the shard a particular process to see. - total_shards: 1 # int (optional) - shard: 0 # int (optional) - relax_opt: - name: lbfgs - maxstep: 0.04 - memory: 50 - damping: 1.0 - alpha: 70.0 - # Directory to save out trajectories (.traj files) in. - traj_dir: path/to/traj/directory - # Whether to save out the full trajectory or just the initial+final frames - save_full_traj: True # True or False - # When set to true, uses "deterministic" CUDA scatter ops if available, - # i.e. given the same input, leads to the same results. Default is false - # since this can be significantly slower. - set_deterministic_scatter: False # True or False - -dataset: - train: - # Directory containing training set LMDBs - src: data/s2ef/all/train/ - # If we want to normalize each target value, i.e. subtract the mean and - # divide by standard deviation, then those 'target_mean' and 'target_std' - # statistics for energies and 'grad_target_mean' and 'grad_target_std' - # statistics for forces need to be specified here for the train split. - normalize_labels: True - # These stats are for OC20 S2EF. - target_mean: -0.7554450631141663 - target_std: 2.887317180633545 - grad_target_mean: 0.0 - grad_target_std: 2.887317180633545 - - # If we want to train OC20 on total energy, a path to OC20 reference - # energies `oc20_ref` must be specified to unreference existing OC20 data. - # download at https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/oc20_ref.pkl - # Also, train_on_oc20_total_energies must be set to True - # OC22 defaults to total energy, so these flags are not necessary. - train_on_oc20_total_energies: False # True or False - oc20_ref: None # path to oc20_ref - # If we want to train on total energies and use a linear reference - # normalization scheme, we must specify the path to the per-element - # coefficients in a `.npz` format. - lin_ref: False # True or False - val: - # Directory containing val set LMDBs - src: data/s2ef/all/val_id/ - # If we want to run validation with OC20 total energy val set, `oc20_ref` must be specified and - # train_on_oc20_total_energies set to True - # OC22 defaults to total energy, so these flags are not necessary. - train_on_oc20_total_energies: False # True or False - oc20_ref: None # path to oc20_ref - test: - # Directory containing test set LMDBs - src: data/s2ef/all/test_id/ - -logger: tensorboard # 'wandb' or 'tensorboard' - -model: - name: gemnet_t - # Model attributes go here, e.g. no. of layers, no. of hidden channels, - # embedding functions, cutoff radius, no. of neighbors, etc. - # This list of params will look different depending on the model. - # - # 'otf_graph' specifies whether graph edges should be computed on the fly - # or they already exist in the preprocessed LMDBs. If unsure, set it to True. - otf_graph: True # True or False - # All models in OCP can be used to predict just energies, or both energies and - # forces. For S2EF, we need both, so 'regress_forces' is True. - regress_forces: True # True or False - # Whether forces are predicted directly via an independent network (when set - # to True), or as negative gradients of energy wrt positions (when False) - direct_forces: True - -optim: - # Batch size per GPU for training. - # Note that effective batch size will be 'batch_size' x no. of GPUs. - batch_size: 8 - # Batch size per GPU for evaluation. - # Note that effective batch size will be 'eval_batch_size' x no. of GPUs. - eval_batch_size: 8 - # Whether to load balance across GPUs based on no. of 'atoms' or 'neighbors'. - load_balancing: atoms # 'atoms' or 'neighbors' - # No. of subprocesses to use for dataloading, pass as an arg to - # https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader. - num_workers: 2 - # After how many updates to run evaluation on val during training. - # If unspecified, defaults to 1 epoch. - eval_every: 5000 - # Loss function to use for energies. Defaults to 'mae'. - loss_energy: mae # 'mae' or 'mse' - # Loss function to use for forces. Defaults to 'mae'. - # - # 'l2mae' has been working well for us with a force to energy coefficient - # ratio of 100:1. - # - # When training on raw DFT energies, 'atomwisel2' might be a better default - # with a force to energy coefficient ratio of 1:1. 'atomwisel2' scales L2 loss - # for forces by the no. of atoms in the structure. - loss_force: l2mae # 'mae' or 'mse' or 'l2mae' or 'atomwisel2' - # Coefficient to use for the energy loss. - energy_coefficient: 1 - # Coefficient to use for the force loss. - force_coefficient: 100 - # Optimizer to use from torch.optim. - # Default is https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html. - optimizer: AdamW - # Learning rate. Passed as an `lr` argument when initializing the optimizer. - lr_initial: 1.e-4 - # Additional args needed to initialize the optimizer. - optimizer_params: - amsgrad: True - # Weight decay to use. Passed as an argument when initializing the optimizer. - weight_decay: 0 - # Learning rate scheduler. Should work for any scheduler specified in - # in torch.optim.lr_scheduler: https://pytorch.org/docs/stable/optim.html - # as long as the relevant args are specified here. - # - # For example, for ReduceLROnPlateau, we specify `mode`, `factor`, `patience`. - # https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html - # - # Note that if task.primary_metric specified earlier in the config is a metric - # where higher is better (e.g. 'energy_force_within_threshold' or - # 'average_distance_within_threshold'), `mode` should be 'max' since we'd want - # to step LR when the metric has stopped increasing. Vice versa for energy_mae - # or forces_mae or loss. - # - # If you don't want to use a scheduler, set it to 'Null' (yes type that out). - # This is for legacy reasons. If scheduler is unspecified, it defaults to - # 'LambdaLR': warming up the learning rate to 'lr_initial' and then stepping - # it at pre-defined set of steps. See the DimeNet++ config for how to do this. - scheduler: ReduceLROnPlateau - mode: min - factor: 0.8 - patience: 3 - # No. of epochs to train for. - max_epochs: 100 - # Exponential moving average of parameters. 'ema_decay' is the decay factor. - ema_decay: 0.999 - # Max norm of gradients for clipping. Uses torch.nn.utils.clip_grad_norm_. - clip_grad_norm: 10 - -slurm: - constraint: "rtx_6000" diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 6f5d6c972..c2a72ada0 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -437,7 +437,6 @@ def load_model(self) -> None: if self.logger is not None: self.logger.watch(self.model) - self.model.to(self.device) if distutils.initialized() and not self.config["noddp"]: self.model = DistributedDataParallel( self.model, device_ids=[self.device] diff --git a/ocpmodels/trainers/ocp_trainer.py b/ocpmodels/trainers/ocp_trainer.py index 812f68fa9..26c92bf0a 100644 --- a/ocpmodels/trainers/ocp_trainer.py +++ b/ocpmodels/trainers/ocp_trainer.py @@ -41,8 +41,11 @@ class OCPTrainer(BaseTrainer): Args: task (dict): Task configuration. model (dict): Model configuration. + outputs (dict): Output property configuration. dataset (dict): Dataset configuration. The dataset needs to be a SinglePointLMDB dataset. optimizer (dict): Optimizer configuration. + loss_fns (dict): Loss function configuration. + eval_metrics (dict): Evaluation metrics configuration. identifier (str): Experiment identifier that is appended to log directory. run_dir (str, optional): Path to the run directory where logs are to be saved. (default: :obj:`None`) @@ -60,6 +63,7 @@ class OCPTrainer(BaseTrainer): (default: :obj:`False`) slurm (dict): Slurm configuration. Currently just for keeping track. (default: :obj:`{}`) + noddp (bool, optional): Run model without DDP. """ def __init__(