Commit 24d28bb

anuroopsriram committed Apr 27, 2024
1 parent daf72a5
Showing 10 changed files with 268 additions and 12 deletions.
4 changes: 2 additions & 2 deletions configs/odac/s2ef/eqv2_153M.yml
@@ -45,8 +45,8 @@ model:

weight_init: 'uniform' # ['uniform', 'normal']

- norm_scale_nodes: 192.561
- norm_scale_degree: 21.024127419363214
+ avg_num_nodes: 192.561
+ avg_degree: 21.024127419363214

optim:
batch_size: 1
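The values are unchanged; the keys are renamed to EquiformerV2's `avg_num_nodes` and `avg_degree`, the dataset statistics the model uses as normalization constants (for the per-structure energy readout and the per-node message aggregation, respectively, as the parameter names suggest; that reading is inferred from the model code, not stated in this diff).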
114 changes: 114 additions & 0 deletions configs/odac/s2ef/eqv2_153M_raw.yml
@@ -0,0 +1,114 @@
dataset:
train:
src: data_odac/s2ef/train
normalize_labels: True
target_mean: 0.5514033085536203
target_std: 15.554063738073438
grad_target_mean: 0.0
grad_target_std: 0.04262716323137283
lin_ref: configs/odac/s2ef_lin_ref.npz
target: raw_y

val:
src: data_odac/s2ef/val_0.1
lin_ref: configs/odac/s2ef_lin_ref.npz
target: raw_y

logger:
name: wandb

task:
dataset: lmdb
train_on_free_atoms: True
eval_on_free_atoms: True
primary_metric: forces_mae

relaxation_steps: 125
relaxation_fmax: 0.05
write_pos: True
# relax_dataset:
# src: data_odac/is2r/val
# relax_opt:
# name: lbfgs
# maxstep: 0.04
# memory: 50
# damping: 1.0
# alpha: 70.0

trainer: equiformerv2_forces

model:
name: equiformer_v2

use_pbc: True
regress_forces: True
otf_graph: True
max_neighbors: 20
max_radius: 8.0
max_num_elements: 100

num_layers: 20
sphere_channels: 128
attn_hidden_channels: 64 # [64, 96] Determines the hidden size of message passing; 96 is not required.
num_heads: 8
attn_alpha_channels: 64 # Not used when `use_s2_act_attn` is True.
attn_value_channels: 16
ffn_hidden_channels: 128
norm_type: 'layer_norm_sh' # ['rms_norm_sh', 'layer_norm', 'layer_norm_sh']

lmax_list: [6]
mmax_list: [3]
grid_resolution: 18 # [18, 16, 14, None] To use `None`, comment out this line.

num_sphere_samples: 128

edge_channels: 128
use_atom_edge_embedding: True
distance_function: 'gaussian'
num_distance_basis: 512 # not used

attn_activation: 'silu'
use_s2_act_attn: False # [False, True] Switch between attention after the S2 activation and the original EquiformerV1 attention.
ffn_activation: 'silu' # ['silu', 'swiglu']
use_gate_act: False # [False, True] Switch between gate activation and S2 activation.
use_grid_mlp: True # [False, True] If `True`, project onto grids and apply MLPs for the FFNs.

alpha_drop: 0.1 # [0.0, 0.1]
drop_path_rate: 0.1 # [0.0, 0.05]
proj_drop: 0.0

weight_init: 'uniform' # ['uniform', 'normal']

avg_num_nodes: 192.561
avg_degree: 21.024127419363214

use_energy_lin_ref: True
load_energy_lin_ref: True

optim:
batch_size: 1
eval_batch_size: 1
grad_accumulation_steps: 1
load_balancing: atoms
num_workers: 8
lr_initial: 0.0004

optimizer: AdamW
optimizer_params:
weight_decay: 0.3
scheduler: LambdaLR
scheduler_params:
lambda_type: cosine
warmup_factor: 0.2
warmup_epochs: 0.01
lr_min_factor: 0.01

max_epochs: 1
force_coefficient: 100
energy_coefficient: 4
clip_grad_norm: 100
ema_decay: 0.999
loss_energy: mae
loss_force: l2mae

eval_every: 5000
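The `LambdaLR` entries above (`lambda_type: cosine`, `warmup_factor`, `warmup_epochs`, `lr_min_factor`) describe a warmup-then-cosine multiplier on `lr_initial`. A minimal sketch of that schedule, assuming linear warmup from `warmup_factor` to 1 followed by cosine decay to `lr_min_factor` (an assumption about the semantics; the authoritative definition lives in the repo's scheduler code):

```python
import math

def cosine_lr_lambda(step: int, warmup_steps: int, total_steps: int,
                     warmup_factor: float = 0.2,
                     lr_min_factor: float = 0.01) -> float:
    """Multiplier applied to lr_initial: linear warmup, then cosine decay."""
    if step < warmup_steps:
        # Ramp the multiplier linearly from warmup_factor up to 1.0.
        alpha = step / max(1, warmup_steps)
        return warmup_factor * (1.0 - alpha) + alpha
    # Cosine-decay the multiplier from 1.0 down to lr_min_factor.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return lr_min_factor + 0.5 * (1.0 - lr_min_factor) * (1.0 + math.cos(math.pi * progress))
```

With `warmup_epochs: 0.01` and `max_epochs: 1`, warmup covers roughly the first 1% of optimization steps.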
116 changes: 116 additions & 0 deletions configs/odac/s2ef/eqv2_31M_raw.yml
@@ -0,0 +1,116 @@
dataset:
train:
src: data_odac/s2ef/train
normalize_labels: True
target_mean: 0.5514033085536203
target_std: 15.554063738073438
grad_target_mean: 0.0
grad_target_std: 0.04262716323137283
# lin_ref: configs/odac/s2ef_lin_ref.npz
target: raw_y

val:
src: data_odac/s2ef/val_0.1
# lin_ref: configs/odac/s2ef_lin_ref.npz
target: raw_y

logger:
name: wandb

task:
dataset: lmdb
train_on_free_atoms: True
eval_on_free_atoms: True
primary_metric: forces_mae

relaxation_steps: 125
relaxation_fmax: 0.05
write_pos: True
relax_dataset:
src: data_odac/is2r/val
relax_opt:
name: lbfgs
maxstep: 0.04
memory: 50
damping: 1.0
alpha: 70.0

trainer: equiformerv2_forces

model:
name: equiformer_v2

use_pbc: True
regress_forces: True
otf_graph: True
max_neighbors: 20
max_radius: 8.0
max_num_elements: 100

num_layers: 8
sphere_channels: 128
attn_hidden_channels: 64 # [64, 96] Determines the hidden size of message passing; 96 is not required.
num_heads: 8
attn_alpha_channels: 64 # Not used when `use_s2_act_attn` is True.
attn_value_channels: 16
ffn_hidden_channels: 128
norm_type: 'layer_norm_sh' # ['rms_norm_sh', 'layer_norm', 'layer_norm_sh']

lmax_list: [4]
mmax_list: [2]
grid_resolution: 18 # [18, 16, 14, None] To use `None`, comment out this line.

num_sphere_samples: 128

edge_channels: 128
use_atom_edge_embedding: True
distance_function: 'gaussian'
num_distance_basis: 512 # not used

attn_activation: 'silu'
use_s2_act_attn: False # [False, True] Switch between attention after the S2 activation and the original EquiformerV1 attention.
ffn_activation: 'silu' # ['silu', 'swiglu']
use_gate_act: False # [False, True] Switch between gate activation and S2 activation.
use_grid_mlp: True # [False, True] If `True`, project onto grids and apply MLPs for the FFNs.

alpha_drop: 0.1 # [0.0, 0.1]
drop_path_rate: 0.1 # [0.0, 0.05]
proj_drop: 0.0

weight_init: 'uniform' # ['uniform', 'normal']

avg_num_nodes: 192.561
avg_degree: 21.024127419363214

use_energy_lin_ref: True
load_energy_lin_ref: True

# energy_lin_ref_path: configs/odac/s2ef_lin_ref_offset.npz

optim:
batch_size: 3
eval_batch_size: 3
grad_accumulation_steps: 1 # gradient accumulation: effective batch size = `grad_accumulation_steps` * `batch_size` * (number of GPUs)
load_balancing: atoms
num_workers: 8
lr_initial: 0.0004 # [0.0002, 0.0004], eSCN uses 0.0008 for batch size 96

optimizer: AdamW
optimizer_params:
weight_decay: 0.3
scheduler: LambdaLR
scheduler_params:
lambda_type: cosine
warmup_factor: 0.2
warmup_epochs: 0.01
lr_min_factor: 0.01

max_epochs: 3
force_coefficient: 200
energy_coefficient: 1
clip_grad_norm: 100
ema_decay: 0.999
loss_energy: mae
loss_force: l2mae

eval_every: 5000
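As a worked example of the `grad_accumulation_steps` comment above: with `batch_size: 3` and `grad_accumulation_steps: 1`, running on, say, 16 GPUs (a hypothetical count, not part of this config) gives an effective batch size of 3 × 1 × 16 = 48.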
Binary file added configs/odac/s2ef_lin_ref.npz
Binary file added configs/odac/s2ef_lin_ref_offset.npz
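These `.npz` files carry per-element linear-reference coefficients; the EquiformerV2 change below loads a `coeff` array from them. How the files were produced is not part of this commit, but a common recipe is an ordinary least-squares fit from element counts to DFT energies. A hypothetical sketch (all data below is illustrative):

```python
import numpy as np

# Hypothetical inputs: atomic numbers and DFT energies for a few structures.
structures = [np.array([1, 1, 8]), np.array([6, 8, 8]), np.array([1, 6, 1, 1, 1])]
energies = np.array([-14.2, -23.1, -18.4])

max_num_elements = 100
# Composition matrix: row i counts how many atoms of each element structure i has.
A = np.zeros((len(energies), max_num_elements + 1))
for i, z in enumerate(structures):
    np.add.at(A[i], z, 1)

# Least-squares fit of one reference energy per element.
coeff, *_ = np.linalg.lstsq(A, energies, rcond=None)
np.savez("s2ef_lin_ref.npz", coeff=coeff)
```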
2 changes: 1 addition & 1 deletion ocpmodels/common/flags.py
@@ -90,7 +90,7 @@ def add_core_args(self) -> None:
)
self.parser.add_argument(
"--slurm-partition",
default="ocp",
default="ocp_high",
type=str,
help="Name of partition",
)
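The default SLURM partition for submitted jobs changes from `ocp` to `ocp_high`; individual runs can still target another partition by passing `--slurm-partition` explicitly.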
10 changes: 7 additions & 3 deletions ocpmodels/datasets/lmdb_dataset.py
@@ -153,6 +153,9 @@ def __getitem__(self, idx: int) -> T_co:
data_object = rename_data_object_keys(
data_object, self.key_mapping
)

if "target" in self.config:
data_object.y = data_object[self.config["target"]]

data_object = self.transforms(data_object)

@@ -253,8 +256,9 @@ def data_list_collater(
n_neighbors.append(n_index.shape[0])
batch.neighbors = torch.tensor(n_neighbors)
except (NotImplementedError, TypeError):
- logging.warning(
-     "LMDB does not contain edge index information, set otf_graph=True"
- )
+ # logging.warning(
+ #     "LMDB does not contain edge index information, set otf_graph=True"
+ # )
+ pass

return batch
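This is the hook the new ODAC configs rely on: setting `target: raw_y` under `dataset.train`/`dataset.val` makes the loader overwrite `data_object.y` with the stored `raw_y` field, while datasets without a `target` key keep their original labels.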
20 changes: 16 additions & 4 deletions ocpmodels/models/equiformer_v2/equiformer_v2_oc20.py
@@ -152,6 +152,7 @@ def __init__(
avg_degree: Optional[float] = None,
use_energy_lin_ref: Optional[bool] = False,
load_energy_lin_ref: Optional[bool] = False,
energy_lin_ref_path: Optional[str] = None,
):
super().__init__()

@@ -215,6 +216,7 @@ def __init__(

self.use_energy_lin_ref = use_energy_lin_ref
self.load_energy_lin_ref = load_energy_lin_ref
+ self.energy_lin_ref_path = energy_lin_ref_path
assert not (
self.use_energy_lin_ref and not self.load_energy_lin_ref
), "You can't have use_energy_lin_ref = True and load_energy_lin_ref = False, since the model will not have the parameters for the linear references. All other combinations are fine."
@@ -389,10 +391,20 @@ def __init__(
)

if self.load_energy_lin_ref:
- self.energy_lin_ref = nn.Parameter(
-     torch.zeros(self.max_num_elements),
-     requires_grad=False,
- )
+ if not self.energy_lin_ref_path:
+     # Without a path, keep zeros; the values are expected to come from a checkpoint.
+     self.energy_lin_ref = nn.Parameter(
+         torch.zeros(self.max_num_elements),
+         requires_grad=False,
+     )
+ else:
+     # Extra slot (max_num_elements + 1), presumably so atomic numbers
+     # can index the reference table directly.
+     self.energy_lin_ref = nn.Parameter(
+         torch.zeros(self.max_num_elements + 1),
+         requires_grad=False,
+     )
+     import numpy as np
+     coeffs = np.load(self.energy_lin_ref_path)["coeff"]
+     self.energy_lin_ref.add_(self.energy_lin_ref.new_tensor(coeffs))
+     print("Energy Reference:", self.energy_lin_ref)

self.apply(self._init_weights)
self.apply(self._uniform_init_rad_func_linear_weights)
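For orientation, a sketch of how per-element references like these are typically folded back into the predicted energy. This is not the verbatim forward pass of this model (the real call site is elsewhere and not part of the diff); `scatter`, `atomic_numbers`, and `batch` follow standard torch_geometric conventions:

```python
import torch
from torch_geometric.utils import scatter

def add_energy_lin_ref(energy: torch.Tensor,
                       energy_lin_ref: torch.Tensor,
                       atomic_numbers: torch.Tensor,
                       batch: torch.Tensor) -> torch.Tensor:
    """Add each structure's summed per-element reference energies to its
    predicted total energy. Sketch only, under the assumptions above."""
    # Look up a reference energy per atom.
    per_atom_ref = energy_lin_ref[atomic_numbers.long()]
    # Sum atom-wise references within each structure of the batch.
    per_structure_ref = scatter(per_atom_ref, batch, dim=0, reduce="sum")
    return energy + per_structure_ref
```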
7 changes: 6 additions & 1 deletion ocpmodels/trainers/base_trainer.py
@@ -745,7 +745,12 @@ def validate(self, split: str = "val", disable_tqdm: bool = False):
# Forward.
with torch.cuda.amp.autocast(enabled=self.scaler is not None):
batch.to(self.device)
- out = self._forward(batch)
+ try:
+     out = self._forward(batch)
+ except torch.cuda.OutOfMemoryError:
+     # Skip batches that exhaust GPU memory instead of aborting the whole validation pass.
+     logging.error(f"OOM error at batch: {i}")
+     torch.cuda.empty_cache()
+     continue
loss = self._compute_loss(out, batch)

# Compute metrics.
7 changes: 6 additions & 1 deletion ocpmodels/trainers/ocp_trainer.py
@@ -455,7 +455,12 @@
):

with torch.cuda.amp.autocast(enabled=self.scaler is not None):
- out = self._forward(batch)
+ try:
+     out = self._forward(batch)
+ except torch.cuda.OutOfMemoryError:
+     # Mirror the validate() change: skip OOM batches during prediction.
+     logging.error(f"OOM error at batch: {i}")
+     torch.cuda.empty_cache()
+     continue

for target_key in self.config["outputs"]:
pred = out[target_key]