Commit 24d28bb

anuroopsriram committed Apr 27, 2024
1 parent daf72a5
Showing 10 changed files with 268 additions and 12 deletions.
4 changes: 2 additions & 2 deletions configs/odac/s2ef/eqv2_153M.yml
@@ -45,8 +45,8 @@ model:

weight_init: 'uniform' # ['uniform', 'normal']

- norm_scale_nodes: 192.561
- norm_scale_degree: 21.024127419363214
+ avg_num_nodes: 192.561
+ avg_degree: 21.024127419363214

optim:
batch_size: 1
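The values are unchanged; the keys are renamed to EquiformerV2's `avg_num_nodes` and `avg_degree`, the dataset statistics the model uses as normalization constants (for the per-structure energy readout and the per-node message aggregation, respectively, as the parameter names suggest; that reading is inferred from the model code, not stated in this diff).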
114 changes: 114 additions & 0 deletions configs/odac/s2ef/eqv2_153M_raw.yml
@@ -0,0 +1,114 @@
dataset:
train:
src: data_odac/s2ef/train
normalize_labels: True
target_mean: 0.5514033085536203
target_std: 15.554063738073438
grad_target_mean: 0.0
grad_target_std: 0.04262716323137283
lin_ref: configs/odac/s2ef_lin_ref.npz
target: raw_y

val:
src: data_odac/s2ef/val_0.1
lin_ref: configs/odac/s2ef_lin_ref.npz
target: raw_y

logger:
name: wandb

task:
dataset: lmdb
train_on_free_atoms: True
eval_on_free_atoms: True
primary_metric: forces_mae

relaxation_steps: 125
relaxation_fmax: 0.05
write_pos: True
# relax_dataset:
# src: data_odac/is2r/val
# relax_opt:
# name: lbfgs
# maxstep: 0.04
# memory: 50
# damping: 1.0
# alpha: 70.0

trainer: equiformerv2_forces

model:
name: equiformer_v2

use_pbc: True
regress_forces: True
otf_graph: True
max_neighbors: 20
max_radius: 8.0
max_num_elements: 100

num_layers: 20
sphere_channels: 128
attn_hidden_channels: 64 # [64, 96] Determines the hidden size of message passing; 96 is not required.
num_heads: 8
attn_alpha_channels: 64 # Not used when `use_s2_act_attn` is True.
attn_value_channels: 16
ffn_hidden_channels: 128
norm_type: 'layer_norm_sh' # ['rms_norm_sh', 'layer_norm', 'layer_norm_sh']

lmax_list: [6]
mmax_list: [3]
grid_resolution: 18 # [18, 16, 14, None] To use `None`, comment out this line.

num_sphere_samples: 128

edge_channels: 128
use_atom_edge_embedding: True
distance_function: 'gaussian'
num_distance_basis: 512 # not used

attn_activation: 'silu'
use_s2_act_attn: False # [False, True] Switch between attention after the S2 activation and the original EquiformerV1 attention.
ffn_activation: 'silu' # ['silu', 'swiglu']
use_gate_act: False # [False, True] Switch between gate activation and S2 activation.
use_grid_mlp: True # [False, True] If `True`, project onto grids and apply MLPs for the FFNs.

alpha_drop: 0.1 # [0.0, 0.1]
drop_path_rate: 0.1 # [0.0, 0.05]
proj_drop: 0.0

weight_init: 'uniform' # ['uniform', 'normal']

avg_num_nodes: 192.561
avg_degree: 21.024127419363214

use_energy_lin_ref: True
load_energy_lin_ref: True

optim:
batch_size: 1
eval_batch_size: 1
grad_accumulation_steps: 1
load_balancing: atoms
num_workers: 8
lr_initial: 0.0004

optimizer: AdamW
optimizer_params:
weight_decay: 0.3
scheduler: LambdaLR
scheduler_params:
lambda_type: cosine
warmup_factor: 0.2
warmup_epochs: 0.01
lr_min_factor: 0.01

max_epochs: 1
force_coefficient: 100
energy_coefficient: 4
clip_grad_norm: 100
ema_decay: 0.999
loss_energy: mae
loss_force: l2mae

eval_every: 5000
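The `LambdaLR` entries above (`lambda_type: cosine`, `warmup_factor`, `warmup_epochs`, `lr_min_factor`) describe a warmup-then-cosine multiplier on `lr_initial`. A minimal sketch of that schedule, assuming linear warmup from `warmup_factor` to 1 followed by cosine decay to `lr_min_factor` (an assumption about the semantics; the authoritative definition lives in the repo's scheduler code):

```python
import math

def cosine_lr_lambda(step: int, warmup_steps: int, total_steps: int,
                     warmup_factor: float = 0.2,
                     lr_min_factor: float = 0.01) -> float:
    """Multiplier applied to lr_initial: linear warmup, then cosine decay."""
    if step < warmup_steps:
        # Ramp the multiplier linearly from warmup_factor up to 1.0.
        alpha = step / max(1, warmup_steps)
        return warmup_factor * (1.0 - alpha) + alpha
    # Cosine-decay the multiplier from 1.0 down to lr_min_factor.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return lr_min_factor + 0.5 * (1.0 - lr_min_factor) * (1.0 + math.cos(math.pi * progress))
```

With `warmup_epochs: 0.01` and `max_epochs: 1`, warmup covers roughly the first 1% of optimization steps.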
116 changes: 116 additions & 0 deletions configs/odac/s2ef/eqv2_31M_raw.yml
@@ -0,0 +1,116 @@
dataset:
train:
src: data_odac/s2ef/train
normalize_labels: True
target_mean: 0.5514033085536203
target_std: 15.554063738073438
grad_target_mean: 0.0
grad_target_std: 0.04262716323137283
# lin_ref: configs/odac/s2ef_lin_ref.npz
target: raw_y

val:
src: data_odac/s2ef/val_0.1
# lin_ref: configs/odac/s2ef_lin_ref.npz
target: raw_y

logger:
name: wandb

task:
dataset: lmdb
train_on_free_atoms: True
eval_on_free_atoms: True
primary_metric: forces_mae

relaxation_steps: 125
relaxation_fmax: 0.05
write_pos: True
relax_dataset:
src: data_odac/is2r/val
relax_opt:
name: lbfgs
maxstep: 0.04
memory: 50
damping: 1.0
alpha: 70.0

trainer: equiformerv2_forces

model:
name: equiformer_v2

use_pbc: True
regress_forces: True
otf_graph: True
max_neighbors: 20
max_radius: 8.0
max_num_elements: 100

num_layers: 8
sphere_channels: 128
attn_hidden_channels: 64 # [64, 96] Determines the hidden size of message passing; 96 is not required.
num_heads: 8
attn_alpha_channels: 64 # Not used when `use_s2_act_attn` is True.
attn_value_channels: 16
ffn_hidden_channels: 128
norm_type: 'layer_norm_sh' # ['rms_norm_sh', 'layer_norm', 'layer_norm_sh']

lmax_list: [4]
mmax_list: [2]
grid_resolution: 18 # [18, 16, 14, None] To use `None`, comment out this line.

num_sphere_samples: 128

edge_channels: 128
use_atom_edge_embedding: True
distance_function: 'gaussian'
num_distance_basis: 512 # not used

attn_activation: 'silu'
use_s2_act_attn: False # [False, True] Switch between attention after the S2 activation and the original EquiformerV1 attention.
ffn_activation: 'silu' # ['silu', 'swiglu']
use_gate_act: False # [False, True] Switch between gate activation and S2 activation.
use_grid_mlp: True # [False, True] If `True`, project onto grids and apply MLPs for the FFNs.

alpha_drop: 0.1 # [0.0, 0.1]
drop_path_rate: 0.1 # [0.0, 0.05]
proj_drop: 0.0

weight_init: 'uniform' # ['uniform', 'normal']

avg_num_nodes: 192.561
avg_degree: 21.024127419363214

use_energy_lin_ref: True
load_energy_lin_ref: True

# energy_lin_ref_path: configs/odac/s2ef_lin_ref_offset.npz

optim:
batch_size: 3
eval_batch_size: 3
grad_accumulation_steps: 1 # gradient accumulation: effective batch size = `grad_accumulation_steps` * `batch_size` * (number of GPUs)
load_balancing: atoms
num_workers: 8
lr_initial: 0.0004 # [0.0002, 0.0004], eSCN uses 0.0008 for batch size 96

optimizer: AdamW
optimizer_params:
weight_decay: 0.3
scheduler: LambdaLR
scheduler_params:
lambda_type: cosine
warmup_factor: 0.2
warmup_epochs: 0.01
lr_min_factor: 0.01

max_epochs: 3
force_coefficient: 200
energy_coefficient: 1
clip_grad_norm: 100
ema_decay: 0.999
loss_energy: mae
loss_force: l2mae

eval_every: 5000
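As a worked example of the `grad_accumulation_steps` comment above: with `batch_size: 3` and `grad_accumulation_steps: 1`, running on, say, 16 GPUs (a hypothetical count, not part of this config) gives an effective batch size of 3 × 1 × 16 = 48.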
Binary file added configs/odac/s2ef_lin_ref.npz
Binary file added configs/odac/s2ef_lin_ref_offset.npz
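These `.npz` files carry per-element linear-reference coefficients; the EquiformerV2 change below loads a `coeff` array from them. How the files were produced is not part of this commit, but a common recipe is an ordinary least-squares fit from element counts to DFT energies. A hypothetical sketch (all data below is illustrative):

```python
import numpy as np

# Hypothetical inputs: atomic numbers and DFT energies for a few structures.
structures = [np.array([1, 1, 8]), np.array([6, 8, 8]), np.array([1, 6, 1, 1, 1])]
energies = np.array([-14.2, -23.1, -18.4])

max_num_elements = 100
# Composition matrix: row i counts how many atoms of each element structure i has.
A = np.zeros((len(energies), max_num_elements + 1))
for i, z in enumerate(structures):
    np.add.at(A[i], z, 1)

# Least-squares fit of one reference energy per element.
coeff, *_ = np.linalg.lstsq(A, energies, rcond=None)
np.savez("s2ef_lin_ref.npz", coeff=coeff)
```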
2 changes: 1 addition & 1 deletion ocpmodels/common/flags.py
@@ -90,7 +90,7 @@ def add_core_args(self) -> None:
)
self.parser.add_argument(
"--slurm-partition",
default="ocp",
default="ocp_high",
type=str,
help="Name of partition",
)
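The default SLURM partition for submitted jobs changes from `ocp` to `ocp_high`; individual runs can still target another partition by passing `--slurm-partition` explicitly.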
10 changes: 7 additions & 3 deletions ocpmodels/datasets/lmdb_dataset.py
@@ -153,6 +153,9 @@ def __getitem__(self, idx: int) -> T_co:
data_object = rename_data_object_keys(
data_object, self.key_mapping
)

if "target" in self.config:
data_object.y = data_object[self.config["target"]]

data_object = self.transforms(data_object)

@@ -253,8 +256,9 @@ def data_list_collater(
n_neighbors.append(n_index.shape[0])
batch.neighbors = torch.tensor(n_neighbors)
except (NotImplementedError, TypeError):
- logging.warning(
-     "LMDB does not contain edge index information, set otf_graph=True"
- )
+ # logging.warning(
+ #     "LMDB does not contain edge index information, set otf_graph=True"
+ # )
+ pass

return batch
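This is the hook the new ODAC configs rely on: setting `target: raw_y` under `dataset.train`/`dataset.val` makes the loader overwrite `data_object.y` with the stored `raw_y` field, while datasets without a `target` key keep their original labels.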
20 changes: 16 additions & 4 deletions ocpmodels/models/equiformer_v2/equiformer_v2_oc20.py
@@ -152,6 +152,7 @@ def __init__(
avg_degree: Optional[float] = None,
use_energy_lin_ref: Optional[bool] = False,
load_energy_lin_ref: Optional[bool] = False,
energy_lin_ref_path: Optional[str] = None,
):
super().__init__()

@@ -215,6 +216,7 @@ def __init__(

self.use_energy_lin_ref = use_energy_lin_ref
self.load_energy_lin_ref = load_energy_lin_ref
+ self.energy_lin_ref_path = energy_lin_ref_path
assert not (
self.use_energy_lin_ref and not self.load_energy_lin_ref
), "You can't have use_energy_lin_ref = True and load_energy_lin_ref = False, since the model will not have the parameters for the linear references. All other combinations are fine."
@@ -389,10 +391,20 @@ def __init__(
)

if self.load_energy_lin_ref:
- self.energy_lin_ref = nn.Parameter(
-     torch.zeros(self.max_num_elements),
-     requires_grad=False,
- )
+ if not self.energy_lin_ref_path:
+     # Without a path, keep zeros; the values are expected to come from a checkpoint.
+     self.energy_lin_ref = nn.Parameter(
+         torch.zeros(self.max_num_elements),
+         requires_grad=False,
+     )
+ else:
+     # Extra slot (max_num_elements + 1), presumably so atomic numbers
+     # can index the reference table directly.
+     self.energy_lin_ref = nn.Parameter(
+         torch.zeros(self.max_num_elements + 1),
+         requires_grad=False,
+     )
+     import numpy as np
+     coeffs = np.load(self.energy_lin_ref_path)["coeff"]
+     self.energy_lin_ref.add_(self.energy_lin_ref.new_tensor(coeffs))
+     print("Energy Reference:", self.energy_lin_ref)

self.apply(self._init_weights)
self.apply(self._uniform_init_rad_func_linear_weights)
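For orientation, a sketch of how per-element references like these are typically folded back into the predicted energy. This is not the verbatim forward pass of this model (the real call site is elsewhere and not part of the diff); `scatter`, `atomic_numbers`, and `batch` follow standard torch_geometric conventions:

```python
import torch
from torch_geometric.utils import scatter

def add_energy_lin_ref(energy: torch.Tensor,
                       energy_lin_ref: torch.Tensor,
                       atomic_numbers: torch.Tensor,
                       batch: torch.Tensor) -> torch.Tensor:
    """Add each structure's summed per-element reference energies to its
    predicted total energy. Sketch only, under the assumptions above."""
    # Look up a reference energy per atom.
    per_atom_ref = energy_lin_ref[atomic_numbers.long()]
    # Sum atom-wise references within each structure of the batch.
    per_structure_ref = scatter(per_atom_ref, batch, dim=0, reduce="sum")
    return energy + per_structure_ref
```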
7 changes: 6 additions & 1 deletion ocpmodels/trainers/base_trainer.py
@@ -745,7 +745,12 @@ def validate(self, split: str = "val", disable_tqdm: bool = False):
# Forward.
with torch.cuda.amp.autocast(enabled=self.scaler is not None):
batch.to(self.device)
- out = self._forward(batch)
+ try:
+     out = self._forward(batch)
+ except torch.cuda.OutOfMemoryError:
+     # Skip batches that exhaust GPU memory instead of aborting the whole validation pass.
+     logging.error(f"OOM error at batch: {i}")
+     torch.cuda.empty_cache()
+     continue
loss = self._compute_loss(out, batch)

# Compute metrics.
7 changes: 6 additions & 1 deletion ocpmodels/trainers/ocp_trainer.py
@@ -455,7 +455,12 @@
):

with torch.cuda.amp.autocast(enabled=self.scaler is not None):
- out = self._forward(batch)
+ try:
+     out = self._forward(batch)
+ except torch.cuda.OutOfMemoryError:
+     # Mirror the validate() change: skip OOM batches during prediction.
+     logging.error(f"OOM error at batch: {i}")
+     torch.cuda.empty_cache()
+     continue

for target_key in self.config["outputs"]:
pred = out[target_key]