From 24d28bbc0d32452d00047d96ec5a56cd8a561a43 Mon Sep 17 00:00:00 2001
From: anuroopsriram
Date: Sat, 27 Apr 2024 19:25:09 +0000
Subject: [PATCH] Add ODAC S2EF raw-energy configs and energy linear references

---
 configs/odac/s2ef/eqv2_153M.yml          |   4 +-
 configs/odac/s2ef/eqv2_153M_raw.yml      | 114 +++++++++++++++++
 configs/odac/s2ef/eqv2_31M_raw.yml       | 116 ++++++++++++++++++
 configs/odac/s2ef_lin_ref.npz            | Bin 0 -> 1064 bytes
 configs/odac/s2ef_lin_ref_offset.npz     | Bin 0 -> 1072 bytes
 ocpmodels/common/flags.py                |   2 +-
 ocpmodels/datasets/lmdb_dataset.py       |  10 +-
 .../equiformer_v2/equiformer_v2_oc20.py  |  20 ++-
 ocpmodels/trainers/base_trainer.py       |   7 +-
 ocpmodels/trainers/ocp_trainer.py        |   7 +-
 10 files changed, 268 insertions(+), 12 deletions(-)
 create mode 100644 configs/odac/s2ef/eqv2_153M_raw.yml
 create mode 100644 configs/odac/s2ef/eqv2_31M_raw.yml
 create mode 100644 configs/odac/s2ef_lin_ref.npz
 create mode 100644 configs/odac/s2ef_lin_ref_offset.npz

diff --git a/configs/odac/s2ef/eqv2_153M.yml b/configs/odac/s2ef/eqv2_153M.yml
index cbb21fe2d..f71d923e7 100755
--- a/configs/odac/s2ef/eqv2_153M.yml
+++ b/configs/odac/s2ef/eqv2_153M.yml
@@ -45,8 +45,8 @@ model:
 
   weight_init: 'uniform' # ['uniform', 'normal']
 
-  norm_scale_nodes: 192.561
-  norm_scale_degree: 21.024127419363214
+  avg_num_nodes: 192.561
+  avg_degree: 21.024127419363214
 
 optim:
   batch_size: 1
diff --git a/configs/odac/s2ef/eqv2_153M_raw.yml b/configs/odac/s2ef/eqv2_153M_raw.yml
new file mode 100644
index 000000000..9a4ace679
--- /dev/null
+++ b/configs/odac/s2ef/eqv2_153M_raw.yml
@@ -0,0 +1,114 @@
+dataset:
+  train:
+    src: data_odac/s2ef/train
+    normalize_labels: True
+    target_mean: 0.5514033085536203
+    target_std: 15.554063738073438
+    grad_target_mean: 0.0
+    grad_target_std: 0.04262716323137283
+    lin_ref: configs/odac/s2ef_lin_ref.npz
+    target: raw_y
+
+  val:
+    src: data_odac/s2ef/val_0.1
+    lin_ref: configs/odac/s2ef_lin_ref.npz
+    target: raw_y
+
+logger:
+  name: wandb
+
+task:
+  dataset: lmdb
+  train_on_free_atoms: True
+  eval_on_free_atoms: True
+  primary_metric: forces_mae
+
+  relaxation_steps: 125
+  relaxation_fmax: 0.05
+  write_pos: True
+  # relax_dataset:
+  #   src: data_odac/is2r/val
+  # relax_opt:
+  #   name: lbfgs
+  #   maxstep: 0.04
+  #   memory: 50
+  #   damping: 1.0
+  #   alpha: 70.0
+
+trainer: equiformerv2_forces
+
+model:
+  name: equiformer_v2
+
+  use_pbc: True
+  regress_forces: True
+  otf_graph: True
+  max_neighbors: 20
+  max_radius: 8.0
+  max_num_elements: 100
+
+  num_layers: 20
+  sphere_channels: 128
+  attn_hidden_channels: 64 # [64, 96] Hidden size for message passing; 96 is not required.
+  num_heads: 8
+  attn_alpha_channels: 64 # Not used when `use_s2_act_attn` is True.
+  attn_value_channels: 16
+  ffn_hidden_channels: 128
+  norm_type: 'layer_norm_sh' # ['rms_norm_sh', 'layer_norm', 'layer_norm_sh']
+
+  lmax_list: [6]
+  mmax_list: [3]
+  grid_resolution: 18 # [18, 16, 14, None] For `None`, comment out this line.
+
+  num_sphere_samples: 128
+
+  edge_channels: 128
+  use_atom_edge_embedding: True
+  distance_function: 'gaussian'
+  num_distance_basis: 512 # not used
+
+  attn_activation: 'silu'
+  use_s2_act_attn: False # [False, True] Use attention after the S2 activation instead of the original EquiformerV1 attention.
+  ffn_activation: 'silu' # ['silu', 'swiglu']
+  use_gate_act: False # [False, True] Switch between gate activation and S2 activation.
+  use_grid_mlp: True # [False, True] If `True`, project to grids and run MLPs for the FFNs.
+
+  alpha_drop: 0.1 # [0.0, 0.1]
+  drop_path_rate: 0.1 # [0.0, 0.05, 0.1]
+  proj_drop: 0.0
+
+  weight_init: 'uniform' # ['uniform', 'normal']
+
+  avg_num_nodes: 192.561
+  avg_degree: 21.024127419363214
+
+  use_energy_lin_ref: True
+  load_energy_lin_ref: True
+
+optim:
+  batch_size: 1
+  eval_batch_size: 1
+  grad_accumulation_steps: 1
+  load_balancing: atoms
+  num_workers: 8
+  lr_initial: 0.0004
+
+  optimizer: AdamW
+  optimizer_params:
+    weight_decay: 0.3
+  scheduler: LambdaLR
+  scheduler_params:
+    lambda_type: cosine
+    warmup_factor: 0.2
+    warmup_epochs: 0.01
+    lr_min_factor: 0.01
+
+  max_epochs: 1
+  force_coefficient: 100
+  energy_coefficient: 4
+  clip_grad_norm: 100
+  ema_decay: 0.999
+  loss_energy: mae
+  loss_force: l2mae
+
+  eval_every: 5000
diff --git a/configs/odac/s2ef/eqv2_31M_raw.yml b/configs/odac/s2ef/eqv2_31M_raw.yml
new file mode 100644
index 000000000..cce370d36
--- /dev/null
+++ b/configs/odac/s2ef/eqv2_31M_raw.yml
@@ -0,0 +1,116 @@
+dataset:
+  train:
+    src: data_odac/s2ef/train
+    normalize_labels: True
+    target_mean: 0.5514033085536203
+    target_std: 15.554063738073438
+    grad_target_mean: 0.0
+    grad_target_std: 0.04262716323137283
+    # lin_ref: configs/odac/s2ef_lin_ref.npz
+    target: raw_y
+
+  val:
+    src: data_odac/s2ef/val_0.1
+    # lin_ref: configs/odac/s2ef_lin_ref.npz
+    target: raw_y
+
+logger:
+  name: wandb
+
+task:
+  dataset: lmdb
+  train_on_free_atoms: True
+  eval_on_free_atoms: True
+  primary_metric: forces_mae
+
+  relaxation_steps: 125
+  relaxation_fmax: 0.05
+  write_pos: True
+  relax_dataset:
+    src: data_odac/is2r/val
+  relax_opt:
+    name: lbfgs
+    maxstep: 0.04
+    memory: 50
+    damping: 1.0
+    alpha: 70.0
+
+trainer: equiformerv2_forces
+
+model:
+  name: equiformer_v2
+
+  use_pbc: True
+  regress_forces: True
+  otf_graph: True
+  max_neighbors: 20
+  max_radius: 8.0
+  max_num_elements: 100
+
+  num_layers: 8
+  sphere_channels: 128
+  attn_hidden_channels: 64 # [64, 96] Hidden size for message passing; 96 is not required.
+  num_heads: 8
+  attn_alpha_channels: 64 # Not used when `use_s2_act_attn` is True.
+  attn_value_channels: 16
+  ffn_hidden_channels: 128
+  norm_type: 'layer_norm_sh' # ['rms_norm_sh', 'layer_norm', 'layer_norm_sh']
+
+  lmax_list: [4]
+  mmax_list: [2]
+  grid_resolution: 18 # [18, 16, 14, None] For `None`, comment out this line.
+
+  num_sphere_samples: 128
+
+  edge_channels: 128
+  use_atom_edge_embedding: True
+  distance_function: 'gaussian'
+  num_distance_basis: 512 # not used
+
+  attn_activation: 'silu'
+  use_s2_act_attn: False # [False, True] Use attention after the S2 activation instead of the original EquiformerV1 attention.
+  ffn_activation: 'silu' # ['silu', 'swiglu']
+  use_gate_act: False # [False, True] Switch between gate activation and S2 activation.
+  use_grid_mlp: True # [False, True] If `True`, project to grids and run MLPs for the FFNs.
+
+  alpha_drop: 0.1 # [0.0, 0.1]
+  drop_path_rate: 0.1 # [0.0, 0.05, 0.1]
+  proj_drop: 0.0
+
+  weight_init: 'uniform' # ['uniform', 'normal']
+
+  avg_num_nodes: 192.561
+  avg_degree: 21.024127419363214
+
+  use_energy_lin_ref: True
+  load_energy_lin_ref: True
+
+  # energy_lin_ref_path: configs/odac/s2ef_lin_ref_offset.npz
+
+optim:
+  batch_size: 3
+  eval_batch_size: 3
+  grad_accumulation_steps: 1 # effective batch size = `grad_accumulation_steps` * `batch_size` * (num of GPUs)
+  load_balancing: atoms
+  num_workers: 8
+  lr_initial: 0.0004 # [0.0002, 0.0004]; eSCN uses 0.0008 for batch size 96
+
+  optimizer: AdamW
+  optimizer_params:
+    weight_decay: 0.3
+  scheduler: LambdaLR
+  scheduler_params:
+    lambda_type: cosine
+    warmup_factor: 0.2
+    warmup_epochs: 0.01
+    lr_min_factor: 0.01
+
+  max_epochs: 3
+  force_coefficient: 200
+  energy_coefficient: 1
+  clip_grad_norm: 100
+  ema_decay: 0.999
+  loss_energy: mae
+  loss_force: l2mae
+
+  eval_every: 5000
diff --git a/configs/odac/s2ef_lin_ref.npz b/configs/odac/s2ef_lin_ref.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0972f3e81d3fefd27046ca3450884d170bdfceae
GIT binary patch
literal 1064
zcmWIWW@Zs#fB;2?*Iqd@7cet0fG{V62t#syYFe6JUO^=zg8*0%q!1(t0+anheFGvH
z8Oj){)l*W7lZ(`?6x40fEYx)r)YI~dN{SNm;`57AQbFQwi8;loK=I;?#DY{HU&GMA
zKu1$Yp;m!xz|~fJ&#zBk-~iY$8~XQ7JACQ#R0-h&nVbK_RB6i|=nytuyCzZkz_m)_
zdi6Dm2UPPc1r=Y)AK2#Z6tH%Q^nopp_!jGB3PSWZ7}*{;aHU--pZ~x+E3qkjPs9#5
zc39UK{!%{BaFxT!Iaubv8m8Khm5MS4z67YXTskgzz`&pV+7>2eh<*kRTd+GABy3#-
zpGY$J$sO45cp_HgkotkNqJ-;4oH_^Y1RAzD>{dU(+>`O`jkfB6hP9cR?8d4G9=!VL
z-1A!Dz_XXPc3)SLIneYv<>mtykpq`=7SwL-<2taxBS5QsGxLFICri@rd5RqnP}^;#
zv{~@LCxhBw{}p);T-ow0^hGM)ffMV0Evu0JxF6z<7keP|FB^tO^|e?2%N($&ojLh&
zz0Lu1cR|V7S9K3e;^ejR3Dh`{HgiL2lcYM-9|!i_Zql`^rfkd%hEdV
z!P4{a^y@lMeGm4mYI-*9Ij7NqH=Dm7Tee8=K!~x&l};wp1KK;zXEY`oA87d?d~%hf
z;Q`SHr(N_o^bf=>Qh6H{u5+MsVy9u|3N5HP3=G$78`iQYsUOJd_|Q}S!vHR>Gi$*)
zlNmY(qE_DtuZYuzi!(q$HBa__mxW>nRJHVjuDJ6ZSRiq?wxa0<)c-J53=E66mUzwy
z62Z`qP=TG5UeG*E&KBVwF#(rLkKXGYcvGvw(I#(1vUxCD0=yZSM3`}9ZkQ?th6Y9u
W8=lnzyjj^m0*pXt4y12`vpWE5c1Q#O

literal 0
HcmV?d00001

diff --git a/configs/odac/s2ef_lin_ref_offset.npz b/configs/odac/s2ef_lin_ref_offset.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7498f8968c77c72aa9cb109b4ffc7a1c3cc565b4
GIT binary patch
literal 1072
zcmWIWW@Zs#fB;2??FYC1Uct=30K%LMA`Hp-scC6?c?Fe>3<6+5kV23o2u$`1^$mz*
zWGG{(R!>PSPA*cnQc$-^vryMjP*2M*Dk)0Li_b4gNd<|!CFT^T0>z6n5(`p+d<{bb
zLmf>Wg<1u&0T;v-3=D0x_x$?w1rC7u8~XQ7JACQ#R0-h&nVbK_RB6i|=nytuyCzZk
zz_m)_di6Dm2UPPc1r=Y)AK2#Z6tH%Q^nopp_!jGB3PSWZ7}*{;aHU--pZ~x+E3qkj
zPs9#5c39UK{!%{BaFxT!Iaubv8m8Khm5MS4z67YXTskgzz`&pV+7>2eh<*kRTd+GA
zBy3#-pGY$J$sO45cp_HgkotkNqJ-;4oH_^Y1RAzD>{dU(+>`O`jkfB6hP9cR?8d4G
z9=!VL-1A!Dz_XXPc3)SLIneYv<>mtykpq`=7SwL-<2taxBS5QsGxLFICri@rd5Rqn
zP}^;#v{~@LCxhBw{}p);T-ow0^hGM)ffMV0Evu0JxF6z<7keP|FB^tO^|e?2%N($&
zojLh&z0Lu1cR|V7S9K3e;^ejR3Dh`{HgiL2lcYM-9|!i_Zql`^rfkd
z%hEdV!P4{a^y@lMeGm4mYI-*9Ij7NqH=Dm7Tee8=K!~x&l};wp1KK;zXEY`oA87d?
zd~%hf;Q`SHr(N_o^bf=>Qh6H{u5+MsVy9u|3N5HP3=G$78`iQYsUOJd_|Q}S!vHR>
zGi$*)lNmY(qE_DtuZYuzi!(q$HBa__mxW>nRJHVjuDJ6ZSRiq?wxa0<)c-J53=E66
zmUzwy62Z`qP=TG5UeG*E&KBVwF#(rLkKXGYcvGvw(I#(1vUxCD0=yZSM3`}9a+oRx
ah6Y9u8=mC@yjj^m0*pXt4y12`vpoPOVoDhR

literal 0
HcmV?d00001

diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py
index 7b84f58aa..68baad226 100644
--- a/ocpmodels/common/flags.py
+++ b/ocpmodels/common/flags.py
@@ -90,7 +90,7 @@ def add_core_args(self) -> None:
         )
         self.parser.add_argument(
             "--slurm-partition",
-            default="ocp",
+            default="ocp_high",
             type=str,
             help="Name of partition",
         )
diff --git a/ocpmodels/datasets/lmdb_dataset.py b/ocpmodels/datasets/lmdb_dataset.py
index 1c7e313ac..73b7fce9a 100644
--- a/ocpmodels/datasets/lmdb_dataset.py
+++ b/ocpmodels/datasets/lmdb_dataset.py
@@ -153,6 +153,9 @@ def __getitem__(self, idx: int) -> T_co:
             data_object = rename_data_object_keys(
                 data_object, self.key_mapping
             )
+
+        if "target" in self.config:
+            data_object.y = data_object[self.config["target"]]
 
         data_object = self.transforms(data_object)
 
@@ -253,8 +256,9 @@ def data_list_collater(
             n_neighbors.append(n_index.shape[0])
         batch.neighbors = torch.tensor(n_neighbors)
     except (NotImplementedError, TypeError):
-        logging.warning(
-            "LMDB does not contain edge index information, set otf_graph=True"
-        )
+        # Edge indices are absent when graphs are built on the fly
+        # (otf_graph=True); this is expected, so skip the noisy
+        # per-batch warning that used to fire here.
+        pass
 
     return batch
diff --git a/ocpmodels/models/equiformer_v2/equiformer_v2_oc20.py b/ocpmodels/models/equiformer_v2/equiformer_v2_oc20.py
index 93598b6d7..684104fe7 100644
--- a/ocpmodels/models/equiformer_v2/equiformer_v2_oc20.py
+++ b/ocpmodels/models/equiformer_v2/equiformer_v2_oc20.py
@@ -152,6 +152,7 @@ def __init__(
         avg_degree: Optional[float] = None,
         use_energy_lin_ref: Optional[bool] = False,
         load_energy_lin_ref: Optional[bool] = False,
+        energy_lin_ref_path: Optional[str] = None,
     ):
         super().__init__()
 
@@ -215,6 +216,7 @@ def __init__(
 
         self.use_energy_lin_ref = use_energy_lin_ref
         self.load_energy_lin_ref = load_energy_lin_ref
+        self.energy_lin_ref_path = energy_lin_ref_path
         assert not (
             self.use_energy_lin_ref and not self.load_energy_lin_ref
         ), "You can't have use_energy_lin_ref = True and load_energy_lin_ref = False, since the model will not have the parameters for the linear references. All other combinations are fine."
@@ -389,10 +391,20 @@ def __init__(
             )
 
         if self.load_energy_lin_ref:
-            self.energy_lin_ref = nn.Parameter(
-                torch.zeros(self.max_num_elements),
-                requires_grad=False,
-            )
+            if not self.energy_lin_ref_path:
+                self.energy_lin_ref = nn.Parameter(
+                    torch.zeros(self.max_num_elements),
+                    requires_grad=False,
+                )
+            else:
+                self.energy_lin_ref = nn.Parameter(
+                    torch.zeros(self.max_num_elements + 1),
+                    requires_grad=False,
+                )
+                import numpy as np
+                coeffs = np.load(self.energy_lin_ref_path)["coeff"]
+                self.energy_lin_ref.add_(self.energy_lin_ref.new_tensor(coeffs))
+                print("Energy Reference:", self.energy_lin_ref)
 
         self.apply(self._init_weights)
         self.apply(self._uniform_init_rad_func_linear_weights)
diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py
index 30e80946b..438ff7165 100644
--- a/ocpmodels/trainers/base_trainer.py
+++ b/ocpmodels/trainers/base_trainer.py
@@ -745,7 +745,12 @@ def validate(self, split: str = "val", disable_tqdm: bool = False):
             # Forward.
             with torch.cuda.amp.autocast(enabled=self.scaler is not None):
                 batch.to(self.device)
-                out = self._forward(batch)
+                try:
+                    out = self._forward(batch)
+                except torch.cuda.OutOfMemoryError:
+                    logging.error(f"OOM error at batch: {i}")
+                    torch.cuda.empty_cache()
+                    continue
             loss = self._compute_loss(out, batch)
 
             # Compute metrics.
diff --git a/ocpmodels/trainers/ocp_trainer.py b/ocpmodels/trainers/ocp_trainer.py
index 0e540805b..56e2ed2b9 100644
--- a/ocpmodels/trainers/ocp_trainer.py
+++ b/ocpmodels/trainers/ocp_trainer.py
@@ -455,7 +455,12 @@ def predict(
         ):
             with torch.cuda.amp.autocast(enabled=self.scaler is not None):
-                out = self._forward(batch)
+                try:
+                    out = self._forward(batch)
+                except torch.cuda.OutOfMemoryError:
+                    logging.error(f"OOM error at batch: {i}")
+                    torch.cuda.empty_cache()
+                    continue
 
             for target_key in self.config["outputs"]:
                 pred = out[target_key]
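
A note on the energy linear references these configs rely on (an illustration, not part of the patch): the model loads a "coeff" array from the given .npz and, when use_energy_lin_ref is enabled, offsets each structure's energy by the sum of per-atom coefficients, which makes the raw_y targets roughly composition-independent. The sketch below shows one way such a file could be fit and applied; it assumes "coeff" is indexed by atomic number (hence the max_num_elements + 1 sizing above) and uses synthetic stand-in data, not the real ODAC training set.

    import numpy as np

    MAX_NUM_ELEMENTS = 100  # matches max_num_elements in the configs above

    # Synthetic stand-ins for (composition, DFT total energy) pairs.
    rng = np.random.default_rng(0)
    structures = [rng.integers(1, 10, size=n) for n in (12, 30, 7)]  # atomic numbers
    energies = np.array([-50.3, -120.9, -31.4])  # made-up total energies (eV)

    # Fit: least-squares regression of total energy on element counts, where
    # A[s, Z] counts atoms with atomic number Z in structure s.
    A = np.zeros((len(structures), MAX_NUM_ELEMENTS + 1))
    for row, zs in zip(A, structures):
        np.add.at(row, zs, 1.0)
    coeff, *_ = np.linalg.lstsq(A, energies, rcond=None)
    np.savez("example_lin_ref.npz", coeff=coeff)  # same "coeff" key the model reads

    # Apply: a structure's baseline is the sum of its per-atom coefficients;
    # subtract it from raw energies, or add it back to model predictions.
    ref = np.load("example_lin_ref.npz")["coeff"]
    baseline = ref[structures[0]].sum()
    print(f"composition baseline for structure 0: {baseline:.3f} eV")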