Merge branch 'main' into aseatoms-pbc

FAIR-Chem · Aug 6, 2024 · 2eeaa31 · 2eeaa31
2 parents 2f34898 + b2eebb6
commit 2eeaa31
Show file tree

Hide file tree

Showing 82 changed files with 5,276 additions and 1,251 deletions.
diff --git a/configs/ocp_example.yml b/configs/ocp_example.yml
@@ -12,7 +12,7 @@ dataset:
     # Can use 'single_point_lmdb' or 'trajectory_lmdb' for backward compatibility.
     # 'single_point_lmdb' was for training IS2RE models, and 'trajectory_lmdb' was
     # for training S2EF models.
-    format: lmdb                                                                 # 'lmdb' or 'oc22_lmdb'
+    format: lmdb                                                   # 'lmdb', 'oc22_lmdb', or 'ase_d'
     # Directory containing training set LMDBs
     src: data/s2ef/all/train/
     # If we want to rename a target value stored in the data object, specify the mapping here.
@@ -34,9 +34,11 @@ dataset:
             irrep_dim: 0
           anisotropic_stress:
             irrep_dim: 2
-      # If we want to normalize targets, i.e. subtract the mean and
-      # divide by standard deviation, then specify the 'mean' and 'stdev' here.
+      # If we want to normalize targets, there are a couple of ways to specify normalization values.
+      # normalization values are applied as: (target - mean) / rmsd
+      # Note root mean squared difference (rmsd) is equal to stdev if mean != 0, and equal to rms if mean == 0.
       # Statistics will by default be applied to the validation and test set.
+      # 1) specify the 'mean' and 'stdev' explicitly here.
       normalizer:
         energy:
           mean: -0.7554450631141663
@@ -49,17 +51,60 @@ dataset:
           stdev: 674.1657344451734
         anisotropic_stress:
           stdev: 143.72764771869745
+      # 2) Estimate the values on-the-fly (OTF) from training data
+      normalizer:
+        fit:
+          targets:
+            forces: { mean: 0.0 }   # values can be explicitly set, i.e. if you need RMS forces instead of stdev forces
+            stress_isotropic: { }   # to estimate both mean and rmsd set to {} or None
+            stress_anisotropic: { }
+          batch_size: 64
+          num_batches: 5000   # if num_batches is not given, the whole dataset will be used
+      # 3) Specify a single .pt file with dict of target names and Normalizer modules
+      # (this is the format that OTF values are saved in)
+      # see Normalizer module in fairchem.core.modules.normalization.normalizer
+      normalizer:
+        file: normalizers.pt
+      # 4) specify an individual file either .pt or .npz with keys 'mean' and 'rmsd' or 'stdev'
+      normalizer:
+        energy:
+          file: energy_norm.pt
+        forces:
+          file: forces_norm.npz
+        isotropic_stress:
+          file: isostress_norm.npz
+        anisotropic_stress:
+          file: anisostress_norm.npz
+      # If we want to train on total energies and use a per-element linear reference normalization
+      # scheme, we can estimate the references from the data or specify the path to a per-element references file.
+      # 1) Fit element references from data
+      element_references:
+        fit:
+          targets:
+            - energy
+          batch_size: 64
+          num_batches: 5000   # if num_batches is not given, the whole dataset will be used
+      # 2) Specify a file with key energy and a LinearReference object. This is the format OTF references are saved in.
+      # see fairchem.core.modules.normalization.element_references for references.
+      element_references:
+        file: element_references.pt
+      # 3) Legacy files in npz format can be specified as well. They must have the element references
+      #  under the key coeff
+      element_references:
+        energy:
+          file: element_ref.npz
+    # 4) backwards compatibility only, linear references can be set as follows. Setting the references
+    # file as follows is a legacy setting and only works with oc22_lmdb and ase_lmdb datasets
+    lin_ref: element_ref.npz
+
     # If we want to train OC20 on total energy, a path to OC20 reference
     # energies `oc20_ref` must be specified to unreference existing OC20 data.
     # download at https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/oc20_ref.pkl
     # Also, train_on_oc20_total_energies must be set to True
     # OC22 defaults to total energy, so these flags are not necessary.
     train_on_oc20_total_energies: False                                         # True or False
     oc20_ref: None                                                              # path to oc20_ref
-    # If we want to train on total energies and use a linear reference
-    # normalization scheme, we must specify the path to the per-element
-    # coefficients in a `.npz` format.
-    lin_ref: False                                                              # True or False
+
   val:
     # Directory containing val set LMDBs
     src: data/s2ef/all/val_id/

diff --git a/docs/core/datasets/oc20dense.md b/docs/core/datasets/oc20dense.md
@@ -11,7 +11,7 @@ The OC20Dense dataset is a validation dataset which was used to assess model per
 |ASE Trajectories    |29G    |112G   | [ee937e5290f8f720c914dc9a56e0281f](https://dl.fbaipublicfiles.com/opencatalystproject/data/adsorbml/oc20_dense_trajectories.tar.gz)   |
 
 The following files are also provided to be used for evaluation and general information:
-* `oc20dense_mapping.pkl` : Mapping of the LMDB `sid` to general metadata information -
+* `oc20dense_mapping.pkl` : Mapping of the LMDB `sid` to general metadata information. If this file is not present, run the command `python src/fairchem/core/scripts/download_large_files.py adsorbml` from the root of the fairchem repo to download it. -
   * `system_id`: Unique system identifier for an adsorbate, bulk, surface combination.
   * `config_id`: Unique configuration identifier, where `rand` and `heur` correspond to random and heuristic initial configurations, respectively.
   * `mpid`: Materials Project bulk identifier.

diff --git a/docs/core/fine-tuning/fine-tuning-oxides.md b/docs/core/fine-tuning/fine-tuning-oxides.md
@@ -205,6 +205,7 @@ from fairchem.core.common.tutorial_utils import generate_yml_config
 yml = generate_yml_config(checkpoint_path, 'config.yml',
                    delete=['slurm', 'cmd', 'logger', 'task', 'model_attributes',
                            'optim.loss_force', # the checkpoint setting causes an error
+			   'optim.load_balancing',
                            'dataset', 'test_dataset', 'val_dataset'],
                    update={'gpus': 1,
                            'optim.eval_every': 10,

diff --git a/docs/core/install.md b/docs/core/install.md
@@ -44,28 +44,28 @@ You can also install `pytorch` and `torch_geometric` dependencies from PyPI to s
    similarly by selecting the appropriate versions in the official
    [PyG docs](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html)
 
-## Install fairchem-core
+## Standard installation of fairchem-core
 Install `fairchem-core` from PyPI
 ```bash
 pip install fairchem-core
 ```
 
-## Additional packages
-
+### Additional packages
 `fairchem` is a namespace package, meaning all packages are installed separately. If you need
 to install other packages you can do so by:
 ```bash
 pip install fairchem-{package-to-install}
 ```
+Available `fairchem` packages are `fairchem-core`, `fairchem-data-oc`, `fairchem-demo-ocpapi`, and `fairchem-applications-cattsunami`.
 
-## Development install
-
+## Development installation
 If you plan to make contributions you will need to fork and clone the repo (Windows users, please see the next section),
 set up the environment, and install fairchem-core from source in editable mode with dev dependencies,
 ```bash
 git clone https://github.com/FAIR-Chem/fairchem.git
 cd fairchem
 pip install -e packages/fairchem-core[dev]
+pytest tests/core
 ```
 
 And similarly for any other namespace package:

diff --git a/docs/legacy_tutorials/OCP_Tutorial.md b/docs/legacy_tutorials/OCP_Tutorial.md
@@ -1807,7 +1807,7 @@ Similarly, to predict forces, we pass edge features through a fully-connected la
 
 @registry.register_model("simple")
 class SimpleAtomEdgeModel(torch.nn.Module):
-    def __init__(self, num_atoms, bond_feat_dim, num_targets, emb_size=64, num_radial=64, cutoff=6.0, env_exponent=5):
+    def __init__(self, emb_size=64, num_radial=64, cutoff=6.0, env_exponent=5):
         super().__init__()
 
         self.radial_basis = RadialBasis(

diff --git a/docs/tutorials/NRR/NRR_example.md b/docs/tutorials/NRR/NRR_example.md
@@ -62,7 +62,7 @@ To do this, we will enumerate adsorbate-slab configurations and run ML relaxatio
 
 +++
 
-Be sure to set the path in `fairchem/data/oc/configs/paths.py` to point to the correct place or pass the paths as an argument. The database pickles can be found in `fairchem/data/oc/databases/pkls`. We will show one explicitly here as an example and then run all of them in an automated fashion for brevity.
+Be sure to set the path in `fairchem/data/oc/configs/paths.py` to point to the correct place or pass the paths as an argument. The database pickles can be found in `fairchem/data/oc/databases/pkls` (some pkl files are only downloaded by running the command `python src/fairchem/core/scripts/download_large_files.py oc` from the root of the fairchem repo). We will show one explicitly here as an example and then run all of them in an automated fashion for brevity.
 
 ```{code-cell} ipython3
 import fairchem.data.oc

diff --git a/docs/tutorials/advanced/fine-tuning-in-python.md b/docs/tutorials/advanced/fine-tuning-in-python.md
@@ -75,7 +75,7 @@ We start by making the config.yml. We build this from the calculator checkpoint.
 from fairchem.core.common.tutorial_utils import generate_yml_config
 
 yml = generate_yml_config(checkpoint_path, 'config.yml',
-                   delete=['slurm', 'cmd', 'logger', 'task', 'model_attributes',
+                   delete=['slurm', 'cmd', 'logger', 'task', 'model_attributes','optim.load_balancing',
                            'optim.loss_force', # the checkpoint setting causes an error
                            'dataset', 'test_dataset', 'val_dataset'],
                    update={'gpus': 1,

diff --git a/src/fairchem/applications/AdsorbML/README.md b/src/fairchem/applications/AdsorbML/README.md
@@ -21,7 +21,7 @@ NOTE - ASE trajectories exclude systems that were not converged or had invalid c
 |ASE Trajectories    |29G    |112G   | [ee937e5290f8f720c914dc9a56e0281f](https://dl.fbaipublicfiles.com/opencatalystproject/data/adsorbml/oc20_dense_trajectories.tar.gz)   |
 
 The following files are also provided to be used for evaluation and general information:
-* `oc20dense_mapping.pkl` : Mapping of the LMDB `sid` to general metadata information -
+* `oc20dense_mapping.pkl` : Mapping of the LMDB `sid` to general metadata information. If this file is not present, run the command `python src/fairchem/core/scripts/download_large_files.py adsorbml` from the root of the fairchem repo to download it. -
   * `system_id`: Unique system identifier for an adsorbate, bulk, surface combination.
   * `config_id`: Unique configuration identifier, where `rand` and `heur` correspond to random and heuristic initial configurations, respectively.
   * `mpid`: Materials Project bulk identifier.

diff --git a/src/fairchem/applications/AdsorbML/adsorbml/2023_neurips_challenge/challenge_eval.py b/src/fairchem/applications/AdsorbML/adsorbml/2023_neurips_challenge/challenge_eval.py
@@ -7,6 +7,8 @@
 
 import numpy as np
 
+from fairchem.core.scripts import download_large_files
+
 
 def is_successful(best_pred_energy, best_dft_energy, SUCCESS_THRESHOLD=0.1):
     """
@@ -161,6 +163,11 @@ def main():
 
     # targets and metadata are expected to be in
     # the same directory as this script
+    if (
+        not Path(__file__).with_name("oc20dense_val_targets.pkl").exists()
+        or not Path(__file__).with_name("ml_relaxed_dft_targets.pkl").exists()
+    ):
+        download_large_files.download_file_group("adsorbml")
     targets = pickle.load(
         open(Path(__file__).with_name("oc20dense_val_targets.pkl"), "rb")
     )