Add computing Hessians using MBTR forcefield (#8)

* Add a ASE calculator for MBTR * Move back to Py3.9 for dscribe I get an error "from collections import Iterable" stemming from the sparse module used by dscribe * Add force computation * Add ability to compute Hessian from MBTR * Remove unused import * Add an example notebook with MBTR * Adjust the scale to be a unit normal Should reduce the likelihood of numerical issues * Update proof-of-concept with more data, better KRR
globus-labs · Sep 13, 2023 · b7d6b6a · b7d6b6a
1 parent 93f77b3
commit b7d6b6a
Show file tree

Hide file tree

Showing 8 changed files with 1,001 additions and 52 deletions.
diff --git a/envs/environment-cpu.yml b/envs/environment-cpu.yml
@@ -7,7 +7,7 @@ channels:
   - pytorch
   - conda-forge/label/libint_dev
 dependencies:
-  - python==3.10.*
+  - python==3.9.*
 
   # Standard data analysis tools
   - pandas==1.*
@@ -20,6 +20,9 @@ dependencies:
   # Quantum chemistry
   - psi4==1.8.*
 
+  # Interatomic forcefields
+  - dscribe==2.1.*
+
   # Use Conda PyTorch to avoid OpenMP disagreement with other libraries
   - pytorch==2.0.*
   - cpuonly

diff --git a/jitterbug/model/base.py b/jitterbug/model/base.py
@@ -16,11 +16,13 @@ def train(self, data: list[Atoms]) -> object:
         """
         raise NotImplementedError()
 
-    def mean_hessian(self, model: object) -> list[np.ndarray]:
+    def mean_hessian(self, model: object) -> np.ndarray:
         """Produce the most-likely Hessian given the model
 
         Args:
-            model: Model trained by this
+            model: Model trained by this class
+        Returns:
+            The most-likely Hessian given the model
         """
 
     def sample_hessians(self, model: object, num_samples: int) -> list[np.ndarray]:

diff --git a/jitterbug/model/mbtr.py b/jitterbug/model/mbtr.py
@@ -0,0 +1,96 @@
+"""Learn a potential energy surface with the MBTR representation
+
+MBTR is an easy route for learning a forcefield because it represents
+a molecule as a single vector, which means we can case the learning
+problem as a simple "molecule->energy" learning problem. Other methods,
+such as SOAP, provided atomic-level features that must require an
+extra step "molecule->atoms->energy/atom->energy".
+"""
+from shutil import rmtree
+
+from ase.calculators.calculator import Calculator, all_changes
+from ase.vibrations import Vibrations
+from ase import Atoms
+from sklearn.linear_model import LinearRegression
+from dscribe.descriptors import MBTR
+import numpy as np
+
+from jitterbug.model.base import EnergyModel
+
+
+class MBTRCalculator(Calculator):
+    """A learnable forcefield based on GPR and fingerprints computed using DScribe"""
+
+    implemented_properties = ['energy', 'forces']
+    default_parameters = {
+        'descriptor': MBTR(
+            species=["H", "C", "N", "O"],
+            geometry={"function": "inverse_distance"},
+            grid={"min": 0, "max": 1, "n": 100, "sigma": 0.1},
+            weighting={"function": "exp", "scale": 0.5, "threshold": 1e-3},
+            periodic=False,
+            normalization="l2",
+        ),
+        'model': LinearRegression(),
+        'intercept': 0.,  # Normalizing parameters
+        'scale': 0.
+    }
+
+    def calculate(self, atoms=None, properties=('energy', 'forces'), system_changes=all_changes):
+        # Compute the energy using the learned model
+        desc = self.parameters['descriptor'].create_single(atoms)
+        energy_no_int = self.parameters['model'].predict(desc[None, :])
+        self.results['energy'] = energy_no_int[0] * self.parameters['scale'] + self.parameters['intercept']
+
+        # If desired, compute forces numerically
+        if 'forces' in properties:
+            # calculate_numerical_forces use that the calculation of the input atoms,
+            #  even though it is a method of a calculator and not of the input atoms :shrug:
+            temp_atoms: Atoms = atoms.copy()
+            temp_atoms.calc = self
+            self.results['forces'] = self.calculate_numerical_forces(temp_atoms)
+
+    def train(self, train_set: list[Atoms]):
+        """Train the embedded forcefield object
+
+        Args:
+            train_set: List of Atoms objects containing at least the energy
+        """
+
+        # Determine the mean energy and subtract it off
+        energies = np.array([atoms.get_potential_energy() for atoms in train_set])
+        self.parameters['intercept'] = energies.mean()
+        energies -= self.parameters['intercept']
+        self.parameters['scale'] = energies.std()
+        energies /= self.parameters['scale']
+
+        # Compute the descriptors and use them to fit the model
+        desc = self.parameters['descriptor'].create(train_set)
+        self.parameters['model'].fit(desc, energies)
+
+
+class MBTREnergyModel(EnergyModel):
+    """Use the MBTR representation to model the potential energy surface
+
+    Args:
+        calc: Calculator used to fit the potential energy surface
+        reference: Reference structure at which we compute the Hessian
+    """
+
+    def __init__(self, calc: MBTRCalculator, reference: Atoms):
+        super().__init__()
+        self.calc = calc
+        self.reference = reference
+
+    def train(self, data: list[Atoms]) -> MBTRCalculator:
+        self.calc.train(data)
+        return self.calc
+
+    def mean_hessian(self, model: MBTRCalculator) -> np.ndarray:
+        self.reference.calc = model
+        try:
+            vib = Vibrations(self.reference, name='mbtr-temp')
+            vib.run()
+            return vib.get_vibrations().get_hessian_2d()
+        finally:
+            rmtree('mbtr-temp', ignore_errors=True)
diff --git a/notebooks/proof-of-concept/1_compute-random-offsets.ipynb b/notebooks/proof-of-concept/1_compute-random-offsets.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "c6a28419-6831-4197-8973-00c5591e19cb",
    "metadata": {
     "tags": []
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "c6be56c5-a460-4acd-9b89-8c3d9c812f5f",
    "metadata": {
     "tags": [
@@ -51,7 +51,7 @@
     "method = 'hf'\n",
     "basis = 'def2-svpd'\n",
     "threads = min(os.cpu_count(), 12)\n",
-    "step_size: float = 0.005 # Perturbation amount, used as maximum L2 norm"
+    "step_size: float = 0.01 # Perturbation amount, used as maximum L2 norm"
    ]
   },
   {
@@ -64,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "0b6794cd-477f-45a1-b96f-2332804ddb20",
    "metadata": {},
    "outputs": [],
@@ -83,12 +83,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "ad9fd725-b1ba-4fec-ae41-959be0e540b3",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Atoms(symbols='O2N4C8H10', pbc=False, forces=..., calculator=SinglePointCalculator(...))"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "atoms = read(Path('data') / 'exact' / f'{run_name}.xyz')\n",
     "atoms"
@@ -113,7 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "23502eea-0974-4248-8f19-e85447069c61",
    "metadata": {
     "tags": []
@@ -126,7 +137,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "bf1366fc-d9a7-4a98-b9c9-cb3a0209b406",
    "metadata": {
     "tags": []
@@ -146,7 +157,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "d4f21e81-5ec3-4877-a4d1-402077be2ee8",
    "metadata": {
     "tags": []
@@ -168,12 +179,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "id": "0915595d-133a-43df-84fc-4ff6a3b538ea",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "  Memory set to   3.815 GiB by Python driver.\n",
+      "  Threads set to 12 by Python driver.\n"
+     ]
+    }
+   ],
    "source": [
     "calc = Psi4(method=method, basis=basis, num_threads=threads, memory='4096MB')"
    ]
@@ -188,12 +209,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "e2a28593-2634-4bb7-ae5b-8f557937bda1",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Need to run 2701 calculations for full accuracy.\n"
+     ]
+    }
+   ],
    "source": [
     "n_atoms = len(atoms)\n",
     "to_compute = 3 * n_atoms + 3 * n_atoms * (3 * n_atoms + 1) // 2 + 1\n",
@@ -202,12 +231,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "id": "8bf40523-dcaa-4046-a9c6-74e35178e87f",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Already done 427. 2274 left to do.\n"
+     ]
+    }
+   ],
    "source": [
     "with connect(db_path) as db:\n",
     "    done = len(db)\n",
@@ -219,7 +256,31 @@
    "execution_count": null,
    "id": "a6fa1b33-defc-4b35-895d-052eb64453fb",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|                                                                                                                                                                                                                                                            | 0/2701 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "  Memory set to   3.815 GiB by Python driver.\n",
+      "  Threads set to 12 by Python driver.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                | 1600/2701 [2:06:01<2:33:24,  8.36s/it]"
+     ]
+    }
+   ],
    "source": [
     "pbar = tqdm(total=to_compute)\n",
     "pbar.update(done)\n",
@@ -271,7 +332,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.9.17"
   }
  },
  "nbformat": 4,