Refactor the global descriptor class

globus-labs · Oct 16, 2023 · a6318ab · a6318ab
1 parent 0a2a687
commit a6318ab
Show file tree

Hide file tree

Showing 4 changed files with 223 additions and 109 deletions.
diff --git a/jitterbug/model/base.py b/jitterbug/model/base.py
@@ -1,6 +1,15 @@
 """Base class defining the key methods"""
+from tempfile import TemporaryDirectory
+from typing import List, Optional
+from random import choices
+from pathlib import Path
+import os
+
 import numpy as np
+from scipy import stats
 from ase import Atoms
+from ase.vibrations import Vibrations
+from ase.calculators.calculator import Calculator
 
 
 class EnergyModel:
@@ -35,3 +44,89 @@ def sample_hessians(self, model: object, num_samples: int) -> list[np.ndarray]:
             A list of 2D hessians
         """
         raise NotImplementedError()
+
+
+class ASEEnergyModel(EnergyModel):
+    """Energy models which produce a series of ASE :class:`~ase.calculators.calculator.Calculator` objects"""
+
+    def __init__(self, reference: Atoms, num_calculators: int, scratch_dir: Optional[Path] = None):
+        self.reference = reference
+        self.scratch_dir = scratch_dir
+        self.num_calculators = num_calculators
+
+    def train_calculator(self, data: List[Atoms]) -> Calculator:
+        """Train one of the constituent calculators
+
+        Args:
+            data: Training data
+        Returns:
+            Retrained calculator
+        """
+        raise NotImplementedError()
+
+    def train(self, data: list[Atoms]) -> List[Calculator]:
+        # Train each on a different bootstrap sample
+        output = []
+        for _ in range(self.num_calculators):
+            sampled_data = choices(data, k=len(data))
+            output.append(self.train_calculator(sampled_data))
+        return output
+
+    def compute_hessian(self, mol: Atoms, calc: Calculator) -> np.ndarray:
+        """Compute the Hessian using one of the calculators
+
+        Args:
+            mol: Molecule to be evaluated
+            calc: Calculator to use
+        Returns:
+            2D Hessian matrix
+        """
+        # Clone the molecule to avoid any cross-talk
+        mol = mol.copy()
+
+        with TemporaryDirectory(dir=self.scratch_dir) as td:
+            start_dir = Path.cwd()
+            try:
+                # Run the vibrations calculation in a temporary directory
+                os.chdir(td)
+                mol.calc = calc
+                vib = Vibrations(mol, name='mbtr-temp')
+                vib.run()
+
+                # Return only the 2D Hessian
+                return vib.get_vibrations().get_hessian_2d()
+            finally:
+                os.chdir(start_dir)
+
+    def mean_hessian(self, models: list[Calculator]) -> np.ndarray:
+        # Run all calculators
+        hessians = [self.compute_hessian(self.reference, calc) for calc in models]
+
+        # Return the mean
+        return np.mean(hessians, axis=0)
+
+    def sample_hessians(self, models: list[Calculator], num_samples: int) -> list[np.ndarray]:
+        # Run all calculators
+        hessians = [self.compute_hessian(self.reference, calc) for calc in models]
+
+        # Compute the mean and covariance for each parameter
+        triu_ind = np.triu_indices(hessians[0].shape[0])
+        hessians_flat = [h[triu_ind] for h in hessians]
+        hessian_mean = np.mean(hessians_flat, axis=0)
+        hessian_covar = np.cov(hessians_flat, rowvar=False)
+
+        # Generate samples
+        hessian_mvn = stats.multivariate_normal(hessian_mean, hessian_covar, allow_singular=True)
+        diag_indices = np.diag_indices(hessians[0].shape[0])
+        output = []
+        for sample in hessian_mvn.rvs(size=num_samples):
+            # Fill in a 2D version
+            sample_hessian = np.zeros_like(hessians[0])
+            sample_hessian[triu_ind] = sample
+
+            # Make it symmetric
+            sample_hessian += sample_hessian.T
+            sample_hessian[diag_indices] /= 2
+
+            output.append(sample_hessian)
+        return output
diff --git a/jitterbug/model/dscribe/globald.py b/jitterbug/model/dscribe/globald.py
@@ -0,0 +1,87 @@
+"""Models which use a single vector to describe a molecule.
+
+Such approaches, such as MBTR, present a simple "molecule->energy" learning problem.
+Other methods, such as SOAP, provided atomic-level features that must require an
+extra step "molecule->atoms->energy/atom->energy".
+
+We train the models using sklearn for this example.
+"""
+from typing import List
+
+from dscribe.descriptors.descriptorglobal import DescriptorGlobal
+from sklearn.linear_model import LinearRegression
+from sklearn.base import BaseEstimator, clone
+from dscribe.descriptors import MBTR
+from ase.calculators.calculator import Calculator, all_changes
+from ase import Atoms
+import numpy as np
+
+from jitterbug.model.base import ASEEnergyModel
+
+
+class DScribeGlobalCalculator(Calculator):
+    """A learnable forcefield based on global fingerprints computed using DScribe"""
+
+    implemented_properties = ['energy', 'forces']
+    default_parameters = {
+        'descriptor': MBTR(
+            species=["H", "C", "N", "O"],
+            geometry={"function": "inverse_distance"},
+            grid={"min": 0, "max": 1, "n": 100, "sigma": 0.1},
+            weighting={"function": "exp", "scale": 0.5, "threshold": 1e-3},
+            periodic=False,
+            normalization="l2",
+        ),
+        'model': LinearRegression(),
+        'offset': 0.,  # Normalizing parameters
+        'scale': 1.
+    }
+
+    def calculate(self, atoms=None, properties=('energy', 'forces'), system_changes=all_changes):
+        # Compute the energy using the learned model
+        desc = self.parameters['descriptor'].create_single(atoms)
+        energy_unscaled = self.parameters['model'].predict(desc[None, :])
+        self.results['energy'] = energy_unscaled[0] * self.parameters['scale'] + self.parameters['offset']
+
+        # If desired, compute forces numerically
+        if 'forces' in properties:
+            # calculate_numerical_forces use that the calculation of the input atoms,
+            #  even though it is a method of a calculator and not of the input atoms :shrug:
+            temp_atoms: Atoms = atoms.copy()
+            temp_atoms.calc = self
+            self.results['forces'] = self.calculate_numerical_forces(temp_atoms)
+
+
+class DScribeGlobalEnergyModel(ASEEnergyModel):
+    """Compute energy using a scikit-learn model that computes energies from global descriptors
+
+    Args:
+        reference: Reference structure at which we compute the Hessian
+        descriptors: Tool used to compute descriptors
+        model: Scikit-learn model to use to compute energies given descriptors
+        num_calculators: Number of models to use in ensemble
+    """
+
+    def __init__(self, reference: Atoms, descriptors: DescriptorGlobal, model: BaseEstimator, num_calculators: int):
+        super().__init__(reference, num_calculators)
+        self.desc = descriptors
+        self.model = model
+
+    def train_calculator(self, data: List[Atoms]) -> Calculator:
+        # Make a copy of the model
+        model: BaseEstimator = clone(self.model)
+
+        # Compute the descriptors then train
+        train_x = self.desc.create(data)
+        train_y = np.array([a.get_potential_energy() for a in data])
+        scale_y, offset_y = np.std(train_y), np.mean(train_y)
+        train_y = (train_y - offset_y) / scale_y
+        model.fit(train_x, train_y)
+
+        # Assemble into a Calculator
+        return DScribeGlobalCalculator(
+            descriptor=self.desc,
+            model=model,
+            offset=offset_y,
+            scale=scale_y,
+        )
diff --git a/jitterbug/model/dscribe/single.py b/jitterbug/model/dscribe/single.py
diff --git a/tests/models/test_mbtr.py b/tests/models/test_mbtr.py
@@ -1,15 +1,35 @@
 """Test for a MBTR-based energy model"""
 import numpy as np
-
-from jitterbug.model.dscribe.single import DScribeGlobalCalculator, DScribeGlobalEnergyModel
-
-
-def test_model(train_set):
+from pytest import fixture
+from dscribe.descriptors import MBTR
+from sklearn.linear_model import LinearRegression
+
+from jitterbug.model.dscribe.globald import DScribeGlobalEnergyModel
+
+
+@fixture()
+def model(train_set):
+    return DScribeGlobalEnergyModel(
+        reference=train_set[0],
+        descriptors=MBTR(
+            species=["H", "C", "N", "O"],
+            geometry={"function": "inverse_distance"},
+            grid={"min": 0, "max": 1, "n": 100, "sigma": 0.1},
+            weighting={"function": "exp", "scale": 0.5, "threshold": 1e-3},
+            periodic=False,
+            normalization="l2",
+        ),
+        model=LinearRegression(),
+        num_calculators=8
+    )
+
+
+def test_calculator(train_set, model):
     # Create then fit the model
-    calc = DScribeGlobalCalculator()
-    calc.train(train_set)
+    calcs = model.train(train_set)
 
     # Predict the energy (we should be close!)
+    calc = calcs[0]
     test_atoms = train_set[0].copy()
     test_atoms.calc = calc
     energy = test_atoms.get_potential_energy()
@@ -18,14 +38,22 @@ def test_model(train_set):
     # See if force calculation works
     forces = test_atoms.get_forces()
     assert forces.shape == (3, 3)  # At least make sure we get the right shape, values are iffy
+    assert not np.isclose(forces, 0).all()
 
 
-def test_hessian(train_set):
+def test_hessian(train_set, model):
     """See if we can compute the Hessian"""
-    calc = DScribeGlobalCalculator()
-    model = DScribeGlobalEnergyModel(calc, train_set[0])
 
     # Run the fitting
-    hess_model = model.train(train_set)
-    hess = model.mean_hessian(hess_model)
-    assert hess.shape == (9, 9)
+    calcs = model.train(train_set)
+
+    # Test the mean hessian function
+    mean_hess = model.mean_hessian(calcs)
+    assert mean_hess.shape == (9, 9), 'Wrong shape'
+    assert np.isclose(mean_hess, mean_hess.T).all(), 'Not symmetric'
+
+    # Test the sampling
+    sampled_hess = model.sample_hessians(calcs, 128)
+    assert all(np.isclose(hess, hess.T).all() for hess in sampled_hess)
+    mean_sampled_hess = np.mean(sampled_hess, 0)
+    assert np.isclose(np.diag(mean_sampled_hess), np.diag(mean_hess), atol=5).mean() > 0.5  # Make sure most agree