globus-labs · WardLT · Oct 18, 2023 · Oct 4, 2023 · Oct 4, 2023 · Oct 5, 2023
diff --git a/envs/environment-cpu.yml b/envs/environment-cpu.yml
@@ -4,7 +4,6 @@ name: jitterbug
 channels:
   - defaults
   - conda-forge
-  - pytorch
   - conda-forge/label/libint_dev
 dependencies:
   - python==3.9.*
@@ -31,4 +30,5 @@ dependencies:
   - pip
   - pip:
     - git+https://gitlab.com/ase/ase.git  # Needed for MOPAC
+    - gpytorch==1.11.*
     - -e ..[test]
diff --git a/jitterbug/model/base.py b/jitterbug/model/base.py
@@ -1,6 +1,15 @@
 """Base class defining the key methods"""
+from tempfile import TemporaryDirectory
+from typing import List, Optional
+from random import choices
+from pathlib import Path
+import os
+
 import numpy as np
+from scipy import stats
 from ase import Atoms
+from ase.vibrations import Vibrations
+from ase.calculators.calculator import Calculator
 
 
 class EnergyModel:
@@ -35,3 +44,89 @@ def sample_hessians(self, model: object, num_samples: int) -> list[np.ndarray]:
             A list of 2D hessians
         """
         raise NotImplementedError()
+
+
+class ASEEnergyModel(EnergyModel):
+    """Energy models which produce a series of ASE :class:`~ase.calculators.calculator.Calculator` objects"""
+
+    def __init__(self, reference: Atoms, num_calculators: int, scratch_dir: Optional[Path] = None):
+        self.reference = reference
+        self.scratch_dir = scratch_dir
+        self.num_calculators = num_calculators
+
+    def train_calculator(self, data: List[Atoms]) -> Calculator:
+        """Train one of the constituent calculators
+
+        Args:
+            data: Training data
+        Returns:
+            Retrained calculator
+        """
+        raise NotImplementedError()
+
+    def train(self, data: list[Atoms]) -> List[Calculator]:
+        # Train each on a different bootstrap sample
+        output = []
+        for _ in range(self.num_calculators):
+            sampled_data = choices(data, k=len(data))
+            output.append(self.train_calculator(sampled_data))
+        return output
+
+    def compute_hessian(self, mol: Atoms, calc: Calculator) -> np.ndarray:
+        """Compute the Hessian using one of the calculators
+
+        Args:
+            mol: Molecule to be evaluated
+            calc: Calculator to use
+        Returns:
+            2D Hessian matrix
+        """
+        # Clone the molecule to avoid any cross-talk
+        mol = mol.copy()
+
+        with TemporaryDirectory(dir=self.scratch_dir) as td:
+            start_dir = Path.cwd()
+            try:
+                # Run the vibrations calculation in a temporary directory
+                os.chdir(td)
+                mol.calc = calc
+                vib = Vibrations(mol, name='mbtr-temp')
+                vib.run()
+
+                # Return only the 2D Hessian
+                return vib.get_vibrations().get_hessian_2d()
+            finally:
+                os.chdir(start_dir)
+
+    def mean_hessian(self, models: list[Calculator]) -> np.ndarray:
+        # Run all calculators
+        hessians = [self.compute_hessian(self.reference, calc) for calc in models]
+
+        # Return the mean
+        return np.mean(hessians, axis=0)
+
+    def sample_hessians(self, models: list[Calculator], num_samples: int) -> list[np.ndarray]:
+        # Run all calculators
+        hessians = [self.compute_hessian(self.reference, calc) for calc in models]
+
+        # Compute the mean and covariance for each parameter
+        triu_ind = np.triu_indices(hessians[0].shape[0])
+        hessians_flat = [h[triu_ind] for h in hessians]
+        hessian_mean = np.mean(hessians_flat, axis=0)
+        hessian_covar = np.cov(hessians_flat, rowvar=False)
+
+        # Generate samples
+        hessian_mvn = stats.multivariate_normal(hessian_mean, hessian_covar, allow_singular=True)
+        diag_indices = np.diag_indices(hessians[0].shape[0])
+        output = []
+        for sample in hessian_mvn.rvs(size=num_samples):
+            # Fill in a 2D version
+            sample_hessian = np.zeros_like(hessians[0])
+            sample_hessian[triu_ind] = sample
+
+            # Make it symmetric
+            sample_hessian += sample_hessian.T
+            sample_hessian[diag_indices] /= 2
+
+            output.append(sample_hessian)
+        return output
diff --git a/jitterbug/model/dscribe/__init__.py b/jitterbug/model/dscribe/__init__.py
@@ -0,0 +1 @@
+"""Energy models using `DScribe <https://singroup.github.io/dscribe/latest/index.html>`_"""
diff --git a/jitterbug/model/dscribe/globald.py b/jitterbug/model/dscribe/globald.py
@@ -0,0 +1,87 @@
+"""Models which use a single vector to describe a molecule.
+
+Such approaches, such as MBTR, present a simple "molecule->energy" learning problem.
+Other methods, such as SOAP, provided atomic-level features that must require an
+extra step "molecule->atoms->energy/atom->energy".
+
+We train the models using sklearn for this example.
+"""
+from typing import List
+
+from dscribe.descriptors.descriptorglobal import DescriptorGlobal
+from sklearn.linear_model import LinearRegression
+from sklearn.base import BaseEstimator, clone
+from dscribe.descriptors import MBTR
+from ase.calculators.calculator import Calculator, all_changes
+from ase import Atoms
+import numpy as np
+
+from jitterbug.model.base import ASEEnergyModel
+
+
+class DScribeGlobalCalculator(Calculator):
+    """A learnable forcefield based on global fingerprints computed using DScribe"""
+
+    implemented_properties = ['energy', 'forces']
+    default_parameters = {
+        'descriptor': MBTR(
+            species=["H", "C", "N", "O"],
+            geometry={"function": "inverse_distance"},
+            grid={"min": 0, "max": 1, "n": 100, "sigma": 0.1},
+            weighting={"function": "exp", "scale": 0.5, "threshold": 1e-3},
+            periodic=False,
+            normalization="l2",
+        ),
+        'model': LinearRegression(),
+        'offset': 0.,  # Normalizing parameters
+        'scale': 1.
+    }
+
+    def calculate(self, atoms=None, properties=('energy', 'forces'), system_changes=all_changes):
+        # Compute the energy using the learned model
+        desc = self.parameters['descriptor'].create_single(atoms)
+        energy_unscaled = self.parameters['model'].predict(desc[None, :])
+        self.results['energy'] = energy_unscaled[0] * self.parameters['scale'] + self.parameters['offset']
+
+        # If desired, compute forces numerically
+        if 'forces' in properties:
+            # calculate_numerical_forces use that the calculation of the input atoms,
+            #  even though it is a method of a calculator and not of the input atoms :shrug:
+            temp_atoms: Atoms = atoms.copy()
+            temp_atoms.calc = self
+            self.results['forces'] = self.calculate_numerical_forces(temp_atoms)
+
+
+class DScribeGlobalEnergyModel(ASEEnergyModel):
+    """Compute energy using a scikit-learn model that computes energies from global descriptors
+
+    Args:
+        reference: Reference structure at which we compute the Hessian
+        descriptors: Tool used to compute descriptors
+        model: Scikit-learn model to use to compute energies given descriptors
+        num_calculators: Number of models to use in ensemble
+    """
+
+    def __init__(self, reference: Atoms, descriptors: DescriptorGlobal, model: BaseEstimator, num_calculators: int):
+        super().__init__(reference, num_calculators)
+        self.desc = descriptors
+        self.model = model
+
+    def train_calculator(self, data: List[Atoms]) -> Calculator:
+        # Make a copy of the model
+        model: BaseEstimator = clone(self.model)
+
+        # Compute the descriptors then train
+        train_x = self.desc.create(data)
+        train_y = np.array([a.get_potential_energy() for a in data])
+        scale_y, offset_y = np.std(train_y), np.mean(train_y)
+        train_y = (train_y - offset_y) / scale_y
+        model.fit(train_x, train_y)
+
+        # Assemble into a Calculator
+        return DScribeGlobalCalculator(
+            descriptor=self.desc,
+            model=model,
+            offset=offset_y,
+            scale=scale_y,
+        )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Energy models using `DScribe <https://singroup.github.io/dscribe/latest/index.html>`_"""