Merge branch 'main' into add_jacobian

globus-labs · Dec 4, 2023 · 9e82cc6 · 9e82cc6
2 parents 68a5589 + d5f72c1
commit 9e82cc6
Show file tree

Hide file tree

Showing 26 changed files with 3,051 additions and 701 deletions.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -21,7 +21,15 @@ jobs:
       uses: conda-incubator/setup-miniconda@v2
       with:
         environment-file: envs/environment-cpu.yml
-        mamba-version: "*"
+        activate-environment: test
+        auto-activate-base: true
+        auto-update-conda: false
+        remove-profiles: true
+        architecture: x64
+        clean-patched-environment-file: true
+        run-post: true
+        use-mamba: true
+        miniforge-version: latest
     - name: Install test dependencies
       run: |
         pip install -e .[test]

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,5 @@
+repos:
+- repo: https://github.com/kynan/nbstripout
+  rev: 0.6.1
+  hooks:
+    - id: nbstripout
diff --git a/envs/environment-cpu.yml b/envs/environment-cpu.yml
@@ -4,7 +4,6 @@ name: jitterbug
 channels:
   - defaults
   - conda-forge
-  - pytorch
   - conda-forge/label/libint_dev
 dependencies:
   - python==3.9.*
@@ -26,10 +25,10 @@ dependencies:
   - geometric
 
   # Use Conda PyTorch to avoid OpenMP disagreement with other libraries
-  - pytorch==2.0.*
-  - cpuonly
+  - pytorch==2.0.*=*cpu*
 
   - pip
   - pip:
     - git+https://gitlab.com/ase/ase.git  # Needed for MOPAC
+    - gpytorch==1.11.*
     - -e ..[test]
diff --git a/jitterbug/compare.py b/jitterbug/compare.py
@@ -0,0 +1,85 @@
+"""Tools for assessing the quality of a Hessian compared to a true one"""
+from dataclasses import dataclass
+
+import ase
+from ase import units
+import numpy as np
+from ase.vibrations import VibrationsData
+from pmutt.statmech import StatMech, presets
+
+
+@dataclass
+class HessianQuality:
+    """Measurements of the quality of a Hessian"""
+
+    # Thermodynamics
+    zpe: float
+    """Zero point energy (kcal/mol)"""
+    zpe_error: float
+    """Different between the ZPE and the target one"""
+    cp: list[float]
+    """Heat capacity as a function of temperature (units: kcal/mol/K)"""
+    cp_error: list[float]
+    """Difference between known and approximate heat capacity as a function of temperature (units: kcal/mol/K)"""
+    h: list[float]
+    """Enthalpy as a function of temperature (units: kcal/mol)"""
+    h_error: list[float]
+    """Error between known and approximate enthalpy as a function of temperature (units: kcal/mol)"""
+    temps: list[float]
+    """Temperatures at which Cp was evaluated (units: K)"""
+
+    # Vibrations
+    vib_freqs: list[float]
+    """Vibrational frequencies for our hessian (units: cm^-1)"""
+    vib_errors: list[float]
+    """Error between each frequency and the corresponding mode in the known hessian"""
+    vib_mae: float
+    """Mean absolute error for the vibrational modes"""
+
+
+def compare_hessians(atoms: ase.Atoms, known_hessian: np.ndarray, approx_hessian: np.ndarray) -> HessianQuality:
+    """Compare two different hessians for same atomic structure
+
+    Args:
+        atoms: Structure
+        known_hessian: 2D form of the target Hessian
+        approx_hessian: 2D form of an approximate Hessian
+    Returns:
+        Collection of the performance metrics
+    """
+
+    # Start by making a vibration data object
+    known_vibs: VibrationsData = VibrationsData.from_2d(atoms, known_hessian)
+    approx_vibs: VibrationsData = VibrationsData.from_2d(atoms, approx_hessian)
+
+    # Compare the vibrational frequencies on the non-zero modes
+    known_freqs = known_vibs.get_frequencies()
+    is_real = np.isreal(known_freqs)
+    approx_freqs = approx_vibs.get_frequencies()
+    freq_error = np.subtract(approx_freqs[is_real], known_freqs[is_real])
+    freq_mae = np.abs(freq_error).mean()
+
+    # Compare the enthalpy and heat capacity
+    #  TODO (wardlt): Might actually want to compute the symmetry number
+    known_harm = StatMech(vib_wavenumbers=np.real(known_freqs[is_real]), atoms=atoms, symmetrynumber=1, **presets['harmonic'])
+    approx_harm = StatMech(vib_wavenumbers=np.real(approx_freqs[is_real]), atoms=atoms, symmetrynumber=1, **presets['harmonic'])
+
+    temps = np.linspace(1., 373, 128)
+    known_cp = np.array([known_harm.get_Cp('kcal/mol/K', T=t) for t in temps])
+    approx_cp = np.array([approx_harm.get_Cp('kcal/mol/K', T=t) for t in temps])
+    known_h = np.array([known_harm.get_H('kcal/mol', T=t) for t in temps])
+    approx_h = np.array([approx_harm.get_H('kcal/mol', T=t) for t in temps])
+
+    # Assemble into a result object
+    return HessianQuality(
+        zpe=approx_vibs.get_zero_point_energy() * units.mol / units.kcal,
+        zpe_error=(approx_vibs.get_zero_point_energy() - known_vibs.get_zero_point_energy()) * units.mol / units.kcal,
+        vib_freqs=np.real(approx_freqs[is_real]).tolist(),
+        vib_errors=np.abs(freq_error),
+        vib_mae=freq_mae,
+        cp=approx_cp.tolist(),
+        cp_error=(known_cp - approx_cp).tolist(),
+        h=approx_h,
+        h_error=(known_h - approx_h).tolist(),
+        temps=temps.tolist()
+    )
diff --git a/jitterbug/model/base.py b/jitterbug/model/base.py
@@ -1,6 +1,15 @@
 """Base class defining the key methods"""
+from tempfile import TemporaryDirectory
+from typing import List, Optional
+from random import choices
+from pathlib import Path
+import os
+
 import numpy as np
+from scipy import stats
 from ase import Atoms
+from ase.vibrations import Vibrations
+from ase.calculators.calculator import Calculator
 
 
 class EnergyModel:
@@ -35,3 +44,89 @@ def sample_hessians(self, model: object, num_samples: int) -> list[np.ndarray]:
             A list of 2D hessians
         """
         raise NotImplementedError()
+
+
+class ASEEnergyModel(EnergyModel):
+    """Energy models which produce a series of ASE :class:`~ase.calculators.calculator.Calculator` objects"""
+
+    def __init__(self, reference: Atoms, num_calculators: int, scratch_dir: Optional[Path] = None):
+        self.reference = reference
+        self.scratch_dir = scratch_dir
+        self.num_calculators = num_calculators
+
+    def train_calculator(self, data: List[Atoms]) -> Calculator:
+        """Train one of the constituent calculators
+
+        Args:
+            data: Training data
+        Returns:
+            Retrained calculator
+        """
+        raise NotImplementedError()
+
+    def train(self, data: list[Atoms]) -> List[Calculator]:
+        # Train each on a different bootstrap sample
+        output = []
+        for _ in range(self.num_calculators):
+            sampled_data = choices(data, k=len(data))
+            output.append(self.train_calculator(sampled_data))
+        return output
+
+    def compute_hessian(self, mol: Atoms, calc: Calculator) -> np.ndarray:
+        """Compute the Hessian using one of the calculators
+
+        Args:
+            mol: Molecule to be evaluated
+            calc: Calculator to use
+        Returns:
+            2D Hessian matrix
+        """
+        # Clone the molecule to avoid any cross-talk
+        mol = mol.copy()
+
+        with TemporaryDirectory(dir=self.scratch_dir) as td:
+            start_dir = Path.cwd()
+            try:
+                # Run the vibrations calculation in a temporary directory
+                os.chdir(td)
+                mol.calc = calc
+                vib = Vibrations(mol, name='mbtr-temp')
+                vib.run()
+
+                # Return only the 2D Hessian
+                return vib.get_vibrations().get_hessian_2d()
+            finally:
+                os.chdir(start_dir)
+
+    def mean_hessian(self, models: list[Calculator]) -> np.ndarray:
+        # Run all calculators
+        hessians = [self.compute_hessian(self.reference, calc) for calc in models]
+
+        # Return the mean
+        return np.mean(hessians, axis=0)
+
+    def sample_hessians(self, models: list[Calculator], num_samples: int) -> list[np.ndarray]:
+        # Run all calculators
+        hessians = [self.compute_hessian(self.reference, calc) for calc in models]
+
+        # Compute the mean and covariance for each parameter
+        triu_ind = np.triu_indices(hessians[0].shape[0])
+        hessians_flat = [h[triu_ind] for h in hessians]
+        hessian_mean = np.mean(hessians_flat, axis=0)
+        hessian_covar = np.cov(hessians_flat, rowvar=False)
+
+        # Generate samples
+        hessian_mvn = stats.multivariate_normal(hessian_mean, hessian_covar, allow_singular=True)
+        diag_indices = np.diag_indices(hessians[0].shape[0])
+        output = []
+        for sample in hessian_mvn.rvs(size=num_samples):
+            # Fill in a 2D version
+            sample_hessian = np.zeros_like(hessians[0])
+            sample_hessian[triu_ind] = sample
+
+            # Make it symmetric
+            sample_hessian += sample_hessian.T
+            sample_hessian[diag_indices] /= 2
+
+            output.append(sample_hessian)
+        return output
diff --git a/jitterbug/model/dscribe/__init__.py b/jitterbug/model/dscribe/__init__.py
@@ -0,0 +1 @@
+"""Energy models using `DScribe <https://singroup.github.io/dscribe/latest/index.html>`_"""
diff --git a/jitterbug/model/dscribe/globald.py b/jitterbug/model/dscribe/globald.py
@@ -0,0 +1,87 @@
+"""Models which use a single vector to describe a molecule.
+
+Such approaches, such as MBTR, present a simple "molecule->energy" learning problem.
+Other methods, such as SOAP, provided atomic-level features that must require an
+extra step "molecule->atoms->energy/atom->energy".
+
+We train the models using sklearn for this example.
+"""
+from typing import List
+
+from dscribe.descriptors.descriptorglobal import DescriptorGlobal
+from sklearn.linear_model import LinearRegression
+from sklearn.base import BaseEstimator, clone
+from dscribe.descriptors import MBTR
+from ase.calculators.calculator import Calculator, all_changes
+from ase import Atoms
+import numpy as np
+
+from jitterbug.model.base import ASEEnergyModel
+
+
+class DScribeGlobalCalculator(Calculator):
+    """A learnable forcefield based on global fingerprints computed using DScribe"""
+
+    implemented_properties = ['energy', 'forces']
+    default_parameters = {
+        'descriptor': MBTR(
+            species=["H", "C", "N", "O"],
+            geometry={"function": "inverse_distance"},
+            grid={"min": 0, "max": 1, "n": 100, "sigma": 0.1},
+            weighting={"function": "exp", "scale": 0.5, "threshold": 1e-3},
+            periodic=False,
+            normalization="l2",
+        ),
+        'model': LinearRegression(),
+        'offset': 0.,  # Normalizing parameters
+        'scale': 1.
+    }
+
+    def calculate(self, atoms=None, properties=('energy', 'forces'), system_changes=all_changes):
+        # Compute the energy using the learned model
+        desc = self.parameters['descriptor'].create_single(atoms)
+        energy_unscaled = self.parameters['model'].predict(desc[None, :])
+        self.results['energy'] = energy_unscaled[0] * self.parameters['scale'] + self.parameters['offset']
+
+        # If desired, compute forces numerically
+        if 'forces' in properties:
+            # calculate_numerical_forces use that the calculation of the input atoms,
+            #  even though it is a method of a calculator and not of the input atoms :shrug:
+            temp_atoms: Atoms = atoms.copy()
+            temp_atoms.calc = self
+            self.results['forces'] = self.calculate_numerical_forces(temp_atoms)
+
+
+class DScribeGlobalEnergyModel(ASEEnergyModel):
+    """Compute energy using a scikit-learn model that computes energies from global descriptors
+
+    Args:
+        reference: Reference structure at which we compute the Hessian
+        descriptors: Tool used to compute descriptors
+        model: Scikit-learn model to use to compute energies given descriptors
+        num_calculators: Number of models to use in ensemble
+    """
+
+    def __init__(self, reference: Atoms, descriptors: DescriptorGlobal, model: BaseEstimator, num_calculators: int):
+        super().__init__(reference, num_calculators)
+        self.desc = descriptors
+        self.model = model
+
+    def train_calculator(self, data: List[Atoms]) -> Calculator:
+        # Make a copy of the model
+        model: BaseEstimator = clone(self.model)
+
+        # Compute the descriptors then train
+        train_x = self.desc.create(data)
+        train_y = np.array([a.get_potential_energy() for a in data])
+        scale_y, offset_y = np.std(train_y), np.mean(train_y)
+        train_y = (train_y - offset_y) / scale_y
+        model.fit(train_x, train_y)
+
+        # Assemble into a Calculator
+        return DScribeGlobalCalculator(
+            descriptor=self.desc,
+            model=model,
+            offset=offset_y,
+            scale=scale_y,
+        )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Energy models using `DScribe <https://singroup.github.io/dscribe/latest/index.html>`_"""