use internal coordinates in linear model (linear_internals.py) (#25)

* see freqs * Add computing Hessians using MBTR forcefield (#8) * Add a ASE calculator for MBTR * Move back to Py3.9 for dscribe I get an error "from collections import Iterable" stemming from the sparse module used by dscribe * Add force computation * Add ability to compute Hessian from MBTR * Remove unused import * Add an example notebook with MBTR * Adjust the scale to be a unit normal Should reduce the likelihood of numerical issues * Update proof-of-concept with more data, better KRR * Allow use of MOPAC in proof of concept (#13) * Separate notebooks for exact answer, steps in fitting (#14) * Add utility for making calculator Avoid copying code between notebooks * Separate exact, sampling and approximation * Accept "None" as an option * Sample as if computing numerical derivatives (#17) * Start notebook for displacing along axes * Add code for combining perturbations into single tasks * Add example of using this new data * Refactor and add linear terms Fixes #2 Fixes #4 * Make the second-order terms 1/2 the first order * Update with the newer model * Update the proof of concept notebooks * see freqs * internal coordinate model * flake8 cleanup * added pytest for internals * more flake8 cleanup * geometric package to convert to internal coordinates --------- Co-authored-by: snelliott <[email protected]> Co-authored-by: Logan Ward <[email protected]> Co-authored-by: Logan Ward <[email protected]>
globus-labs · Dec 4, 2023 · 0d8df9a · 0d8df9a
1 parent 536bf7c
commit 0d8df9a
Show file tree

Hide file tree

Showing 103 changed files with 36,927 additions and 228 deletions.
diff --git a/envs/environment-cpu.yml b/envs/environment-cpu.yml
@@ -7,7 +7,7 @@ channels:
   - pytorch
   - conda-forge/label/libint_dev
 dependencies:
-  - python==3.10.*
+  - python==3.9.*
 
   # Standard data analysis tools
   - pandas==1.*
@@ -18,12 +18,18 @@ dependencies:
   - scikit-learn
 
   # Quantum chemistry
-  - psi4==1.8.*
+  - psi4==1.8.*  # Remove for MacOS systems
+  - mopac
+
+  # Interatomic forcefields
+  - dscribe==2.1.*
+  - geometric
 
   # Use Conda PyTorch to avoid OpenMP disagreement with other libraries
   - pytorch==2.0.*
   - cpuonly
 
   - pip
   - pip:
+    - git+https://gitlab.com/ase/ase.git  # Needed for MOPAC
     - -e ..[test]
diff --git a/jitterbug/model/base.py b/jitterbug/model/base.py
@@ -16,11 +16,13 @@ def train(self, data: list[Atoms]) -> object:
         """
         raise NotImplementedError()
 
-    def mean_hessian(self, model: object) -> list[np.ndarray]:
+    def mean_hessian(self, model: object) -> np.ndarray:
         """Produce the most-likely Hessian given the model
 
         Args:
-            model: Model trained by this
+            model: Model trained by this class
+        Returns:
+            The most-likely Hessian given the model
         """
 
     def sample_hessians(self, model: object, num_samples: int) -> list[np.ndarray]:

diff --git a/jitterbug/model/linear.py b/jitterbug/model/linear.py
@@ -26,7 +26,6 @@ def get_model_inputs(atoms: Atoms, reference: Atoms) -> np.ndarray:
     # Compute the displacements and the products of displacement
     disp_matrix = (atoms.positions - reference.positions).flatten()
     disp_prod_matrix = disp_matrix[:, None] * disp_matrix[None, :]
-
     # Multiply the off-axis terms by two, as they appear twice in the energy model
     n_terms = len(atoms) * 3
     off_diag = np.triu_indices(n_terms, k=1)
@@ -71,7 +70,6 @@ def train(self, data: list[Atoms]) -> LinearModel:
         # Y: Subtract off the reference energy
         ref_energy = self.reference.get_potential_energy()
         y = [atoms.get_potential_energy() - ref_energy for atoms in data]
-
         # Fit the ARD model and ensure it captures the data well
         model = self.regressor(fit_intercept=False).fit(x, y)
         pred = model.predict(x)
@@ -129,4 +127,6 @@ def _params_to_hessian(self, param: np.ndarray) -> np.ndarray:
         hessian[triu_inds] = param[n_coords:]  # The first n_coords terms are the linear part
         hessian[off_diag_triu_inds] /= 2
         hessian.T[triu_inds] = hessian[triu_inds]
+        # v = np.sqrt(self.reference.get_masses()).repeat(3).reshape(-1, 1)
+        # hessian /= np.dot(v, v.T)
         return hessian
diff --git a/jitterbug/model/linear_internals.py b/jitterbug/model/linear_internals.py
@@ -0,0 +1,207 @@
+"""Models which treat each term in the Hessian as independent
+
+We achieve a approximate Hessian by conditioning a linear model
+on the assumption that the Hessian contains parameters which are zero
+and that smaller parameters are more likely than larger
+"""
+import numpy as np
+from ase import Atoms
+from ase import io as aseio
+from geometric.molecule import Molecule
+from geometric.internal import DelocalizedInternalCoordinates as DIC
+from sklearn.linear_model import ARDRegression
+from sklearn.linear_model._base import LinearModel
+from .base import EnergyModel
+import geometric
+
+
+def get_internal_diff(ref_int: DIC, ref_mol: Molecule, atoms: Atoms) -> np.ndarray:
+    """Finds the differences in the values of the internal coordinates for two
+    of geometries.
+    Args:
+        ref_int: delocalized internal coordinate object (geometric package)
+        ref_mol: molecule object of minimum (geometric package)
+        atoms: molecule object of perturbation structure (ASE package)
+    Returns:
+        Array of displacements, in internal coordinates
+    """
+    # convert ASE to geometric molecule object
+    # this i/o is major slowdown, there has to be a better way
+    filename = 'tmp.xyz'
+    aseio.write(filename, atoms, 'xyz')
+    pert_mol = Molecule(filename, 'xyz')
+
+    ref_coords = ref_mol.xyzs[0].flatten()
+    pert_coords = pert_mol.xyzs[0].flatten()
+    int_diff = ref_int.calcDiff(pert_coords, ref_coords)
+    return (int_diff)
+
+# def get_internal_coords(atoms: Atoms) -> np.ndarray:
+#     #filename = 'tmp.xyz'
+#     #aseio.write(filename, atoms, 'xyz')
+#     #molecule = geometric.molecule.Molecule(filename, 'xyz')
+#     #IC = geometric.internal.DelocalizedInternalCoordinates(molecule, build=True, remove_tr=True)
+#     #coords = molecule.xyzs[0].flatten()#*geometric.nifty.ang2bohr
+#     #print('without rts')
+#     #print(IC.Prims)
+#     #print(IC.calculate(coords))
+#     ##IC.remove_TR(coords)
+#     ##print('without rts')
+#     ##print(IC.Prims)
+#     ##print(IC.calculate(coords))
+#     #return IC.calculate(coords)
+
+
+def get_model_internal_inputs(atoms: Atoms, ref_mol: Molecule, ref_IC: DIC) -> np.ndarray:
+    """Sets up first and second-order displacement vectors for a perturbation geometry
+    Args:
+        atoms: molecule object of perturbation structure (ASE package)
+        ref_mol: molecule object of minimum (geometric package)
+        ref_IC: delocalized internal coordinate object (geometric package)
+    Returns:
+        Array of Jacobian and Hessian displacements, in internal coordinates
+    """
+    # Compute the displacements and the products of displacement
+    disp_matrix = get_internal_diff(ref_IC, ref_mol, atoms)
+    disp_prod_matrix = disp_matrix[:, None] * disp_matrix[None, :]
+    n_terms = len(atoms) * 3 - 6
+    off_diag = np.triu_indices(n_terms, k=1)
+    disp_prod_matrix[off_diag] *= 2.
+    # Append the displacements and products of displacements
+    return np.concatenate([
+        disp_matrix,
+        disp_prod_matrix[np.triu_indices(n_terms)] / 2
+    ], axis=0)
+
+
+def get_model_inputs(atoms: Atoms, reference: Atoms) -> np.ndarray:
+    """Get the inputs for the model, which are derived from the displacements
+    of the structure with respect to a reference.
+
+    Args:
+        atoms: Displaced structure
+        reference: Reference structure
+    Returns:
+        Vector of displacements in the same order as the
+    """
+
+    # Compute the displacements and the products of displacement
+    disp_matrix = (atoms.positions - reference.positions).flatten()
+    disp_prod_matrix = disp_matrix[:, None] * disp_matrix[None, :]
+    # Multiply the off-axis terms by two, as they appear twice in the energy model
+    n_terms = len(atoms) * 3
+    off_diag = np.triu_indices(n_terms, k=1)
+    disp_prod_matrix[off_diag] *= 2.
+    # Append the displacements and products of displacements
+    return np.concatenate([
+        disp_matrix,
+        disp_prod_matrix[np.triu_indices(n_terms)] / 2
+    ], axis=0)
+
+
+class HarmonicModel(EnergyModel):
+    """Expresses energy as a Harmonic model (i.e., 2nd degree Taylor series)
+
+    Contains a total of :math:`3N + 3N(3N+1)/2` terms in total, where :math:`N`
+    is the number of atoms in the molecule. The first :math:`3N` correspond to the
+    linear terms of the model, which are known as the Jacobian matrix, and the
+    latter are from the quadratic terms, which are half of the symmetric Hessian matrix.
+
+    Implicitly treats all terms of the model as unrelated, which is the worst case
+    for trying to fit the energy of a molecule. However, it is still possible to fit
+    the model with a reduced number of terms if we assume that most terms are near zero.
+
+    The energy model is:
+
+    :math:`E = E_0 + \\sum_i J_i \\delta_i + \\frac{1}{2}\\sum_{i,j} H_{i,j}\\delta_i\\delta_j`
+
+    Args:
+        reference: Fully-relaxed structure used as the reference
+        regressor: LinearRegression class used to learn model
+    """
+
+    def __init__(self, reference: Atoms, regressor: type[LinearModel] = ARDRegression):
+        self.reference = reference
+        self.regressor = regressor
+
+    def train(self, data: list[Atoms]) -> LinearModel:
+        # Convert ASE object to geometric molecule object
+        filename = 'tmp.xyz'
+        aseio.write(filename, self.reference, 'xyz')
+        molecule = Molecule(filename, 'xyz')
+        # Build internal coordinates object
+        IC = DIC(molecule, build=True, remove_tr=True)
+        # X: Displacement vectors for each
+        x = [get_model_internal_inputs(atoms, molecule, IC) for atoms in data]
+        # Y: Subtract off the reference energy
+        ref_energy = self.reference.get_potential_energy()
+        y = [atoms.get_potential_energy() - ref_energy for atoms in data]
+        # Fit the ARD model and ensure it captures the data well
+        model = self.regressor(fit_intercept=True).fit(x, y)
+        pred = model.predict(x)
+        max_error = np.abs(pred - y).max()
+        if max_error > 0.002:
+            raise ValueError(f'Model error exceeds 1 meV. Actual: {max_error:.2e}')
+
+        return model
+
+    def mean_hessian(self, model: LinearModel) -> np.ndarray:
+        return self._params_to_hessian(model.coef_)
+
+    def sample_hessians(self, model: LinearModel, num_samples: int) -> list[np.ndarray]:
+        # Get the covariance matrix
+        if not hasattr(model, 'sigma_'):  # pragma: no-coverage
+            raise ValueError(f'Sampling only possible with Bayesian regressors. You trained a {type(model)}')
+        if isinstance(model, ARDRegression):
+            # The sigma matrix may be zero for high-precision terms
+            n_terms = len(model.coef_)
+            nonzero_terms = model.lambda_ < model.threshold_lambda
+
+            # Replace those terms (Thanks: https://stackoverflow.com/a/73176327/2593278)
+            sigma = np.zeros((n_terms, n_terms))
+            sub_sigma = sigma[nonzero_terms, :]
+            sub_sigma[:, nonzero_terms] = model.sigma_
+            sigma[nonzero_terms, :] = sub_sigma
+        else:
+            sigma = model.sigma_
+
+        # Sample the model parameters
+        params = np.random.multivariate_normal(model.coef_, sigma, size=num_samples)
+
+        # Assemble them into Hessians
+        output = []
+        for param in params:
+            hessian = self._params_to_hessian(param)
+            output.append(hessian)
+        return output
+
+    def _params_to_hessian(self, param: np.ndarray) -> np.ndarray:
+        """Convert the parameters for the linear model into a Hessian
+
+        Args:
+            param: Coefficients of the linear model
+        Returns:
+            The harmonic terms expressed as a Hessian matrix
+        """
+        # Get the parameters
+        filename = 'tmp.xyz'
+        aseio.write(filename, self.reference, 'xyz')
+        n_coords = len(self.reference) * 3 - 6
+        triu_inds = np.triu_indices(n_coords)
+        off_diag_triu_inds = np.triu_indices(n_coords, k=1)
+
+        # Assemble the hessian
+        hessian = np.zeros((n_coords, n_coords))
+        gradq = np.zeros(n_coords)
+        gradq = param[:n_coords]
+        hessian[triu_inds] = param[n_coords:]  # The first n_coords terms are the linear part
+        hessian[off_diag_triu_inds] /= 2
+        hessian.T[triu_inds] = hessian[triu_inds]
+        molecule = Molecule(filename, 'xyz')
+        IC = DIC(molecule, build=True, remove_tr=True)
+        coords = molecule.xyzs[0].flatten()*geometric.nifty.ang2bohr
+        hessian_cart = IC.calcHessCart(coords,  gradq, hessian)
+        # print(IC.Internals)
+        # print(IC.Prims)
+        # print(geometric.normal_modes.frequency_analysis(coords, hessian_cart, elem = molecule.elem))
+        return hessian_cart
diff --git a/jitterbug/model/mbtr.py b/jitterbug/model/mbtr.py
@@ -0,0 +1,96 @@
+"""Learn a potential energy surface with the MBTR representation
+
+MBTR is an easy route for learning a forcefield because it represents
+a molecule as a single vector, which means we can case the learning
+problem as a simple "molecule->energy" learning problem. Other methods,
+such as SOAP, provided atomic-level features that must require an
+extra step "molecule->atoms->energy/atom->energy".
+"""
+from shutil import rmtree
+
+from ase.calculators.calculator import Calculator, all_changes
+from ase.vibrations import Vibrations
+from ase import Atoms
+from sklearn.linear_model import LinearRegression
+from dscribe.descriptors import MBTR
+import numpy as np
+
+from jitterbug.model.base import EnergyModel
+
+
+class MBTRCalculator(Calculator):
+    """A learnable forcefield based on GPR and fingerprints computed using DScribe"""
+
+    implemented_properties = ['energy', 'forces']
+    default_parameters = {
+        'descriptor': MBTR(
+            species=["H", "C", "N", "O"],
+            geometry={"function": "inverse_distance"},
+            grid={"min": 0, "max": 1, "n": 100, "sigma": 0.1},
+            weighting={"function": "exp", "scale": 0.5, "threshold": 1e-3},
+            periodic=False,
+            normalization="l2",
+        ),
+        'model': LinearRegression(),
+        'intercept': 0.,  # Normalizing parameters
+        'scale': 0.
+    }
+
+    def calculate(self, atoms=None, properties=('energy', 'forces'), system_changes=all_changes):
+        # Compute the energy using the learned model
+        desc = self.parameters['descriptor'].create_single(atoms)
+        energy_no_int = self.parameters['model'].predict(desc[None, :])
+        self.results['energy'] = energy_no_int[0] * self.parameters['scale'] + self.parameters['intercept']
+
+        # If desired, compute forces numerically
+        if 'forces' in properties:
+            # calculate_numerical_forces use that the calculation of the input atoms,
+            #  even though it is a method of a calculator and not of the input atoms :shrug:
+            temp_atoms: Atoms = atoms.copy()
+            temp_atoms.calc = self
+            self.results['forces'] = self.calculate_numerical_forces(temp_atoms)
+
+    def train(self, train_set: list[Atoms]):
+        """Train the embedded forcefield object
+
+        Args:
+            train_set: List of Atoms objects containing at least the energy
+        """
+
+        # Determine the mean energy and subtract it off
+        energies = np.array([atoms.get_potential_energy() for atoms in train_set])
+        self.parameters['intercept'] = energies.mean()
+        energies -= self.parameters['intercept']
+        self.parameters['scale'] = energies.std()
+        energies /= self.parameters['scale']
+
+        # Compute the descriptors and use them to fit the model
+        desc = self.parameters['descriptor'].create(train_set)
+        self.parameters['model'].fit(desc, energies)
+
+
+class MBTREnergyModel(EnergyModel):
+    """Use the MBTR representation to model the potential energy surface
+
+    Args:
+        calc: Calculator used to fit the potential energy surface
+        reference: Reference structure at which we compute the Hessian
+    """
+
+    def __init__(self, calc: MBTRCalculator, reference: Atoms):
+        super().__init__()
+        self.calc = calc
+        self.reference = reference
+
+    def train(self, data: list[Atoms]) -> MBTRCalculator:
+        self.calc.train(data)
+        return self.calc
+
+    def mean_hessian(self, model: MBTRCalculator) -> np.ndarray:
+        self.reference.calc = model
+        try:
+            vib = Vibrations(self.reference, name='mbtr-temp')
+            vib.run()
+            return vib.get_vibrations().get_hessian_2d()
+        finally:
+            rmtree('mbtr-temp', ignore_errors=True)