Skip to content

Commit

Permalink
Refactor the global descriptor class
Browse files Browse the repository at this point in the history
  • Loading branch information
WardLT committed Oct 16, 2023
1 parent 0a2a687 commit a6318ab
Show file tree
Hide file tree
Showing 4 changed files with 223 additions and 109 deletions.
95 changes: 95 additions & 0 deletions jitterbug/model/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
"""Base class defining the key methods"""
from tempfile import TemporaryDirectory
from typing import List, Optional
from random import choices
from pathlib import Path
import os

import numpy as np
from scipy import stats
from ase import Atoms
from ase.vibrations import Vibrations
from ase.calculators.calculator import Calculator


class EnergyModel:
Expand Down Expand Up @@ -35,3 +44,89 @@ def sample_hessians(self, model: object, num_samples: int) -> list[np.ndarray]:
A list of 2D hessians
"""
raise NotImplementedError()


class ASEEnergyModel(EnergyModel):
"""Energy models which produce a series of ASE :class:`~ase.calculators.calculator.Calculator` objects"""

def __init__(self, reference: Atoms, num_calculators: int, scratch_dir: Optional[Path] = None):
self.reference = reference
self.scratch_dir = scratch_dir
self.num_calculators = num_calculators

def train_calculator(self, data: List[Atoms]) -> Calculator:
"""Train one of the constituent calculators
Args:
data: Training data
Returns:
Retrained calculator
"""
raise NotImplementedError()

def train(self, data: list[Atoms]) -> List[Calculator]:
# Train each on a different bootstrap sample
output = []
for _ in range(self.num_calculators):
sampled_data = choices(data, k=len(data))
output.append(self.train_calculator(sampled_data))
return output

def compute_hessian(self, mol: Atoms, calc: Calculator) -> np.ndarray:
"""Compute the Hessian using one of the calculators
Args:
mol: Molecule to be evaluated
calc: Calculator to use
Returns:
2D Hessian matrix
"""
# Clone the molecule to avoid any cross-talk
mol = mol.copy()

with TemporaryDirectory(dir=self.scratch_dir) as td:
start_dir = Path.cwd()
try:
# Run the vibrations calculation in a temporary directory
os.chdir(td)
mol.calc = calc
vib = Vibrations(mol, name='mbtr-temp')
vib.run()

# Return only the 2D Hessian
return vib.get_vibrations().get_hessian_2d()
finally:
os.chdir(start_dir)

def mean_hessian(self, models: list[Calculator]) -> np.ndarray:
# Run all calculators
hessians = [self.compute_hessian(self.reference, calc) for calc in models]

# Return the mean
return np.mean(hessians, axis=0)

def sample_hessians(self, models: list[Calculator], num_samples: int) -> list[np.ndarray]:
# Run all calculators
hessians = [self.compute_hessian(self.reference, calc) for calc in models]

# Compute the mean and covariance for each parameter
triu_ind = np.triu_indices(hessians[0].shape[0])
hessians_flat = [h[triu_ind] for h in hessians]
hessian_mean = np.mean(hessians_flat, axis=0)
hessian_covar = np.cov(hessians_flat, rowvar=False)

# Generate samples
hessian_mvn = stats.multivariate_normal(hessian_mean, hessian_covar, allow_singular=True)
diag_indices = np.diag_indices(hessians[0].shape[0])
output = []
for sample in hessian_mvn.rvs(size=num_samples):
# Fill in a 2D version
sample_hessian = np.zeros_like(hessians[0])
sample_hessian[triu_ind] = sample

# Make it symmetric
sample_hessian += sample_hessian.T
sample_hessian[diag_indices] /= 2

output.append(sample_hessian)
return output
87 changes: 87 additions & 0 deletions jitterbug/model/dscribe/globald.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Models which use a single vector to describe a molecule.
Such approaches, such as MBTR, present a simple "molecule->energy" learning problem.
Other methods, such as SOAP, provided atomic-level features that must require an
extra step "molecule->atoms->energy/atom->energy".
We train the models using sklearn for this example.
"""
from typing import List

from dscribe.descriptors.descriptorglobal import DescriptorGlobal
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, clone
from dscribe.descriptors import MBTR
from ase.calculators.calculator import Calculator, all_changes
from ase import Atoms
import numpy as np

from jitterbug.model.base import ASEEnergyModel


class DScribeGlobalCalculator(Calculator):
"""A learnable forcefield based on global fingerprints computed using DScribe"""

implemented_properties = ['energy', 'forces']
default_parameters = {
'descriptor': MBTR(
species=["H", "C", "N", "O"],
geometry={"function": "inverse_distance"},
grid={"min": 0, "max": 1, "n": 100, "sigma": 0.1},
weighting={"function": "exp", "scale": 0.5, "threshold": 1e-3},
periodic=False,
normalization="l2",
),
'model': LinearRegression(),
'offset': 0., # Normalizing parameters
'scale': 1.
}

def calculate(self, atoms=None, properties=('energy', 'forces'), system_changes=all_changes):
# Compute the energy using the learned model
desc = self.parameters['descriptor'].create_single(atoms)
energy_unscaled = self.parameters['model'].predict(desc[None, :])
self.results['energy'] = energy_unscaled[0] * self.parameters['scale'] + self.parameters['offset']

# If desired, compute forces numerically
if 'forces' in properties:
# calculate_numerical_forces use that the calculation of the input atoms,
# even though it is a method of a calculator and not of the input atoms :shrug:
temp_atoms: Atoms = atoms.copy()
temp_atoms.calc = self
self.results['forces'] = self.calculate_numerical_forces(temp_atoms)


class DScribeGlobalEnergyModel(ASEEnergyModel):
"""Compute energy using a scikit-learn model that computes energies from global descriptors
Args:
reference: Reference structure at which we compute the Hessian
descriptors: Tool used to compute descriptors
model: Scikit-learn model to use to compute energies given descriptors
num_calculators: Number of models to use in ensemble
"""

def __init__(self, reference: Atoms, descriptors: DescriptorGlobal, model: BaseEstimator, num_calculators: int):
super().__init__(reference, num_calculators)
self.desc = descriptors
self.model = model

def train_calculator(self, data: List[Atoms]) -> Calculator:
# Make a copy of the model
model: BaseEstimator = clone(self.model)

# Compute the descriptors then train
train_x = self.desc.create(data)
train_y = np.array([a.get_potential_energy() for a in data])
scale_y, offset_y = np.std(train_y), np.mean(train_y)
train_y = (train_y - offset_y) / scale_y
model.fit(train_x, train_y)

# Assemble into a Calculator
return DScribeGlobalCalculator(
descriptor=self.desc,
model=model,
offset=offset_y,
scale=scale_y,
)
96 changes: 0 additions & 96 deletions jitterbug/model/dscribe/single.py

This file was deleted.

54 changes: 41 additions & 13 deletions tests/models/test_mbtr.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,35 @@
"""Test for a MBTR-based energy model"""
import numpy as np

from jitterbug.model.dscribe.single import DScribeGlobalCalculator, DScribeGlobalEnergyModel


def test_model(train_set):
from pytest import fixture
from dscribe.descriptors import MBTR
from sklearn.linear_model import LinearRegression

from jitterbug.model.dscribe.globald import DScribeGlobalEnergyModel


@fixture()
def model(train_set):
return DScribeGlobalEnergyModel(
reference=train_set[0],
descriptors=MBTR(
species=["H", "C", "N", "O"],
geometry={"function": "inverse_distance"},
grid={"min": 0, "max": 1, "n": 100, "sigma": 0.1},
weighting={"function": "exp", "scale": 0.5, "threshold": 1e-3},
periodic=False,
normalization="l2",
),
model=LinearRegression(),
num_calculators=8
)


def test_calculator(train_set, model):
# Create then fit the model
calc = DScribeGlobalCalculator()
calc.train(train_set)
calcs = model.train(train_set)

# Predict the energy (we should be close!)
calc = calcs[0]
test_atoms = train_set[0].copy()
test_atoms.calc = calc
energy = test_atoms.get_potential_energy()
Expand All @@ -18,14 +38,22 @@ def test_model(train_set):
# See if force calculation works
forces = test_atoms.get_forces()
assert forces.shape == (3, 3) # At least make sure we get the right shape, values are iffy
assert not np.isclose(forces, 0).all()


def test_hessian(train_set):
def test_hessian(train_set, model):
"""See if we can compute the Hessian"""
calc = DScribeGlobalCalculator()
model = DScribeGlobalEnergyModel(calc, train_set[0])

# Run the fitting
hess_model = model.train(train_set)
hess = model.mean_hessian(hess_model)
assert hess.shape == (9, 9)
calcs = model.train(train_set)

# Test the mean hessian function
mean_hess = model.mean_hessian(calcs)
assert mean_hess.shape == (9, 9), 'Wrong shape'
assert np.isclose(mean_hess, mean_hess.T).all(), 'Not symmetric'

# Test the sampling
sampled_hess = model.sample_hessians(calcs, 128)
assert all(np.isclose(hess, hess.T).all() for hess in sampled_hess)
mean_sampled_hess = np.mean(sampled_hess, 0)
assert np.isclose(np.diag(mean_sampled_hess), np.diag(mean_hess), atol=5).mean() > 0.5 # Make sure most agree

0 comments on commit a6318ab

Please sign in to comment.