Skip to content

Commit

Permalink
Add CLI support for approximate Hessians (#35)
Browse files Browse the repository at this point in the history
* Add initial sampler

We'll probably use it in our bigger tests anyway

* Initial draft of the static thinker

* Finish CLI support

* Make a log statement less verbose
  • Loading branch information
WardLT authored Dec 11, 2023
1 parent 5e178af commit cc6db49
Show file tree
Hide file tree
Showing 11 changed files with 382 additions and 17 deletions.
41 changes: 33 additions & 8 deletions jitterbug/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,17 @@
from colmena.task_server import ParslTaskServer
from parsl import Config, HighThroughputExecutor

from jitterbug.model.dscribe import make_global_mbtr_model
from jitterbug.parsl import get_energy, load_configuration
from jitterbug.thinkers.exact import ExactHessianThinker
from jitterbug.thinkers.static import ApproximateHessianThinker
from jitterbug.utils import make_calculator
from jitterbug.sampler import methods, UniformSampler

logger = logging.getLogger(__name__)

approx_methods = list(methods.keys())


def main(args: Optional[list[str]] = None):
"""Run Jitterbug"""
Expand All @@ -28,7 +33,14 @@ def main(args: Optional[list[str]] = None):
'to store information about charged or radical molecules')
parser.add_argument('--method', nargs=2, required=True,
help='Method to use to compute energies. Format: [method] [basis]. Example: B3LYP 6-31g*')
parser.add_argument('--exact', help='Compute Hessian using numerical derivatives', action='store_true')
parser.add_argument('--approach', default='exact',
choices=['exact', 'static'],
help='Method used to compute the Hessian. Either "exact", or'
'"static" for a approximate method using a fixed set of structures.')
parser.add_argument('--amount-to-run', default=0.1, type=float,
help='Amount of structures to run for approximate Hessian. '
'Either number (if >1) or fraction of number required to compute exact Hessian,'
'`6N + 2 * (3N) * (3N - 1) + 1`.')
parser.add_argument('--parsl-config', help='Path to the Parsl configuration to use')
args = parser.parse_args(args)

Expand All @@ -39,11 +51,7 @@ def main(args: Optional[list[str]] = None):

# Make the run directory
method, basis = (x.lower() for x in args.method)
if args.exact:
compute_name = 'exact'
else:
raise NotImplementedError()
run_dir = Path('run') / xyz_name / f'{method}_{basis}_{compute_name}'
run_dir = Path('run') / xyz_name / f'{method}_{basis}_{args.approach}'
run_dir.mkdir(parents=True, exist_ok=True)

# Start logging
Expand Down Expand Up @@ -92,14 +100,31 @@ def main(args: Optional[list[str]] = None):

# Create a thinker
queues = PipeQueues(topics=['simulation'])
if args.exact:
functions = [] # Additional functions needed by thinker
if args.approach == 'exact':
thinker = ExactHessianThinker(
queues=queues,
atoms=atoms,
run_dir=run_dir,
num_workers=num_workers,
)
functions = [] # No other functions to run
elif args.approach == 'static':
# Determine the number to run
exact_needs = len(atoms) * 3 * 2 + (len(atoms) * 3) * (len(atoms) * 3 - 1) * 2 + 1
if args.amount_to_run < 1:
num_to_run = int(args.amount_to_run * exact_needs)
else:
num_to_run = int(args.amount_to_run)
logger.info(f'Running {num_to_run} energies out of {exact_needs} required for exact Hessian')
thinker = ApproximateHessianThinker(
queues=queues,
atoms=atoms,
run_dir=run_dir,
num_workers=num_workers,
num_to_run=num_to_run,
sampler=UniformSampler(), # TODO (wardlt): Make this configurable
model=make_global_mbtr_model(atoms) # TODO (wardlt): Make this configurable
)
else:
raise NotImplementedError()

Expand Down
40 changes: 40 additions & 0 deletions jitterbug/model/dscribe/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,41 @@
"""Energy models using `DScribe <https://singroup.github.io/dscribe/latest/index.html>`_"""
import ase
from dscribe.descriptors.mbtr import MBTR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_ridge import KernelRidge
import numpy as np

from .globald import DScribeGlobalEnergyModel


def make_global_mbtr_model(ref_atoms: ase.Atoms, n_points: int = 8, cutoff: float = 6.) -> DScribeGlobalEnergyModel:
"""Make an MBTR model using scikit-learn
Args:
ref_atoms: Reference atoms to use for the model
n_points: Number of points to include in the MBTR grid
cutoff: Cutoff distance for the descriptors (units: Angstrom)
Returns:
Energy model, ready to be trained
"""
species = list(set(ref_atoms.get_chemical_symbols()))
desc = MBTR(
species=species,
geometry={"function": "angle"},
grid={"min": 0., "max": 180, "n": n_points, "sigma": 180. / n_points / 2.},
weighting={"function": "smooth_cutoff", "r_cut": cutoff, "threshold": 1e-3},
periodic=False,
)
model = Pipeline(
[('scale', StandardScaler()),
('krr', GridSearchCV(KernelRidge(kernel='rbf', alpha=1e-10),
{'gamma': np.logspace(-5, 5, 32)}))]
)
return DScribeGlobalEnergyModel(
reference=ref_atoms,
model=model,
descriptors=desc,
num_calculators=2
)
9 changes: 9 additions & 0 deletions jitterbug/sampler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Functions to sample atomic configurations"""
from typing import Type

from .base import StructureSampler
from .random import UniformSampler

methods: dict[str, Type[StructureSampler]] = {
'uniform': UniformSampler
}
26 changes: 26 additions & 0 deletions jitterbug/sampler/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import ase


class StructureSampler:
"""Base class for generating structures used to train Hessian model
Options for the sampler should be defined in the initializer.
"""

@property
def name(self) -> str:
"""Name for the sampling strategy"""
raise NotImplementedError()

def produce_structures(self, atoms: ase.Atoms, count: int, seed: int = 1) -> list[ase.Atoms]:
"""Generate a set of training structure
Args:
atoms: Unperturbed geometry
count: Number of structure to produce
seed: Random seed
Returns:
List of structures to be evaluated
"""

raise NotImplementedError()
38 changes: 38 additions & 0 deletions jitterbug/sampler/random.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Simple, friendly, random sampling"""
from dataclasses import dataclass

import numpy as np
import ase

from jitterbug.sampler.base import StructureSampler


@dataclass
class UniformSampler(StructureSampler):
"""Sample randomly-chosen directions
Perturbs each atom in each direction a random amount between -:attr:`step_size` and :attr:`step_size`.
"""

step_size: float = 0.005
"""Amount to displace the atoms (units: Angstrom)"""

@property
def name(self) -> str:
return f'uniform_{self.step_size:.3e}'

def produce_structures(self, atoms: ase.Atoms, count: int, seed: int = 1) -> list[ase.Atoms]:
# Make the RNG
n_atoms = len(atoms)
rng = np.random.RandomState(seed + n_atoms)

output = []
for _ in range(count):
# Sample a perturbation
disp = rng.normal(-self.step_size, self.step_size, size=(n_atoms, 3))

# Make the new atoms
new_atoms = atoms.copy()
new_atoms.positions += disp
output.append(new_atoms)
return output
39 changes: 39 additions & 0 deletions jitterbug/thinkers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from pathlib import Path

import ase
import numpy as np
from colmena.queue import ColmenaQueues
from colmena.thinker import BaseThinker, ResourceCounter


class HessianThinker(BaseThinker):
"""Base class for thinkers
Implementations must write their simulation data to the same spot"""

atoms: ase.Atoms
"""Unperturbed atomic structure"""

run_dir: Path
"""Path to the run directory"""
result_file: Path
"""Path to file in which to store result records"""

def __init__(self, queues: ColmenaQueues, rec: ResourceCounter, run_dir: Path, atoms: ase.Atoms):
super().__init__(queues, rec)
self.atoms = atoms

# Prepare for outputs
self.run_dir = run_dir
self.run_dir.mkdir(exist_ok=True)
self.result_file = run_dir / 'simulation-results.json'

def compute_hessian(self) -> np.ndarray:
"""Compute the Hessian using finite differences
Returns:
Hessian in the 2D form
Raises:
(ValueError) If there is missing data
"""
raise NotImplementedError()
11 changes: 4 additions & 7 deletions jitterbug/thinkers/exact.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,19 @@
import numpy as np
from colmena.models import Result
from colmena.queue import ColmenaQueues
from colmena.thinker import BaseThinker, ResourceCounter, agent, result_processor
from colmena.thinker import ResourceCounter, agent, result_processor

from jitterbug.thinkers.base import HessianThinker
from jitterbug.utils import read_from_string


class ExactHessianThinker(BaseThinker):
class ExactHessianThinker(HessianThinker):
"""Schedule the calculation of a complete set of numerical derivatives"""

def __init__(self, queues: ColmenaQueues, num_workers: int, atoms: ase.Atoms, run_dir: Path, step_size: float = 0.005):
super().__init__(queues, ResourceCounter(num_workers))
self.atoms = atoms
super().__init__(queues, ResourceCounter(num_workers), run_dir, atoms)

# Initialize storage for the energies
self.result_file = run_dir / 'simulation-results.json'
self.step_size = step_size
self.unperturbed_energy: Optional[float] = None
self.single_perturb = np.zeros((len(atoms), 3, 2)) * np.nan # Perturbation of a single direction. [atom_id, axis (xyz), dir_id (0 back, 1 forward)]
Expand All @@ -30,8 +29,6 @@ def __init__(self, queues: ColmenaQueues, num_workers: int, atoms: ase.Atoms, ru

# Load what has been run already
self.total_complete = 0
self.run_dir = run_dir
self.run_dir.mkdir(exist_ok=True)
self.energy_path = self.run_dir / 'unperturbed.energy'
self.single_path = self.run_dir / 'single_energies.csv'
self.double_path = self.run_dir / 'double_energies.csv'
Expand Down
108 changes: 108 additions & 0 deletions jitterbug/thinkers/static.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Approach which uses a static set of structures to compute Hessian"""
from pathlib import Path

import ase
import numpy as np
from ase.db import connect
from colmena.models import Result
from colmena.queue import ColmenaQueues
from colmena.thinker import ResourceCounter, agent, result_processor

from .base import HessianThinker
from jitterbug.sampler.base import StructureSampler
from ..model.base import EnergyModel
from ..utils import read_from_string


class ApproximateHessianThinker(HessianThinker):
"""Approach which approximates a Hessian by computing it from a forcefield fit to structures
Saves structures to an ASE db and labels them with the name of the sampler and the index
"""

def __init__(self,
queues: ColmenaQueues,
num_workers: int,
atoms: ase.Atoms,
run_dir: Path,
sampler: StructureSampler,
num_to_run: int,
model: EnergyModel,
step_size: float = 0.005):
super().__init__(queues, ResourceCounter(num_workers), run_dir, atoms)
self.step_size = step_size
self.sampler = sampler
self.num_to_run = num_to_run
self.model = model

# Generate the structures to be sampled
self.to_sample = self.sampler.produce_structures(atoms, num_to_run)
sampler_name = self.sampler.name
self.logger.info(f'Generated {len(self.to_sample)} structures with strategy: {sampler_name}')

# Find how many we've done already
self.db_path = self.run_dir / 'atoms.db'
self.completed: set[int] = set()
with connect(self.db_path) as db:
for row in db.select(f'index<{self.num_to_run}', sampler=sampler_name):
atoms = row.toatoms(True)
ind = atoms.info['key_value_pairs']['index']
assert np.isclose(atoms.positions, self.to_sample[ind].positions).all(), f'Structure {ind} in the DB and generated structure are inconsistent'
self.completed.add(ind)
num_remaining = self.num_to_run - len(self.completed)
self.logger.info(f'Completed {len(self.completed)} structures already. Need to run {num_remaining} more')

@agent()
def submit_tasks(self):
"""Submit all required tasks then start the shutdown process by exiting"""

for ind, atoms in enumerate(self.to_sample):
# Skip structures which we've done already
if ind in self.completed:
continue

# Submit it
self.rec.acquire(None, 1)
self.queues.send_inputs(
atoms,
method='get_energy',
task_info={'index': ind}
)

@result_processor
def store_energy(self, result: Result):
"""Store the energy in the appropriate files"""
self.rec.release()

# Store the result object to disk
with self.result_file.open('a') as fp:
print(result.json(exclude={'inputs'}), file=fp)

if not result.success:
self.logger.warning(f'Calculation failed due to {result.failure_info.exception}')
return

# Store the result into the ASE database
sampler_name = self.sampler.name
index = result.task_info['index']
atoms = read_from_string(result.value, 'extxyz')
assert np.isclose(result.args[0].positions, atoms.positions).all()
self.completed.add(index)
with connect(self.db_path) as db:
db.write(atoms, sampler=sampler_name, index=index)
self.logger.info(f'Saved completed structure. Progress: {len(self.completed)}/{self.num_to_run}'
f' ({len(self.completed) / self.num_to_run * 100:.2f}%)')

def compute_hessian(self) -> np.ndarray:
# Load the models
atoms = []
with connect(self.db_path) as db:
for row in db.select(f'index<{self.num_to_run}', sampler=self.sampler.name):
atoms.append(row.toatoms())
self.logger.info(f'Pulled {len(atoms)} atoms for a training set')

# Fit the model
model = self.model.train(atoms)
self.logger.info('Completed model fitting')

return self.model.mean_hessian(model)
Loading

0 comments on commit cc6db49

Please sign in to comment.