diff --git a/jitterbug/cli.py b/jitterbug/cli.py index 9f0f271..c98bc8a 100644 --- a/jitterbug/cli.py +++ b/jitterbug/cli.py @@ -12,12 +12,17 @@ from colmena.task_server import ParslTaskServer from parsl import Config, HighThroughputExecutor +from jitterbug.model.dscribe import make_global_mbtr_model from jitterbug.parsl import get_energy, load_configuration from jitterbug.thinkers.exact import ExactHessianThinker +from jitterbug.thinkers.static import ApproximateHessianThinker from jitterbug.utils import make_calculator +from jitterbug.sampler import methods, UniformSampler logger = logging.getLogger(__name__) +approx_methods = list(methods.keys()) + def main(args: Optional[list[str]] = None): """Run Jitterbug""" @@ -28,7 +33,14 @@ def main(args: Optional[list[str]] = None): 'to store information about charged or radical molecules') parser.add_argument('--method', nargs=2, required=True, help='Method to use to compute energies. Format: [method] [basis]. Example: B3LYP 6-31g*') - parser.add_argument('--exact', help='Compute Hessian using numerical derivatives', action='store_true') + parser.add_argument('--approach', default='exact', + choices=['exact', 'static'], + help='Method used to compute the Hessian. Either "exact", or' + '"static" for a approximate method using a fixed set of structures.') + parser.add_argument('--amount-to-run', default=0.1, type=float, + help='Amount of structures to run for approximate Hessian. ' + 'Either number (if >1) or fraction of number required to compute exact Hessian,' + '`6N + 2 * (3N) * (3N - 1) + 1`.') parser.add_argument('--parsl-config', help='Path to the Parsl configuration to use') args = parser.parse_args(args) @@ -39,11 +51,7 @@ def main(args: Optional[list[str]] = None): # Make the run directory method, basis = (x.lower() for x in args.method) - if args.exact: - compute_name = 'exact' - else: - raise NotImplementedError() - run_dir = Path('run') / xyz_name / f'{method}_{basis}_{compute_name}' + run_dir = Path('run') / xyz_name / f'{method}_{basis}_{args.approach}' run_dir.mkdir(parents=True, exist_ok=True) # Start logging @@ -92,14 +100,31 @@ def main(args: Optional[list[str]] = None): # Create a thinker queues = PipeQueues(topics=['simulation']) - if args.exact: + functions = [] # Additional functions needed by thinker + if args.approach == 'exact': thinker = ExactHessianThinker( queues=queues, atoms=atoms, run_dir=run_dir, num_workers=num_workers, ) - functions = [] # No other functions to run + elif args.approach == 'static': + # Determine the number to run + exact_needs = len(atoms) * 3 * 2 + (len(atoms) * 3) * (len(atoms) * 3 - 1) * 2 + 1 + if args.amount_to_run < 1: + num_to_run = int(args.amount_to_run * exact_needs) + else: + num_to_run = int(args.amount_to_run) + logger.info(f'Running {num_to_run} energies out of {exact_needs} required for exact Hessian') + thinker = ApproximateHessianThinker( + queues=queues, + atoms=atoms, + run_dir=run_dir, + num_workers=num_workers, + num_to_run=num_to_run, + sampler=UniformSampler(), # TODO (wardlt): Make this configurable + model=make_global_mbtr_model(atoms) # TODO (wardlt): Make this configurable + ) else: raise NotImplementedError() diff --git a/jitterbug/model/dscribe/__init__.py b/jitterbug/model/dscribe/__init__.py index bfdf045..6d9ab56 100644 --- a/jitterbug/model/dscribe/__init__.py +++ b/jitterbug/model/dscribe/__init__.py @@ -1 +1,41 @@ """Energy models using `DScribe `_""" +import ase +from dscribe.descriptors.mbtr import MBTR +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import GridSearchCV +from sklearn.kernel_ridge import KernelRidge +import numpy as np + +from .globald import DScribeGlobalEnergyModel + + +def make_global_mbtr_model(ref_atoms: ase.Atoms, n_points: int = 8, cutoff: float = 6.) -> DScribeGlobalEnergyModel: + """Make an MBTR model using scikit-learn + + Args: + ref_atoms: Reference atoms to use for the model + n_points: Number of points to include in the MBTR grid + cutoff: Cutoff distance for the descriptors (units: Angstrom) + Returns: + Energy model, ready to be trained + """ + species = list(set(ref_atoms.get_chemical_symbols())) + desc = MBTR( + species=species, + geometry={"function": "angle"}, + grid={"min": 0., "max": 180, "n": n_points, "sigma": 180. / n_points / 2.}, + weighting={"function": "smooth_cutoff", "r_cut": cutoff, "threshold": 1e-3}, + periodic=False, + ) + model = Pipeline( + [('scale', StandardScaler()), + ('krr', GridSearchCV(KernelRidge(kernel='rbf', alpha=1e-10), + {'gamma': np.logspace(-5, 5, 32)}))] + ) + return DScribeGlobalEnergyModel( + reference=ref_atoms, + model=model, + descriptors=desc, + num_calculators=2 + ) diff --git a/jitterbug/sampler/__init__.py b/jitterbug/sampler/__init__.py new file mode 100644 index 0000000..2bcf863 --- /dev/null +++ b/jitterbug/sampler/__init__.py @@ -0,0 +1,9 @@ +"""Functions to sample atomic configurations""" +from typing import Type + +from .base import StructureSampler +from .random import UniformSampler + +methods: dict[str, Type[StructureSampler]] = { + 'uniform': UniformSampler +} diff --git a/jitterbug/sampler/base.py b/jitterbug/sampler/base.py new file mode 100644 index 0000000..5a0a648 --- /dev/null +++ b/jitterbug/sampler/base.py @@ -0,0 +1,26 @@ +import ase + + +class StructureSampler: + """Base class for generating structures used to train Hessian model + + Options for the sampler should be defined in the initializer. + """ + + @property + def name(self) -> str: + """Name for the sampling strategy""" + raise NotImplementedError() + + def produce_structures(self, atoms: ase.Atoms, count: int, seed: int = 1) -> list[ase.Atoms]: + """Generate a set of training structure + + Args: + atoms: Unperturbed geometry + count: Number of structure to produce + seed: Random seed + Returns: + List of structures to be evaluated + """ + + raise NotImplementedError() diff --git a/jitterbug/sampler/random.py b/jitterbug/sampler/random.py new file mode 100644 index 0000000..5471544 --- /dev/null +++ b/jitterbug/sampler/random.py @@ -0,0 +1,38 @@ +"""Simple, friendly, random sampling""" +from dataclasses import dataclass + +import numpy as np +import ase + +from jitterbug.sampler.base import StructureSampler + + +@dataclass +class UniformSampler(StructureSampler): + """Sample randomly-chosen directions + + Perturbs each atom in each direction a random amount between -:attr:`step_size` and :attr:`step_size`. + """ + + step_size: float = 0.005 + """Amount to displace the atoms (units: Angstrom)""" + + @property + def name(self) -> str: + return f'uniform_{self.step_size:.3e}' + + def produce_structures(self, atoms: ase.Atoms, count: int, seed: int = 1) -> list[ase.Atoms]: + # Make the RNG + n_atoms = len(atoms) + rng = np.random.RandomState(seed + n_atoms) + + output = [] + for _ in range(count): + # Sample a perturbation + disp = rng.normal(-self.step_size, self.step_size, size=(n_atoms, 3)) + + # Make the new atoms + new_atoms = atoms.copy() + new_atoms.positions += disp + output.append(new_atoms) + return output diff --git a/jitterbug/thinkers/base.py b/jitterbug/thinkers/base.py new file mode 100644 index 0000000..62bff03 --- /dev/null +++ b/jitterbug/thinkers/base.py @@ -0,0 +1,39 @@ +from pathlib import Path + +import ase +import numpy as np +from colmena.queue import ColmenaQueues +from colmena.thinker import BaseThinker, ResourceCounter + + +class HessianThinker(BaseThinker): + """Base class for thinkers + + Implementations must write their simulation data to the same spot""" + + atoms: ase.Atoms + """Unperturbed atomic structure""" + + run_dir: Path + """Path to the run directory""" + result_file: Path + """Path to file in which to store result records""" + + def __init__(self, queues: ColmenaQueues, rec: ResourceCounter, run_dir: Path, atoms: ase.Atoms): + super().__init__(queues, rec) + self.atoms = atoms + + # Prepare for outputs + self.run_dir = run_dir + self.run_dir.mkdir(exist_ok=True) + self.result_file = run_dir / 'simulation-results.json' + + def compute_hessian(self) -> np.ndarray: + """Compute the Hessian using finite differences + + Returns: + Hessian in the 2D form + Raises: + (ValueError) If there is missing data + """ + raise NotImplementedError() diff --git a/jitterbug/thinkers/exact.py b/jitterbug/thinkers/exact.py index 5ab9a5b..aae908a 100644 --- a/jitterbug/thinkers/exact.py +++ b/jitterbug/thinkers/exact.py @@ -7,20 +7,19 @@ import numpy as np from colmena.models import Result from colmena.queue import ColmenaQueues -from colmena.thinker import BaseThinker, ResourceCounter, agent, result_processor +from colmena.thinker import ResourceCounter, agent, result_processor +from jitterbug.thinkers.base import HessianThinker from jitterbug.utils import read_from_string -class ExactHessianThinker(BaseThinker): +class ExactHessianThinker(HessianThinker): """Schedule the calculation of a complete set of numerical derivatives""" def __init__(self, queues: ColmenaQueues, num_workers: int, atoms: ase.Atoms, run_dir: Path, step_size: float = 0.005): - super().__init__(queues, ResourceCounter(num_workers)) - self.atoms = atoms + super().__init__(queues, ResourceCounter(num_workers), run_dir, atoms) # Initialize storage for the energies - self.result_file = run_dir / 'simulation-results.json' self.step_size = step_size self.unperturbed_energy: Optional[float] = None self.single_perturb = np.zeros((len(atoms), 3, 2)) * np.nan # Perturbation of a single direction. [atom_id, axis (xyz), dir_id (0 back, 1 forward)] @@ -30,8 +29,6 @@ def __init__(self, queues: ColmenaQueues, num_workers: int, atoms: ase.Atoms, ru # Load what has been run already self.total_complete = 0 - self.run_dir = run_dir - self.run_dir.mkdir(exist_ok=True) self.energy_path = self.run_dir / 'unperturbed.energy' self.single_path = self.run_dir / 'single_energies.csv' self.double_path = self.run_dir / 'double_energies.csv' diff --git a/jitterbug/thinkers/static.py b/jitterbug/thinkers/static.py new file mode 100644 index 0000000..744c29d --- /dev/null +++ b/jitterbug/thinkers/static.py @@ -0,0 +1,108 @@ +"""Approach which uses a static set of structures to compute Hessian""" +from pathlib import Path + +import ase +import numpy as np +from ase.db import connect +from colmena.models import Result +from colmena.queue import ColmenaQueues +from colmena.thinker import ResourceCounter, agent, result_processor + +from .base import HessianThinker +from jitterbug.sampler.base import StructureSampler +from ..model.base import EnergyModel +from ..utils import read_from_string + + +class ApproximateHessianThinker(HessianThinker): + """Approach which approximates a Hessian by computing it from a forcefield fit to structures + + Saves structures to an ASE db and labels them with the name of the sampler and the index + """ + + def __init__(self, + queues: ColmenaQueues, + num_workers: int, + atoms: ase.Atoms, + run_dir: Path, + sampler: StructureSampler, + num_to_run: int, + model: EnergyModel, + step_size: float = 0.005): + super().__init__(queues, ResourceCounter(num_workers), run_dir, atoms) + self.step_size = step_size + self.sampler = sampler + self.num_to_run = num_to_run + self.model = model + + # Generate the structures to be sampled + self.to_sample = self.sampler.produce_structures(atoms, num_to_run) + sampler_name = self.sampler.name + self.logger.info(f'Generated {len(self.to_sample)} structures with strategy: {sampler_name}') + + # Find how many we've done already + self.db_path = self.run_dir / 'atoms.db' + self.completed: set[int] = set() + with connect(self.db_path) as db: + for row in db.select(f'index<{self.num_to_run}', sampler=sampler_name): + atoms = row.toatoms(True) + ind = atoms.info['key_value_pairs']['index'] + assert np.isclose(atoms.positions, self.to_sample[ind].positions).all(), f'Structure {ind} in the DB and generated structure are inconsistent' + self.completed.add(ind) + num_remaining = self.num_to_run - len(self.completed) + self.logger.info(f'Completed {len(self.completed)} structures already. Need to run {num_remaining} more') + + @agent() + def submit_tasks(self): + """Submit all required tasks then start the shutdown process by exiting""" + + for ind, atoms in enumerate(self.to_sample): + # Skip structures which we've done already + if ind in self.completed: + continue + + # Submit it + self.rec.acquire(None, 1) + self.queues.send_inputs( + atoms, + method='get_energy', + task_info={'index': ind} + ) + + @result_processor + def store_energy(self, result: Result): + """Store the energy in the appropriate files""" + self.rec.release() + + # Store the result object to disk + with self.result_file.open('a') as fp: + print(result.json(exclude={'inputs'}), file=fp) + + if not result.success: + self.logger.warning(f'Calculation failed due to {result.failure_info.exception}') + return + + # Store the result into the ASE database + sampler_name = self.sampler.name + index = result.task_info['index'] + atoms = read_from_string(result.value, 'extxyz') + assert np.isclose(result.args[0].positions, atoms.positions).all() + self.completed.add(index) + with connect(self.db_path) as db: + db.write(atoms, sampler=sampler_name, index=index) + self.logger.info(f'Saved completed structure. Progress: {len(self.completed)}/{self.num_to_run}' + f' ({len(self.completed) / self.num_to_run * 100:.2f}%)') + + def compute_hessian(self) -> np.ndarray: + # Load the models + atoms = [] + with connect(self.db_path) as db: + for row in db.select(f'index<{self.num_to_run}', sampler=self.sampler.name): + atoms.append(row.toatoms()) + self.logger.info(f'Pulled {len(atoms)} atoms for a training set') + + # Fit the model + model = self.model.train(atoms) + self.logger.info('Completed model fitting') + + return self.model.mean_hessian(model) diff --git a/tests/test_cli.py b/tests/test_cli.py index 09f6ae7..b1dfafb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -13,16 +13,26 @@ def test_exact_solver(file_dir, xyz): with open(devnull, 'w') as fo: with redirect_stdout(fo): main([ - str(file_dir / xyz), '--exact', '--method', 'hf', 'sto-3g' + str(file_dir / xyz), '--approach', 'exact', '--method', 'hf', 'sto-3g' ]) assert (Path('run') / xyz_name / 'hf_sto-3g_exact' / 'hessian.npy').exists() +@mark.parametrize('amount', [32., 0.5]) +def test_approx_solver(amount, xyz_path): + with open(devnull, 'w') as fo: + with redirect_stdout(fo): + main([ + str(xyz_path), '--approach', 'static', '--amount-to-run', str(amount), '--method', 'hf', 'sto-3g' + ]) + assert (Path('run') / xyz_path.name[:-4] / 'hf_sto-3g_static' / 'hessian.npy').exists() + + def test_parsl_path(xyz_path, file_dir): with open(devnull, 'w') as fo: with redirect_stdout(fo): main([ - str(xyz_path), '--exact', '--method', 'pm7', 'None', + str(xyz_path), '--approach', 'exact', '--method', 'pm7', 'None', '--parsl-config', str(file_dir / 'example_config.py') ]) assert (Path('run') / 'water' / 'pm7_none_exact' / 'hessian.npy').exists() diff --git a/tests/test_samplers.py b/tests/test_samplers.py new file mode 100644 index 0000000..82de96d --- /dev/null +++ b/tests/test_samplers.py @@ -0,0 +1,21 @@ +from pytest import mark +from ase.io import read +import numpy as np + +from jitterbug.sampler import methods + + +@mark.parametrize('method', ['uniform']) +def test_random(method, xyz_path): + """Make sure we get the same structures each time""" + + atoms = read(xyz_path) + sampler = methods[method]() + assert sampler.name.startswith(method) + + # Generate two batches + samples_1 = sampler.produce_structures(atoms, 4) + samples_2 = sampler.produce_structures(atoms, 8) + + for a1, a2 in zip(samples_1, samples_2): + assert np.isclose(a1.positions, a2.positions).all() diff --git a/tests/test_thinkers.py b/tests/test_thinkers.py index 10e097a..2957d90 100644 --- a/tests/test_thinkers.py +++ b/tests/test_thinkers.py @@ -10,8 +10,11 @@ from pytest import fixture, mark from jitterbug.compare import compare_hessians +from jitterbug.model.dscribe import make_global_mbtr_model from jitterbug.parsl import get_energy +from jitterbug.sampler import UniformSampler from jitterbug.thinkers.exact import ExactHessianThinker +from jitterbug.thinkers.static import ApproximateHessianThinker from jitterbug.utils import make_calculator @@ -33,6 +36,11 @@ def ase_hessian(atoms, tmp_path) -> np.ndarray: return vib.get_vibrations().get_hessian_2d() +@fixture() +def mbtr(atoms): + return make_global_mbtr_model(atoms) + + @fixture(autouse=True) def task_server(queues): # Make the function @@ -93,3 +101,47 @@ def test_exact(atoms, queues, tmpdir, ase_hessian): # Make sure it is close to ase's comparison = compare_hessians(atoms, hessian, ase_hessian) assert abs(comparison.zpe_error) < 0.2 + + +@mark.timeout(60) +def test_approx(atoms, queues, tmpdir, ase_hessian, mbtr): + # Make the thinker + run_path = Path(tmpdir) / 'run' + thinker = ApproximateHessianThinker( + queues=queues, + num_workers=2, + atoms=atoms, + run_dir=run_path, + num_to_run=128, + model=mbtr, + sampler=UniformSampler() + ) + assert run_path.exists() + assert thinker.completed == set() + + # Run it + thinker.run() + assert len(thinker.completed) == 128 + + # Make sure it picks up from a previous run + thinker = ApproximateHessianThinker( + queues=queues, + num_workers=2, + atoms=atoms, + run_dir=run_path, + num_to_run=128, + model=mbtr, + sampler=UniformSampler() + ) + assert len(thinker.completed) == 128 + + # Make sure it doesn't do any new calculations + start_size = (run_path / 'simulation-results.json').stat().st_size + thinker.run() + end_size = (run_path / 'simulation-results.json').stat().st_size + assert start_size == end_size + + # Compute the Hessian + hessian = thinker.compute_hessian() + assert np.isfinite(hessian).all() + assert hessian.shape == (len(atoms) * 3, len(atoms) * 3)