Skip to content

Commit

Permalink
General improvements and addition of Gaussian process regression
Browse files Browse the repository at this point in the history
- ml4chem/data/parser.py: improved FakeCalculator class and its
  docstrings.
- ml4chem/models/kernelridge.py:
    * New `class_name` parameter is saved to operate with universal
      loading of models.
    * get_kernel_matrix() function can handle more input cases.
    * Improvement in docstrings.
- ml4chem/potentials.py: universal loading of models.
- setup.py: now the `ml4chem` command line tool is installed with pip.
- ml4chem/models/gaussian_process.py: the new Gaussian process
  regression model for ML4Chem. It uses the KernelRidge class in the
  kernelridge module as a base class and adds the extra step that
  computes the inference variance. This is related to #18.
- Updated examples/.
  • Loading branch information
muammar committed Aug 3, 2019
1 parent 52bd471 commit 2c8e259
Show file tree
Hide file tree
Showing 18 changed files with 1,480 additions and 882 deletions.
39 changes: 39 additions & 0 deletions examples/gp_potentials/cu_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import logging
import sys

sys.path.append("../../")
from ase.io import Trajectory
from dask.distributed import Client, LocalCluster
from ml4chem import Potentials


def main():
    """Print predicted and reference energies for every image.

    Reads the images of ``cu_training.traj`` with ASE, restores a
    calculator through ``Potentials.load`` from the model, parameter and
    preprocessor files, sets the fingerprint database as its reference
    space, and prints two lines per image: the energy returned by the
    calculator and the energy stored with the image.
    """
    # Images are read with ASE before the calculator is restored.
    trajectory = Trajectory("cu_training.traj")

    potential = Potentials.load(
        model="cu_training.ml4c",
        params="cu_training.params",
        preprocessor="cu_training.scaler",
    )

    # Fingerprint database handed to the calculator as its reference space.
    potential.reference_space = "fingerprints.db"

    for atoms in trajectory:
        predicted = potential.get_potential_energy(atoms)
        print("ML4Chem predicted energy = {}".format(predicted))
        print(" DFT energy = {}".format(atoms.get_potential_energy()))


if __name__ == "__main__":
    # Send log records to a file, tagged with source file, line and level.
    logging.basicConfig(
        filename="cu_inference.log",
        level=logging.INFO,
        format="%(filename)s:%(lineno)s %(levelname)s:%(message)s",
    )
    # Local Dask cluster: 8 worker processes with 2 threads each.
    cluster = LocalCluster(n_workers=8, threads_per_worker=2)
    # This call used to pass a misspelled `asyncronous=True` keyword. It is
    # dropped rather than respelled: `asynchronous=True` is meant for clients
    # driven from async code, whereas main() below is a plain blocking call.
    client = Client(cluster)
    main()
36 changes: 36 additions & 0 deletions examples/gp_potentials/cu_training.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import sys
from ase.io import Trajectory
from dask.distributed import Client, LocalCluster

sys.path.append("../../")
from ml4chem import Potentials
from ml4chem.fingerprints import Gaussian
from ml4chem.models.gaussian_process import GaussianProcess
from ml4chem.utils import logger


def train():
    """Train a Gaussian process potential on ``cu_training.traj``.

    Reads the images with ASE, builds a ``Potentials`` calculator that
    combines normalized ``Gaussian`` fingerprints (cutoff 6.5, preprocessor
    saved to ``cu_training.scaler``) with a ``GaussianProcess`` model, and
    trains it on the images under the label ``cu_training``.
    """
    # Load the images with ASE
    images = Trajectory("cu_training.traj")

    # The model is built with its default arguments. The original example
    # carried a disabled variant, GaussianProcess(batch_size=160), whose
    # only trace was an unused local; both were removed as dead code.
    calc = Potentials(
        fingerprints=Gaussian(
            cutoff=6.5, normalized=True, save_preprocessor="cu_training.scaler"
        ),
        model=GaussianProcess(),
        label="cu_training",
    )

    calc.train(training_set=images)


if __name__ == "__main__":
    # Set up ML4Chem logging with the helper's default arguments.
    logger()
    # Local Dask cluster with default worker settings.
    cluster = LocalCluster()
    # This call used to pass a misspelled `asyncronous=True` keyword. It is
    # dropped rather than respelled: `asynchronous=True` is meant for clients
    # driven from async code, whereas train() below is a plain blocking call.
    client = Client(cluster)
    train()
Binary file added examples/gp_potentials/cu_training.traj
Binary file not shown.
683 changes: 365 additions & 318 deletions examples/krr_potentials/cu_inference.log

Large diffs are not rendered by default.

8 changes: 2 additions & 6 deletions examples/krr_potentials/cu_inference.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import logging
import sys

sys.path.append("../../")
from ase.io import Trajectory
from dask.distributed import Client, LocalCluster
from ml4chem import Potentials
from ml4chem.utils import logger


def main():
Expand All @@ -29,11 +29,7 @@ def main():


if __name__ == "__main__":
logging.basicConfig(
filename="cu_inference.log",
level=logging.INFO,
format="%(filename)s:%(lineno)s %(levelname)s:%(message)s",
)
logger(filename="cu_inference.log")
cluster = LocalCluster(n_workers=8, threads_per_worker=2)
client = Client(cluster, asyncronous=True)
main()
135 changes: 75 additions & 60 deletions examples/krr_potentials/cu_training.log
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
potentials.py:53 INFO:
-------------------------------------------------------------------------------

===============================================================================

███╗ ███╗██╗██╗ ██╗ ██████╗██╗ ██╗███████╗███╗ ███╗
████╗ ████║██║██║ ██║██╔════╝██║ ██║██╔════╝████╗ ████║
Expand All @@ -10,64 +10,79 @@ potentials.py:53 INFO:



ML4Chem is Machine Learning for Chemistry. This package is written in Python 3,
and intends to offer modern and rich features to perform machine learning
ML4Chem is Machine Learning for Chemistry. This package is written in Python
3, and intends to offer modern and rich features to perform machine learning
workflows for chemical physics.

This software is developed by Muammar El Khatib.
-------------------------------------------------------------------------------
This project is directed by Muammar El Khatib.


Contributors (in alphabetic order):
-----------------------------------
Elijah Gardella : Interatomic potentials for ionic systems.
Jacklyn Gee : Gaussian features class improvements, and cjson
reader.

===============================================================================

Data
====
Data structure is not compatible with ML4Chem.
Preparing images for training...
Images hashed and processed...

There are 40 atoms in your data set.

Fingerprinting
==============
Getting unique element symbols for training
Unique chemical elements: ['Cu']
Making default symmetry functions...
Number of features per chemical element:
- Cu: 8.

Symmetry function parameters for Cu atom:
-----------------------------------------
# Symbol Type Parameters
0 Cu G2 eta: 0.0500
1 Cu G2 eta: 0.2321
2 Cu G2 eta: 1.0772
3 Cu G2 eta: 5.0000
4 Cu, Cu G3 eta: 0.0050 gamma: 1.0000 zeta: 1.0000
5 Cu, Cu G3 eta: 0.0050 gamma: -1.0000 zeta: 1.0000
6 Cu, Cu G3 eta: 0.0050 gamma: 1.0000 zeta: 4.0000
7 Cu, Cu G3 eta: 0.0050 gamma: -1.0000 zeta: 4.0000

Data preprocessing
------------------
Preprocessor: MinMaxScaler.
Options:
- feature_range: (-1, 1).


Adding atomic feature calculations to scheduler...
... finished in 0 hours 0 minutes 1.87 seconds.

potentials.py:54 INFO:Available backends: ['torch', 'torchvision', 'numpy'].
handler.py:34 WARNING:Data structure is not compatible with ML4Chem
handler.py:56 INFO:Preparing images...
handler.py:87 INFO:Images hashed and processed...
gaussian.py:108 INFO:
gaussian.py:109 INFO:Fingerprinting
gaussian.py:110 INFO:==============
gaussian.py:117 INFO:Getting unique element symbols for training
gaussian.py:125 INFO:Unique chemical elements: ['Cu']
gaussian.py:544 WARNING:Making default symmetry functions
gaussian.py:612 INFO:Number of features per chemical element:
gaussian.py:614 INFO: - Cu: 8.
gaussian.py:616 INFO:
gaussian.py:617 INFO:Symmetry function parameters:
gaussian.py:618 INFO:-----------------------------
gaussian.py:620 INFO: # Symbol Type Parameters
gaussian.py:641 INFO: 0 Cu G2 eta: 0.0500
gaussian.py:641 INFO: 1 Cu G2 eta: 0.2321
gaussian.py:641 INFO: 2 Cu G2 eta: 1.0772
gaussian.py:641 INFO: 3 Cu G2 eta: 5.0000
gaussian.py:641 INFO: 4 Cu, Cu G3 eta: 0.0050 gamma: 1.0000 zeta: 1.0000
gaussian.py:641 INFO: 5 Cu, Cu G3 eta: 0.0050 gamma: -1.0000 zeta: 1.0000
gaussian.py:641 INFO: 6 Cu, Cu G3 eta: 0.0050 gamma: 1.0000 zeta: 4.0000
gaussian.py:641 INFO: 7 Cu, Cu G3 eta: 0.0050 gamma: -1.0000 zeta: 4.0000
preprocessing.py:58 INFO:
preprocessing.py:88 INFO:Data preprocessing
preprocessing.py:89 INFO:------------------
preprocessing.py:90 INFO:Preprocessor: MinMaxScaler.
preprocessing.py:91 INFO:Options:
preprocessing.py:93 INFO: - feature_range: (-1, 1).
preprocessing.py:95 INFO:
gaussian.py:138 INFO:
gaussian.py:139 INFO:Adding atomic fingerprint calculations to scheduler...
gaussian.py:167 INFO:... finished in 0 hours 0 minutes 0.57 seconds.
gaussian.py:170 INFO:
gaussian.py:171 INFO:Computing fingerprints...
gaussian.py:280 INFO:Fingerprinting finished in 0 hours 0 minutes 4.39 seconds.
kernelridge.py:162 INFO:Model Training
kernelridge.py:163 INFO:Model name: KernelRidge.
kernelridge.py:164 INFO:Kernel parameters:
kernelridge.py:165 INFO: - Kernel function: rbf.
kernelridge.py:166 INFO: - Sigma: 1.0.
kernelridge.py:167 INFO: - Lamda: 1e-05.
kernelridge.py:177 INFO:Computing Kernel Matrix...
kernelridge.py:180 WARNING: Adding calculations to scheduler...
kernelridge.py:188 INFO: 1600 kernel evaluations added in 0 hours 0 minutes 0.16 seconds.
kernelridge.py:193 INFO: The calculations were batched in groups of 160.
kernelridge.py:197 INFO: Evaluating atomic similarities...
kernelridge.py:213 INFO:Kernel matrix built in 0 hours 0 minutes 2.81 seconds.
kernelridge.py:219 INFO:Building LT matrix
kernelridge.py:230 INFO:LT matrix built in 0 hours 0 minutes 2.84 seconds.
kernelridge.py:274 INFO:Size of the Kernel matrix is (10, 10).
kernelridge.py:275 INFO:Starting Cholesky Factorization...
kernelridge.py:277 INFO:Cholesky Factorization finished...
Computing fingerprints...
Fingerprinting finished in 0 hours 0 minutes 12.67 seconds.
Fingerprints saved to fingerprints.db.

Model Training
==============
Model name: KernelRidge.
Kernel parameters:
- Kernel function: rbf.
- Sigma: 1.0.
- Lamda: 1e-05.

Computing Kernel Matrix...
Adding calculations to scheduler...
1600 kernel evaluations added in 0 hours 0 minutes 0.59 seconds.
The calculations were batched in groups of 160.
Evaluating atomic similarities...
Kernel matrix built in 0 hours 0 minutes 6.37 seconds.
Building LT matrix
LT matrix built in 0 hours 0 minutes 6.47 seconds.
Size of the Kernel matrix is (10, 10).
Starting Cholesky Factorization...
Cholesky Factorization finished...
76 changes: 73 additions & 3 deletions examples/krr_potentials/cu_training.params
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"model": {
"name": "KernelRidge",
"type": "svm",
"class_name": "KernelRidge",
"sigma": 1.0,
"kernel": "rbf",
"scheduler": "distributed",
Expand All @@ -10,7 +11,8 @@
"weights_independent": true,
"forcetraining": false,
"sum_rule": true,
"batch_size": 160
"batch_size": 160,
"kwargs": {}
},
"fingerprints": {
"name": "Gaussian",
Expand All @@ -20,8 +22,76 @@
"MinMaxScaler",
null
],
"defaults": true,
"save_preprocessor": "cu_training.scaler",
"filename": "fingerprints.db"
"filename": "fingerprints.db",
"angular_type": "G3",
"weighted": false,
"custom": {
"user_input": null,
"GP": {
"Cu": [
{
"type": "G2",
"symbol": "Cu",
"eta": 0.049999999999999996
},
{
"type": "G2",
"symbol": "Cu",
"eta": 0.2320794416806389
},
{
"type": "G2",
"symbol": "Cu",
"eta": 1.0772173450159415
},
{
"type": "G2",
"symbol": "Cu",
"eta": 5.000000000000001
},
{
"type": "G3",
"symbols": [
"Cu",
"Cu"
],
"eta": 0.005,
"gamma": 1.0,
"zeta": 1.0
},
{
"type": "G3",
"symbols": [
"Cu",
"Cu"
],
"eta": 0.005,
"gamma": -1.0,
"zeta": 1.0
},
{
"type": "G3",
"symbols": [
"Cu",
"Cu"
],
"eta": 0.005,
"gamma": 1.0,
"zeta": 4.0
},
{
"type": "G3",
"symbols": [
"Cu",
"Cu"
],
"eta": 0.005,
"gamma": -1.0,
"zeta": 4.0
}
]
}
}
}
}
Binary file modified examples/krr_potentials/cu_training.scaler
Binary file not shown.
Loading

0 comments on commit 2c8e259

Please sign in to comment.