General improvements and addition of Gaussian process regression

- ml4chem/data/parser.py: improved the FakeCalculator class and its docstrings.
- ml4chem/models/kernelridge.py:
  * A new `class_name` parameter is saved so that models can be loaded universally.
  * The get_kernel_matrix() function can handle more input cases.
  * Improved docstrings.
- ml4chem/potentials.py: universal loading of models.
- setup.py: the `ml4chem` command line tool is now installed with pip.
- ml4chem/models/gaussian_process.py: the new Gaussian process regression model for ml4chem. It uses the KernelRidge class from the kernelridge module as its base class and adds the extra step that computes the inference variance. This is related to #18.
- Updated examples/.
muammar · Aug 3, 2019 · 2c8e259 · 2c8e259
1 parent 52bd471
commit 2c8e259
Show file tree

Hide file tree

Showing 18 changed files with 1,480 additions and 882 deletions.
diff --git a/examples/gp_potentials/cu_inference.py b/examples/gp_potentials/cu_inference.py
@@ -0,0 +1,39 @@
+import logging
+import sys
+
+sys.path.append("../../")
+from ase.io import Trajectory
+from dask.distributed import Client, LocalCluster
+from ml4chem import Potentials
+
+
+def main():
+    """docstring for main"""
+
+    # Load the images with ASE
+    images = Trajectory("cu_training.traj")
+
+    calc = Potentials.load(
+        model="cu_training.ml4c",
+        params="cu_training.params",
+        preprocessor="cu_training.scaler",
+    )
+
+    # Passage of fingerprint database with reference space
+    calc.reference_space = "fingerprints.db"
+
+    for atoms in images:
+        energy = calc.get_potential_energy(atoms)
+        print("ML4Chem predicted energy = {}".format(energy))
+        print("              DFT energy = {}".format(atoms.get_potential_energy()))
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        filename="cu_inference.log",
+        level=logging.INFO,
+        format="%(filename)s:%(lineno)s %(levelname)s:%(message)s",
+    )
+    cluster = LocalCluster(n_workers=8, threads_per_worker=2)
+    client = Client(cluster, asyncronous=True)
+    main()
diff --git a/examples/gp_potentials/cu_training.py b/examples/gp_potentials/cu_training.py
@@ -0,0 +1,36 @@
+import sys
+from ase.io import Trajectory
+from dask.distributed import Client, LocalCluster
+
+sys.path.append("../../")
+from ml4chem import Potentials
+from ml4chem.fingerprints import Gaussian
+from ml4chem.models.gaussian_process import GaussianProcess
+from ml4chem.utils import logger
+
+
+def train():
+    # Load the images with ASE
+    images = Trajectory("cu_training.traj")
+
+    # Arguments for fingerprinting the images
+    normalized = True
+    batch_size = 160
+
+    calc = Potentials(
+        fingerprints=Gaussian(
+            cutoff=6.5, normalized=normalized, save_preprocessor="cu_training.scaler"
+        ),
+        #model=GaussianProcess(batch_size=batch_size),
+        model=GaussianProcess(),
+        label="cu_training",
+    )
+
+    calc.train(training_set=images)
+
+
+if __name__ == "__main__":
+    logger()
+    cluster = LocalCluster()
+    client = Client(cluster, asyncronous=True)
+    train()
diff --git a/examples/gp_potentials/cu_training.traj b/examples/gp_potentials/cu_training.traj
diff --git a/examples/krr_potentials/cu_inference.log b/examples/krr_potentials/cu_inference.log
diff --git a/examples/krr_potentials/cu_inference.py b/examples/krr_potentials/cu_inference.py
@@ -1,10 +1,10 @@
-import logging
 import sys
 
 sys.path.append("../../")
 from ase.io import Trajectory
 from dask.distributed import Client, LocalCluster
 from ml4chem import Potentials
+from ml4chem.utils import logger
 
 
 def main():
@@ -29,11 +29,7 @@ def main():
 
 
 if __name__ == "__main__":
-    logging.basicConfig(
-        filename="cu_inference.log",
-        level=logging.INFO,
-        format="%(filename)s:%(lineno)s %(levelname)s:%(message)s",
-    )
+    logger(filename="cu_inference.log")
     cluster = LocalCluster(n_workers=8, threads_per_worker=2)
     client = Client(cluster, asyncronous=True)
     main()
diff --git a/examples/krr_potentials/cu_training.log b/examples/krr_potentials/cu_training.log
@@ -1,5 +1,5 @@
-potentials.py:53 INFO:
--------------------------------------------------------------------------------
+
+===============================================================================
 
           ███╗   ███╗██╗██╗  ██╗ ██████╗██╗  ██╗███████╗███╗   ███╗
           ████╗ ████║██║██║  ██║██╔════╝██║  ██║██╔════╝████╗ ████║
@@ -10,64 +10,79 @@ potentials.py:53 INFO:
 
 
 
-ML4Chem is Machine Learning for Chemistry. This package is written in Python 3,
-and intends to offer modern and rich features to perform machine learning
+ML4Chem is Machine Learning for Chemistry. This package is written in Python
+3, and intends to offer modern and rich features to perform machine learning
 workflows for chemical physics.
 
-This software is developed by Muammar El Khatib.
--------------------------------------------------------------------------------
+This project is directed by Muammar El Khatib.
+
+
+Contributors (in alphabetic order):
+-----------------------------------
+    Elijah Gardella     : Interatomic potentials for ionic systems.
+    Jacklyn Gee         : Gaussian features class improvements, and cjson
+                          reader.
+
+===============================================================================
+
+Data
+====
+Data structure is not compatible with ML4Chem.
+Preparing images for training...
+Images hashed and processed...
+
+There are 40 atoms in your data set.
+
+Fingerprinting
+==============
+Getting unique element symbols for training
+Unique chemical elements: ['Cu']
+Making default symmetry functions...
+Number of features per chemical element:
+    - Cu: 8.
+
+Symmetry function parameters for Cu atom:
+-----------------------------------------
+  #      Symbol    Type Parameters
+  0   Cu            G2  eta: 0.0500
+  1   Cu            G2  eta: 0.2321
+  2   Cu            G2  eta: 1.0772
+  3   Cu            G2  eta: 5.0000
+  4   Cu, Cu        G3  eta: 0.0050 gamma:  1.0000 zeta: 1.0000
+  5   Cu, Cu        G3  eta: 0.0050 gamma: -1.0000 zeta: 1.0000
+  6   Cu, Cu        G3  eta: 0.0050 gamma:  1.0000 zeta: 4.0000
+  7   Cu, Cu        G3  eta: 0.0050 gamma: -1.0000 zeta: 4.0000
+
+Data preprocessing
+------------------
+Preprocessor: MinMaxScaler.
+Options:
+    - feature_range: (-1, 1).
+
+
+Adding atomic feature calculations to scheduler...
+... finished in 0 hours 0 minutes 1.87 seconds.
 
-potentials.py:54 INFO:Available backends: ['torch', 'torchvision', 'numpy'].
-handler.py:34 WARNING:Data structure is not compatible with ML4Chem
-handler.py:56 INFO:Preparing images...
-handler.py:87 INFO:Images hashed and processed...
-gaussian.py:108 INFO: 
-gaussian.py:109 INFO:Fingerprinting
-gaussian.py:110 INFO:==============
-gaussian.py:117 INFO:Getting unique element symbols for training
-gaussian.py:125 INFO:Unique chemical elements: ['Cu']
-gaussian.py:544 WARNING:Making default symmetry functions
-gaussian.py:612 INFO:Number of features per chemical element:
-gaussian.py:614 INFO:    - Cu: 8.
-gaussian.py:616 INFO: 
-gaussian.py:617 INFO:Symmetry function parameters:
-gaussian.py:618 INFO:-----------------------------
-gaussian.py:620 INFO:  #      Symbol    Type Parameters
-gaussian.py:641 INFO:  0   Cu            G2  eta: 0.0500
-gaussian.py:641 INFO:  1   Cu            G2  eta: 0.2321
-gaussian.py:641 INFO:  2   Cu            G2  eta: 1.0772
-gaussian.py:641 INFO:  3   Cu            G2  eta: 5.0000
-gaussian.py:641 INFO:  4   Cu, Cu        G3  eta: 0.0050 gamma:  1.0000 zeta: 1.0000
-gaussian.py:641 INFO:  5   Cu, Cu        G3  eta: 0.0050 gamma: -1.0000 zeta: 1.0000
-gaussian.py:641 INFO:  6   Cu, Cu        G3  eta: 0.0050 gamma:  1.0000 zeta: 4.0000
-gaussian.py:641 INFO:  7   Cu, Cu        G3  eta: 0.0050 gamma: -1.0000 zeta: 4.0000
-preprocessing.py:58 INFO: 
-preprocessing.py:88 INFO:Data preprocessing
-preprocessing.py:89 INFO:------------------
-preprocessing.py:90 INFO:Preprocessor: MinMaxScaler.
-preprocessing.py:91 INFO:Options:
-preprocessing.py:93 INFO:    - feature_range: (-1, 1).
-preprocessing.py:95 INFO: 
-gaussian.py:138 INFO:
-gaussian.py:139 INFO:Adding atomic fingerprint calculations to scheduler...
-gaussian.py:167 INFO:... finished in 0 hours 0 minutes 0.57 seconds.
-gaussian.py:170 INFO:
-gaussian.py:171 INFO:Computing fingerprints...
-gaussian.py:280 INFO:Fingerprinting finished in 0 hours 0 minutes 4.39 seconds.
-kernelridge.py:162 INFO:Model Training
-kernelridge.py:163 INFO:Model name: KernelRidge.
-kernelridge.py:164 INFO:Kernel parameters:
-kernelridge.py:165 INFO:    - Kernel function: rbf.
-kernelridge.py:166 INFO:    - Sigma: 1.0.
-kernelridge.py:167 INFO:    - Lamda: 1e-05.
-kernelridge.py:177 INFO:Computing Kernel Matrix...
-kernelridge.py:180 WARNING:    Adding calculations to scheduler...
-kernelridge.py:188 INFO:    1600 kernel evaluations added in 0 hours 0 minutes 0.16 seconds.
-kernelridge.py:193 INFO:    The calculations were batched in groups of 160.
-kernelridge.py:197 INFO:    Evaluating atomic similarities...
-kernelridge.py:213 INFO:Kernel matrix built in 0 hours 0 minutes 2.81 seconds.
-kernelridge.py:219 INFO:Building LT matrix
-kernelridge.py:230 INFO:LT matrix built in 0 hours 0 minutes 2.84 seconds.
-kernelridge.py:274 INFO:Size of the Kernel matrix is (10, 10).
-kernelridge.py:275 INFO:Starting Cholesky Factorization...
-kernelridge.py:277 INFO:Cholesky Factorization finished...
+Computing fingerprints...
+Fingerprinting finished in 0 hours 0 minutes 12.67 seconds.
+Fingerprints saved to fingerprints.db.
+
+Model Training
+==============
+Model name: KernelRidge.
+Kernel parameters:
+    - Kernel function: rbf.
+    - Sigma: 1.0.
+    - Lamda: 1e-05.
+
+Computing Kernel Matrix...
+    Adding calculations to scheduler...
+    1600 kernel evaluations added in 0 hours 0 minutes 0.59 seconds.
+    The calculations were batched in groups of 160.
+    Evaluating atomic similarities...
+Kernel matrix built in 0 hours 0 minutes 6.37 seconds.
+Building LT matrix
+LT matrix built in 0 hours 0 minutes 6.47 seconds.
+Size of the Kernel matrix is (10, 10).
+Starting Cholesky Factorization...
+Cholesky Factorization finished...
diff --git a/examples/krr_potentials/cu_training.params b/examples/krr_potentials/cu_training.params
@@ -2,6 +2,7 @@
     "model": {
         "name": "KernelRidge",
         "type": "svm",
+        "class_name": "KernelRidge",
         "sigma": 1.0,
         "kernel": "rbf",
         "scheduler": "distributed",
@@ -10,7 +11,8 @@
         "weights_independent": true,
         "forcetraining": false,
         "sum_rule": true,
-        "batch_size": 160
+        "batch_size": 160,
+        "kwargs": {}
     },
     "fingerprints": {
         "name": "Gaussian",
@@ -20,8 +22,76 @@
             "MinMaxScaler",
             null
         ],
-        "defaults": true,
         "save_preprocessor": "cu_training.scaler",
-        "filename": "fingerprints.db"
+        "filename": "fingerprints.db",
+        "angular_type": "G3",
+        "weighted": false,
+        "custom": {
+            "user_input": null,
+            "GP": {
+                "Cu": [
+                    {
+                        "type": "G2",
+                        "symbol": "Cu",
+                        "eta": 0.049999999999999996
+                    },
+                    {
+                        "type": "G2",
+                        "symbol": "Cu",
+                        "eta": 0.2320794416806389
+                    },
+                    {
+                        "type": "G2",
+                        "symbol": "Cu",
+                        "eta": 1.0772173450159415
+                    },
+                    {
+                        "type": "G2",
+                        "symbol": "Cu",
+                        "eta": 5.000000000000001
+                    },
+                    {
+                        "type": "G3",
+                        "symbols": [
+                            "Cu",
+                            "Cu"
+                        ],
+                        "eta": 0.005,
+                        "gamma": 1.0,
+                        "zeta": 1.0
+                    },
+                    {
+                        "type": "G3",
+                        "symbols": [
+                            "Cu",
+                            "Cu"
+                        ],
+                        "eta": 0.005,
+                        "gamma": -1.0,
+                        "zeta": 1.0
+                    },
+                    {
+                        "type": "G3",
+                        "symbols": [
+                            "Cu",
+                            "Cu"
+                        ],
+                        "eta": 0.005,
+                        "gamma": 1.0,
+                        "zeta": 4.0
+                    },
+                    {
+                        "type": "G3",
+                        "symbols": [
+                            "Cu",
+                            "Cu"
+                        ],
+                        "eta": 0.005,
+                        "gamma": -1.0,
+                        "zeta": 4.0
+                    }
+                ]
+            }
+        }
     }
 }
diff --git a/examples/krr_potentials/cu_training.scaler b/examples/krr_potentials/cu_training.scaler