lab-cosmo · agoscinski · Feb 9, 2023
diff --git a/examples/composed-model.py b/examples/composed-model.py
@@ -0,0 +1,207 @@
+"""
+Computing a Linear Model
+========================
+
+.. start-body
+
+In this tutorial we calculate a linear model using Ridge regression.
+If you are never worked with equistore objects before please take a look at
+the documentation.
+
+For constructing a linear Model we need the atomic descriptor as training data
+``X`` as well as the energies and forces as target data ``y``.
+
+We first import all necessary packages.
+"""
+
+import ase.io
+import numpy as np
+from equistore import Labels
+from equistore.operations import ones_like, slice, sum_over_samples
+from rascaline import SoapPowerSpectrum
+
+from equisolve.numpy.models.linear_model import Ridge
+from equisolve.numpy.preprocessing import StandardScaler
+from equisolve.numpy.compose import TransformedTargetRegressor, Pipeline
+
+from equisolve.utils.convert import ase_to_tensormap
+
+
+# %%
+#
+# Dataset
+# -------
+#
+# As data set we use the SHIFTML set. You can obtain the dataset used in this
+# example from our :download:`website<../../static/dataset.xyz>`.
+# We read the first 20 structures of the data set using
+# `ASE <https://wiki.fysik.dtu.dk/ase/>`.
+
+
+frames = ase.io.read("dataset.xyz", ":20")
+
+# %%
+#
+# The data set contains everything we need for the model:
+# The atomic positions we use for the descriptor and with this as
+# training data. The data set also stores the energies and forces which will be our
+# target data we regress against.
+#
+# Training data
+# -------------
+#
+# We construct the descriptor training data with a SOAP powerspectrum using
+# rascaline. We first define the hyper parameters for the calculation
+
+
+HYPER_PARAMETERS = {
+    "cutoff": 5.0,
+    "max_radial": 6,
+    "max_angular": 4,
+    "atomic_gaussian_width": 0.3,
+    "center_atom_weight": 1.0,
+    "radial_basis": {
+        "Gto": {},
+    },
+    "cutoff_function": {
+        "ShiftedCosine": {"width": 0.5},
+    },
+}
+
+calculator = SoapPowerSpectrum(**HYPER_PARAMETERS)
+
+# %%
+#
+# And then run the actual calculation, including gradients with respect to positions.
+
+descriptor = calculator.compute(frames, gradients=["positions"])
+
+# %%
+#
+# For more details on how the descriptor works see the documentation of
+# rascaline.
+#
+# We now move all keys into properties to access them for our model.
+
+descriptor = descriptor.keys_to_samples("species_center")
+descriptor = descriptor.keys_to_properties(["species_neighbor_1", "species_neighbor_2"])
+
+# %%
+#
+# The descriptor contains a represenantion with respect to each central atoms per
+# structure. However, our energies as target data is per structure only.
+# Therefore, we sum the properties of each center atom per structure.
+
+X = sum_over_samples(descriptor, ["center", "species_center"])
+
+# %%
+#
+# The newly defined :class:`equistore.TensorMap` contains a single block
+
+print(f"X contains {len(X.blocks())} block.")
+
+# %%
+#
+# As well as 1800 properties and 20 sample.
+#
+# We acces the data using the :meth:``equistore.TensorMap.block`` method
+
+
+print(f"X contains {len(X.block().properties)} properties.")
+print(f"X contains {len(X.block().samples)} samples.")
+
+# Target data
+# -----------
+#
+# We construct the target data by converting energies and forces into a
+# :class:`equisolve.TensorMap`.
+
+
+y = ase_to_tensormap(frames, energy="energy", forces="forces")
+
+# %%
+#
+# The target data y contains a single block
+
+print(y.block())
+
+# %%
+#
+# Construct the model
+# -------------------
+#
+# Before we fit the model we have to define our regression values.
+#
+# For this we create a TensorMap containing with the desired regulerizer
+
+alpha = ones_like(X)
+alpha.block().values[:] *= 1e-5
+
+# %%
+#
+# So far ``alpha`` contains the same number of samples as ``X``. However,
+# the regulerizer only has to be one sample, because all samples will be
+# regulerized in the same way in a linear model.
+#
+# We remove all sample except the 0th one by using the
+# :func:`equistore.operations.slice`.
+
+samples = Labels(
+    names=["structure"],
+    values=np.array([(0,)]),
+)
+
+alpha = slice(alpha, samples=samples)
+
+# %%
+#
+# In our regulerizer we use the same values for all properties. However,
+# :class:`equisolve.numpy.models.linear_model.Ridge` can also handle different
+# regularization for each property. You can apply a property wise regularization by
+# setting ``"values"`` of ``alpha_dict`` with an 1d array of the same length as the
+# number of properties in the training data X (here 7200)
+#
+# With a valid regulerizer object we now initilize the Ridge object.
+# ``parameter_keys`` determines with respect to which parameters the regression is
+# performed. Here, we choose a regression wrt. to ``"values"`` (energies) and
+# ``"positions"`` (forces).
+
+parameter_keys = ["values", "positions"]
+ridge = Ridge(parameter_keys=parameter_keys, alpha=alpha)
+standardizer = StandardScaler(parameter_keys=parameter_keys)
+ttr = TransformedTargetRegressor(regressor=ridge, transformer=standardizer
+clf = Pipeline([('scaler', StandardScaler(parameter_keys=["values", "positions"])),
+  ('ridge', ttr)])
+
+# Old classifier
+#clf = Ridge(parameter_keys=parameter_keys, alpha=alpha)
+
+
+# %%
+#
+# Next we create a sample weighting :class:`equistiore.TensorMap` that weights energies
+# five times more then the forces.
+
+sw = ones_like(y)
+sw.block().values[:] *= 5
+
+# %%
+#
+# The function `equisolve.utils.dictionary_to_tensormap` create a
+# :class:`equistore.TensorMap` with the same shape as our target data ``y`` but with
+# values a defined by ``sw_dict``.
+
+print(sw)
+
+# Finally we can fit the model using the sample weights defined above.
+
+clf.fit(X, y)
+#clf.fit(X, y, sample_weight=sw) TODO, passing args in pipes ant ttr is not implemented yet
+
+
+# Finally we can predict values and calculate the root mean squre error
+# of our model.
+
+clf.predict(X)
+print(f"RMSE energies = {clf.score(X, y, parameter_key='values')[0]:.3f} eV")
+print(f"RMSE forces = {clf.score(X, y, parameter_key='positions')[0]:.3f} eV/Å")
diff --git a/src/equisolve/numpy/__init__.py b/src/equisolve/numpy/__init__.py
@@ -9,4 +9,4 @@
 from . import models, utils
 
 
-__all__ = ["models", "utils", "preprocessing"]
+__all__ = ["models", "utils", "preprocessing", "compose"]
diff --git a/src/equisolve/numpy/compose/__init__.py b/src/equisolve/numpy/compose/__init__.py
@@ -0,0 +1,12 @@
+# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
+#
+# Copyright (c) 2023 Authors and contributors
+# (see the AUTHORS.rst file for the full list of names)
+#
+# Released under the BSD 3-Clause "New" or "Revised" License
+# SPDX-License-Identifier: BSD-3-Clause
+
+from ._base import Pipeline, TransformedTargetRegressor
+
+
+__all__ = ["Pipeline", "TransformedTargetRegressor"]
diff --git a/src/equisolve/numpy/compose/_base.py b/src/equisolve/numpy/compose/_base.py
@@ -0,0 +1,106 @@
+# -*- Mode: python3; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
+#
+# Copyright (c) 2023 Authors and contributors
+# (see the AUTHORS.rst file for the full list of names)
+#
+# Released under the BSD 3-Clause "New" or "Revised" License
+# SPDX-License-Identifier: BSD-3-Clause
+
+from copy import deepcopy
+
+from typing import List, Optional, Union
+from equistore import TensorMap
+
+from ...utils.metrics import rmse
+
+class TransformedTargetRegressor:
+    r"""Regressor that transforms the target values with a transformer before fitting
+        and inverse transforms the predicted target before outputting the score.
+        This is useful for doing a regression on a numerical stable scale while still outputting the scores in the original scale.
+
+        :param regressor: The regressor used for doing the prediction.
+
+        :param transformer: The transformer applied on the target y values before regression.
+    """
+
+    def __init__(self, regressor=None, transformer=None):
+        self.regressor = regressor
+        self.transformer = transformer
+
+        if set(regressor.parameter_keys) != set(transformer.parameter_keys):
+            raise ValueError("parameter_keys between regressor and transformer do not match")
+
+        if regressor is not None:
+            if hasattr(regressor, "fit") and not(callable(regressor.fit)):
+                raise ValueError("regressor does not have fit function")
+            # TODO more checks for functionalities 
+
+        # COMMENT I am not sure if we should check the regressor and transformer this way,
+        #         a base class with defined functionalities seems more clean
+
+
+        # COMMENT scikit-learn offers also
+        #self.func
+        #self.inverse_func
+        #self.check_inverse
+        # but we do not need them at the moment, make low-prio issue to not forget?
+
+    def fit(self, X: TensorMap, y: TensorMap) -> None:
+        # COMMENT default args StandardScaler + Ridge? But what parameter keys?
+        #if self.regressor is None:
+        #    self.regressor_ = Ridge()
+        #if self.transformer is None:
+        #    self.transformer_ = StandardScaler()
+
+        self.transformer_ = deepcopy(self.transformer).fit(y)
+        self.regressor_ = deepcopy(self.regressor).fit(X, self.transformer_.transform(y))
+        return self
+
+    def predict(self, X: TensorMap) -> TensorMap:
+        return self.transformer_.inverse_transform(self.regressor_.predict(X))
+
+    def score(self, X: TensorMap, y: TensorMap, parameter_key: str) -> float:
+        y_pred = self.transformer_.inverse_transform(self.regressor_.predict(X))
+        return rmse(y, y_pred, parameter_key)
+
+
+class Pipeline:
+    r"""sklearn style Pipeline
+
+        :param steps: list of (name, transformer/estimator) tuples that are applied in the fit and prediction steps consecutiely in the given order
+                      last tuple must contain estimator, all other tuples must contain transformers
+                      #COMMENT sklearn Pipeline supports more, but I think this limitation is okay for now
+    """
+    def __init__(self, steps=None):
+        self.steps = steps
+
+    def fit(self, X : TensorMap, y : TensorMap) -> None:
+        # TODO check if they have fit function, for this we require base clases for estimator and transformers
+        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
+            fitted_transformer = deepcopy(transformer).fit(X, y)
+            X = fitted_transformer.transform(X)
+            self.steps[step_idx] = (name, fitted_transformer)
+
+        final_estimator = self.steps[-1][1]
+        final_estimator.fit(X,y)
+        return self
+
+    def predict(self, X: TensorMap) -> TensorMap:
+        # TODO check they are fitted, for this we require base clases for estimator and transformers
+        for _, transformer in self.steps[:-1]:
+            X = transformer.transform(X)
+        return self.steps[-1][1].predict(X)
+
+    def score(self, X: TensorMap, y: TensorMap, parameter_key: Optional[str]) -> float:
+        # COMMENT we use scorer from last estimator, maybe not what we want
+        for _, transformer in self.steps[:-1]:
+            X = transformer.transform(X)
+        final_estimator = self.steps[-1][1]
+        # COMMENT: what do you think about using default parameter_keys?
+        if parameter_key is None:
+            scores = np.zeros(len(final_estimator.parameter_keys))
+            for i, parameter_key in enumerate(final_estimator.parameter_keys):
+                scores[i] = final_estimator.score(X, y, parameter_key)[0]
+            return np.mean(scores)
+        else:
+            return final_estimator.score(X, y, parameter_key)
diff --git a/src/equisolve/numpy/models/linear_model.py b/src/equisolve/numpy/models/linear_model.py
@@ -61,6 +61,8 @@ def __init__(
 
         self.coef_ = None
 
+        self.alpha = tensor_map_to_dict(self.alpha)
+
     def _validate_data(self, X: TensorMap, y: Optional[TensorMap] = None) -> None:
         """Validates :class:`equistore.TensorBlock`'s for the usage in models.
 
@@ -191,9 +193,10 @@ def fit(
             )
             coef_blocks.append(coef_block)
 
-        # Convert alpha to a dictionary to be used in external models.
-        self.alpha = tensor_map_to_dict(self.alpha)
         self.coef_ = TensorMap(X.keys, coef_blocks)
+        # Convert alpha and coefs to a dictionary to be used in external models.
+        self.alpha = tensor_map_to_dict(self.alpha)
+        self.coef_ = tensor_map_to_dict(self.coef_)
 
         return self
 
@@ -207,9 +210,12 @@ def predict(self, X: TensorMap) -> TensorMap:
         if self.coef_ is None:
             raise ValueError("No weights. Call fit method first.")
 
-        return dot(X, self.coef_)
+        self.coef_ = dict_to_tensor_map(self.coef_)
+        y_pred = dot(X, self.coef_)
+        self.coef_ = tensor_map_to_dict(self.coef_)
+        return y_pred
 
-    def score(self, X: TensorMap, y: TensorMap, parameter_key: str) -> List[float]:
+    def score(self, X: TensorMap, y: TensorMap, parameter_key: str) -> List[float]: # COMMENT why does it return list of floats if we just allow one paramater_key?
         """Return the coefficient of determination of the prediction.
 
         :param X: Test samples
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,4 +9,4 @@
		from . import models, utils


		__all__ = ["models", "utils", "preprocessing"]
		__all__ = ["models", "utils", "preprocessing", "compose"]