Skip to content
This repository has been archived by the owner on Apr 24, 2024. It is now read-only.

Basic classes to combine transformers and estimators #9

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 207 additions & 0 deletions examples/composed-model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
"""
Computing a Linear Model
========================

.. start-body

In this tutorial we calculate a linear model using Ridge regression.
If you are never worked with equistore objects before please take a look at
the documentation.

For constructing a linear Model we need the atomic descriptor as training data
``X`` as well as the energies and forces as target data ``y``.

We first import all necessary packages.
"""

import ase.io
import numpy as np
from equistore import Labels
from equistore.operations import ones_like, slice, sum_over_samples
from rascaline import SoapPowerSpectrum

from equisolve.numpy.models.linear_model import Ridge
from equisolve.numpy.preprocessing import StandardScaler
from equisolve.numpy.compose import TransformedTargetRegressor, Pipeline

from equisolve.utils.convert import ase_to_tensormap


# %%
#
# Dataset
# -------
#
# As data set we use the SHIFTML set. You can obtain the dataset used in this
# example from our :download:`website<../../static/dataset.xyz>`.
# We read the first 20 structures of the data set using
# `ASE <https://wiki.fysik.dtu.dk/ase/>`.


frames = ase.io.read("dataset.xyz", ":20")

# %%
#
# The data set contains everything we need for the model:
# The atomic positions we use for the descriptor and with this as
# training data. The data set also stores the energies and forces which will be our
# target data we regress against.
#
# Training data
# -------------
#
# We construct the descriptor training data with a SOAP powerspectrum using
# rascaline. We first define the hyper parameters for the calculation


HYPER_PARAMETERS = {
"cutoff": 5.0,
"max_radial": 6,
"max_angular": 4,
"atomic_gaussian_width": 0.3,
"center_atom_weight": 1.0,
"radial_basis": {
"Gto": {},
},
"cutoff_function": {
"ShiftedCosine": {"width": 0.5},
},
}

calculator = SoapPowerSpectrum(**HYPER_PARAMETERS)

# %%
#
# And then run the actual calculation, including gradients with respect to positions.

descriptor = calculator.compute(frames, gradients=["positions"])

# %%
#
# For more details on how the descriptor works see the documentation of
# rascaline.
#
# We now move all keys into properties to access them for our model.

descriptor = descriptor.keys_to_samples("species_center")
descriptor = descriptor.keys_to_properties(["species_neighbor_1", "species_neighbor_2"])

# %%
#
# The descriptor contains a represenantion with respect to each central atoms per
# structure. However, our energies as target data is per structure only.
# Therefore, we sum the properties of each center atom per structure.

X = sum_over_samples(descriptor, ["center", "species_center"])

# %%
#
# The newly defined :class:`equistore.TensorMap` contains a single block

print(f"X contains {len(X.blocks())} block.")

# %%
#
# As well as 1800 properties and 20 sample.
#
# We acces the data using the :meth:``equistore.TensorMap.block`` method


print(f"X contains {len(X.block().properties)} properties.")
print(f"X contains {len(X.block().samples)} samples.")

# Target data
# -----------
#
# We construct the target data by converting energies and forces into a
# :class:`equisolve.TensorMap`.


y = ase_to_tensormap(frames, energy="energy", forces="forces")

# %%
#
# The target data y contains a single block

print(y.block())

# %%
#
# Construct the model
# -------------------
#
# Before we fit the model we have to define our regression values.
#
# For this we create a TensorMap containing with the desired regulerizer

alpha = ones_like(X)
alpha.block().values[:] *= 1e-5

# %%
#
# So far ``alpha`` contains the same number of samples as ``X``. However,
# the regulerizer only has to be one sample, because all samples will be
# regulerized in the same way in a linear model.
#
# We remove all sample except the 0th one by using the
# :func:`equistore.operations.slice`.

samples = Labels(
names=["structure"],
values=np.array([(0,)]),
)

alpha = slice(alpha, samples=samples)

# %%
#
# In our regulerizer we use the same values for all properties. However,
# :class:`equisolve.numpy.models.linear_model.Ridge` can also handle different
# regularization for each property. You can apply a property wise regularization by
# setting ``"values"`` of ``alpha_dict`` with an 1d array of the same length as the
# number of properties in the training data X (here 7200)
#
# With a valid regulerizer object we now initilize the Ridge object.
# ``parameter_keys`` determines with respect to which parameters the regression is
# performed. Here, we choose a regression wrt. to ``"values"`` (energies) and
# ``"positions"`` (forces).

parameter_keys = ["values", "positions"]
ridge = Ridge(parameter_keys=parameter_keys, alpha=alpha)
standardizer = StandardScaler(parameter_keys=parameter_keys)
ttr = TransformedTargetRegressor(regressor=ridge, transformer=standardizer
clf = Pipeline([('scaler', StandardScaler(parameter_keys=["values", "positions"])),
('ridge', ttr)])

# Old classifier
#clf = Ridge(parameter_keys=parameter_keys, alpha=alpha)


# %%
#
# Next we create a sample weighting :class:`equistiore.TensorMap` that weights energies
# five times more then the forces.

sw = ones_like(y)
sw.block().values[:] *= 5

# %%
#
# The function `equisolve.utils.dictionary_to_tensormap` create a
# :class:`equistore.TensorMap` with the same shape as our target data ``y`` but with
# values a defined by ``sw_dict``.

print(sw)

# Finally we can fit the model using the sample weights defined above.

clf.fit(X, y)
#clf.fit(X, y, sample_weight=sw) TODO, passing args in pipes ant ttr is not implemented yet


# Finally we can predict values and calculate the root mean squre error
# of our model.

clf.predict(X)
print(f"RMSE energies = {clf.score(X, y, parameter_key='values')[0]:.3f} eV")
print(f"RMSE forces = {clf.score(X, y, parameter_key='positions')[0]:.3f} eV/Å")
2 changes: 1 addition & 1 deletion src/equisolve/numpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@
from . import models, utils


__all__ = ["models", "utils", "preprocessing"]
__all__ = ["models", "utils", "preprocessing", "compose"]
12 changes: 12 additions & 0 deletions src/equisolve/numpy/compose/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
#
# Copyright (c) 2023 Authors and contributors
# (see the AUTHORS.rst file for the full list of names)
#
# Released under the BSD 3-Clause "New" or "Revised" License
# SPDX-License-Identifier: BSD-3-Clause

from ._base import Pipeline, TransformedTargetRegressor


__all__ = ["Pipeline", "TransformedTargetRegressor"]
106 changes: 106 additions & 0 deletions src/equisolve/numpy/compose/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# -*- Mode: python3; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
#
# Copyright (c) 2023 Authors and contributors
# (see the AUTHORS.rst file for the full list of names)
#
# Released under the BSD 3-Clause "New" or "Revised" License
# SPDX-License-Identifier: BSD-3-Clause

from copy import deepcopy

from typing import List, Optional, Union
from equistore import TensorMap

from ...utils.metrics import rmse

class TransformedTargetRegressor:
r"""Regressor that transforms the target values with a transformer before fitting
and inverse transforms the predicted target before outputting the score.
This is useful for doing a regression on a numerical stable scale while still outputting the scores in the original scale.

:param regressor: The regressor used for doing the prediction.

:param transformer: The transformer applied on the target y values before regression.
"""

def __init__(self, regressor=None, transformer=None):
self.regressor = regressor
self.transformer = transformer

if set(regressor.parameter_keys) != set(transformer.parameter_keys):
raise ValueError("parameter_keys between regressor and transformer do not match")

if regressor is not None:
if hasattr(regressor, "fit") and not(callable(regressor.fit)):
raise ValueError("regressor does not have fit function")
# TODO more checks for functionalities

# COMMENT I am not sure if we should check the regressor and transformer this way,
# a base class with defined functionalities seems more clean


# COMMENT scikit-learn offers also
#self.func
#self.inverse_func
#self.check_inverse
# but we do not need them at the moment, make low-prio issue to not forget?

def fit(self, X: TensorMap, y: TensorMap) -> None:
# COMMENT default args StandardScaler + Ridge? But what parameter keys?
#if self.regressor is None:
# self.regressor_ = Ridge()
#if self.transformer is None:
# self.transformer_ = StandardScaler()

self.transformer_ = deepcopy(self.transformer).fit(y)
self.regressor_ = deepcopy(self.regressor).fit(X, self.transformer_.transform(y))
return self

def predict(self, X: TensorMap) -> TensorMap:
return self.transformer_.inverse_transform(self.regressor_.predict(X))

def score(self, X: TensorMap, y: TensorMap, parameter_key: str) -> float:
y_pred = self.transformer_.inverse_transform(self.regressor_.predict(X))
return rmse(y, y_pred, parameter_key)


class Pipeline:
r"""sklearn style Pipeline

:param steps: list of (name, transformer/estimator) tuples that are applied in the fit and prediction steps consecutiely in the given order
last tuple must contain estimator, all other tuples must contain transformers
#COMMENT sklearn Pipeline supports more, but I think this limitation is okay for now
"""
def __init__(self, steps=None):
self.steps = steps

def fit(self, X : TensorMap, y : TensorMap) -> None:
# TODO check if they have fit function, for this we require base clases for estimator and transformers
for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
fitted_transformer = deepcopy(transformer).fit(X, y)
X = fitted_transformer.transform(X)
self.steps[step_idx] = (name, fitted_transformer)

final_estimator = self.steps[-1][1]
final_estimator.fit(X,y)
return self

def predict(self, X: TensorMap) -> TensorMap:
# TODO check they are fitted, for this we require base clases for estimator and transformers
for _, transformer in self.steps[:-1]:
X = transformer.transform(X)
return self.steps[-1][1].predict(X)

def score(self, X: TensorMap, y: TensorMap, parameter_key: Optional[str]) -> float:
# COMMENT we use scorer from last estimator, maybe not what we want
for _, transformer in self.steps[:-1]:
X = transformer.transform(X)
final_estimator = self.steps[-1][1]
# COMMENT: what do you think about using default parameter_keys?
if parameter_key is None:
scores = np.zeros(len(final_estimator.parameter_keys))
for i, parameter_key in enumerate(final_estimator.parameter_keys):
scores[i] = final_estimator.score(X, y, parameter_key)[0]
return np.mean(scores)
else:
return final_estimator.score(X, y, parameter_key)
14 changes: 10 additions & 4 deletions src/equisolve/numpy/models/linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def __init__(

self.coef_ = None

self.alpha = tensor_map_to_dict(self.alpha)

def _validate_data(self, X: TensorMap, y: Optional[TensorMap] = None) -> None:
"""Validates :class:`equistore.TensorBlock`'s for the usage in models.

Expand Down Expand Up @@ -191,9 +193,10 @@ def fit(
)
coef_blocks.append(coef_block)

# Convert alpha to a dictionary to be used in external models.
self.alpha = tensor_map_to_dict(self.alpha)
self.coef_ = TensorMap(X.keys, coef_blocks)
# Convert alpha and coefs to a dictionary to be used in external models.
self.alpha = tensor_map_to_dict(self.alpha)
self.coef_ = tensor_map_to_dict(self.coef_)

return self

Expand All @@ -207,9 +210,12 @@ def predict(self, X: TensorMap) -> TensorMap:
if self.coef_ is None:
raise ValueError("No weights. Call fit method first.")

return dot(X, self.coef_)
self.coef_ = dict_to_tensor_map(self.coef_)
y_pred = dot(X, self.coef_)
self.coef_ = tensor_map_to_dict(self.coef_)
return y_pred

def score(self, X: TensorMap, y: TensorMap, parameter_key: str) -> List[float]:
def score(self, X: TensorMap, y: TensorMap, parameter_key: str) -> List[float]: # COMMENT why does it return list of floats if we just allow one paramater_key?
"""Return the coefficient of determination of the prediction.

:param X: Test samples
Expand Down
Loading