From 8a707fae9a44dcf359b105300420e788a0049bfa Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Wed, 20 Nov 2024 21:17:34 +0100 Subject: [PATCH 01/22] implement datatree and dataset in linear regression --- mesmer/stats/_linear_regression.py | 119 ++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 34 deletions(-) diff --git a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index efef8382..c7ed482f 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -1,9 +1,18 @@ -from collections.abc import Mapping - import numpy as np import xarray as xr +from datatree import DataTree, map_over_subtree + +from mesmer.core.datatree import ( + _extract_single_dataarray_from_dt, + collapse_datatree_into_dataset, +) +from mesmer.core.utils import ( + _check_dataarray_form, + _check_dataset_form, + _to_set, +) -from mesmer.core.utils import _check_dataarray_form, _check_dataset_form, _to_set +# TODO: deprecate predictor dicts? class LinearRegression: @@ -14,7 +23,7 @@ def __init__(self): def fit( self, - predictors: Mapping[str, xr.DataArray], + predictors: dict[str, xr.DataArray] | DataTree | xr.Dataset, target: xr.DataArray, dim: str, weights: xr.DataArray | None = None, @@ -25,9 +34,10 @@ def fit( Parameters ---------- - predictors : dict of xr.DataArray - A dict of DataArray objects used as predictors. Must be 1D and contain - `dim`. + predictors : dict of xr.DataArray | DataTree | xr.Dataset + A dict of DataArray objects used as predictors or a DataTree, holding each + predictor in a leaf. Each predictor must be 1D and contain `dim`. If predictors + is a xr.Dataset, it must have each predictor as a DataArray. target : xr.DataArray Target DataArray. Must be 2D and contain `dim`. 
dim : str @@ -52,16 +62,18 @@ def fit( def predict( self, - predictors: Mapping[str, xr.DataArray], + predictors: dict[str, xr.DataArray] | DataTree | xr.Dataset, exclude=None, - ): + ) -> xr.DataArray: """ Predict using the linear model. Parameters ---------- - predictors : dict of xr.DataArray - A dict of DataArray objects used as predictors. Must be 1D and contain `dim`. + predictors : dict of xr.DataArray | DataTree | xr.Dataset + A dict of ``DataArray`` objects used as predictors or a ``DataTree``, holding each + predictor in a leaf. Each predictor must be 1D and contain ``dim``. If predictors + is a ``xr.Dataset``, it must have each predictor as a ``DataArray``. exclude : str or set of str, default: None Set of variables to exclude in the prediction. May include ``"intercept"`` to initialize the prediction with 0. @@ -78,7 +90,7 @@ def predict( non_predictor_vars = {"intercept", "weights", "fit_intercept"} required_predictors = set(params.data_vars) - non_predictor_vars - exclude - available_predictors = set(predictors.keys()) + available_predictors = set(predictors.keys()) - exclude if required_predictors - available_predictors: missing = sorted(required_predictors - available_predictors) @@ -88,30 +100,47 @@ def predict( if available_predictors - required_predictors: superfluous = sorted(available_predictors - required_predictors) superfluous = "', '".join(superfluous) - raise ValueError(f"Superfluous predictors: '{superfluous}'") + raise ValueError( + f"Superfluous predictors: '{superfluous}', either params", + "for this predictor are missing or you forgot to add it to 'exclude'.", + ) if "intercept" in exclude: prediction = xr.zeros_like(params.intercept) else: prediction = params.intercept + # if predictors is a DataTree, rename all data variables to "pred" to avoid conflicts + if isinstance(predictors, DataTree) and not predictors.equals(DataTree()): + predictors = map_over_subtree( + lambda ds: ds.rename({var: "pred" for var in ds.data_vars}) + 
)(predictors) + for key in required_predictors: - prediction = prediction + predictors[key] * params[key] + prediction = (predictors[key] * params[key]).transpose() + prediction + + prediction = ( + _extract_single_dataarray_from_dt(prediction) + if isinstance(prediction, DataTree) + else prediction + ) - return prediction + return prediction.rename("prediction") def residuals( self, - predictors: Mapping[str, xr.DataArray], + predictors: dict[str, xr.DataArray] | DataTree | xr.Dataset, target: xr.DataArray, - ): + ) -> xr.DataArray: """ Calculate the residuals of the fitted linear model Parameters ---------- - predictors : dict of xr.DataArray - A dict of DataArray objects used as predictors. Must be 1D and contain `dim`. + predictors : dict of xr.DataArray | DataTree | xr.Dataset + A dict of DataArray objects used as predictors or a DataTree, holding each + predictor in a leaf. Each predictor must be 1D and contain `dim`. If predictors + is a xr.Dataset, it must have each predictor as a DataArray. target : xr.DataArray Target DataArray. Must be 2D and contain `dim`. @@ -126,7 +155,7 @@ def residuals( residuals = target - prediction - return residuals + return residuals.rename("residuals") @property def params(self): @@ -182,12 +211,12 @@ def to_netcdf(self, filename, **kwargs): Additional keyword arguments passed to ``xr.Dataset.to_netcf`` """ - params = self.params() + params = self.params params.to_netcdf(filename, **kwargs) def _fit_linear_regression_xr( - predictors: Mapping[str, xr.DataArray], + predictors: dict[str, xr.DataArray] | DataTree | xr.Dataset, target: xr.DataArray, dim: str, weights: xr.DataArray | None = None, @@ -198,8 +227,10 @@ def _fit_linear_regression_xr( Parameters ---------- - predictors : dict of xr.DataArray - A dict of DataArray objects used as predictors. Must be 1D and contain `dim`. 
+ predictors : dict of xr.DataArray | DataTree | xr.Dataset + A dict of DataArray objects used as predictors or a DataTree, holding each + predictor in a leaf. Each predictor must be 1D and contain `dim`. If predictors + is a xr.Dataset, it must have each predictor as a DataArray. target : xr.DataArray Target DataArray. Must be 2D and contain `dim`. dim : str @@ -217,8 +248,10 @@ def _fit_linear_regression_xr( individual DataArray. """ - if not isinstance(predictors, Mapping): - raise TypeError(f"predictors should be a dict, got {type(predictors)}.") + if not isinstance(predictors, dict | DataTree | xr.Dataset): + raise TypeError( + f"predictors should be a dict, DataTree or xr.Dataset, got {type(predictors)}." + ) if ("weights" in predictors) or ("intercept" in predictors): raise ValueError( @@ -229,14 +262,32 @@ def _fit_linear_regression_xr( raise ValueError("dim cannot currently be 'predictor'.") for key, pred in predictors.items(): + pred = ( + _extract_single_dataarray_from_dt(pred) + if isinstance(pred, DataTree) + else pred + ) _check_dataarray_form(pred, ndim=1, required_dims=dim, name=f"predictor: {key}") - predictors_concat = xr.concat( - tuple(predictors.values()), - dim="predictor", - join="exact", - coords="minimal", - ) + if isinstance(predictors, dict | xr.Dataset): + predictors_concat = xr.concat( + tuple(predictors.values()), + dim="predictor", + join="exact", + coords="minimal", + ) + predictors_concat = predictors_concat.assign_coords( + {"predictor": list(predictors.keys())} + ) + elif isinstance(predictors, DataTree): + # rename all data variables to "data" to avoid conflicts when concatenating + predictors = map_over_subtree( + lambda ds: ds.rename({var: "pred" for var in ds.data_vars}) + )(predictors) + predictors_concat = collapse_datatree_into_dataset(predictors, dim="predictor") + predictors_concat = ( + predictors_concat.to_array().isel(variable=0).drop_vars("variable") + ) _check_dataarray_form(target, required_dims=dim, name="target") 
@@ -267,7 +318,7 @@ def _fit_linear_regression_xr( target = target.drop_vars(target[dim].coords) # split `out` into individual DataArrays - keys = ["intercept"] + list(predictors) + keys = ["intercept"] + list(predictors_concat.coords["predictor"].values) data_vars = {key: (target_dim, out[:, i]) for i, key in enumerate(keys)} out = xr.Dataset(data_vars, coords=target.coords) @@ -318,4 +369,4 @@ def _fit_linear_regression_np(predictors, target, weights=None, fit_intercept=Tr if not fit_intercept: intercepts = np.zeros_like(coefficients[:, :1]) - return np.hstack([intercepts, coefficients]) + return np.hstack([intercepts, coefficients]) \ No newline at end of file From 88b6b75d9027f2e6625b9b4a04bf3f3917ffdbda Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Wed, 20 Nov 2024 21:18:34 +0100 Subject: [PATCH 02/22] tests --- tests/unit/test_linear_regression.py | 218 ++++++++++++++++++++++----- 1 file changed, 182 insertions(+), 36 deletions(-) diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index 46bc43e0..56682287 100644 --- a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -4,11 +4,24 @@ import numpy.testing as npt import pytest import xarray as xr +from datatree import DataTree import mesmer from mesmer.testing import trend_data_1D, trend_data_2D +def to_dict(data_dict): + return data_dict + + +def to_datatree(data_dict): + return DataTree.from_dict(data_dict) + + +def to_xr_dataset(data_dict): + return xr.Dataset(data_dict) + + def trend_data_1D_or_2D(as_2D, slope, scale, intercept): if as_2D: return trend_data_2D(slope=slope, scale=scale, intercept=intercept) @@ -79,7 +92,8 @@ def test_lr_params(): @pytest.mark.parametrize("as_2D", [True, False]) -def test_lr_predict(as_2D): +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_lr_predict(as_2D, data_structure): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -87,16 +101,65 @@ 
def test_lr_predict(as_2D): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time") + tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") + pred = data_structure({"tas": tas}) - result = lr.predict({"tas": tas}) - expected = xr.DataArray([[5, 8, 11]], dims=("x", "time")) + result = lr.predict(pred) + expected = xr.DataArray([[5, 8, 11]], dims=("x", "time")).rename("tas") + expected = expected if as_2D else expected.squeeze() + xr.testing.assert_equal(result, expected) + + +@pytest.mark.parametrize("as_2D", [True, False]) +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_lr_predict_two_predictors(as_2D, data_structure): + lr = mesmer.stats.LinearRegression() + + params = xr.Dataset( + data_vars={ + "intercept": ("x", [5]), + "fit_intercept": True, + "tas": ("x", [3]), + "tas2": ("x", [1]), + } + ) + lr.params = params if as_2D else params.squeeze() + + tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") + pred = data_structure({"tas": tas, "tas2": tas.rename("tas2")}) + + result = lr.predict(pred) + expected = xr.DataArray([[5, 9, 13]], dims=("x", "time")).rename("tas") expected = expected if as_2D else expected.squeeze() + xr.testing.assert_equal(result, expected) + + +@pytest.mark.parametrize("as_2D", [True, False]) +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_lr_predict_two_predictors_diffnames(as_2D, data_structure): + lr = mesmer.stats.LinearRegression() + + params = xr.Dataset( + data_vars={ + "intercept": ("x", [5]), + "fit_intercept": True, + "tas": ("x", [3]), + "tas2": ("x", [1]), + } + ) + lr.params = params if as_2D else params.squeeze() + + tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") + pred = data_structure({"tas": tas, "tas2": tas}) + result = lr.predict(pred) + expected = xr.DataArray([[5, 9, 13]], dims=("x", "time")).rename("tas") + expected = expected if as_2D else expected.squeeze() 
xr.testing.assert_equal(result, expected) -def test_lr_predict_missing_superfluous(): +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_lr_predict_missing_superfluous(data_structure): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -108,22 +171,30 @@ def test_lr_predict_missing_superfluous(): } ) lr.params = params + tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") with pytest.raises(ValueError, match="Missing predictors: 'tas', 'tas2'"): - lr.predict({}) + lr.predict(data_structure({})) with pytest.raises(ValueError, match="Missing predictors: 'tas'"): - lr.predict({"tas2": None}) + lr.predict(data_structure({"tas2": None})) + + with pytest.raises(ValueError, match="Superfluous predictors: 'something else'"): + lr.predict(data_structure({"tas": tas, "tas2": tas, "something else": None})) with pytest.raises(ValueError, match="Superfluous predictors: 'something else'"): - lr.predict({"tas": None, "tas2": None, "something else": None}) + lr.predict( + data_structure({"tas": tas, "tas2": tas, "something else": None}), + exclude="tas2", + ) with pytest.raises(ValueError, match="Superfluous predictors: 'bar', 'foo'"): - lr.predict({"tas": None, "tas2": None, "foo": None, "bar": None}) + lr.predict(data_structure({"tas": tas, "tas2": tas, "foo": None, "bar": None})) @pytest.mark.parametrize("as_2D", [True, False]) -def test_lr_predict_exclude(as_2D): +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_lr_predict_exclude(as_2D, data_structure): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -136,21 +207,21 @@ def test_lr_predict_exclude(as_2D): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time") + tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") - result = lr.predict({"tas": tas}, exclude="tas2") + result = lr.predict(data_structure({"tas": tas}), exclude="tas2") expected = xr.DataArray([[5, 8, 11]], 
dims=("x", "time")) expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) - result = lr.predict({"tas": tas}, exclude={"tas2"}) + result = lr.predict(data_structure({"tas": tas}), exclude={"tas2"}) expected = xr.DataArray([[5, 8, 11]], dims=("x", "time")) expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) - result = lr.predict({}, exclude={"tas", "tas2"}) + result = lr.predict(data_structure({}), exclude={"tas", "tas2"}) expected = xr.DataArray([5], dims="x") expected = expected if as_2D else expected.squeeze() @@ -158,7 +229,8 @@ def test_lr_predict_exclude(as_2D): @pytest.mark.parametrize("as_2D", [True, False]) -def test_lr_predict_exclude_intercept(as_2D): +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_lr_predict_exclude_intercept(as_2D, data_structure): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -170,15 +242,15 @@ def test_lr_predict_exclude_intercept(as_2D): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time") + tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") - result = lr.predict({"tas": tas}, exclude="intercept") + result = lr.predict(data_structure({"tas": tas}), exclude="intercept") expected = xr.DataArray([[0, 3, 6]], dims=("x", "time")) expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) - result = lr.predict({}, exclude={"tas", "intercept"}) + result = lr.predict(data_structure({}), exclude={"tas", "intercept"}) expected = xr.DataArray([0], dims="x") expected = expected if as_2D else expected.squeeze() @@ -288,20 +360,36 @@ def test_missing_dim(pred0, pred1, tgt, weights, name): with pytest.raises(ValueError, match="dim cannot currently be 'predictor'."): lr_method_or_function({"pred0": pred0}, tgt, dim="predictor") + # test predictors have to be dict or DataTree + with pytest.raises( + TypeError, + 
match="predictors should be a dict, DataTree or xr.Dataset, got .", + ): + lr_method_or_function([pred0, pred1], tgt, dim="time") + + # test DataTree depth + with pytest.raises(ValueError, match="DataTree must only contain one node."): + lr_method_or_function( + DataTree.from_dict({"scen0": DataTree.from_dict({"pred1": pred1})}), + tgt, + dim="time", + ) + @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("intercept", (0, 3.14)) @pytest.mark.parametrize("slope", (0, 3.14)) @pytest.mark.parametrize("as_2D", [True, False]) +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) def test_linear_regression_one_predictor( - lr_method_or_function, intercept, slope, as_2D + lr_method_or_function, intercept, slope, as_2D, data_structure ): pred0 = trend_data_1D(slope=1, scale=0) tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) - result = lr_method_or_function({"pred0": pred0}, tgt, "time") + result = lr_method_or_function(data_structure({"pred0": pred0}), tgt, "time") template = tgt.isel(time=0, drop=True) @@ -320,12 +408,16 @@ def test_linear_regression_one_predictor( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -def test_linear_regression_predictor_named_like_dim(lr_method_or_function, as_2D): - +@pytest.mark.parametrize("data_structure", [to_dict]) +def test_linear_regression_predictor_named_like_dim( + lr_method_or_function, as_2D, data_structure +): + # cannot be DataTree, becaue data_var cannot have the same name as coord + # cannot be Dataset because we need pred as a data_variable not coord slope, intercept = 0.3, 0.2 tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) - result = lr_method_or_function({"time": tgt.time}, tgt, "time") + result = lr_method_or_function(data_structure({"time": tgt.time}), tgt, "time") template = tgt.isel(time=0, drop=True) 
expected_intercept = xr.full_like(template, intercept) @@ -343,13 +435,17 @@ def test_linear_regression_predictor_named_like_dim(lr_method_or_function, as_2D @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -def test_linear_regression_predictor_has_non_dim_coors(lr_method_or_function, as_2D): - +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_linear_regression_predictor_has_non_dim_coors( + lr_method_or_function, as_2D, data_structure +): slope, intercept = 0.3, 0.2 tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) tgt = tgt.assign_coords(year=("time", tgt.time.values + 1850)) - result = lr_method_or_function({"pred0": tgt.time}, tgt, "time") + result = lr_method_or_function( + data_structure({"pred0": tgt.time.rename("pred0")}), tgt, "time" + ) template = tgt.isel(time=0, drop=True) expected_intercept = xr.full_like(template, intercept) @@ -367,12 +463,15 @@ def test_linear_regression_predictor_has_non_dim_coors(lr_method_or_function, as @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -def test_linear_regression_fit_intercept(lr_method_or_function, as_2D): +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_linear_regression_fit_intercept(lr_method_or_function, as_2D, data_structure): pred0 = trend_data_1D(slope=1, scale=0) tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=1, scale=0, intercept=1) - result = lr_method_or_function({"pred0": pred0}, tgt, "time", fit_intercept=False) + result = lr_method_or_function( + data_structure({"pred0": pred0}), tgt, "time", fit_intercept=False + ) template = tgt.isel(time=0, drop=True) @@ -391,7 +490,9 @@ def test_linear_regression_fit_intercept(lr_method_or_function, as_2D): @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) 
@pytest.mark.parametrize("as_2D", [True, False]) -def test_linear_regression_no_coords(lr_method_or_function, as_2D): +@pytest.mark.parametrize("data_structure", [to_dict, to_xr_dataset]) +def test_linear_regression_no_coords(lr_method_or_function, as_2D, data_structure): + # does not work with DataTree due to missing coords in collapse_datatree_into_dataset slope, intercept = 3.14, 3.14 @@ -402,7 +503,7 @@ def test_linear_regression_no_coords(lr_method_or_function, as_2D): pred0 = pred0.drop_vars(pred0.coords.keys()) tgt = tgt.drop_vars(tgt.coords.keys()) - result = lr_method_or_function({"pred0": pred0}, tgt, "time") + result = lr_method_or_function(data_structure({"pred0": pred0}), tgt, "time") template = tgt.isel(time=0, drop=True) @@ -423,15 +524,18 @@ def test_linear_regression_no_coords(lr_method_or_function, as_2D): @pytest.mark.parametrize("intercept", (0, 3.14)) @pytest.mark.parametrize("slope", (0, 3.14)) @pytest.mark.parametrize("as_2D", [True, False]) +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) def test_linear_regression_two_predictors( - lr_method_or_function, intercept, slope, as_2D + lr_method_or_function, intercept, slope, as_2D, data_structure ): pred0 = trend_data_1D(slope=1, scale=0) pred1 = trend_data_1D(slope=1, scale=0) tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) - result = lr_method_or_function({"pred0": pred0, "pred1": pred1}, tgt, "time") + result = lr_method_or_function( + data_structure({"pred0": pred0, "pred1": pred1}), tgt, "time" + ) template = tgt.isel(time=0, drop=True) @@ -452,7 +556,46 @@ def test_linear_regression_two_predictors( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) -def test_linear_regression_two_predictors_extra_dim(lr_method_or_function): +@pytest.mark.parametrize("intercept", (0, 3.14)) +@pytest.mark.parametrize("slope", (0, 3.14)) +@pytest.mark.parametrize("as_2D", [True, False]) 
+@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_linear_regression_two_predictors_diffnames( + lr_method_or_function, intercept, slope, as_2D, data_structure +): + + pred0 = trend_data_1D(slope=1, scale=0).rename("bar") + pred1 = trend_data_1D(slope=1, scale=0).rename("foo") + tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) + + result = lr_method_or_function( + data_structure({"pred0": pred0, "pred1": pred1}), tgt, "time" + ) + + template = tgt.isel(time=0, drop=True) + + expected_intercept = xr.full_like(template, intercept) + expected_pred0 = xr.full_like(template, slope / 2) + expected_pred1 = xr.full_like(template, slope / 2) + + expected = xr.Dataset( + { + "intercept": expected_intercept, + "pred0": expected_pred0, + "pred1": expected_pred1, + "fit_intercept": True, + } + ) + + xr.testing.assert_allclose(result, expected) + + +@pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) +@pytest.mark.parametrize("data_structure", [to_dict, to_xr_dataset]) +def test_linear_regression_two_predictors_extra_dim( + lr_method_or_function, data_structure +): + # does not work with datatree because we have not implemented minimal coords # add a 0D dimension/ coordinate and ensure it still works # NOTE: requires 3 predictors to trigger the error (might be an xarray issue) @@ -467,7 +610,7 @@ def test_linear_regression_two_predictors_extra_dim(lr_method_or_function): tgt = trend_data_2D(slope=slope, scale=0, intercept=intercept) result = lr_method_or_function( - {"pred0": pred0, "pred1": pred1, "pred2": pred0}, tgt, "time" + data_structure({"pred0": pred0, "pred1": pred1, "pred2": pred0}), tgt, "time" ) template = tgt.isel(time=0, drop=True) @@ -492,7 +635,8 @@ def test_linear_regression_two_predictors_extra_dim(lr_method_or_function): @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("intercept", (0, 3.14)) -def 
test_linear_regression_weights(lr_method_or_function, intercept): +@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +def test_linear_regression_weights(lr_method_or_function, intercept, data_structure): pred0 = trend_data_1D(slope=1, scale=0) tgt = trend_data_2D(slope=1, scale=0, intercept=intercept) @@ -500,7 +644,9 @@ def test_linear_regression_weights(lr_method_or_function, intercept): weights = trend_data_1D(intercept=0, slope=0, scale=0) weights[0] = 1 - result = lr_method_or_function({"pred0": pred0}, tgt, "time", weights=weights) + result = lr_method_or_function( + data_structure({"pred0": pred0}), tgt, "time", weights=weights + ) template = tgt.isel(time=0, drop=True) From d8740fe22b0ac541d4caddee196686bbf8623766 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Thu, 21 Nov 2024 09:59:51 +0100 Subject: [PATCH 03/22] nits --- mesmer/stats/_linear_regression.py | 2 -- mesmer/testing.py | 6 +++--- tests/unit/test_linear_regression.py | 1 + 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index c7ed482f..9d8e2d90 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -12,8 +12,6 @@ _to_set, ) -# TODO: deprecate predictor dicts? 
- class LinearRegression: """Ordinary least squares Linear Regression for xr.DataArray objects.""" diff --git a/mesmer/testing.py b/mesmer/testing.py index 7dffdd3d..7b09c5a6 100644 --- a/mesmer/testing.py +++ b/mesmer/testing.py @@ -77,7 +77,7 @@ def assert_dict_allclose(first, second, first_name="left", second_name="right"): assert first_val == second_val, key -def trend_data_1D(n_timesteps=30, intercept=0, slope=1, scale=1): +def trend_data_1D(n_timesteps=30, intercept=0., slope=1., scale=1.): time = np.arange(n_timesteps) @@ -89,7 +89,7 @@ def trend_data_1D(n_timesteps=30, intercept=0, slope=1, scale=1): return xr.DataArray(data, dims=("time"), coords={"time": time}, name="data") -def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale=1): +def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0., slope=1., scale=1.): n_cells = n_lat * n_lon time = np.arange(n_timesteps) @@ -109,7 +109,7 @@ def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale= return xr.DataArray(data, dims=("cells", "time"), coords=coords, name="data") -def trend_data_3D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale=1): +def trend_data_3D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0., slope=1., scale=1.): data = trend_data_2D( n_timesteps=n_timesteps, diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index 56682287..c5bd8548 100644 --- a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -7,6 +7,7 @@ from datatree import DataTree import mesmer +import mesmer.stats._linear_regression from mesmer.testing import trend_data_1D, trend_data_2D From 546236d9384bae3e8938414baa1524583b977d6a Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Thu, 21 Nov 2024 10:00:15 +0100 Subject: [PATCH 04/22] linting --- mesmer/stats/_linear_regression.py | 2 +- mesmer/testing.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git 
a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index 9d8e2d90..9b1e91b6 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -367,4 +367,4 @@ def _fit_linear_regression_np(predictors, target, weights=None, fit_intercept=Tr if not fit_intercept: intercepts = np.zeros_like(coefficients[:, :1]) - return np.hstack([intercepts, coefficients]) \ No newline at end of file + return np.hstack([intercepts, coefficients]) diff --git a/mesmer/testing.py b/mesmer/testing.py index 7b09c5a6..6fbb4856 100644 --- a/mesmer/testing.py +++ b/mesmer/testing.py @@ -77,7 +77,7 @@ def assert_dict_allclose(first, second, first_name="left", second_name="right"): assert first_val == second_val, key -def trend_data_1D(n_timesteps=30, intercept=0., slope=1., scale=1.): +def trend_data_1D(n_timesteps=30, intercept=0.0, slope=1.0, scale=1.0): time = np.arange(n_timesteps) @@ -89,7 +89,9 @@ def trend_data_1D(n_timesteps=30, intercept=0., slope=1., scale=1.): return xr.DataArray(data, dims=("time"), coords={"time": time}, name="data") -def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0., slope=1., scale=1.): +def trend_data_2D( + n_timesteps=30, n_lat=3, n_lon=2, intercept=0.0, slope=1.0, scale=1.0 +): n_cells = n_lat * n_lon time = np.arange(n_timesteps) @@ -109,7 +111,9 @@ def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0., slope=1., scal return xr.DataArray(data, dims=("cells", "time"), coords=coords, name="data") -def trend_data_3D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0., slope=1., scale=1.): +def trend_data_3D( + n_timesteps=30, n_lat=3, n_lon=2, intercept=0.0, slope=1.0, scale=1.0 +): data = trend_data_2D( n_timesteps=n_timesteps, From 521067eaffd7bedff38443200040ee8ee2565631 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Thu, 21 Nov 2024 12:01:17 +0100 Subject: [PATCH 05/22] nits --- mesmer/stats/_linear_regression.py | 2 +- tests/unit/test_linear_regression.py | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index 9b1e91b6..5a6b78bb 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -278,7 +278,7 @@ def _fit_linear_regression_xr( {"predictor": list(predictors.keys())} ) elif isinstance(predictors, DataTree): - # rename all data variables to "data" to avoid conflicts when concatenating + # rename all data variables to "pred" to avoid conflicts when concatenating predictors = map_over_subtree( lambda ds: ds.rename({var: "pred" for var in ds.data_vars}) )(predictors) diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index c5bd8548..1be7ab73 100644 --- a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -369,7 +369,7 @@ def test_missing_dim(pred0, pred1, tgt, weights, name): lr_method_or_function([pred0, pred1], tgt, dim="time") # test DataTree depth - with pytest.raises(ValueError, match="DataTree must only contain one node."): + with pytest.raises(ValueError, match="DataTree must contain data."): lr_method_or_function( DataTree.from_dict({"scen0": DataTree.from_dict({"pred1": pred1})}), tgt, From 1e101e56ba43c241aa91c83d2191e093a7041a4a Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Thu, 21 Nov 2024 12:14:01 +0100 Subject: [PATCH 06/22] fixes --- mesmer/create_emulations/create_emus_lv.py | 2 +- mesmer/create_emulations/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mesmer/create_emulations/create_emus_lv.py b/mesmer/create_emulations/create_emus_lv.py index c9d9074b..36f1212d 100644 --- a/mesmer/create_emulations/create_emus_lv.py +++ b/mesmer/create_emulations/create_emus_lv.py @@ -271,6 +271,6 @@ def create_emus_lv_OLS(params_lv, preds_lv): lr.params = params prediction = lr.predict(predictors=preds) - emus_lv[scen][targ] = prediction.values + emus_lv[scen][targ] = 
prediction.values.transpose() return emus_lv diff --git a/mesmer/create_emulations/utils.py b/mesmer/create_emulations/utils.py index a7293eff..b2e3c3ee 100644 --- a/mesmer/create_emulations/utils.py +++ b/mesmer/create_emulations/utils.py @@ -114,7 +114,7 @@ def _gather_lr_params(params_dict, targ, dims): intercept = xr.DataArray(params_dict["intercept"][targ], dims=dims) fit_intercept = True else: - intercept = 0 + intercept = xr.zeros_like(params[pred]) fit_intercept = False params["intercept"] = intercept From 05eb6cc6818f50f35738d126d13d68eda8caf22b Mon Sep 17 00:00:00 2001 From: Victoria <112418493+veni-vidi-vici-dormivi@users.noreply.github.com> Date: Fri, 22 Nov 2024 18:03:47 +0100 Subject: [PATCH 07/22] Update mesmer/stats/_linear_regression.py Co-authored-by: Mathias Hauser --- mesmer/stats/_linear_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index 5a6b78bb..7f30c9b6 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -71,7 +71,7 @@ def predict( predictors : dict of xr.DataArray | DataTree | xr.Dataset A dict of ``DataArray`` objects used as predictors or a ``DataTree``, holding each predictor in a leaf. Each predictor must be 1D and contain ``dim``. If predictors - is a ``xr.Dataset``, it must have each predictor as a ``DataArray``. + is a ``xr.Dataset``, it must have each predictor as a single ``DataArray``. exclude : str or set of str, default: None Set of variables to exclude in the prediction. May include ``"intercept"`` to initialize the prediction with 0. 
From e23db8c8ddc57346863f63752bdb5864dadfb8e3 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Mon, 25 Nov 2024 16:22:23 +0100 Subject: [PATCH 08/22] convert_to --- tests/unit/test_linear_regression.py | 123 +++++++++++++-------------- 1 file changed, 61 insertions(+), 62 deletions(-) diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index 1be7ab73..b39a9dac 100644 --- a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -11,16 +11,15 @@ from mesmer.testing import trend_data_1D, trend_data_2D -def to_dict(data_dict): - return data_dict - - -def to_datatree(data_dict): - return DataTree.from_dict(data_dict) - - -def to_xr_dataset(data_dict): - return xr.Dataset(data_dict) +def convert_to(dct: dict, data_type: str) -> dict | DataTree | xr.Dataset: + if data_type == "dict": + return dct + elif data_type == "DataTree": + return DataTree.from_dict(dct) + elif data_type == "xr_dataset": + return xr.Dataset(dct) + else: + raise ValueError(f"Unknown data_type: {data_type}") def trend_data_1D_or_2D(as_2D, slope, scale, intercept): @@ -93,8 +92,8 @@ def test_lr_params(): @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) -def test_lr_predict(as_2D, data_structure): +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +def test_lr_predict(as_2D, data_type): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -103,7 +102,7 @@ def test_lr_predict(as_2D, data_structure): lr.params = params if as_2D else params.squeeze() tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") - pred = data_structure({"tas": tas}) + pred = convert_to({"tas": tas}, data_type) result = lr.predict(pred) expected = xr.DataArray([[5, 8, 11]], dims=("x", "time")).rename("tas") @@ -112,8 +111,8 @@ def test_lr_predict(as_2D, data_structure): @pytest.mark.parametrize("as_2D", [True, False]) 
-@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) -def test_lr_predict_two_predictors(as_2D, data_structure): +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +def test_lr_predict_two_predictors(as_2D, data_type): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -127,7 +126,7 @@ def test_lr_predict_two_predictors(as_2D, data_structure): lr.params = params if as_2D else params.squeeze() tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") - pred = data_structure({"tas": tas, "tas2": tas.rename("tas2")}) + pred = convert_to({"tas": tas, "tas2": tas.rename("tas2")}, data_type) result = lr.predict(pred) expected = xr.DataArray([[5, 9, 13]], dims=("x", "time")).rename("tas") @@ -136,8 +135,8 @@ def test_lr_predict_two_predictors(as_2D, data_structure): @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) -def test_lr_predict_two_predictors_diffnames(as_2D, data_structure): +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +def test_lr_predict_two_predictors_diffnames(as_2D, data_type): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -151,7 +150,7 @@ def test_lr_predict_two_predictors_diffnames(as_2D, data_structure): lr.params = params if as_2D else params.squeeze() tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") - pred = data_structure({"tas": tas, "tas2": tas}) + pred = convert_to({"tas": tas, "tas2": tas}, data_type) result = lr.predict(pred) expected = xr.DataArray([[5, 9, 13]], dims=("x", "time")).rename("tas") @@ -159,8 +158,8 @@ def test_lr_predict_two_predictors_diffnames(as_2D, data_structure): xr.testing.assert_equal(result, expected) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) -def test_lr_predict_missing_superfluous(data_structure): +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +def 
test_lr_predict_missing_superfluous(data_type): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -175,27 +174,27 @@ def test_lr_predict_missing_superfluous(data_structure): tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") with pytest.raises(ValueError, match="Missing predictors: 'tas', 'tas2'"): - lr.predict(data_structure({})) + lr.predict(convert_to({}, data_type)) with pytest.raises(ValueError, match="Missing predictors: 'tas'"): - lr.predict(data_structure({"tas2": None})) + lr.predict(convert_to({"tas2": None}, data_type)) with pytest.raises(ValueError, match="Superfluous predictors: 'something else'"): - lr.predict(data_structure({"tas": tas, "tas2": tas, "something else": None})) + lr.predict(convert_to({"tas": tas, "tas2": tas, "something else": None}, data_type)) with pytest.raises(ValueError, match="Superfluous predictors: 'something else'"): lr.predict( - data_structure({"tas": tas, "tas2": tas, "something else": None}), + convert_to({"tas": tas, "tas2": tas, "something else": None}, data_type), exclude="tas2", ) with pytest.raises(ValueError, match="Superfluous predictors: 'bar', 'foo'"): - lr.predict(data_structure({"tas": tas, "tas2": tas, "foo": None, "bar": None})) + lr.predict(convert_to({"tas": tas, "tas2": tas, "foo": None, "bar": None}, data_type)) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) -def test_lr_predict_exclude(as_2D, data_structure): +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +def test_lr_predict_exclude(as_2D, data_type): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -210,19 +209,19 @@ def test_lr_predict_exclude(as_2D, data_structure): tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") - result = lr.predict(data_structure({"tas": tas}), exclude="tas2") + result = lr.predict(convert_to({"tas": tas}, data_type), exclude="tas2") expected = xr.DataArray([[5, 8, 11]], dims=("x", 
"time")) expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) - result = lr.predict(data_structure({"tas": tas}), exclude={"tas2"}) + result = lr.predict(convert_to({"tas": tas}, data_type), exclude={"tas2"}) expected = xr.DataArray([[5, 8, 11]], dims=("x", "time")) expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) - result = lr.predict(data_structure({}), exclude={"tas", "tas2"}) + result = lr.predict(convert_to({}, data_type), exclude={"tas", "tas2"}) expected = xr.DataArray([5], dims="x") expected = expected if as_2D else expected.squeeze() @@ -230,8 +229,8 @@ def test_lr_predict_exclude(as_2D, data_structure): @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) -def test_lr_predict_exclude_intercept(as_2D, data_structure): +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +def test_lr_predict_exclude_intercept(as_2D, data_type): lr = mesmer.stats.LinearRegression() params = xr.Dataset( @@ -245,13 +244,13 @@ def test_lr_predict_exclude_intercept(as_2D, data_structure): tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") - result = lr.predict(data_structure({"tas": tas}), exclude="intercept") + result = lr.predict(convert_to({"tas": tas}, data_type), exclude="intercept") expected = xr.DataArray([[0, 3, 6]], dims=("x", "time")) expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) - result = lr.predict(data_structure({}), exclude={"tas", "intercept"}) + result = lr.predict(convert_to({}, data_type), exclude={"tas", "intercept"}) expected = xr.DataArray([0], dims="x") expected = expected if as_2D else expected.squeeze() @@ -381,16 +380,16 @@ def test_missing_dim(pred0, pred1, tgt, weights, name): @pytest.mark.parametrize("intercept", (0, 3.14)) @pytest.mark.parametrize("slope", (0, 3.14)) @pytest.mark.parametrize("as_2D", [True, 
False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) def test_linear_regression_one_predictor( - lr_method_or_function, intercept, slope, as_2D, data_structure + lr_method_or_function, intercept, slope, as_2D, data_type ): pred0 = trend_data_1D(slope=1, scale=0) tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) - result = lr_method_or_function(data_structure({"pred0": pred0}), tgt, "time") + result = lr_method_or_function(convert_to({"pred0": pred0}, data_type), tgt, "time") template = tgt.isel(time=0, drop=True) @@ -409,16 +408,16 @@ def test_linear_regression_one_predictor( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict]) +@pytest.mark.parametrize("data_type", ['dict']) def test_linear_regression_predictor_named_like_dim( - lr_method_or_function, as_2D, data_structure + lr_method_or_function, as_2D, data_type ): # cannot be DataTree, becaue data_var cannot have the same name as coord # cannot be Dataset because we need pred as a data_variable not coord slope, intercept = 0.3, 0.2 tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) - result = lr_method_or_function(data_structure({"time": tgt.time}), tgt, "time") + result = lr_method_or_function(convert_to({"time": tgt.time}, data_type), tgt, "time") template = tgt.isel(time=0, drop=True) expected_intercept = xr.full_like(template, intercept) @@ -436,16 +435,16 @@ def test_linear_regression_predictor_named_like_dim( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) def 
test_linear_regression_predictor_has_non_dim_coors( - lr_method_or_function, as_2D, data_structure + lr_method_or_function, as_2D, data_type ): slope, intercept = 0.3, 0.2 tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) tgt = tgt.assign_coords(year=("time", tgt.time.values + 1850)) result = lr_method_or_function( - data_structure({"pred0": tgt.time.rename("pred0")}), tgt, "time" + convert_to({"pred0": tgt.time.rename("pred0")}, data_type), tgt, "time" ) template = tgt.isel(time=0, drop=True) @@ -464,14 +463,14 @@ def test_linear_regression_predictor_has_non_dim_coors( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) -def test_linear_regression_fit_intercept(lr_method_or_function, as_2D, data_structure): +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +def test_linear_regression_fit_intercept(lr_method_or_function, as_2D, data_type): pred0 = trend_data_1D(slope=1, scale=0) tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=1, scale=0, intercept=1) result = lr_method_or_function( - data_structure({"pred0": pred0}), tgt, "time", fit_intercept=False + convert_to({"pred0": pred0}, data_type), tgt, "time", fit_intercept=False ) template = tgt.isel(time=0, drop=True) @@ -491,8 +490,8 @@ def test_linear_regression_fit_intercept(lr_method_or_function, as_2D, data_stru @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_xr_dataset]) -def test_linear_regression_no_coords(lr_method_or_function, as_2D, data_structure): +@pytest.mark.parametrize("data_type", ['dict', 'xr_dataset']) +def test_linear_regression_no_coords(lr_method_or_function, as_2D, data_type): # does not work with DataTree due to missing coords in collapse_datatree_into_dataset 
slope, intercept = 3.14, 3.14 @@ -504,7 +503,7 @@ def test_linear_regression_no_coords(lr_method_or_function, as_2D, data_structur pred0 = pred0.drop_vars(pred0.coords.keys()) tgt = tgt.drop_vars(tgt.coords.keys()) - result = lr_method_or_function(data_structure({"pred0": pred0}), tgt, "time") + result = lr_method_or_function(convert_to({"pred0": pred0}, data_type), tgt, "time") template = tgt.isel(time=0, drop=True) @@ -525,9 +524,9 @@ def test_linear_regression_no_coords(lr_method_or_function, as_2D, data_structur @pytest.mark.parametrize("intercept", (0, 3.14)) @pytest.mark.parametrize("slope", (0, 3.14)) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) def test_linear_regression_two_predictors( - lr_method_or_function, intercept, slope, as_2D, data_structure + lr_method_or_function, intercept, slope, as_2D, data_type ): pred0 = trend_data_1D(slope=1, scale=0) @@ -535,7 +534,7 @@ def test_linear_regression_two_predictors( tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) result = lr_method_or_function( - data_structure({"pred0": pred0, "pred1": pred1}), tgt, "time" + convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, "time" ) template = tgt.isel(time=0, drop=True) @@ -560,9 +559,9 @@ def test_linear_regression_two_predictors( @pytest.mark.parametrize("intercept", (0, 3.14)) @pytest.mark.parametrize("slope", (0, 3.14)) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) def test_linear_regression_two_predictors_diffnames( - lr_method_or_function, intercept, slope, as_2D, data_structure + lr_method_or_function, intercept, slope, as_2D, data_type ): pred0 = trend_data_1D(slope=1, scale=0).rename("bar") @@ -570,7 
+569,7 @@ def test_linear_regression_two_predictors_diffnames( tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) result = lr_method_or_function( - data_structure({"pred0": pred0, "pred1": pred1}), tgt, "time" + convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, "time" ) template = tgt.isel(time=0, drop=True) @@ -592,9 +591,9 @@ def test_linear_regression_two_predictors_diffnames( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) -@pytest.mark.parametrize("data_structure", [to_dict, to_xr_dataset]) +@pytest.mark.parametrize("data_type", ['dict', 'xr_dataset']) def test_linear_regression_two_predictors_extra_dim( - lr_method_or_function, data_structure + lr_method_or_function, data_type ): # does not work with datatree because we have not implemented minimal coords # add a 0D dimension/ coordinate and ensure it still works @@ -611,7 +610,7 @@ def test_linear_regression_two_predictors_extra_dim( tgt = trend_data_2D(slope=slope, scale=0, intercept=intercept) result = lr_method_or_function( - data_structure({"pred0": pred0, "pred1": pred1, "pred2": pred0}), tgt, "time" + convert_to({"pred0": pred0, "pred1": pred1, "pred2": pred0}, data_type), tgt, "time" ) template = tgt.isel(time=0, drop=True) @@ -636,8 +635,8 @@ def test_linear_regression_two_predictors_extra_dim( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("intercept", (0, 3.14)) -@pytest.mark.parametrize("data_structure", [to_dict, to_datatree, to_xr_dataset]) -def test_linear_regression_weights(lr_method_or_function, intercept, data_structure): +@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +def test_linear_regression_weights(lr_method_or_function, intercept, data_type): pred0 = trend_data_1D(slope=1, scale=0) tgt = trend_data_2D(slope=1, scale=0, intercept=intercept) @@ -646,7 +645,7 @@ def test_linear_regression_weights(lr_method_or_function, intercept, data_struct 
weights[0] = 1 result = lr_method_or_function( - data_structure({"pred0": pred0}), tgt, "time", weights=weights + convert_to({"pred0": pred0}, data_type), tgt, "time", weights=weights ) template = tgt.isel(time=0, drop=True) From 549276f31a05b02479495a75b4af47b01b5d2c3b Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Mon, 25 Nov 2024 16:26:33 +0100 Subject: [PATCH 09/22] replace renames --- tests/unit/test_linear_regression.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index b39a9dac..3d0228d3 100644 --- a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -101,11 +101,11 @@ def test_lr_predict(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") + tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") pred = convert_to({"tas": tas}, data_type) result = lr.predict(pred) - expected = xr.DataArray([[5, 8, 11]], dims=("x", "time")).rename("tas") + expected = xr.DataArray([[5, 8, 11]], dims=("x", "time"), name = "tas") expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) @@ -125,11 +125,11 @@ def test_lr_predict_two_predictors(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") + tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") pred = convert_to({"tas": tas, "tas2": tas.rename("tas2")}, data_type) result = lr.predict(pred) - expected = xr.DataArray([[5, 9, 13]], dims=("x", "time")).rename("tas") + expected = xr.DataArray([[5, 9, 13]], dims=("x", "time"), name = "tas") expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) @@ -149,11 +149,11 @@ def test_lr_predict_two_predictors_diffnames(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = 
xr.DataArray([0, 1, 2], dims="time").rename("tas") + tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") pred = convert_to({"tas": tas, "tas2": tas}, data_type) result = lr.predict(pred) - expected = xr.DataArray([[5, 9, 13]], dims=("x", "time")).rename("tas") + expected = xr.DataArray([[5, 9, 13]], dims=("x", "time"), name = "tas") expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) @@ -171,7 +171,7 @@ def test_lr_predict_missing_superfluous(data_type): } ) lr.params = params - tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") + tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") with pytest.raises(ValueError, match="Missing predictors: 'tas', 'tas2'"): lr.predict(convert_to({}, data_type)) @@ -207,7 +207,7 @@ def test_lr_predict_exclude(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") + tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") result = lr.predict(convert_to({"tas": tas}, data_type), exclude="tas2") expected = xr.DataArray([[5, 8, 11]], dims=("x", "time")) @@ -242,7 +242,7 @@ def test_lr_predict_exclude_intercept(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time").rename("tas") + tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") result = lr.predict(convert_to({"tas": tas}, data_type), exclude="intercept") expected = xr.DataArray([[0, 3, 6]], dims=("x", "time")) From 1fee407b55a3944dc823e84107b92937ca769817 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Mon, 25 Nov 2024 16:26:38 +0100 Subject: [PATCH 10/22] uninline --- mesmer/stats/_linear_regression.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index 7f30c9b6..90d540ce 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -117,11 
+117,8 @@ def predict( for key in required_predictors: prediction = (predictors[key] * params[key]).transpose() + prediction - prediction = ( - _extract_single_dataarray_from_dt(prediction) - if isinstance(prediction, DataTree) - else prediction - ) + if isinstance(prediction, DataTree): + prediction = _extract_single_dataarray_from_dt(prediction) return prediction.rename("prediction") @@ -260,11 +257,9 @@ def _fit_linear_regression_xr( raise ValueError("dim cannot currently be 'predictor'.") for key, pred in predictors.items(): - pred = ( - _extract_single_dataarray_from_dt(pred) - if isinstance(pred, DataTree) - else pred - ) + if isinstance(pred, DataTree): + pred = _extract_single_dataarray_from_dt(pred) + _check_dataarray_form(pred, ndim=1, required_dims=dim, name=f"predictor: {key}") if isinstance(predictors, dict | xr.Dataset): From 0b0587787d883239422f2d1e41345dd0d73fc921 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Mon, 25 Nov 2024 16:26:59 +0100 Subject: [PATCH 11/22] linting --- tests/unit/test_linear_regression.py | 68 +++++++++++++++------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index 3d0228d3..7249d721 100644 --- a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -92,7 +92,7 @@ def test_lr_params(): @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_lr_predict(as_2D, data_type): lr = mesmer.stats.LinearRegression() @@ -101,17 +101,17 @@ def test_lr_predict(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") + tas = xr.DataArray([0, 1, 2], dims="time", name="tas") pred = convert_to({"tas": tas}, data_type) result = lr.predict(pred) - expected = xr.DataArray([[5, 8, 
11]], dims=("x", "time"), name = "tas") + expected = xr.DataArray([[5, 8, 11]], dims=("x", "time"), name="tas") expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_lr_predict_two_predictors(as_2D, data_type): lr = mesmer.stats.LinearRegression() @@ -125,17 +125,17 @@ def test_lr_predict_two_predictors(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") + tas = xr.DataArray([0, 1, 2], dims="time", name="tas") pred = convert_to({"tas": tas, "tas2": tas.rename("tas2")}, data_type) result = lr.predict(pred) - expected = xr.DataArray([[5, 9, 13]], dims=("x", "time"), name = "tas") + expected = xr.DataArray([[5, 9, 13]], dims=("x", "time"), name="tas") expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_lr_predict_two_predictors_diffnames(as_2D, data_type): lr = mesmer.stats.LinearRegression() @@ -149,16 +149,16 @@ def test_lr_predict_two_predictors_diffnames(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") + tas = xr.DataArray([0, 1, 2], dims="time", name="tas") pred = convert_to({"tas": tas, "tas2": tas}, data_type) result = lr.predict(pred) - expected = xr.DataArray([[5, 9, 13]], dims=("x", "time"), name = "tas") + expected = xr.DataArray([[5, 9, 13]], dims=("x", "time"), name="tas") expected = expected if as_2D else expected.squeeze() xr.testing.assert_equal(result, expected) -@pytest.mark.parametrize("data_type", 
['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_lr_predict_missing_superfluous(data_type): lr = mesmer.stats.LinearRegression() @@ -171,7 +171,7 @@ def test_lr_predict_missing_superfluous(data_type): } ) lr.params = params - tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") + tas = xr.DataArray([0, 1, 2], dims="time", name="tas") with pytest.raises(ValueError, match="Missing predictors: 'tas', 'tas2'"): lr.predict(convert_to({}, data_type)) @@ -180,7 +180,9 @@ def test_lr_predict_missing_superfluous(data_type): lr.predict(convert_to({"tas2": None}, data_type)) with pytest.raises(ValueError, match="Superfluous predictors: 'something else'"): - lr.predict(convert_to({"tas": tas, "tas2": tas, "something else": None}, data_type)) + lr.predict( + convert_to({"tas": tas, "tas2": tas, "something else": None}, data_type) + ) with pytest.raises(ValueError, match="Superfluous predictors: 'something else'"): lr.predict( @@ -189,11 +191,13 @@ def test_lr_predict_missing_superfluous(data_type): ) with pytest.raises(ValueError, match="Superfluous predictors: 'bar', 'foo'"): - lr.predict(convert_to({"tas": tas, "tas2": tas, "foo": None, "bar": None}, data_type)) + lr.predict( + convert_to({"tas": tas, "tas2": tas, "foo": None, "bar": None}, data_type) + ) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_lr_predict_exclude(as_2D, data_type): lr = mesmer.stats.LinearRegression() @@ -207,7 +211,7 @@ def test_lr_predict_exclude(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") + tas = xr.DataArray([0, 1, 2], dims="time", name="tas") result = lr.predict(convert_to({"tas": tas}, data_type), exclude="tas2") expected = xr.DataArray([[5, 8, 11]], dims=("x", "time")) @@ -229,7 
+233,7 @@ def test_lr_predict_exclude(as_2D, data_type): @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_lr_predict_exclude_intercept(as_2D, data_type): lr = mesmer.stats.LinearRegression() @@ -242,7 +246,7 @@ def test_lr_predict_exclude_intercept(as_2D, data_type): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time", name = "tas") + tas = xr.DataArray([0, 1, 2], dims="time", name="tas") result = lr.predict(convert_to({"tas": tas}, data_type), exclude="intercept") expected = xr.DataArray([[0, 3, 6]], dims=("x", "time")) @@ -380,7 +384,7 @@ def test_missing_dim(pred0, pred1, tgt, weights, name): @pytest.mark.parametrize("intercept", (0, 3.14)) @pytest.mark.parametrize("slope", (0, 3.14)) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_linear_regression_one_predictor( lr_method_or_function, intercept, slope, as_2D, data_type ): @@ -408,7 +412,7 @@ def test_linear_regression_one_predictor( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict']) +@pytest.mark.parametrize("data_type", ["dict"]) def test_linear_regression_predictor_named_like_dim( lr_method_or_function, as_2D, data_type ): @@ -417,7 +421,9 @@ def test_linear_regression_predictor_named_like_dim( slope, intercept = 0.3, 0.2 tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) - result = lr_method_or_function(convert_to({"time": tgt.time}, data_type), tgt, "time") + result = lr_method_or_function( + convert_to({"time": tgt.time}, data_type), tgt, "time" + ) template = tgt.isel(time=0, drop=True) 
expected_intercept = xr.full_like(template, intercept) @@ -435,7 +441,7 @@ def test_linear_regression_predictor_named_like_dim( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_linear_regression_predictor_has_non_dim_coors( lr_method_or_function, as_2D, data_type ): @@ -463,7 +469,7 @@ def test_linear_regression_predictor_has_non_dim_coors( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_linear_regression_fit_intercept(lr_method_or_function, as_2D, data_type): pred0 = trend_data_1D(slope=1, scale=0) @@ -490,7 +496,7 @@ def test_linear_regression_fit_intercept(lr_method_or_function, as_2D, data_type @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "xr_dataset"]) def test_linear_regression_no_coords(lr_method_or_function, as_2D, data_type): # does not work with DataTree due to missing coords in collapse_datatree_into_dataset @@ -524,7 +530,7 @@ def test_linear_regression_no_coords(lr_method_or_function, as_2D, data_type): @pytest.mark.parametrize("intercept", (0, 3.14)) @pytest.mark.parametrize("slope", (0, 3.14)) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_linear_regression_two_predictors( lr_method_or_function, intercept, slope, as_2D, data_type ): @@ -559,7 +565,7 @@ 
def test_linear_regression_two_predictors( @pytest.mark.parametrize("intercept", (0, 3.14)) @pytest.mark.parametrize("slope", (0, 3.14)) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_linear_regression_two_predictors_diffnames( lr_method_or_function, intercept, slope, as_2D, data_type ): @@ -591,10 +597,8 @@ def test_linear_regression_two_predictors_diffnames( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) -@pytest.mark.parametrize("data_type", ['dict', 'xr_dataset']) -def test_linear_regression_two_predictors_extra_dim( - lr_method_or_function, data_type -): +@pytest.mark.parametrize("data_type", ["dict", "xr_dataset"]) +def test_linear_regression_two_predictors_extra_dim(lr_method_or_function, data_type): # does not work with datatree because we have not implemented minimal coords # add a 0D dimension/ coordinate and ensure it still works # NOTE: requires 3 predictors to trigger the error (might be an xarray issue) @@ -610,7 +614,9 @@ def test_linear_regression_two_predictors_extra_dim( tgt = trend_data_2D(slope=slope, scale=0, intercept=intercept) result = lr_method_or_function( - convert_to({"pred0": pred0, "pred1": pred1, "pred2": pred0}, data_type), tgt, "time" + convert_to({"pred0": pred0, "pred1": pred1, "pred2": pred0}, data_type), + tgt, + "time", ) template = tgt.isel(time=0, drop=True) @@ -635,7 +641,7 @@ def test_linear_regression_two_predictors_extra_dim( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("intercept", (0, 3.14)) -@pytest.mark.parametrize("data_type", ['dict', 'DataTree', 'xr_dataset']) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_linear_regression_weights(lr_method_or_function, intercept, data_type): pred0 = trend_data_1D(slope=1, scale=0) From 
2207ac7459e3e5462f902facbe35e9576cd6233c Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Mon, 25 Nov 2024 17:03:47 +0100 Subject: [PATCH 12/22] rename in datatree --- mesmer/core/datatree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mesmer/core/datatree.py b/mesmer/core/datatree.py index bb37ca0a..c8f158d5 100644 --- a/mesmer/core/datatree.py +++ b/mesmer/core/datatree.py @@ -127,9 +127,9 @@ def stack_datatrees_for_linear_regression( # prepare predictors predictors_stacked = DataTree() - for key, subtree in predictors.items(): + for key, pred in predictors.items(): # 1) broadcast to target - pred_broadcast = subtree.broadcast_like(target, exclude=exclude_dim) + pred_broadcast = pred.broadcast_like(target, exclude=exclude_dim) # 2) collapse into DataSets predictor_ds = collapse_datatree_into_dataset(pred_broadcast, dim=collapse_dim) # 3) stack From 6e32b304e6f027ccea3b5892d69bfb54b4caa5ca Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Mon, 25 Nov 2024 21:04:13 +0100 Subject: [PATCH 13/22] add name to err msg of extract_single_dataarray --- mesmer/core/datatree.py | 12 +++++++----- tests/unit/test_datatree.py | 12 ++++++++---- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/mesmer/core/datatree.py b/mesmer/core/datatree.py index c8f158d5..303a9499 100644 --- a/mesmer/core/datatree.py +++ b/mesmer/core/datatree.py @@ -2,19 +2,21 @@ from datatree import DataTree -def _extract_single_dataarray_from_dt(dt: DataTree) -> xr.DataArray: +def _extract_single_dataarray_from_dt(dt: DataTree, name: str = "node") -> xr.DataArray: """ Extract a single DataArray from a DataTree node, holding a ``Dataset`` with one ``DataArray``. 
""" + if not dt.has_data: - raise ValueError("DataTree must contain data.") + raise ValueError(f"{name} has no data.") ds = dt.to_dataset() - name, *others = ds.keys() + var_name, *others = ds.keys() if others: - raise ValueError("DataTree must only contain one data variable.") + others = ", ".join(map(str, others)) + raise ValueError(f"Node must only contain one data variable, {name} has {others} and {var_name}.") - da = ds[name] + da = ds[var_name] return da diff --git a/tests/unit/test_datatree.py b/tests/unit/test_datatree.py index c69df0f1..0a1301d9 100644 --- a/tests/unit/test_datatree.py +++ b/tests/unit/test_datatree.py @@ -141,23 +141,27 @@ def test_extract_single_dataarray_from_dt(): dt = DataTree(data=xr.Dataset({"tas": da, "tas2": da})) with pytest.raises( - ValueError, match="DataTree must only contain one data variable." + ValueError, match="Node must only contain one data variable, node has tas2 and tas." ): - res = mesmer.datatree._extract_single_dataarray_from_dt(dt) + mesmer.datatree._extract_single_dataarray_from_dt(dt) dt = DataTree.from_dict( {"scen1": xr.Dataset({"tas": da, "tas2": da}), "scen2": xr.Dataset({"tas": da})} ) # passing empty root - with pytest.raises(ValueError, match="DataTree must contain data."): + with pytest.raises(ValueError, match="node has no data."): mesmer.datatree._extract_single_dataarray_from_dt(dt) + # check name + with pytest.raises(ValueError, match="Node must only contain one data variable, scen1 has tas2 and tas."): + mesmer.datatree._extract_single_dataarray_from_dt(dt['scen1'], name="scen1") + res = mesmer.datatree._extract_single_dataarray_from_dt(dt["scen2"]) xr.testing.assert_equal(res, da) # passing empty Dataree - with pytest.raises(ValueError, match="DataTree must contain data."): + with pytest.raises(ValueError, match="node has no data."): mesmer.datatree._extract_single_dataarray_from_dt(DataTree()) From f00cbd915d1c1725a9defca2722535bf8f91fe30 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi 
Date: Mon, 25 Nov 2024 21:04:23 +0100 Subject: [PATCH 14/22] extend tests --- mesmer/stats/_linear_regression.py | 10 +++- tests/unit/test_linear_regression.py | 82 +++++++++++++++------------- 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index 90d540ce..85ff282f 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -109,7 +109,8 @@ def predict( prediction = params.intercept # if predictors is a DataTree, rename all data variables to "pred" to avoid conflicts - if isinstance(predictors, DataTree) and not predictors.equals(DataTree()): + # not necessary if predictors is empty DataTree or only data is in root, i.e. depth == 0 + if isinstance(predictors, DataTree) and not predictors.depth == 0: predictors = map_over_subtree( lambda ds: ds.rename({var: "pred" for var in ds.data_vars}) )(predictors) @@ -242,6 +243,9 @@ def _fit_linear_regression_xr( Dataset of intercepts and coefficients. The intercepts and each predictor is an individual DataArray. 
""" + # if DataTree only has data in root, extract Dataset + if isinstance(predictors, DataTree) and predictors.depth == 0: + predictors = predictors.to_dataset() if not isinstance(predictors, dict | DataTree | xr.Dataset): raise TypeError( @@ -258,7 +262,7 @@ def _fit_linear_regression_xr( for key, pred in predictors.items(): if isinstance(pred, DataTree): - pred = _extract_single_dataarray_from_dt(pred) + pred = _extract_single_dataarray_from_dt(pred, name=f"predictor: {key}") _check_dataarray_form(pred, ndim=1, required_dims=dim, name=f"predictor: {key}") @@ -277,7 +281,7 @@ def _fit_linear_regression_xr( predictors = map_over_subtree( lambda ds: ds.rename({var: "pred" for var in ds.data_vars}) )(predictors) - predictors_concat = collapse_datatree_into_dataset(predictors, dim="predictor") + predictors_concat = collapse_datatree_into_dataset(predictors, dim="predictor", join="exact", coords="minimal") predictors_concat = ( predictors_concat.to_array().isel(variable=0).drop_vars("variable") ) diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index 7249d721..87e582e7 100644 --- a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -262,7 +262,8 @@ def test_lr_predict_exclude_intercept(as_2D, data_type): @pytest.mark.parametrize("as_2D", [True, False]) -def test_LR_residuals(as_2D): +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) +def test_LR_residuals(as_2D, data_type): lr = mesmer.stats.LinearRegression() @@ -271,11 +272,11 @@ def test_LR_residuals(as_2D): ) lr.params = params if as_2D else params.squeeze() - tas = xr.DataArray([0, 1, 2], dims="time") + tas = xr.DataArray([0, 1, 2], dims="time", name="tas") target = xr.DataArray([[5, 8, 0]], dims=("x", "time")) target = target if as_2D else target.squeeze() - result = lr.residuals({"tas": tas}, target) + result = lr.residuals(convert_to({"tas": tas}, data_type), target) expected = xr.DataArray([[0, 3, -5]], dims=("x", 
"time")) expected = expected if as_2D else expected.squeeze() @@ -284,7 +285,8 @@ def test_LR_residuals(as_2D): # TEST XARRAY WRAPPER & LinearRegression().fit @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) -def test_linear_regression_errors(lr_method_or_function): +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) +def test_linear_regression_errors(lr_method_or_function, data_type): pred0 = trend_data_1D() pred1 = trend_data_1D() @@ -295,36 +297,54 @@ def test_linear_regression_errors(lr_method_or_function): weights = trend_data_1D(intercept=1, slope=0, scale=0) - with pytest.raises(TypeError, match="predictors should be a dict"): - lr_method_or_function(pred0, tgt, dim="time") + # test predictors have to be dict, dataset or DataTree + with pytest.raises( + TypeError, + match="predictors should be a dict, DataTree or xr.Dataset, got .", + ): + lr_method_or_function([pred0, pred1], tgt, dim="time") def test_unequal_coords(pred0, pred1, tgt, weights): - with pytest.raises(ValueError, match="cannot align objects"): lr_method_or_function( - {"pred0": pred0, "pred1": pred1}, tgt, dim="time", weights=weights + convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, dim="time", weights=weights ) - test_unequal_coords(pred0.isel(time=slice(0, 5)), pred1, tgt, weights) - test_unequal_coords(pred0, pred1.isel(time=slice(0, 5)), tgt, weights) + if not data_type == "xr_dataset": + # for xr_dataset, this leads to nans in the predictors -> user responsibility + test_unequal_coords(pred0.isel(time=slice(0, 5)), pred1, tgt, weights) + test_unequal_coords(pred0, pred1.isel(time=slice(0, 5)), tgt, weights) test_unequal_coords(pred0, pred1, tgt.isel(time=slice(0, 5)), weights) test_unequal_coords(pred0, pred1, tgt, weights.isel(time=slice(0, 5))) - def test_wrong_type(pred0, pred1, tgt, weights, name): - with pytest.raises(TypeError, match=f"Expected {name} to be an xr.DataArray"): + def test_wrong_type(pred0, pred1, tgt, 
weights, name, preds_wrong = False): + + msg = f"Expected {name} to be an xr.DataArray" + errortype = TypeError + + # errors sooner for datatree predictors + if preds_wrong == True: + if data_type == "DataTree": + msg = f"{name} has no data." + errortype = ValueError + elif data_type == "xr_dataset": + msg = f"{name} should be 1D, but is 0D" + errortype = ValueError + + with pytest.raises(errortype, match=msg): lr_method_or_function( - {"pred0": pred0, "pred1": pred1}, tgt, dim="time", weights=weights + convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, dim="time", weights=weights ) - test_wrong_type(None, pred1, tgt, weights, name="predictor: pred0") - test_wrong_type(pred0, None, tgt, weights, name="predictor: pred1") + test_wrong_type(None, pred1, tgt, weights, name="predictor: pred0", preds_wrong = True) + test_wrong_type(pred0, None, tgt, weights, name="predictor: pred1", preds_wrong = True) test_wrong_type(pred0, pred1, None, weights, name="target") test_wrong_type(pred0, pred1, tgt, xr.Dataset(), name="weights") def test_wrong_shape(pred0, pred1, tgt, weights, name, ndim): with pytest.raises(ValueError, match=f"{name} should be {ndim}D"): lr_method_or_function( - {"pred0": pred0, "pred1": pred1}, tgt, dim="time", weights=weights + convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, dim="time", weights=weights ) test_wrong_shape( @@ -340,7 +360,7 @@ def test_wrong_shape(pred0, pred1, tgt, weights, name, ndim): # target ndim test has a different error message with pytest.raises(ValueError, match="target should be 1D or 2D"): lr_method_or_function( - {"pred0": pred0, "pred1": pred1}, + convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt.expand_dims("new"), dim="time", weights=weights, @@ -349,7 +369,7 @@ def test_wrong_shape(pred0, pred1, tgt, weights, name, ndim): def test_missing_dim(pred0, pred1, tgt, weights, name): with pytest.raises(ValueError, match=f"{name} is missing the required dims"): lr_method_or_function( - {"pred0": 
pred0, "pred1": pred1}, tgt, dim="time", weights=weights + convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, dim="time", weights=weights ) test_missing_dim( @@ -361,23 +381,14 @@ def test_missing_dim(pred0, pred1, tgt, weights, name): test_missing_dim(pred0, pred1, tgt.rename(time="t"), weights, name="target") test_missing_dim(pred0, pred1, tgt, weights.rename(time="t"), name="weights") - with pytest.raises(ValueError, match="dim cannot currently be 'predictor'."): - lr_method_or_function({"pred0": pred0}, tgt, dim="predictor") - - # test predictors have to be dict or DataTree - with pytest.raises( - TypeError, - match="predictors should be a dict, DataTree or xr.Dataset, got .", - ): - lr_method_or_function([pred0, pred1], tgt, dim="time") - - # test DataTree depth - with pytest.raises(ValueError, match="DataTree must contain data."): + with pytest.raises(ValueError, match="A predictor with the name 'weights' or 'intercept' is not allowed"): lr_method_or_function( - DataTree.from_dict({"scen0": DataTree.from_dict({"pred1": pred1})}), + convert_to({"weights": pred0, "intercept": pred1}, data_type), tgt, dim="time", ) + with pytest.raises(ValueError, match="dim cannot currently be 'predictor'."): + lr_method_or_function(convert_to({"pred0": pred0}, data_type), tgt, dim="predictor") @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @@ -416,7 +427,7 @@ def test_linear_regression_one_predictor( def test_linear_regression_predictor_named_like_dim( lr_method_or_function, as_2D, data_type ): - # cannot be DataTree, becaue data_var cannot have the same name as coord + # cannot be DataTree, because data_var cannot have the same name as coord # cannot be Dataset because we need pred as a data_variable not coord slope, intercept = 0.3, 0.2 tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) @@ -496,10 +507,8 @@ def test_linear_regression_fit_intercept(lr_method_or_function, as_2D, data_type 
@pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("as_2D", [True, False]) -@pytest.mark.parametrize("data_type", ["dict", "xr_dataset"]) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_linear_regression_no_coords(lr_method_or_function, as_2D, data_type): - # does not work with DataTree due to missing coords in collapse_datatree_into_dataset - slope, intercept = 3.14, 3.14 pred0 = trend_data_1D(slope=1, scale=0) @@ -597,9 +606,8 @@ def test_linear_regression_two_predictors_diffnames( @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) -@pytest.mark.parametrize("data_type", ["dict", "xr_dataset"]) +@pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_linear_regression_two_predictors_extra_dim(lr_method_or_function, data_type): - # does not work with datatree because we have not implemented minimal coords # add a 0D dimension/ coordinate and ensure it still works # NOTE: requires 3 predictors to trigger the error (might be an xarray issue) From 9a4e9478109a66766218eb969e0662ffe91414df Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Mon, 25 Nov 2024 21:05:05 +0100 Subject: [PATCH 15/22] linting --- mesmer/core/datatree.py | 4 ++- mesmer/stats/_linear_regression.py | 4 ++- tests/unit/test_datatree.py | 10 +++++-- tests/unit/test_linear_regression.py | 41 +++++++++++++++++++++------- 4 files changed, 44 insertions(+), 15 deletions(-) diff --git a/mesmer/core/datatree.py b/mesmer/core/datatree.py index 303a9499..9f4cb5aa 100644 --- a/mesmer/core/datatree.py +++ b/mesmer/core/datatree.py @@ -14,7 +14,9 @@ def _extract_single_dataarray_from_dt(dt: DataTree, name: str = "node") -> xr.Da var_name, *others = ds.keys() if others: others = ", ".join(map(str, others)) - raise ValueError(f"Node must only contain one data variable, {name} has {others} and {var_name}.") + raise ValueError( + f"Node must only contain one data variable, 
{name} has {others} and {var_name}." + ) da = ds[var_name] return da diff --git a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index 85ff282f..a5fed57b 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -281,7 +281,9 @@ def _fit_linear_regression_xr( predictors = map_over_subtree( lambda ds: ds.rename({var: "pred" for var in ds.data_vars}) )(predictors) - predictors_concat = collapse_datatree_into_dataset(predictors, dim="predictor", join="exact", coords="minimal") + predictors_concat = collapse_datatree_into_dataset( + predictors, dim="predictor", join="exact", coords="minimal" + ) predictors_concat = ( predictors_concat.to_array().isel(variable=0).drop_vars("variable") ) diff --git a/tests/unit/test_datatree.py b/tests/unit/test_datatree.py index 0a1301d9..114a82a3 100644 --- a/tests/unit/test_datatree.py +++ b/tests/unit/test_datatree.py @@ -141,7 +141,8 @@ def test_extract_single_dataarray_from_dt(): dt = DataTree(data=xr.Dataset({"tas": da, "tas2": da})) with pytest.raises( - ValueError, match="Node must only contain one data variable, node has tas2 and tas." 
+ ValueError, + match="Node must only contain one data variable, node has tas2 and tas.", ): mesmer.datatree._extract_single_dataarray_from_dt(dt) @@ -154,8 +155,11 @@ def test_extract_single_dataarray_from_dt(): mesmer.datatree._extract_single_dataarray_from_dt(dt) # check name - with pytest.raises(ValueError, match="Node must only contain one data variable, scen1 has tas2 and tas."): - mesmer.datatree._extract_single_dataarray_from_dt(dt['scen1'], name="scen1") + with pytest.raises( + ValueError, + match="Node must only contain one data variable, scen1 has tas2 and tas.", + ): + mesmer.datatree._extract_single_dataarray_from_dt(dt["scen1"], name="scen1") res = mesmer.datatree._extract_single_dataarray_from_dt(dt["scen2"]) xr.testing.assert_equal(res, da) diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index 87e582e7..e732b805 100644 --- a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -307,7 +307,10 @@ def test_linear_regression_errors(lr_method_or_function, data_type): def test_unequal_coords(pred0, pred1, tgt, weights): with pytest.raises(ValueError, match="cannot align objects"): lr_method_or_function( - convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, dim="time", weights=weights + convert_to({"pred0": pred0, "pred1": pred1}, data_type), + tgt, + dim="time", + weights=weights, ) if not data_type == "xr_dataset": @@ -317,8 +320,8 @@ def test_unequal_coords(pred0, pred1, tgt, weights): test_unequal_coords(pred0, pred1, tgt.isel(time=slice(0, 5)), weights) test_unequal_coords(pred0, pred1, tgt, weights.isel(time=slice(0, 5))) - def test_wrong_type(pred0, pred1, tgt, weights, name, preds_wrong = False): - + def test_wrong_type(pred0, pred1, tgt, weights, name, preds_wrong=False): + msg = f"Expected {name} to be an xr.DataArray" errortype = TypeError @@ -333,18 +336,28 @@ def test_wrong_type(pred0, pred1, tgt, weights, name, preds_wrong = False): with pytest.raises(errortype, 
match=msg): lr_method_or_function( - convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, dim="time", weights=weights + convert_to({"pred0": pred0, "pred1": pred1}, data_type), + tgt, + dim="time", + weights=weights, ) - test_wrong_type(None, pred1, tgt, weights, name="predictor: pred0", preds_wrong = True) - test_wrong_type(pred0, None, tgt, weights, name="predictor: pred1", preds_wrong = True) + test_wrong_type( + None, pred1, tgt, weights, name="predictor: pred0", preds_wrong=True + ) + test_wrong_type( + pred0, None, tgt, weights, name="predictor: pred1", preds_wrong=True + ) test_wrong_type(pred0, pred1, None, weights, name="target") test_wrong_type(pred0, pred1, tgt, xr.Dataset(), name="weights") def test_wrong_shape(pred0, pred1, tgt, weights, name, ndim): with pytest.raises(ValueError, match=f"{name} should be {ndim}D"): lr_method_or_function( - convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, dim="time", weights=weights + convert_to({"pred0": pred0, "pred1": pred1}, data_type), + tgt, + dim="time", + weights=weights, ) test_wrong_shape( @@ -369,7 +382,10 @@ def test_wrong_shape(pred0, pred1, tgt, weights, name, ndim): def test_missing_dim(pred0, pred1, tgt, weights, name): with pytest.raises(ValueError, match=f"{name} is missing the required dims"): lr_method_or_function( - convert_to({"pred0": pred0, "pred1": pred1}, data_type), tgt, dim="time", weights=weights + convert_to({"pred0": pred0, "pred1": pred1}, data_type), + tgt, + dim="time", + weights=weights, ) test_missing_dim( @@ -381,14 +397,19 @@ def test_missing_dim(pred0, pred1, tgt, weights, name): test_missing_dim(pred0, pred1, tgt.rename(time="t"), weights, name="target") test_missing_dim(pred0, pred1, tgt, weights.rename(time="t"), name="weights") - with pytest.raises(ValueError, match="A predictor with the name 'weights' or 'intercept' is not allowed"): + with pytest.raises( + ValueError, + match="A predictor with the name 'weights' or 'intercept' is not allowed", + ): 
lr_method_or_function( convert_to({"weights": pred0, "intercept": pred1}, data_type), tgt, dim="time", ) with pytest.raises(ValueError, match="dim cannot currently be 'predictor'."): - lr_method_or_function(convert_to({"pred0": pred0}, data_type), tgt, dim="predictor") + lr_method_or_function( + convert_to({"pred0": pred0}, data_type), tgt, dim="predictor" + ) @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) From cccbd54368ca2873b9da045f88444efab030a823 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Mon, 25 Nov 2024 21:05:59 +0100 Subject: [PATCH 16/22] more linting --- tests/unit/test_linear_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index e732b805..46f601e8 100644 --- a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -326,7 +326,7 @@ def test_wrong_type(pred0, pred1, tgt, weights, name, preds_wrong=False): errortype = TypeError # errors sooner for datatree predictors - if preds_wrong == True: + if preds_wrong: if data_type == "DataTree": msg = f"{name} has no data." 
errortype = ValueError From 12160d90b3d1c9db59b74d12f7a7f34bd122fe4e Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Mon, 25 Nov 2024 21:57:39 +0100 Subject: [PATCH 17/22] extend tests to root dt --- tests/unit/test_datatree.py | 23 +++++++++++------------ tests/unit/test_linear_regression.py | 21 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/tests/unit/test_datatree.py b/tests/unit/test_datatree.py index 114a82a3..d7c1d2f5 100644 --- a/tests/unit/test_datatree.py +++ b/tests/unit/test_datatree.py @@ -60,6 +60,17 @@ def test_collapse_datatree_into_dataset(): xr.testing.assert_equal(scen1.drop_vars(collapse_dim), leaf1) + # data in root works + dt = DataTree(leaf1, name="scen1") + res = mesmer.datatree.collapse_datatree_into_dataset(dt, dim=collapse_dim) + + assert isinstance(res, xr.Dataset) + assert collapse_dim in res.dims + assert (res[collapse_dim] == ["scen1"]).all() + assert len(res.dims) == 3 + + xr.testing.assert_equal(scen1.drop_vars(collapse_dim), leaf1) + # nested DataTree works dt = DataTree() dt["scen1/sub_scen1"] = DataTree(leaf1) @@ -109,18 +120,6 @@ def test_collapse_datatree_into_dataset(): dt = DataTree.from_dict({"mem1": ds1, "mem2": ds2}) res = mesmer.datatree.collapse_datatree_into_dataset(dt, dim="members") - # only one leaf also works - ds = xr.Dataset({"tas": trend_data_1D(n_timesteps=n_ts)}) - dt = DataTree.from_dict({"ds": ds}) - - collapse_dim = "scenario" - res = mesmer.datatree.collapse_datatree_into_dataset(dt, dim=collapse_dim) - - expected = ds.expand_dims(collapse_dim).assign_coords( - {collapse_dim: np.array(["ds"])} - ) - xr.testing.assert_equal(res, expected) - # empty nodes are removed before concatenating # NOTE: implicitly this is already there in the other tests, since the root node is always empty # but it is nice to have it explicitly too diff --git a/tests/unit/test_linear_regression.py b/tests/unit/test_linear_regression.py index 46f601e8..f57f325a 100644 --- 
a/tests/unit/test_linear_regression.py +++ b/tests/unit/test_linear_regression.py @@ -626,6 +626,27 @@ def test_linear_regression_two_predictors_diffnames( xr.testing.assert_allclose(result, expected) +@pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) +@pytest.mark.parametrize("intercept", (0, 3.14)) +@pytest.mark.parametrize("slope", (0, 3.14)) +@pytest.mark.parametrize("as_2D", [True, False]) +def test_linear_regression_datatree_data_in_root( + lr_method_or_function, intercept, slope, as_2D +): + + pred0 = trend_data_1D(slope=1, scale=0).rename("bar") + pred1 = trend_data_1D(slope=1, scale=0).rename("foo") + ds = xr.Dataset({"pred0": pred0, "pred1": pred1}) + dt = DataTree(ds, name="root") + tgt = trend_data_1D_or_2D(as_2D=as_2D, slope=slope, scale=0, intercept=intercept) + + result = lr_method_or_function(dt, tgt, "time") + + expected = lr_method_or_function(ds, tgt, "time") + + xr.testing.assert_allclose(result, expected) + + @pytest.mark.parametrize("lr_method_or_function", LR_METHOD_OR_FUNCTION) @pytest.mark.parametrize("data_type", ["dict", "DataTree", "xr_dataset"]) def test_linear_regression_two_predictors_extra_dim(lr_method_or_function, data_type): From 3c60fbab30f0507eef0602310a6492af5a93c052 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Tue, 26 Nov 2024 11:13:02 +0100 Subject: [PATCH 18/22] revert change in create emus utils --- mesmer/create_emulations/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mesmer/create_emulations/utils.py b/mesmer/create_emulations/utils.py index b2e3c3ee..1ba5cbf1 100644 --- a/mesmer/create_emulations/utils.py +++ b/mesmer/create_emulations/utils.py @@ -114,7 +114,7 @@ def _gather_lr_params(params_dict, targ, dims): intercept = xr.DataArray(params_dict["intercept"][targ], dims=dims) fit_intercept = True else: - intercept = xr.zeros_like(params[pred]) + intercept = 0 # xr.zeros_like(params[pred]) fit_intercept = False params["intercept"] = intercept From 
5587b163c337697777d67fdd9440a7120c46bf6f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:13:24 +0000 Subject: [PATCH 19/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mesmer/create_emulations/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mesmer/create_emulations/utils.py b/mesmer/create_emulations/utils.py index 1ba5cbf1..79a3e8b7 100644 --- a/mesmer/create_emulations/utils.py +++ b/mesmer/create_emulations/utils.py @@ -114,7 +114,7 @@ def _gather_lr_params(params_dict, targ, dims): intercept = xr.DataArray(params_dict["intercept"][targ], dims=dims) fit_intercept = True else: - intercept = 0 # xr.zeros_like(params[pred]) + intercept = 0 # xr.zeros_like(params[pred]) fit_intercept = False params["intercept"] = intercept From 00686203a8fc6fdd1fbf66b5eaf62355bc87234c Mon Sep 17 00:00:00 2001 From: Victoria <112418493+veni-vidi-vici-dormivi@users.noreply.github.com> Date: Tue, 26 Nov 2024 11:14:25 +0100 Subject: [PATCH 20/22] Update tests/unit/test_datatree.py Co-authored-by: Mathias Hauser --- tests/unit/test_datatree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_datatree.py b/tests/unit/test_datatree.py index d7c1d2f5..521ca364 100644 --- a/tests/unit/test_datatree.py +++ b/tests/unit/test_datatree.py @@ -60,7 +60,7 @@ def test_collapse_datatree_into_dataset(): xr.testing.assert_equal(scen1.drop_vars(collapse_dim), leaf1) - # data in root works + # test data in root works dt = DataTree(leaf1, name="scen1") res = mesmer.datatree.collapse_datatree_into_dataset(dt, dim=collapse_dim) From b80b6d791c408c1bbbad2ae72bc3be14416c40d6 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Tue, 26 Nov 2024 13:19:55 +0100 Subject: [PATCH 21/22] nits --- mesmer/create_emulations/utils.py | 2 +- mesmer/stats/_linear_regression.py | 12 ++++++------ 2 files changed, 
7 insertions(+), 7 deletions(-) diff --git a/mesmer/create_emulations/utils.py b/mesmer/create_emulations/utils.py index 79a3e8b7..a7293eff 100644 --- a/mesmer/create_emulations/utils.py +++ b/mesmer/create_emulations/utils.py @@ -114,7 +114,7 @@ def _gather_lr_params(params_dict, targ, dims): intercept = xr.DataArray(params_dict["intercept"][targ], dims=dims) fit_intercept = True else: - intercept = 0 # xr.zeros_like(params[pred]) + intercept = 0 fit_intercept = False params["intercept"] = intercept diff --git a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index a5fed57b..01bcd396 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -278,15 +278,15 @@ def _fit_linear_regression_xr( ) elif isinstance(predictors, DataTree): # rename all data variables to "pred" to avoid conflicts when concatenating - predictors = map_over_subtree( - lambda ds: ds.rename({var: "pred" for var in ds.data_vars}) - )(predictors) + def _rename_vars(ds) -> DataTree: + (var,) = ds.data_vars + return ds.rename({var: "pred"}) + + predictors = map_over_subtree(_rename_vars)(predictors) predictors_concat = collapse_datatree_into_dataset( predictors, dim="predictor", join="exact", coords="minimal" ) - predictors_concat = ( - predictors_concat.to_array().isel(variable=0).drop_vars("variable") - ) + predictors_concat = predictors_concat['pred'] _check_dataarray_form(target, required_dims=dim, name="target") From eb5d463debe4b02107c906eef6a36537030d659e Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Tue, 26 Nov 2024 13:20:32 +0100 Subject: [PATCH 22/22] i know it is called *pre*commit but well. 
- linting --- mesmer/stats/_linear_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mesmer/stats/_linear_regression.py b/mesmer/stats/_linear_regression.py index 01bcd396..e144b0be 100644 --- a/mesmer/stats/_linear_regression.py +++ b/mesmer/stats/_linear_regression.py @@ -281,12 +281,12 @@ def _fit_linear_regression_xr( def _rename_vars(ds) -> DataTree: (var,) = ds.data_vars return ds.rename({var: "pred"}) - + predictors = map_over_subtree(_rename_vars)(predictors) predictors_concat = collapse_datatree_into_dataset( predictors, dim="predictor", join="exact", coords="minimal" ) - predictors_concat = predictors_concat['pred'] + predictors_concat = predictors_concat["pred"] _check_dataarray_form(target, required_dims=dim, name="target")