From a9a610e4476fabdc0dfb4df8eda957c2f946a7ac Mon Sep 17 00:00:00 2001
From: Bryn Pickering <17178478+brynpickering@users.noreply.github.com>
Date: Wed, 16 Oct 2024 13:32:42 +0100
Subject: [PATCH] Add data source dimension name renaming option (#687)

---
 CHANGELOG.md                               |  2 +
 docs/creating/data_tables.md               | 54 +++++++++++++++++-
 src/calliope/config/data_table_schema.yaml |  9 +++
 src/calliope/preprocess/data_tables.py     | 63 +++++++++++++++------
 tests/test_preprocess_data_sources.py      | 64 +++++++++++++++++++++-
 5 files changed, 172 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9aecfda0..4f334677 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@

 ### User-facing changes

+|new| dimension renaming functionality when loading from a data source, using the `rename_dims` option (#680).
+
 |changed| cost expressions in math, to split out investment costs into the capital cost (`cost_investment`), annualised capital cost (`cost_investment_annualised`), fixed operation costs (`cost_operation_fixed`) and variable operation costs (`cost_operation_variable`, previously `cost_var`) (#645).

 |new| Math has been removed from `model.math`, and can now be accessed via `model.math.data` (#639).

diff --git a/docs/creating/data_tables.md b/docs/creating/data_tables.md
index e549be03..dfd28b57 100644
--- a/docs/creating/data_tables.md
+++ b/docs/creating/data_tables.md
@@ -17,6 +17,7 @@ In brief it is:
 * **select**: values within dimensions that you want to select from your tabular data, discarding the rest.
 * **drop**: dimensions to drop from your rows/columns, e.g., a "comment" row.
 * **add_dims**: dimensions to add to the table after loading it in, with the corresponding value(s) to assign to the dimension index.
+* **rename_dims**: dimension names to map from those defined in the data table (e.g. `time`) to those used in the Calliope model (e.g. `timesteps`).

 When we refer to "dimensions", we mean the sets over which data is indexed in the model: `nodes`, `techs`, `timesteps`, `carriers`, `costs`.
 In addition, when loading from file, there is the _required_ dimension `parameters`.
@@ -391,8 +392,6 @@ Or to define the same timeseries source data for two technologies at different n
             columns: [nodes, techs, parameters]
     ```

-
-
 === "With `add_dims`"

     |                     |    |
     | ------------------: | :- |
     | 2005-01-01 12:00:00 | 15 |
     | 2005-01-01 13:00:00 | 5  |

     YAML definition to load data:

     ```yaml
     data_sources:
       pv_capacity_factor_data:
         source: data_sources/pv_resource.csv
         rows: timesteps
         columns: parameters
         add_dims:
           techs: pv
           parameters: source_use_max
     ```

@@ -418,6 +417,57 @@
+## Mapping dimension names
+
+Sometimes, data tables are prepared in a model-agnostic fashion, and it would require extra effort to make them follow Calliope's dimension naming conventions.
+To enable these tables to be loaded without Calliope complaining, we can rename dimensions when loading them using `rename_dims`.
+
+For example, if we have the `time` dimension in the file, we can map it to the Calliope-compliant `timesteps` dimension:
+
+=== "Without `rename_dims`"
+
+    Data in file:
+
+    | timesteps           | source_use_equals |
+    | ------------------: | :---------------- |
+    | 2005-01-01 12:00:00 | 15                |
+    | 2005-01-01 13:00:00 | 5                 |
+
+    YAML definition to load data:
+
+    ```yaml
+    data_sources:
+      pv_capacity_factor_data:
+        source: data_sources/pv_resource.csv
+        rows: timesteps
+        columns: parameters
+        add_dims:
+          techs: pv
+    ```
+
+=== "With `rename_dims`"
+
+    Data in file:
+
+    | time                | source_use_equals |
+    | ------------------: | :---------------- |
+    | 2005-01-01 12:00:00 | 15                |
+    | 2005-01-01 13:00:00 | 5                 |
+
+    YAML definition to load data:
+
+    ```yaml
+    data_sources:
+      pv_capacity_factor_data:
+        source: data_sources/pv_resource.csv
+        rows: timesteps
+        columns: parameters
+        add_dims:
+          techs: pv
+        rename_dims:
+          time: timesteps
+    ```
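+
+Under the hood, `rename_dims` is applied as a `pandas` axis-name rename before the resulting dimension names are validated against `rows` and `columns`.
+A rough standalone sketch of the "With `rename_dims`" example above (illustrative only, assuming `pandas` is installed and the CSV file exists as shown):
+
+```python
+import pandas as pd
+
+# Load the example table; its index column is named "time" in the file.
+df = pd.read_csv("data_sources/pv_resource.csv", index_col=0)
+
+# Equivalent of `rename_dims: {time: timesteps}`: rename the axis *name*,
+# leaving the index values untouched.
+df = df.rename_axis(index={"time": "timesteps"})
+
+print(df.index.name)  # -> "timesteps"
+```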
+
 ## Loading CSV files vs `pandas` dataframes

 To load from CSV, set the filepath in `data` to point to your file.

diff --git a/src/calliope/config/data_table_schema.yaml b/src/calliope/config/data_table_schema.yaml
index cf647124..df05fb4c 100644
--- a/src/calliope/config/data_table_schema.yaml
+++ b/src/calliope/config/data_table_schema.yaml
@@ -57,6 +57,15 @@ properties:
       Dimensions in the rows and/or columns that contain metadata and should therefore not be passed on to the loaded model dataset.
       These could include comments on the source of the data, the data license, or the parameter units.
       You can also drop a dimension and then reintroduce it in `add_dims`, but with different index items.
+  rename_dims:
+    type: object
+    description: >-
+      Mapping from dimension names in the data table being loaded to the equivalent Calliope dimension names.
+      For instance, the "time" column in the data table would need to be mapped to "timesteps": `{"time": "timesteps"}`.
+    unevaluatedProperties:
+      type: string
+      description: Key is the dimension name in the data table (must not be in `rows` or `columns`); value is the Calliope dimension name it maps to.
+      pattern: '^[^_^\d][\w]*$'
   add_dims:
     description: >-
       Data dimensions to add after loading in the array.

diff --git a/src/calliope/preprocess/data_tables.py b/src/calliope/preprocess/data_tables.py
index b8151c1d..717ef595 100644
--- a/src/calliope/preprocess/data_tables.py
+++ b/src/calliope/preprocess/data_tables.py
@@ -5,6 +5,7 @@
 import logging
 from collections.abc import Hashable
 from pathlib import Path
+from typing import Literal

 import numpy as np
 import pandas as pd
@@ -25,6 +26,7 @@
 LOGGER = logging.getLogger(__name__)

 DTYPE_OPTIONS = {"str": str, "float": float}
+AXIS_T = Literal["columns", "index"]


 class DataTableDict(TypedDict):
     rows: NotRequired[str | list[str]]
     columns: NotRequired[str | list[str]]
     data: str
     df: NotRequired[str]
+    rename_dims: NotRequired[dict[str, str]]
     add_dims: NotRequired[dict[str, str | list[str]]]
-    select: dict[str, str | bool | int]
-    drop: Hashable | list[Hashable]
+    select: NotRequired[dict[str, str | bool | int]]
+    drop: NotRequired[Hashable | list[Hashable]]


 class DataTable:
@@ -275,22 +278,28 @@ def _df_to_ds(self, df: pd.DataFrame) -> xr.Dataset:
                 "Data table must be a pandas DataFrame. "
                 "If you are providing an in-memory object, ensure it is not a pandas Series by calling the method `to_frame()`"
             )
-        for axis, names in {"columns": self.columns, "index": self.index}.items():
-            if names is None:
-                if len(getattr(df, axis).names) != 1:
-                    self._raise_error(f"Expected a single {axis} level in loaded data.")
-                df = df.squeeze(axis=axis)
-            else:
-                if len(getattr(df, axis).names) != len(names):
-                    self._raise_error(
-                        f"Expected {len(names)} {axis} levels in loaded data."
-                    )
-                self._compare_axis_names(getattr(df, axis).names, names, axis)
-                df.rename_axis(inplace=True, **{axis: names})

         tdf: pd.Series
+        axis_names: dict[AXIS_T, None | list[str]] = {
+            "columns": self.columns,
+            "index": self.index,
+        }
+        squeeze_me: dict[AXIS_T, bool] = {
+            "columns": self.columns is None,
+            "index": self.index is None,
+        }
+        for axis, names in axis_names.items():
+            if names is None and len(getattr(df, axis).names) != 1:
+                self._raise_error(f"Expected a single {axis} level in loaded data.")
+            elif names is not None:
+                df = self._rename_axes(df, axis, names)
+
+        for axis, squeeze in squeeze_me.items():
+            if squeeze:
+                df = df.squeeze(axis=axis)
+
         if isinstance(df, pd.DataFrame):
             tdf = df.stack(tuple(df.columns.names), future_stack=True).dropna()
         else:
             tdf = df
@@ -314,7 +323,6 @@ def _df_to_ds(self, df: pd.DataFrame) -> xr.Dataset:
             tdf = pd.concat(
                 [tdf for _ in index_items], keys=index_items, names=[dim_name]
             )
-
         self._check_processed_tdf(tdf)
         self._check_for_protected_params(tdf)
@@ -328,6 +336,29 @@ def _df_to_ds(self, df: pd.DataFrame) -> xr.Dataset:
         self._log(f"Loaded arrays:\n{ds}")
         return ds

+    def _rename_axes(
+        self, df: pd.DataFrame, axis: AXIS_T, names: list[str]
+    ) -> pd.DataFrame:
+        """Check and rename DataFrame index and column names according to the data table definition.
+
+        Args:
+            df (pd.DataFrame): Loaded data table as a DataFrame.
+            axis (AXIS_T): DataFrame axis, either "columns" or "index".
+            names (list[str]): Expected dimension names along `axis`.
+
+        Returns:
+            pd.DataFrame: `df` with all dimensions on `axis` appropriately named.
+        """
+        if len(getattr(df, axis).names) != len(names):
+            self._raise_error(f"Expected {len(names)} {axis} levels in loaded data.")
+        mapper = self.input.get("rename_dims", {})
+        if mapper:
+            df.rename_axis(inplace=True, **{axis: mapper})
+        self._compare_axis_names(getattr(df, axis).names, names, axis)
+        df.rename_axis(inplace=True, **{axis: names})
+
+        return df
+
     def _check_for_protected_params(self, tdf: pd.Series):
         """Raise an error if any defined parameters are in a pre-configured set of _protected_ parameters.
diff --git a/tests/test_preprocess_data_sources.py b/tests/test_preprocess_data_sources.py
index ae9598da..a250f04c 100644
--- a/tests/test_preprocess_data_sources.py
+++ b/tests/test_preprocess_data_sources.py
@@ -2,6 +2,7 @@

 import pandas as pd
 import pytest
+import xarray as xr

 import calliope
 from calliope.preprocess import data_tables
@@ -354,6 +355,65 @@ def test_drop_one(self, table_obj):
         )


+class TestDataTableRenameDims:
+    @pytest.fixture(scope="class")
+    def multi_row_one_col_data(self, data_dir, init_config, dummy_int):
+        """Fixture to create the xarray dataset from the data table, including dimension name mapping."""
+
+        def _multi_row_one_col_data(
+            mapping: dict, new_idx: list, new_cols: list
+        ) -> xr.Dataset:
+            df = pd.DataFrame(
+                {"foo": {("bar1", "bar2"): 0, ("baz1", "baz2"): dummy_int}}
+            )
+            filepath = data_dir / "multi_row_one_col_file.csv"
+            df.rename_axis(
+                index=["test_row1", "test_row2"], columns=["test_col"]
+            ).to_csv(filepath)
+            table_dict: data_tables.DataTableDict = {
+                "data": filepath.as_posix(),
+                "rows": new_idx,
+                "columns": new_cols,
+                "add_dims": {"parameters": "test_param"},
+                "rename_dims": mapping,
+            }
+            ds = data_tables.DataTable(init_config, "ds_name", table_dict)
+            return ds.dataset
+
+        return _multi_row_one_col_data
+
+    def test_fails_without_rename(self, dummy_int, multi_row_one_col_data):
+        """Test that without dimension name mapping, the dataframe doesn't load successfully."""
+        with pytest.raises(calliope.exceptions.ModelError) as excinfo:
+            multi_row_one_col_data({}, ["foobar", "test_row2"], ["test_col"])
+        assert check_error_or_warning(
+            excinfo,
+            "Trying to set names for index but names in the file do no match names provided | "
+            "in file: ['test_row1', 'test_row2'] | defined: ['foobar', 'test_row2'].",
+        )
+
+    @pytest.mark.parametrize(
+        ("mapping", "idx", "col"),
+        [
+            ({"test_row1": "foobar"}, ["foobar", "test_row2"], ["test_col"]),
+            (
+                {"test_row1": "foobar", "test_col": "foobaz"},
+                ["foobar", "test_row2"],
+                ["foobaz"],
+            ),
+        ],
+    )
+    def test_rename(self, dummy_int, multi_row_one_col_data, mapping, idx, col):
+        """Test that dimension name mapping propagates through from the initial dataframe to the final dataset."""
+        dataset = multi_row_one_col_data(mapping, idx, col)
+        assert not any(k in dataset.dims for k in mapping.keys())
+        assert all(v in dataset.dims for v in mapping.values())
+        assert (
+            dataset["test_param"].sel(foobar="baz1", test_row2="baz2").item()
+            == dummy_int
+        )
+
+
 class TestDataTableMalformed:
     @pytest.fixture(scope="class")
     def table_obj(self, init_config):
@@ -450,7 +510,7 @@ def test_carrier_info_dict_from_model_data_var(self, table_obj, param, expected)
     def test_carrier_info_dict_from_model_data_var_missing_dim(self, table_obj):
         with pytest.raises(calliope.exceptions.ModelError) as excinfo:
             table_obj.lookup_dict_from_param("FOO", "foobar")
-        check_error_or_warning(
+        assert check_error_or_warning(
             excinfo,
             "Loading FOO with missing dimension(s). Must contain `techs` and `foobar`, received: ('techs', 'carriers')",
         )
@@ -604,7 +664,7 @@ def test_transmission_tech_with_nodes(self, table_obj):
         with pytest.raises(calliope.exceptions.ModelError) as excinfo:
             table_obj(df_dict).node_dict(tech_dict)

-        check_error_or_warning(
+        assert check_error_or_warning(
             excinfo,
             "Cannot define transmission technology data over the `nodes` dimension",
         )
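For orientation, the behaviour that `test_rename` locks in reduces to the following self-contained `pandas`/`xarray` sketch (`dummy_int` is hard-coded here; in the test suite it is a fixture):

```python
import pandas as pd
import xarray as xr

dummy_int = 100  # stand-in for the test suite's `dummy_int` fixture

# The fixture's table after `rename_dims={"test_row1": "foobar"}` has been
# applied: the first row level is renamed, and "parameters" comes from add_dims.
index = pd.MultiIndex.from_tuples(
    [("bar1", "bar2", "test_param"), ("baz1", "baz2", "test_param")],
    names=["foobar", "test_row2", "parameters"],
)
tdf = pd.Series([0, dummy_int], index=index)

ds = xr.Dataset.from_dataframe(tdf.unstack("parameters"))
assert "foobar" in ds.dims and "test_row1" not in ds.dims
assert ds["test_param"].sel(foobar="baz1", test_row2="baz2").item() == dummy_int
```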