Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a statistical model fitted to the original dataset to synthesize data #179

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ pyrealm.egg-info

# Data
pyrealm_build_data/inputs_data_24.25.nc
pyrealm_build_data/eda.py

# Profiling
prof/
4,096 changes: 1,985 additions & 2,111 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ flake8 = "^4.0.1"
flake8-docstrings = "^1.6.0"
mypy = "^0.991"
isort = "^5.12.0"
pandas = ">1.3.0"
pandas = "^2.2.0"
matplotlib = "^3.5.2"
ipython = "^8.9.0"

Expand Down
Binary file added pyrealm_build_data/data_model.nc
Binary file not shown.
81 changes: 81 additions & 0 deletions pyrealm_build_data/synth_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""This script uses a parametrized model to compress the input dataset.

It fits a time series model to the input data and stores the model parameters.
The dataset can then be reconstructed from the model parameters using the `reconstruct`
function, provided with a custom time index.
"""
tztsai marked this conversation as resolved.
Show resolved Hide resolved
from typing import Tuple

import numpy as np
import pandas as pd
import xarray as xr

# Plausible (lower, upper) clipping bounds applied to each reconstructed
# variable so synthesised values stay within a sensible range for that field.
VAR_BOUNDS = {
    "temp": (-25, 80),
    "patm": (3e4, 11e4),
    "vpd": (0, 1e4),
    "co2": (0, 1e3),
    "fapar": (0, 1),
    "ppfd": (0, 1e4),
}
tztsai marked this conversation as resolved.
Show resolved Hide resolved


def make_time_features(t: np.ndarray) -> pd.DataFrame:
    """Build a regression design matrix of time features for times ``t``.

    The columns are a constant, a linear trend measured in (Julian) years
    since 2000-01-01, and sine/cosine pairs for a fixed set of frequencies
    expressed in cycles per year.
    """
    index = pd.to_datetime(t).rename("time")

    feats = pd.DataFrame(index=index)
    feats["const"] = 1.0
    # Years elapsed since the epoch 2000-01-01 (365.25-day years).
    feats["linear"] = (index - pd.Timestamp("2000-01-01")) / pd.Timedelta("365.25d")

    frequencies = (730.5, 365.25, 12, 6, 4, 3, 2, 1, 1 / 2, 1 / 3, 1 / 4, 1 / 6)
    for freq in frequencies:
        phase = 2 * np.pi * freq * feats["linear"]
        feats[f"freq_{freq:.2f}_sin"] = np.sin(phase)
        feats[f"freq_{freq:.2f}_cos"] = np.cos(phase)

    return feats


def fit_ts_model(df: pd.DataFrame, fs: pd.DataFrame) -> Tuple[pd.DataFrame, float]:
    """Least-squares fit of the feature matrix ``fs`` to the observations ``df``.

    Returns the fitted coefficients — one row per data column of ``df``, one
    column per feature of ``fs`` — together with the mean squared
    reconstruction error normalised by the overall variance of the data.
    """
    # Drop all-NaN columns, then impute remaining gaps with column means.
    cleaned = df.dropna(axis=1, how="all").fillna(df.mean())
    targets = cleaned.values  # (times, locs)
    design = fs.values  # (times, feats)
    # Ordinary least squares via the Moore-Penrose pseudoinverse.
    coeffs = np.linalg.pinv(design) @ targets  # (feats, locs)
    residual = design @ coeffs - targets
    loss = np.mean(residual**2) / np.var(targets)
    params = pd.DataFrame(coeffs.T, index=cleaned.columns, columns=fs.columns)
    return params, loss


def reconstruct(ds: xr.Dataset, dt: np.ndarray | pd.DatetimeIndex) -> xr.Dataset:
    """Rebuild the synthetic dataset from fitted model parameters.

    Evaluates each variable's fitted coefficients against the time features
    for ``dt`` and clips the result to that variable's plausible bounds.
    """
    features = make_time_features(dt).to_xarray().to_dataarray()
    rebuilt = {
        name: (coeffs @ features).clip(*VAR_BOUNDS[name])
        for name, coeffs in ds.items()
    }
    return xr.Dataset(rebuilt)


if __name__ == "__main__":
    # Fit the time-series model to the full input dataset and store the
    # fitted coefficients as a compact netCDF "model" file.
    ds = xr.open_dataset("pyrealm_build_data/inputs_data_24.25.nc")

    # Keep only locations where no variable is NaN for the whole time series.
    mask = ~ds.isnull().all("time").to_dataarray().any("variable")
    ds = ds.where(mask, drop=True)

    # Variables fitted with a reduced feature set: patm is modelled as
    # constant in time, co2 as a constant plus a linear trend.
    special_time_features = dict(
        patm=["const"],
        co2=["const", "linear"],
    )

    features = make_time_features(ds.time)
    model = xr.Dataset()

    for k in ds.data_vars:
        print("Fitting", k)
        da = ds[k].isel(time=slice(None, None, 4))  # downsample along time
        df = da.to_series().unstack("time").T  # (datetimes, locations)
        fs = features.loc[df.index]  # (datetimes, features)
        # Restrict to the variable's special feature set, if it has one.
        fs = fs[special_time_features.get(k, fs.columns)]
        ps, r = fit_ts_model(df, fs)  # (locations, features)
        print("Loss:", r)
        # Zero-fill coefficients for features this variable was not fitted
        # with, so every variable shares the same feature dimension.
        ps[features.keys().difference(ps.columns)] = 0.0
        model[k] = ps.to_xarray().to_dataarray()

    model.to_netcdf("pyrealm_build_data/data_model.nc")
tztsai marked this conversation as resolved.
Show resolved Hide resolved
46 changes: 46 additions & 0 deletions tests/regression/data/test_synth_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Test the quality of the synthetic data generated from the model parameters."""

import numpy as np
import pytest
import xarray as xr

try:
    DATASET = xr.open_dataset("pyrealm_build_data/inputs_data_24.25.nc")
    VARS = DATASET.data_vars
except (FileNotFoundError, ValueError):
    # A missing file raises FileNotFoundError; an un-smudged Git LFS pointer
    # stub raises ValueError. Either way the real dataset is unavailable,
    # so skip the whole module rather than erroring at collection time.
    pytest.skip("Original LFS dataset not checked out.", allow_module_level=True)


def r2_score(y_true: "xr.DataArray", y_pred: "xr.DataArray") -> float:
    """Compute the coefficient of determination (R²) of ``y_pred`` vs ``y_true``.

    Works on any array-like supporting elementwise arithmetic with ``sum``
    and ``mean`` methods (xarray DataArray, numpy ndarray, ...).
    """
    sse = ((y_true - y_pred) ** 2).sum()
    sst = ((y_true - y_true.mean()) ** 2).sum()
    # Cast to a plain float so the annotated return type actually holds: the
    # original returned a 0-d DataArray/ndarray, not a Python float.
    return float(1 - sse / sst)


@pytest.fixture
def syndata(modelpath="pyrealm_build_data/data_model.nc"):
    """Synthetic dataset reconstructed from the stored model parameters."""
    from pyrealm_build_data.synth_data import reconstruct

    params = xr.open_dataset(modelpath)
    times = xr.date_range("2012-01-01", "2018-01-01", freq="12h")
    return reconstruct(params, times)


@pytest.fixture
def dataset(syndata):
    """Slice of the original dataset aligned with the synthetic time axis."""
    return DATASET.sel(time=syndata.time)


@pytest.mark.parametrize("var", VARS)
def test_synth_data_quality(dataset, syndata, var):
    """Check the reconstruction of ``var`` matches the original (R² > 0.85).

    Scores a random subsample of times and latitudes to keep the
    comparison cheap.
    """
    # Seed the sampler: the original used the global unseeded np.random,
    # which could make a borderline R² fail intermittently.
    rng = np.random.default_rng(0)
    times = syndata.time[rng.choice(syndata.time.size, 1000, replace=False)]
    lats = syndata.lat[rng.choice(syndata.lat.size, 100, replace=False)]
    truth = dataset[var].sel(lat=lats, time=times)
    pred = syndata[var].sel(lat=lats, time=times)
    score = r2_score(truth, pred)
    print(f"R2 score for {var} is {score:.2f}")
    assert score > 0.85
Loading