From b3641e19427d0afc790bba9d98ccced97fff5395 Mon Sep 17 00:00:00 2001
From: nepslor <nepslor@gmail.com>
Date: Tue, 10 Sep 2024 12:38:26 +0200
Subject: [PATCH] added DiscreteDistr model, fixed quantile bugs

---
 pyforecaster/forecaster.py                    |  4 +-
 pyforecaster/forecasting_models/benchmarks.py | 50 ++++++++++++++++++-
 .../forecasting_models/randomforests.py       |  1 +
 pyforecaster/formatter.py                     |  6 ++-
 requirements.txt                              |  2 +-
 tests/test_models.py                          | 32 +++++++++++-
 6 files changed, 88 insertions(+), 7 deletions(-)

diff --git a/pyforecaster/forecaster.py b/pyforecaster/forecaster.py
index 1b144b9..0e44d81 100644
--- a/pyforecaster/forecaster.py
+++ b/pyforecaster/forecaster.py
@@ -107,7 +107,6 @@ def _predict_quantiles(self, x, **kwargs):
     def quantiles_to_df(self, q_hat:np.ndarray, index, q_vect=None):
         level_0_labels = self.target_cols
         level_1_labels = self.q_vect if q_vect is None else q_vect
-        q_hat = np.swapaxes(q_hat, 1, 2)
         q_hat = np.reshape(q_hat, (q_hat.shape[0], q_hat.shape[1] * q_hat.shape[2]))
         q_hat = pd.DataFrame(q_hat, index=index, columns=pd.MultiIndex.from_product([level_0_labels, level_1_labels]))
         q_hat.columns.names = ['target', 'quantile']
@@ -117,8 +116,7 @@ def quantiles_to_df(self, q_hat:np.ndarray, index, q_vect=None):
     def quantiles_to_numpy(q_hat:pd.DataFrame):
         n_taus = len(q_hat.columns.get_level_values(1).unique())
         q_hat = q_hat.values
-        q_hat = np.reshape(q_hat, (q_hat.shape[0], n_taus, -1))
-        q_hat = np.swapaxes(q_hat, 1, 2)
+        q_hat = np.reshape(q_hat, (q_hat.shape[0], -1, n_taus))
         return q_hat
 
     def predict_quantiles(self, x, dataframe=True, **kwargs):
diff --git a/pyforecaster/forecasting_models/benchmarks.py b/pyforecaster/forecasting_models/benchmarks.py
index 01b893c..706ede9 100644
--- a/pyforecaster/forecasting_models/benchmarks.py
+++ b/pyforecaster/forecasting_models/benchmarks.py
@@ -72,8 +72,56 @@ def __init__(self, period='1d', n_sa=1, q_vect=None, val_ratio=None, nodes_at_st
                          conditional_to_hour=conditional_to_hour, **scengen_kwgs)
         self.n_sa = n_sa
         self.period = period
+        self.target_names = None
+        self.y_distributions = None
+        self.support = None
 
     def fit(self, x:pd.DataFrame, y:pd.DataFrame):
         # infer sampling time
         sampling_time = pd.infer_freq(x.index)
-        # aggregate by
\ No newline at end of file
+
+        # retrieve support of target distribution
+        support = np.unique(y.values)
+        self.support = support
+        print('support of target distribution:', support)
+
+        # Create a new column for the time within a day (hours and minutes)
+        time_of_day = y.index.floor(sampling_time).time
+
+        # Group by this time of day and calculate the mean across all days
+        mean_by_time_of_day = {}
+        for v in support:
+            mean_by_time_of_day[v] = y.groupby(time_of_day).agg(lambda x: np.sum(x==v))
+
+        y_distrib = pd.concat(mean_by_time_of_day, axis=1)
+        # swap levels of the multiindex
+        y_distrib = y_distrib.swaplevel(0, 1, axis=1).sort_index(axis=1)
+
+        # normalize the distribution for each variable in level 0
+        for t in y_distrib.columns.levels[0]:
+            y_distrib[t] = y_distrib[t] / (y_distrib[t].sum(axis=1).values.reshape(-1, 1)+1e-12)
+
+        # store the distribution
+        self.y_distributions = y_distrib
+        self.target_names = list(y_distrib.columns.levels[0])
+        self.target_cols = [f'{t}_t+{i}' for t in self.target_names for i in range(1, self.n_sa+1)]
+        return self
+
+    def predict(self, x, **kwargs):
+        return np.mean(self.predict_probabilities(x) * self.support.reshape(1, -1), axis=1)
+
+    def predict_probabilities(self, x, **kwargs):
+        # infer sampling time
+        sampling_time = pd.infer_freq(x.index)
+        # Create a new column for the time within a day (hours and minutes)
+        time_of_day = pd.Series(x.index.floor(sampling_time).time)
+
+        # retrieve the distribution for the time of day
+        y_hat = {}
+        for t in self.target_names:
+            pres_t = [time_of_day.map(self.y_distributions[t].loc[:, i]).rename(f'{i}') for i in
+                      self.y_distributions[t].columns]
+            y_hat[t] = pd.concat(pres_t, axis=1)
+
+        return pd.concat(y_hat, axis=1)
+
diff --git a/pyforecaster/forecasting_models/randomforests.py b/pyforecaster/forecasting_models/randomforests.py
index d678f5f..656f203 100644
--- a/pyforecaster/forecasting_models/randomforests.py
+++ b/pyforecaster/forecasting_models/randomforests.py
@@ -140,6 +140,7 @@ def predict(self, x, **kwargs):
             else:
                 if len(preds.shape) == 2:
                     preds = np.expand_dims(preds, 0)
+                preds = np.swapaxes(preds, 1, 2)
                 preds = self.quantiles_to_df(preds, index=x.index, q_vect=kwargs['quantiles'])
         else:
             preds = pd.DataFrame(np.atleast_2d(np.squeeze(preds)), index=x.index, columns=self.target_cols)
diff --git a/pyforecaster/formatter.py b/pyforecaster/formatter.py
index 435d76f..ca1763a 100644
--- a/pyforecaster/formatter.py
+++ b/pyforecaster/formatter.py
@@ -47,7 +47,11 @@ def add_time_features(self, x):
                 col_names = ['hour', 'dayofweek', 'minuteofday']
                 self.timezone = False
             else:
-                utc_offset = x.apply(lambda x: x.name.utcoffset().total_seconds()/3600, axis=1)
+                if x.shape[1] == 0:
+                    utc_offset = pd.DataFrame(index=x.index, columns=[0]).apply(lambda x: x.name.utcoffset().
+                                                                                total_seconds() / 3600, axis=1)
+                else:
+                    utc_offset = x.apply(lambda x: x.name.utcoffset().total_seconds()/3600, axis=1)
                 time_features = [x.index.hour, x.index.dayofweek, x.index.hour * 60 + x.index.minute, utc_offset]
                 col_names = ['hour', 'dayofweek', 'minuteofday', 'utc_offset']
                 self.timezone = True
diff --git a/requirements.txt b/requirements.txt
index 91ba385..bc34bda 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,7 @@ holidays>=0.16
 python-long-weekends>=0.1.1
 jax>=0.4.1
 jaxlib>=0.4.1
-quantile-forest>=1.2.0
+quantile-forest>=1.3.10
 optax>=0.1.7
 flax>=0.7.4
 statsmodels>=0.14.2
\ No newline at end of file
diff --git a/tests/test_models.py b/tests/test_models.py
index fcd5d7c..d51824b 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -16,7 +16,7 @@
 from pyforecaster.formatter import Formatter
 from pyforecaster.forecasting_models.neural_models.base_nn import FFNN
 from pyforecaster.plot_utils import ts_animation
-from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent
+from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent, DiscreteDistr
 
 class TestFormatDataset(unittest.TestCase):
     def setUp(self) -> None:
@@ -294,6 +294,36 @@ def test_persistence(self):
         y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1)
         plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300)
 
+    """
+    def test_discrete_distr(self):
+        self.data = self.data.resample('1h').mean()
+        self.data = self.data.round(-1)
+        formatter = Formatter(logger=self.logger, augment=False)
+        formatter.add_target_transform(['all'], lags=-np.arange(1, 24))
+        x, y = formatter.transform(self.data)
+        n_tr = 6000
+        n_te = 2000
+        x_tr, y_tr, x_te, y_te = x.iloc[:n_tr], y.iloc[:n_tr], x.iloc[n_tr:n_tr+n_te], y.iloc[n_tr:n_tr+n_te]
+
+        m = DiscreteDistr(target_name='all', q_vect=np.arange(31)/30, nodes_at_step=None, val_ratio=0.8, n_sa=24,
+                                    period='1d').fit(x_tr, y_tr)
+        y_hat = m.predict(x_te)
+        y_hat = m.predict_probabilities(x_te)
+        from pyforecaster.forecaster import ScenarioGenerator
+
+        plt.matshow(ScenarioGenerator().quantiles_to_numpy(y_hat)[0].T)
+
+        n_taus = len(y_hat.columns.get_level_values(1).unique())
+        q_hat = y_hat.values
+        q_hat = np.reshape(q_hat, (q_hat.shape[0], -1, n_taus))
+        plt.matshow(q_hat[0].T)
+
+
+        n_taus = len(y_hat.columns.get_level_values(1).unique())
+        q_hat = y_hat.values
+        q_hat = np.reshape(q_hat, (q_hat.shape[0], n_taus, -1))
+        q_hat = np.swapaxes(q_hat, 1, 2)
+    """
 
 if __name__ == '__main__':
     unittest.main()