
added statsmodels wrapper, added benchmarks.py
nepslor committed Sep 9, 2024
1 parent 42ac83f commit e616fdd
Showing 4 changed files with 223 additions and 27 deletions.
35 changes: 27 additions & 8 deletions pyforecaster/forecaster.py
@@ -13,7 +13,7 @@

class ScenarioGenerator(object):
def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, n_scen_fit=100,
-                 additional_node=False, formatter=None, **scengen_kwgs):
additional_node=False, formatter=None, conditional_to_hour=True, **scengen_kwgs):
self.q_vect = np.hstack([0.01, np.linspace(0,1,11)[1:-1], 0.99]) if q_vect is None else q_vect
self.scengen = ScenGen(q_vect=self.q_vect, nodes_at_step=nodes_at_step, additional_node=additional_node, **scengen_kwgs)
self.val_ratio = val_ratio
@@ -22,6 +22,7 @@ def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None,
self.n_scen_fit = n_scen_fit
self.additional_node = additional_node
self.formatter = formatter
self.conditional_to_hour = conditional_to_hour

@property
def online_tree_reduction(self):
@@ -46,14 +47,18 @@ def fit(self, x:pd.DataFrame, y:pd.DataFrame):
self.scengen.fit(errs, x, n_scen=self.n_scen_fit)

-        self.err_distr = {}
-        hours = np.arange(24)
-        if len(np.unique(x.index.hour)) != 24:
-            print('not all hours are there in the training set, building unconditional confidence intervals')
-            for h in hours:
-                self.err_distr[h] = np.quantile(errs, self.q_vect, axis=0).T
-        else:
-            for h in hours:
-                self.err_distr[h] = np.quantile(errs.loc[errs.index.hour == h, :], self.q_vect, axis=0).T
        if self.conditional_to_hour:
            self.err_distr = {}
            hours = np.arange(24)
            if len(np.unique(x.index.hour)) != 24:
                print('not all hours are there in the training set, building unconditional confidence intervals')
                for h in hours:
                    self.err_distr[h] = np.quantile(errs, self.q_vect, axis=0).T
            else:
                for h in hours:
                    self.err_distr[h] = np.quantile(errs.loc[errs.index.hour == h, :], self.q_vect, axis=0).T
        else:
            self.err_distr = np.quantile(errs, self.q_vect, axis=0).T

@abstractmethod
def predict(self, x, **kwargs):
@@ -69,6 +74,20 @@ def anti_transform(self, x, y_hat):
def predict_quantiles(self, x, **kwargs):
pass

def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs):
"""
Return probability mass function of the target variable, obtained from quantile predictions
:param x:
:param predict_q_kwargs:
:return:
"""
quantiles = self.predict_quantiles(x, **predict_q_kwargs)
pmf = np.zeros((quantiles.shape[0], quantiles.shape[1], len(discrete_prob_space)-1))
for i in range(quantiles.shape[0]):
for j in range(quantiles.shape[1]):
pmf[i, j, :] = np.histogram(quantiles[i, j, :], bins=discrete_prob_space)[0]/len(self.q_vect)
return pmf
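
A minimal usage sketch of the new predict_pmf method; `m` and `x_te` are assumed names for an already fitted forecaster and a test feature frame, and the bin edges are illustrative:

import numpy as np

bins = np.linspace(0, 1000, 11)                        # 11 edges -> 10 discrete bins (illustrative values)
pmf = m.predict_pmf(x_te, discrete_prob_space=bins)    # shape (n_obs, n_sa, 10)
# each pmf[i, j, :] is the fraction of predicted quantiles falling in each bin,
# so it sums to at most 1 (quantiles outside the bin range are not counted)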

def predict_scenarios(self, x, n_scen=None, random_state=None, **predict_q_kwargs):
n_scen = self.n_scen_fit if n_scen is None else n_scen
# retrieve quantiles from child class
79 changes: 79 additions & 0 deletions pyforecaster/forecasting_models/benchmarks.py
@@ -0,0 +1,79 @@
from pyforecaster.forecaster import ScenarioGenerator
import pandas as pd
import numpy as np
from pyforecaster.forecasting_models.holtwinters import hankel

class Persistent(ScenarioGenerator):
def __init__(self, target_col, n_sa=1, q_vect=None, val_ratio=None, nodes_at_step=None,
conditional_to_hour=False, **scengen_kwgs):
super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio,
conditional_to_hour=conditional_to_hour, **scengen_kwgs)
self.n_sa = n_sa
self.target_col = target_col

def fit(self, x:pd.DataFrame, y:pd.DataFrame=None):
x, y, x_val, y_val = self.train_val_split(x, x[[self.target_col]])
hw_target = hankel(y_val.iloc[1:].values, self.n_sa)
super().fit(x_val.iloc[:-self.n_sa, :], pd.DataFrame(hw_target, index=x_val.index[:hw_target.shape[0]]))
return self

def predict(self, x:pd.DataFrame, **kwargs):
y_hat = x[self.target_col].values.reshape(-1, 1) * np.ones((1, self.n_sa))
return pd.DataFrame(y_hat, index=x.index)

def predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
if self.conditional_to_hour:
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
else:
preds += np.expand_dims(self.err_distr, 0)

return preds


class SeasonalPersistent(ScenarioGenerator):
def __init__(self, target_col, seasonality=1, n_sa=1, q_vect=None, val_ratio=None, nodes_at_step=None,
conditional_to_hour=False, **scengen_kwgs):
super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio,
conditional_to_hour=conditional_to_hour, **scengen_kwgs)
self.seasonality = seasonality
self.n_sa = n_sa
self.target_col = target_col
self.state = None

def fit(self, x: pd.DataFrame, y: pd.DataFrame = None):
x, y, x_val, y_val = self.train_val_split(x, x[[self.target_col]])
hw_target = hankel(y_val.iloc[1:].values, self.n_sa)
self.state = x[[self.target_col]].iloc[-self.seasonality:].values
super().fit(x_val.iloc[:-self.n_sa, :], pd.DataFrame(hw_target, index=x_val.index[:hw_target.shape[0]]))
return self

def predict(self, x: pd.DataFrame, **kwargs):
values = np.hstack([self.state.ravel(), x[self.target_col].values])
y_hat = hankel(values, self.n_sa)
return pd.DataFrame(y_hat[:len(x), :], index=x.index)

def predict_quantiles(self, x: pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
if self.conditional_to_hour:
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
else:
preds += np.expand_dims(self.err_distr, 0)

return preds


class DiscreteDistr(ScenarioGenerator):
def __init__(self, period='1d', n_sa=1, q_vect=None, val_ratio=None, nodes_at_step=None,
conditional_to_hour=False, **scengen_kwgs):
super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio,
conditional_to_hour=conditional_to_hour, **scengen_kwgs)
self.n_sa = n_sa
self.period = period

def fit(self, x:pd.DataFrame, y:pd.DataFrame):
# infer sampling time
sampling_time = pd.infer_freq(x.index)
# aggregate by
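
A minimal usage sketch of the persistence benchmarks above, assuming an hourly DataFrame df with a target column named 'all' (mirroring test_persistence in tests/test_models.py):

import numpy as np
from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent

# df is an assumed hourly DataFrame containing the target column 'all'
pers = Persistent(target_col='all', n_sa=24, q_vect=np.arange(11)/10, val_ratio=0.8).fit(df.iloc[:2000])
y_hat = pers.predict(df.iloc[2000:])            # repeats the last observed value over the 24 steps ahead
q_hat = pers.predict_quantiles(df.iloc[2000:])  # point forecasts plus validation-error quantiles

seas = SeasonalPersistent(target_col='all', seasonality=24, n_sa=24, q_vect=np.arange(11)/10,
                          val_ratio=0.8).fit(df.iloc[:2000])
y_hat_seas = seas.predict(df.iloc[2000:])       # uses the values observed one season (24 steps) earlier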
71 changes: 71 additions & 0 deletions pyforecaster/forecasting_models/statsmodels_wrapper.py
@@ -0,0 +1,71 @@
import numpy as np
import pandas as pd
from pyforecaster.forecaster import ScenarioGenerator
from statsmodels.tsa.statespace import exponential_smoothing
from tqdm import tqdm
from pyforecaster.forecasting_models.holtwinters import hankel

def get_forecasts(model, y_te, x_te, n_step_ahead=24):
y_hat = np.zeros((len(y_te), n_step_ahead))
for i in tqdm(range(len(y_te))):
y_hat[i,:] = model.forecast(n_step_ahead)
exog = x_te.iloc[[i]].values if x_te is not None else None
        try:
            # append() updates the fitted model with the last observation, keeping the estimated parameters
            model = model.append(y_te.iloc[[i]], exog=exog)
        except Exception:
            print('Error in updating the model')
            # fall back to apply(), which re-applies the fitted parameters to the new observation
            model = model.apply(y_te.iloc[[i]], exog=exog)
return pd.DataFrame(y_hat, index=y_te.index)
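
A hedged example of how this rolling loop might be called; `res` and `y_te` are assumed names for an already fitted statsmodels results object and an hourly test series:

# res: fitted statsmodels results object, y_te: test series (assumed names)
y_hat = get_forecasts(res, y_te, x_te=None, n_step_ahead=24)  # one 24-step forecast per test timestamp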


class StatsModelsWrapper(ScenarioGenerator):
def __init__(self, model_class, model_pars, target_name, q_vect, nodes_at_step=None, val_ratio=0.8, n_sa=1,
**scengen_kwgs):
self.model_class = model_class
self.model_pars = model_pars
self.model = None
self.target_name = target_name
self.n_sa = n_sa
super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio, **scengen_kwgs)

def fit(self, x_pd:pd.DataFrame, y:pd.DataFrame=None):
y_present = x_pd[self.target_name].values

# exclude the last n_sa, we need them to create the target
preds = self.predict(x_pd)[:-self.n_sa]

# hankelize the target
hw_target = hankel(y_present[1:], self.n_sa)
resid = hw_target - preds
self.err_distr = np.quantile(resid, self.q_vect, axis=0).T


def predict(self, x_pd:pd.DataFrame, **kwargs):
pass

def predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
        # fit() stores err_distr as a single (n_sa, n_quantiles) array, so the residual quantiles
        # are added unconditionally rather than per hour
        preds += np.expand_dims(self.err_distr, 0)
return preds


class ExponentialSmoothing(StatsModelsWrapper):
def __init__(self, target_name, q_vect, nodes_at_step=None, val_ratio=0.8, n_sa=1, trend=None, seasonal=None, **scengen_kwgs):
model_class = exponential_smoothing.ExponentialSmoothing
model_pars = {'trend': trend, 'seasonal': seasonal}
super().__init__(model_class, model_pars, target_name, q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio, n_sa=n_sa, **scengen_kwgs)

def fit(self, x_pd:pd.DataFrame, y:pd.DataFrame=None):
x_pd = x_pd.asfreq(pd.infer_freq(x_pd.index))
y_pd = x_pd[[self.target_name]]
self.model = self.model_class(**self.model_pars, endog=y_pd).fit()
super().fit(x_pd, y_pd)
return self

def predict(self, x_pd:pd.DataFrame, **kwargs):
x_pd = x_pd.asfreq(pd.infer_freq(x_pd.index))
y = x_pd[self.target_name]
y_hat = get_forecasts(self.model, y, n_step_ahead=self.n_sa, x_te=None)
return y_hat

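
A minimal usage sketch of the wrapper, assuming an hourly DataFrame df with a target column named 'all' (mirroring test_statsmodels_wrappers in tests/test_models.py):

import numpy as np
from pyforecaster.forecasting_models.statsmodels_wrapper import ExponentialSmoothing

# df is an assumed hourly DataFrame containing the target column 'all'
es = ExponentialSmoothing(target_name='all', q_vect=np.arange(31)/30, n_sa=24, seasonal=24).fit(df.iloc[:100])
y_hat = es.predict(df.iloc[100:300])             # 24-step-ahead forecasts, one row per test timestamp
q_hat = es.predict_quantiles(df.iloc[100:300])   # point forecasts plus residual quantiles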
65 changes: 46 additions & 19 deletions tests/test_models.py
@@ -6,14 +6,17 @@
import logging
from pyforecaster.forecasting_models.holtwinters import HoltWinters, HoltWintersMulti
from pyforecaster.forecasting_models.fast_adaptive_models import Fourier_es, FK, FK_multi
-from pyforecaster.forecasting_models.random_fourier_features import RFFRegression, AdditiveRFFRegression
from pyforecaster.forecasting_models.random_fourier_features import RFFRegression, AdditiveRFFRegression, BrutalRegressor
from pyforecaster.forecasting_models.randomforests import QRF
from pyforecaster.forecasting_models.gradientboosters import LGBMHybrid
from pyforecaster.forecasting_models.statsmodels_wrapper import ExponentialSmoothing
from pyforecaster.forecaster import LinearForecaster, LGBForecaster
from pyforecaster.plot_utils import plot_quantiles
from pyforecaster.formatter import Formatter
from pyforecaster.forecasting_models.neural_models.base_nn import FFNN
from pyforecaster.plot_utils import ts_animation
from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent

class TestFormatDataset(unittest.TestCase):
def setUp(self) -> None:
self.t = 10000
@@ -185,23 +188,6 @@ def test_qrf(self):
y_hat = qrf.predict(x_te.iloc[[0], :])
q = qrf.predict(x_te.iloc[[0], :])

-    def test_rffr(self):
-        formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(144),
-                                                                 relative_lags=True)
-        formatter.add_target_transform(['all'], lags=-np.arange(144))
-
-        x, y = formatter.transform(self.data.iloc[:10000])
-        x.columns = x.columns.astype(str)
-        y.columns = y.columns.astype(str)
-        n_tr = int(len(x) * 0.7)
-        x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
-                                  y.iloc[n_tr:].copy()]
-        m = RFFRegression(std_kernel=0.001, dim_kernel=30).fit(x_tr, y_tr)
-        y_hat = m.predict(x_te)
-        q = m.predict_quantiles(x_te)
-
-        m = LinearForecaster(val_ratio=0.2).fit(x_tr, y_tr)
-        y_hat_lin = m.predict(x_te)


def test_antinormalize(self):
@@ -242,7 +228,48 @@ def test_antinormalize(self):
mae = lambda x, y: np.abs(x-y).mean().mean()
print('MAE lin:', mae(y_te, y_hat))


def test_statsmodels_wrappers(self):
self.data = self.data.resample('1h').mean()
data_tr = self.data.iloc[:100]
data_te = self.data.iloc[100:300]
m = ExponentialSmoothing(target_name='all', q_vect=np.arange(31)/30, nodes_at_step=None, val_ratio=0.8, n_sa=24,
seasonal=24).fit(data_tr)
y_hat = m.predict(data_te)
q_hat = m.predict_quantiles(data_te)
y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1)
#plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300)

discr_prob = m.predict_pmf(data_te, discrete_prob_space=np.linspace(500, 1200, 10))
plt.matshow(discr_prob[2].T )
import seaborn as sb

i = 1
plt.figure(figsize=(10, 6))
x_bins = np.arange(24)
y_bins = np.linspace(500, 1200, 10)
extent = [x_bins.min(), x_bins.max(), y_bins.min(), y_bins.max()]
plt.imshow(discr_prob[i].T, aspect='auto', extent=extent, origin='lower', cmap='plasma')

# Overlay the time series on top
plt.plot(x_bins, y_plot.values[i, :], color='black', linewidth=2, label="Time Series")
plt.plot(x_bins, q_hat[i, :, :], color='black', linewidth=2, label="Time Series")

def test_persistence(self):
self.data = self.data.resample('1h').mean()
data_tr = self.data.iloc[:2100]
data_te = self.data.iloc[2100:2400]
m = Persistent(target_col='all', n_sa=24, q_vect=np.arange(11)/10, val_ratio=0.8, conditional_to_hour=False).fit(data_tr)
y_hat = m.predict(data_te)
q_hat = m.predict_quantiles(data_te)
y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1)
plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300)

m = SeasonalPersistent(target_col='all', seasonality=24, n_sa=24, q_vect=np.arange(11) / 10, val_ratio=0.8,
conditional_to_hour=True).fit(data_tr)
y_hat = m.predict(data_te)
q_hat = m.predict_quantiles(data_te)
y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1)
plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300)


if __name__ == '__main__':
