From e616fddc2a6e2fe024dc25f63ffc169e0afbcd51 Mon Sep 17 00:00:00 2001 From: nepslor Date: Mon, 9 Sep 2024 09:21:12 +0200 Subject: [PATCH] added statsmodels wrapper, added benchmarks.py --- pyforecaster/forecaster.py | 35 ++++++-- pyforecaster/forecasting_models/benchmarks.py | 79 +++++++++++++++++++ .../forecasting_models/statsmodels_wrapper.py | 71 +++++++++++++++++ tests/test_models.py | 65 ++++++++++----- 4 files changed, 223 insertions(+), 27 deletions(-) create mode 100644 pyforecaster/forecasting_models/benchmarks.py create mode 100644 pyforecaster/forecasting_models/statsmodels_wrapper.py diff --git a/pyforecaster/forecaster.py b/pyforecaster/forecaster.py index 54ab19e..998f5c9 100644 --- a/pyforecaster/forecaster.py +++ b/pyforecaster/forecaster.py @@ -13,7 +13,7 @@ class ScenarioGenerator(object): def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, n_scen_fit=100, - additional_node=False, formatter=None, **scengen_kwgs): + additional_node=False, formatter=None, conditional_to_hour=True, **scengen_kwgs): self.q_vect = np.hstack([0.01, np.linspace(0,1,11)[1:-1], 0.99]) if q_vect is None else q_vect self.scengen = ScenGen(q_vect=self.q_vect, nodes_at_step=nodes_at_step, additional_node=additional_node, **scengen_kwgs) self.val_ratio = val_ratio @@ -22,6 +22,7 @@ def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, self.n_scen_fit = n_scen_fit self.additional_node = additional_node self.formatter = formatter + self.conditional_to_hour = conditional_to_hour @property def online_tree_reduction(self): @@ -46,14 +47,18 @@ def fit(self, x:pd.DataFrame, y:pd.DataFrame): self.scengen.fit(errs, x, n_scen=self.n_scen_fit) self.err_distr = {} - hours = np.arange(24) - if len(np.unique(x.index.hour)) != 24: - print('not all hours are there in the training set, building unconditional confidence intervals') - for h in hours: - self.err_distr[h] = np.quantile(errs, self.q_vect, axis=0).T + if self.conditional_to_hour: + self.err_distr = {} + hours = np.arange(24) + if len(np.unique(x.index.hour)) != 24: + print('not all hours are there in the training set, building unconditional confidence intervals') + for h in hours: + self.err_distr[h] = np.quantile(errs, self.q_vect, axis=0).T + else: + for h in hours: + self.err_distr[h] = np.quantile(errs.loc[errs.index.hour == h, :], self.q_vect, axis=0).T else: - for h in hours: - self.err_distr[h] = np.quantile(errs.loc[errs.index.hour == h, :], self.q_vect, axis=0).T + self.err_distr = np.quantile(errs, self.q_vect, axis=0).T @abstractmethod def predict(self, x, **kwargs): @@ -69,6 +74,20 @@ def anti_transform(self, x, y_hat): def predict_quantiles(self, x, **kwargs): pass + def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs): + """ + Return probability mass function of the target variable, obtained from quantile predictions + :param x: + :param predict_q_kwargs: + :return: + """ + quantiles = self.predict_quantiles(x, **predict_q_kwargs) + pmf = np.zeros((quantiles.shape[0], quantiles.shape[1], len(discrete_prob_space)-1)) + for i in range(quantiles.shape[0]): + for j in range(quantiles.shape[1]): + pmf[i, j, :] = np.histogram(quantiles[i, j, :], bins=discrete_prob_space)[0]/len(self.q_vect) + return pmf + def predict_scenarios(self, x, n_scen=None, random_state=None, **predict_q_kwargs): n_scen = self.n_scen_fit if n_scen is None else n_scen # retrieve quantiles from child class diff --git a/pyforecaster/forecasting_models/benchmarks.py b/pyforecaster/forecasting_models/benchmarks.py new file mode 100644 index 0000000..138cbc3 --- /dev/null +++ b/pyforecaster/forecasting_models/benchmarks.py @@ -0,0 +1,79 @@ +from pyforecaster.forecaster import ScenarioGenerator +import pandas as pd +import numpy as np +from pyforecaster.forecasting_models.holtwinters import hankel + +class Persistent(ScenarioGenerator): + def __init__(self, target_col, n_sa=1, q_vect=None, val_ratio=None, nodes_at_step=None, + conditional_to_hour=False, **scengen_kwgs): + super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio, + conditional_to_hour=conditional_to_hour, **scengen_kwgs) + self.n_sa = n_sa + self.target_col = target_col + + def fit(self, x:pd.DataFrame, y:pd.DataFrame=None): + x, y, x_val, y_val = self.train_val_split(x, x[[self.target_col]]) + hw_target = hankel(y_val.iloc[1:].values, self.n_sa) + super().fit(x_val.iloc[:-self.n_sa, :], pd.DataFrame(hw_target, index=x_val.index[:hw_target.shape[0]])) + return self + + def predict(self, x:pd.DataFrame, **kwargs): + y_hat = x[self.target_col].values.reshape(-1, 1) * np.ones((1, self.n_sa)) + return pd.DataFrame(y_hat, index=x.index) + + def predict_quantiles(self, x:pd.DataFrame, **kwargs): + preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect))) + if self.conditional_to_hour: + for h in np.unique(x.index.hour): + preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0) + else: + preds += np.expand_dims(self.err_distr, 0) + + return preds + + +class SeasonalPersistent(ScenarioGenerator): + def __init__(self, target_col, seasonality=1, n_sa=1, q_vect=None, val_ratio=None, nodes_at_step=None, + conditional_to_hour=False, **scengen_kwgs): + super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio, + conditional_to_hour=conditional_to_hour, **scengen_kwgs) + self.seasonality = seasonality + self.n_sa = n_sa + self.target_col = target_col + self.state = None + + def fit(self, x: pd.DataFrame, y: pd.DataFrame = None): + x, y, x_val, y_val = self.train_val_split(x, x[[self.target_col]]) + hw_target = hankel(y_val.iloc[1:].values, self.n_sa) + self.state = x[[self.target_col]].iloc[-self.seasonality:].values + super().fit(x_val.iloc[:-self.n_sa, :], pd.DataFrame(hw_target, index=x_val.index[:hw_target.shape[0]])) + return self + + def predict(self, x: pd.DataFrame, **kwargs): + values = np.hstack([self.state.ravel(), x[self.target_col].values]) + y_hat = hankel(values, self.n_sa) + return pd.DataFrame(y_hat[:len(x), :], index=x.index) + + def predict_quantiles(self, x: pd.DataFrame, **kwargs): + preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect))) + if self.conditional_to_hour: + for h in np.unique(x.index.hour): + preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0) + else: + preds += np.expand_dims(self.err_distr, 0) + + return preds + + +class DiscreteDistr(ScenarioGenerator): + def __init__(self, period='1d', n_sa=1, q_vect=None, val_ratio=None, nodes_at_step=None, + conditional_to_hour=False, **scengen_kwgs): + super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio, + conditional_to_hour=conditional_to_hour, **scengen_kwgs) + self.n_sa = n_sa + self.period = period + + def fit(self, x:pd.DataFrame, y:pd.DataFrame): + # infer sampling time + sampling_time = pd.infer_freq(x.index) + # aggregate by \ No newline at end of file diff --git a/pyforecaster/forecasting_models/statsmodels_wrapper.py b/pyforecaster/forecasting_models/statsmodels_wrapper.py new file mode 100644 index 0000000..8456b97 --- /dev/null +++ b/pyforecaster/forecasting_models/statsmodels_wrapper.py @@ -0,0 +1,71 @@ +import numpy as np +import pandas as pd +from pyforecaster.forecaster import ScenarioGenerator +from statsmodels.tsa.statespace import exponential_smoothing +from tqdm import tqdm +from pyforecaster.forecasting_models.holtwinters import hankel + +def get_forecasts(model, y_te, x_te, n_step_ahead=24): + y_hat = np.zeros((len(y_te), n_step_ahead)) + for i in tqdm(range(len(y_te))): + y_hat[i,:] = model.forecast(n_step_ahead) + exog = x_te.iloc[[i]].values if x_te is not None else None + try: + model = model.append(y_te.iloc[[i]], exog=exog) # this method UPDATES the model with the last observation + except: + print('Error in updating the model') + model = model.apply(y_te.iloc[[i]], exog=exog) # this method UPDATES the model with the last observation + return pd.DataFrame(y_hat, index=y_te.index) + + +class StatsModelsWrapper(ScenarioGenerator): + def __init__(self, model_class, model_pars, target_name, q_vect, nodes_at_step=None, val_ratio=0.8, n_sa=1, + **scengen_kwgs): + self.model_class = model_class + self.model_pars = model_pars + self.model = None + self.target_name = target_name + self.n_sa = n_sa + super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio, **scengen_kwgs) + + def fit(self, x_pd:pd.DataFrame, y:pd.DataFrame=None): + y_present = x_pd[self.target_name].values + + # exclude the last n_sa, we need them to create the target + preds = self.predict(x_pd)[:-self.n_sa] + + # hankelize the target + hw_target = hankel(y_present[1:], self.n_sa) + resid = hw_target - preds + self.err_distr = np.quantile(resid, self.q_vect, axis=0).T + + + def predict(self, x_pd:pd.DataFrame, **kwargs): + pass + + def predict_quantiles(self, x:pd.DataFrame, **kwargs): + preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect))) + for h in np.unique(x.index.hour): + preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0) + return preds + + +class ExponentialSmoothing(StatsModelsWrapper): + def __init__(self, target_name, q_vect, nodes_at_step=None, val_ratio=0.8, n_sa=1, trend=None, seasonal=None, **scengen_kwgs): + model_class = exponential_smoothing.ExponentialSmoothing + model_pars = {'trend': trend, 'seasonal': seasonal} + super().__init__(model_class, model_pars, target_name, q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio, n_sa=n_sa, **scengen_kwgs) + + def fit(self, x_pd:pd.DataFrame, y:pd.DataFrame=None): + x_pd = x_pd.asfreq(pd.infer_freq(x_pd.index)) + y_pd = x_pd[[self.target_name]] + self.model = self.model_class(**self.model_pars, endog=y_pd).fit() + super().fit(x_pd, y_pd) + return self + + def predict(self, x_pd:pd.DataFrame, **kwargs): + x_pd = x_pd.asfreq(pd.infer_freq(x_pd.index)) + y = x_pd[self.target_name] + y_hat = get_forecasts(self.model, y, n_step_ahead=self.n_sa, x_te=None) + return y_hat + diff --git a/tests/test_models.py b/tests/test_models.py index 9ebdc77..105316a 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -6,14 +6,17 @@ import logging from pyforecaster.forecasting_models.holtwinters import HoltWinters, HoltWintersMulti from pyforecaster.forecasting_models.fast_adaptive_models import Fourier_es, FK, FK_multi -from pyforecaster.forecasting_models.random_fourier_features import RFFRegression, AdditiveRFFRegression +from pyforecaster.forecasting_models.random_fourier_features import RFFRegression, AdditiveRFFRegression, BrutalRegressor from pyforecaster.forecasting_models.randomforests import QRF from pyforecaster.forecasting_models.gradientboosters import LGBMHybrid +from pyforecaster.forecasting_models.statsmodels_wrapper import ExponentialSmoothing from pyforecaster.forecaster import LinearForecaster, LGBForecaster from pyforecaster.plot_utils import plot_quantiles from pyforecaster.formatter import Formatter from pyforecaster.forecasting_models.neural_models.base_nn import FFNN from pyforecaster.plot_utils import ts_animation +from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent + class TestFormatDataset(unittest.TestCase): def setUp(self) -> None: self.t = 10000 @@ -185,23 +188,6 @@ def test_qrf(self): y_hat = qrf.predict(x_te.iloc[[0], :]) q = qrf.predict(x_te.iloc[[0], :]) - def test_rffr(self): - formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(144), - relative_lags=True) - formatter.add_target_transform(['all'], lags=-np.arange(144)) - - x, y = formatter.transform(self.data.iloc[:10000]) - x.columns = x.columns.astype(str) - y.columns = y.columns.astype(str) - n_tr = int(len(x) * 0.7) - x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(), - y.iloc[n_tr:].copy()] - m = RFFRegression(std_kernel=0.001, dim_kernel=30).fit(x_tr, y_tr) - y_hat = m.predict(x_te) - q = m.predict_quantiles(x_te) - - m = LinearForecaster(val_ratio=0.2).fit(x_tr, y_tr) - y_hat_lin = m.predict(x_te) def test_antinormalize(self): @@ -242,7 +228,48 @@ def test_antinormalize(self): mae = lambda x, y: np.abs(x-y).mean().mean() print('MAE lin:', mae(y_te, y_hat)) - + def test_statsmodels_wrappers(self): + self.data = self.data.resample('1h').mean() + data_tr = self.data.iloc[:100] + data_te = self.data.iloc[100:300] + m = ExponentialSmoothing(target_name='all', q_vect=np.arange(31)/30, nodes_at_step=None, val_ratio=0.8, n_sa=24, + seasonal=24).fit(data_tr) + y_hat = m.predict(data_te) + q_hat = m.predict_quantiles(data_te) + y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1) + #plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300) + + discr_prob = m.predict_pmf(data_te, discrete_prob_space=np.linspace(500, 1200, 10)) + plt.matshow(discr_prob[2].T ) + import seaborn as sb + + i = 1 + plt.figure(figsize=(10, 6)) + x_bins = np.arange(24) + y_bins = np.linspace(500, 1200, 10) + extent = [x_bins.min(), x_bins.max(), y_bins.min(), y_bins.max()] + plt.imshow(discr_prob[i].T, aspect='auto', extent=extent, origin='lower', cmap='plasma') + + # Overlay the time series on top + plt.plot(x_bins, y_plot.values[i, :], color='black', linewidth=2, label="Time Series") + plt.plot(x_bins, q_hat[i, :, :], color='black', linewidth=2, label="Time Series") + + def test_persistence(self): + self.data = self.data.resample('1h').mean() + data_tr = self.data.iloc[:2100] + data_te = self.data.iloc[2100:2400] + m = Persistent(target_col='all', n_sa=24, q_vect=np.arange(11)/10, val_ratio=0.8, conditional_to_hour=False).fit(data_tr) + y_hat = m.predict(data_te) + q_hat = m.predict_quantiles(data_te) + y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1) + plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300) + + m = SeasonalPersistent(target_col='all', seasonality=24, n_sa=24, q_vect=np.arange(11) / 10, val_ratio=0.8, + conditional_to_hour=True).fit(data_tr) + y_hat = m.predict(data_te) + q_hat = m.predict_quantiles(data_te) + y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1) + plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300) if __name__ == '__main__':