
added statsmodels wrapper, added benchmarks.py
nepslor committed Sep 9, 2024
1 parent 42ac83f commit e616fdd
Showing 4 changed files with 223 additions and 27 deletions.
35 changes: 27 additions & 8 deletions pyforecaster/forecaster.py
@@ -13,7 +13,7 @@

class ScenarioGenerator(object):
def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None, n_scen_fit=100,
-                 additional_node=False, formatter=None, **scengen_kwgs):
additional_node=False, formatter=None, conditional_to_hour=True, **scengen_kwgs):
self.q_vect = np.hstack([0.01, np.linspace(0,1,11)[1:-1], 0.99]) if q_vect is None else q_vect
self.scengen = ScenGen(q_vect=self.q_vect, nodes_at_step=nodes_at_step, additional_node=additional_node, **scengen_kwgs)
self.val_ratio = val_ratio
@@ -22,6 +22,7 @@ def __init__(self, q_vect=None, nodes_at_step=None, val_ratio=None, logger=None,
self.n_scen_fit = n_scen_fit
self.additional_node = additional_node
self.formatter = formatter
self.conditional_to_hour = conditional_to_hour

@property
def online_tree_reduction(self):
@@ -46,14 +47,18 @@ def fit(self, x:pd.DataFrame, y:pd.DataFrame):
self.scengen.fit(errs, x, n_scen=self.n_scen_fit)

-        self.err_distr = {}
-        hours = np.arange(24)
-        if len(np.unique(x.index.hour)) != 24:
-            print('not all hours are there in the training set, building unconditional confidence intervals')
-            for h in hours:
-                self.err_distr[h] = np.quantile(errs, self.q_vect, axis=0).T
-        else:
-            for h in hours:
-                self.err_distr[h] = np.quantile(errs.loc[errs.index.hour == h, :], self.q_vect, axis=0).T
        if self.conditional_to_hour:
            self.err_distr = {}
            hours = np.arange(24)
            if len(np.unique(x.index.hour)) != 24:
                print('not all hours are there in the training set, building unconditional confidence intervals')
                for h in hours:
                    self.err_distr[h] = np.quantile(errs, self.q_vect, axis=0).T
            else:
                for h in hours:
                    self.err_distr[h] = np.quantile(errs.loc[errs.index.hour == h, :], self.q_vect, axis=0).T
        else:
            self.err_distr = np.quantile(errs, self.q_vect, axis=0).T

@abstractmethod
def predict(self, x, **kwargs):
@@ -69,6 +74,20 @@ def anti_transform(self, x, y_hat):
def predict_quantiles(self, x, **kwargs):
pass

def predict_pmf(self, x, discrete_prob_space, **predict_q_kwargs):
"""
Return probability mass function of the target variable, obtained from quantile predictions
:param x:
:param predict_q_kwargs:
:return:
"""
quantiles = self.predict_quantiles(x, **predict_q_kwargs)
pmf = np.zeros((quantiles.shape[0], quantiles.shape[1], len(discrete_prob_space)-1))
for i in range(quantiles.shape[0]):
for j in range(quantiles.shape[1]):
pmf[i, j, :] = np.histogram(quantiles[i, j, :], bins=discrete_prob_space)[0]/len(self.q_vect)
return pmf
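
A minimal usage sketch of the new predict_pmf method; `m` and `x_te` are assumed names for an already fitted forecaster and a test feature frame, and the bin edges are illustrative:

import numpy as np

bins = np.linspace(0, 1000, 11)                        # 11 edges -> 10 discrete bins (illustrative values)
pmf = m.predict_pmf(x_te, discrete_prob_space=bins)    # shape (n_obs, n_sa, 10)
# each pmf[i, j, :] is the fraction of predicted quantiles falling in each bin,
# so it sums to at most 1 (quantiles outside the bin range are not counted)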

def predict_scenarios(self, x, n_scen=None, random_state=None, **predict_q_kwargs):
n_scen = self.n_scen_fit if n_scen is None else n_scen
# retrieve quantiles from child class
79 changes: 79 additions & 0 deletions pyforecaster/forecasting_models/benchmarks.py
@@ -0,0 +1,79 @@
from pyforecaster.forecaster import ScenarioGenerator
import pandas as pd
import numpy as np
from pyforecaster.forecasting_models.holtwinters import hankel

class Persistent(ScenarioGenerator):
def __init__(self, target_col, n_sa=1, q_vect=None, val_ratio=None, nodes_at_step=None,
conditional_to_hour=False, **scengen_kwgs):
super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio,
conditional_to_hour=conditional_to_hour, **scengen_kwgs)
self.n_sa = n_sa
self.target_col = target_col

def fit(self, x:pd.DataFrame, y:pd.DataFrame=None):
x, y, x_val, y_val = self.train_val_split(x, x[[self.target_col]])
hw_target = hankel(y_val.iloc[1:].values, self.n_sa)
super().fit(x_val.iloc[:-self.n_sa, :], pd.DataFrame(hw_target, index=x_val.index[:hw_target.shape[0]]))
return self

def predict(self, x:pd.DataFrame, **kwargs):
y_hat = x[self.target_col].values.reshape(-1, 1) * np.ones((1, self.n_sa))
return pd.DataFrame(y_hat, index=x.index)

def predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
if self.conditional_to_hour:
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
else:
preds += np.expand_dims(self.err_distr, 0)

return preds


class SeasonalPersistent(ScenarioGenerator):
def __init__(self, target_col, seasonality=1, n_sa=1, q_vect=None, val_ratio=None, nodes_at_step=None,
conditional_to_hour=False, **scengen_kwgs):
super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio,
conditional_to_hour=conditional_to_hour, **scengen_kwgs)
self.seasonality = seasonality
self.n_sa = n_sa
self.target_col = target_col
self.state = None

def fit(self, x: pd.DataFrame, y: pd.DataFrame = None):
x, y, x_val, y_val = self.train_val_split(x, x[[self.target_col]])
hw_target = hankel(y_val.iloc[1:].values, self.n_sa)
self.state = x[[self.target_col]].iloc[-self.seasonality:].values
super().fit(x_val.iloc[:-self.n_sa, :], pd.DataFrame(hw_target, index=x_val.index[:hw_target.shape[0]]))
return self

def predict(self, x: pd.DataFrame, **kwargs):
values = np.hstack([self.state.ravel(), x[self.target_col].values])
y_hat = hankel(values, self.n_sa)
return pd.DataFrame(y_hat[:len(x), :], index=x.index)

def predict_quantiles(self, x: pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
if self.conditional_to_hour:
for h in np.unique(x.index.hour):
preds[x.index.hour == h, :, :] += np.expand_dims(self.err_distr[h], 0)
else:
preds += np.expand_dims(self.err_distr, 0)

return preds


class DiscreteDistr(ScenarioGenerator):
def __init__(self, period='1d', n_sa=1, q_vect=None, val_ratio=None, nodes_at_step=None,
conditional_to_hour=False, **scengen_kwgs):
super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio,
conditional_to_hour=conditional_to_hour, **scengen_kwgs)
self.n_sa = n_sa
self.period = period

def fit(self, x:pd.DataFrame, y:pd.DataFrame):
# infer sampling time
sampling_time = pd.infer_freq(x.index)
# aggregate by
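
A minimal usage sketch of the persistence benchmarks above, assuming an hourly DataFrame df with a target column named 'all' (mirroring test_persistence in tests/test_models.py):

import numpy as np
from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent

# df is an assumed hourly DataFrame containing the target column 'all'
pers = Persistent(target_col='all', n_sa=24, q_vect=np.arange(11)/10, val_ratio=0.8).fit(df.iloc[:2000])
y_hat = pers.predict(df.iloc[2000:])            # repeats the last observed value over the 24 steps ahead
q_hat = pers.predict_quantiles(df.iloc[2000:])  # point forecasts plus validation-error quantiles

seas = SeasonalPersistent(target_col='all', seasonality=24, n_sa=24, q_vect=np.arange(11)/10,
                          val_ratio=0.8).fit(df.iloc[:2000])
y_hat_seas = seas.predict(df.iloc[2000:])       # uses the values observed one season (24 steps) earlier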
71 changes: 71 additions & 0 deletions pyforecaster/forecasting_models/statsmodels_wrapper.py
@@ -0,0 +1,71 @@
import numpy as np
import pandas as pd
from pyforecaster.forecaster import ScenarioGenerator
from statsmodels.tsa.statespace import exponential_smoothing
from tqdm import tqdm
from pyforecaster.forecasting_models.holtwinters import hankel

def get_forecasts(model, y_te, x_te, n_step_ahead=24):
y_hat = np.zeros((len(y_te), n_step_ahead))
for i in tqdm(range(len(y_te))):
y_hat[i,:] = model.forecast(n_step_ahead)
exog = x_te.iloc[[i]].values if x_te is not None else None
        try:
            # append() updates the fitted model with the last observation, keeping the estimated parameters
            model = model.append(y_te.iloc[[i]], exog=exog)
        except Exception:
            print('Error in updating the model')
            # fall back to apply(), which re-applies the fitted parameters to the new observation
            model = model.apply(y_te.iloc[[i]], exog=exog)
return pd.DataFrame(y_hat, index=y_te.index)
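
A hedged example of how this rolling loop might be called; `res` and `y_te` are assumed names for an already fitted statsmodels results object and an hourly test series:

# res: fitted statsmodels results object, y_te: test series (assumed names)
y_hat = get_forecasts(res, y_te, x_te=None, n_step_ahead=24)  # one 24-step forecast per test timestamp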


class StatsModelsWrapper(ScenarioGenerator):
def __init__(self, model_class, model_pars, target_name, q_vect, nodes_at_step=None, val_ratio=0.8, n_sa=1,
**scengen_kwgs):
self.model_class = model_class
self.model_pars = model_pars
self.model = None
self.target_name = target_name
self.n_sa = n_sa
super().__init__(q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio, **scengen_kwgs)

def fit(self, x_pd:pd.DataFrame, y:pd.DataFrame=None):
y_present = x_pd[self.target_name].values

# exclude the last n_sa, we need them to create the target
preds = self.predict(x_pd)[:-self.n_sa]

# hankelize the target
hw_target = hankel(y_present[1:], self.n_sa)
resid = hw_target - preds
self.err_distr = np.quantile(resid, self.q_vect, axis=0).T


def predict(self, x_pd:pd.DataFrame, **kwargs):
pass

def predict_quantiles(self, x:pd.DataFrame, **kwargs):
preds = np.expand_dims(self.predict(x), -1) * np.ones((1, 1, len(self.q_vect)))
        # fit() stores err_distr as a single (n_sa, n_quantiles) array, so the residual quantiles
        # are added unconditionally rather than per hour
        preds += np.expand_dims(self.err_distr, 0)
return preds


class ExponentialSmoothing(StatsModelsWrapper):
def __init__(self, target_name, q_vect, nodes_at_step=None, val_ratio=0.8, n_sa=1, trend=None, seasonal=None, **scengen_kwgs):
model_class = exponential_smoothing.ExponentialSmoothing
model_pars = {'trend': trend, 'seasonal': seasonal}
super().__init__(model_class, model_pars, target_name, q_vect, nodes_at_step=nodes_at_step, val_ratio=val_ratio, n_sa=n_sa, **scengen_kwgs)

def fit(self, x_pd:pd.DataFrame, y:pd.DataFrame=None):
x_pd = x_pd.asfreq(pd.infer_freq(x_pd.index))
y_pd = x_pd[[self.target_name]]
self.model = self.model_class(**self.model_pars, endog=y_pd).fit()
super().fit(x_pd, y_pd)
return self

def predict(self, x_pd:pd.DataFrame, **kwargs):
x_pd = x_pd.asfreq(pd.infer_freq(x_pd.index))
y = x_pd[self.target_name]
y_hat = get_forecasts(self.model, y, n_step_ahead=self.n_sa, x_te=None)
return y_hat

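
A minimal usage sketch of the wrapper, assuming an hourly DataFrame df with a target column named 'all' (mirroring test_statsmodels_wrappers in tests/test_models.py):

import numpy as np
from pyforecaster.forecasting_models.statsmodels_wrapper import ExponentialSmoothing

# df is an assumed hourly DataFrame containing the target column 'all'
es = ExponentialSmoothing(target_name='all', q_vect=np.arange(31)/30, n_sa=24, seasonal=24).fit(df.iloc[:100])
y_hat = es.predict(df.iloc[100:300])             # 24-step-ahead forecasts, one row per test timestamp
q_hat = es.predict_quantiles(df.iloc[100:300])   # point forecasts plus residual quantiles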
65 changes: 46 additions & 19 deletions tests/test_models.py
@@ -6,14 +6,17 @@
import logging
from pyforecaster.forecasting_models.holtwinters import HoltWinters, HoltWintersMulti
from pyforecaster.forecasting_models.fast_adaptive_models import Fourier_es, FK, FK_multi
-from pyforecaster.forecasting_models.random_fourier_features import RFFRegression, AdditiveRFFRegression
from pyforecaster.forecasting_models.random_fourier_features import RFFRegression, AdditiveRFFRegression, BrutalRegressor
from pyforecaster.forecasting_models.randomforests import QRF
from pyforecaster.forecasting_models.gradientboosters import LGBMHybrid
from pyforecaster.forecasting_models.statsmodels_wrapper import ExponentialSmoothing
from pyforecaster.forecaster import LinearForecaster, LGBForecaster
from pyforecaster.plot_utils import plot_quantiles
from pyforecaster.formatter import Formatter
from pyforecaster.forecasting_models.neural_models.base_nn import FFNN
from pyforecaster.plot_utils import ts_animation
from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent

class TestFormatDataset(unittest.TestCase):
def setUp(self) -> None:
self.t = 10000
@@ -185,23 +188,6 @@ def test_qrf(self):
y_hat = qrf.predict(x_te.iloc[[0], :])
q = qrf.predict(x_te.iloc[[0], :])

-    def test_rffr(self):
-        formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(144),
-                                                                 relative_lags=True)
-        formatter.add_target_transform(['all'], lags=-np.arange(144))
-
-        x, y = formatter.transform(self.data.iloc[:10000])
-        x.columns = x.columns.astype(str)
-        y.columns = y.columns.astype(str)
-        n_tr = int(len(x) * 0.7)
-        x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
-                                  y.iloc[n_tr:].copy()]
-        m = RFFRegression(std_kernel=0.001, dim_kernel=30).fit(x_tr, y_tr)
-        y_hat = m.predict(x_te)
-        q = m.predict_quantiles(x_te)
-
-        m = LinearForecaster(val_ratio=0.2).fit(x_tr, y_tr)
-        y_hat_lin = m.predict(x_te)


def test_antinormalize(self):
@@ -242,7 +228,48 @@ def test_antinormalize(self):
mae = lambda x, y: np.abs(x-y).mean().mean()
print('MAE lin:', mae(y_te, y_hat))


def test_statsmodels_wrappers(self):
self.data = self.data.resample('1h').mean()
data_tr = self.data.iloc[:100]
data_te = self.data.iloc[100:300]
m = ExponentialSmoothing(target_name='all', q_vect=np.arange(31)/30, nodes_at_step=None, val_ratio=0.8, n_sa=24,
seasonal=24).fit(data_tr)
y_hat = m.predict(data_te)
q_hat = m.predict_quantiles(data_te)
y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1)
#plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300)

discr_prob = m.predict_pmf(data_te, discrete_prob_space=np.linspace(500, 1200, 10))
plt.matshow(discr_prob[2].T )
import seaborn as sb

i = 1
plt.figure(figsize=(10, 6))
x_bins = np.arange(24)
y_bins = np.linspace(500, 1200, 10)
extent = [x_bins.min(), x_bins.max(), y_bins.min(), y_bins.max()]
plt.imshow(discr_prob[i].T, aspect='auto', extent=extent, origin='lower', cmap='plasma')

# Overlay the time series on top
plt.plot(x_bins, y_plot.values[i, :], color='black', linewidth=2, label="Time Series")
plt.plot(x_bins, q_hat[i, :, :], color='black', linewidth=2, label="Time Series")

def test_persistence(self):
self.data = self.data.resample('1h').mean()
data_tr = self.data.iloc[:2100]
data_te = self.data.iloc[2100:2400]
m = Persistent(target_col='all', n_sa=24, q_vect=np.arange(11)/10, val_ratio=0.8, conditional_to_hour=False).fit(data_tr)
y_hat = m.predict(data_te)
q_hat = m.predict_quantiles(data_te)
y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1)
plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300)

m = SeasonalPersistent(target_col='all', seasonality=24, n_sa=24, q_vect=np.arange(11) / 10, val_ratio=0.8,
conditional_to_hour=True).fit(data_tr)
y_hat = m.predict(data_te)
q_hat = m.predict_quantiles(data_te)
y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1)
plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300)


if __name__ == '__main__':
