From b3641e19427d0afc790bba9d98ccced97fff5395 Mon Sep 17 00:00:00 2001 From: nepslor Date: Tue, 10 Sep 2024 12:38:26 +0200 Subject: [PATCH] added DiscreteDistr model, fixed quantile bugs --- pyforecaster/forecaster.py | 4 +- pyforecaster/forecasting_models/benchmarks.py | 50 ++++++++++++++++++- .../forecasting_models/randomforests.py | 1 + pyforecaster/formatter.py | 6 ++- requirements.txt | 2 +- tests/test_models.py | 32 +++++++++++- 6 files changed, 88 insertions(+), 7 deletions(-) diff --git a/pyforecaster/forecaster.py b/pyforecaster/forecaster.py index 1b144b9..0e44d81 100644 --- a/pyforecaster/forecaster.py +++ b/pyforecaster/forecaster.py @@ -107,7 +107,6 @@ def _predict_quantiles(self, x, **kwargs): def quantiles_to_df(self, q_hat:np.ndarray, index, q_vect=None): level_0_labels = self.target_cols level_1_labels = self.q_vect if q_vect is None else q_vect - q_hat = np.swapaxes(q_hat, 1, 2) q_hat = np.reshape(q_hat, (q_hat.shape[0], q_hat.shape[1] * q_hat.shape[2])) q_hat = pd.DataFrame(q_hat, index=index, columns=pd.MultiIndex.from_product([level_0_labels, level_1_labels])) q_hat.columns.names = ['target', 'quantile'] @@ -117,8 +116,7 @@ def quantiles_to_df(self, q_hat:np.ndarray, index, q_vect=None): def quantiles_to_numpy(q_hat:pd.DataFrame): n_taus = len(q_hat.columns.get_level_values(1).unique()) q_hat = q_hat.values - q_hat = np.reshape(q_hat, (q_hat.shape[0], n_taus, -1)) - q_hat = np.swapaxes(q_hat, 1, 2) + q_hat = np.reshape(q_hat, (q_hat.shape[0], -1, n_taus)) return q_hat def predict_quantiles(self, x, dataframe=True, **kwargs): diff --git a/pyforecaster/forecasting_models/benchmarks.py b/pyforecaster/forecasting_models/benchmarks.py index 01b893c..706ede9 100644 --- a/pyforecaster/forecasting_models/benchmarks.py +++ b/pyforecaster/forecasting_models/benchmarks.py @@ -72,8 +72,56 @@ def __init__(self, period='1d', n_sa=1, q_vect=None, val_ratio=None, nodes_at_st conditional_to_hour=conditional_to_hour, **scengen_kwgs) self.n_sa = n_sa self.period = period + self.target_names = None + self.y_distributions = None + self.support = None def fit(self, x:pd.DataFrame, y:pd.DataFrame): # infer sampling time sampling_time = pd.infer_freq(x.index) - # aggregate by \ No newline at end of file + + # retrieve support of target distribution + support = np.unique(y.values) + self.support = support + print('support of target distribution:', support) + + # Create a new column for the time within a day (hours and minutes) + time_of_day = y.index.floor(sampling_time).time + + # Group by this time of day and calculate the mean across all days + mean_by_time_of_day = {} + for v in support: + mean_by_time_of_day[v] = y.groupby(time_of_day).agg(lambda x: np.sum(x==v)) + + y_distrib = pd.concat(mean_by_time_of_day, axis=1) + # swap levels of the multiindex + y_distrib = y_distrib.swaplevel(0, 1, axis=1).sort_index(axis=1) + + # normalize the distribution for each variable in level 0 + for t in y_distrib.columns.levels[0]: + y_distrib[t] = y_distrib[t] / (y_distrib[t].sum(axis=1).values.reshape(-1, 1)+1e-12) + + # store the distribution + self.y_distributions = y_distrib + self.target_names = list(y_distrib.columns.levels[0]) + self.target_cols = [f'{t}_t+{i}' for t in self.target_names for i in range(1, self.n_sa+1)] + return self + + def predict(self, x, **kwargs): + return np.mean(self.predict_probabilities(x) * self.support.reshape(1, -1), axis=1) + + def predict_probabilities(self, x, **kwargs): + # infer sampling time + sampling_time = pd.infer_freq(x.index) + # Create a new column for the time within a day (hours and minutes) + time_of_day = pd.Series(x.index.floor(sampling_time).time) + + # retrieve the distribution for the time of day + y_hat = {} + for t in self.target_names: + pres_t = [time_of_day.map(self.y_distributions[t].loc[:, i]).rename(f'{i}') for i in + self.y_distributions[t].columns] + y_hat[t] = pd.concat(pres_t, axis=1) + + return pd.concat(y_hat, axis=1) + diff --git a/pyforecaster/forecasting_models/randomforests.py b/pyforecaster/forecasting_models/randomforests.py index d678f5f..656f203 100644 --- a/pyforecaster/forecasting_models/randomforests.py +++ b/pyforecaster/forecasting_models/randomforests.py @@ -140,6 +140,7 @@ def predict(self, x, **kwargs): else: if len(preds.shape) == 2: preds = np.expand_dims(preds, 0) + preds = np.swapaxes(preds, 1, 2) preds = self.quantiles_to_df(preds, index=x.index, q_vect=kwargs['quantiles']) else: preds = pd.DataFrame(np.atleast_2d(np.squeeze(preds)), index=x.index, columns=self.target_cols) diff --git a/pyforecaster/formatter.py b/pyforecaster/formatter.py index 435d76f..ca1763a 100644 --- a/pyforecaster/formatter.py +++ b/pyforecaster/formatter.py @@ -47,7 +47,11 @@ def add_time_features(self, x): col_names = ['hour', 'dayofweek', 'minuteofday'] self.timezone = False else: - utc_offset = x.apply(lambda x: x.name.utcoffset().total_seconds()/3600, axis=1) + if x.shape[1] == 0: + utc_offset = pd.DataFrame(index=x.index, columns=[0]).apply(lambda x: x.name.utcoffset(). + total_seconds() / 3600, axis=1) + else: + utc_offset = x.apply(lambda x: x.name.utcoffset().total_seconds()/3600, axis=1) time_features = [x.index.hour, x.index.dayofweek, x.index.hour * 60 + x.index.minute, utc_offset] col_names = ['hour', 'dayofweek', 'minuteofday', 'utc_offset'] self.timezone = True diff --git a/requirements.txt b/requirements.txt index 91ba385..bc34bda 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ holidays>=0.16 python-long-weekends>=0.1.1 jax>=0.4.1 jaxlib>=0.4.1 -quantile-forest>=1.2.0 +quantile-forest>=1.3.10 optax>=0.1.7 flax>=0.7.4 statsmodels>=0.14.2 \ No newline at end of file diff --git a/tests/test_models.py b/tests/test_models.py index fcd5d7c..d51824b 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -16,7 +16,7 @@ from pyforecaster.formatter import Formatter from pyforecaster.forecasting_models.neural_models.base_nn import FFNN from pyforecaster.plot_utils import ts_animation -from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent +from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent, DiscreteDistr class TestFormatDataset(unittest.TestCase): def setUp(self) -> None: @@ -294,6 +294,36 @@ def test_persistence(self): y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1) plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300) + """ + def test_discrete_distr(self): + self.data = self.data.resample('1h').mean() + self.data = self.data.round(-1) + formatter = Formatter(logger=self.logger, augment=False) + formatter.add_target_transform(['all'], lags=-np.arange(1, 24)) + x, y = formatter.transform(self.data) + n_tr = 6000 + n_te = 2000 + x_tr, y_tr, x_te, y_te = x.iloc[:n_tr], y.iloc[:n_tr], x.iloc[n_tr:n_tr+n_te], y.iloc[n_tr:n_tr+n_te] + + m = DiscreteDistr(target_name='all', q_vect=np.arange(31)/30, nodes_at_step=None, val_ratio=0.8, n_sa=24, + period='1d').fit(x_tr, y_tr) + y_hat = m.predict(x_te) + y_hat = m.predict_probabilities(x_te) + from pyforecaster.forecaster import ScenarioGenerator + + plt.matshow(ScenarioGenerator().quantiles_to_numpy(y_hat)[0].T) + + n_taus = len(y_hat.columns.get_level_values(1).unique()) + q_hat = y_hat.values + q_hat = np.reshape(q_hat, (q_hat.shape[0], -1, n_taus)) + plt.matshow(q_hat[0].T) + + + n_taus = len(y_hat.columns.get_level_values(1).unique()) + q_hat = y_hat.values + q_hat = np.reshape(q_hat, (q_hat.shape[0], n_taus, -1)) + q_hat = np.swapaxes(q_hat, 1, 2) + """ if __name__ == '__main__': unittest.main()