Skip to content

Commit

Permalink
added DiscreteDistr model, fixed quantile bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
nepslor committed Sep 10, 2024
1 parent 1561e26 commit b3641e1
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 7 deletions.
4 changes: 1 addition & 3 deletions pyforecaster/forecaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ def _predict_quantiles(self, x, **kwargs):
def quantiles_to_df(self, q_hat:np.ndarray, index, q_vect=None):
level_0_labels = self.target_cols
level_1_labels = self.q_vect if q_vect is None else q_vect
q_hat = np.swapaxes(q_hat, 1, 2)
q_hat = np.reshape(q_hat, (q_hat.shape[0], q_hat.shape[1] * q_hat.shape[2]))
q_hat = pd.DataFrame(q_hat, index=index, columns=pd.MultiIndex.from_product([level_0_labels, level_1_labels]))
q_hat.columns.names = ['target', 'quantile']
Expand All @@ -117,8 +116,7 @@ def quantiles_to_df(self, q_hat:np.ndarray, index, q_vect=None):
def quantiles_to_numpy(q_hat:pd.DataFrame):
n_taus = len(q_hat.columns.get_level_values(1).unique())
q_hat = q_hat.values
q_hat = np.reshape(q_hat, (q_hat.shape[0], n_taus, -1))
q_hat = np.swapaxes(q_hat, 1, 2)
q_hat = np.reshape(q_hat, (q_hat.shape[0], -1, n_taus))
return q_hat

def predict_quantiles(self, x, dataframe=True, **kwargs):
Expand Down
50 changes: 49 additions & 1 deletion pyforecaster/forecasting_models/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,56 @@ def __init__(self, period='1d', n_sa=1, q_vect=None, val_ratio=None, nodes_at_st
conditional_to_hour=conditional_to_hour, **scengen_kwgs)
self.n_sa = n_sa
self.period = period
self.target_names = None
self.y_distributions = None
self.support = None

def fit(self, x:pd.DataFrame, y:pd.DataFrame):
# infer sampling time
sampling_time = pd.infer_freq(x.index)
# aggregate by

# retrieve support of target distribution
support = np.unique(y.values)
self.support = support
print('support of target distribution:', support)

# Create a new column for the time within a day (hours and minutes)
time_of_day = y.index.floor(sampling_time).time

# Group by this time of day and calculate the mean across all days
mean_by_time_of_day = {}
for v in support:
mean_by_time_of_day[v] = y.groupby(time_of_day).agg(lambda x: np.sum(x==v))

y_distrib = pd.concat(mean_by_time_of_day, axis=1)
# swap levels of the multiindex
y_distrib = y_distrib.swaplevel(0, 1, axis=1).sort_index(axis=1)

# normalize the distribution for each variable in level 0
for t in y_distrib.columns.levels[0]:
y_distrib[t] = y_distrib[t] / (y_distrib[t].sum(axis=1).values.reshape(-1, 1)+1e-12)

# store the distribution
self.y_distributions = y_distrib
self.target_names = list(y_distrib.columns.levels[0])
self.target_cols = [f'{t}_t+{i}' for t in self.target_names for i in range(1, self.n_sa+1)]
return self

def predict(self, x, **kwargs):
return np.mean(self.predict_probabilities(x) * self.support.reshape(1, -1), axis=1)

def predict_probabilities(self, x, **kwargs):
# infer sampling time
sampling_time = pd.infer_freq(x.index)
# Create a new column for the time within a day (hours and minutes)
time_of_day = pd.Series(x.index.floor(sampling_time).time)

# retrieve the distribution for the time of day
y_hat = {}
for t in self.target_names:
pres_t = [time_of_day.map(self.y_distributions[t].loc[:, i]).rename(f'{i}') for i in
self.y_distributions[t].columns]
y_hat[t] = pd.concat(pres_t, axis=1)

return pd.concat(y_hat, axis=1)

1 change: 1 addition & 0 deletions pyforecaster/forecasting_models/randomforests.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def predict(self, x, **kwargs):
else:
if len(preds.shape) == 2:
preds = np.expand_dims(preds, 0)
preds = np.swapaxes(preds, 1, 2)
preds = self.quantiles_to_df(preds, index=x.index, q_vect=kwargs['quantiles'])
else:
preds = pd.DataFrame(np.atleast_2d(np.squeeze(preds)), index=x.index, columns=self.target_cols)
Expand Down
6 changes: 5 additions & 1 deletion pyforecaster/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ def add_time_features(self, x):
col_names = ['hour', 'dayofweek', 'minuteofday']
self.timezone = False
else:
utc_offset = x.apply(lambda x: x.name.utcoffset().total_seconds()/3600, axis=1)
if x.shape[1] == 0:
utc_offset = pd.DataFrame(index=x.index, columns=[0]).apply(lambda x: x.name.utcoffset().
total_seconds() / 3600, axis=1)
else:
utc_offset = x.apply(lambda x: x.name.utcoffset().total_seconds()/3600, axis=1)
time_features = [x.index.hour, x.index.dayofweek, x.index.hour * 60 + x.index.minute, utc_offset]
col_names = ['hour', 'dayofweek', 'minuteofday', 'utc_offset']
self.timezone = True
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ holidays>=0.16
python-long-weekends>=0.1.1
jax>=0.4.1
jaxlib>=0.4.1
quantile-forest>=1.2.0
quantile-forest>=1.3.10
optax>=0.1.7
flax>=0.7.4
statsmodels>=0.14.2
32 changes: 31 additions & 1 deletion tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pyforecaster.formatter import Formatter
from pyforecaster.forecasting_models.neural_models.base_nn import FFNN
from pyforecaster.plot_utils import ts_animation
from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent
from pyforecaster.forecasting_models.benchmarks import Persistent, SeasonalPersistent, DiscreteDistr

class TestFormatDataset(unittest.TestCase):
def setUp(self) -> None:
Expand Down Expand Up @@ -294,6 +294,36 @@ def test_persistence(self):
y_plot = pd.concat({'y_{:02d}'.format(i): data_te['all'].shift(-i) for i in range(24)}, axis=1)
plot_quantiles([y_plot, y_hat], q_hat, ['y_te', 'y_hat'], n_rows=300)

"""
def test_discrete_distr(self):
self.data = self.data.resample('1h').mean()
self.data = self.data.round(-1)
formatter = Formatter(logger=self.logger, augment=False)
formatter.add_target_transform(['all'], lags=-np.arange(1, 24))
x, y = formatter.transform(self.data)
n_tr = 6000
n_te = 2000
x_tr, y_tr, x_te, y_te = x.iloc[:n_tr], y.iloc[:n_tr], x.iloc[n_tr:n_tr+n_te], y.iloc[n_tr:n_tr+n_te]
m = DiscreteDistr(target_name='all', q_vect=np.arange(31)/30, nodes_at_step=None, val_ratio=0.8, n_sa=24,
period='1d').fit(x_tr, y_tr)
y_hat = m.predict(x_te)
y_hat = m.predict_probabilities(x_te)
from pyforecaster.forecaster import ScenarioGenerator
plt.matshow(ScenarioGenerator().quantiles_to_numpy(y_hat)[0].T)
n_taus = len(y_hat.columns.get_level_values(1).unique())
q_hat = y_hat.values
q_hat = np.reshape(q_hat, (q_hat.shape[0], -1, n_taus))
plt.matshow(q_hat[0].T)
n_taus = len(y_hat.columns.get_level_values(1).unique())
q_hat = y_hat.values
q_hat = np.reshape(q_hat, (q_hat.shape[0], n_taus, -1))
q_hat = np.swapaxes(q_hat, 1, 2)
"""

if __name__ == '__main__':
unittest.main()

0 comments on commit b3641e1

Please sign in to comment.