Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
vascomedici committed Aug 7, 2024
2 parents 26827cf + 42ac83f commit c2a47af
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 44 deletions.
68 changes: 41 additions & 27 deletions pyforecaster/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,12 +216,19 @@ def _transform(self, x, time_features=True, holidays=False, return_target=True,
for tr in self.transformers:
x = tr.transform(x)
transformed_columns = [c for c in x.columns if c not in original_columns]

if return_target:
for tr in self.target_transformers:
target = pd.concat([target, tr.transform(x, augment=False)], axis=1)
# apply normalization if any
if len(self.target_normalizers)>0:
target, x = self.normalize(x, target)

# apply normalization to target if any and if return_target is True
if len(self.target_normalizers)>0:
normalizing_columns = [nr.name for nr in self.target_normalizers]
x = self.add_normalizing_columns(x)

# this is needed even if target is not returned, to normalize features correlated to the target
target, x = self.normalize(x, target, return_target=return_target)
transformed_columns = transformed_columns + normalizing_columns

if return_target:
# remove rows with NaNs to reconcile impossible dataset entries introduced by shifting the data around
Expand All @@ -239,7 +246,19 @@ def _transform(self, x, time_features=True, holidays=False, return_target=True,
x = self.add_holidays(x, **holidays_kwargs)
return x, target

def normalize(self, x, y, normalizing_fun=None, antitransform=False):
def add_normalizing_columns(self, x):
    """
    Compute the target-normalizer columns and append them to x.

    Every transformer in self.target_normalizers is applied to x with augment=False; the resulting columns are
    renamed after the normalizers' ``name`` attribute and concatenated column-wise to x.

    :param x: pd.DataFrame from which the normalizers are computed
    :return: a new pd.DataFrame holding the columns of x followed by one column per normalizer. When no target
             normalizer is registered, x is returned as is.
    """
    # pd.concat raises a ValueError on an empty list: without normalizers there is nothing to add
    if not self.target_normalizers:
        return x

    # compute the normalizers for the direct transform (normalization)
    normalizers = pd.concat([nr.transform(x, augment=False) for nr in self.target_normalizers], axis=1)

    # rename normalizers with tag names
    # NOTE(review): this assumes each normalizer returns exactly one column -- confirm
    normalizers.columns = [nr.name for nr in self.target_normalizers]

    return pd.concat([x, normalizers], axis=1)

def normalize(self, x, y=None, normalizing_fun=None, antitransform=False, return_target=True):
"""
Columns needed to compute the normalization factors are computed by the target transformers and returned in
the original x dataframe. The normalizing_fun is a string expression that must be evaluated to normalize the
Expand All @@ -261,47 +280,42 @@ def normalize(self, x, y, normalizing_fun=None, antitransform=False):
'\bor by passing the noralizing_expr argument to this function')
return y, x

if not antitransform:
# if we're doing the direct transform (normalization) we compute the normalizers and add them to the x df
# compute normalizers if any
normalizers = pd.concat([nr.transform(x, augment=False) for nr in self.target_normalizers], axis = 1)

# rename normalizers with tag names
normalizers.columns = [nr.name for nr in self.target_normalizers]
x = pd.concat([x, normalizers], axis=1)
else:
# if we're antitransform, we retrieve the normalizers from the x df
normalizers = x[[nr.name for nr in self.target_normalizers]]
normalizers = x[[nr.name for nr in self.target_normalizers]]

# get normalizers names
target_to_norm_names = [nr.names for nr in self.target_normalizers]
target_to_norm_names = [item for sublist in target_to_norm_names for item in sublist]

# join target and normalizers in a single df
df_n = pd.concat([y, normalizers], axis=1)
# normalize the target if any
if return_target:
# join target and normalizers in a single df
df_n = pd.concat([y, normalizers], axis=1)

for target_to_norm in np.unique(target_to_norm_names):
for tr in self.target_transformers:
nr_columns = (tr.metadata['name'].isin([target_to_norm])).index
for c in nr_columns:
df_n.loc[:, c] = self.normalizing_wrapper(normalizing_fun, df_n, c)
y = df_n[[c for c in y.columns]]

# normalize the features related to the target
for target_to_norm in np.unique(target_to_norm_names):
for tr in self.target_transformers:
for tr in self.transformers:
# find df_n columns to normalize
nr_columns = (tr.metadata['name'].isin([target_to_norm])).index
for c in nr_columns:
df_n.loc[:, c] = self.normalizing_wrapper(normalizing_fun, df_n, c)
if not antitransform:
for tr in self.transformers:
# find df_n columns to normalize
nr_columns = (tr.metadata['name'].isin([target_to_norm])).index
for c in nr_columns:
x.loc[:, c] = self.normalizing_wrapper(normalizing_fun, x, c)
x.loc[:, c] = self.normalizing_wrapper(normalizing_fun, x, c)


df_n = df_n[[c for c in y.columns]]
return df_n, x
return y, x

def denormalize(self, x, y):
    """
    Map a normalized target back to its original scale.

    The inverse expression stored in self.denormalizing_fun is applied exactly once, through self.normalize.

    :param x: pd.DataFrame of features, passed to self.normalize together with y
    :param y: pd.DataFrame with the normalized target
    :return: the denormalized target, or y unchanged (after logging a warning) when no denormalizing
             expression has been set
    """
    if self.denormalizing_fun is None:
        self.logger.warning('You did not pass any denormalization expression, ** no denormalization will be applied **. '
                            '\bYou can set a denormalization expression by calling Formatter.add_normalizing_fun ')
        return y

    # normalize() is called a single time: calling it twice would apply the inverse expression to an already
    # denormalized target. It is handed a copy of x because it also rewrites feature columns of the frame it
    # receives, and the caller's x must be left untouched.
    y, _ = self.normalize(x.copy(), y, normalizing_fun=self.denormalizing_fun)
    return y

def normalizing_wrapper(self, normalizing_fun, df, t):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def test_normalizers(self):

def test_normalizers_complex(self):
df = pd.DataFrame(np.random.randn(100, 5), index=pd.date_range('01-01-2020', freq='20min', periods=100, tz='Europe/Zurich'), columns=['a', 'b', 'c', 'd', 'e'])
formatter = pyf.Formatter().add_transform(['a', 'b'], lags=np.arange(1, 5), agg_freq='20min')
formatter = pyf.Formatter(augment=False).add_transform(['a', 'b'], lags=np.arange(1, 5), agg_freq='20min')
formatter.add_target_transform(['a'], lags=-np.arange(1, 5), agg_freq='20min')
formatter.add_target_normalizer(['a'], 'mean', agg_freq='10H', name='a_n')
formatter.add_target_normalizer(['a'], 'std', agg_freq='5H', name='b_n')
Expand All @@ -285,7 +285,7 @@ def test_normalizers_complex(self):
y_unnorm = formatter.denormalize(x, y_norm)

# check if back-transform works
assert (y_unnorm-y).sum().sum() < 1e-6
assert (y_unnorm-y).abs().sum().sum() < 1e-6


def test_normalizers_impossible(self):
Expand Down
26 changes: 11 additions & 15 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pyforecaster.forecasting_models.fast_adaptive_models import Fourier_es, FK, FK_multi
from pyforecaster.forecasting_models.random_fourier_features import RFFRegression, AdditiveRFFRegression
from pyforecaster.forecasting_models.randomforests import QRF
from pyforecaster.forecasting_models.gradientboosters import LGBMHybrid
from pyforecaster.forecaster import LinearForecaster, LGBForecaster
from pyforecaster.plot_utils import plot_quantiles
from pyforecaster.formatter import Formatter
Expand Down Expand Up @@ -204,41 +205,36 @@ def test_rffr(self):


def test_antinormalize(self):
formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(144),
formatter = Formatter(logger=self.logger, augment=False).add_transform(['all'], lags=np.arange(144),
relative_lags=True)
formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 20])
formatter.add_target_transform(['all'], lags=-np.arange(144))
formatter.add_target_transform(['all'], lags=-np.arange(1, 145))

formatter.add_target_normalizer(['all'], 'mean', agg_freq='7d', name='a_movingavg')
formatter.add_target_normalizer(['all'], 'std', agg_freq='7d', name='a_movingstd')
formatter.add_target_normalizer(['all'], 'mean', agg_freq='3d', name='a_movingavg')
formatter.add_target_normalizer(['all'], 'std', agg_freq='3d', name='a_movingstd')

x, y = formatter.transform(self.data.iloc[:10000])

n_tr = int(len(x) * 0.9)
x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
y.iloc[n_tr:].copy()]

m_lin = LinearForecaster(val_ratio=0.2, formatter=formatter).fit(x_tr, y_tr)
y_hat = m_lin.predict(x_te)
q = m_lin.predict_quantiles(x_te)
#m_lin = LinearForecaster(val_ratio=0.2, formatter=formatter).fit(x_tr, y_tr)
#y_hat_nonorm = m_lin.predict(x_te)
#q_nonorm = m_lin.predict_quantiles(x_te)

#m_lgb = LGBForecaster(val_ratio=0.5, lgb_pars={'num_leaves':20}, formatter=formatter).fit(x_tr, y_tr)
#y_hat_lgb = m_lgb.predict(x_te)
mae = lambda x, y: np.abs(x-y).mean().mean()
print('MAE lin:', mae(y_te, y_hat))
#mae = lambda x, y: np.abs(x-y).mean().mean()
#print('MAE lin:', mae(y_te, y_hat_nonorm))


plt.close('all')
plot_quantiles([y_te, y_hat], q, ['y_te', 'y_hat_lin'], n_rows=600)
plt.close('all')

formatter.add_normalizing_fun(expr="(df[t] - df['a_movingavg']) / (df['a_movingstd'] + 1)",
inv_expr="df[t]*(df['a_movingstd']+1) + df['a_movingavg']")
x, y_norm = formatter.transform(self.data.iloc[:10000])
y = formatter.denormalize(x, y_norm)

x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy(),
y.iloc[n_tr:].copy()]
x_tr, x_te, y_tr = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y_norm.iloc[:n_tr].copy()]
m_lin = LinearForecaster(val_ratio=0.2, formatter=formatter).fit(x_tr, y_tr)
y_hat = m_lin.predict(x_te)
q = m_lin.predict_quantiles(x_te)
Expand Down

0 comments on commit c2a47af

Please sign in to comment.