Skip to content

Commit

Permalink
corrected LgbHybrid and prune dataset at stepahead
Browse files Browse the repository at this point in the history
  • Loading branch information
nepslor committed Jul 26, 2024
1 parent a267ae7 commit d7f5d15
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 35 deletions.
12 changes: 6 additions & 6 deletions pyforecaster/forecasting_models/gradientboosters.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,15 @@ def fit(self, x, y):
n_batch = int(len(x_pd)*red_frac)
n_long = n_batch*self.n_multistep
rand_idx = np.random.choice(x_pd.index, n_long).reshape(self.n_multistep, -1)

x_long = []
for sa in range(self.n_multistep):
x_i = pd.concat([x_pd.loc[rand_idx[sa], :].reset_index(drop=True), pd.Series(np.ones(n_batch) * sa)], axis=1)
y_long = []
for i, sa in enumerate(np.arange(self.n_single, self.n_single + self.n_multistep)):
x_i = pd.concat([x_pd.loc[rand_idx[i], :].reset_index(drop=True), pd.Series(np.ones(n_batch) * i)], axis=1)
y_long.append(y.iloc[rand_idx[i], sa])
x_long.append(x_i)

x_long = pd.concat(x_long, axis=0)
y = y
y_long = []
for i in range(self.n_multistep):
y_long.append(y.iloc[rand_idx[i], i])
y_long = pd.concat(y_long)

t_0 = time()
Expand Down
70 changes: 46 additions & 24 deletions pyforecaster/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,41 +399,63 @@ def print_fold(tr_idxs, te_idxs):

def prune_dataset_at_stepahead(self, df, target_col_num, metadata_features, method='periodic', period='24h', tol_period='1h', keep_last_n_lags=0, keep_last_seconds=0):

features = []
# retrieve referring_time of the given sa for the target from target_transformers
target_times = []
target_start_times = []
target_end_times = []
for tt in self.target_transformers:
target_times.append(tt.metadata.iloc[target_col_num, :]['end_time'])
#target_times.append(tt.metadata.loc[tt.metadata['lag']==-sa, 'end_time'])
target_start_times.append(tt.metadata.iloc[target_col_num, :]['start_time'])
target_end_times.append(tt.metadata.iloc[target_col_num, :]['end_time'])

target_start_time = pd.to_timedelta(target_start_times[0])
target_end_time = pd.to_timedelta(target_end_times[0])

target_time = pd.to_timedelta(target_times[0])

metadata = pd.concat([t.metadata for t in self.transformers])
if method == 'periodic':
# find signals with (target_time - referring_time) multiple of period
for t in self.transformers:
features += list(t.metadata.index[((target_time-t.metadata['end_time']).abs() % pd.Timedelta(period)
<= pd.Timedelta(tol_period)) & (t.metadata['end_time']<=target_time)])
c1 = (target_start_time - metadata['start_time']) % pd.Timedelta(period) <= pd.Timedelta(tol_period)
c2 = (target_end_time - metadata['end_time']) % pd.Timedelta(period) <= pd.Timedelta(tol_period)
causality = metadata['end_time'] <= target_start_time
metadata['keep'] = (c1 | c2) & causality
elif method == 'up_to':
for t in self.transformers:
features += list(t.metadata.index[t.metadata['end_time'] <= target_time])
metadata['keep'] = metadata['end_time'] <= target_start_time
else:
metadata['keep'] = True

if keep_last_n_lags > 0:
last_lag_features = list(np.hstack([t.metadata.index[t.metadata['lag'].isin(np.arange(keep_last_n_lags))]
for t in self.transformers]))
features = np.unique(features + last_lag_features)
metadata['keep'] = metadata['keep'] | metadata['lag'].isin(np.arange(keep_last_n_lags))

if keep_last_seconds >0:
closest_features = []
for t in self.transformers:
delta_sec = t.metadata.start_time.apply(lambda x: x.total_seconds())
keep_condition = (delta_sec> -keep_last_seconds) & (delta_sec<=0)
closest_features.append(t.metadata.index[keep_condition])
closest_features = list(np.unique(np.hstack(closest_features)))
features = np.unique(features + closest_features)

features = np.unique(list(features) + metadata_features)
close = (metadata['end_time'] <= pd.Timedelta(keep_last_seconds, unit='s')) | (metadata['end_time'] <= pd.Timedelta(keep_last_seconds, unit='s'))
predecessor = metadata['end_time'] <= target_start_time
metadata['keep'] = metadata['keep'] | (close & predecessor)

features = list(metadata.loc[metadata['keep']].index) + metadata_features
return df[features]

def plot_dataset_at_stepahead(self, df, metadata_features=None, method='periodic', period='24h', tol_period='1h', keep_last_n_lags=0, keep_last_seconds=0):
    """
    Show, one target column after the other, the time span of the features retained by
    prune_dataset_at_stepahead.

    For each target column the current axes are cleared and every retained feature is drawn as a
    horizontal segment from its 'start_time' to its 'end_time' (in seconds), each on its own y level,
    labelled with its 'name' as found in the transformers' metadata.

    :param df: dataset forwarded to prune_dataset_at_stepahead
    :param metadata_features: extra column names that the pruning always keeps; None means no extra
                              columns
    :param method: forwarded unchanged to prune_dataset_at_stepahead
    :param period: forwarded unchanged to prune_dataset_at_stepahead
    :param tol_period: forwarded unchanged to prune_dataset_at_stepahead
    :param keep_last_n_lags: forwarded unchanged to prune_dataset_at_stepahead
    :param keep_last_seconds: forwarded unchanged to prune_dataset_at_stepahead
    """
    # prune_dataset_at_stepahead appends this argument to a list of column names, so the None
    # default must be turned into an empty list before forwarding it
    metadata_features = [] if metadata_features is None else list(metadata_features)

    n_target = int(np.sum([len(t.metadata) for t in self.target_transformers]))
    metadata = pd.concat([t.metadata for t in self.transformers])

    # overall time range of the features, in seconds; it does not depend on the target column
    t_min = metadata['start_time'].min().total_seconds()
    t_max = metadata['end_time'].max().total_seconds()

    import matplotlib.pyplot as plt
    plt.figure()
    for target_col_num in range(n_target):
        x_i = self.prune_dataset_at_stepahead(df, target_col_num, metadata_features=metadata_features, method=method,
                                              period=period, keep_last_n_lags=keep_last_n_lags,
                                              keep_last_seconds=keep_last_seconds,
                                              tol_period=tol_period)
        # Metadata rows of the retained columns only. A boolean mask is used instead of
        # .loc[x_i.columns] because x_i also holds columns (e.g. metadata_features) that may
        # have no metadata row, and a label-list lookup would raise KeyError on them.
        kept = metadata.loc[metadata.index.isin(x_i.columns)]

        plt.cla()
        # invisible line spanning the whole time range (presumably to keep the x extent fixed
        # across frames -- confirm)
        plt.plot([t_min, t_max], [0, 0], linestyle='')
        k = 0
        for feature in metadata['name'].unique():
            for _, var in kept.loc[kept['name'] == feature].iterrows():
                k += 1
                plt.plot([var['start_time'].total_seconds(), var['end_time'].total_seconds()], [k, k],
                         label=feature, alpha=0.5)
        plt.legend()
        plt.title('target {}'.format(target_col_num))
        plt.pause(0.1)


def rename_features_prediction_time(self, x, sa):
"""
Rename features in x such that they contain the relative time w.r.t. the sa step ahead of prediction
Expand Down
11 changes: 6 additions & 5 deletions tests/test_boosters.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,9 @@ def test_linear_val_split(self):

formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(24),
relative_lags=True)
formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 24])
formatter.add_target_transform(['all'], lags=-np.arange(24)-1)

x, y = formatter.transform(self.data.iloc[:1000])
x, y = formatter.transform(self.data.resample('1h').mean())
n_tr = int(len(x) * 0.7)
x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(), y.iloc[:n_tr].copy(),
y.iloc[n_tr:].copy()]
Expand All @@ -33,14 +32,16 @@ def test_linear_val_split(self):
y_hat_lin = m_lin.predict(x_te)
q = m_lin.predict_quantiles(x_te)

m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1, val_ratio=0.3, lgb_pars={'num_leaves': 100, 'n_estimators': 100, 'learning_rate':0.05}, n_single=20, parallel=True, formatter=formatter, metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour']).fit(x_tr, y_tr)
m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1, val_ratio=0.3, lgb_pars={'num_leaves': 300, 'n_estimators': 10, 'learning_rate':0.05},
n_single=10, parallel=True, formatter=formatter, metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour'],tol_period='1h', keep_last_seconds=3600).fit(x_tr, y_tr)
y_hat_lgbh = m_lgbhybrid.predict(x_te)
q = m_lgbhybrid.predict_quantiles(x_te)

m_lgb = LGBForecaster(lgb_pars={'num_leaves': 10, 'n_estimators': 100, 'learning_rate':0.05}).fit(x_tr, y_tr)
m_lgb = LGBForecaster(lgb_pars={'num_leaves': 10, 'n_estimators': 10, 'learning_rate':0.05}, parallel=True).fit(x_tr, y_tr)
y_hat_lgb = m_lgb.predict(x_te)

# plot_quantiles([y_te.iloc[:10, :], y_hat_lin.iloc[:10, :], y_hat_lgbh.iloc[:10, :], y_hat_lgb.iloc[:10, :]], q[:10, :, :], ['y_te', 'y_lin', 'y_lgbhybrid_1', 'y_hat_lgb'])
# plot_quantiles([y_hat_lgbh.iloc[:100, :]], q[:100, :, :], ['y_hat_lgb'], n_rows=100, repeat=True)
plot_quantiles([y_hat_lin.iloc[:100, :]], q[:100, :, :], ['y_hat_lgb'], n_rows=100, repeat=False)


def do_not_test_linear_val_split(self):
Expand Down

0 comments on commit d7f5d15

Please sign in to comment.