diff --git a/pyforecaster/forecasting_models/gradientboosters.py b/pyforecaster/forecasting_models/gradientboosters.py
index bb04253..b9bd4ba 100644
--- a/pyforecaster/forecasting_models/gradientboosters.py
+++ b/pyforecaster/forecasting_models/gradientboosters.py
@@ -98,15 +98,15 @@ def fit(self, x, y):
         n_batch = int(len(x_pd)*red_frac)
         n_long = n_batch*self.n_multistep
         rand_idx = np.random.choice(x_pd.index, n_long).reshape(self.n_multistep, -1)
-        for sa in range(self.n_multistep):
-            x_i = pd.concat([x_pd.loc[rand_idx[sa], :].reset_index(drop=True), pd.Series(np.ones(n_batch) * sa)], axis=1)
+        x_long = []
+        y_long = []
+        for i, sa in enumerate(np.arange(self.n_single, self.n_single + self.n_multistep)):
+            x_i = pd.concat([x_pd.loc[rand_idx[i], :].reset_index(drop=True), pd.Series(np.ones(n_batch) * i)], axis=1)
+            y_long.append(y.iloc[rand_idx[i], sa])
             x_long.append(x_i)
-        y = y
-        y_long = []
-        for i in range(self.n_multistep):
-            y_long.append(y.iloc[rand_idx[i], i])
+        x_long = pd.concat(x_long, axis=0)
         y_long = pd.concat(y_long)

         t_0 = time()
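Note on the gradientboosters.py hunk: x_long and y_long are now built in one pass over the same random batches, and the target column index runs from n_single onward, because the first n_single horizons are served by dedicated per-step models while the remaining n_multistep horizons share a single booster that receives the step index as an extra feature. A minimal, self-contained sketch of this long-format construction with toy data (sizes and names here are illustrative, not the library's API):

    # Toy reproduction of the batching scheme in fit(): one shared model
    # sees red_frac of the rows for each of the n_multistep horizons,
    # with the horizon index appended as a feature column.
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    x_pd = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
    y = pd.DataFrame(rng.normal(size=(100, 24)))   # 24 step-ahead targets
    n_single, n_multistep, red_frac = 10, 14, 0.1

    n_batch = int(len(x_pd) * red_frac)
    rand_idx = rng.choice(x_pd.index, n_batch * n_multistep).reshape(n_multistep, -1)

    x_long, y_long = [], []
    for i, sa in enumerate(np.arange(n_single, n_single + n_multistep)):
        x_i = pd.concat([x_pd.loc[rand_idx[i], :].reset_index(drop=True),
                         pd.Series(np.ones(n_batch) * i)], axis=1)
        y_long.append(y.iloc[rand_idx[i], sa])   # target of horizon sa
        x_long.append(x_i)

    x_long = pd.concat(x_long, axis=0)
    y_long = pd.concat(y_long)
    assert len(x_long) == len(y_long) == n_batch * n_multistep

Conditioning on the horizon index this way trades one model per step for a single model trained on roughly red_frac of the rows per step, which is what keeps the long horizons cheap.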
diff --git a/pyforecaster/formatter.py b/pyforecaster/formatter.py
index 7c426b5..55932bf 100644
--- a/pyforecaster/formatter.py
+++ b/pyforecaster/formatter.py
@@ -399,41 +399,63 @@ def print_fold(tr_idxs, te_idxs):
     def prune_dataset_at_stepahead(self, df, target_col_num, metadata_features, method='periodic', period='24h',
                                    tol_period='1h', keep_last_n_lags=0, keep_last_seconds=0):
-        features = []
         # retrieve referring_time of the given sa for the target from target_transformers
-        target_times = []
+        target_start_times = []
+        target_end_times = []
         for tt in self.target_transformers:
-            target_times.append(tt.metadata.iloc[target_col_num, :]['end_time'])
-            #target_times.append(tt.metadata.loc[tt.metadata['lag']==-sa, 'end_time'])
+            target_start_times.append(tt.metadata.iloc[target_col_num, :]['start_time'])
+            target_end_times.append(tt.metadata.iloc[target_col_num, :]['end_time'])
-        target_time = pd.to_timedelta(target_times[0])
-
+        target_start_time = pd.to_timedelta(target_start_times[0])
+        target_end_time = pd.to_timedelta(target_end_times[0])
+        metadata = pd.concat([t.metadata for t in self.transformers])
+
         if method == 'periodic':
-            # find signals with (target_time - referring_time) multiple of period
-            for t in self.transformers:
-                features += list(t.metadata.index[((target_time-t.metadata['end_time']).abs() % pd.Timedelta(period)
-                                                   <= pd.Timedelta(tol_period)) & (t.metadata['end_time']<=target_time)])
+            # keep signals whose start or end time trails the target's by ~a multiple of period
+            c1 = (target_start_time - metadata['start_time']) % pd.Timedelta(period) <= pd.Timedelta(tol_period)
+            c2 = (target_end_time - metadata['end_time']) % pd.Timedelta(period) <= pd.Timedelta(tol_period)
+            causality = metadata['end_time'] <= target_start_time
+            metadata['keep'] = (c1 | c2) & causality
         elif method == 'up_to':
-            for t in self.transformers:
-                features += list(t.metadata.index[t.metadata['end_time'] <= target_time])
+            metadata['keep'] = metadata['end_time'] <= target_start_time
+        else:
+            metadata['keep'] = True

         if keep_last_n_lags > 0:
-            last_lag_features = list(np.hstack([t.metadata.index[t.metadata['lag'].isin(np.arange(keep_last_n_lags))]
-                                                for t in self.transformers]))
-            features = np.unique(features + last_lag_features)
+            metadata['keep'] = metadata['keep'] | metadata['lag'].isin(np.arange(keep_last_n_lags))
+
         if keep_last_seconds >0:
-            closest_features = []
-            for t in self.transformers:
-                delta_sec = t.metadata.start_time.apply(lambda x: x.total_seconds())
-                keep_condition = (delta_sec> -keep_last_seconds) & (delta_sec<=0)
-                closest_features.append(t.metadata.index[keep_condition])
-            closest_features = list(np.unique(np.hstack(closest_features)))
-            features = np.unique(features + closest_features)
-
-        features = np.unique(list(features) + metadata_features)
+            # keep features whose window touches the last keep_last_seconds before prediction time
+            close = (metadata['start_time'] >= -pd.Timedelta(keep_last_seconds, unit='s')) | \
+                    (metadata['end_time'] >= -pd.Timedelta(keep_last_seconds, unit='s'))
+            predecessor = metadata['end_time'] <= target_start_time
+            metadata['keep'] = metadata['keep'] | (close & predecessor)
+
+        features = list(metadata.loc[metadata['keep']].index) + metadata_features
         return df[features]

+    def plot_dataset_at_stepahead(self, df, metadata_features=None, method='periodic', period='24h',
+                                  tol_period='1h', keep_last_n_lags=0, keep_last_seconds=0):
+        metadata_features = metadata_features if metadata_features is not None else []
+        n_target = np.sum([len(t.metadata) for t in self.target_transformers])
+        metadata = pd.concat([t.metadata for t in self.transformers])
+
+        import matplotlib.pyplot as plt
+        plt.figure()
+        for target_col_num in np.arange(n_target):
+            x_i = self.prune_dataset_at_stepahead(df, target_col_num, metadata_features=metadata_features,
+                                                  method=method, period=period, keep_last_n_lags=keep_last_n_lags,
+                                                  keep_last_seconds=keep_last_seconds, tol_period=tol_period)
+            plt.cla()
+            # invisible line spanning the full feature time range, only used to fix the x-limits
+            plt.plot(np.hstack([metadata['start_time'].min().total_seconds(),
+                                metadata['end_time'].max().total_seconds()]), np.hstack([0, 0]), linestyle='')
+            k = 0
+            for feature in metadata['name'].unique():
+                metadata_filt = metadata.loc[(metadata['name'] == feature) & metadata.index.isin(x_i.columns)]
+                for _, var in metadata_filt.iterrows():
+                    k += 1
+                    plt.plot(np.hstack([var['start_time'].total_seconds(), var['end_time'].total_seconds()]),
+                             np.hstack([k, k]), label=feature, alpha=0.5)
+            plt.legend()
+            plt.title('target {}'.format(target_col_num))
+            plt.pause(0.1)
+
     def rename_features_prediction_time(self, x, sa):
         """
         Rename features in x such that they contain the relative time w.r.t.
         the sa step ahead of prediction
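Note on the formatter.py hunk: pruning now works on one concatenated metadata frame with vectorized boolean masks instead of per-transformer loops, and matches both start_time and end_time against the target's. A toy illustration of the 'periodic' rule on a hand-built metadata frame (in the library this frame is produced by the transformers; only the column names and the mask logic are taken from the diff):

    # Hourly lag features; keep only those roughly a whole number of
    # 24h periods behind a 1-step-ahead target.
    import numpy as np
    import pandas as pd

    lags = np.arange(48)
    metadata = pd.DataFrame({
        'start_time': [pd.Timedelta(-int(l), unit='h') for l in lags],
        'end_time': [pd.Timedelta(-int(l), unit='h') for l in lags],
        'lag': lags,
    }, index=[f'y_lag_{l}' for l in lags])

    target_start_time = pd.Timedelta('1h')
    period, tol_period = pd.Timedelta('24h'), pd.Timedelta('1h')

    c1 = (target_start_time - metadata['start_time']) % period <= tol_period
    causality = metadata['end_time'] <= target_start_time
    print(metadata.index[c1 & causality].tolist())
    # -> ['y_lag_0', 'y_lag_23', 'y_lag_24', 'y_lag_47']

For daily-periodic signals this keeps the informative lags at multiples of one day and discards the rest, which is the point of pruning the feature set per step ahead.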
diff --git a/tests/test_boosters.py b/tests/test_boosters.py
index 15a42df..bb61ebd 100644
--- a/tests/test_boosters.py
+++ b/tests/test_boosters.py
@@ -21,10 +21,9 @@ def test_linear_val_split(self):
         formatter = Formatter(logger=self.logger).add_transform(['all'], lags=np.arange(24),
                                                                 relative_lags=True)
-        formatter.add_transform(['all'], ['min', 'max'], agg_bins=[1, 2, 15, 24])
         formatter.add_target_transform(['all'], lags=-np.arange(24)-1)

-        x, y = formatter.transform(self.data.iloc[:1000])
+        x, y = formatter.transform(self.data.resample('1h').mean())
         n_tr = int(len(x) * 0.7)
         x_tr, x_te, y_tr, y_te = [x.iloc[:n_tr, :].copy(), x.iloc[n_tr:, :].copy(),
                                   y.iloc[:n_tr].copy(), y.iloc[n_tr:].copy()]
@@ -33,14 +32,16 @@ def test_linear_val_split(self):
         y_hat_lin = m_lin.predict(x_te)
         q = m_lin.predict_quantiles(x_te)

-        m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1, val_ratio=0.3, lgb_pars={'num_leaves': 100, 'n_estimators': 100, 'learning_rate':0.05}, n_single=20, parallel=True, formatter=formatter, metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour']).fit(x_tr, y_tr)
+        m_lgbhybrid = LGBMHybrid(red_frac_multistep=0.1, val_ratio=0.3,
+                                 lgb_pars={'num_leaves': 300, 'n_estimators': 10, 'learning_rate': 0.05},
+                                 n_single=10, parallel=True, formatter=formatter,
+                                 metadata_features=['minuteofday', 'utc_offset', 'dayofweek', 'hour'],
+                                 tol_period='1h', keep_last_seconds=3600).fit(x_tr, y_tr)
         y_hat_lgbh = m_lgbhybrid.predict(x_te)
         q = m_lgbhybrid.predict_quantiles(x_te)

-        m_lgb = LGBForecaster(lgb_pars={'num_leaves': 10, 'n_estimators': 100, 'learning_rate':0.05}).fit(x_tr, y_tr)
+        m_lgb = LGBForecaster(lgb_pars={'num_leaves': 10, 'n_estimators': 10, 'learning_rate': 0.05}, parallel=True).fit(x_tr, y_tr)
         y_hat_lgb = m_lgb.predict(x_te)

-        # plot_quantiles([y_te.iloc[:10, :], y_hat_lin.iloc[:10, :], y_hat_lgbh.iloc[:10, :], y_hat_lgb.iloc[:10, :]], q[:10, :, :], ['y_te', 'y_lin', 'y_lgbhybrid_1', 'y_hat_lgb'])
+        # plot_quantiles([y_hat_lgbh.iloc[:100, :]], q[:100, :, :], ['y_hat_lgbh'], n_rows=100, repeat=True)
+        plot_quantiles([y_hat_lin.iloc[:100, :]], q[:100, :, :], ['y_hat_lin'], n_rows=100, repeat=False)

     def do_not_test_linear_val_split(self):
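Note on the test change: with 24 target lags and n_single=10, the hybrid presumably trains 10 dedicated boosters and folds the remaining 14 horizons into the shared long-format model (n_multistep = n_targets - n_single is inferred from the fit loop above, not stated in the diff). A rough sanity check with a hypothetical row count:

    n_targets, n_single = 24, 10
    n_multistep = n_targets - n_single        # 14 horizons share one booster
    red_frac_multistep = 0.1
    n_rows = 5000                             # hypothetical training-set size
    rows_per_step = int(n_rows * red_frac_multistep)
    print(n_multistep, rows_per_step, n_multistep * rows_per_step)  # 14 500 7000

keep_last_seconds=3600 additionally protects everything observed in the hour before prediction from pruning, and tol_period='1h' widens the periodic match by the same margin.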