From 3ce7b8d4757d9ee8cad49e7d13209797d80d8162 Mon Sep 17 00:00:00 2001 From: ali-tny Date: Thu, 5 Aug 2021 11:27:04 +0100 Subject: [PATCH] Fix GroupedPipeline bug selecting from pd.Series When the target `y` is a pd.Series rather than a numpy array, and the pd.Series has a numeric index that isn't consecutive, previously GroupedPipeline will attempt to select targets based on the index. In the newer version of pandas, any missing indices will also raise an Exception. This is because we selected from `y` assuming it's a numpy array: `y[indices]`. If y has a consecutive index, it makes no difference. If y has a non-numeric index, pandas falls back to selecting via integer location like numpy. However, if the index for y is numeric _and_ non-consecutive, the pd.Index is incorrectly used rather than the position, and an Exception can be raised if indices are missing. This change adds a test for the bug and updates the `_iter_groups` method to always select the correct targets regardless of their type (pd.Series or np.array) or their index. --- tests/test_pipeline/test_grouped_pipeline.py | 20 +++++++++++++++++++- timeserio/pipeline/pipeline.py | 6 +++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/test_pipeline/test_grouped_pipeline.py b/tests/test_pipeline/test_grouped_pipeline.py index e49293d..c7c8dda 100644 --- a/tests/test_pipeline/test_grouped_pipeline.py +++ b/tests/test_pipeline/test_grouped_pipeline.py @@ -191,10 +191,28 @@ def test_all_groups_missing_raises(input_df): ) def test_iter_groups_non_consecutive_index(index): group = [1] * 2 + [2] * (len(index) - 2) + target = np.array(group) value = np.random.random(len(index)) input_df = pd.DataFrame( [group, value], index=["group", "value"], columns=index ).T gp = GroupedPipeline(groupby=["group"], pipeline=None) - for key, sub_df, _ in gp._iter_groups(input_df): + for key, sub_df, sub_target in gp._iter_groups(input_df, target): assert (sub_df["group"] == key).all() + assert (sub_target == key).all() + + +@pytest.mark.parametrize( + "index", [[1, 2, 3, 4], pd.date_range("2019-01-01", periods=4)] +) +def test_iter_groups_non_consecutive_index_target_series(index): + group = [1] * 2 + [2] * (len(index) - 2) + target = pd.Series(group, index=index) + value = np.random.random(len(index)) + input_df = pd.DataFrame( + [group, value], index=["group", "value"], columns=index + ).T + gp = GroupedPipeline(groupby=["group"], pipeline=None) + for key, sub_df, sub_target in gp._iter_groups(input_df, target): + assert (sub_df["group"] == key).all() + assert (sub_target == key).all() diff --git a/timeserio/pipeline/pipeline.py b/timeserio/pipeline/pipeline.py index 24e592f..4dbbb57 100644 --- a/timeserio/pipeline/pipeline.py +++ b/timeserio/pipeline/pipeline.py @@ -127,7 +127,11 @@ def _iter_groups(self, df, y=None): groups = df.groupby(self.groupby).indices for key, sub_idx in groups.items(): sub_df = df.iloc[sub_idx] - sub_y = y[sub_idx] if y is not None else None + if y is not None: + # y is either a numpy array or a pd.Series so index accordingly + sub_y = y.iloc[sub_idx] if type(y) is pd.Series else y[sub_idx] + else: + sub_y = None yield key, sub_df, sub_y def fit(self, df, y=None):