Skip to content

Commit

Permalink
Fix GroupedPipeline bug selecting from pd.Series
Browse files Browse the repository at this point in the history
When the target `y` is a pd.Series rather than a numpy array, and the
pd.Series has a numeric index that isn't consecutive, previously
GroupedPipeline will attempt to select targets based on the index. In
the newer version of pandas, any missing indices will also raise an
Exception.

This is because we selected from `y` assuming it's a numpy array:
`y[indices]`. If y has a consecutive index, it makes no difference. If y
has a non-numeric index, pandas falls back to selecting via integer
location like numpy. However, if the index for y is numeric _and_
non-consecutive, the pd.Index is incorrectly used rather than the
position, and an Exception can be raised if indices are missing.

This change adds a test for the bug and updates the `_iter_groups`
method to always select the correct targets regardless of their type
(pd.Series or np.array) or their index.
  • Loading branch information
ali-tny committed Aug 5, 2021
1 parent 33728b5 commit 3ce7b8d
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 2 deletions.
20 changes: 19 additions & 1 deletion tests/test_pipeline/test_grouped_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,28 @@ def test_all_groups_missing_raises(input_df):
)
def test_iter_groups_non_consecutive_index(index):
group = [1] * 2 + [2] * (len(index) - 2)
target = np.array(group)
value = np.random.random(len(index))
input_df = pd.DataFrame(
[group, value], index=["group", "value"], columns=index
).T
gp = GroupedPipeline(groupby=["group"], pipeline=None)
for key, sub_df, _ in gp._iter_groups(input_df):
for key, sub_df, sub_target in gp._iter_groups(input_df, target):
assert (sub_df["group"] == key).all()
assert (sub_target == key).all()


@pytest.mark.parametrize(
"index", [[1, 2, 3, 4], pd.date_range("2019-01-01", periods=4)]
)
def test_iter_groups_non_consecutive_index_target_series(index):
group = [1] * 2 + [2] * (len(index) - 2)
target = pd.Series(group, index=index)
value = np.random.random(len(index))
input_df = pd.DataFrame(
[group, value], index=["group", "value"], columns=index
).T
gp = GroupedPipeline(groupby=["group"], pipeline=None)
for key, sub_df, sub_target in gp._iter_groups(input_df, target):
assert (sub_df["group"] == key).all()
assert (sub_target == key).all()
6 changes: 5 additions & 1 deletion timeserio/pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,11 @@ def _iter_groups(self, df, y=None):
groups = df.groupby(self.groupby).indices
for key, sub_idx in groups.items():
sub_df = df.iloc[sub_idx]
sub_y = y[sub_idx] if y is not None else None
if y is not None:
# y is either a numpy array or a pd.Series so index accordingly
sub_y = y.iloc[sub_idx] if type(y) is pd.Series else y[sub_idx]
else:
sub_y = None
yield key, sub_df, sub_y

def fit(self, df, y=None):
Expand Down

0 comments on commit 3ce7b8d

Please sign in to comment.