Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REGR: groupby.idxmin/idxmax wrong result on extreme values #57046

Merged
merged 8 commits into from
Jan 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)

.. ---------------------------------------------------------------------------
Expand Down
17 changes: 10 additions & 7 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1771,6 +1771,7 @@ def group_idxmin_idxmax(
Py_ssize_t i, j, N, K, lab
numeric_object_t val
numeric_object_t[:, ::1] group_min_or_max
uint8_t[:, ::1] seen
bint uses_mask = mask is not None
bint isna_entry
bint compute_max = name == "idxmax"
Expand All @@ -1784,13 +1785,10 @@ def group_idxmin_idxmax(

if numeric_object_t is object:
group_min_or_max = np.empty((<object>out).shape, dtype=object)
seen = np.zeros((<object>out).shape, dtype=np.uint8)
else:
group_min_or_max = np.empty_like(out, dtype=values.dtype)
if N > 0 and K > 0:
# When N or K is zero, we never use group_min_or_max
group_min_or_max[:] = _get_min_or_max(
values[0, 0], compute_max, is_datetimelike
)
seen = np.zeros_like(out, dtype=np.uint8)

# When using transform, we need a valid value for take in the case
# a category is not observed; these values will be dropped
Expand All @@ -1806,6 +1804,7 @@ def group_idxmin_idxmax(
if not skipna and out[lab, j] == -1:
# Once we've hit NA there is no going back
continue

val = values[i, j]

if uses_mask:
Expand All @@ -1814,10 +1813,14 @@ def group_idxmin_idxmax(
isna_entry = _treat_as_na(val, is_datetimelike)

if isna_entry:
if not skipna:
if not skipna or not seen[lab, j]:
out[lab, j] = -1
else:
if compute_max:
if not seen[lab, j]:
seen[lab, j] = True
group_min_or_max[lab, j] = val
out[lab, j] = i
elif compute_max:
if val > group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i
Expand Down
1 change: 1 addition & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ def _call_cython_op(
mask=mask,
result_mask=result_mask,
is_datetimelike=is_datetimelike,
**kwargs,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is kwargs needed here?

Copy link
Member Author

@rhshadrach rhshadrach Jan 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To pass skipna to group_idxmin_idxmax

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah OK and I guess not all these accept skipna (IIRC there was that related issue related skipna, but forgot if it was for groupby ops or not)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea - I'm planning on turning #56939 into a tracking issue on adding skipna consistently throughout groupby. Can include removing these kwargs as part of that.

)
elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]:
if self.how in ["std", "sem"]:
Expand Down
62 changes: 62 additions & 0 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,68 @@ def test_empty(frame_or_series, all_boolean_reductions):
tm.assert_equal(result, expected)


@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype):
# GH#57040
if any_real_numpy_dtype is int or any_real_numpy_dtype is float:
# No need to test
return
info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo
min_value = info(any_real_numpy_dtype).min
max_value = info(any_real_numpy_dtype).max
df = DataFrame(
{"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]},
dtype=any_real_numpy_dtype,
)
gb = df.groupby("a")
result = getattr(gb, how)()
expected = DataFrame(
{"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype)
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
# GH#57040
min_value = np.finfo(float_numpy_dtype).min
max_value = np.finfo(float_numpy_dtype).max
df = DataFrame(
{
"a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"),
"b": Series(
[
np.nan,
min_value,
np.nan,
max_value,
min_value,
np.nan,
max_value,
np.nan,
np.nan,
np.nan,
],
dtype=float_numpy_dtype,
),
},
)
gb = df.groupby("a")

warn = None if skipna else FutureWarning
msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values"
with tm.assert_produces_warning(warn, match=msg):
result = getattr(gb, how)(skipna=skipna)
if skipna:
values = [1, 3, 4, 6, np.nan]
else:
values = np.nan
expected = DataFrame(
{"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp")
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"func, values",
[
Expand Down
Loading