Skip to content

Commit

Permalink
REGR: groupby.idxmin/idxmax wrong result on extreme values (pandas-de…
Browse files Browse the repository at this point in the history
…v#57046)

* WIP

* REGR: groupby.idxmin/idxmax wrong result on extreme values

* fixes

* NumPy 2.0 can't come soon enough

* fixup

* fixup
  • Loading branch information
rhshadrach authored and pmhatre1 committed May 7, 2024
1 parent 68fca6b commit 1de399c
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 7 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)

.. ---------------------------------------------------------------------------
Expand Down
17 changes: 10 additions & 7 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1771,6 +1771,7 @@ def group_idxmin_idxmax(
Py_ssize_t i, j, N, K, lab
numeric_object_t val
numeric_object_t[:, ::1] group_min_or_max
uint8_t[:, ::1] seen
bint uses_mask = mask is not None
bint isna_entry
bint compute_max = name == "idxmax"
Expand All @@ -1784,13 +1785,10 @@ def group_idxmin_idxmax(

if numeric_object_t is object:
group_min_or_max = np.empty((<object>out).shape, dtype=object)
seen = np.zeros((<object>out).shape, dtype=np.uint8)
else:
group_min_or_max = np.empty_like(out, dtype=values.dtype)
if N > 0 and K > 0:
# When N or K is zero, we never use group_min_or_max
group_min_or_max[:] = _get_min_or_max(
values[0, 0], compute_max, is_datetimelike
)
seen = np.zeros_like(out, dtype=np.uint8)

# When using transform, we need a valid value for take in the case
# a category is not observed; these values will be dropped
Expand All @@ -1806,6 +1804,7 @@ def group_idxmin_idxmax(
if not skipna and out[lab, j] == -1:
# Once we've hit NA there is no going back
continue

val = values[i, j]

if uses_mask:
Expand All @@ -1814,10 +1813,14 @@ def group_idxmin_idxmax(
isna_entry = _treat_as_na(val, is_datetimelike)

if isna_entry:
if not skipna:
if not skipna or not seen[lab, j]:
out[lab, j] = -1
else:
if compute_max:
if not seen[lab, j]:
seen[lab, j] = True
group_min_or_max[lab, j] = val
out[lab, j] = i
elif compute_max:
if val > group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i
Expand Down
1 change: 1 addition & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ def _call_cython_op(
mask=mask,
result_mask=result_mask,
is_datetimelike=is_datetimelike,
**kwargs,
)
elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]:
if self.how in ["std", "sem"]:
Expand Down
62 changes: 62 additions & 0 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,68 @@ def test_empty(frame_or_series, all_boolean_reductions):
tm.assert_equal(result, expected)


@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype):
# GH#57040
if any_real_numpy_dtype is int or any_real_numpy_dtype is float:
# No need to test
return
info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo
min_value = info(any_real_numpy_dtype).min
max_value = info(any_real_numpy_dtype).max
df = DataFrame(
{"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]},
dtype=any_real_numpy_dtype,
)
gb = df.groupby("a")
result = getattr(gb, how)()
expected = DataFrame(
{"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype)
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
# GH#57040
min_value = np.finfo(float_numpy_dtype).min
max_value = np.finfo(float_numpy_dtype).max
df = DataFrame(
{
"a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"),
"b": Series(
[
np.nan,
min_value,
np.nan,
max_value,
min_value,
np.nan,
max_value,
np.nan,
np.nan,
np.nan,
],
dtype=float_numpy_dtype,
),
},
)
gb = df.groupby("a")

warn = None if skipna else FutureWarning
msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values"
with tm.assert_produces_warning(warn, match=msg):
result = getattr(gb, how)(skipna=skipna)
if skipna:
values = [1, 3, 4, 6, np.nan]
else:
values = np.nan
expected = DataFrame(
{"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp")
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"func, values",
[
Expand Down

0 comments on commit 1de399c

Please sign in to comment.