From 7063fcee8b117abaa52023c98ae5c9e14f4cc1f8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Jan 2024 17:36:04 -0500 Subject: [PATCH 1/6] WIP --- pandas/_libs/groupby.pyx | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index dc8f13a919403..13ade442c2d4f 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1771,6 +1771,7 @@ def group_idxmin_idxmax( Py_ssize_t i, j, N, K, lab numeric_object_t val numeric_object_t[:, ::1] group_min_or_max + uint8_t[:, ::1] seen bint uses_mask = mask is not None bint isna_entry bint compute_max = name == "idxmax" @@ -1784,13 +1785,10 @@ def group_idxmin_idxmax( if numeric_object_t is object: group_min_or_max = np.empty((out).shape, dtype=object) + seen = np.zeros((out).shape, dtype=np.uint8) else: group_min_or_max = np.empty_like(out, dtype=values.dtype) - if N > 0 and K > 0: - # When N or K is zero, we never use group_min_or_max - group_min_or_max[:] = _get_min_or_max( - values[0, 0], compute_max, is_datetimelike - ) + seen = np.zeros_like(out, dtype=np.uint8) # When using transform, we need a valid value for take in the case # a category is not observed; these values will be dropped @@ -1806,6 +1804,7 @@ def group_idxmin_idxmax( if not skipna and out[lab, j] == -1: # Once we've hit NA there is no going back continue + val = values[i, j] if uses_mask: @@ -1817,7 +1816,11 @@ def group_idxmin_idxmax( if not skipna: out[lab, j] = -1 else: - if compute_max: + if not seen[lab, j]: + seen[lab, j] = True + group_min_or_max[lab, j] = val + out[lab, j] = i + elif compute_max: if val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i From 866b581e454ec3148067b7cd2826e1d8a29c7798 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Jan 2024 17:36:04 -0500 Subject: [PATCH 2/6] REGR: groupby.idxmin/idxmax wrong result on extreme values --- doc/source/whatsnew/v2.2.1.rst | 2 + pandas/_libs/groupby.pyx | 17 ++++--- pandas/core/groupby/ops.py | 1 + pandas/tests/groupby/test_reductions.py | 63 +++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 75445c978d262..f501ebb15c8ba 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -14,6 +14,8 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index dc8f13a919403..45e02c3dd420f 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1771,6 +1771,7 @@ def group_idxmin_idxmax( Py_ssize_t i, j, N, K, lab numeric_object_t val numeric_object_t[:, ::1] group_min_or_max + uint8_t[:, ::1] seen bint uses_mask = mask is not None bint isna_entry bint compute_max = name == "idxmax" @@ -1784,13 +1785,10 @@ def group_idxmin_idxmax( if numeric_object_t is object: group_min_or_max = np.empty((out).shape, dtype=object) + seen = np.zeros((out).shape, dtype=np.uint8) else: group_min_or_max = np.empty_like(out, dtype=values.dtype) - if N > 0 and K > 0: - # When N or K is zero, we never use group_min_or_max - group_min_or_max[:] = _get_min_or_max( - values[0, 0], compute_max, is_datetimelike - ) + seen = np.zeros_like(out, dtype=np.uint8) # When using transform, we need a valid value for take in the case # a category is not observed; these values will be dropped @@ -1806,6 +1804,7 @@ def group_idxmin_idxmax( if not skipna and out[lab, j] == -1: # Once we've hit NA there is no going back continue + val = values[i, j] if uses_mask: @@ -1814,10 +1813,14 @@ def group_idxmin_idxmax( isna_entry = _treat_as_na(val, is_datetimelike) if isna_entry: - if not skipna: + if not skipna or not seen[lab, j]: out[lab, j] = -1 else: - if compute_max: + if not seen[lab, j]: + seen[lab, j] = True + group_min_or_max[lab, j] = val + out[lab, j] = i + elif compute_max: if val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5e83eaee02afc..e2ddf9aa5c0c1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -424,6 +424,7 @@ def _call_cython_op( mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, + **kwargs, ) elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: if self.how in ["std", "sem"]: diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 7530c9ca78cbc..f6a48837c75b9 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -257,6 +257,69 @@ def test_empty(frame_or_series, all_boolean_reductions): tm.assert_equal(result, expected) +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype): + # GH#57040 + if any_real_numpy_dtype is int or any_real_numpy_dtype is float: + # No need to test + return + info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo + min_value = info(any_real_numpy_dtype).min + max_value = info(any_real_numpy_dtype).max + df = DataFrame( + {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]}, + dtype=any_real_numpy_dtype, + ) + gb = df.groupby("a") + result = getattr(gb, how)() + expected = DataFrame( + {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +@pytest.mark.parametrize("dtype", ["float32", "float64"]) +def test_idxmin_idxmax_extremes_skipna(skipna, how, dtype): + # GH#57040 + min_value = np.finfo(dtype).min + max_value = np.finfo(dtype).max + df = DataFrame( + { + "a": np.repeat(range(1, 6), repeats=2), + "b": Series( + [ + np.nan, + min_value, + np.nan, + max_value, + min_value, + np.nan, + max_value, + np.nan, + np.nan, + np.nan, + ], + dtype=dtype, + ), + }, + ) + gb = df.groupby("a") + + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, how)(skipna=skipna) + if skipna: + values = [1, 3, 4, 6, np.nan] + else: + values = np.nan + print(df) + print(result) + expected = DataFrame({"b": values}, index=pd.Index(range(1, 6), name="a")) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func, values", [ From 2dfe3a479f66b1c1b63443cd4e3d8c368083dcd7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 24 Jan 2024 16:31:37 -0500 Subject: [PATCH 3/6] fixes --- pandas/tests/groupby/test_reductions.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index f6a48837c75b9..6034578c150d2 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -279,11 +279,10 @@ def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype): @pytest.mark.parametrize("how", ["idxmin", "idxmax"]) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -def test_idxmin_idxmax_extremes_skipna(skipna, how, dtype): +def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): # GH#57040 - min_value = np.finfo(dtype).min - max_value = np.finfo(dtype).max + min_value = np.finfo(float_numpy_dtype).min + max_value = np.finfo(float_numpy_dtype).max df = DataFrame( { "a": np.repeat(range(1, 6), repeats=2), @@ -300,7 +299,7 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, dtype): np.nan, np.nan, ], - dtype=dtype, + dtype=float_numpy_dtype, ), }, ) @@ -314,8 +313,6 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, dtype): values = [1, 3, 4, 6, np.nan] else: values = np.nan - print(df) - print(result) expected = DataFrame({"b": values}, index=pd.Index(range(1, 6), name="a")) tm.assert_frame_equal(result, expected) From 9c54b8e3a22a347c66b8b2597be4004b64ab3c18 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 24 Jan 2024 17:47:06 -0500 Subject: [PATCH 4/6] NumPy 2.0 can't come soon enough --- pandas/tests/groupby/test_reductions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 6034578c150d2..b6aab05700734 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -313,7 +313,9 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): values = [1, 3, 4, 6, np.nan] else: values = np.nan - expected = DataFrame({"b": values}, index=pd.Index(range(1, 6), name="a")) + expected = DataFrame( + {"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp") + ) tm.assert_frame_equal(result, expected) From d2dda0a21536fe05ce23186dce6c0bac27d0d9cc Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 25 Jan 2024 16:18:10 -0500 Subject: [PATCH 5/6] fixup --- pandas/tests/groupby/test_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index b6aab05700734..5fc451d26c35e 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -285,7 +285,7 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): max_value = np.finfo(float_numpy_dtype).max df = DataFrame( { - "a": np.repeat(range(1, 6), repeats=2), + "a": np.repeat(range(1, 6), repeats=2, dtype="intp"), "b": Series( [ np.nan, From cf22b8144d979787ce76d4c7a8fd30d1a8c3f337 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 25 Jan 2024 21:59:14 -0500 Subject: [PATCH 6/6] fixup --- pandas/tests/groupby/test_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 5fc451d26c35e..d24a2a26bba81 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -285,7 +285,7 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): max_value = np.finfo(float_numpy_dtype).max df = DataFrame( { - "a": np.repeat(range(1, 6), repeats=2, dtype="intp"), + "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"), "b": Series( [ np.nan,