From 5acec90bb8562c02975d3e9acb8c300c28344ae3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 19 Jul 2023 18:46:10 -0400 Subject: [PATCH 01/35] PERF: Implement groupby idxmax/idxmin in Cython --- doc/source/whatsnew/v2.1.0.rst | 3 +- pandas/_libs/groupby.pyx | 142 ++++++++++++++++++++ pandas/core/arrays/categorical.py | 14 +- pandas/core/groupby/generic.py | 68 +++++++--- pandas/core/groupby/groupby.py | 58 +++++++- pandas/core/groupby/ops.py | 29 +++- pandas/core/indexes/datetimelike.py | 9 +- pandas/tests/groupby/test_categorical.py | 9 +- pandas/tests/groupby/test_function.py | 4 - pandas/tests/groupby/test_groupby.py | 18 +-- pandas/tests/groupby/test_groupby_dropna.py | 12 +- pandas/tests/groupby/test_raises.py | 8 +- 12 files changed, 305 insertions(+), 69 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index bc187361493c0..a0fc83f6c63bb 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -416,7 +416,7 @@ Performance improvements - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) - Performance improvement when passing an array to :meth:`RangeIndex.take`, :meth:`DataFrame.loc`, or :meth:`DataFrame.iloc` and the DataFrame is using a RangeIndex (:issue:`53387`) -- +- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (PR#??) .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: @@ -574,6 +574,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`) - Bug in :meth:`DataFrameGroupby.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`) - Bug in :meth:`Resampler.ohlc` with empty object returning a :class:`Series` instead of empty :class:`DataFrame` (:issue:`42902`) +- Bug in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` would fail if the groupings have unobserved categories (:issue:`10694`) - Bug in :meth:`SeriesGroupBy.nth` and :meth:`DataFrameGroupBy.nth` after performing column selection when using ``dropna="any"`` or ``dropna="all"`` would not subset columns (:issue:`53518`) - Bug in :meth:`SeriesGroupBy.nth` and :meth:`DataFrameGroupBy.nth` raised after performing column selection when using ``dropna="any"`` or ``dropna="all"`` resulted in rows being dropped (:issue:`53518`) - Bug in :meth:`SeriesGroupBy.sum` and :meth:`DataFrameGroupby.sum` summing ``np.inf + np.inf`` and ``(-np.inf) + (-np.inf)`` to ``np.nan`` (:issue:`53606`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 501d572c6623c..5fdb45afe3fdb 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1780,6 +1780,96 @@ cdef group_min_max( ) +@cython.wraparound(False) +@cython.boundscheck(False) +cdef group_idxmin_idxmax( + int64_t[:, ::1] out, + int64_t[::1] counts, + ndarray[numeric_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + bint compute_max=True, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +): + """ + Compute index of minimum/maximum of columns of `values`, in row groups `labels`. + + This function only computes the row number where the minimum/maximum occurs, we'll + take the corresponding index value after this function. + + Parameters + ---------- + out : np.ndarray[int64_t, ndim=2] + Array to store result in. + counts : np.ndarray[int64] + Input as a zeroed array, populated by group sizes during algorithm + values : array + Values to find column-wise min/max of. + labels : np.ndarray[np.intp] + Labels to group by. + min_count : Py_ssize_t, default -1 + Unused. For compatibility with other Cython ops. + is_datetimelike : bool + True if `values` contains datetime-like entries. + compute_max : bint, default True + True to compute group-wise max, False to compute min + mask : ndarray[bool, ndim=2], optional + If not None, indices represent missing values, + otherwise the mask will not be used + result_mask : ndarray[bool, ndim=2], optional + If not None, these specify locations in the output that are NA. + Modified in-place. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + `counts` is modified to hold group sizes + """ + cdef: + Py_ssize_t i, j, N, K, lab + numeric_t val + numeric_t[:, ::1] group_min_or_max + bint uses_mask = mask is not None + bint isna_entry + + # TODO(cython3): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: + raise AssertionError("len(index) != len(labels)") + + group_min_or_max = np.empty_like(out, dtype=values.dtype) + group_min_or_max[:] = _get_min_or_max(0, compute_max, is_datetimelike) + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if not isna_entry: + if compute_max: + if val > group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i + else: + if val < group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i + + @cython.wraparound(False) @cython.boundscheck(False) def group_max( @@ -1806,6 +1896,32 @@ def group_max( ) +@cython.wraparound(False) +@cython.boundscheck(False) +def group_idxmax( + int64_t[:, ::1] out, + int64_t[::1] counts, + ndarray[numeric_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +) -> None: + """See group_idxmin_idxmax.__doc__""" + group_idxmin_idxmax( + out, + counts, + values, + labels, + min_count=min_count, + is_datetimelike=is_datetimelike, + compute_max=True, + mask=mask, + result_mask=result_mask, + ) + + @cython.wraparound(False) @cython.boundscheck(False) def group_min( @@ -1832,6 +1948,32 @@ def group_min( ) +@cython.wraparound(False) +@cython.boundscheck(False) +def group_idxmin( + int64_t[:, ::1] out, + int64_t[::1] counts, + ndarray[numeric_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +) -> None: + """See group_idxmin_idxmax.__doc__""" + group_idxmin_idxmax( + out, + counts, + values, + labels, + min_count=min_count, + is_datetimelike=is_datetimelike, + compute_max=False, + mask=mask, + result_mask=result_mask, + ) + + @cython.boundscheck(False) @cython.wraparound(False) cdef group_cummin_max( diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6107388bfe78b..4a2e8f25d07eb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2679,12 +2679,22 @@ def _groupby_op( dtype = self.dtype if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: raise TypeError(f"{dtype} type does not support {how} operations") - if how in ["min", "max", "rank"] and not dtype.ordered: + if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: # raise TypeError instead of NotImplementedError to ensure we # don't go down a group-by-group path, since in the empty-groups # case that would fail to raise raise TypeError(f"Cannot perform {how} with non-ordered Categorical") - if how not in ["rank", "any", "all", "first", "last", "min", "max"]: + if how not in [ + "rank", + "any", + "all", + "first", + "last", + "min", + "max", + "idxmin", + "idxmax", + ]: if kind == "transform": raise TypeError(f"{dtype} type does not support {how} operations") raise TypeError(f"{dtype} dtype does not support aggregation '{how}'") diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2a1bd381abfdd..ee726edb94fba 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1185,15 +1185,33 @@ def nsmallest( def idxmin( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmin", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "idxmax") + if axis == 1: + result = self._op_via_apply("idxmax", axis=axis, skipna=skipna) + if result.empty: + result = result.astype(self.obj.index.dtype) + else: + alt = lambda x: x.idxmin(skipna=skipna) + result = self._idxmax_idxmin("idxmin", alt) + return result @doc(Series.idxmax.__doc__) def idxmax( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmax", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "idxmax") + if axis == 1: + result = self._op_via_apply("idxmax", axis=axis, skipna=skipna) + if result.empty: + result = result.astype(self.obj.index.dtype) + else: + alt = lambda x: x.idxmax(skipna=skipna) + result = self._idxmax_idxmin("idxmax", alt) + return result @doc(Series.corr.__doc__) def corr( @@ -2195,14 +2213,21 @@ def idxmax( else: axis = self.axis - def func(df): - return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only) + if axis == 1: - func.__name__ = "idxmax" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True - ) - return result.astype(self.obj.index.dtype) if result.empty else result + def func(df): + return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only) + + func.__name__ = "idxmax" + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + if result.empty: + result = result.astype(self.obj.index.dtype) + else: + alt = lambda x: x.idxmax(skipna=skipna) + result = self._idxmax_idxmin("idxmax", alt, numeric_only) + return result def idxmin( self, @@ -2290,14 +2315,21 @@ def idxmin( else: axis = self.axis - def func(df): - return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only) + if axis == 1: - func.__name__ = "idxmin" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True - ) - return result.astype(self.obj.index.dtype) if result.empty else result + def func(df): + return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only) + + func.__name__ = "idxmin" + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + if result.empty: + result = result.astype(self.obj.index.dtype) + else: + alt = lambda x: x.idxmin(skipna=skipna) + result = self._idxmax_idxmin("idxmin", alt, numeric_only) + return result boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 64d874a31c428..cd81f3b4d0e48 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -74,6 +74,7 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import ( coerce_indexer_dtype, ensure_dtype_can_hold_na, + na_value_for_dtype, ) from pandas.core.dtypes.common import ( is_bool_dtype, @@ -1823,12 +1824,14 @@ def _agg_general( *, alias: str, npfunc: Callable, + post_process: Callable | None = None, ): result = self._cython_agg_general( how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count, + post_process=post_process, ) return result.__finalize__(self.obj, method="groupby") @@ -1866,7 +1869,8 @@ def _agg_py_fallback( # preserve the kind of exception that raised raise type(err)(msg) from err - if ser.dtype == object: + if ser.dtype == object and how not in ["idxmax", "idxmin"]: + # idxmin/idxmax uses the index dtype res_values = res_values.astype(object, copy=False) # If we are DataFrameGroupBy and went through a SeriesGroupByPath @@ -1882,6 +1886,7 @@ def _cython_agg_general( alt: Callable, numeric_only: bool = False, min_count: int = -1, + post_process: Callable | None = None, **kwargs, ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; @@ -1913,6 +1918,8 @@ def array_func(values: ArrayLike) -> ArrayLike: return result new_mgr = data.grouped_reduce(array_func) + if post_process is not None: + new_mgr = new_mgr.apply(post_process) res = self._wrap_agged_manager(new_mgr) out = self._wrap_aggregated_output(res) if self.axis == 1: @@ -3219,6 +3226,55 @@ def max( npfunc=np.max, ) + def _idxmax_idxmin( + self, + how: Literal["idxmin", "idxmax"], + alt: Callable, + numeric_only: bool = False, + ) -> Series | DataFrame: + """Compute idxmax/idxmin. + + Parameters + ---------- + how: {"idxmin", "idxmax"} + Whether to compute idxmin or idxmax. + alt: Callable + Fallback for non-Cython (e.g. object) types. + numeric_only : bool, default False + Include only float, int, boolean columns. + + Returns + ------- + Series or DataFrame + idxmax or idxmin for the groupby operation. + """ + + def post_process(x): + if x.size == 0: + result = x.astype(self.obj.index.dtype) + else: + index = self.obj.index + if isinstance(index, MultiIndex): + index = index.to_flat_index() + result = index.take(x).values + mask = x == -1 + if mask.any(): + dtype = ensure_dtype_can_hold_na(result.dtype) + if dtype != result.dtype: + result = result.astype(dtype) + result[mask] = na_value_for_dtype(result.dtype) + return result + + result = self._agg_general( + numeric_only=numeric_only, + # min_count is not utilized + min_count=-1, + alias=how, + npfunc=alt, + post_process=post_process, + ) + return result + @final def first(self, numeric_only: bool = False, min_count: int = -1): """ diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3c4a22d009406..85a5f881ac495 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -133,6 +133,8 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "all": functools.partial(libgroupby.group_any_all, val_test="all"), "sum": "group_sum", "prod": "group_prod", + "idxmin": "group_idxmin", + "idxmax": "group_idxmax", "min": "group_min", "max": "group_max", "mean": "group_mean", @@ -268,6 +270,10 @@ def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: if how == "rank": out_dtype = "float64" + elif how in ("idxmin", "idxmax"): + # The Cython implementation only produces the row number; we'll take + # from the index using this in post processing + out_dtype = "int64" else: if dtype.kind in "iufcb": out_dtype = f"{dtype.kind}{dtype.itemsize}" @@ -396,10 +402,24 @@ def _call_cython_op( values = self._get_cython_vals(values) out_dtype = self._get_out_dtype(values.dtype) - result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) + if self.how in ["idxmin", "idxmax"]: + # A single categorical grouper with observed=False includes unobserved + # categories; encode these missing indices as -1 + result = -1 * np.ones(out_shape, dtype=out_dtype) + else: + result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) - if self.how in ["min", "max", "mean", "last", "first", "sum"]: + if self.how in [ + "idxmin", + "idxmax", + "min", + "max", + "mean", + "last", + "first", + "sum", + ]: func( out=result, counts=counts, @@ -463,10 +483,11 @@ def _call_cython_op( **kwargs, ) - if self.kind == "aggregate": + if self.kind == "aggregate" and self.how not in ("idxmin", "idxmax"): # i.e. counts is defined. Locations where count{kernel},dtype->object]"), ] ) - if kernel == "idxmin": - msg = "'<' not supported between instances of 'type' and 'type'" - elif kernel == "idxmax": - msg = "'>' not supported between instances of 'type' and 'type'" with pytest.raises(exception, match=msg): method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 635416f0cb1d6..458e60576a6b6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1954,16 +1954,6 @@ def test_empty_groupby( # GH8093 & GH26411 override_dtype = None - if ( - isinstance(values, Categorical) - and len(keys) == 1 - and op in ["idxmax", "idxmin"] - ): - mark = pytest.mark.xfail( - raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" - ) - request.node.add_marker(mark) - if isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64" @@ -2008,7 +1998,11 @@ def get_categorical_invalid_expected(): is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) - if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]: + if ( + isinstance(values, Categorical) + and not values.ordered + and op in ["min", "max", "idxmin", "idxmax"] + ): msg = f"Cannot perform {op} with non-ordered Categorical" with pytest.raises(TypeError, match=msg): get_result() @@ -2017,6 +2011,8 @@ def get_categorical_invalid_expected(): # i.e. DataframeGroupBy, not SeriesGroupBy result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() + if op in ["idxmax", "idxmin"]: + expected = expected.astype(int) tm.assert_equal(result, expected) return diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 03e3086b8c847..338cce1cfcbb5 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -504,15 +504,6 @@ def test_null_is_null_for_dtype( def test_categorical_reducers( request, reduction_func, observed, sort, as_index, index_kind ): - # GH#36327 - if ( - reduction_func in ("idxmin", "idxmax") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmin/max broken for categorical with observed=False" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - # Ensure there is at least one null value by appending to the end values = np.append(np.random.choice([1, 2, None], size=19), None) df = pd.DataFrame( @@ -562,9 +553,10 @@ def test_categorical_reducers( values = expected["y"].values.tolist() if index_kind == "single": values = [np.nan if e == 4 else e for e in values] + expected["y"] = pd.Categorical(values, categories=[1, 2, 3]) else: values = [(np.nan, np.nan) if e == (4, 4) else e for e in values] - expected["y"] = values + expected["y"] = values if reduction_func == "size": # size, unlike other methods, has the desired behavior in GH#49519 expected = expected.rename(columns={0: "size"}) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f9a2b3d44b117..1a606e1ef9235 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -617,12 +617,8 @@ def test_groupby_raises_category_on_category( if not using_copy_on_write else (None, ""), # no-op with CoW "first": (None, ""), - "idxmax": (ValueError, "attempt to get argmax of an empty sequence") - if empty_groups - else (None, ""), - "idxmin": (ValueError, "attempt to get argmin of an empty sequence") - if empty_groups - else (None, ""), + "idxmax": (None, "") if empty_groups else (None, ""), + "idxmin": (None, "") if empty_groups else (None, ""), "last": (None, ""), "max": (None, ""), "mean": (TypeError, "category dtype does not support aggregation 'mean'"), From 60407c53eddcaa821c59f94611d99cd86f32c822 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 23 Jul 2023 12:09:34 -0400 Subject: [PATCH 02/35] Update docs --- doc/source/user_guide/groupby.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 482e3fe91ca09..2f31828ead07c 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -511,8 +511,8 @@ listed below, those with a ``*`` do *not* have a Cython-optimized implementation :meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups :meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups :meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group - :meth:`~.DataFrameGroupBy.idxmax` *;Compute the index of the maximum value in each group - :meth:`~.DataFrameGroupBy.idxmin` *;Compute the index of the minimum value in each group + :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group + :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group :meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group :meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group :meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group From 73927a73446ce48a25cc6537718f437ebf6f6e8d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 23 Jul 2023 12:18:22 -0400 Subject: [PATCH 03/35] Add ASVs --- asv_bench/benchmarks/groupby.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index b43fce73ee128..f5bf6eaab3a8b 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -591,12 +591,10 @@ class GroupByCythonAgg: [ "sum", "prod", - # TODO: uncomment min/max - # Currently, min/max implemented very inefficiently - # because it re-uses the Window min/max kernel - # so it will time out ASVs - # "min", - # "max", + "min", + "max", + "idxmin", + "idxmax", "mean", "median", "var", From 6958c9519a72f778d71a04ac86af5f49e26069d0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 23 Jul 2023 12:54:11 -0400 Subject: [PATCH 04/35] mypy fixup --- pandas/core/groupby/generic.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ee726edb94fba..dc091cf0ad3a9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2226,7 +2226,10 @@ def func(df): result = result.astype(self.obj.index.dtype) else: alt = lambda x: x.idxmax(skipna=skipna) - result = self._idxmax_idxmin("idxmax", alt, numeric_only) + # expression has type "Series | DataFrame", variable has type "DataFrame" + result = self._idxmax_idxmin( # type: ignore[assignment] + "idxmax", alt, numeric_only + ) return result def idxmin( @@ -2328,7 +2331,10 @@ def func(df): result = result.astype(self.obj.index.dtype) else: alt = lambda x: x.idxmin(skipna=skipna) - result = self._idxmax_idxmin("idxmin", alt, numeric_only) + # expression has type "Series | DataFrame", variable has type "DataFrame" + result = self._idxmax_idxmin( # type: ignore[assignment] + "idxmin", alt, numeric_only + ) return result boxplot = boxplot_frame_groupby From f1b9ad1bb99ede4086dccdb2f4830fbce0446a31 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 24 Jul 2023 20:09:25 -0400 Subject: [PATCH 05/35] Refinements --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/_libs/groupby.pyi | 11 +++ pandas/_libs/groupby.pyx | 108 +++++++++++--------------- pandas/core/groupby/generic.py | 16 ++-- pandas/core/groupby/groupby.py | 20 +++-- pandas/core/groupby/ops.py | 17 ++-- pandas/tests/groupby/test_function.py | 4 + 7 files changed, 85 insertions(+), 93 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index a0fc83f6c63bb..704ddd659fa96 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -410,13 +410,13 @@ Performance improvements - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) +- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement in various :class:`MultiIndex` set and indexing operations (:issue:`53955`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) - Performance improvement when passing an array to :meth:`RangeIndex.take`, :meth:`DataFrame.loc`, or :meth:`DataFrame.iloc` and the DataFrame is using a RangeIndex (:issue:`53387`) -- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (PR#??) .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 18be65195a55e..8ba33f2dd4f2e 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -179,6 +179,17 @@ def group_min( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... +def group_idxmin_idxmax( + out: np.ndarray, # int64_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: np.ndarray, # const int64_t[:] + is_datetimelike: bool = ..., + mask: np.ndarray | None = ..., + name: str = ..., + min_count: int = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... def group_cummin( out: np.ndarray, # groupby_t[:, ::1] values: np.ndarray, # ndarray[groupby_t, ndim=2] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 5fdb45afe3fdb..061843452e188 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1782,15 +1782,15 @@ cdef group_min_max( @cython.wraparound(False) @cython.boundscheck(False) -cdef group_idxmin_idxmax( +def group_idxmin_idxmax( int64_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_t, ndim=2] values, + ndarray[numeric_object_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - bint compute_max=True, const uint8_t[:, ::1] mask=None, + str name="idxmin", uint8_t[:, ::1] result_mask=None, ): """ @@ -1805,7 +1805,7 @@ cdef group_idxmin_idxmax( Array to store result in. counts : np.ndarray[int64] Input as a zeroed array, populated by group sizes during algorithm - values : array + values : np.ndarray[numeric_object_t, ndim=2] Values to find column-wise min/max of. labels : np.ndarray[np.intp] Labels to group by. @@ -1813,8 +1813,8 @@ cdef group_idxmin_idxmax( Unused. For compatibility with other Cython ops. is_datetimelike : bool True if `values` contains datetime-like entries. - compute_max : bint, default True - True to compute group-wise max, False to compute min + name : {"idxmin", "idxmax"}, default "idxmin" + Whether to compute idxmin or idxmax. mask : ndarray[bool, ndim=2], optional If not None, indices represent missing values, otherwise the mask will not be used @@ -1829,10 +1829,14 @@ cdef group_idxmin_idxmax( """ cdef: Py_ssize_t i, j, N, K, lab - numeric_t val - numeric_t[:, ::1] group_min_or_max + numeric_object_t val + numeric_object_t[:, ::1] group_min_or_max bint uses_mask = mask is not None bint isna_entry + bint compute_max = name == "idxmax" + + assert name == "idxmin" or name == "idxmax" + assert min_count == -1, "'min_count' only used in sum and prod" # TODO(cython3): # Instead of `labels.shape[0]` use `len(labels)` @@ -1840,11 +1844,15 @@ cdef group_idxmin_idxmax( raise AssertionError("len(index) != len(labels)") group_min_or_max = np.empty_like(out, dtype=values.dtype) - group_min_or_max[:] = _get_min_or_max(0, compute_max, is_datetimelike) + group_min_or_max[:] = _get_min_or_max( + 0, compute_max, is_datetimelike + ) + out[:] = -1 N, K = (values).shape - with nogil: + # TODO(cython3): De-duplicate once conditional-nogil is available + if numeric_object_t is object: for i in range(N): lab = labels[i] if lab < 0: @@ -1857,7 +1865,8 @@ cdef group_idxmin_idxmax( if uses_mask: isna_entry = mask[i, j] else: - isna_entry = _treat_as_na(val, is_datetimelike) + # TODO(cython3): use _treat_as_na here + isna_entry = checknull(val) if not isna_entry: if compute_max: @@ -1868,6 +1877,31 @@ cdef group_idxmin_idxmax( if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if not isna_entry: + if compute_max: + if val > group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i + else: + if val < group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i @cython.wraparound(False) @@ -1896,32 +1930,6 @@ def group_max( ) -@cython.wraparound(False) -@cython.boundscheck(False) -def group_idxmax( - int64_t[:, ::1] out, - int64_t[::1] counts, - ndarray[numeric_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None, -) -> None: - """See group_idxmin_idxmax.__doc__""" - group_idxmin_idxmax( - out, - counts, - values, - labels, - min_count=min_count, - is_datetimelike=is_datetimelike, - compute_max=True, - mask=mask, - result_mask=result_mask, - ) - - @cython.wraparound(False) @cython.boundscheck(False) def group_min( @@ -1948,32 +1956,6 @@ def group_min( ) -@cython.wraparound(False) -@cython.boundscheck(False) -def group_idxmin( - int64_t[:, ::1] out, - int64_t[::1] counts, - ndarray[numeric_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None, -) -> None: - """See group_idxmin_idxmax.__doc__""" - group_idxmin_idxmax( - out, - counts, - values, - labels, - min_count=min_count, - is_datetimelike=is_datetimelike, - compute_max=False, - mask=mask, - result_mask=result_mask, - ) - - @cython.boundscheck(False) @cython.wraparound(False) cdef group_cummin_max( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dc091cf0ad3a9..3e9ce92514a56 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1193,8 +1193,7 @@ def idxmin( if result.empty: result = result.astype(self.obj.index.dtype) else: - alt = lambda x: x.idxmin(skipna=skipna) - result = self._idxmax_idxmin("idxmin", alt) + result = self._idxmin_idxmax("idxmin") return result @doc(Series.idxmax.__doc__) @@ -1209,8 +1208,7 @@ def idxmax( if result.empty: result = result.astype(self.obj.index.dtype) else: - alt = lambda x: x.idxmax(skipna=skipna) - result = self._idxmax_idxmin("idxmax", alt) + result = self._idxmin_idxmax("idxmax") return result @doc(Series.corr.__doc__) @@ -2225,10 +2223,9 @@ def func(df): if result.empty: result = result.astype(self.obj.index.dtype) else: - alt = lambda x: x.idxmax(skipna=skipna) # expression has type "Series | DataFrame", variable has type "DataFrame" - result = self._idxmax_idxmin( # type: ignore[assignment] - "idxmax", alt, numeric_only + result = self._idxmin_idxmax( # type: ignore[assignment] + "idxmax", numeric_only ) return result @@ -2330,10 +2327,9 @@ def func(df): if result.empty: result = result.astype(self.obj.index.dtype) else: - alt = lambda x: x.idxmin(skipna=skipna) # expression has type "Series | DataFrame", variable has type "DataFrame" - result = self._idxmax_idxmin( # type: ignore[assignment] - "idxmin", alt, numeric_only + result = self._idxmin_idxmax( # type: ignore[assignment] + "idxmin", numeric_only ) return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cd81f3b4d0e48..7ffc5883e6cc6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1823,7 +1823,7 @@ def _agg_general( min_count: int = -1, *, alias: str, - npfunc: Callable, + npfunc: Callable | None = None, post_process: Callable | None = None, ): result = self._cython_agg_general( @@ -1883,7 +1883,7 @@ def _agg_py_fallback( def _cython_agg_general( self, how: str, - alt: Callable, + alt: Callable | None = None, numeric_only: bool = False, min_count: int = -1, post_process: Callable | None = None, @@ -1909,11 +1909,12 @@ def array_func(values: ArrayLike) -> ArrayLike: # and non-applicable functions # try to python agg # TODO: shouldn't min_count matter? - if how in ["any", "all", "std", "sem"]: + if alt is None or how in ["any", "all", "std", "sem"]: raise # TODO: re-raise as TypeError? should not be reached else: return result + assert alt is not None result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) return result @@ -3226,10 +3227,9 @@ def max( npfunc=np.max, ) - def _idxmax_idxmin( + def _idxmin_idxmax( self, - how: Literal["idxmin", "idxmax"], - alt: Callable, + how: Literal["idxmax", "idxmin"], numeric_only: bool = False, ) -> Series | DataFrame: """Compute idxmax/idxmin. @@ -3262,7 +3262,12 @@ def post_process(x): dtype = ensure_dtype_can_hold_na(result.dtype) if dtype != result.dtype: result = result.astype(dtype) - result[mask] = na_value_for_dtype(result.dtype) + if isinstance(dtype, np.dtype): + result = np.where( + mask, na_value_for_dtype(result.dtype), result + ) + else: + result[mask] = na_value_for_dtype(result.dtype) return result result = self._agg_general( @@ -3270,7 +3275,6 @@ def post_process(x): # min_count is not utilized min_count=-1, alias=how, - npfunc=alt, post_process=post_process, ) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 85a5f881ac495..7ef5818ed4cc0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -133,8 +133,8 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "all": functools.partial(libgroupby.group_any_all, val_test="all"), "sum": "group_sum", "prod": "group_prod", - "idxmin": "group_idxmin", - "idxmax": "group_idxmax", + "idxmin": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmin"), + "idxmax": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmax"), "min": "group_min", "max": "group_max", "mean": "group_mean", @@ -189,7 +189,7 @@ def _get_cython_function( f"function is not implemented for this dtype: " f"[how->{how},dtype->{dtype_str}]" ) - elif how in ["std", "sem"]: + elif how in ["std", "sem", "idxmin", "idxmax"]: # We have a partial object that does not have __signatures__ return f elif how == "skew": @@ -270,7 +270,7 @@ def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: if how == "rank": out_dtype = "float64" - elif how in ("idxmin", "idxmax"): + elif how in ["idxmin", "idxmax"]: # The Cython implementation only produces the row number; we'll take # from the index using this in post processing out_dtype = "int64" @@ -402,12 +402,7 @@ def _call_cython_op( values = self._get_cython_vals(values) out_dtype = self._get_out_dtype(values.dtype) - if self.how in ["idxmin", "idxmax"]: - # A single categorical grouper with observed=False includes unobserved - # categories; encode these missing indices as -1 - result = -1 * np.ones(out_shape, dtype=out_dtype) - else: - result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) + result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if self.how in [ @@ -483,7 +478,7 @@ def _call_cython_op( **kwargs, ) - if self.kind == "aggregate" and self.how not in ("idxmin", "idxmax"): + if self.kind == "aggregate" and self.how not in ["idxmin", "idxmax"]: # i.e. counts is defined. Locations where count{kernel},dtype->object]"), ] ) + if kernel == "idxmin": + msg = "'<' not supported between instances of 'type' and 'type'" + elif kernel == "idxmax": + msg = "'>' not supported between instances of 'type' and 'type'" with pytest.raises(exception, match=msg): method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: From ec6a33b122ffc8d45a04b2cce69db9e629a0b9f9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 24 Jul 2023 20:12:48 -0400 Subject: [PATCH 06/35] Revert --- pandas/core/groupby/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7ffc5883e6cc6..8b6694909a4a8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1869,8 +1869,7 @@ def _agg_py_fallback( # preserve the kind of exception that raised raise type(err)(msg) from err - if ser.dtype == object and how not in ["idxmax", "idxmin"]: - # idxmin/idxmax uses the index dtype + if ser.dtype == object: res_values = res_values.astype(object, copy=False) # If we are DataFrameGroupBy and went through a SeriesGroupByPath From cede8b4bd7c30c654b464a9cf928777cede0e27b Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 29 Jul 2023 11:10:39 -0400 Subject: [PATCH 07/35] Rework --- pandas/core/arrays/categorical.py | 2 +- pandas/core/groupby/groupby.py | 48 +++++++++++++-------- pandas/tests/groupby/test_categorical.py | 43 ++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 7 ++- pandas/tests/groupby/test_groupby_dropna.py | 18 +++++--- pandas/tests/groupby/test_raises.py | 24 ++++++++++- 6 files changed, 113 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4a2e8f25d07eb..feca2f186e388 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2704,7 +2704,7 @@ def _groupby_op( if how == "rank": assert self.ordered # checked earlier npvalues = self._ndarray - elif how in ["first", "last", "min", "max"]: + elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]: npvalues = self._ndarray result_mask = np.zeros(ngroups, dtype=bool) else: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8b6694909a4a8..1770d6f3fd811 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -74,7 +74,6 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import ( coerce_indexer_dtype, ensure_dtype_can_hold_na, - na_value_for_dtype, ) from pandas.core.dtypes.common import ( is_bool_dtype, @@ -1918,9 +1917,9 @@ def array_func(values: ArrayLike) -> ArrayLike: return result new_mgr = data.grouped_reduce(array_func) - if post_process is not None: - new_mgr = new_mgr.apply(post_process) res = self._wrap_agged_manager(new_mgr) + if post_process is not None: + res = post_process(res) out = self._wrap_aggregated_output(res) if self.axis == 1: out = out.infer_objects(copy=False) @@ -3248,25 +3247,36 @@ def _idxmin_idxmax( idxmax or idxmin for the groupby operation. """ - def post_process(x): - if x.size == 0: - result = x.astype(self.obj.index.dtype) + def post_process(res): + if self.observed: + raise_err = False + else: + result_len = np.prod( + [len(ping.group_index) for ping in self.grouper.groupings] + ) + raise_err = len(res) < result_len or (res._values == -1).any(axis=None) + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead" + ) + index = self.obj.index + if res.size == 0: + result = res.astype(index.dtype) else: - index = self.obj.index if isinstance(index, MultiIndex): index = index.to_flat_index() - result = index.take(x).values - mask = x == -1 - if mask.any(): - dtype = ensure_dtype_can_hold_na(result.dtype) - if dtype != result.dtype: - result = result.astype(dtype) - if isinstance(dtype, np.dtype): - result = np.where( - mask, na_value_for_dtype(result.dtype), result - ) - else: - result[mask] = na_value_for_dtype(result.dtype) + + if isinstance(res, Series): + result = res._constructor( + index.take(res), index=res.index, name=res.name + ) + else: + buf = {} + for k, _ in enumerate(res.columns): + buf[k] = index.take(res._ixs(k, axis=1)) + result = self.obj._constructor(buf, index=res.index) + result.columns = res.columns return result result = self._agg_general( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index fcdb930c9d11b..877bc0e877c1a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1399,6 +1399,15 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): return agg = getattr(series_groupby, reduction_func) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) assert len(result) == expected_length @@ -1431,6 +1440,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] agg = getattr(series_groupby, reduction_func) + + if reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1497,6 +1515,15 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) args = get_groupby_method_args(reduction_func, df) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(df_grp, reduction_func)(*args) + return + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1888,6 +1915,15 @@ def test_category_order_reducer( df = df.set_index(keys) args = get_groupby_method_args(reduction_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb, reduction_func)(*args) + return + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories @@ -2087,6 +2123,13 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys gb = gb["b"] args = get_groupby_method_args(reduction_func, df) + if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + gb.agg([reduction_func], *args) + return + result = gb.agg([reduction_func], *args) expected = getattr(gb, reduction_func)(*args) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 458e60576a6b6..29162e39f4f5c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2007,13 +2007,18 @@ def get_categorical_invalid_expected(): with pytest.raises(TypeError, match=msg): get_result() - if isinstance(columns, list): + if op in ["min", "max"] and isinstance(columns, list): # i.e. DataframeGroupBy, not SeriesGroupBy result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() if op in ["idxmax", "idxmin"]: expected = expected.astype(int) tm.assert_equal(result, expected) + elif op in ["idxmin", "idxmax"] and isinstance(columns, list): + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + get_result(numeric_only=True) return if op in ["prod", "sum", "skew"]: diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 338cce1cfcbb5..2fd4b3c66b120 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -501,9 +501,7 @@ def test_null_is_null_for_dtype( @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) -def test_categorical_reducers( - request, reduction_func, observed, sort, as_index, index_kind -): +def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind): # Ensure there is at least one null value by appending to the end values = np.append(np.random.choice([1, 2, None], size=19), None) df = pd.DataFrame( @@ -533,6 +531,17 @@ def test_categorical_reducers( args = (args[0].drop(columns=keys),) args_filled = (args_filled[0].drop(columns=keys),) + gb_keepna = df.groupby( + keys, dropna=False, observed=observed, sort=sort, as_index=as_index + ) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb_keepna, reduction_func)(*args) + return + gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].replace(4, None) @@ -563,9 +572,6 @@ def test_categorical_reducers( if as_index: expected = expected["size"].rename(None) - gb_keepna = df.groupby( - keys, dropna=False, observed=observed, sort=sort, as_index=as_index - ) if as_index or index_kind == "range" or reduction_func == "size": warn = None else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 1a606e1ef9235..156409cf052f8 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -572,6 +572,22 @@ def test_groupby_raises_category_on_category( return empty_groups = any(group.empty for group in gb.groups.values()) + if ( + not observed + and how != "transform" + and isinstance(by, list) + and isinstance(by[0], str) + and by == ["a", "b"] + ): + assert not empty_groups + # TODO: empty_groups should be true due to unobserved categorical combinations + empty_groups = True + if how == "transform": + # empty groups will be ignored + empty_groups = False + if isinstance(by, Grouper) and groupby_func in ["idxmin", "idxmax"] and observed: + # TODO: grouping by Grouper always treats observed as False + empty_groups = False klass, msg = { "all": (None, ""), @@ -617,8 +633,12 @@ def test_groupby_raises_category_on_category( if not using_copy_on_write else (None, ""), # no-op with CoW "first": (None, ""), - "idxmax": (None, "") if empty_groups else (None, ""), - "idxmin": (None, "") if empty_groups else (None, ""), + "idxmax": (ValueError, "empty group due to unobserved categories") + if empty_groups + else (None, ""), + "idxmin": (ValueError, "empty group due to unobserved categories") + if empty_groups + else (None, ""), "last": (None, ""), "max": (None, ""), "mean": (TypeError, "category dtype does not support aggregation 'mean'"), From 9e4aabc2a05165b76f35d080d44dd3bf91a619b6 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 30 Jul 2023 09:11:51 -0400 Subject: [PATCH 08/35] Refinements --- pandas/core/groupby/groupby.py | 10 ++++------ pandas/core/groupby/ops.py | 2 +- pandas/core/indexes/datetimelike.py | 9 ++++----- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 11f86096f7109..65900cd8afce8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3243,8 +3243,6 @@ def _idxmin_idxmax( ---------- how: {"idxmin", "idxmax"} Whether to compute idxmin or idxmax. - alt: Callable - Fallback for non-Cython (e.g. object) types. numeric_only : bool, default False Include only float, int, boolean columns. @@ -3255,17 +3253,17 @@ def _idxmin_idxmax( """ def post_process(res): - if self.observed: - raise_err = False - else: + if not self.observed: result_len = np.prod( [len(ping.group_index) for ping in self.grouper.groupings] ) raise_err = len(res) < result_len or (res._values == -1).any(axis=None) + else: + raise_err = False if raise_err: raise ValueError( f"Can't get {how} of an empty group due to unobserved categories. " - "Specify observed=True in groupby instead" + "Specify observed=True in groupby instead." ) index = self.obj.index if res.size == 0: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7ef5818ed4cc0..01fa7e9d99f6e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -273,7 +273,7 @@ def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: elif how in ["idxmin", "idxmax"]: # The Cython implementation only produces the row number; we'll take # from the index using this in post processing - out_dtype = "int64" + out_dtype = "intp" else: if dtype.kind in "iufcb": out_dtype = f"{dtype.kind}{dtype.itemsize}" diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b5b552511826a..d5a292335a5f6 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -812,9 +812,8 @@ def take( self, indices, axis, allow_fill, fill_value, **kwargs ) - if indices.ndim == 1: - maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) - if isinstance(maybe_slice, slice): - freq = self._data._get_getitem_freq(maybe_slice) - result._data._freq = freq + maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) + if isinstance(maybe_slice, slice): + freq = self._data._get_getitem_freq(maybe_slice) + result._data._freq = freq return result From 3e3f14b7912df82e6bd60d82b2703f359e3eaa88 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 30 Jul 2023 19:18:24 -0400 Subject: [PATCH 09/35] fixup --- pandas/_libs/groupby.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 8ba33f2dd4f2e..a48c99ac9e6cf 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -184,10 +184,10 @@ def group_idxmin_idxmax( counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[groupby_t, ndim=2] labels: np.ndarray, # const int64_t[:] + min_count: int = ..., is_datetimelike: bool = ..., mask: np.ndarray | None = ..., name: str = ..., - min_count: int = ..., result_mask: np.ndarray | None = ..., ) -> None: ... def group_cummin( From bfc9cc9a01b109fd8dd675fc1a3d71f9d7af987e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 31 Jul 2023 17:26:49 -0400 Subject: [PATCH 10/35] Fixup, show stderr in ASVs --- .github/workflows/code-checks.yml | 2 +- pandas/_libs/groupby.pyi | 4 ++-- pandas/_libs/groupby.pyx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 0954bf7d9231c..de75ea125945d 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -124,7 +124,7 @@ jobs: run: | cd asv_bench asv machine --yes - asv run --quick --dry-run --strict --durations=30 --python=same + asv run --quick --dry-run --strict --durations=30 --python=same --show-stderr build_docker_dev_environment: name: Build Docker Dev Environment diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index a48c99ac9e6cf..244a41102e01f 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -180,10 +180,10 @@ def group_min( result_mask: np.ndarray | None = ..., ) -> None: ... def group_idxmin_idxmax( - out: np.ndarray, # int64_t[:, ::1] + out: np.ndarray, # intp_t[:, ::1] counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[groupby_t, ndim=2] - labels: np.ndarray, # const int64_t[:] + labels: np.ndarray, # const intp[:] min_count: int = ..., is_datetimelike: bool = ..., mask: np.ndarray | None = ..., diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 061843452e188..0a817c4832f72 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1783,7 +1783,7 @@ cdef group_min_max( @cython.wraparound(False) @cython.boundscheck(False) def group_idxmin_idxmax( - int64_t[:, ::1] out, + intp_t[:, ::1] out, int64_t[::1] counts, ndarray[numeric_object_t, ndim=2] values, const intp_t[::1] labels, From 212247b12ad92627575726c1063a4f61caeadb90 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 31 Jul 2023 21:17:12 -0400 Subject: [PATCH 11/35] Remove idxmin/idxmax from numba ASVs --- asv_bench/benchmarks/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 4476bdbf54eb3..480d69adfb508 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -73,6 +73,8 @@ "ffill", "first", "head", + "idxmax", + "idxmin", "last", "median", "nunique", From 590b6d4e7eead808331ef91e656f3695f5a61802 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 2 Aug 2023 23:45:09 -0400 Subject: [PATCH 12/35] WIP --- pandas/core/groupby/generic.py | 10 +++++----- pandas/core/groupby/groupby.py | 29 +++++++++++++++++++++++++++-- pandas/tests/groupby/test_raises.py | 19 ++++++++++++------- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cb2d7d4e7ec4f..3a5119d56e8d4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1190,7 +1190,7 @@ def idxmin( if result.empty: result = result.astype(self.obj.index.dtype) else: - result = self._idxmin_idxmax("idxmin") + result = self._idxmin_idxmax("idxmin", skipna=skipna) return result @doc(Series.idxmax.__doc__) @@ -1205,7 +1205,7 @@ def idxmax( if result.empty: result = result.astype(self.obj.index.dtype) else: - result = self._idxmin_idxmax("idxmax") + result = self._idxmin_idxmax("idxmax", skipna=skipna) return result @doc(Series.corr.__doc__) @@ -2222,7 +2222,7 @@ def func(df): else: # expression has type "Series | DataFrame", variable has type "DataFrame" result = self._idxmin_idxmax( # type: ignore[assignment] - "idxmax", numeric_only + "idxmax", numeric_only, skipna=skipna ) return result @@ -2312,7 +2312,7 @@ def idxmin( else: axis = self.axis - if axis == 1: + if axis == 1 or not skipna: def func(df): return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only) @@ -2326,7 +2326,7 @@ def func(df): else: # expression has type "Series | DataFrame", variable has type "DataFrame" result = self._idxmin_idxmax( # type: ignore[assignment] - "idxmin", numeric_only + "idxmin", numeric_only, skipna=skipna ) return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 65900cd8afce8..eba37424758d0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3236,6 +3236,7 @@ def _idxmin_idxmax( self, how: Literal["idxmax", "idxmin"], numeric_only: bool = False, + skipna: bool = True, ) -> Series | DataFrame: """Compute idxmax/idxmin. @@ -3245,6 +3246,9 @@ def _idxmin_idxmax( Whether to compute idxmin or idxmax. numeric_only : bool, default False Include only float, int, boolean columns. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. Returns ------- @@ -3253,11 +3257,18 @@ def _idxmin_idxmax( """ def post_process(res): + has_na_value = (res._values == -1).any(axis=None) if not self.observed: result_len = np.prod( [len(ping.group_index) for ping in self.grouper.groupings] ) - raise_err = len(res) < result_len or (res._values == -1).any(axis=None) + raise_err = len(res) < result_len or ( + # When there is one categorical grouping, unobserved categories + # are included in the computation + len(self.grouper.groupings) == 1 + and self.grouper.groupings[0]._passed_categorical + and has_na_value + ) else: raise_err = False if raise_err: @@ -3265,6 +3276,15 @@ def post_process(res): f"Can't get {how} of an empty group due to unobserved categories. " "Specify observed=True in groupby instead." ) + elif has_na_value: + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + index = self.obj.index if res.size == 0: result = res.astype(index.dtype) @@ -3277,9 +3297,14 @@ def post_process(res): index.take(res), index=res.index, name=res.name ) else: + from pandas.core.dtypes.missing import na_value_for_dtype + + na_value = na_value_for_dtype(index.dtype, compat=False) buf = {} for k, _ in enumerate(res.columns): - buf[k] = index.take(res._ixs(k, axis=1)) + buf[k] = index.array.take( + res._ixs(k, axis=1), allow_fill=True, fill_value=na_value + ) result = self.obj._constructor(buf, index=res.index) result.columns = res.columns return result diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 156409cf052f8..430acf0c537d2 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -98,13 +98,14 @@ def df_with_cat_col(): def _call_and_check(klass, msg, how, gb, groupby_func, args): - if klass is None: - if how == "method": - getattr(gb, groupby_func)(*args) - elif how == "agg": - gb.agg(groupby_func, *args) - else: - gb.transform(groupby_func, *args) + if klass is None or klass is FutureWarning: + with tm.assert_produces_warning(klass, match=msg): + if how == "method": + getattr(gb, groupby_func)(*args) + elif how == "agg": + gb.agg(groupby_func, *args) + else: + gb.transform(groupby_func, *args) else: with pytest.raises(klass, match=msg): if how == "method": @@ -691,6 +692,10 @@ def test_groupby_raises_category_on_category( ), }[groupby_func] + # if groupby_func in ["idxmin", "idxmax"] and how == "transform": + # klass = FutureWarning + # msg = "The behavior of SeriesGroupBy.idxmax with all-NA values" + _call_and_check(klass, msg, how, gb, groupby_func, args) From 8043f292d07e2102f8c4626b6fe7ff856488da33 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 14 Aug 2023 04:50:59 -0400 Subject: [PATCH 13/35] WIP --- pandas/_libs/groupby.pyx | 4 ++-- pandas/core/groupby/groupby.py | 16 ++++++++++++---- pandas/core/groupby/ops.py | 4 ++++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 0f38604bb7a8a..f66e1c3684233 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1824,7 +1824,8 @@ def group_idxmin_idxmax( labels : np.ndarray[np.intp] Labels to group by. min_count : Py_ssize_t, default -1 - Unused. For compatibility with other Cython ops. + The minimum number of non-NA group elements, NA result if threshold + is not met. is_datetimelike : bool True if `values` contains datetime-like entries. name : {"idxmin", "idxmax"}, default "idxmin" @@ -1850,7 +1851,6 @@ def group_idxmin_idxmax( bint compute_max = name == "idxmax" assert name == "idxmin" or name == "idxmax" - assert min_count == -1, "'min_count' only used in sum and prod" # TODO(cython3): # Instead of `labels.shape[0]` use `len(labels)` diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 83e7dc08a7f8f..1dba86be7649b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3244,6 +3244,7 @@ def _idxmin_idxmax( how: Literal["idxmax", "idxmin"], numeric_only: bool = False, skipna: bool = True, + ignore_unobserved: bool = False, ) -> Series | DataFrame: """Compute idxmax/idxmin. @@ -3256,6 +3257,9 @@ def _idxmin_idxmax( skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. + ignore_unobserved : bool, default False + When an unobserved group is encountered, do not raise. This used for + transform where unobserved groups do not play an impact on the result. Returns ------- @@ -3265,7 +3269,12 @@ def _idxmin_idxmax( def post_process(res): has_na_value = (res._values == -1).any(axis=None) - if not self.observed: + if not self.observed and not ignore_unobserved: + # Raise if there are unobserved categories. When there are + # multiple groupings, the categories are not included in the + # upfront computation so we compare the final result length + # with the current length + has_unobserved = (res._values == -2).any(axis=None) result_len = np.prod( [len(ping.group_index) for ping in self.grouper.groupings] ) @@ -3274,7 +3283,7 @@ def post_process(res): # are included in the computation len(self.grouper.groupings) == 1 and self.grouper.groupings[0]._passed_categorical - and has_na_value + and has_unobserved ) else: raise_err = False @@ -3318,8 +3327,7 @@ def post_process(res): result = self._agg_general( numeric_only=numeric_only, - # min_count is not utilized - min_count=-1, + min_count=1, alias=how, post_process=post_process, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 01fa7e9d99f6e..90d00eb473fa2 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -497,6 +497,10 @@ def _call_cython_op( result = result.T + if self.how in ["idxmin", "idxmax"]: + # Caller needs to differentiate between all NA input and unobserved + result = np.where(counts < min_count, -2, result) + if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here From 056c5904aa38d5ce59b91948db99f854d5dc02db Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 14 Aug 2023 05:29:31 -0400 Subject: [PATCH 14/35] Rework --- pandas/core/groupby/groupby.py | 16 ++++++++--- pandas/core/groupby/grouper.py | 8 ++++-- pandas/tests/groupby/test_raises.py | 42 +++++++++++++---------------- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a39d6b460fced..37e56ba50f21b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1968,7 +1968,9 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # result to the whole group. Compute func result # and deal with possible broadcasting below. # Temporarily set observed for dealing with categoricals. - with com.temp_setattr(self, "observed", True): + if func in ["idxmin", "idxmax"]: + result = self._idxmin_idxmax("idxmin", **kwargs, ignore_unobserved=True) + else: with com.temp_setattr(self, "as_index", True): # GH#49834 - result needs groups in the index for # _wrap_transform_fast_result @@ -3306,6 +3308,12 @@ def post_process(res): ) index = self.obj.index + + values = res._values + if not self.observed and ignore_unobserved: + # -2 indicates unobserved category; recast as NA + values = np.where(values == -2, -1, values) + if res.size == 0: result = res.astype(index.dtype) else: @@ -3314,16 +3322,16 @@ def post_process(res): if isinstance(res, Series): result = res._constructor( - index.take(res), index=res.index, name=res.name + index.take(values), index=res.index, name=res.name ) else: from pandas.core.dtypes.missing import na_value_for_dtype na_value = na_value_for_dtype(index.dtype, compat=False) buf = {} - for k, _ in enumerate(res.columns): + for k, column_values in enumerate(values.T): buf[k] = index.array.take( - res._ixs(k, axis=1), allow_fill=True, fill_value=na_value + column_values, allow_fill=True, fill_value=na_value ) result = self.obj._constructor(buf, index=res.index) result.columns = res.columns diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index ea92fbae9566d..c648d2b00ba7c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -297,7 +297,10 @@ def __init__( self._indexer = None def _get_grouper( - self, obj: NDFrameT, validate: bool = True + self, + obj: NDFrameT, + validate: bool = True, + observed: bool = False, ) -> tuple[ops.BaseGrouper, NDFrameT]: """ Parameters @@ -317,6 +320,7 @@ def _get_grouper( axis=self.axis, level=self.level, sort=self.sort, + observed=observed, validate=validate, dropna=self.dropna, ) @@ -892,7 +896,7 @@ def get_grouper( # a passed-in Grouper, directly convert if isinstance(key, Grouper): - grouper, obj = key._get_grouper(obj, validate=False) + grouper, obj = key._get_grouper(obj, validate=False, observed=observed) if key.key is None: return grouper, frozenset(), obj else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 430acf0c537d2..601f7b68daff4 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -97,23 +97,24 @@ def df_with_cat_col(): return df -def _call_and_check(klass, msg, how, gb, groupby_func, args): - if klass is None or klass is FutureWarning: - with tm.assert_produces_warning(klass, match=msg): - if how == "method": - getattr(gb, groupby_func)(*args) - elif how == "agg": - gb.agg(groupby_func, *args) - else: - gb.transform(groupby_func, *args) - else: - with pytest.raises(klass, match=msg): +def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): + warn_klass = None if warn_msg == "" else FutureWarning + with tm.assert_produces_warning(warn_klass, match=warn_msg): + if klass is None: if how == "method": getattr(gb, groupby_func)(*args) elif how == "agg": gb.agg(groupby_func, *args) else: gb.transform(groupby_func, *args) + else: + with pytest.raises(klass, match=msg): + if how == "method": + getattr(gb, groupby_func)(*args) + elif how == "agg": + gb.agg(groupby_func, *args) + else: + gb.transform(groupby_func, *args) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -234,8 +235,7 @@ def test_groupby_raises_string_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -298,13 +298,11 @@ def test_groupby_raises_datetime( "var": (TypeError, "datetime64 type does not support var operations"), }[groupby_func] - warn = None - warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" if groupby_func in ["any", "all"]: - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func, args) + warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -343,8 +341,7 @@ def test_groupby_raises_datetime_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) @@ -541,8 +538,7 @@ def test_groupby_raises_category_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) From c0c79ef71e4f8dfc57ed2be1a47bb3ad40f82379 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 14 Aug 2023 07:29:56 -0400 Subject: [PATCH 15/35] Rework --- pandas/_libs/groupby.pyx | 36 +++++++++++++++++++-------- pandas/core/frame.py | 20 +++++++++++++-- pandas/core/groupby/generic.py | 12 +++------ pandas/core/groupby/groupby.py | 35 ++++++++++++++++---------- pandas/core/groupby/ops.py | 5 +--- pandas/core/resample.py | 2 +- pandas/tests/groupby/test_function.py | 32 ++++++++++++++++++++++++ 7 files changed, 103 insertions(+), 39 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f66e1c3684233..5fa16804bc56d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1805,6 +1805,7 @@ def group_idxmin_idxmax( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, str name="idxmin", + bint skipna=True, uint8_t[:, ::1] result_mask=None, ): """ @@ -1833,6 +1834,8 @@ def group_idxmin_idxmax( mask : ndarray[bool, ndim=2], optional If not None, indices represent missing values, otherwise the mask will not be used + skipna : bool, default True + Flag to ignore nan values during truth testing result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. @@ -1858,10 +1861,11 @@ def group_idxmin_idxmax( raise AssertionError("len(index) != len(labels)") group_min_or_max = np.empty_like(out, dtype=values.dtype) - group_min_or_max[:] = _get_min_or_max( - 0, compute_max, is_datetimelike - ) - out[:] = -1 + # group_min_or_max[:] = _get_min_or_max( + # 0, compute_max, is_datetimelike + # ) + # Using -2 to indicate a label is never seen + out[:] = -2 N, K = (values).shape @@ -1874,6 +1878,9 @@ def group_idxmin_idxmax( counts[lab] += 1 for j in range(K): + if not skipna and out[lab, j] == -1: + # Once we've hit NA there is no going back + continue val = values[i, j] if uses_mask: @@ -1882,13 +1889,16 @@ def group_idxmin_idxmax( # TODO(cython3): use _treat_as_na here isna_entry = checknull(val) - if not isna_entry: + if isna_entry: + if out[lab, j] == -2 or not skipna: + out[lab, j] = -1 + else: if compute_max: - if val > group_min_or_max[lab, j]: + if out[lab, j] == -2 or val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i else: - if val < group_min_or_max[lab, j]: + if out[lab, j] == -2 or val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i else: @@ -1900,6 +1910,9 @@ def group_idxmin_idxmax( counts[lab] += 1 for j in range(K): + if not skipna and out[lab, j] == -1: + # Once we've hit NA there is no going back + continue val = values[i, j] if uses_mask: @@ -1907,13 +1920,16 @@ def group_idxmin_idxmax( else: isna_entry = _treat_as_na(val, is_datetimelike) - if not isna_entry: + if isna_entry: + if out[lab, j] == -2 or not skipna: + out[lab, j] = -1 + else: if compute_max: - if val > group_min_or_max[lab, j]: + if out[lab, j] == -2 or val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i else: - if val < group_min_or_max[lab, j]: + if out[lab, j] == -2 or val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9939daadd9237..fa2d71d1eb69a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -66,7 +66,10 @@ Substitution, doc, ) -from pandas.util._exceptions import find_stack_level +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) from pandas.util._validators import ( validate_ascending, validate_bool_kwarg, @@ -11190,7 +11193,20 @@ def _get_data() -> DataFrame: row_index = np.tile(np.arange(nrows), ncols) col_index = np.repeat(np.arange(ncols), nrows) ser = Series(arr, index=col_index, copy=False) - result = ser.groupby(row_index).agg(name, **kwds) + # GroupBy will raise a warning with SeriesGroupBy as the object, + # likely confusing users + with rewrite_warning( + target_message=( + f"The behavior of SeriesGroupBy.{name} with all-NA values" + ), + target_category=FutureWarning, + new_message=( + f"The behavior of {type(self).__name__}.{name} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In " + "a future version this will raise ValueError" + ), + ): + result = ser.groupby(row_index).agg(name, **kwds) result.index = df.index if not skipna and name not in ("any", "all"): mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0648de7bce668..5fbe6ac3db199 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2220,10 +2220,7 @@ def func(df): if result.empty: result = result.astype(self.obj.index.dtype) else: - # expression has type "Series | DataFrame", variable has type "DataFrame" - result = self._idxmin_idxmax( # type: ignore[assignment] - "idxmax", numeric_only, skipna=skipna - ) + result = self._idxmin_idxmax("idxmax", numeric_only, skipna=skipna) return result def idxmin( @@ -2312,7 +2309,7 @@ def idxmin( else: axis = self.axis - if axis == 1 or not skipna: + if axis == 1: def func(df): return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only) @@ -2324,10 +2321,7 @@ def func(df): if result.empty: result = result.astype(self.obj.index.dtype) else: - # expression has type "Series | DataFrame", variable has type "DataFrame" - result = self._idxmin_idxmax( # type: ignore[assignment] - "idxmin", numeric_only, skipna=skipna - ) + result = self._idxmin_idxmax("idxmin", numeric_only, skipna=skipna) return result boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 37e56ba50f21b..83f793f440afc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1830,6 +1830,7 @@ def _agg_general( alias: str, npfunc: Callable | None = None, post_process: Callable | None = None, + **kwargs, ): result = self._cython_agg_general( how=alias, @@ -1837,6 +1838,7 @@ def _agg_general( numeric_only=numeric_only, min_count=min_count, post_process=post_process, + **kwargs, ) return result.__finalize__(self.obj, method="groupby") @@ -1968,10 +1970,13 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # result to the whole group. Compute func result # and deal with possible broadcasting below. # Temporarily set observed for dealing with categoricals. - if func in ["idxmin", "idxmax"]: - result = self._idxmin_idxmax("idxmin", **kwargs, ignore_unobserved=True) - else: - with com.temp_setattr(self, "as_index", True): + with com.temp_setattr(self, "as_index", True): + if func in ["idxmin", "idxmax"]: + # mypy doesn't recognize func as Literal["idxmin", "idxmax"] + result = self._idxmin_idxmax( # type: ignore[arg-type] + func, **kwargs, ignore_unobserved=True + ) + else: # GH#49834 - result needs groups in the index for # _wrap_transform_fast_result if engine is not None: @@ -3251,7 +3256,7 @@ def _idxmin_idxmax( numeric_only: bool = False, skipna: bool = True, ignore_unobserved: bool = False, - ) -> Series | DataFrame: + ) -> NDFrameT: """Compute idxmax/idxmin. Parameters @@ -3275,12 +3280,12 @@ def _idxmin_idxmax( def post_process(res): has_na_value = (res._values == -1).any(axis=None) + has_unobserved = (res._values == -2).any(axis=None) if not self.observed and not ignore_unobserved: # Raise if there are unobserved categories. When there are # multiple groupings, the categories are not included in the # upfront computation so we compare the final result length # with the current length - has_unobserved = (res._values == -2).any(axis=None) result_len = np.prod( [len(ping.group_index) for ping in self.grouper.groupings] ) @@ -3292,7 +3297,7 @@ def post_process(res): and has_unobserved ) else: - raise_err = False + raise_err = not skipna and has_na_value if raise_err: raise ValueError( f"Can't get {how} of an empty group due to unobserved categories. " @@ -3307,10 +3312,10 @@ def post_process(res): stacklevel=find_stack_level(), ) - index = self.obj.index + index = self.obj._get_axis(self.axis) values = res._values - if not self.observed and ignore_unobserved: + if not self.observed and ignore_unobserved and has_unobserved: # -2 indicates unobserved category; recast as NA values = np.where(values == -2, -1, values) @@ -3320,14 +3325,17 @@ def post_process(res): if isinstance(index, MultiIndex): index = index.to_flat_index() + from pandas.core.dtypes.missing import na_value_for_dtype + + na_value = na_value_for_dtype(index.dtype, compat=False) + if isinstance(res, Series): result = res._constructor( - index.take(values), index=res.index, name=res.name + index.array.take(values, allow_fill=True, fill_value=na_value), + index=res.index, + name=res.name, ) else: - from pandas.core.dtypes.missing import na_value_for_dtype - - na_value = na_value_for_dtype(index.dtype, compat=False) buf = {} for k, column_values in enumerate(values.T): buf[k] = index.array.take( @@ -3342,6 +3350,7 @@ def post_process(res): min_count=1, alias=how, post_process=post_process, + skipna=skipna, ) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 90d00eb473fa2..144841c75a138 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -424,6 +424,7 @@ def _call_cython_op( mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, + **kwargs, ) elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: if self.how in ["std", "sem"]: @@ -497,10 +498,6 @@ def _call_cython_op( result = result.T - if self.how in ["idxmin", "idxmax"]: - # Caller needs to differentiate between all NA input and unobserved - result = np.where(counts < min_count, -2, result) - if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9b8d1c870091d..468775cac2017 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2164,7 +2164,7 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: ) def _get_grouper( - self, obj: NDFrameT, validate: bool = True + self, obj: NDFrameT, validate: bool = True, observed: bool = False ) -> tuple[BinGrouper, NDFrameT]: # create the resampler and return our binner r = self._get_resampler(obj) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0abf6428730ff..c1344bb1b41c1 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -542,6 +542,38 @@ def test_idxmin_idxmax_axis1(): gb2.idxmax(axis=1) +@pytest.mark.parametrize( + "func, values, expected_values, warn", + [ + ("idxmin", [0, 1, 2], [0, 2], None), + ("idxmax", [0, 1, 2], [1, 2], None), + ("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning), + ("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning), + ], +) +@pytest.mark.parametrize("test_series", [True, False]) +def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series): + df = DataFrame( + { + "a": [1, 1, 2], + "b": values, + } + ) + gb = df.groupby("a") + index = Index([1, 2], name="a") + expected = DataFrame({"b": expected_values}, index=index) + if test_series: + gb = gb["b"] + expected = expected["b"] + klass = "Series" if test_series else "DataFrame" + msg = f"The behavior of {klass}GroupBy.{func} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, func)(skipna=False) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_axis1_numeric_only(request, groupby_func, numeric_only): if groupby_func in ("idxmax", "idxmin"): From b2641a4fc78c7f1e8826aec34b68dd5c319e941c Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 14 Aug 2023 07:32:03 -0400 Subject: [PATCH 16/35] fixup --- pandas/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 83f793f440afc..a6a1ebad31f4b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1973,8 +1973,8 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): with com.temp_setattr(self, "as_index", True): if func in ["idxmin", "idxmax"]: # mypy doesn't recognize func as Literal["idxmin", "idxmax"] - result = self._idxmin_idxmax( # type: ignore[arg-type] - func, **kwargs, ignore_unobserved=True + result = self._idxmin_idxmax( + func, **kwargs, ignore_unobserved=True # type: ignore[arg-type] ) else: # GH#49834 - result needs groups in the index for From cbcca6df1371194db3f14a2d3fa66844873418f5 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 14 Aug 2023 07:33:35 -0400 Subject: [PATCH 17/35] whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 -- doc/source/whatsnew/v2.2.0.rst | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 0bf925e17d2b3..d1a689dc60830 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -635,7 +635,6 @@ Performance improvements - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) -- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement in various :class:`MultiIndex` set and indexing operations (:issue:`53955`) @@ -806,7 +805,6 @@ Groupby/resample/rolling - Bug in :meth:`.SeriesGroupBy.nth` and :meth:`.DataFrameGroupBy.nth` raised after performing column selection when using ``dropna="any"`` or ``dropna="all"`` resulted in rows being dropped (:issue:`53518`) - Bug in :meth:`.SeriesGroupBy.sum` and :meth:`.DataFrameGroupby.sum` summing ``np.inf + np.inf`` and ``(-np.inf) + (-np.inf)`` to ``np.nan`` (:issue:`53606`) - Bug in :meth:`Series.groupby` raising an error when grouped :class:`Series` has a :class:`DatetimeIndex` index and a :class:`Series` with a name that is a month is given to the ``by`` argument (:issue:`48509`) -- Bug in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` would fail if the groupings have unobserved categories (:issue:`10694`) Reshaping ^^^^^^^^^ diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c35473b852eb9..76568da27d706 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -100,7 +100,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - .. --------------------------------------------------------------------------- From 03d45105877685d45c13a8efbfd16982f3c8790b Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 14 Aug 2023 12:13:57 -0400 Subject: [PATCH 18/35] refinements --- pandas/_libs/groupby.pyi | 1 + pandas/_libs/groupby.pyx | 3 --- pandas/core/groupby/groupby.py | 36 ++++++++++++--------------- pandas/core/groupby/ops.py | 3 +-- pandas/tests/groupby/test_function.py | 1 + pandas/tests/groupby/test_groupby.py | 6 +---- pandas/tests/groupby/test_raises.py | 10 +++----- 7 files changed, 23 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 2cca115c4b003..d7db0f38d090e 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -190,6 +190,7 @@ def group_idxmin_idxmax( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., name: str = ..., + skipna: bool = ..., result_mask: np.ndarray | None = ..., ) -> None: ... def group_cummin( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 5fa16804bc56d..f99d2022c45a5 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1861,9 +1861,6 @@ def group_idxmin_idxmax( raise AssertionError("len(index) != len(labels)") group_min_or_max = np.empty_like(out, dtype=values.dtype) - # group_min_or_max[:] = _get_min_or_max( - # 0, compute_max, is_datetimelike - # ) # Using -2 to indicate a label is never seen out[:] = -2 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a6a1ebad31f4b..f62ad0d2a9d37 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -89,6 +89,7 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import ( isna, + na_value_for_dtype, notna, ) @@ -3281,29 +3282,26 @@ def _idxmin_idxmax( def post_process(res): has_na_value = (res._values == -1).any(axis=None) has_unobserved = (res._values == -2).any(axis=None) - if not self.observed and not ignore_unobserved: - # Raise if there are unobserved categories. When there are - # multiple groupings, the categories are not included in the - # upfront computation so we compare the final result length - # with the current length + raise_err = not ignore_unobserved and has_unobserved + if ( + not self.observed + and not ignore_unobserved + and not raise_err + and (len(self.grouper.groupings) > 1 or res.empty) + ): + # When there are multiple groupings or the result is empty, the + # categories are not included in the upfront computation so we + # compare the final result length with the current length result_len = np.prod( [len(ping.group_index) for ping in self.grouper.groupings] ) - raise_err = len(res) < result_len or ( - # When there is one categorical grouping, unobserved categories - # are included in the computation - len(self.grouper.groupings) == 1 - and self.grouper.groupings[0]._passed_categorical - and has_unobserved - ) - else: - raise_err = not skipna and has_na_value + raise_err = len(res) < result_len if raise_err: raise ValueError( f"Can't get {how} of an empty group due to unobserved categories. " "Specify observed=True in groupby instead." ) - elif has_na_value: + elif not skipna and has_na_value: warnings.warn( f"The behavior of {type(self).__name__}.{how} with all-NA " "values, or any-NA and skipna=False, is deprecated. In a future " @@ -3316,17 +3314,15 @@ def post_process(res): values = res._values if not self.observed and ignore_unobserved and has_unobserved: - # -2 indicates unobserved category; recast as NA - values = np.where(values == -2, -1, values) + # -2 indicates unobserved category; recode as 0 to not unnecessarily + # coerce the resulting dtype. These values will be removed. + values = np.where(values == -2, 0, values) if res.size == 0: result = res.astype(index.dtype) else: if isinstance(index, MultiIndex): index = index.to_flat_index() - - from pandas.core.dtypes.missing import na_value_for_dtype - na_value = na_value_for_dtype(index.dtype, compat=False) if isinstance(res, Series): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 144841c75a138..823029282c36a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -482,8 +482,7 @@ def _call_cython_op( if self.kind == "aggregate" and self.how not in ["idxmin", "idxmax"]: # i.e. counts is defined. Locations where count Date: Fri, 18 Aug 2023 04:44:17 -0400 Subject: [PATCH 19/35] cleanup --- pandas/tests/groupby/test_raises.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index eeda2af9f3ae0..dc91fa145f584 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -582,9 +582,6 @@ def test_groupby_raises_category_on_category( if how == "transform": # empty groups will be ignored empty_groups = False - # if isinstance(by, Grouper) and groupby_func in ["idxmin", "idxmax"] and observed: - # # TODO: grouping by Grouper always treats observed as False - # empty_groups = False klass, msg = { "all": (None, ""), From 448581a8a4e67e7297b7c910ff1fb6fa93b508c3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 13 Sep 2023 16:31:00 -0400 Subject: [PATCH 20/35] fixup type-hints in groupby.pyi --- pandas/_libs/groupby.pyi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index d7db0f38d090e..9aa89e62fde1f 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -182,10 +182,10 @@ def group_min( result_mask: np.ndarray | None = ..., ) -> None: ... def group_idxmin_idxmax( - out: np.ndarray, # intp_t[:, ::1] - counts: np.ndarray, # int64_t[::1] + out: npt.NDArray[np.intp], + counts: npt.NDArray[np.int64], values: np.ndarray, # ndarray[groupby_t, ndim=2] - labels: np.ndarray, # const intp[:] + labels: npt.NDArray[np.int64], # const intp[:] min_count: int = ..., is_datetimelike: bool = ..., mask: np.ndarray | None = ..., From ff32210fb608cff6ae3c8c3f2f002772dd2097ba Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 14 Sep 2023 16:24:02 -0400 Subject: [PATCH 21/35] Use mask instead of sentinel --- doc/source/whatsnew/v2.1.0.rst | 1 - pandas/_libs/groupby.pyx | 35 ++++++++++++++++++++++------------ pandas/core/groupby/groupby.py | 19 ++++++++---------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7341df4ad010f..040ca048d1224 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -642,7 +642,6 @@ Performance improvements - Performance improvement in :meth:`Series.str.get` for PyArrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.str.split` with ``expand=True`` for PyArrow-backed strings (:issue:`53585`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a NumPy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) -- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a PyArrow timestamp or duration dtype to NumPy (:issue:`53326`) - Performance improvement in various :class:`MultiIndex` set and indexing operations (:issue:`53955`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArray` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 303da82e194d3..8ab6e469fa664 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1807,6 +1807,7 @@ def group_idxmin_idxmax( str name="idxmin", bint skipna=True, uint8_t[:, ::1] result_mask=None, + uint8_t[::1] seen=None, ): """ Compute index of minimum/maximum of columns of `values`, in row groups `labels`. @@ -1839,6 +1840,10 @@ def group_idxmin_idxmax( result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. + seen : ndarray[bool] + Whether a group has been seen. While duplicative of counts, this is passed + by idxmin/idxmax and modified inplace in order to make it back to the GroupBy + method. Notes ----- @@ -1860,20 +1865,26 @@ def group_idxmin_idxmax( if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") - group_min_or_max = np.empty_like(out, dtype=values.dtype) - # Using -2 to indicate a label is never seen - out[:] = -2 - N, K = (values).shape + group_min_or_max = np.empty_like(out, dtype=values.dtype) + if N > 0 and K > 0: + # When N or K is zero, we never use group_min_or_max + group_min_or_max[:] = _get_min_or_max( + values[0, 0], compute_max, is_datetimelike + ) + # When using transform, we need a valid value for take in the case + # a category is not observed; these values will be dropped + out[:] = 0 + # TODO(cython3): De-duplicate once conditional-nogil is available if numeric_object_t is object: for i in range(N): lab = labels[i] if lab < 0: continue + seen[lab] = 1 - counts[lab] += 1 for j in range(K): if not skipna and out[lab, j] == -1: # Once we've hit NA there is no going back @@ -1887,15 +1898,15 @@ def group_idxmin_idxmax( isna_entry = checknull(val) if isna_entry: - if out[lab, j] == -2 or not skipna: + if not skipna: out[lab, j] = -1 else: if compute_max: - if out[lab, j] == -2 or val > group_min_or_max[lab, j]: + if val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i else: - if out[lab, j] == -2 or val < group_min_or_max[lab, j]: + if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i else: @@ -1904,8 +1915,8 @@ def group_idxmin_idxmax( lab = labels[i] if lab < 0: continue + seen[lab] = 1 - counts[lab] += 1 for j in range(K): if not skipna and out[lab, j] == -1: # Once we've hit NA there is no going back @@ -1918,15 +1929,15 @@ def group_idxmin_idxmax( isna_entry = _treat_as_na(val, is_datetimelike) if isna_entry: - if out[lab, j] == -2 or not skipna: + if not skipna: out[lab, j] = -1 else: if compute_max: - if out[lab, j] == -2 or val > group_min_or_max[lab, j]: + if val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i else: - if out[lab, j] == -2 or val < group_min_or_max[lab, j]: + if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1dcf42b784929..cd5dfae9a2efc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3323,18 +3323,20 @@ def _idxmin_idxmax( Exclude NA/null values. If an entire row/column is NA, the result will be NA. ignore_unobserved : bool, default False - When an unobserved group is encountered, do not raise. This used for - transform where unobserved groups do not play an impact on the result. + When True and an unobserved group is encountered, do not raise. This used + for transform where unobserved groups do not play an impact on the result. Returns ------- Series or DataFrame idxmax or idxmin for the groupby operation. """ + # seen is passed to the Cython code and mutated + seen = np.zeros(self.ngroups, dtype="bool") def post_process(res): has_na_value = (res._values == -1).any(axis=None) - has_unobserved = (res._values == -2).any(axis=None) + has_unobserved = not seen.all() raise_err = not ignore_unobserved and has_unobserved if ( not self.observed @@ -3364,20 +3366,13 @@ def post_process(res): ) index = self.obj._get_axis(self.axis) - - values = res._values - if not self.observed and ignore_unobserved and has_unobserved: - # -2 indicates unobserved category; recode as 0 to not unnecessarily - # coerce the resulting dtype. These values will be removed. - values = np.where(values == -2, 0, values) - if res.size == 0: result = res.astype(index.dtype) else: if isinstance(index, MultiIndex): index = index.to_flat_index() + values = res._values na_value = na_value_for_dtype(index.dtype, compat=False) - if isinstance(res, Series): result = res._constructor( index.array.take(values, allow_fill=True, fill_value=na_value), @@ -3392,6 +3387,7 @@ def post_process(res): ) result = self.obj._constructor(buf, index=res.index) result.columns = res.columns + return result result = self._agg_general( @@ -3400,6 +3396,7 @@ def post_process(res): alias=how, post_process=post_process, skipna=skipna, + seen=seen, ) return result From 57d7b81c44ca1daf88f66cc719666c060ddea1e6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 14 Sep 2023 16:28:32 -0400 Subject: [PATCH 22/35] fixup --- pandas/_libs/groupby.pyi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 9aa89e62fde1f..a8f04dbb9b890 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -185,13 +185,14 @@ def group_idxmin_idxmax( out: npt.NDArray[np.intp], counts: npt.NDArray[np.int64], values: np.ndarray, # ndarray[groupby_t, ndim=2] - labels: npt.NDArray[np.int64], # const intp[:] + labels: npt.NDArray[np.intp_t], min_count: int = ..., is_datetimelike: bool = ..., mask: np.ndarray | None = ..., name: str = ..., skipna: bool = ..., result_mask: np.ndarray | None = ..., + seen: npt.NDArray[np.uint8] | None = ..., ) -> None: ... def group_cummin( out: np.ndarray, # groupby_t[:, ::1] From 75bde4bbf7b9ddc1ced1aa4ccd43012850f9fc02 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 14 Sep 2023 16:30:03 -0400 Subject: [PATCH 23/35] fixup --- pandas/_libs/groupby.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index a8f04dbb9b890..cd8a754511863 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -185,7 +185,7 @@ def group_idxmin_idxmax( out: npt.NDArray[np.intp], counts: npt.NDArray[np.int64], values: np.ndarray, # ndarray[groupby_t, ndim=2] - labels: npt.NDArray[np.intp_t], + labels: npt.NDArray[np.intp], min_count: int = ..., is_datetimelike: bool = ..., mask: np.ndarray | None = ..., From 587a05477563c4aedc4b3517c4724fad2f259108 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 15 Sep 2023 06:38:05 -0400 Subject: [PATCH 24/35] fixup --- pandas/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cd5dfae9a2efc..c8c16343d1297 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1970,7 +1970,7 @@ def array_func(values: ArrayLike) -> ArrayLike: # TODO: avoid special casing SparseArray here if how in ["any", "all"] and isinstance(values, SparseArray): pass - if alt is None or how in ["any", "all", "std", "sem"]: + elif alt is None or how in ["any", "all", "std", "sem"]: raise # TODO: re-raise as TypeError? should not be reached else: return result @@ -3337,7 +3337,7 @@ def _idxmin_idxmax( def post_process(res): has_na_value = (res._values == -1).any(axis=None) has_unobserved = not seen.all() - raise_err = not ignore_unobserved and has_unobserved + raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved if ( not self.observed and not ignore_unobserved From 6658a98adca5a604af84ebee745c6f04de95a726 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 19 Sep 2023 22:39:39 -0400 Subject: [PATCH 25/35] seen -> unobserved; add assert --- pandas/_libs/groupby.pyi | 2 +- pandas/_libs/groupby.pyx | 10 +++++----- pandas/core/groupby/groupby.py | 9 +++++---- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index cd8a754511863..f8b71a2bd8aad 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -192,7 +192,7 @@ def group_idxmin_idxmax( name: str = ..., skipna: bool = ..., result_mask: np.ndarray | None = ..., - seen: npt.NDArray[np.uint8] | None = ..., + unobserved: npt.NDArray[np.uint8] | None = ..., ) -> None: ... def group_cummin( out: np.ndarray, # groupby_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 8ab6e469fa664..ec9b2a3e87c0f 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1807,7 +1807,7 @@ def group_idxmin_idxmax( str name="idxmin", bint skipna=True, uint8_t[:, ::1] result_mask=None, - uint8_t[::1] seen=None, + uint8_t[::1] unobserved=None, ): """ Compute index of minimum/maximum of columns of `values`, in row groups `labels`. @@ -1840,8 +1840,8 @@ def group_idxmin_idxmax( result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. - seen : ndarray[bool] - Whether a group has been seen. While duplicative of counts, this is passed + unobserved : ndarray[bool] + Whether a group has been observed. While duplicative of counts, this is passed by idxmin/idxmax and modified inplace in order to make it back to the GroupBy method. @@ -1883,7 +1883,7 @@ def group_idxmin_idxmax( lab = labels[i] if lab < 0: continue - seen[lab] = 1 + unobserved[lab] = 0 for j in range(K): if not skipna and out[lab, j] == -1: @@ -1915,7 +1915,7 @@ def group_idxmin_idxmax( lab = labels[i] if lab < 0: continue - seen[lab] = 1 + unobserved[lab] = 0 for j in range(K): if not skipna and out[lab, j] == -1: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c8c16343d1297..90d5057df5cde 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3331,12 +3331,12 @@ def _idxmin_idxmax( Series or DataFrame idxmax or idxmin for the groupby operation. """ - # seen is passed to the Cython code and mutated - seen = np.zeros(self.ngroups, dtype="bool") + # unobserved is passed to the Cython code and mutated + unobserved = np.ones(self.ngroups, dtype="bool") def post_process(res): has_na_value = (res._values == -1).any(axis=None) - has_unobserved = not seen.all() + has_unobserved = unobserved.any() raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved if ( not self.observed @@ -3350,6 +3350,7 @@ def post_process(res): result_len = np.prod( [len(ping.group_index) for ping in self.grouper.groupings] ) + assert len(res) <= result_len raise_err = len(res) < result_len if raise_err: raise ValueError( @@ -3396,7 +3397,7 @@ def post_process(res): alias=how, post_process=post_process, skipna=skipna, - seen=seen, + unobserved=unobserved, ) return result From 0d9d54c9cfb41fb13291ffe396183ff89a8de572 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 8 Oct 2023 16:09:24 -0400 Subject: [PATCH 26/35] Rework --- pandas/_libs/groupby.pyi | 1 - pandas/_libs/groupby.pyx | 7 - pandas/core/groupby/groupby.py | 248 +++++++----------- pandas/core/groupby/grouper.py | 8 +- pandas/core/resample.py | 2 +- pandas/tests/groupby/test_groupby.py | 7 +- pandas/tests/groupby/test_raises.py | 2 +- .../tests/groupby/transform/test_transform.py | 2 +- 8 files changed, 95 insertions(+), 182 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index f8b71a2bd8aad..609663c721950 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -192,7 +192,6 @@ def group_idxmin_idxmax( name: str = ..., skipna: bool = ..., result_mask: np.ndarray | None = ..., - unobserved: npt.NDArray[np.uint8] | None = ..., ) -> None: ... def group_cummin( out: np.ndarray, # groupby_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ec9b2a3e87c0f..bbe8aa36d6c72 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1807,7 +1807,6 @@ def group_idxmin_idxmax( str name="idxmin", bint skipna=True, uint8_t[:, ::1] result_mask=None, - uint8_t[::1] unobserved=None, ): """ Compute index of minimum/maximum of columns of `values`, in row groups `labels`. @@ -1840,10 +1839,6 @@ def group_idxmin_idxmax( result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. - unobserved : ndarray[bool] - Whether a group has been observed. While duplicative of counts, this is passed - by idxmin/idxmax and modified inplace in order to make it back to the GroupBy - method. Notes ----- @@ -1883,7 +1878,6 @@ def group_idxmin_idxmax( lab = labels[i] if lab < 0: continue - unobserved[lab] = 0 for j in range(K): if not skipna and out[lab, j] == -1: @@ -1915,7 +1909,6 @@ def group_idxmin_idxmax( lab = labels[i] if lab < 0: continue - unobserved[lab] = 0 for j in range(K): if not skipna and out[lab, j] == -1: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index da97e605dc8ab..64525787f93ec 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1881,7 +1881,6 @@ def _agg_general( *, alias: str, npfunc: Callable | None = None, - post_process: Callable | None = None, **kwargs, ): result = self._cython_agg_general( @@ -1889,7 +1888,6 @@ def _agg_general( alt=npfunc, numeric_only=numeric_only, min_count=min_count, - post_process=post_process, **kwargs, ) return result.__finalize__(self.obj, method="groupby") @@ -1944,7 +1942,6 @@ def _cython_agg_general( alt: Callable | None = None, numeric_only: bool = False, min_count: int = -1, - post_process: Callable | None = None, **kwargs, ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; @@ -1981,8 +1978,8 @@ def array_func(values: ArrayLike) -> ArrayLike: new_mgr = data.grouped_reduce(array_func) res = self._wrap_agged_manager(new_mgr) - if post_process is not None: - res = post_process(res) + if how in ["idxmin", "idxmax"]: + res = self._wrap_idxmax_idxmin(res) out = self._wrap_aggregated_output(res) if self.axis == 1: out = out.infer_objects(copy=False) @@ -2021,10 +2018,10 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # If func is a reduction, we need to broadcast the # result to the whole group. Compute func result # and deal with possible broadcasting below. - # Temporarily set observed for dealing with categoricals. with com.temp_setattr(self, "as_index", True): + # GH#49834 - result needs groups in the index for + # _wrap_transform_fast_result if func in ["idxmin", "idxmax"]: - # mypy doesn't recognize func as Literal["idxmin", "idxmax"] func = cast(Literal["idxmin", "idxmax"], func) result = self._idxmax_idxmin(func, True, *args, **kwargs) else: @@ -3301,12 +3298,13 @@ def max( npfunc=np.max, ) - def _idxmin_idxmax( + def _idxmax_idxmin( self, how: Literal["idxmax", "idxmin"], - numeric_only: bool = False, - skipna: bool = True, ignore_unobserved: bool = False, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, ) -> NDFrameT: """Compute idxmax/idxmin. @@ -3314,6 +3312,9 @@ def _idxmin_idxmax( ---------- how: {"idxmin", "idxmax"} Whether to compute idxmin or idxmax. + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. numeric_only : bool, default False Include only float, int, boolean columns. skipna : bool, default True @@ -3328,33 +3329,45 @@ def _idxmin_idxmax( Series or DataFrame idxmax or idxmin for the groupby operation. """ - # unobserved is passed to the Cython code and mutated - unobserved = np.ones(self.ngroups, dtype="bool") + if axis is not lib.no_default: + if axis is None: + axis = self.axis + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, how) + else: + axis = self.axis + + if not self.observed and any( + ping._passed_categorical for ping in self.grouper.groupings + ): + expected_len = np.prod( + [len(ping.group_index) for ping in self.grouper.groupings] + ) + if len(self.grouper.groupings) == 1: + result_len = len(self.grouper.groupings[0].grouping_vector.unique()) + else: + # result_index only contains observed groups in this case + result_len = len(self.grouper.result_index) + assert result_len <= expected_len + has_unobserved = result_len < expected_len - def post_process(res): - has_na_value = (res._values == -1).any(axis=None) - has_unobserved = unobserved.any() raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved - if ( - not self.observed - and not ignore_unobserved - and not raise_err - and (len(self.grouper.groupings) > 1 or res.empty) - ): - # When there are multiple groupings or the result is empty, the - # categories are not included in the upfront computation so we - # compare the final result length with the current length - result_len = np.prod( - [len(ping.group_index) for ping in self.grouper.groupings] - ) - assert len(res) <= result_len - raise_err = len(res) < result_len + # Only raise an error if there are columns to compute; otherwise we return + # an empty DataFrame with an index (possibly including unobserved) but no + # columns + data = self._obj_with_exclusions + if raise_err and isinstance(data, DataFrame): + if numeric_only: + data = data._get_numeric_data() + raise_err = len(data.columns) > 0 + if raise_err: raise ValueError( f"Can't get {how} of an empty group due to unobserved categories. " "Specify observed=True in groupby instead." ) - elif not skipna and has_na_value: + elif not skipna: + if self._obj_with_exclusions.isna().any(axis=None): warnings.warn( f"The behavior of {type(self).__name__}.{how} with all-NA " "values, or any-NA and skipna=False, is deprecated. In a future " @@ -3363,41 +3376,65 @@ def post_process(res): stacklevel=find_stack_level(), ) - index = self.obj._get_axis(self.axis) - if res.size == 0: - result = res.astype(index.dtype) - else: - if isinstance(index, MultiIndex): - index = index.to_flat_index() - values = res._values - na_value = na_value_for_dtype(index.dtype, compat=False) - if isinstance(res, Series): - result = res._constructor( - index.array.take(values, allow_fill=True, fill_value=na_value), - index=res.index, - name=res.name, - ) + if axis == 1: + try: + if self.obj.ndim == 1: + result = self._op_via_apply(how, skipna=skipna) else: - buf = {} - for k, column_values in enumerate(values.T): - buf[k] = index.array.take( - column_values, allow_fill=True, fill_value=na_value + + def func(df): + method = getattr(df, how) + return method( + axis=axis, skipna=skipna, numeric_only=numeric_only ) - result = self.obj._constructor(buf, index=res.index) - result.columns = res.columns + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved " + "categories. Specify observed=True in groupby instead." + ) from None + raise return result result = self._agg_general( numeric_only=numeric_only, min_count=1, alias=how, - post_process=post_process, skipna=skipna, - unobserved=unobserved, ) return result + def _wrap_idxmax_idxmin(self, res): + index = self.obj._get_axis(self.axis) + if res.size == 0: + result = res.astype(index.dtype) + else: + if isinstance(index, MultiIndex): + index = index.to_flat_index() + values = res._values + na_value = na_value_for_dtype(index.dtype, compat=False) + if isinstance(res, Series): + result = res._constructor( + index.array.take(values, allow_fill=True, fill_value=na_value), + index=res.index, + name=res.name, + ) + else: + buf = {} + for k, column_values in enumerate(values.T): + buf[k] = index.array.take( + column_values, allow_fill=True, fill_value=na_value + ) + result = self.obj._constructor(buf, index=res.index) + result.columns = res.columns + return result + @final def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: """ @@ -5830,113 +5867,6 @@ def sample( sampled_indices = np.concatenate(sampled_indices) return self._selected_obj.take(sampled_indices, axis=self.axis) - def _idxmax_idxmin( - self, - how: Literal["idxmax", "idxmin"], - ignore_unobserved: bool = False, - axis: Axis | None | lib.NoDefault = lib.no_default, - skipna: bool = True, - numeric_only: bool = False, - ): - """Compute idxmax/idxmin. - - Parameters - ---------- - how: {"idxmin", "idxmax"} - Whether to compute idxmin or idxmax. - axis : {{0 or 'index', 1 or 'columns'}}, default None - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - If axis is not provided, grouper's axis is used. - numeric_only : bool, default False - Include only float, int, boolean columns. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ignore_unobserved : bool, default False - When True and an unobserved group is encountered, do not raise. This used - for transform where unobserved groups do not play an impact on the result. - - Returns - ------- - Series or DataFrame - idxmax or idxmin for the groupby operation. - """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, how) - else: - axis = self.axis - - if not self.observed and any( - ping._passed_categorical for ping in self.grouper.groupings - ): - expected_len = np.prod( - [len(ping.group_index) for ping in self.grouper.groupings] - ) - if len(self.grouper.groupings) == 1: - result_len = len(self.grouper.groupings[0].grouping_vector.unique()) - else: - # result_index only contains observed groups in this case - result_len = len(self.grouper.result_index) - assert result_len <= expected_len - has_unobserved = result_len < expected_len - - raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved - # Only raise an error if there are columns to compute; otherwise we return - # an empty DataFrame with an index (possibly including unobserved) but no - # columns - data = self._obj_with_exclusions - if raise_err and isinstance(data, DataFrame): - if numeric_only: - data = data._get_numeric_data() - raise_err = len(data.columns) > 0 - else: - raise_err = False - if raise_err: - raise ValueError( - f"Can't get {how} of an empty group due to unobserved categories. " - "Specify observed=True in groupby instead." - ) - - try: - if self.obj.ndim == 1: - result = self._op_via_apply(how, skipna=skipna) - else: - - def func(df): - method = getattr(df, how) - return method(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = how - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True - ) - except ValueError as err: - name = "argmax" if how == "idxmax" else "argmin" - if f"attempt to get {name} of an empty sequence" in str(err): - raise ValueError( - f"Can't get {how} of an empty group due to unobserved categories. " - "Specify observed=True in groupby instead." - ) from None - raise - - result = result.astype(self.obj.index.dtype) if result.empty else result - - if not skipna: - has_na_value = result.isnull().any(axis=None) - if has_na_value: - warnings.warn( - f"The behavior of {type(self).__name__}.{how} with all-NA " - "values, or any-NA and skipna=False, is deprecated. In a future " - "version this will raise ValueError", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return result - @doc(GroupBy) def get_groupby( diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8cf43ba44aa27..c51c17e04796a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -297,10 +297,7 @@ def __init__( self._indexer: npt.NDArray[np.intp] | None = None def _get_grouper( - self, - obj: NDFrameT, - validate: bool = True, - observed: bool = False, + self, obj: NDFrameT, validate: bool = True ) -> tuple[ops.BaseGrouper, NDFrameT]: """ Parameters @@ -320,7 +317,6 @@ def _get_grouper( axis=self.axis, level=self.level, sort=self.sort, - observed=observed, validate=validate, dropna=self.dropna, ) @@ -897,7 +893,7 @@ def get_grouper( # a passed-in Grouper, directly convert if isinstance(key, Grouper): - grouper, obj = key._get_grouper(obj, validate=False, observed=observed) + grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: return grouper, frozenset(), obj else: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2780f07a94061..bb5f3ce56b470 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2219,7 +2219,7 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: ) def _get_grouper( - self, obj: NDFrameT, validate: bool = True, observed: bool = False + self, obj: NDFrameT, validate: bool = True ) -> tuple[BinGrouper, NDFrameT]: # create the resampler and return our binner r = self._get_resampler(obj) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e0bd84ffc0d27..7addfc7f7e8d8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2063,16 +2063,11 @@ def get_categorical_invalid_expected(): with pytest.raises(klass, match=msg): get_result() - if op in ["min", "max"] and isinstance(columns, list): + if op in ["min", "max", "idxmin", "idxmax"] and isinstance(columns, list): # i.e. DataframeGroupBy, not SeriesGroupBy result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() tm.assert_equal(result, expected) - elif op in ["idxmin", "idxmax"] and isinstance(columns, list): - with pytest.raises( - ValueError, match="empty group due to unobserved categories" - ): - get_result(numeric_only=True) return if op in ["prod", "sum", "skew"]: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index dc91fa145f584..0f4a73e4e2a38 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -568,7 +568,7 @@ def test_groupby_raises_category_on_category( assert not hasattr(gb, "corrwith") return - empty_groups = any(group.empty for group in gb.groups.values()) + empty_groups = not observed and any(group.empty for group in gb.groups.values()) if ( not observed and how != "transform" diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 4a493ef3fd52c..848440452296e 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1649,7 +1649,7 @@ def test_idxmin_idxmax_transform_args(how, skipna, numeric_only): with tm.assert_produces_warning(FutureWarning, match=msg): result = gb.transform(how, 0, skipna, numeric_only) warn = None if skipna else FutureWarning - msg = f"The behavior of DataFrame.{how} with .* any-NA and skipna=False" + msg = f"The behavior of DataFrameGroupBy.{how} with .* any-NA and skipna=False" with tm.assert_produces_warning(warn, match=msg): expected = gb.transform(how, skipna=skipna, numeric_only=numeric_only) tm.assert_frame_equal(result, expected) From 363212d8ba220a4cf7ad3697a3778f5795a10fad Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 8 Oct 2023 16:12:34 -0400 Subject: [PATCH 27/35] cleanup --- pandas/core/groupby/groupby.py | 274 ++++++++++++++++----------------- 1 file changed, 137 insertions(+), 137 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 64525787f93ec..f26e0c1fe4e9e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3298,143 +3298,6 @@ def max( npfunc=np.max, ) - def _idxmax_idxmin( - self, - how: Literal["idxmax", "idxmin"], - ignore_unobserved: bool = False, - axis: Axis | None | lib.NoDefault = lib.no_default, - skipna: bool = True, - numeric_only: bool = False, - ) -> NDFrameT: - """Compute idxmax/idxmin. - - Parameters - ---------- - how: {"idxmin", "idxmax"} - Whether to compute idxmin or idxmax. - axis : {{0 or 'index', 1 or 'columns'}}, default None - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - If axis is not provided, grouper's axis is used. - numeric_only : bool, default False - Include only float, int, boolean columns. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ignore_unobserved : bool, default False - When True and an unobserved group is encountered, do not raise. This used - for transform where unobserved groups do not play an impact on the result. - - Returns - ------- - Series or DataFrame - idxmax or idxmin for the groupby operation. - """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, how) - else: - axis = self.axis - - if not self.observed and any( - ping._passed_categorical for ping in self.grouper.groupings - ): - expected_len = np.prod( - [len(ping.group_index) for ping in self.grouper.groupings] - ) - if len(self.grouper.groupings) == 1: - result_len = len(self.grouper.groupings[0].grouping_vector.unique()) - else: - # result_index only contains observed groups in this case - result_len = len(self.grouper.result_index) - assert result_len <= expected_len - has_unobserved = result_len < expected_len - - raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved - # Only raise an error if there are columns to compute; otherwise we return - # an empty DataFrame with an index (possibly including unobserved) but no - # columns - data = self._obj_with_exclusions - if raise_err and isinstance(data, DataFrame): - if numeric_only: - data = data._get_numeric_data() - raise_err = len(data.columns) > 0 - - if raise_err: - raise ValueError( - f"Can't get {how} of an empty group due to unobserved categories. " - "Specify observed=True in groupby instead." - ) - elif not skipna: - if self._obj_with_exclusions.isna().any(axis=None): - warnings.warn( - f"The behavior of {type(self).__name__}.{how} with all-NA " - "values, or any-NA and skipna=False, is deprecated. In a future " - "version this will raise ValueError", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if axis == 1: - try: - if self.obj.ndim == 1: - result = self._op_via_apply(how, skipna=skipna) - else: - - def func(df): - method = getattr(df, how) - return method( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - - func.__name__ = how - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True - ) - except ValueError as err: - name = "argmax" if how == "idxmax" else "argmin" - if f"attempt to get {name} of an empty sequence" in str(err): - raise ValueError( - f"Can't get {how} of an empty group due to unobserved " - "categories. Specify observed=True in groupby instead." - ) from None - raise - return result - - result = self._agg_general( - numeric_only=numeric_only, - min_count=1, - alias=how, - skipna=skipna, - ) - return result - - def _wrap_idxmax_idxmin(self, res): - index = self.obj._get_axis(self.axis) - if res.size == 0: - result = res.astype(index.dtype) - else: - if isinstance(index, MultiIndex): - index = index.to_flat_index() - values = res._values - na_value = na_value_for_dtype(index.dtype, compat=False) - if isinstance(res, Series): - result = res._constructor( - index.array.take(values, allow_fill=True, fill_value=na_value), - index=res.index, - name=res.name, - ) - else: - buf = {} - for k, column_values in enumerate(values.T): - buf[k] = index.array.take( - column_values, allow_fill=True, fill_value=na_value - ) - result = self.obj._constructor(buf, index=res.index) - result.columns = res.columns - return result - @final def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: """ @@ -5867,6 +5730,143 @@ def sample( sampled_indices = np.concatenate(sampled_indices) return self._selected_obj.take(sampled_indices, axis=self.axis) + def _idxmax_idxmin( + self, + how: Literal["idxmax", "idxmin"], + ignore_unobserved: bool = False, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ) -> NDFrameT: + """Compute idxmax/idxmin. + + Parameters + ---------- + how: {"idxmin", "idxmax"} + Whether to compute idxmin or idxmax. + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + numeric_only : bool, default False + Include only float, int, boolean columns. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ignore_unobserved : bool, default False + When True and an unobserved group is encountered, do not raise. This used + for transform where unobserved groups do not play an impact on the result. + + Returns + ------- + Series or DataFrame + idxmax or idxmin for the groupby operation. + """ + if axis is not lib.no_default: + if axis is None: + axis = self.axis + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, how) + else: + axis = self.axis + + if not self.observed and any( + ping._passed_categorical for ping in self.grouper.groupings + ): + expected_len = np.prod( + [len(ping.group_index) for ping in self.grouper.groupings] + ) + if len(self.grouper.groupings) == 1: + result_len = len(self.grouper.groupings[0].grouping_vector.unique()) + else: + # result_index only contains observed groups in this case + result_len = len(self.grouper.result_index) + assert result_len <= expected_len + has_unobserved = result_len < expected_len + + raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved + # Only raise an error if there are columns to compute; otherwise we return + # an empty DataFrame with an index (possibly including unobserved) but no + # columns + data = self._obj_with_exclusions + if raise_err and isinstance(data, DataFrame): + if numeric_only: + data = data._get_numeric_data() + raise_err = len(data.columns) > 0 + + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) + elif not skipna: + if self._obj_with_exclusions.isna().any(axis=None): + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if axis == 1: + try: + if self.obj.ndim == 1: + result = self._op_via_apply(how, skipna=skipna) + else: + + def func(df): + method = getattr(df, how) + return method( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved " + "categories. Specify observed=True in groupby instead." + ) from None + raise + return result + + result = self._agg_general( + numeric_only=numeric_only, + min_count=1, + alias=how, + skipna=skipna, + ) + return result + + def _wrap_idxmax_idxmin(self, res): + index = self.obj._get_axis(self.axis) + if res.size == 0: + result = res.astype(index.dtype) + else: + if isinstance(index, MultiIndex): + index = index.to_flat_index() + values = res._values + na_value = na_value_for_dtype(index.dtype, compat=False) + if isinstance(res, Series): + result = res._constructor( + index.array.take(values, allow_fill=True, fill_value=na_value), + index=res.index, + name=res.name, + ) + else: + buf = {} + for k, column_values in enumerate(values.T): + buf[k] = index.array.take( + column_values, allow_fill=True, fill_value=na_value + ) + result = self.obj._constructor(buf, index=res.index) + result.columns = res.columns + return result + @doc(GroupBy) def get_groupby( From 30bc4c78a91ca9882a3f094d8359836dff938803 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 9 Oct 2023 16:48:04 -0400 Subject: [PATCH 28/35] Fixup --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f26e0c1fe4e9e..756cd0659a5fe 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5742,7 +5742,7 @@ def _idxmax_idxmin( Parameters ---------- - how: {"idxmin", "idxmax"} + how : {'idxmin', 'idxmax'} Whether to compute idxmin or idxmax. axis : {{0 or 'index', 1 or 'columns'}}, default None The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. From ff00e20664847dee8133136b047581b477262024 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 14 Oct 2023 09:26:01 -0400 Subject: [PATCH 29/35] fixup --- pandas/_libs/groupby.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index bbe8aa36d6c72..d97550b0a7422 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1862,12 +1862,16 @@ def group_idxmin_idxmax( N, K = (values).shape - group_min_or_max = np.empty_like(out, dtype=values.dtype) + if numeric_object_t is object: + group_min_or_max = np.empty((out).shape, dtype=object) + else: + group_min_or_max = np.empty_like(out, dtype=values.dtype) if N > 0 and K > 0: # When N or K is zero, we never use group_min_or_max group_min_or_max[:] = _get_min_or_max( values[0, 0], compute_max, is_datetimelike ) + # When using transform, we need a valid value for take in the case # a category is not observed; these values will be dropped out[:] = 0 From 3d2d8a091f0508f2ddd242c59dc1676eb2a6ee0c Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 16 Oct 2023 22:50:57 -0400 Subject: [PATCH 30/35] Refinements --- doc/source/whatsnew/v2.2.0.rst | 3 ++- pandas/_libs/groupby.pyx | 2 +- pandas/core/groupby/groupby.py | 12 +++++++----- pandas/core/groupby/ops.py | 1 - 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 98edbf5ee08a5..ca287b263ee09 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -374,10 +374,11 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`) +- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`) - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) -- Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d97550b0a7422..6f9b33b042959 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1816,7 +1816,7 @@ def group_idxmin_idxmax( Parameters ---------- - out : np.ndarray[int64_t, ndim=2] + out : np.ndarray[intp, ndim=2] Array to store result in. counts : np.ndarray[int64] Input as a zeroed array, populated by group sizes during algorithm diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 756cd0659a5fe..6836a29d39786 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5842,7 +5842,7 @@ def func(df): ) return result - def _wrap_idxmax_idxmin(self, res): + def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: index = self.obj._get_axis(self.axis) if res.size == 0: result = res.astype(index.dtype) @@ -5850,20 +5850,22 @@ def _wrap_idxmax_idxmin(self, res): if isinstance(index, MultiIndex): index = index.to_flat_index() values = res._values + assert isinstance(values, np.ndarray) na_value = na_value_for_dtype(index.dtype, compat=False) if isinstance(res, Series): - result = res._constructor( + # mypy: expression has type "Series", variable has type "NDFrameT" + result = res._constructor( # type: ignore[assignment] index.array.take(values, allow_fill=True, fill_value=na_value), index=res.index, name=res.name, ) else: - buf = {} + data = {} for k, column_values in enumerate(values.T): - buf[k] = index.array.take( + data[k] = index.array.take( column_values, allow_fill=True, fill_value=na_value ) - result = self.obj._constructor(buf, index=res.index) + result = self.obj._constructor(data, index=res.index) result.columns = res.columns return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f8984aef7c2d6..6923defee4d14 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -424,7 +424,6 @@ def _call_cython_op( mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, - **kwargs, ) elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: if self.how in ["std", "sem"]: From ad076535649c15bbcf4b7da3ee9bba4924874e47 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 16 Oct 2023 23:17:27 -0400 Subject: [PATCH 31/35] fixup --- pandas/core/groupby/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6836a29d39786..d1a8f8e5cae2e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5850,6 +5850,8 @@ def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: if isinstance(index, MultiIndex): index = index.to_flat_index() values = res._values + if isinstance(values, ExtensionArray): + values = values.to_numpy(copy=False) assert isinstance(values, np.ndarray) na_value = na_value_for_dtype(index.dtype, compat=False) if isinstance(res, Series): From b07e9bab7d11923a78ff1f6c2629d7e4034fb3ba Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 16 Oct 2023 23:24:15 -0400 Subject: [PATCH 32/35] fixup --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d1a8f8e5cae2e..80e5a0525541d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5852,7 +5852,6 @@ def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: values = res._values if isinstance(values, ExtensionArray): values = values.to_numpy(copy=False) - assert isinstance(values, np.ndarray) na_value = na_value_for_dtype(index.dtype, compat=False) if isinstance(res, Series): # mypy: expression has type "Series", variable has type "NDFrameT" From 5c416fe69254011343feb58a25ac4eeb1183dd99 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 24 Oct 2023 17:19:19 -0400 Subject: [PATCH 33/35] WIP --- pandas/core/groupby/groupby.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 80e5a0525541d..46a3afc088b83 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5810,20 +5810,15 @@ def _idxmax_idxmin( if axis == 1: try: - if self.obj.ndim == 1: - result = self._op_via_apply(how, skipna=skipna) - else: - def func(df): - method = getattr(df, how) - return method( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) + def func(df): + method = getattr(df, how) + return method(axis=axis, skipna=skipna, numeric_only=numeric_only) - func.__name__ = how - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True - ) + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) except ValueError as err: name = "argmax" if how == "idxmax" else "argmin" if f"attempt to get {name} of an empty sequence" in str(err): From 75638a56bcc60d7346dce695f12580ce68755426 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 24 Oct 2023 17:20:03 -0400 Subject: [PATCH 34/35] Avoid _maybe_mask_result --- pandas/core/arrays/masked.py | 10 +++++++--- pandas/core/groupby/groupby.py | 2 -- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 56d3711c7d13b..c8447397c7bfe 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1506,9 +1506,13 @@ def _groupby_op( arity = op._cython_arity.get(op.how, 1) result_mask = np.tile(result_mask, (arity, 1)).T - # res_values should already have the correct dtype, we just need to - # wrap in a MaskedArray - return self._maybe_mask_result(res_values, result_mask) + if op.how in ["idxmin", "idxmax"]: + # Result values are indexes to take, keep as ndarray + return res_values + else: + # res_values should already have the correct dtype, we just need to + # wrap in a MaskedArray + return self._maybe_mask_result(res_values, result_mask) def transpose_homogeneous_masked_arrays( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 46a3afc088b83..11597e7202034 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5845,8 +5845,6 @@ def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: if isinstance(index, MultiIndex): index = index.to_flat_index() values = res._values - if isinstance(values, ExtensionArray): - values = values.to_numpy(copy=False) na_value = na_value_for_dtype(index.dtype, compat=False) if isinstance(res, Series): # mypy: expression has type "Series", variable has type "NDFrameT" From a8a5412ef8bb43767fec819a94351ea82c195447 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 24 Oct 2023 22:24:32 -0400 Subject: [PATCH 35/35] Add assert --- pandas/core/groupby/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d1f51eeae1f69..67b5c483fcb97 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5844,6 +5844,7 @@ def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: if isinstance(index, MultiIndex): index = index.to_flat_index() values = res._values + assert isinstance(values, np.ndarray) na_value = na_value_for_dtype(index.dtype, compat=False) if isinstance(res, Series): # mypy: expression has type "Series", variable has type "NDFrameT"