diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 4567b5b414301..54c240e84243a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -73,6 +73,8 @@ "ffill", "first", "head", + "idxmax", + "idxmin", "last", "median", "nunique", @@ -588,6 +590,8 @@ class GroupByCythonAgg: "prod", "min", "max", + "idxmin", + "idxmax", "mean", "median", "var", diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 240b262ca6151..4e80be8fb0fc6 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -517,8 +517,8 @@ listed below, those with a ``*`` do *not* have a Cython-optimized implementation :meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups :meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups :meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group - :meth:`~.DataFrameGroupBy.idxmax` *;Compute the index of the maximum value in each group - :meth:`~.DataFrameGroupBy.idxmin` *;Compute the index of the minimum value in each group + :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group + :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group :meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group :meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group :meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b74f93942c411..e844ebf0e8440 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -303,6 +303,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) - Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) +- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - Performance improvement when localizing time to UTC (:issue:`55241`) @@ -403,10 +404,11 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`) +- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`) - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) -- Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index d165ddd6c8afa..609663c721950 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -181,6 +181,18 @@ def group_min( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... +def group_idxmin_idxmax( + out: npt.NDArray[np.intp], + counts: npt.NDArray[np.int64], + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: npt.NDArray[np.intp], + min_count: int = ..., + is_datetimelike: bool = ..., + mask: np.ndarray | None = ..., + name: str = ..., + skipna: bool = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... def group_cummin( out: np.ndarray, # groupby_t[:, ::1] values: np.ndarray, # ndarray[groupby_t, ndim=2] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7635b261d4149..6f9b33b042959 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1794,6 +1794,151 @@ cdef group_min_max( ) +@cython.wraparound(False) +@cython.boundscheck(False) +def group_idxmin_idxmax( + intp_t[:, ::1] out, + int64_t[::1] counts, + ndarray[numeric_object_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + str name="idxmin", + bint skipna=True, + uint8_t[:, ::1] result_mask=None, +): + """ + Compute index of minimum/maximum of columns of `values`, in row groups `labels`. + + This function only computes the row number where the minimum/maximum occurs, we'll + take the corresponding index value after this function. + + Parameters + ---------- + out : np.ndarray[intp, ndim=2] + Array to store result in. + counts : np.ndarray[int64] + Input as a zeroed array, populated by group sizes during algorithm + values : np.ndarray[numeric_object_t, ndim=2] + Values to find column-wise min/max of. + labels : np.ndarray[np.intp] + Labels to group by. + min_count : Py_ssize_t, default -1 + The minimum number of non-NA group elements, NA result if threshold + is not met. + is_datetimelike : bool + True if `values` contains datetime-like entries. + name : {"idxmin", "idxmax"}, default "idxmin" + Whether to compute idxmin or idxmax. + mask : ndarray[bool, ndim=2], optional + If not None, indices represent missing values, + otherwise the mask will not be used + skipna : bool, default True + Flag to ignore nan values during truth testing + result_mask : ndarray[bool, ndim=2], optional + If not None, these specify locations in the output that are NA. + Modified in-place. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + `counts` is modified to hold group sizes + """ + cdef: + Py_ssize_t i, j, N, K, lab + numeric_object_t val + numeric_object_t[:, ::1] group_min_or_max + bint uses_mask = mask is not None + bint isna_entry + bint compute_max = name == "idxmax" + + assert name == "idxmin" or name == "idxmax" + + # TODO(cython3): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: + raise AssertionError("len(index) != len(labels)") + + N, K = (values).shape + + if numeric_object_t is object: + group_min_or_max = np.empty((out).shape, dtype=object) + else: + group_min_or_max = np.empty_like(out, dtype=values.dtype) + if N > 0 and K > 0: + # When N or K is zero, we never use group_min_or_max + group_min_or_max[:] = _get_min_or_max( + values[0, 0], compute_max, is_datetimelike + ) + + # When using transform, we need a valid value for take in the case + # a category is not observed; these values will be dropped + out[:] = 0 + + # TODO(cython3): De-duplicate once conditional-nogil is available + if numeric_object_t is object: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + for j in range(K): + if not skipna and out[lab, j] == -1: + # Once we've hit NA there is no going back + continue + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + # TODO(cython3): use _treat_as_na here + isna_entry = checknull(val) + + if isna_entry: + if not skipna: + out[lab, j] = -1 + else: + if compute_max: + if val > group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i + else: + if val < group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + for j in range(K): + if not skipna and out[lab, j] == -1: + # Once we've hit NA there is no going back + continue + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if isna_entry: + if not skipna: + out[lab, j] = -1 + else: + if compute_max: + if val > group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i + else: + if val < group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i + + @cython.wraparound(False) @cython.boundscheck(False) def group_max( diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9a2fe38fe3ed4..eec833c600177 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2701,12 +2701,22 @@ def _groupby_op( dtype = self.dtype if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: raise TypeError(f"{dtype} type does not support {how} operations") - if how in ["min", "max", "rank"] and not dtype.ordered: + if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: # raise TypeError instead of NotImplementedError to ensure we # don't go down a group-by-group path, since in the empty-groups # case that would fail to raise raise TypeError(f"Cannot perform {how} with non-ordered Categorical") - if how not in ["rank", "any", "all", "first", "last", "min", "max"]: + if how not in [ + "rank", + "any", + "all", + "first", + "last", + "min", + "max", + "idxmin", + "idxmax", + ]: if kind == "transform": raise TypeError(f"{dtype} type does not support {how} operations") raise TypeError(f"{dtype} dtype does not support aggregation '{how}'") @@ -2716,7 +2726,7 @@ def _groupby_op( if how == "rank": assert self.ordered # checked earlier npvalues = self._ndarray - elif how in ["first", "last", "min", "max"]: + elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]: npvalues = self._ndarray result_mask = np.zeros(ngroups, dtype=bool) else: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 56d3711c7d13b..c8447397c7bfe 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1506,9 +1506,13 @@ def _groupby_op( arity = op._cython_arity.get(op.how, 1) result_mask = np.tile(result_mask, (arity, 1)).T - # res_values should already have the correct dtype, we just need to - # wrap in a MaskedArray - return self._maybe_mask_result(res_values, result_mask) + if op.how in ["idxmin", "idxmax"]: + # Result values are indexes to take, keep as ndarray + return res_values + else: + # res_values should already have the correct dtype, we just need to + # wrap in a MaskedArray + return self._maybe_mask_result(res_values, result_mask) def transpose_homogeneous_masked_arrays( diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 838792ceb45e9..2d50adbc93f9d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -68,7 +68,10 @@ deprecate_nonkeyword_arguments, doc, ) -from pandas.util._exceptions import find_stack_level +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) from pandas.util._validators import ( validate_ascending, validate_bool_kwarg, @@ -11371,7 +11374,20 @@ def _get_data() -> DataFrame: row_index = np.tile(np.arange(nrows), ncols) col_index = np.repeat(np.arange(ncols), nrows) ser = Series(arr, index=col_index, copy=False) - result = ser.groupby(row_index).agg(name, **kwds) + # GroupBy will raise a warning with SeriesGroupBy as the object, + # likely confusing users + with rewrite_warning( + target_message=( + f"The behavior of SeriesGroupBy.{name} with all-NA values" + ), + target_category=FutureWarning, + new_message=( + f"The behavior of {type(self).__name__}.{name} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In " + "a future version this will raise ValueError" + ), + ): + result = ser.groupby(row_index).agg(name, **kwds) result.index = df.index if not skipna and name not in ("any", "all"): mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f1b97fc5e88f6..67b5c483fcb97 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -89,6 +89,7 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import ( isna, + na_value_for_dtype, notna, ) @@ -1879,13 +1880,15 @@ def _agg_general( min_count: int = -1, *, alias: str, - npfunc: Callable, + npfunc: Callable | None = None, + **kwargs, ): result = self._cython_agg_general( how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count, + **kwargs, ) return result.__finalize__(self.obj, method="groupby") @@ -1935,7 +1938,7 @@ def _agg_py_fallback( def _cython_agg_general( self, how: str, - alt: Callable, + alt: Callable | None = None, numeric_only: bool = False, min_count: int = -1, **kwargs, @@ -1963,16 +1966,19 @@ def array_func(values: ArrayLike) -> ArrayLike: # TODO: avoid special casing SparseArray here if how in ["any", "all"] and isinstance(values, SparseArray): pass - elif how in ["any", "all", "std", "sem"]: + elif alt is None or how in ["any", "all", "std", "sem"]: raise # TODO: re-raise as TypeError? should not be reached else: return result + assert alt is not None result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) return result new_mgr = data.grouped_reduce(array_func) res = self._wrap_agged_manager(new_mgr) + if how in ["idxmin", "idxmax"]: + res = self._wrap_idxmax_idxmin(res) out = self._wrap_aggregated_output(res) if self.axis == 1: out = out.infer_objects(copy=False) @@ -5730,12 +5736,12 @@ def _idxmax_idxmin( axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, - ): + ) -> NDFrameT: """Compute idxmax/idxmin. Parameters ---------- - how: {"idxmin", "idxmax"} + how : {'idxmin', 'idxmax'} Whether to compute idxmin or idxmax. axis : {{0 or 'index', 1 or 'columns'}}, default None The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. @@ -5785,18 +5791,24 @@ def _idxmax_idxmin( if numeric_only: data = data._get_numeric_data() raise_err = len(data.columns) > 0 - else: - raise_err = False - if raise_err: - raise ValueError( - f"Can't get {how} of an empty group due to unobserved categories. " - "Specify observed=True in groupby instead." - ) - try: - if self.obj.ndim == 1: - result = self._op_via_apply(how, skipna=skipna) - else: + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) + elif not skipna: + if self._obj_with_exclusions.isna().any(axis=None): + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if axis == 1: + try: def func(df): method = getattr(df, how) @@ -5806,28 +5818,49 @@ def func(df): result = self._python_apply_general( func, self._obj_with_exclusions, not_indexed_same=True ) - except ValueError as err: - name = "argmax" if how == "idxmax" else "argmin" - if f"attempt to get {name} of an empty sequence" in str(err): - raise ValueError( - f"Can't get {how} of an empty group due to unobserved categories. " - "Specify observed=True in groupby instead." - ) from None - raise + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved " + "categories. Specify observed=True in groupby instead." + ) from None + raise + return result - result = result.astype(self.obj.index.dtype) if result.empty else result + result = self._agg_general( + numeric_only=numeric_only, + min_count=1, + alias=how, + skipna=skipna, + ) + return result - if not skipna: - has_na_value = result.isnull().any(axis=None) - if has_na_value: - warnings.warn( - f"The behavior of {type(self).__name__}.{how} with all-NA " - "values, or any-NA and skipna=False, is deprecated. In a future " - "version this will raise ValueError", - FutureWarning, - stacklevel=find_stack_level(), + def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: + index = self.obj._get_axis(self.axis) + if res.size == 0: + result = res.astype(index.dtype) + else: + if isinstance(index, MultiIndex): + index = index.to_flat_index() + values = res._values + assert isinstance(values, np.ndarray) + na_value = na_value_for_dtype(index.dtype, compat=False) + if isinstance(res, Series): + # mypy: expression has type "Series", variable has type "NDFrameT" + result = res._constructor( # type: ignore[assignment] + index.array.take(values, allow_fill=True, fill_value=na_value), + index=res.index, + name=res.name, ) - + else: + data = {} + for k, column_values in enumerate(values.T): + data[k] = index.array.take( + column_values, allow_fill=True, fill_value=na_value + ) + result = self.obj._constructor(data, index=res.index) + result.columns = res.columns return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 607059e5183ec..6923defee4d14 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -133,6 +133,8 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "all": functools.partial(libgroupby.group_any_all, val_test="all"), "sum": "group_sum", "prod": "group_prod", + "idxmin": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmin"), + "idxmax": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmax"), "min": "group_min", "max": "group_max", "mean": "group_mean", @@ -187,7 +189,7 @@ def _get_cython_function( f"function is not implemented for this dtype: " f"[how->{how},dtype->{dtype_str}]" ) - elif how in ["std", "sem"]: + elif how in ["std", "sem", "idxmin", "idxmax"]: # We have a partial object that does not have __signatures__ return f elif how == "skew": @@ -268,6 +270,10 @@ def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: if how == "rank": out_dtype = "float64" + elif how in ["idxmin", "idxmax"]: + # The Cython implementation only produces the row number; we'll take + # from the index using this in post processing + out_dtype = "intp" else: if dtype.kind in "iufcb": out_dtype = f"{dtype.kind}{dtype.itemsize}" @@ -399,7 +405,16 @@ def _call_cython_op( result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) - if self.how in ["min", "max", "mean", "last", "first", "sum"]: + if self.how in [ + "idxmin", + "idxmax", + "min", + "max", + "mean", + "last", + "first", + "sum", + ]: func( out=result, counts=counts, @@ -463,10 +478,10 @@ def _call_cython_op( **kwargs, ) - if self.kind == "aggregate": + if self.kind == "aggregate" and self.how not in ["idxmin", "idxmax"]: # i.e. counts is defined. Locations where count