PERF: Implement groupby idxmax/idxmin in Cython (#54234)

* PERF: Implement groupby idxmax/idxmin in Cython * Update docs * Add ASVs * mypy fixup * Refinements * Revert * Rework * Refinements * fixup * Fixup, show stderr in ASVs * Remove idxmin/idxmax from numba ASVs * WIP * WIP * Rework * Rework * fixup * whatsnew * refinements * cleanup * fixup type-hints in groupby.pyi * Use mask instead of sentinel * fixup * fixup * fixup * seen -> unobserved; add assert * Rework * cleanup * Fixup * fixup * Refinements * fixup * fixup * WIP * Avoid _maybe_mask_result * Add assert
pandas-dev · Oct 27, 2023 · ccca5df · ccca5df
1 parent ff5cae7
commit ccca5df
Show file tree

Hide file tree

Showing 14 changed files with 299 additions and 54 deletions.
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -73,6 +73,8 @@
     "ffill",
     "first",
     "head",
+    "idxmax",
+    "idxmin",
     "last",
     "median",
     "nunique",
@@ -588,6 +590,8 @@ class GroupByCythonAgg:
             "prod",
             "min",
             "max",
+            "idxmin",
+            "idxmax",
             "mean",
             "median",
             "var",

diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -517,8 +517,8 @@ listed below, those with a ``*`` do *not* have a Cython-optimized implementation
         :meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups
         :meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups
         :meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group
-        :meth:`~.DataFrameGroupBy.idxmax` *;Compute the index of the maximum value in each group
-        :meth:`~.DataFrameGroupBy.idxmin` *;Compute the index of the minimum value in each group
+        :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group
+        :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group
         :meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group
         :meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group
         :meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -302,6 +302,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
 - Performance improvement in :meth:`Index.difference` (:issue:`55108`)
 - Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
+- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
 - Performance improvement when indexing with more than 4 keys (:issue:`54550`)
 - Performance improvement when localizing time to UTC (:issue:`55241`)
 
@@ -403,10 +404,11 @@ Plotting
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
+- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`)
+- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`)
 - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
 - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
 - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
--
 
 Reshaping
 ^^^^^^^^^

diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
@@ -181,6 +181,18 @@ def group_min(
     mask: np.ndarray | None = ...,
     result_mask: np.ndarray | None = ...,
 ) -> None: ...
+def group_idxmin_idxmax(
+    out: npt.NDArray[np.intp],
+    counts: npt.NDArray[np.int64],
+    values: np.ndarray,  # ndarray[groupby_t, ndim=2]
+    labels: npt.NDArray[np.intp],
+    min_count: int = ...,
+    is_datetimelike: bool = ...,
+    mask: np.ndarray | None = ...,
+    name: str = ...,
+    skipna: bool = ...,
+    result_mask: np.ndarray | None = ...,
+) -> None: ...
 def group_cummin(
     out: np.ndarray,  # groupby_t[:, ::1]
     values: np.ndarray,  # ndarray[groupby_t, ndim=2]

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -1794,6 +1794,151 @@ cdef group_min_max(
         )
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_idxmin_idxmax(
+    intp_t[:, ::1] out,
+    int64_t[::1] counts,
+    ndarray[numeric_object_t, ndim=2] values,
+    const intp_t[::1] labels,
+    Py_ssize_t min_count=-1,
+    bint is_datetimelike=False,
+    const uint8_t[:, ::1] mask=None,
+    str name="idxmin",
+    bint skipna=True,
+    uint8_t[:, ::1] result_mask=None,
+):
+    """
+    Compute index of minimum/maximum of columns of `values`, in row groups `labels`.
+
+    This function only computes the row number where the minimum/maximum occurs, we'll
+    take the corresponding index value after this function.
+
+    Parameters
+    ----------
+    out : np.ndarray[intp, ndim=2]
+        Array to store result in.
+    counts : np.ndarray[int64]
+        Input as a zeroed array, populated by group sizes during algorithm
+    values : np.ndarray[numeric_object_t, ndim=2]
+        Values to find column-wise min/max of.
+    labels : np.ndarray[np.intp]
+        Labels to group by.
+    min_count : Py_ssize_t, default -1
+        The minimum number of non-NA group elements, NA result if threshold
+        is not met.
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
+    name : {"idxmin", "idxmax"}, default "idxmin"
+        Whether to compute idxmin or idxmax.
+    mask : ndarray[bool, ndim=2], optional
+        If not None, indices represent missing values,
+        otherwise the mask will not be used
+    skipna : bool, default True
+        Flag to ignore nan values during truth testing
+    result_mask : ndarray[bool, ndim=2], optional
+        If not None, these specify locations in the output that are NA.
+        Modified in-place.
+
+    Notes
+    -----
+    This method modifies the `out` parameter, rather than returning an object.
+    `counts` is modified to hold group sizes
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        numeric_object_t val
+        numeric_object_t[:, ::1] group_min_or_max
+        bint uses_mask = mask is not None
+        bint isna_entry
+        bint compute_max = name == "idxmax"
+
+    assert name == "idxmin" or name == "idxmax"
+
+    # TODO(cython3):
+    # Instead of `labels.shape[0]` use `len(labels)`
+    if not len(values) == labels.shape[0]:
+        raise AssertionError("len(index) != len(labels)")
+
+    N, K = (<object>values).shape
+
+    if numeric_object_t is object:
+        group_min_or_max = np.empty((<object>out).shape, dtype=object)
+    else:
+        group_min_or_max = np.empty_like(out, dtype=values.dtype)
+    if N > 0 and K > 0:
+        # When N or K is zero, we never use group_min_or_max
+        group_min_or_max[:] = _get_min_or_max(
+            values[0, 0], compute_max, is_datetimelike
+        )
+
+    # When using transform, we need a valid value for take in the case
+    # a category is not observed; these values will be dropped
+    out[:] = 0
+
+    # TODO(cython3): De-duplicate once conditional-nogil is available
+    if numeric_object_t is object:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            for j in range(K):
+                if not skipna and out[lab, j] == -1:
+                    # Once we've hit NA there is no going back
+                    continue
+                val = values[i, j]
+
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                else:
+                    # TODO(cython3): use _treat_as_na here
+                    isna_entry = checknull(val)
+
+                if isna_entry:
+                    if not skipna:
+                        out[lab, j] = -1
+                else:
+                    if compute_max:
+                        if val > group_min_or_max[lab, j]:
+                            group_min_or_max[lab, j] = val
+                            out[lab, j] = i
+                    else:
+                        if val < group_min_or_max[lab, j]:
+                            group_min_or_max[lab, j] = val
+                            out[lab, j] = i
+    else:
+        with nogil:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                for j in range(K):
+                    if not skipna and out[lab, j] == -1:
+                        # Once we've hit NA there is no going back
+                        continue
+                    val = values[i, j]
+
+                    if uses_mask:
+                        isna_entry = mask[i, j]
+                    else:
+                        isna_entry = _treat_as_na(val, is_datetimelike)
+
+                    if isna_entry:
+                        if not skipna:
+                            out[lab, j] = -1
+                    else:
+                        if compute_max:
+                            if val > group_min_or_max[lab, j]:
+                                group_min_or_max[lab, j] = val
+                                out[lab, j] = i
+                        else:
+                            if val < group_min_or_max[lab, j]:
+                                group_min_or_max[lab, j] = val
+                                out[lab, j] = i
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_max(

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2701,12 +2701,22 @@ def _groupby_op(
         dtype = self.dtype
         if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
             raise TypeError(f"{dtype} type does not support {how} operations")
-        if how in ["min", "max", "rank"] and not dtype.ordered:
+        if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
             # raise TypeError instead of NotImplementedError to ensure we
             #  don't go down a group-by-group path, since in the empty-groups
             #  case that would fail to raise
             raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
-        if how not in ["rank", "any", "all", "first", "last", "min", "max"]:
+        if how not in [
+            "rank",
+            "any",
+            "all",
+            "first",
+            "last",
+            "min",
+            "max",
+            "idxmin",
+            "idxmax",
+        ]:
             if kind == "transform":
                 raise TypeError(f"{dtype} type does not support {how} operations")
             raise TypeError(f"{dtype} dtype does not support aggregation '{how}'")
@@ -2716,7 +2726,7 @@ def _groupby_op(
         if how == "rank":
             assert self.ordered  # checked earlier
             npvalues = self._ndarray
-        elif how in ["first", "last", "min", "max"]:
+        elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]:
             npvalues = self._ndarray
             result_mask = np.zeros(ngroups, dtype=bool)
         else:

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -1506,9 +1506,13 @@ def _groupby_op(
             arity = op._cython_arity.get(op.how, 1)
             result_mask = np.tile(result_mask, (arity, 1)).T
 
-        # res_values should already have the correct dtype, we just need to
-        #  wrap in a MaskedArray
-        return self._maybe_mask_result(res_values, result_mask)
+        if op.how in ["idxmin", "idxmax"]:
+            # Result values are indexes to take, keep as ndarray
+            return res_values
+        else:
+            # res_values should already have the correct dtype, we just need to
+            #  wrap in a MaskedArray
+            return self._maybe_mask_result(res_values, result_mask)
 
 
 def transpose_homogeneous_masked_arrays(

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -68,7 +68,10 @@
     deprecate_nonkeyword_arguments,
     doc,
 )
-from pandas.util._exceptions import find_stack_level
+from pandas.util._exceptions import (
+    find_stack_level,
+    rewrite_warning,
+)
 from pandas.util._validators import (
     validate_ascending,
     validate_bool_kwarg,
@@ -11369,7 +11372,20 @@ def _get_data() -> DataFrame:
                     row_index = np.tile(np.arange(nrows), ncols)
                     col_index = np.repeat(np.arange(ncols), nrows)
                     ser = Series(arr, index=col_index, copy=False)
-                    result = ser.groupby(row_index).agg(name, **kwds)
+                    # GroupBy will raise a warning with SeriesGroupBy as the object,
+                    # likely confusing users
+                    with rewrite_warning(
+                        target_message=(
+                            f"The behavior of SeriesGroupBy.{name} with all-NA values"
+                        ),
+                        target_category=FutureWarning,
+                        new_message=(
+                            f"The behavior of {type(self).__name__}.{name} with all-NA "
+                            "values, or any-NA and skipna=False, is deprecated. In "
+                            "a future version this will raise ValueError"
+                        ),
+                    ):
+                        result = ser.groupby(row_index).agg(name, **kwds)
                     result.index = df.index
                     if not skipna and name not in ("any", "all"):
                         mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1)