pandas-dev · mroeschke · Oct 27, 2023 · Jul 19, 2023 · Jul 23, 2023 · Jul 23, 2023
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -511,8 +511,8 @@ listed below, those with a ``*`` do *not* have a Cython-optimized implementation
         :meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups
         :meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups
         :meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group
-        :meth:`~.DataFrameGroupBy.idxmax` *;Compute the index of the maximum value in each group
-        :meth:`~.DataFrameGroupBy.idxmin` *;Compute the index of the minimum value in each group
+        :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group
+        :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group
         :meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group
         :meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group
         :meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -416,7 +416,7 @@ Performance improvements
 - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`)
 - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`)
 - Performance improvement when passing an array to :meth:`RangeIndex.take`, :meth:`DataFrame.loc`, or :meth:`DataFrame.iloc` and the DataFrame is using a RangeIndex (:issue:`53387`)
--
+- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.bug_fixes:
@@ -574,6 +574,7 @@ Groupby/resample/rolling
 - Bug in :meth:`GroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`)
 - Bug in :meth:`DataFrameGroupby.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`)
 - Bug in :meth:`Resampler.ohlc` with empty object returning a :class:`Series` instead of empty :class:`DataFrame` (:issue:`42902`)
+- Bug in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` would fail if the groupings have unobserved categories (:issue:`10694`)
 - Bug in :meth:`SeriesGroupBy.nth` and :meth:`DataFrameGroupBy.nth` after performing column selection when using ``dropna="any"`` or ``dropna="all"`` would not subset columns (:issue:`53518`)
 - Bug in :meth:`SeriesGroupBy.nth` and :meth:`DataFrameGroupBy.nth` raised after performing column selection when using ``dropna="any"`` or ``dropna="all"`` resulted in rows being dropped (:issue:`53518`)
 - Bug in :meth:`SeriesGroupBy.sum` and :meth:`DataFrameGroupby.sum` summing ``np.inf + np.inf`` and ``(-np.inf) + (-np.inf)`` to ``np.nan`` (:issue:`53606`)

@@ -1780,6 +1780,96 @@ cdef group_min_max(
         )
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef group_idxmin_idxmax(
+    int64_t[:, ::1] out,
+    int64_t[::1] counts,
+    ndarray[numeric_t, ndim=2] values,
+    const intp_t[::1] labels,
+    Py_ssize_t min_count=-1,
+    bint is_datetimelike=False,
+    bint compute_max=True,
+    const uint8_t[:, ::1] mask=None,
+    uint8_t[:, ::1] result_mask=None,
+):
+    """
+    Compute the row position of each column's minimum/maximum within each group.
+
+    This function only computes the row number where the minimum/maximum occurs; the
+    corresponding index value is taken by the caller after this function.
+
+    Parameters
+    ----------
+    out : np.ndarray[int64_t, ndim=2]
+        Array to store the resulting row numbers in.
+    counts : np.ndarray[int64]
+        Input as a zeroed array, populated by group sizes during algorithm.
+    values : array
+        Values to find column-wise min/max of.
+    labels : np.ndarray[np.intp]
+        Labels to group by.
+    min_count : Py_ssize_t, default -1
+        Unused. For compatibility with other Cython ops.
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
+    compute_max : bint, default True
+        True to compute group-wise max, False to compute min.
+    mask : ndarray[bool, ndim=2], optional
+        If not None, indices represent missing values,
+        otherwise the mask will not be used.
+    result_mask : ndarray[bool, ndim=2], optional
+        NOTE(review): documented as marking NA output locations, but this function
+        never reads or writes it.
+
+    Notes
+    -----
+    `out` and `counts` are modified in place; nothing is returned. An entry of `out`
+    is only written when a non-NA value strictly beats the running min/max.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        numeric_t val
+        numeric_t[:, ::1] group_min_or_max
+        bint uses_mask = mask is not None
+        bint isna_entry
+
+    # TODO(cython3):
+    # Instead of `labels.shape[0]` use `len(labels)`
+    if not len(values) == labels.shape[0]:
+        raise AssertionError("len(index) != len(labels)")
+
+    group_min_or_max = np.empty_like(out, dtype=values.dtype)
+    group_min_or_max[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
+
+    N, K = (<object>values).shape  # N rows, K columns
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:  # rows with a negative label are skipped and not counted
+                continue
+
+            counts[lab] += 1  # counted before the NA check, so NA rows count too
+            for j in range(K):
+                val = values[i, j]
+
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                else:
+                    isna_entry = _treat_as_na(val, is_datetimelike)
+
+                if not isna_entry:
+                    if compute_max:
+                        if val > group_min_or_max[lab, j]:  # strict: first tie wins
+                            group_min_or_max[lab, j] = val
+                            out[lab, j] = i
+                    else:
+                        if val < group_min_or_max[lab, j]:  # strict: first tie wins
+                            group_min_or_max[lab, j] = val
+                            out[lab, j] = i
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_max(
@@ -1806,6 +1896,32 @@ def group_max(
     )
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_idxmax(
+    int64_t[:, ::1] out,
+    int64_t[::1] counts,
+    ndarray[numeric_t, ndim=2] values,
+    const intp_t[::1] labels,
+    Py_ssize_t min_count=-1,
+    bint is_datetimelike=False,
+    const uint8_t[:, ::1] mask=None,
+    uint8_t[:, ::1] result_mask=None,
+) -> None:
+    """Row positions of each group's maximum; see group_idxmin_idxmax.__doc__"""
+    group_idxmin_idxmax(
+        out,
+        counts,
+        values,
+        labels,
+        min_count,
+        is_datetimelike,
+        compute_max=True,
+        mask=mask,
+        result_mask=result_mask,
+    )
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_min(
@@ -1832,6 +1948,32 @@ def group_min(
     )
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_idxmin(
+    int64_t[:, ::1] out,
+    int64_t[::1] counts,
+    ndarray[numeric_t, ndim=2] values,
+    const intp_t[::1] labels,
+    Py_ssize_t min_count=-1,
+    bint is_datetimelike=False,
+    const uint8_t[:, ::1] mask=None,
+    uint8_t[:, ::1] result_mask=None,
+) -> None:
+    """Row positions of each group's minimum; see group_idxmin_idxmax.__doc__"""
+    group_idxmin_idxmax(
+        out,
+        counts,
+        values,
+        labels,
+        min_count,
+        is_datetimelike,
+        compute_max=False,
+        mask=mask,
+        result_mask=result_mask,
+    )
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 cdef group_cummin_max(

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2679,12 +2679,22 @@ def _groupby_op(
         dtype = self.dtype
         if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
             raise TypeError(f"{dtype} type does not support {how} operations")
-        if how in ["min", "max", "rank"] and not dtype.ordered:
+        if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
             # raise TypeError instead of NotImplementedError to ensure we
             #  don't go down a group-by-group path, since in the empty-groups
             #  case that would fail to raise
             raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
-        if how not in ["rank", "any", "all", "first", "last", "min", "max"]:
+        if how not in [
+            "rank",
+            "any",
+            "all",
+            "first",
+            "last",
+            "min",
+            "max",
+            "idxmin",
+            "idxmax",
+        ]:
             if kind == "transform":
                 raise TypeError(f"{dtype} type does not support {how} operations")
             raise TypeError(f"{dtype} dtype does not support aggregation '{how}'")

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1185,15 +1185,33 @@ def nsmallest(
     def idxmin(
         self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
     ) -> Series:
-        result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
-        return result.astype(self.obj.index.dtype) if result.empty else result
+        if axis is not lib.no_default:  # axis was passed explicitly
+            axis = self.obj._get_axis_number(axis)
+            self._deprecate_axis(axis, "idxmin")  # warn under this method's own name
+        if axis == 1:  # axis=1 keeps the previous apply-based path
+            result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
+            if result.empty:  # empty result: cast to the index dtype
+                result = result.astype(self.obj.index.dtype)
+        else:  # otherwise dispatch through _idxmax_idxmin
+            alt = lambda x: x.idxmin(skipna=skipna)
+            result = self._idxmax_idxmin("idxmin", alt)
+        return result
 
     @doc(Series.idxmax.__doc__)
     def idxmax(
         self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
     ) -> Series:
-        result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
-        return result.astype(self.obj.index.dtype) if result.empty else result
+        if axis is not lib.no_default:  # axis was passed explicitly
+            axis = self.obj._get_axis_number(axis)
+            self._deprecate_axis(axis, "idxmax")
+        if axis == 1:  # axis=1 keeps the previous apply-based path
+            result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
+            if result.empty:  # empty result: cast to the index dtype
+                result = result.astype(self.obj.index.dtype)
+        else:  # otherwise dispatch through _idxmax_idxmin
+            alt = lambda x: x.idxmax(skipna=skipna)
+            result = self._idxmax_idxmin("idxmax", alt)
+        return result
 
     @doc(Series.corr.__doc__)
     def corr(
@@ -2195,14 +2213,21 @@ def idxmax(
         else:
             axis = self.axis
 
-        def func(df):
-            return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)
+        if axis == 1:
 
-        func.__name__ = "idxmax"
-        result = self._python_apply_general(
-            func, self._obj_with_exclusions, not_indexed_same=True
-        )
-        return result.astype(self.obj.index.dtype) if result.empty else result
+            def func(df):
+                return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)
+
+            func.__name__ = "idxmax"
+            result = self._python_apply_general(
+                func, self._obj_with_exclusions, not_indexed_same=True
+            )
+            if result.empty:
+                result = result.astype(self.obj.index.dtype)
+        else:
+            alt = lambda x: x.idxmax(skipna=skipna)
+            result = self._idxmax_idxmin("idxmax", alt, numeric_only)
+        return result
 
     def idxmin(
         self,
@@ -2290,14 +2315,21 @@ def idxmin(
         else:
             axis = self.axis
 
-        def func(df):
-            return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)
+        if axis == 1:
 
-        func.__name__ = "idxmin"
-        result = self._python_apply_general(
-            func, self._obj_with_exclusions, not_indexed_same=True
-        )
-        return result.astype(self.obj.index.dtype) if result.empty else result
+            def func(df):
+                return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)
+
+            func.__name__ = "idxmin"
+            result = self._python_apply_general(
+                func, self._obj_with_exclusions, not_indexed_same=True
+            )
+            if result.empty:
+                result = result.astype(self.obj.index.dtype)
+        else:
+            alt = lambda x: x.idxmin(skipna=skipna)
+            result = self._idxmax_idxmin("idxmin", alt, numeric_only)
+        return result
 
     boxplot = boxplot_frame_groupby