
Merge branch 'main' into patch-1
ggold7046 authored Oct 27, 2023
2 parents f4e9d1c + beed6bc commit fa68f06
Showing 28 changed files with 431 additions and 81 deletions.
4 changes: 4 additions & 0 deletions asv_bench/benchmarks/groupby.py
@@ -73,6 +73,8 @@
"ffill",
"first",
"head",
"idxmax",
"idxmin",
"last",
"median",
"nunique",
@@ -588,6 +590,8 @@ class GroupByCythonAgg:
"prod",
"min",
"max",
"idxmin",
"idxmax",
"mean",
"median",
"var",
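The new benchmark entries exercise the Cython-accelerated groupby ``idxmin``/``idxmax`` aggregation path. Roughly what the benchmark times, as an illustrative sketch (sizes and column names here are not the actual ASV setup):

```python
import numpy as np
import pandas as pd

# A plain groupby aggregation over many small groups, which is the
# operation GroupByCythonAgg measures for each method name.
df = pd.DataFrame({
    "key": np.random.randint(0, 100, size=100_000),
    "value": np.random.randn(100_000),
})
result = df.groupby("key")["value"].agg("idxmax")  # index label of each group's max
```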
3 changes: 2 additions & 1 deletion ci/run_tests.sh
@@ -10,7 +10,8 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED

COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml"

PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET"
# TODO: Support NEP 50 and remove NPY_PROMOTION_STATE
PYTEST_CMD="NPY_PROMOTION_STATE=legacy MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET"

if [[ "$PATTERN" ]]; then
PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
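For context: NEP 50 replaces NumPy's value-based scalar promotion with dtype-based ("weak") promotion, and setting ``NPY_PROMOTION_STATE=legacy`` pins the old rules until pandas supports the new ones. A minimal illustration of the difference; exact behavior depends on the NumPy version and promotion state:

```python
import numpy as np

x = np.float32(1.0)
y = x + 1e300  # Python float scalar mixed with a float32 array scalar

# Legacy value-based promotion: the large value forces float64, y == 1e300.
# NEP 50 weak promotion: the float32 dtype wins and the addition overflows
# to inf, emitting a RuntimeWarning.
print(y, y.dtype)
```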
7 changes: 7 additions & 0 deletions doc/source/reference/general_functions.rst
@@ -73,6 +73,13 @@ Top-level evaluation

eval

Datetime formats
~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/

tseries.api.guess_datetime_format

Hashing
~~~~~~~
.. autosummary::
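Once this autosummary entry is built, the function is documented under the public ``pandas.tseries.api`` namespace. A small usage sketch; the expected output follows the docstring examples added later in this commit and should be treated as illustrative:

```python
from pandas.tseries.api import guess_datetime_format

fmt = guess_datetime_format("2023-10-27 12:30")
print(fmt)  # expected: '%Y-%m-%d %H:%M'
```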
4 changes: 2 additions & 2 deletions doc/source/user_guide/groupby.rst
@@ -517,8 +517,8 @@ listed below, those with a ``*`` do *not* have a Cython-optimized implementation
:meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups
:meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups
:meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group
-:meth:`~.DataFrameGroupBy.idxmax` *;Compute the index of the maximum value in each group
-:meth:`~.DataFrameGroupBy.idxmin` *;Compute the index of the minimum value in each group
+:meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group
+:meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group
:meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group
:meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group
:meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group
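With the ``*`` markers dropped, ``idxmax``/``idxmin`` now advertise a Cython-optimized implementation. Their semantics are unchanged; a quick sketch:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"],
                   "value": [1, 5, 3, 2]},
                  index=["w", "x", "y", "z"])

# Returns the index label (not the position) of each group's maximum.
print(df.groupby("key")["value"].idxmax())
# key
# a    x
# b    y
# Name: value, dtype: object
```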
7 changes: 6 additions & 1 deletion doc/source/whatsnew/v2.2.0.rst
@@ -78,6 +78,7 @@ Other enhancements
- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
- :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
@@ -302,6 +303,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
- Performance improvement when localizing time to UTC (:issue:`55241`)

@@ -321,6 +323,7 @@ Datetimelike
^^^^^^^^^^^^
- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`)
- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
- Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`)
- Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
- Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`)
- Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`)
@@ -358,6 +361,7 @@ Strings
Interval
^^^^^^^^
- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`)
- Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`)
- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
- Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`)
@@ -403,10 +407,11 @@ Plotting
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`)
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`)
- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
-

Reshaping
^^^^^^^^^
12 changes: 12 additions & 0 deletions pandas/_libs/groupby.pyi
@@ -181,6 +181,18 @@ def group_min(
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_idxmin_idxmax(
out: npt.NDArray[np.intp],
counts: npt.NDArray[np.int64],
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: npt.NDArray[np.intp],
min_count: int = ...,
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
name: str = ...,
skipna: bool = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_cummin(
out: np.ndarray, # groupby_t[:, ::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
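The stub mirrors the Cython kernel added below. As a shape illustration only: this is a private pandas API normally reached via ``DataFrameGroupBy.idxmax``, so the direct call here is hypothetical and may not be stable across versions:

```python
import numpy as np
from pandas._libs import groupby as libgroupby

values = np.array([[1.0, 4.0],
                   [3.0, 2.0],
                   [2.0, 5.0]])              # N=3 rows, K=2 columns
labels = np.array([0, 0, 1], dtype=np.intp)  # row -> group id, -1 means "no group"
out = np.zeros((2, 2), dtype=np.intp)        # one row per group, filled in-place
counts = np.zeros(2, dtype=np.int64)

libgroupby.group_idxmin_idxmax(out, counts, values, labels, name="idxmax")
print(out)  # row numbers of each group's per-column maximum: [[1 0], [2 2]]
```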
145 changes: 145 additions & 0 deletions pandas/_libs/groupby.pyx
@@ -1794,6 +1794,151 @@ cdef group_min_max(
)


@cython.wraparound(False)
@cython.boundscheck(False)
def group_idxmin_idxmax(
intp_t[:, ::1] out,
int64_t[::1] counts,
ndarray[numeric_object_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
str name="idxmin",
bint skipna=True,
uint8_t[:, ::1] result_mask=None,
):
"""
Compute index of minimum/maximum of columns of `values`, in row groups `labels`.

This function only computes the row number where the minimum/maximum occurs; we
take the corresponding index value after this function.

Parameters
----------
out : np.ndarray[intp, ndim=2]
Array to store result in.
counts : np.ndarray[int64]
Input as a zeroed array, populated by group sizes during algorithm
values : np.ndarray[numeric_object_t, ndim=2]
Values to find column-wise min/max of.
labels : np.ndarray[np.intp]
Labels to group by.
min_count : Py_ssize_t, default -1
The minimum number of non-NA group elements, NA result if threshold
is not met.
is_datetimelike : bool
True if `values` contains datetime-like entries.
name : {"idxmin", "idxmax"}, default "idxmin"
Whether to compute idxmin or idxmax.
mask : ndarray[bool, ndim=2], optional
If not None, True entries mark missing values;
otherwise the mask is not used.
skipna : bool, default True
If True, ignore NaN values when locating the minimum/maximum.
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.

Notes
-----
This method modifies the `out` parameter rather than returning an object.
`counts` is modified in-place to hold the group sizes.
"""
cdef:
Py_ssize_t i, j, N, K, lab
numeric_object_t val
numeric_object_t[:, ::1] group_min_or_max
bint uses_mask = mask is not None
bint isna_entry
bint compute_max = name == "idxmax"

assert name == "idxmin" or name == "idxmax"

# TODO(cython3):
# Instead of `labels.shape[0]` use `len(labels)`
if not len(values) == labels.shape[0]:
raise AssertionError("len(index) != len(labels)")

N, K = (<object>values).shape

if numeric_object_t is object:
group_min_or_max = np.empty((<object>out).shape, dtype=object)
else:
group_min_or_max = np.empty_like(out, dtype=values.dtype)
if N > 0 and K > 0:
# When N or K is zero, we never use group_min_or_max
group_min_or_max[:] = _get_min_or_max(
values[0, 0], compute_max, is_datetimelike
)

# When using transform, we need a valid value for take in the case
# a category is not observed; these values will be dropped
out[:] = 0

# TODO(cython3): De-duplicate once conditional-nogil is available
if numeric_object_t is object:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

for j in range(K):
if not skipna and out[lab, j] == -1:
# Once we've hit NA there is no going back
continue
val = values[i, j]

if uses_mask:
isna_entry = mask[i, j]
else:
# TODO(cython3): use _treat_as_na here
isna_entry = checknull(val)

if isna_entry:
if not skipna:
out[lab, j] = -1
else:
if compute_max:
if val > group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i
else:
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i
else:
with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

for j in range(K):
if not skipna and out[lab, j] == -1:
# Once we've hit NA there is no going back
continue
val = values[i, j]

if uses_mask:
isna_entry = mask[i, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if isna_entry:
if not skipna:
out[lab, j] = -1
else:
if compute_max:
if val > group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i
else:
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i


@cython.wraparound(False)
@cython.boundscheck(False)
def group_max(
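For readers who don't speak Cython, here is a pure-Python sketch of what the kernel computes for a single column with ``skipna=True``; the real function additionally handles 2-D values, NA masks, and the ``skipna=False`` early exit:

```python
import numpy as np

def naive_group_idxmax(values, labels, ngroups):
    # For each group, record the row number of the first maximal value;
    # callers later translate row numbers into index labels.
    out = np.zeros(ngroups, dtype=np.intp)
    best = np.full(ngroups, -np.inf)
    for i, (lab, val) in enumerate(zip(labels, values)):
        if lab < 0:           # label -1 means the row belongs to no group
            continue
        if val > best[lab]:   # strict '>' keeps the first occurrence on ties
            best[lab] = val
            out[lab] = i
    return out

values = np.array([1.0, 5.0, 3.0, 5.0])
labels = np.array([0, 0, 1, 1])
print(naive_group_idxmax(values, labels, ngroups=2))  # [1 3]
```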
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/__init__.py
@@ -33,6 +33,7 @@
"is_supported_unit",
"npy_unit_to_abbrev",
"get_supported_reso",
"guess_datetime_format",
]

from pandas._libs.tslibs import dtypes # pylint: disable=import-self
@@ -63,6 +64,7 @@
Tick,
to_offset,
)
from pandas._libs.tslibs.parsing import guess_datetime_format
from pandas._libs.tslibs.period import (
IncompatibleFrequency,
Period,
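This re-export is what the public ``pandas.tseries.api`` path ultimately resolves to. A quick sanity sketch; the identity check is an expectation based on both paths importing from ``pandas._libs.tslibs.parsing``, not a documented guarantee:

```python
from pandas._libs.tslibs import guess_datetime_format as private
from pandas.tseries.api import guess_datetime_format as public

print(private is public)  # expected True: one function, two import paths
```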
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/parsing.pyi
@@ -24,7 +24,7 @@ def try_parse_dates(
parser,
) -> npt.NDArray[np.object_]: ...
def guess_datetime_format(
-dt_str,
+dt_str: str,
dayfirst: bool | None = ...,
) -> str | None: ...
def concat_date_cols(
14 changes: 12 additions & 2 deletions pandas/_libs/tslibs/parsing.pyx
@@ -855,14 +855,24 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
Datetime string to guess the format of.
dayfirst : bool, default False
If True parses dates with the day first, e.g. 20/01/2005
-Warning: dayfirst=True is not strict, but will prefer to parse
-with day first (this is a known bug).
+
+.. warning::
+    dayfirst=True is not strict, but will prefer to parse
+    with day first (this is a known bug).

Returns
-------
str or None : ret
datetime format string (for `strftime` or `strptime`),
or None if it can't be guessed.

Examples
--------
>>> from pandas.tseries.api import guess_datetime_format
>>> guess_datetime_format('09/13/2023')
'%m/%d/%Y'

>>> guess_datetime_format('2023|September|13')
"""
cdef:
NPY_DATETIMEUNIT out_bestunit
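A common pattern the now-public function enables: guess the format once, then parse a whole column strictly with it. Illustrative sketch:

```python
import pandas as pd
from pandas.tseries.api import guess_datetime_format

fmt = guess_datetime_format("13/09/2023", dayfirst=True)  # expected '%d/%m/%Y'
parsed = pd.to_datetime(["13/09/2023", "14/09/2023"], format=fmt)
print(parsed)
```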
22 changes: 12 additions & 10 deletions pandas/core/arrays/_mixins.py
@@ -13,6 +13,10 @@

from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.tslibs import (
get_unit_from_dtype,
is_supported_unit,
)
from pandas._typing import (
ArrayLike,
AxisInt,
@@ -137,21 +141,19 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:
cls = dtype.construct_array_type() # type: ignore[assignment]
dt64_values = arr.view(f"M8[{dtype.unit}]")
return cls(dt64_values, dtype=dtype)
elif dtype == "M8[ns]":
elif lib.is_np_dtype(dtype, "M") and is_supported_unit(
get_unit_from_dtype(dtype)
):
from pandas.core.arrays import DatetimeArray

-# error: Argument 1 to "view" of "ndarray" has incompatible type
-# "ExtensionDtype | dtype[Any]"; expected "dtype[Any] | type[Any]
-# | _SupportsDType[dtype[Any]]"
-dt64_values = arr.view(dtype)  # type: ignore[arg-type]
+dt64_values = arr.view(dtype)
return DatetimeArray(dt64_values, dtype=dtype)
elif dtype == "m8[ns]":
elif lib.is_np_dtype(dtype, "m") and is_supported_unit(
get_unit_from_dtype(dtype)
):
from pandas.core.arrays import TimedeltaArray

-# error: Argument 1 to "view" of "ndarray" has incompatible type
-# "ExtensionDtype | dtype[Any]"; expected "dtype[Any] | type[Any]
-# | _SupportsDType[dtype[Any]]"
-td64_values = arr.view(dtype)  # type: ignore[arg-type]
+td64_values = arr.view(dtype)
return TimedeltaArray(td64_values, dtype=dtype)

# error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
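The widened checks let ``view`` target any supported datetime64/timedelta64 unit (s, ms, us, ns) instead of only nanoseconds; this backs the GH 55710 fix noted in the whatsnew above. A sketch, assuming pandas 2.2+:

```python
import pandas as pd

idx = pd.Index([0, 1, 2], dtype="int64")

# Previously only "M8[ns]" took the DatetimeArray path; a non-nano unit
# now also produces a datetime view instead of raising.
print(idx.view("M8[s]"))  # datetime64[s] values starting at the Unix epoch
```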
16 changes: 13 additions & 3 deletions pandas/core/arrays/categorical.py
@@ -2701,12 +2701,22 @@ def _groupby_op(
dtype = self.dtype
if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
raise TypeError(f"{dtype} type does not support {how} operations")
if how in ["min", "max", "rank"] and not dtype.ordered:
if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
# raise TypeError instead of NotImplementedError to ensure we
# don't go down a group-by-group path, since in the empty-groups
# case that would fail to raise
raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
if how not in ["rank", "any", "all", "first", "last", "min", "max"]:
if how not in [
"rank",
"any",
"all",
"first",
"last",
"min",
"max",
"idxmin",
"idxmax",
]:
if kind == "transform":
raise TypeError(f"{dtype} type does not support {how} operations")
raise TypeError(f"{dtype} dtype does not support aggregation '{how}'")
@@ -2716,7 +2726,7 @@ def _groupby_op(
if how == "rank":
assert self.ordered # checked earlier
npvalues = self._ndarray
elif how in ["first", "last", "min", "max"]:
elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]:
npvalues = self._ndarray
result_mask = np.zeros(ngroups, dtype=bool)
else:
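Net effect of the categorical change: ``idxmin``/``idxmax`` on an ordered categorical take the fast Cython path, while unordered categoricals raise up front. Illustrative sketch:

```python
import pandas as pd

df = pd.DataFrame({
    "key": ["a", "a", "b"],
    "cat": pd.Categorical(["low", "high", "low"],
                          categories=["low", "high"], ordered=True),
})
print(df.groupby("key")["cat"].idxmax())  # a -> 1, b -> 2

# With ordered=False the same call raises:
# TypeError: Cannot perform idxmax with non-ordered Categorical
```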
(Diff truncated; the remaining 16 changed files are not shown.)
