
Merge branch 'main' into patch-1
ggold7046 authored Oct 27, 2023
2 parents f4e9d1c + beed6bc commit fa68f06
Showing 28 changed files with 431 additions and 81 deletions.
4 changes: 4 additions & 0 deletions asv_bench/benchmarks/groupby.py
@@ -73,6 +73,8 @@
"ffill",
"first",
"head",
"idxmax",
"idxmin",
"last",
"median",
"nunique",
@@ -588,6 +590,8 @@ class GroupByCythonAgg:
"prod",
"min",
"max",
"idxmin",
"idxmax",
"mean",
"median",
"var",
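The new benchmark entries exercise the Cython-accelerated groupby ``idxmin``/``idxmax`` aggregation path. Roughly what the benchmark times, as an illustrative sketch (sizes and column names here are not the actual ASV setup):

```python
import numpy as np
import pandas as pd

# A plain groupby aggregation over many small groups, which is the
# operation GroupByCythonAgg measures for each method name.
df = pd.DataFrame({
    "key": np.random.randint(0, 100, size=100_000),
    "value": np.random.randn(100_000),
})
result = df.groupby("key")["value"].agg("idxmax")  # index label of each group's max
```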
3 changes: 2 additions & 1 deletion ci/run_tests.sh
@@ -10,7 +10,8 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED

COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml"

PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET"
# TODO: Support NEP 50 and remove NPY_PROMOTION_STATE
PYTEST_CMD="NPY_PROMOTION_STATE=legacy MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET"

if [[ "$PATTERN" ]]; then
PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
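For context: NEP 50 replaces NumPy's value-based scalar promotion with dtype-based ("weak") promotion, and setting ``NPY_PROMOTION_STATE=legacy`` pins the old rules until pandas supports the new ones. A minimal illustration of the difference; exact behavior depends on the NumPy version and promotion state:

```python
import numpy as np

x = np.float32(1.0)
y = x + 1e300  # Python float scalar mixed with a float32 array scalar

# Legacy value-based promotion: the large value forces float64, y == 1e300.
# NEP 50 weak promotion: the float32 dtype wins and the addition overflows
# to inf, emitting a RuntimeWarning.
print(y, y.dtype)
```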
7 changes: 7 additions & 0 deletions doc/source/reference/general_functions.rst
@@ -73,6 +73,13 @@ Top-level evaluation

eval

Datetime formats
~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/

tseries.api.guess_datetime_format

Hashing
~~~~~~~
.. autosummary::
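Once this autosummary entry is built, the function is documented under the public ``pandas.tseries.api`` namespace. A small usage sketch; the expected output follows the docstring examples added later in this commit and should be treated as illustrative:

```python
from pandas.tseries.api import guess_datetime_format

fmt = guess_datetime_format("2023-10-27 12:30")
print(fmt)  # expected: '%Y-%m-%d %H:%M'
```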
4 changes: 2 additions & 2 deletions doc/source/user_guide/groupby.rst
@@ -517,8 +517,8 @@ listed below, those with a ``*`` do *not* have a Cython-optimized implementation
:meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups
:meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups
:meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group
-:meth:`~.DataFrameGroupBy.idxmax` *;Compute the index of the maximum value in each group
-:meth:`~.DataFrameGroupBy.idxmin` *;Compute the index of the minimum value in each group
+:meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group
+:meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group
:meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group
:meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group
:meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group
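With the ``*`` markers dropped, ``idxmax``/``idxmin`` now advertise a Cython-optimized implementation. Their semantics are unchanged; a quick sketch:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"],
                   "value": [1, 5, 3, 2]},
                  index=["w", "x", "y", "z"])

# Returns the index label (not the position) of each group's maximum.
print(df.groupby("key")["value"].idxmax())
# key
# a    x
# b    y
# Name: value, dtype: object
```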
7 changes: 6 additions & 1 deletion doc/source/whatsnew/v2.2.0.rst
@@ -78,6 +78,7 @@ Other enhancements
- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
- :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
@@ -302,6 +303,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
- Performance improvement when localizing time to UTC (:issue:`55241`)

@@ -321,6 +323,7 @@ Datetimelike
^^^^^^^^^^^^
- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`)
- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
- Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`)
- Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
- Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`)
- Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`)
@@ -358,6 +361,7 @@ Strings
Interval
^^^^^^^^
- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`)
- Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`)
- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
- Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`)
@@ -403,10 +407,11 @@ Plotting
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`)
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`)
- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
-

Reshaping
^^^^^^^^^
12 changes: 12 additions & 0 deletions pandas/_libs/groupby.pyi
@@ -181,6 +181,18 @@ def group_min(
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_idxmin_idxmax(
out: npt.NDArray[np.intp],
counts: npt.NDArray[np.int64],
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: npt.NDArray[np.intp],
min_count: int = ...,
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
name: str = ...,
skipna: bool = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_cummin(
out: np.ndarray, # groupby_t[:, ::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
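The stub mirrors the Cython kernel added below. As a shape illustration only: this is a private pandas API normally reached via ``DataFrameGroupBy.idxmax``, so the direct call here is hypothetical and may not be stable across versions:

```python
import numpy as np
from pandas._libs import groupby as libgroupby

values = np.array([[1.0, 4.0],
                   [3.0, 2.0],
                   [2.0, 5.0]])              # N=3 rows, K=2 columns
labels = np.array([0, 0, 1], dtype=np.intp)  # row -> group id, -1 means "no group"
out = np.zeros((2, 2), dtype=np.intp)        # one row per group, filled in-place
counts = np.zeros(2, dtype=np.int64)

libgroupby.group_idxmin_idxmax(out, counts, values, labels, name="idxmax")
print(out)  # row numbers of each group's per-column maximum: [[1 0], [2 2]]
```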
145 changes: 145 additions & 0 deletions pandas/_libs/groupby.pyx
@@ -1794,6 +1794,151 @@ cdef group_min_max(
)


@cython.wraparound(False)
@cython.boundscheck(False)
def group_idxmin_idxmax(
intp_t[:, ::1] out,
int64_t[::1] counts,
ndarray[numeric_object_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
str name="idxmin",
bint skipna=True,
uint8_t[:, ::1] result_mask=None,
):
"""
Compute index of minimum/maximum of columns of `values`, in row groups `labels`.

This function only computes the row number where the minimum/maximum occurs; we
take the corresponding index value after this function.

Parameters
----------
out : np.ndarray[intp, ndim=2]
Array to store result in.
counts : np.ndarray[int64]
Input as a zeroed array, populated by group sizes during algorithm
values : np.ndarray[numeric_object_t, ndim=2]
Values to find column-wise min/max of.
labels : np.ndarray[np.intp]
Labels to group by.
min_count : Py_ssize_t, default -1
The minimum number of non-NA group elements, NA result if threshold
is not met.
is_datetimelike : bool
True if `values` contains datetime-like entries.
name : {"idxmin", "idxmax"}, default "idxmin"
Whether to compute idxmin or idxmax.
mask : ndarray[bool, ndim=2], optional
If not None, True entries mark missing values;
otherwise the mask is not used.
skipna : bool, default True
If True, ignore NaN values when locating the minimum/maximum.
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.

Notes
-----
This method modifies the `out` parameter rather than returning an object.
`counts` is modified in-place to hold the group sizes.
"""
cdef:
Py_ssize_t i, j, N, K, lab
numeric_object_t val
numeric_object_t[:, ::1] group_min_or_max
bint uses_mask = mask is not None
bint isna_entry
bint compute_max = name == "idxmax"

assert name == "idxmin" or name == "idxmax"

# TODO(cython3):
# Instead of `labels.shape[0]` use `len(labels)`
if not len(values) == labels.shape[0]:
raise AssertionError("len(index) != len(labels)")

N, K = (<object>values).shape

if numeric_object_t is object:
group_min_or_max = np.empty((<object>out).shape, dtype=object)
else:
group_min_or_max = np.empty_like(out, dtype=values.dtype)
if N > 0 and K > 0:
# When N or K is zero, we never use group_min_or_max
group_min_or_max[:] = _get_min_or_max(
values[0, 0], compute_max, is_datetimelike
)

# When using transform, we need a valid value for take in the case
# a category is not observed; these values will be dropped
out[:] = 0

# TODO(cython3): De-duplicate once conditional-nogil is available
if numeric_object_t is object:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

for j in range(K):
if not skipna and out[lab, j] == -1:
# Once we've hit NA there is no going back
continue
val = values[i, j]

if uses_mask:
isna_entry = mask[i, j]
else:
# TODO(cython3): use _treat_as_na here
isna_entry = checknull(val)

if isna_entry:
if not skipna:
out[lab, j] = -1
else:
if compute_max:
if val > group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i
else:
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i
else:
with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

for j in range(K):
if not skipna and out[lab, j] == -1:
# Once we've hit NA there is no going back
continue
val = values[i, j]

if uses_mask:
isna_entry = mask[i, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if isna_entry:
if not skipna:
out[lab, j] = -1
else:
if compute_max:
if val > group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i
else:
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
out[lab, j] = i


@cython.wraparound(False)
@cython.boundscheck(False)
def group_max(
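For readers who don't speak Cython, here is a pure-Python sketch of what the kernel computes for a single column with ``skipna=True``; the real function additionally handles 2-D values, NA masks, and the ``skipna=False`` early exit:

```python
import numpy as np

def naive_group_idxmax(values, labels, ngroups):
    # For each group, record the row number of the first maximal value;
    # callers later translate row numbers into index labels.
    out = np.zeros(ngroups, dtype=np.intp)
    best = np.full(ngroups, -np.inf)
    for i, (lab, val) in enumerate(zip(labels, values)):
        if lab < 0:           # label -1 means the row belongs to no group
            continue
        if val > best[lab]:   # strict '>' keeps the first occurrence on ties
            best[lab] = val
            out[lab] = i
    return out

values = np.array([1.0, 5.0, 3.0, 5.0])
labels = np.array([0, 0, 1, 1])
print(naive_group_idxmax(values, labels, ngroups=2))  # [1 3]
```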
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/__init__.py
@@ -33,6 +33,7 @@
"is_supported_unit",
"npy_unit_to_abbrev",
"get_supported_reso",
"guess_datetime_format",
]

from pandas._libs.tslibs import dtypes # pylint: disable=import-self
@@ -63,6 +64,7 @@
Tick,
to_offset,
)
from pandas._libs.tslibs.parsing import guess_datetime_format
from pandas._libs.tslibs.period import (
IncompatibleFrequency,
Period,
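This re-export is what the public ``pandas.tseries.api`` path ultimately resolves to. A quick sanity sketch; the identity check is an expectation based on both paths importing from ``pandas._libs.tslibs.parsing``, not a documented guarantee:

```python
from pandas._libs.tslibs import guess_datetime_format as private
from pandas.tseries.api import guess_datetime_format as public

print(private is public)  # expected True: one function, two import paths
```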
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/parsing.pyi
@@ -24,7 +24,7 @@ def try_parse_dates(
parser,
) -> npt.NDArray[np.object_]: ...
def guess_datetime_format(
-dt_str,
+dt_str: str,
dayfirst: bool | None = ...,
) -> str | None: ...
def concat_date_cols(
14 changes: 12 additions & 2 deletions pandas/_libs/tslibs/parsing.pyx
@@ -855,14 +855,24 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
Datetime string to guess the format of.
dayfirst : bool, default False
If True parses dates with the day first, e.g. 20/01/2005
-Warning: dayfirst=True is not strict, but will prefer to parse
-with day first (this is a known bug).
+
+.. warning::
+    dayfirst=True is not strict, but will prefer to parse
+    with day first (this is a known bug).

Returns
-------
str or None : ret
datetime format string (for `strftime` or `strptime`),
or None if it can't be guessed.

Examples
--------
>>> from pandas.tseries.api import guess_datetime_format
>>> guess_datetime_format('09/13/2023')
'%m/%d/%Y'

>>> guess_datetime_format('2023|September|13')
"""
cdef:
NPY_DATETIMEUNIT out_bestunit
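A common pattern the now-public function enables: guess the format once, then parse a whole column strictly with it. Illustrative sketch:

```python
import pandas as pd
from pandas.tseries.api import guess_datetime_format

fmt = guess_datetime_format("13/09/2023", dayfirst=True)  # expected '%d/%m/%Y'
parsed = pd.to_datetime(["13/09/2023", "14/09/2023"], format=fmt)
print(parsed)
```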
22 changes: 12 additions & 10 deletions pandas/core/arrays/_mixins.py
@@ -13,6 +13,10 @@

from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.tslibs import (
get_unit_from_dtype,
is_supported_unit,
)
from pandas._typing import (
ArrayLike,
AxisInt,
@@ -137,21 +141,19 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:
cls = dtype.construct_array_type() # type: ignore[assignment]
dt64_values = arr.view(f"M8[{dtype.unit}]")
return cls(dt64_values, dtype=dtype)
elif dtype == "M8[ns]":
elif lib.is_np_dtype(dtype, "M") and is_supported_unit(
get_unit_from_dtype(dtype)
):
from pandas.core.arrays import DatetimeArray

-# error: Argument 1 to "view" of "ndarray" has incompatible type
-# "ExtensionDtype | dtype[Any]"; expected "dtype[Any] | type[Any]
-# | _SupportsDType[dtype[Any]]"
-dt64_values = arr.view(dtype)  # type: ignore[arg-type]
+dt64_values = arr.view(dtype)
return DatetimeArray(dt64_values, dtype=dtype)
elif dtype == "m8[ns]":
elif lib.is_np_dtype(dtype, "m") and is_supported_unit(
get_unit_from_dtype(dtype)
):
from pandas.core.arrays import TimedeltaArray

-# error: Argument 1 to "view" of "ndarray" has incompatible type
-# "ExtensionDtype | dtype[Any]"; expected "dtype[Any] | type[Any]
-# | _SupportsDType[dtype[Any]]"
-td64_values = arr.view(dtype)  # type: ignore[arg-type]
+td64_values = arr.view(dtype)
return TimedeltaArray(td64_values, dtype=dtype)

# error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
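The widened checks let ``view`` target any supported datetime64/timedelta64 unit (s, ms, us, ns) instead of only nanoseconds; this backs the GH 55710 fix noted in the whatsnew above. A sketch, assuming pandas 2.2+:

```python
import pandas as pd

idx = pd.Index([0, 1, 2], dtype="int64")

# Previously only "M8[ns]" took the DatetimeArray path; a non-nano unit
# now also produces a datetime view instead of raising.
print(idx.view("M8[s]"))  # datetime64[s] values starting at the Unix epoch
```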
16 changes: 13 additions & 3 deletions pandas/core/arrays/categorical.py
@@ -2701,12 +2701,22 @@ def _groupby_op(
dtype = self.dtype
if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
raise TypeError(f"{dtype} type does not support {how} operations")
if how in ["min", "max", "rank"] and not dtype.ordered:
if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
# raise TypeError instead of NotImplementedError to ensure we
# don't go down a group-by-group path, since in the empty-groups
# case that would fail to raise
raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
if how not in ["rank", "any", "all", "first", "last", "min", "max"]:
if how not in [
"rank",
"any",
"all",
"first",
"last",
"min",
"max",
"idxmin",
"idxmax",
]:
if kind == "transform":
raise TypeError(f"{dtype} type does not support {how} operations")
raise TypeError(f"{dtype} dtype does not support aggregation '{how}'")
@@ -2716,7 +2726,7 @@ def _groupby_op(
if how == "rank":
assert self.ordered # checked earlier
npvalues = self._ndarray
elif how in ["first", "last", "min", "max"]:
elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]:
npvalues = self._ndarray
result_mask = np.zeros(ngroups, dtype=bool)
else:
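Net effect of the categorical change: ``idxmin``/``idxmax`` on an ordered categorical take the fast Cython path, while unordered categoricals raise up front. Illustrative sketch:

```python
import pandas as pd

df = pd.DataFrame({
    "key": ["a", "a", "b"],
    "cat": pd.Categorical(["low", "high", "low"],
                          categories=["low", "high"], ordered=True),
})
print(df.groupby("key")["cat"].idxmax())  # a -> 1, b -> 2

# With ordered=False the same call raises:
# TypeError: Cannot perform idxmax with non-ordered Categorical
```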
(Diff truncated; the remaining 16 changed files are not shown.)
