
Commit

Merge branch 'main' into idxmin_arrow
phofl authored Oct 3, 2023
2 parents 723b4a0 + 3bf0f64 commit 968a1a6
Showing 31 changed files with 895 additions and 89 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/wheels.yml
@@ -138,7 +138,7 @@ jobs:
run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"

- name: Build wheels
uses: pypa/cibuildwheel@v2.16.0
uses: pypa/cibuildwheel@v2.16.1
with:
package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
env:
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -84,7 +84,7 @@ repos:
'--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
]
- repo: https://github.com/pylint-dev/pylint
rev: v3.0.0a7
rev: v3.0.0b0
hooks:
- id: pylint
stages: [manual]
18 changes: 17 additions & 1 deletion asv_bench/benchmarks/algorithms.py
@@ -1,6 +1,7 @@
from importlib import import_module

import numpy as np
import pyarrow as pa

import pandas as pd

@@ -72,7 +73,16 @@ class Duplicated:
params = [
[True, False],
["first", "last", False],
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
[
"int",
"uint",
"float",
"string",
"datetime64[ns]",
"datetime64[ns, tz]",
"timestamp[ms][pyarrow]",
"duration[s][pyarrow]",
],
]
param_names = ["unique", "keep", "dtype"]

@@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype):
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"timestamp[ms][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
),
"duration[s][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
),
}[dtype]
if not unique:
data = data.repeat(5)
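For context, the new benchmark cases can be reproduced standalone; this is an illustrative sketch mirroring the diff (the value of ``N`` and the ``keep`` argument are chosen here for demonstration, not taken from the benchmark):

```python
import numpy as np
import pyarrow as pa
import pandas as pd

N = 10**5
# pyarrow-backed timestamp index, constructed as in the new benchmark case
idx = pd.Index(np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")))
dup = idx.repeat(5)  # mimic the non-unique case (unique=False)
result = dup.duplicated(keep="first")  # boolean ndarray, True for repeats
```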
1 change: 1 addition & 0 deletions doc/source/reference/extensions.rst
@@ -49,6 +49,7 @@ objects.
api.extensions.ExtensionArray.copy
api.extensions.ExtensionArray.view
api.extensions.ExtensionArray.dropna
api.extensions.ExtensionArray.duplicated
api.extensions.ExtensionArray.equals
api.extensions.ExtensionArray.factorize
api.extensions.ExtensionArray.fillna
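The newly documented method is callable on any extension array; an illustrative call (assuming a pandas build that includes this interface, i.e. 2.2 or later):

```python
import pandas as pd

arr = pd.array([1, 1, 2, pd.NA], dtype="Int64")  # a masked ExtensionArray
arr.duplicated(keep="first")  # array([False,  True, False, False])
```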
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v2.1.2.rst
@@ -15,16 +15,17 @@ Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`)
-
- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)

.. ---------------------------------------------------------------------------
.. _whatsnew_212.bug_fixes:

Bug fixes
~~~~~~~~~
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
-

.. ---------------------------------------------------------------------------
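A minimal illustration of the :issue:`55368` fix (assuming pyarrow is installed); before the fix, these calls raised for Arrow-backed dtypes:

```python
import pandas as pd

df = pd.DataFrame({"a": [2, 3, 1]}, dtype="int64[pyarrow]")
df.idxmin()  # expected: a    2  (index label of the smallest value)
df.idxmax()  # expected: a    1
```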
6 changes: 4 additions & 2 deletions doc/source/whatsnew/v2.2.0.rst
@@ -76,6 +76,7 @@ Other enhancements

- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
-
@@ -241,6 +242,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
- Performance improvement when localizing time to UTC (:issue:`55241`)

@@ -281,7 +283,7 @@ Numeric

Conversion
^^^^^^^^^^
-
- Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`)
-
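A hedged sketch of the Conversion fix above (:issue:`55346`); the printed dtype assumes the fixed behavior:

```python
import pandas as pd

ser = pd.Series([None, None], dtype="object")
converted = ser.convert_dtypes(dtype_backend="pyarrow")
print(converted.dtype)  # expected after the fix: null[pyarrow]
```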

Strings
@@ -310,7 +312,7 @@ Missing

MultiIndex
^^^^^^^^^^
-
- Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`)
-

I/O
136 changes: 114 additions & 22 deletions pandas/_libs/tslibs/offsets.pyx
@@ -4329,28 +4329,6 @@ cdef class CustomBusinessHour(BusinessHour):


cdef class _CustomBusinessMonth(BusinessMixin):
"""
DateOffset subclass representing custom business month(s).
Increments between beginning/end of month dates.
Parameters
----------
n : int, default 1
The number of months represented.
normalize : bool, default False
Normalize start/end dates to midnight before generating date range.
weekmask : str, Default 'Mon Tue Wed Thu Fri'
Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
holidays : list
List/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``.
calendar : np.busdaycalendar
Calendar to integrate.
offset : timedelta, default timedelta(0)
Time offset to apply.
"""

_attributes = tuple(
["n", "normalize", "weekmask", "holidays", "calendar", "offset"]
)
@@ -4426,10 +4404,124 @@ cdef class _CustomBusinessMonth(BusinessMixin):


cdef class CustomBusinessMonthEnd(_CustomBusinessMonth):
"""
DateOffset subclass representing custom business month(s).
Increments between end of month dates.
Parameters
----------
n : int, default 1
The number of months represented.
normalize : bool, default False
Normalize end dates to midnight before generating date range.
weekmask : str, Default 'Mon Tue Wed Thu Fri'
Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
holidays : list
List/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``.
calendar : np.busdaycalendar
Calendar to integrate.
offset : timedelta, default timedelta(0)
Time offset to apply.
See Also
--------
:class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment.
Examples
--------
In the example below we use the default parameters.
>>> ts = pd.Timestamp(2022, 8, 5)
>>> ts + pd.offsets.CustomBusinessMonthEnd()
Timestamp('2022-08-31 00:00:00')
Custom business month end can be specified by ``weekmask`` parameter.
To convert the returned datetime object to its string representation
the function strftime() is used in the next example.
>>> import datetime as dt
>>> freq = pd.offsets.CustomBusinessMonthEnd(weekmask="Wed Thu")
>>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18),
... freq=freq).strftime('%a %d %b %Y %H:%M')
Index(['Thu 28 Jul 2022 00:00', 'Wed 31 Aug 2022 00:00',
'Thu 29 Sep 2022 00:00', 'Thu 27 Oct 2022 00:00',
'Wed 30 Nov 2022 00:00'],
dtype='object')
Using NumPy business day calendar you can define custom holidays.
>>> import datetime as dt
>>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30',
... '2022-10-31', '2022-11-01'])
>>> freq = pd.offsets.CustomBusinessMonthEnd(calendar=bdc)
>>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq)
DatetimeIndex(['2022-07-29', '2022-08-31', '2022-09-29', '2022-10-28'],
dtype='datetime64[ns]', freq='CBM')
"""

_prefix = "CBM"


cdef class CustomBusinessMonthBegin(_CustomBusinessMonth):
"""
DateOffset subclass representing custom business month(s).
Increments between beginning of month dates.
Parameters
----------
n : int, default 1
The number of months represented.
normalize : bool, default False
Normalize start dates to midnight before generating date range.
weekmask : str, Default 'Mon Tue Wed Thu Fri'
Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
holidays : list
List/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``.
calendar : np.busdaycalendar
Calendar to integrate.
offset : timedelta, default timedelta(0)
Time offset to apply.
See Also
--------
:class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment.
Examples
--------
In the example below we use the default parameters.
>>> ts = pd.Timestamp(2022, 8, 5)
>>> ts + pd.offsets.CustomBusinessMonthBegin()
Timestamp('2022-09-01 00:00:00')
Custom business month start can be specified by ``weekmask`` parameter.
To convert the returned datetime object to its string representation
the function strftime() is used in the next example.
>>> import datetime as dt
>>> freq = pd.offsets.CustomBusinessMonthBegin(weekmask="Wed Thu")
>>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18),
... freq=freq).strftime('%a %d %b %Y %H:%M')
Index(['Wed 03 Aug 2022 00:00', 'Thu 01 Sep 2022 00:00',
'Wed 05 Oct 2022 00:00', 'Wed 02 Nov 2022 00:00',
'Thu 01 Dec 2022 00:00'],
dtype='object')
Using NumPy business day calendar you can define custom holidays.
>>> import datetime as dt
>>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30',
... '2022-10-31', '2022-11-01'])
>>> freq = pd.offsets.CustomBusinessMonthBegin(calendar=bdc)
>>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq)
DatetimeIndex(['2022-08-02', '2022-09-01', '2022-10-03', '2022-11-02'],
dtype='datetime64[ns]', freq='CBMS')
"""

_prefix = "CBMS"


2 changes: 2 additions & 0 deletions pandas/_typing.py
@@ -4,6 +4,7 @@
Hashable,
Iterator,
Mapping,
MutableMapping,
Sequence,
)
from datetime import (
@@ -103,6 +104,7 @@
TypeGuard: Any = None

HashableT = TypeVar("HashableT", bound=Hashable)
MutableMappingT = TypeVar("MutableMappingT", bound=MutableMapping)

# array-like

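As a sketch of what the new ``MutableMappingT`` TypeVar enables: a bound TypeVar lets a function's return annotation preserve the caller's concrete mapping type. ``make_counts`` below is a hypothetical helper for illustration, not pandas code:

```python
from collections import OrderedDict
from collections.abc import MutableMapping
from typing import TypeVar

MutableMappingT = TypeVar("MutableMappingT", bound=MutableMapping)

def make_counts(into: MutableMappingT, items: list) -> MutableMappingT:
    # Mutate and return the caller-supplied mapping; the bound TypeVar keeps
    # the concrete type (an OrderedDict stays an OrderedDict for checkers).
    for key in items:
        into[key] = into.get(key, 0) + 1
    return into

result = make_counts(OrderedDict(), ["a", "b", "a"])  # typed as OrderedDict
```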
19 changes: 7 additions & 12 deletions pandas/core/algorithms.py
@@ -55,7 +55,6 @@
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
ArrowDtype,
BaseMaskedDtype,
CategoricalDtype,
ExtensionDtype,
@@ -979,36 +978,32 @@ def value_counts_arraylike(


def duplicated(
values: ArrayLike, keep: Literal["first", "last", False] = "first"
values: ArrayLike,
keep: Literal["first", "last", False] = "first",
mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.bool_]:
"""
Return boolean ndarray denoting duplicate values.
Parameters
----------
values : nd.array, ExtensionArray or Series
values : np.ndarray or ExtensionArray
Array over which to check for duplicate values.
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first
occurrence.
- ``last`` : Mark duplicates as ``True`` except for the last
occurrence.
- False : Mark all duplicates as ``True``.
mask : ndarray[bool], optional
array indicating which elements to exclude from checking
Returns
-------
duplicated : ndarray[bool]
"""
if hasattr(values, "dtype"):
if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub":
values = values._to_masked() # type: ignore[union-attr]

if isinstance(values.dtype, BaseMaskedDtype):
values = cast("BaseMaskedArray", values)
return htable.duplicated(values._data, keep=keep, mask=values._mask)

values = _ensure_data(values)
return htable.duplicated(values, keep=keep)
return htable.duplicated(values, keep=keep, mask=mask)


def mode(
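A conceptual model of what the new ``mask`` parameter does; the real implementation lives in the ``pandas._libs.hashtable`` extension, and this pure-Python sketch assumes masked slots compare equal to one another, matching how NA values behave in ``Series.duplicated``:

```python
import numpy as np

def duplicated_first(values, mask=None):
    # keep="first" semantics: mark a slot True if an equal value was seen
    # earlier; all masked (NA) slots are treated as equal to each other.
    seen = set()
    seen_na = False
    out = np.zeros(len(values), dtype=bool)
    for i, val in enumerate(values):
        if mask is not None and mask[i]:
            out[i] = seen_na
            seen_na = True
        else:
            out[i] = val in seen
            seen.add(val)
    return out

vals = np.array([1, 0, 1, 2, 0])
mask = np.array([False, True, False, False, True])  # slots 1 and 4 are NA
duplicated_first(vals, mask)  # array([False, False,  True, False,  True])
```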
25 changes: 25 additions & 0 deletions pandas/core/arrays/arrow/array.py
@@ -45,6 +45,7 @@
from pandas.core.dtypes.missing import isna

from pandas.core import (
algorithms as algos,
missing,
roperator,
)
@@ -1292,6 +1293,30 @@ def to_numpy(
result[~mask] = data[~mask]._pa_array.to_numpy()
return result

@doc(ExtensionArray.duplicated)
def duplicated(
self, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
pa_type = self._pa_array.type
if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
values = self.to_numpy(na_value=0)
elif pa.types.is_boolean(pa_type):
values = self.to_numpy(na_value=False)
elif pa.types.is_temporal(pa_type):
if pa_type.bit_width == 32:
pa_type = pa.int32()
else:
pa_type = pa.int64()
arr = self.astype(ArrowDtype(pa_type))
values = arr.to_numpy(na_value=0)
else:
# factorize the values to avoid the performance penalty of
# converting to object dtype
values = self.factorize()[0]

mask = self.isna() if self._hasna else None
return algos.duplicated(values, keep=keep, mask=mask)

def unique(self) -> Self:
"""
Compute the ArrowExtensionArray of unique values.
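The end-to-end effect, as an illustrative usage (values chosen here for demonstration): ``Series.duplicated`` on a pyarrow-backed column now dispatches through this method and the masked ``algos.duplicated`` path instead of converting to object dtype:

```python
import pandas as pd

ser = pd.Series([1, 2, 1, None, None], dtype="int64[pyarrow]")
ser.duplicated(keep="first")
# 0    False
# 1    False
# 2     True
# 3    False
# 4     True
# dtype: bool
```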