Skip to content

Commit

Permalink
ENH/PERF: add ExtensionArray.duplicated (#55255)
Browse files Browse the repository at this point in the history
* PERF: Series.duplicated for pyarrow timestamp and duration types

* whatsnew

* fix setup

* add ExtensionArray.duplicated

* fix

* simplify

* add SparseArray.duplicated

* simplify

* docs

* pass mask

* mypy

* use mask

* add optional to docstring

* revert asv change
  • Loading branch information
lukemanley authored Oct 3, 2023
1 parent 6e6a683 commit 3bf0f64
Show file tree
Hide file tree
Showing 10 changed files with 112 additions and 14 deletions.
18 changes: 17 additions & 1 deletion asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from importlib import import_module

import numpy as np
import pyarrow as pa

import pandas as pd

Expand Down Expand Up @@ -72,7 +73,16 @@ class Duplicated:
params = [
[True, False],
["first", "last", False],
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
[
"int",
"uint",
"float",
"string",
"datetime64[ns]",
"datetime64[ns, tz]",
"timestamp[ms][pyarrow]",
"duration[s][pyarrow]",
],
]
param_names = ["unique", "keep", "dtype"]

Expand All @@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype):
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"timestamp[ms][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
),
"duration[s][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
),
}[dtype]
if not unique:
data = data.repeat(5)
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/extensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ objects.
api.extensions.ExtensionArray.copy
api.extensions.ExtensionArray.view
api.extensions.ExtensionArray.dropna
api.extensions.ExtensionArray.duplicated
api.extensions.ExtensionArray.equals
api.extensions.ExtensionArray.factorize
api.extensions.ExtensionArray.fillna
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ Other enhancements

- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
-
Expand Down Expand Up @@ -241,6 +242,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
- Performance improvement when localizing time to UTC (:issue:`55241`)

Expand Down
19 changes: 7 additions & 12 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
ArrowDtype,
BaseMaskedDtype,
CategoricalDtype,
ExtensionDtype,
Expand Down Expand Up @@ -979,36 +978,32 @@ def value_counts_arraylike(


def duplicated(
    values: ArrayLike,
    keep: Literal["first", "last", False] = "first",
    mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.
    mask : ndarray[bool], optional
        Array indicating which elements to exclude from checking.

    Returns
    -------
    duplicated : ndarray[bool]
    """
    # NOTE: the scraped diff interleaved the pre-commit signature/body with
    # the post-commit one (duplicate parameter lists and a dead masked/Arrow
    # branch); this is the reconstructed post-commit version.  Dtype-specific
    # handling (masked arrays, pyarrow) now lives in ExtensionArray.duplicated
    # implementations, which pass their NA mask through ``mask``.
    values = _ensure_data(values)
    return htable.duplicated(values, keep=keep, mask=mask)


def mode(
Expand Down
25 changes: 25 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from pandas.core.dtypes.missing import isna

from pandas.core import (
algorithms as algos,
missing,
roperator,
)
Expand Down Expand Up @@ -1289,6 +1290,30 @@ def to_numpy(
result[~mask] = data[~mask]._pa_array.to_numpy()
return result

@doc(ExtensionArray.duplicated)
def duplicated(
    self, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
    # Convert to a numpy representation the hashtable-based algorithm can
    # consume cheaply; NA positions are reported separately via ``mask``,
    # so the na_value placeholders (0 / False) never affect the result.
    pa_type = self._pa_array.type
    if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
        values = self.to_numpy(na_value=0)
    elif pa.types.is_boolean(pa_type):
        values = self.to_numpy(na_value=False)
    elif pa.types.is_temporal(pa_type):
        # Reinterpret timestamps/durations as their integer storage,
        # keeping the original bit width (32 vs 64).
        if pa_type.bit_width == 32:
            pa_type = pa.int32()
        else:
            pa_type = pa.int64()
        arr = self.astype(ArrowDtype(pa_type))
        values = arr.to_numpy(na_value=0)
    else:
        # factorize the values to avoid the performance penalty of
        # converting to object dtype
        values = self.factorize()[0]

    # Only materialize the NA mask when there actually are missing values.
    mask = self.isna() if self._hasna else None
    return algos.duplicated(values, keep=keep, mask=mask)

def unique(self) -> Self:
"""
Compute the ArrowExtensionArray of unique values.
Expand Down
27 changes: 27 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
roperator,
)
from pandas.core.algorithms import (
duplicated,
factorize_array,
isin,
map_array,
Expand Down Expand Up @@ -125,6 +126,7 @@ class ExtensionArray:
astype
copy
dropna
duplicated
factorize
fillna
equals
Expand Down Expand Up @@ -1116,6 +1118,31 @@ def dropna(self) -> Self:
# error: Unsupported operand type for ~ ("ExtensionArray")
return self[~self.isna()] # type: ignore[operator]

def duplicated(
    self, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    ndarray[bool]

    Examples
    --------
    >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated()
    array([False,  True, False, False,  True])
    """
    # Default implementation: let the core algorithm skip NA positions
    # via the boolean mask.  Subclasses may override with faster paths.
    mask = self.isna().astype(np.bool_, copy=False)
    return duplicated(values=self, keep=keep, mask=mask)

def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray:
"""
Shift values by desired number.
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,14 @@ def copy(self) -> Self:
mask = self._mask.copy()
return self._simple_new(data, mask)

@doc(ExtensionArray.duplicated)
def duplicated(
    self, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
    # Fast path for masked arrays: hand the raw data buffer and the NA
    # mask straight to the core algorithm — no conversion needed.
    return algos.duplicated(self._data, keep=keep, mask=self._mask)

def unique(self) -> Self:
"""
Compute the BaseMaskedArray of unique values.
Expand Down
9 changes: 9 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from pandas._libs.tslibs import NaT
from pandas.compat.numpy import function as nv
from pandas.errors import PerformanceWarning
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
validate_bool_kwarg,
Expand Down Expand Up @@ -830,6 +831,14 @@ def _first_fill_value_loc(self):
diff = np.r_[np.diff(indices), 2]
return indices[(diff > 1).argmax()] + 1

@doc(ExtensionArray.duplicated)
def duplicated(
    self, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
    # Densify the sparse values, then defer to the core algorithm with
    # NA positions excluded via the mask.
    return algos.duplicated(
        np.asarray(self), keep=keep, mask=np.asarray(self.isna())
    )

def unique(self) -> Self:
uniques = algos.unique(self.sp_values)
if len(self.sp_values) != len(self):
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1365,7 +1365,10 @@ def drop_duplicates(self, *, keep: DropKeep = "first"):

@final
def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
    # NOTE: the scraped diff left the pre-commit one-line body
    # (``return algorithms.duplicated(self._values, keep=keep)``) above the
    # new dispatch, making it unreachable dead code; removed here.
    # Dispatch to the ExtensionArray's own (possibly optimized)
    # ``duplicated`` when available, otherwise use the ndarray path.
    arr = self._values
    if isinstance(arr, ExtensionArray):
        return arr.duplicated(keep=keep)
    return algorithms.duplicated(arr, keep=keep)

def _arith_method(self, other, op):
res_name = ops.get_op_result_name(self, other)
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,18 @@ def test_sort_values_frame(self, data_for_sorting, ascending):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("keep", ["first", "last", False])
def test_duplicated(self, data, keep):
    # Build [a, b, a, b] so each value occurs exactly twice.
    arr = data.take([0, 1, 0, 1])
    expected_by_keep = {
        "first": [False, False, True, True],
        "last": [True, True, False, False],
        False: [True, True, True, True],
    }
    result = arr.duplicated(keep=keep)
    tm.assert_numpy_array_equal(result, np.array(expected_by_keep[keep]))

@pytest.mark.parametrize("box", [pd.Series, lambda x: x])
@pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
def test_unique(self, data, box, method):
Expand Down

0 comments on commit 3bf0f64

Please sign in to comment.