Merge branch 'main' into bug-is_string_dtype-obj-empty

pandas-dev · Oct 3, 2023 · 3945bb0 · 3945bb0
2 parents 8ea61bd + 8664572
commit 3945bb0
Show file tree

Hide file tree

Showing 19 changed files with 181 additions and 21 deletions.
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -1,6 +1,7 @@
 from importlib import import_module
 
 import numpy as np
+import pyarrow as pa
 
 import pandas as pd
 
@@ -72,7 +73,16 @@ class Duplicated:
     params = [
         [True, False],
         ["first", "last", False],
-        ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
+        [
+            "int",
+            "uint",
+            "float",
+            "string",
+            "datetime64[ns]",
+            "datetime64[ns, tz]",
+            "timestamp[ms][pyarrow]",
+            "duration[s][pyarrow]",
+        ],
     ]
     param_names = ["unique", "keep", "dtype"]
 
@@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype):
             "datetime64[ns, tz]": pd.date_range(
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
             ),
+            "timestamp[ms][pyarrow]": pd.Index(
+                np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
+            ),
+            "duration[s][pyarrow]": pd.Index(
+                np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
+            ),
         }[dtype]
         if not unique:
             data = data.repeat(5)

diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
@@ -49,6 +49,7 @@ objects.
       api.extensions.ExtensionArray.copy
       api.extensions.ExtensionArray.view
       api.extensions.ExtensionArray.dropna
+      api.extensions.ExtensionArray.duplicated
       api.extensions.ExtensionArray.equals
       api.extensions.ExtensionArray.factorize
       api.extensions.ExtensionArray.fillna

diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
@@ -451,7 +451,7 @@ Merge
 Concat
 ~~~~~~
 
-pandas provides various facilities for easily combining together :class:`Series`` and
+pandas provides various facilities for easily combining together :class:`Series` and
 :class:`DataFrame` objects with various kinds of set logic for the indexes
 and relational algebra functionality in the case of join / merge-type
 operations.

diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
@@ -22,8 +22,12 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
+- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
+- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
+- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`)
 - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
 - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
+- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
 - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
 -
 

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -76,6 +76,7 @@ Other enhancements
 
 - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
 - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
+- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
 - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
 - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
 -
@@ -241,6 +242,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
 - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
 - Performance improvement in :meth:`Index.difference` (:issue:`55108`)
+- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
 - Performance improvement when indexing with more than 4 keys (:issue:`54550`)
 - Performance improvement when localizing time to UTC (:issue:`55241`)
 

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -55,7 +55,6 @@
 )
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.dtypes import (
-    ArrowDtype,
     BaseMaskedDtype,
     CategoricalDtype,
     ExtensionDtype,
@@ -979,36 +978,32 @@ def value_counts_arraylike(
 
 
 def duplicated(
-    values: ArrayLike, keep: Literal["first", "last", False] = "first"
+    values: ArrayLike,
+    keep: Literal["first", "last", False] = "first",
+    mask: npt.NDArray[np.bool_] | None = None,
 ) -> npt.NDArray[np.bool_]:
     """
     Return boolean ndarray denoting duplicate values.
 
     Parameters
     ----------
-    values : nd.array, ExtensionArray or Series
+    values : np.ndarray or ExtensionArray
         Array over which to check for duplicate values.
     keep : {'first', 'last', False}, default 'first'
         - ``first`` : Mark duplicates as ``True`` except for the first
           occurrence.
         - ``last`` : Mark duplicates as ``True`` except for the last
           occurrence.
         - False : Mark all duplicates as ``True``.
+    mask : ndarray[bool], optional
+        array indicating which elements to exclude from checking
 
     Returns
     -------
     duplicated : ndarray[bool]
     """
-    if hasattr(values, "dtype"):
-        if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub":
-            values = values._to_masked()  # type: ignore[union-attr]
-
-        if isinstance(values.dtype, BaseMaskedDtype):
-            values = cast("BaseMaskedArray", values)
-            return htable.duplicated(values._data, keep=keep, mask=values._mask)
-
     values = _ensure_data(values)
-    return htable.duplicated(values, keep=keep)
+    return htable.duplicated(values, keep=keep, mask=mask)
 
 
 def mode(

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -30,8 +30,12 @@
 from pandas.util._decorators import doc
 from pandas.util._validators import validate_fillna_kwargs
 
-from pandas.core.dtypes.cast import can_hold_element
+from pandas.core.dtypes.cast import (
+    can_hold_element,
+    infer_dtype_from_scalar,
+)
 from pandas.core.dtypes.common import (
+    CategoricalDtype,
     is_array_like,
     is_bool_dtype,
     is_integer,
@@ -42,6 +46,7 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import (
+    algorithms as algos,
     missing,
     roperator,
 )
@@ -627,7 +632,9 @@ def __setstate__(self, state) -> None:
 
     def _cmp_method(self, other, op):
         pc_func = ARROW_CMP_FUNCS[op.__name__]
-        if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)):
+        if isinstance(
+            other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
+        ) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
             result = pc_func(self._pa_array, self._box_pa(other))
         elif is_scalar(other):
             try:
@@ -1289,6 +1296,30 @@ def to_numpy(
             result[~mask] = data[~mask]._pa_array.to_numpy()
         return result
 
+    @doc(ExtensionArray.duplicated)
+    def duplicated(
+        self, keep: Literal["first", "last", False] = "first"
+    ) -> npt.NDArray[np.bool_]:
+        pa_type = self._pa_array.type
+        if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
+            values = self.to_numpy(na_value=0)
+        elif pa.types.is_boolean(pa_type):
+            values = self.to_numpy(na_value=False)
+        elif pa.types.is_temporal(pa_type):
+            if pa_type.bit_width == 32:
+                pa_type = pa.int32()
+            else:
+                pa_type = pa.int64()
+            arr = self.astype(ArrowDtype(pa_type))
+            values = arr.to_numpy(na_value=0)
+        else:
+            # factorize the values to avoid the performance penalty of
+            # converting to object dtype
+            values = self.factorize()[0]
+
+        mask = self.isna() if self._hasna else None
+        return algos.duplicated(values, keep=keep, mask=mask)
+
     def unique(self) -> Self:
         """
         Compute the ArrowExtensionArray of unique values.
@@ -1599,13 +1630,21 @@ def _reduce(
         pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
 
         if keepdims:
-            result = pa.array([pa_result.as_py()], type=pa_result.type)
+            if isinstance(pa_result, pa.Scalar):
+                result = pa.array([pa_result.as_py()], type=pa_result.type)
+            else:
+                result = pa.array(
+                    [pa_result],
+                    type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]),
+                )
             return type(self)(result)
 
         if pc.is_null(pa_result).as_py():
             return self.dtype.na_value
-        else:
+        elif isinstance(pa_result, pa.Scalar):
             return pa_result.as_py()
+        else:
+            return pa_result
 
     def _explode(self):
         """

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -61,6 +61,7 @@
     roperator,
 )
 from pandas.core.algorithms import (
+    duplicated,
     factorize_array,
     isin,
     map_array,
@@ -125,6 +126,7 @@ class ExtensionArray:
     astype
     copy
     dropna
+    duplicated
     factorize
     fillna
     equals
@@ -891,7 +893,6 @@ def interpolate(
         limit,
         limit_direction,
         limit_area,
-        fill_value,
         copy: bool,
         **kwargs,
     ) -> Self:
@@ -1116,6 +1117,31 @@ def dropna(self) -> Self:
         # error: Unsupported operand type for ~ ("ExtensionArray")
         return self[~self.isna()]  # type: ignore[operator]
 
+    def duplicated(
+        self, keep: Literal["first", "last", False] = "first"
+    ) -> npt.NDArray[np.bool_]:
+        """
+        Return boolean ndarray denoting duplicate values.
+
+        Parameters
+        ----------
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
+            - False : Mark all duplicates as ``True``.
+
+        Returns
+        -------
+        ndarray[bool]
+
+        Examples
+        --------
+        >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated()
+        array([False,  True, False, False,  True])
+        """
+        mask = self.isna().astype(np.bool_, copy=False)
+        return duplicated(values=self, keep=keep, mask=mask)
+
     def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray:
         """
         Shift values by desired number.

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -952,6 +952,14 @@ def copy(self) -> Self:
         mask = self._mask.copy()
         return self._simple_new(data, mask)
 
+    @doc(ExtensionArray.duplicated)
+    def duplicated(
+        self, keep: Literal["first", "last", False] = "first"
+    ) -> npt.NDArray[np.bool_]:
+        values = self._data
+        mask = self._mask
+        return algos.duplicated(values, keep=keep, mask=mask)
+
     def unique(self) -> Self:
         """
         Compute the BaseMaskedArray of unique values.

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -28,6 +28,7 @@
 from pandas._libs.tslibs import NaT
 from pandas.compat.numpy import function as nv
 from pandas.errors import PerformanceWarning
+from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import (
     validate_bool_kwarg,
@@ -830,6 +831,14 @@ def _first_fill_value_loc(self):
         diff = np.r_[np.diff(indices), 2]
         return indices[(diff > 1).argmax()] + 1
 
+    @doc(ExtensionArray.duplicated)
+    def duplicated(
+        self, keep: Literal["first", "last", False] = "first"
+    ) -> npt.NDArray[np.bool_]:
+        values = np.asarray(self)
+        mask = np.asarray(self.isna())
+        return algos.duplicated(values, keep=keep, mask=mask)
+
     def unique(self) -> Self:
         uniques = algos.unique(self.sp_values)
         if len(self.sp_values) != len(self):

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -613,3 +613,8 @@ def _reduce(
             )
         else:
             return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+
+    def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics:
+        if item is np.nan:
+            item = libmissing.NA
+        return super().insert(loc, item)  # type: ignore[return-value]
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1365,7 +1365,10 @@ def drop_duplicates(self, *, keep: DropKeep = "first"):
 
     @final
     def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
-        return algorithms.duplicated(self._values, keep=keep)
+        arr = self._values
+        if isinstance(arr, ExtensionArray):
+            return arr.duplicated(keep=keep)
+        return algorithms.duplicated(arr, keep=keep)
 
     def _arith_method(self, other, op):
         res_name = ops.get_op_result_name(self, other)

diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -383,7 +383,7 @@ def ndarray_to_mgr(
             new_block(
                 dtype.construct_array_type()._from_sequence(data, dtype=dtype),
                 BlockPlacement(slice(i, i + 1)),
-                ndim=1,
+                ndim=2,
             )
             for i, data in enumerate(obj_columns)
         ]

diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -248,6 +248,18 @@ def test_sort_values_frame(self, data_for_sorting, ascending):
         )
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("keep", ["first", "last", False])
+    def test_duplicated(self, data, keep):
+        arr = data.take([0, 1, 0, 1])
+        result = arr.duplicated(keep=keep)
+        if keep == "first":
+            expected = np.array([False, False, True, True])
+        elif keep == "last":
+            expected = np.array([True, True, False, False])
+        else:
+            expected = np.array([True, True, True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
     @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
     @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
     def test_unique(self, data, box, method):

diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py
@@ -497,3 +497,9 @@ def test_interpolate_empty_df(self):
         result = df.interpolate(inplace=True)
         assert result is None
         tm.assert_frame_equal(df, expected)
+
+    def test_interpolate_ea_raise(self):
+        # GH#55347
+        df = DataFrame({"a": [1, None, 2]}, dtype="Int64")
+        with pytest.raises(NotImplementedError, match="does not implement"):
+            df.interpolate()
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -2743,6 +2743,13 @@ def test_frame_string_inference_array_string_dtype(self):
             df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"])
         tm.assert_frame_equal(df, expected)
 
+    def test_frame_string_inference_block_dim(self):
+        # GH#55363
+        pytest.importorskip("pyarrow")
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
+        assert df._mgr.blocks[0].ndim == 2
+
 
 class TestDataFrameConstructorIndexInference:
     def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):