From 503d8971207c0e58d6e7587f104d2444da0d8081 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 22 Nov 2023 00:39:36 +0100 Subject: [PATCH 1/4] Fix string option tests in indexing --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/core/indexes/base.py | 7 +++ pandas/tests/indexing/test_categorical.py | 69 +++++++++++++---------- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 2a04adf2ac7f7..c4f618c44c4f4 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -23,6 +23,7 @@ Bug fixes ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:` - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cbf7dc5ba67d2..70f189be9826e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5568,6 +5568,13 @@ def equals(self, other: Any) -> bool: # quickly return if the lengths are different return False + if ( + self.dtype == "string[pyarrow_numpy]" + and other.dtype != "string[pyarrow_numpy]" + ): + # special case for object behavior + return other.equals(self.astype(object)) + if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): # if other is not object, use other's logic for coercion return other.equals(self) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 0432c8856e5c5..a6d1e40c5bc19 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -14,6 +16,7 @@ Series, Timedelta, Timestamp, + option_context, ) import pandas._testing as tm @@ -428,38 +431,42 @@ def test_ix_categorical_index(self): expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns) tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) - def test_ix_categorical_index_non_unique(self): + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_ix_categorical_index_non_unique(self, infer_string): # non-unique - df = DataFrame( - np.random.default_rng(2).standard_normal((3, 3)), - index=list("ABA"), - columns=list("XYX"), - ) - cdf = df.copy() - cdf.index = CategoricalIndex(df.index) - cdf.columns = CategoricalIndex(df.columns) - - exp_index = CategoricalIndex(list("AA"), categories=["A", "B"]) - expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index) - tm.assert_frame_equal(cdf.loc["A", :], expect) - - exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"]) - expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns) - tm.assert_frame_equal(cdf.loc[:, "X"], expect) - - expect = DataFrame( - df.loc[["A", "B"], :], - columns=cdf.columns, - index=CategoricalIndex(list("AAB")), - ) - tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect) - - expect = DataFrame( - df.loc[:, ["X", "Y"]], - index=cdf.index, - columns=CategoricalIndex(list("XXY")), - ) - tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) + with option_context("future.infer_string", infer_string): + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=list("ABA"), + columns=list("XYX"), + ) + cdf = df.copy() + cdf.index = CategoricalIndex(df.index) + cdf.columns = CategoricalIndex(df.columns) + + exp_index = CategoricalIndex(list("AA"), categories=["A", "B"]) + expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index) + tm.assert_frame_equal(cdf.loc["A", :], expect) + + exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"]) + expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns) + tm.assert_frame_equal(cdf.loc[:, "X"], expect) + + expect = DataFrame( + df.loc[["A", "B"], :], + columns=cdf.columns, + index=CategoricalIndex(list("AAB")), + ) + tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect) + + expect = DataFrame( + df.loc[:, ["X", "Y"]], + index=cdf.index, + columns=CategoricalIndex(list("XXY")), + ) + tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) def test_loc_slice(self, df): # GH9748 From 87b8c6da9517e74413361ba93f833bdad3e2ef34 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 22 Nov 2023 00:41:40 +0100 Subject: [PATCH 2/4] Update v2.1.4.rst --- doc/source/whatsnew/v2.1.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index c4f618c44c4f4..7f1c2fca34b9e 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -23,7 +23,7 @@ Bug fixes ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- Fixed bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:` +- Fixed bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - From 7f13eb8a59e47606fd0c8cbe882e9cd70fee872e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 8 Dec 2023 22:53:03 +0100 Subject: [PATCH 3/4] Fixup --- pandas/core/indexes/base.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5bb4a131cc433..299df883dcdf2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -158,7 +158,10 @@ ExtensionArray, TimedeltaArray, ) -from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, +) from pandas.core.base import ( IndexOpsMixin, PandasObject, @@ -5569,8 +5572,9 @@ def equals(self, other: Any) -> bool: return False if ( - self.dtype == "string[pyarrow_numpy]" - and other.dtype != "string[pyarrow_numpy]" + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "pyarrow_numpy" + and other.dtype != self.dtype ): # special case for object behavior return other.equals(self.astype(object)) From c69974feedb595c2e9fbd3d1223e9b0451e3aeb8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 9 Dec 2023 00:34:25 +0100 Subject: [PATCH 4/4] Update whatsnew --- doc/source/whatsnew/v2.1.4.rst | 1 - doc/source/whatsnew/v2.2.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 8c2061f2b3f10..9cc79b7090499 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -27,7 +27,6 @@ Bug fixes - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`) - Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- Fixed bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`) - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 919ac8b03f936..293224b6c51e4 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -576,6 +576,7 @@ Strings ^^^^^^^ - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) +- Bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)