From 365448dc665f6d49f3d3071a6c3a04aaccacb3d2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 4 Nov 2023 18:50:32 -0400 Subject: [PATCH] BUG: Index.isin raising for arrow strings and null set (#55821) --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/arrays/string_arrow.py | 4 +++- pandas/tests/indexes/test_base.py | 12 ++++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 3b1cd1c152baa..31ab01f171b4a 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- +- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) .. --------------------------------------------------------------------------- .. _whatsnew_213.other: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ae020ec2f599f..1c2ecd4684e2f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -222,7 +222,9 @@ def isin(self, values) -> npt.NDArray[np.bool_]: if not len(value_set): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._pa_array, value_set=pa.array(value_set)) + result = pc.is_in( + self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type) + ) # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0c32149124ac6..828700573c7ea 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -9,7 +9,7 @@ from pandas.compat import IS64 from pandas.errors import InvalidIndexError -from pandas.util._test_decorators import async_mark +import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( is_any_real_numeric_dtype, @@ -921,6 +921,14 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) + @td.skip_if_no("pyarrow") + def test_isin_arrow_string_null(self): + # GH#55821 + index = Index(["a", "b"], dtype="string[pyarrow_numpy]") + result = index.isin([None]) + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "values", [ @@ -1235,7 +1243,7 @@ def test_cached_properties_not_settable(self): with pytest.raises(AttributeError, match="Can't set attribute"): index.is_unique = False - @async_mark() + @td.async_mark() async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0")