From 5c901a3be768df03fa0cc823bac1bd1b171c6a4f Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Fri, 8 Dec 2023 16:01:11 -0500 Subject: [PATCH 1/2] find offset fix --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 3 ++- pandas/tests/extension/test_arrow.py | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 919ac8b03f936..1544b3d953d71 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -577,6 +577,7 @@ Strings - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) +- Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ae6942db11fae..5abdfe69e52c0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2193,7 +2193,8 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) result = pc.find_substring(slices, sub) not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) + start_offset = max(0, start) + offset_result = pc.add(result, start_offset) result = pc.if_else(not_found, result, offset_result) elif start == 0 and end is None: slices = self._pa_array diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1941e359299b6..aa2bc604e70b1 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1844,6 +1844,14 @@ def test_str_find(sub, start, end, exp, exp_typ): tm.assert_series_equal(result, expected) +def test_str_find_negative_start(): + # GH 56411 + ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="b", start=-1000, end=3) + expected = pd.Series([1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + def test_str_find_notimplemented(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) with pytest.raises(NotImplementedError, match="find not implemented"): From 90da5e86eb07ea4d7a0eab8e3bc8eb161d46a829 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Fri, 8 Dec 2023 16:38:45 -0500 Subject: [PATCH 2/2] fix test --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index aa2bc604e70b1..75e5fb00586e6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1835,7 +1835,7 @@ def test_str_fullmatch(pat, case, na, exp): @pytest.mark.parametrize( "sub, start, end, exp, exp_typ", - [["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [2, None], pa.int64()]], + [["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [1, None], pa.int64()]], ) def test_str_find(sub, start, end, exp, exp_typ): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))