From 957a1563dff4bf81b37bb8df7f492539e21138ca Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Dec 2023 13:22:46 -1000 Subject: [PATCH] Backport PR #56332: BUG: str.split for ArrowDtype with pat=None --- doc/source/whatsnew/v2.1.4.rst | 4 +++- pandas/core/arrays/arrow/array.py | 22 +++++++++++++++------- pandas/tests/extension/test_arrow.py | 9 +++++++++ 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 3caeef3d26ead..a1434baddc7ef 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -25,10 +25,12 @@ Bug fixes - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`) - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`) -- Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`) +- Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`) +- Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a08f3ba44f417..4b333c92dd39c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1,5 +1,6 @@ from __future__ import annotations +import functools import operator import re import textwrap @@ -2320,18 +2321,25 @@ def _str_split( ): if n in {-1, 0}: n = None - if regex: - split_func = pc.split_pattern_regex + if pat is None: + split_func = pc.utf8_split_whitespace + elif regex: + split_func = functools.partial(pc.split_pattern_regex, pattern=pat) else: - split_func = pc.split_pattern - return type(self)(split_func(self._pa_array, pat, max_splits=n)) + split_func = functools.partial(pc.split_pattern, pattern=pat) + return type(self)(split_func(self._pa_array, max_splits=n)) def _str_rsplit(self, pat: str | None = None, n: int | None = -1): if n in {-1, 0}: n = None - return type(self)( - pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True) - ) + if pat is None: + return type(self)( + pc.utf8_split_whitespace(self._pa_array, max_splits=n, reverse=True) + ) + else: + return type(self)( + pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True) + ) def _str_translate(self, table: dict[int, str]): predicate = lambda val: val.translate(table) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 61474aa94d1c8..eedd8b6cb4ab5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2202,6 +2202,15 @@ def test_str_partition(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("method", ["rsplit", "split"]) +def test_str_split_pat_none(method): + # GH 56271 + ser = pd.Series(["a1 cbc\nb", None], dtype=ArrowDtype(pa.string())) + result = getattr(ser.str, method)() + expected = pd.Series(ArrowExtensionArray(pa.array([["a1", "cbc", "b"], None]))) + tm.assert_series_equal(result, expected) + + def test_str_split(): # GH 52401 ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))