Skip to content

Commit

Permalink
Backport PR #56332: BUG: str.split for ArrowDtype with pat=None
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Dec 5, 2023
1 parent 3ee9b3e commit 957a156
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 8 deletions.
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v2.1.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@ Bug fixes
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`)
- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
- Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`)
- Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`)
- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
- Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
- Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`)
- Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
-

Expand Down
22 changes: 15 additions & 7 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import functools
import operator
import re
import textwrap
Expand Down Expand Up @@ -2320,18 +2321,25 @@ def _str_split(
):
if n in {-1, 0}:
n = None
if regex:
split_func = pc.split_pattern_regex
if pat is None:
split_func = pc.utf8_split_whitespace
elif regex:
split_func = functools.partial(pc.split_pattern_regex, pattern=pat)
else:
split_func = pc.split_pattern
return type(self)(split_func(self._pa_array, pat, max_splits=n))
split_func = functools.partial(pc.split_pattern, pattern=pat)
return type(self)(split_func(self._pa_array, max_splits=n))

def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
    """
    Split each string in the array, starting from the right.

    Parameters
    ----------
    pat : str or None, default None
        Literal separator to split on. When None, strings are split on
        whitespace via ``pc.utf8_split_whitespace`` instead of being
        passed to ``pc.split_pattern``.
    n : int or None, default -1
        Maximum number of splits. ``-1`` and ``0`` are translated to
        ``None`` before being forwarded as ``max_splits``.

    Returns
    -------
    Same type as ``self``, wrapping the result of the pyarrow split kernel.
    """
    if n in {-1, 0}:
        n = None
    # Branch on ``pat`` *before* calling any kernel: ``pc.split_pattern``
    # must never receive ``pattern=None`` (GH 56271).
    if pat is None:
        result = pc.utf8_split_whitespace(
            self._pa_array, max_splits=n, reverse=True
        )
    else:
        result = pc.split_pattern(
            self._pa_array, pat, max_splits=n, reverse=True
        )
    return type(self)(result)

def _str_translate(self, table: dict[int, str]):
predicate = lambda val: val.translate(table)
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2202,6 +2202,15 @@ def test_str_partition():
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["rsplit", "split"])
def test_str_split_pat_none(method):
    # GH 56271
    # Calling split/rsplit with no pattern on an Arrow-backed string Series:
    # the space and the newline both act as separators, and the null entry
    # stays null.
    arrow_string_dtype = ArrowDtype(pa.string())
    ser = pd.Series(["a1 cbc\nb", None], dtype=arrow_string_dtype)
    splitter = getattr(ser.str, method)
    result = splitter()

    expected_values = pa.array([["a1", "cbc", "b"], None])
    expected = pd.Series(ArrowExtensionArray(expected_values))

    tm.assert_series_equal(result, expected)


def test_str_split():
# GH 52401
ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))
Expand Down

0 comments on commit 957a156

Please sign in to comment.