From bb01589320b785aa2944a52d35acb9cc66560c59 Mon Sep 17 00:00:00 2001 From: abizzinotto Date: Sun, 15 Oct 2023 18:52:38 +0300 Subject: [PATCH 1/6] Ensure "string[pyarrow]" type is preserved when calling extractall --- pandas/core/strings/accessor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 71d6f9c58e2c2..58b904fd31b6a 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3449,10 +3449,9 @@ def _result_dtype(arr): # when the list of values is empty. from pandas.core.arrays.string_ import StringDtype - if isinstance(arr.dtype, StringDtype): + if isinstance(arr.dtype, (ArrowDtype, StringDtype)): return arr.dtype - else: - return object + return object def _get_single_group_name(regex: re.Pattern) -> Hashable: From e5ef14be4baa8936c88677fa8ff8f8d89d5def57 Mon Sep 17 00:00:00 2001 From: abizzinotto Date: Sun, 15 Oct 2023 19:10:08 +0300 Subject: [PATCH 2/6] Add whatsnew note --- doc/source/whatsnew/v2.1.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index cc51e22265d7c..fdc15abf9dffb 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -30,6 +30,7 @@ Bug fixes - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) +- Fixed bug in :meth:`Series.str.extractall` for ``string[pyarrow]`` dtype being converted to object (:issue:`53846`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - From ee33c4e45cd9b20b460043f2f3f862f8ddf41374 Mon Sep 17 00:00:00 2001 From: abizzinotto Date: Wed, 18 Oct 2023 02:37:58 -0300 Subject: [PATCH 3/6] Add test and fix whatsnew entry --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/tests/strings/test_extract.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index fdc15abf9dffb..32e71e5e3ea51 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -30,7 +30,7 @@ Bug fixes - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) -- Fixed bug in :meth:`Series.str.extractall` for ``string[pyarrow]`` dtype being converted to object (:issue:`53846`) +- Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 4773c13aad376..0eedc1fa724c3 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -2,6 +2,7 @@ import re import numpy as np +import pyarrow as pa import pytest from pandas import ( @@ -11,6 +12,7 @@ Series, _testing as tm, ) +from pandas.core.dtypes.dtypes import ArrowDtype def test_extract_expand_kwarg_wrong_type_raises(any_string_dtype): @@ -706,3 +708,18 @@ def test_extractall_same_as_extract_subject_index(any_string_dtype): has_match_index = s.str.extractall(pattern_one_noname) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_one_noname, no_match_index) + + +@pytest.mark.parametrize( + "data, expected_dtype", + [ + (Series(["abc", "ab"], dtype=ArrowDtype(pa.string())), "string[pyarrow]"), + (Series(["abc", "ab"], dtype="string"), "string[python]"), + (Series(["abc", "ab"]), "object"), + ] +) +def test_extractall_preserves_dtype(data, expected_dtype): + # Ensure that when extractall is called on a series with specific dtypes set, that + # the dtype is preserved in the resulting DataFrame's column. + result = data.str.extractall("(ab)") + assert result.dtypes[0] == expected_dtype From 03ed4a195b34ddf61cd921e1247dcb09a493f2a4 Mon Sep 17 00:00:00 2001 From: Amanda Bizzinotto Date: Wed, 18 Oct 2023 16:36:53 -0300 Subject: [PATCH 4/6] Fix test case and move import --- pandas/tests/strings/test_extract.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 0eedc1fa724c3..f6d7f80b96460 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -2,9 +2,10 @@ import re import numpy as np -import pyarrow as pa import pytest +from pandas.core.dtypes.dtypes import ArrowDtype + from pandas import ( DataFrame, Index, @@ -12,7 +13,6 @@ Series, _testing as tm, ) -from pandas.core.dtypes.dtypes import ArrowDtype def test_extract_expand_kwarg_wrong_type_raises(any_string_dtype): @@ -710,16 +710,10 @@ def test_extractall_same_as_extract_subject_index(any_string_dtype): tm.assert_frame_equal(extract_one_noname, no_match_index) -@pytest.mark.parametrize( - "data, expected_dtype", - [ - (Series(["abc", "ab"], dtype=ArrowDtype(pa.string())), "string[pyarrow]"), - (Series(["abc", "ab"], dtype="string"), "string[python]"), - (Series(["abc", "ab"]), "object"), - ] -) -def test_extractall_preserves_dtype(data, expected_dtype): +def test_extractall_preserves_dtype(): # Ensure that when extractall is called on a series with specific dtypes set, that # the dtype is preserved in the resulting DataFrame's column. - result = data.str.extractall("(ab)") - assert result.dtypes[0] == expected_dtype + import pyarrow as pa + + result = Series(["abc", "ab"], dtype=ArrowDtype(pa.string())).str.extractall("(ab)") + assert result.dtypes[0] == "string[pyarrow]" From 35aa528414f7ed4e2ef2889943bffeee89f6e9a9 Mon Sep 17 00:00:00 2001 From: abizzinotto Date: Wed, 18 Oct 2023 18:53:02 -0300 Subject: [PATCH 5/6] Add pyarrow requirement to actions --- ci/deps/actions-311-numpydev.yaml | 1 + ci/deps/actions-pypy-39.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 9795a1fb39c6f..ba77a02048a2b 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -31,3 +31,4 @@ dependencies: - "numpy" - "scipy" - "tzdata>=2022.1" + - "pyarrow" diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index a2f4d6395783a..23937e17454ca 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -26,3 +26,4 @@ dependencies: - pytz - pip: - tzdata>=2022.1 + - pyarrow >= 7.0.0 From 27cf36dd0a143e863899eae089ddcfcb70df18b5 Mon Sep 17 00:00:00 2001 From: Amanda Bizzinotto Date: Wed, 18 Oct 2023 20:25:04 -0300 Subject: [PATCH 6/6] Add pyarrow requirement as importorskip --- ci/deps/actions-311-numpydev.yaml | 1 - ci/deps/actions-pypy-39.yaml | 1 - pandas/tests/strings/test_extract.py | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index ba77a02048a2b..9795a1fb39c6f 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -31,4 +31,3 @@ dependencies: - "numpy" - "scipy" - "tzdata>=2022.1" - - "pyarrow" diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 23937e17454ca..a2f4d6395783a 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -26,4 +26,3 @@ dependencies: - pytz - pip: - tzdata>=2022.1 - - pyarrow >= 7.0.0 diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index f6d7f80b96460..b8319e90e09a8 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -713,7 +713,7 @@ def test_extractall_same_as_extract_subject_index(any_string_dtype): def test_extractall_preserves_dtype(): # Ensure that when extractall is called on a series with specific dtypes set, that # the dtype is preserved in the resulting DataFrame's column. - import pyarrow as pa + pa = pytest.importorskip("pyarrow") result = Series(["abc", "ab"], dtype=ArrowDtype(pa.string())).str.extractall("(ab)") assert result.dtypes[0] == "string[pyarrow]"