From bc7e58c4e02aef32f880cdaa2f312e53c5d08cd5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 16:45:39 +0100 Subject: [PATCH 1/9] Adjust tests in strings folder for new string option --- pandas/core/config_init.py | 2 +- pandas/core/strings/accessor.py | 39 +++++++++++++------- pandas/tests/strings/test_find_replace.py | 24 +++++++----- pandas/tests/strings/test_split_partition.py | 8 ++-- pandas/tests/strings/test_string_array.py | 4 +- pandas/tests/strings/test_strings.py | 20 +++++----- 6 files changed, 60 insertions(+), 37 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a8b63f97141c2..bdbab78a443de 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -905,7 +905,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 58b904fd31b6a..110ca92166984 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -259,6 +259,7 @@ def _wrap_result( fill_value=np.nan, returns_string: bool = True, returns_bool: bool = False, + dtype=None, ): from pandas import ( Index, @@ -379,29 +380,29 @@ def cons_row(x): out = out.get_level_values(0) return out else: - return Index(result, name=name) + return Index(result, name=name, dtype=dtype) else: index = self._orig.index # This is a mess. - dtype: DtypeObj | str | None + _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) if self._is_string: if is_bool_dtype(vdtype): - dtype = result.dtype + _dtype = result.dtype elif returns_string: - dtype = self._orig.dtype + _dtype = self._orig.dtype else: - dtype = vdtype - else: + _dtype = vdtype + elif vdtype is not None: dtype = vdtype if expand: cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) + result = cons(result, columns=name, index=index, dtype=_dtype) else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) + result = cons(result, name=name, index=index, dtype=_dtype) result = result.__finalize__(self._orig, method="str") if name is not None and result.ndim == 1: # __finalize__ might copy over the original name, but we may @@ -913,7 +914,10 @@ def split( if is_re(pat): regex = True result = self._data.array._str_split(pat, n, expand, regex) - return self._wrap_result(result, returns_string=expand, expand=expand) + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) @Appender( _shared_docs["str_split"] @@ -931,7 +935,10 @@ def split( @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, *, n=-1, expand: bool = False): result = self._data.array._str_rsplit(pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) _shared_docs[ "str_partition" @@ -1027,7 +1034,10 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False): @forbid_nonstring_types(["bytes"]) def partition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_partition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) @Appender( _shared_docs["str_partition"] @@ -1041,7 +1051,10 @@ def partition(self, sep: str = " ", expand: bool = True): @forbid_nonstring_types(["bytes"]) def rpartition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_rpartition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) def get(self, i): """ @@ -2317,7 +2330,7 @@ def translate(self, table): dtype: object """ result = self._data.array._str_translate(table) - return self._wrap_result(result) + return self._wrap_result(result, dtype=self._data.dtype) @forbid_nonstring_types(["bytes"]) def count(self, pat, flags: int = 0): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 78f0730d730e8..0bcee98bfae7d 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -241,7 +241,7 @@ def test_contains_nan(any_string_dtype): @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) -@pytest.mark.parametrize("dtype", [None, "category"]) +@pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) def test_startswith(pat, dtype, null_value, na): @@ -253,10 +253,10 @@ def test_startswith(pat, dtype, null_value, na): result = values.str.startswith(pat) exp = Series([False, np.nan, True, False, False, np.nan, True]) - if dtype is None and null_value is pd.NA: + if dtype == "object" and null_value is pd.NA: # GH#18463 exp = exp.fillna(null_value) - elif dtype is None and null_value is None: + elif dtype == "object" and null_value is None: exp[exp.isna()] = None tm.assert_series_equal(result, exp) @@ -299,7 +299,7 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) -@pytest.mark.parametrize("dtype", [None, "category"]) +@pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) def test_endswith(pat, dtype, null_value, na): @@ -311,10 +311,10 @@ def test_endswith(pat, dtype, null_value, na): result = values.str.endswith(pat) exp = Series([False, np.nan, False, False, True, np.nan, True]) - if dtype is None and null_value is pd.NA: + if dtype == "object" and null_value is pd.NA: # GH#18463 - exp = exp.fillna(pd.NA) - elif dtype is None and null_value is None: + exp = exp.fillna(null_value) + elif dtype == "object" and null_value is None: exp[exp.isna()] = None tm.assert_series_equal(result, exp) @@ -381,7 +381,9 @@ def test_replace_mixed_object(): ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace("BAD[_]*", "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -468,7 +470,9 @@ def test_replace_compiled_regex_mixed_object(): ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace(pat, "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -909,7 +913,7 @@ def test_translate_mixed_object(): # Series with non-string values s = Series(["a", "b", "c", 1.2]) table = str.maketrans("abc", "cde") - expected = Series(["c", "d", "e", np.nan]) + expected = Series(["c", "d", "e", np.nan], dtype=object) result = s.str.translate(table) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 0a7d409773dd6..9ff1fc0e13ae9 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -681,14 +681,16 @@ def test_partition_sep_kwarg(any_string_dtype, method): def test_get(): ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = ser.str.split("_").str.get(1) - expected = Series(["b", "d", np.nan, "g"]) + expected = Series(["b", "d", np.nan, "g"], dtype=object) tm.assert_series_equal(result, expected) def test_get_mixed_object(): ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]) result = ser.str.split("_").str.get(1) - expected = Series(["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan]) + expected = Series( + ["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -696,7 +698,7 @@ def test_get_mixed_object(): def test_get_bounds(idx): ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) result = ser.str.split("_").str.get(idx) - expected = Series(["3", "8", np.nan]) + expected = Series(["3", "8", np.nan], dtype=object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index a88dcc8956931..0b3f368afea5e 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -8,6 +8,7 @@ DataFrame, Series, _testing as tm, + option_context, ) @@ -56,7 +57,8 @@ def test_string_array(nullable_string_dtype, any_string_method): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == nullable_string_dtype) result[columns] = result[columns].astype(object) - expected[columns] = expected[columns].fillna(NA) # GH#18463 + with option_context("future.no_silent_downcasting", True): + expected[columns] = expected[columns].fillna(NA) # GH#18463 tm.assert_equal(result, expected) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 4315835b70a40..f662dfd7e2b14 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -76,7 +76,8 @@ def test_repeat_mixed_object(): ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = ser.str.repeat(3) expected = Series( - ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan] + ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -270,7 +271,8 @@ def test_spilt_join_roundtrip_mixed_object(): ) result = ser.str.split("_").str.join("_") expected = Series( - ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan] + ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -398,7 +400,7 @@ def test_slice(start, stop, step, expected, any_string_dtype): def test_slice_mixed_object(start, stop, step, expected): ser = Series(["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]) result = ser.str.slice(start, stop, step) - expected = Series(expected) + expected = Series(expected, dtype=object) tm.assert_series_equal(result, expected) @@ -453,7 +455,7 @@ def test_strip_lstrip_rstrip_mixed_object(method, exp): ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]) result = getattr(ser.str, method)() - expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan]) + expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan], dtype=object) tm.assert_series_equal(result, expected) @@ -529,7 +531,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")) + expected = ser.map(lambda x: x.decode("utf-8")).astype(object) tm.assert_series_equal(result, expected) @@ -559,7 +561,7 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) tm.assert_series_equal(result, expected) @@ -672,7 +674,7 @@ def test_str_accessor_in_apply_func(): def test_zfill(): # https://github.com/pandas-dev/pandas/issues/20868 value = Series(["-1", "1", "1000", 10, np.nan]) - expected = Series(["-01", "001", "1000", np.nan, np.nan]) + expected = Series(["-01", "001", "1000", np.nan, np.nan], dtype=object) tm.assert_series_equal(value.str.zfill(3), expected) value = Series(["-2", "+5"]) @@ -704,10 +706,10 @@ def test_get_with_dict_label(): ] ) result = s.str.get("name") - expected = Series(["Hello", "Goodbye", None]) + expected = Series(["Hello", "Goodbye", None], dtype=object) tm.assert_series_equal(result, expected) result = s.str.get("value") - expected = Series(["World", "Planet", "Sea"]) + expected = Series(["World", "Planet", "Sea"], dtype=object) tm.assert_series_equal(result, expected) From 211c8985ea468ec2d313a0809736ed04e74c97d5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 16:48:38 +0100 Subject: [PATCH 2/9] BUG: translate losing object dtype with new string dtype --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/core/strings/accessor.py | 19 ++++++++++--------- pandas/tests/strings/test_find_replace.py | 6 +++++- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 543a9864ced26..77ce303dc1bfe 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -25,7 +25,7 @@ Bug fixes - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- +- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) .. --------------------------------------------------------------------------- .. _whatsnew_214.other: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 58b904fd31b6a..7deedc1c4cbe2 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -259,6 +259,7 @@ def _wrap_result( fill_value=np.nan, returns_string: bool = True, returns_bool: bool = False, + dtype=None, ): from pandas import ( Index, @@ -379,29 +380,29 @@ def cons_row(x): out = out.get_level_values(0) return out else: - return Index(result, name=name) + return Index(result, name=name, dtype=dtype) else: index = self._orig.index # This is a mess. - dtype: DtypeObj | str | None + _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) if self._is_string: if is_bool_dtype(vdtype): - dtype = result.dtype + _dtype = result.dtype elif returns_string: - dtype = self._orig.dtype + _dtype = self._orig.dtype else: - dtype = vdtype - else: + _dtype = vdtype + elif vdtype is not None: dtype = vdtype if expand: cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) + result = cons(result, columns=name, index=index, dtype=_dtype) else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) + result = cons(result, name=name, index=index, dtype=_dtype) result = result.__finalize__(self._orig, method="str") if name is not None and result.ndim == 1: # __finalize__ might copy over the original name, but we may @@ -2317,7 +2318,7 @@ def translate(self, table): dtype: object """ result = self._data.array._str_translate(table) - return self._wrap_result(result) + return self._wrap_result(result, dtype=self._data.dtype) @forbid_nonstring_types(["bytes"]) def count(self, pat, flags: int = 0): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 78f0730d730e8..bd64a5dce3b9a 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -5,6 +5,7 @@ import pytest from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -893,7 +894,10 @@ def test_find_nan(any_string_dtype): # -------------------------------------------------------------------------------------- -def test_translate(index_or_series, any_string_dtype): +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +def test_translate(index_or_series, any_string_dtype, infer_string): obj = index_or_series( ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype ) From b500af0fa92a6373f8ed4d592ee4891eb09a2de7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 16:51:40 +0100 Subject: [PATCH 3/9] Fix --- pandas/core/strings/accessor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7deedc1c4cbe2..c563f2f366da3 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2318,7 +2318,8 @@ def translate(self, table): dtype: object """ result = self._data.array._str_translate(table) - return self._wrap_result(result, dtype=self._data.dtype) + dtype = object if self._data.dtype == "object" else None + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def count(self, pat, flags: int = 0): From b366c3d3925009d587dc9f9eed00433c1bf45af4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 23:51:29 +0100 Subject: [PATCH 4/9] BUG: Index.str.cat casting result always to object --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/core/strings/accessor.py | 7 ++- pandas/tests/strings/test_cat.py | 85 +++++++++++++++++--------------- 3 files changed, 52 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 543a9864ced26..0f4d3a22f5129 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -25,7 +25,7 @@ Bug fixes - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- +- Fixed bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) .. --------------------------------------------------------------------------- .. _whatsnew_214.other: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 58b904fd31b6a..a05fae1524ffd 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -393,7 +393,7 @@ def cons_row(x): else: dtype = vdtype else: - dtype = vdtype + _dtype = vdtype if expand: cons = self._orig._constructor_expanddim @@ -689,8 +689,11 @@ def cat( out: Index | Series if isinstance(self._orig, ABCIndex): # add dtype for case that result is all-NA + dtype = None + if isna(result).all(): + dtype = object - out = Index(result, dtype=object, name=self._orig.name) + out = Index(result, dtype=dtype, name=self._orig.name) else: # Series if isinstance(self._orig.dtype, CategoricalDtype): # We need to infer the new categories. diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 3e620b7664335..497f87e245ba3 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, @@ -10,6 +12,7 @@ Series, _testing as tm, concat, + option_context, ) @@ -26,45 +29,49 @@ def test_str_cat_name(index_or_series, other): assert result.name == "name" -def test_str_cat(index_or_series): - box = index_or_series - # test_cat above tests "str_cat" from ndarray; - # here testing "str.cat" from Series/Index to ndarray/list - s = box(["a", "a", "b", "b", "c", np.nan]) - - # single array - result = s.str.cat() - expected = "aabbc" - assert result == expected - - result = s.str.cat(na_rep="-") - expected = "aabbc-" - assert result == expected - - result = s.str.cat(sep="_", na_rep="NA") - expected = "a_a_b_b_c_NA" - assert result == expected - - t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) - expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) - - # Series/Index with array - result = s.str.cat(t, na_rep="-") - tm.assert_equal(result, expected) - - # Series/Index with list - result = s.str.cat(list(t), na_rep="-") - tm.assert_equal(result, expected) - - # errors for incorrect lengths - rgx = r"If `others` contains arrays or lists \(or other list-likes.*" - z = Series(["1", "2", "3"]) - - with pytest.raises(ValueError, match=rgx): - s.str.cat(z.values) - - with pytest.raises(ValueError, match=rgx): - s.str.cat(list(z)) +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +def test_str_cat(index_or_series, infer_string): + with option_context("future.infer_string", infer_string): + box = index_or_series + # test_cat above tests "str_cat" from ndarray; + # here testing "str.cat" from Series/Index to ndarray/list + s = box(["a", "a", "b", "b", "c", np.nan]) + + # single array + result = s.str.cat() + expected = "aabbc" + assert result == expected + + result = s.str.cat(na_rep="-") + expected = "aabbc-" + assert result == expected + + result = s.str.cat(sep="_", na_rep="NA") + expected = "a_a_b_b_c_NA" + assert result == expected + + t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) + expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) + + # Series/Index with array + result = s.str.cat(t, na_rep="-") + tm.assert_equal(result, expected) + + # Series/Index with list + result = s.str.cat(list(t), na_rep="-") + tm.assert_equal(result, expected) + + # errors for incorrect lengths + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" + z = Series(["1", "2", "3"]) + + with pytest.raises(ValueError, match=rgx): + s.str.cat(z.values) + + with pytest.raises(ValueError, match=rgx): + s.str.cat(list(z)) def test_str_cat_raises_intuitive_error(index_or_series): From d43c3be8a0564975e57c2c5effdd6dc98aae8e2a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 23:52:25 +0100 Subject: [PATCH 5/9] Update accessor.py --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index a05fae1524ffd..62f6a576db24f 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -393,7 +393,7 @@ def cons_row(x): else: dtype = vdtype else: - _dtype = vdtype + dtype = vdtype if expand: cons = self._orig._constructor_expanddim From 35bc604a8c1525ae887423547e3f7c9cd55cc941 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 25 Nov 2023 00:22:37 +0100 Subject: [PATCH 6/9] Fix further bugs --- pandas/core/strings/accessor.py | 5 ++- pandas/tests/strings/test_cat.py | 64 ++++++++++++++++++++------------ 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 62f6a576db24f..35bfb3a1ad2f1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -44,6 +44,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays import ExtensionArray from pandas.core.base import NoNewAttributesMixin from pandas.core.construction import extract_array @@ -455,7 +456,7 @@ def _get_series_list(self, others): # in case of list-like `others`, all elements must be # either Series/Index/np.ndarray (1-dim)... if all( - isinstance(x, (ABCSeries, ABCIndex)) + isinstance(x, (ABCSeries, ABCIndex, ExtensionArray)) or (isinstance(x, np.ndarray) and x.ndim == 1) for x in others ): @@ -697,7 +698,7 @@ def cat( else: # Series if isinstance(self._orig.dtype, CategoricalDtype): # We need to infer the new categories. - dtype = None + dtype = self._orig.dtype.categories.dtype else: dtype = self._orig.dtype res_ser = Series( diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 497f87e245ba3..284932491a65e 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -85,39 +85,54 @@ def test_str_cat_raises_intuitive_error(index_or_series): s.str.cat(" ") +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) @pytest.mark.parametrize("sep", ["", None]) @pytest.mark.parametrize("dtype_target", ["object", "category"]) @pytest.mark.parametrize("dtype_caller", ["object", "category"]) -def test_str_cat_categorical(index_or_series, dtype_caller, dtype_target, sep): +def test_str_cat_categorical( + index_or_series, dtype_caller, dtype_target, sep, infer_string +): box = index_or_series - s = Index(["a", "a", "b", "a"], dtype=dtype_caller) - s = s if box == Index else Series(s, index=s) - t = Index(["b", "a", "b", "c"], dtype=dtype_target) - - expected = Index(["ab", "aa", "bb", "ac"]) - expected = expected if box == Index else Series(expected, index=s) + with option_context("future.infer_string", infer_string): + s = Index(["a", "a", "b", "a"], dtype=dtype_caller) + s = s if box == Index else Series(s, index=s) + t = Index(["b", "a", "b", "c"], dtype=dtype_target) + + expected = Index(["ab", "aa", "bb", "ac"]) + expected = ( + expected + if box == Index + else Series(expected, index=Index(s, dtype=dtype_caller)) + ) - # Series/Index with unaligned Index -> t.values - result = s.str.cat(t.values, sep=sep) - tm.assert_equal(result, expected) + # Series/Index with unaligned Index -> t.values + result = s.str.cat(t.values, sep=sep) + tm.assert_equal(result, expected) - # Series/Index with Series having matching Index - t = Series(t.values, index=s) - result = s.str.cat(t, sep=sep) - tm.assert_equal(result, expected) + # Series/Index with Series having matching Index + t = Series(t.values, index=Index(s, dtype=dtype_caller)) + result = s.str.cat(t, sep=sep) + tm.assert_equal(result, expected) - # Series/Index with Series.values - result = s.str.cat(t.values, sep=sep) - tm.assert_equal(result, expected) + # Series/Index with Series.values + result = s.str.cat(t.values, sep=sep) + tm.assert_equal(result, expected) - # Series/Index with Series having different Index - t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "bb", "bb", "aa"]) - expected = expected if box == Index else Series(expected, index=expected.str[:1]) + # Series/Index with Series having different Index + t = Series(t.values, index=t.values) + expected = Index(["aa", "aa", "bb", "bb", "aa"]) + dtype = object if dtype_caller == "object" else s.dtype.categories.dtype + expected = ( + expected + if box == Index + else Series(expected, index=Index(expected.str[:1], dtype=dtype)) + ) - result = s.str.cat(t, sep=sep) - tm.assert_equal(result, expected) + result = s.str.cat(t, sep=sep) + tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -328,8 +343,9 @@ def test_str_cat_all_na(index_or_series, index_or_series2): # all-NA target if box == Series: - expected = Series([np.nan] * 4, index=s.index, dtype=object) + expected = Series([np.nan] * 4, index=s.index, dtype=s.dtype) else: # box == Index + # TODO: Strimg option, this should return string dtype expected = Index([np.nan] * 4, dtype=object) result = s.str.cat(t, join="left") tm.assert_equal(result, expected) From f5f53d7209aa18ced64fb40b38d96f4006a81883 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 25 Nov 2023 00:24:26 +0100 Subject: [PATCH 7/9] Fix --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 305790238d6a6..609996ccf5bce 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -395,7 +395,7 @@ def cons_row(x): else: _dtype = vdtype elif vdtype is not None: - dtype = vdtype + _dtype = vdtype if expand: cons = self._orig._constructor_expanddim From c64c6467eff52573995660b4c5c65b09c141cb2d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 25 Nov 2023 00:39:57 +0100 Subject: [PATCH 8/9] Fix tests --- pandas/core/config_init.py | 2 +- pandas/core/strings/accessor.py | 19 ++++++++---- pandas/tests/strings/test_api.py | 6 +++- pandas/tests/strings/test_case_justify.py | 35 ++++++++++++++++------- pandas/tests/strings/test_extract.py | 15 ++++++---- 5 files changed, 54 insertions(+), 23 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index bdbab78a443de..a8b63f97141c2 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -905,7 +905,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - True, + False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 609996ccf5bce..95091cc5f2320 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -918,7 +918,10 @@ def split( if is_re(pat): regex = True result = self._data.array._str_split(pat, n, expand, regex) - dtype = object if self._data.dtype == object else None + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None return self._wrap_result( result, expand=expand, returns_string=expand, dtype=dtype ) @@ -1038,7 +1041,10 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False): @forbid_nonstring_types(["bytes"]) def partition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_partition(sep, expand) - dtype = object if self._data.dtype == object else None + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None return self._wrap_result( result, expand=expand, returns_string=expand, dtype=dtype ) @@ -1055,7 +1061,10 @@ def partition(self, sep: str = " ", expand: bool = True): @forbid_nonstring_types(["bytes"]) def rpartition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_rpartition(sep, expand) - dtype = object if self._data.dtype == object else None + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None return self._wrap_result( result, expand=expand, returns_string=expand, dtype=dtype ) @@ -2764,7 +2773,7 @@ def extract( else: name = _get_single_group_name(regex) result = self._data.array._str_extract(pat, flags=flags, expand=returns_df) - return self._wrap_result(result, name=name) + return self._wrap_result(result, name=name, dtype=result_dtype) @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags: int = 0) -> DataFrame: @@ -3504,7 +3513,7 @@ def str_extractall(arr, pat, flags: int = 0) -> DataFrame: raise ValueError("pattern contains no capture groups") if isinstance(arr, ABCIndex): - arr = arr.to_series().reset_index(drop=True) + arr = arr.to_series().reset_index(drop=True).astype(arr.dtype) columns = _get_group_names(regex) match_list = [] diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 2914b22a52e94..31e005466af7b 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -2,11 +2,13 @@ import pytest from pandas import ( + CategoricalDtype, DataFrame, Index, MultiIndex, Series, _testing as tm, + option_context, ) from pandas.core.strings.accessor import StringMethods @@ -162,7 +164,8 @@ def test_api_per_method( if inferred_dtype in allowed_types: # xref GH 23555, GH 23556 - method(*args, **kwargs) # works! + with option_context("future.no_silent_downcasting", True): + method(*args, **kwargs) # works! else: # GH 23011, GH 23163 msg = ( @@ -178,6 +181,7 @@ def test_api_for_categorical(any_string_method, any_string_dtype): s = Series(list("aabb"), dtype=any_string_dtype) s = s + " " + s c = s.astype("category") + c = c.astype(CategoricalDtype(c.dtype.categories.astype("object"))) assert isinstance(c.str, StringMethods) method_name, args, kwargs = any_string_method diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 1dee25e631648..41aedae90ca76 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -21,7 +21,8 @@ def test_title_mixed_object(): s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]) result = s.str.title() expected = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan] + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan], + dtype=object, ) tm.assert_almost_equal(result, expected) @@ -41,11 +42,15 @@ def test_lower_upper_mixed_object(): s = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = s.str.upper() - expected = Series(["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan]) + expected = Series( + ["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) result = s.str.lower() - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -71,7 +76,8 @@ def test_capitalize_mixed_object(): s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]) result = s.str.capitalize() expected = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan] + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -87,7 +93,8 @@ def test_swapcase_mixed_object(): s = Series(["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0]) result = s.str.swapcase() expected = Series( - ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan] + ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -138,19 +145,22 @@ def test_pad_mixed_object(): result = s.str.pad(5, side="left") expected = Series( - [" a", np.nan, " b", np.nan, np.nan, " ee", None, np.nan, np.nan] + [" a", np.nan, " b", np.nan, np.nan, " ee", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) result = s.str.pad(5, side="right") expected = Series( - ["a ", np.nan, "b ", np.nan, np.nan, "ee ", None, np.nan, np.nan] + ["a ", np.nan, "b ", np.nan, np.nan, "ee ", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) result = s.str.pad(5, side="both") expected = Series( - [" a ", np.nan, " b ", np.nan, np.nan, " ee ", None, np.nan, np.nan] + [" a ", np.nan, " b ", np.nan, np.nan, " ee ", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -238,7 +248,8 @@ def test_center_ljust_rjust_mixed_object(): None, np.nan, np.nan, - ] + ], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -255,7 +266,8 @@ def test_center_ljust_rjust_mixed_object(): None, np.nan, np.nan, - ] + ], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -272,7 +284,8 @@ def test_center_ljust_rjust_mixed_object(): None, np.nan, np.nan, - ] + ], + dtype=object, ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 9ad9b1eca41d9..77d008c650264 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -47,13 +47,16 @@ def test_extract_expand_False_mixed_object(): # two groups result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False) er = [np.nan, np.nan] # empty row - expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) + expected = DataFrame( + [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object + ) tm.assert_frame_equal(result, expected) # single group result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False) expected = Series( - ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan] + ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -238,7 +241,9 @@ def test_extract_expand_True_mixed_object(): ) result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=True) - expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) + expected = DataFrame( + [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object + ) tm.assert_frame_equal(result, expected) @@ -603,8 +608,8 @@ def test_extractall_stringindex(any_string_dtype): # index.name doesn't affect to the result if any_string_dtype == "object": for idx in [ - Index(["a1a2", "b1", "c1"]), - Index(["a1a2", "b1", "c1"], name="xxx"), + Index(["a1a2", "b1", "c1"], dtype=object), + Index(["a1a2", "b1", "c1"], name="xxx", dtype=object), ]: result = idx.str.extractall(r"[ab](?P\d)") tm.assert_frame_equal(result, expected) From aa5a42e20d9a10fab44238702489201d4df56b94 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 8 Dec 2023 22:54:25 +0100 Subject: [PATCH 9/9] Update accessor.py --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 95091cc5f2320..75866c6f6013a 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -699,7 +699,7 @@ def cat( else: # Series if isinstance(self._orig.dtype, CategoricalDtype): # We need to infer the new categories. - dtype = self._orig.dtype.categories.dtype + dtype = self._orig.dtype.categories.dtype # type: ignore[assignment] else: dtype = self._orig.dtype res_ser = Series(