diff --git a/pandas/conftest.py b/pandas/conftest.py index 5210e727aeb3c..10826f50d1fe1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1321,6 +1321,7 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1329,6 +1330,7 @@ def string_storage(request): * 'python' * 'pyarrow' + * 'pyarrow_numpy' """ return request.param @@ -1380,6 +1382,7 @@ def object_dtype(request): "object", "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), ] ) def any_string_dtype(request): @@ -2000,4 +2003,4 @@ def warsaw(request) -> str: @pytest.fixture() def arrow_string_storage(): - return ("pyarrow",) + return ("pyarrow", "pyarrow_numpy") diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 48ff769f6c737..c4960f49e3ba5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -515,7 +515,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if self._dtype.name == "string" and self._dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ): pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 25f1c2ec6ce4f..1e285f90e9fea 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -76,7 +76,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow"}, optional + storage : {"python", "pyarrow", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. 
Attributes @@ -108,11 +108,12 @@ def na_value(self) -> libmissing.NAType: def __init__(self, storage=None) -> None: if storage is None: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow"}: + if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( - f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " + f"Got {storage} instead." ) - if storage == "pyarrow" and pa_version_under7p0: + if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: raise ImportError( "pyarrow>=7.0.0 is required for PyArrow backed StringArray." ) @@ -160,6 +161,8 @@ def construct_from_string(cls, string): return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") + elif string == "string[pyarrow_numpy]": + return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -176,12 +179,17 @@ def construct_array_type( # type: ignore[override] ------- type """ - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, + ) if self.storage == "python": return StringArray - else: + elif self.storage == "pyarrow": return ArrowStringArray + else: + return ArrowStringArrayNumpySemantics def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -193,6 +201,10 @@ def __from_arrow__( from pandas.core.arrays.string_arrow import ArrowStringArray return ArrowStringArray(array) + elif self.storage == "pyarrow_numpy": + from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + + return ArrowStringArrayNumpySemantics(array) else: import pyarrow diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4a70fcf6b5a93..bc1d7cb52e196 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial import re from 
typing import ( TYPE_CHECKING, @@ -27,6 +28,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype @@ -113,10 +115,11 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] + _storage = "pyarrow" def __init__(self, values) -> None: super().__init__(values) - self._dtype = StringDtype(storage="pyarrow") + self._dtype = StringDtype(storage=self._storage) if not pa.types.is_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -144,7 +147,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + assert isinstance(dtype, StringDtype) and dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ) if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -178,6 +184,10 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) + @classmethod + def _result_converter(cls, values, na=None): + return BooleanDtype().__from_arrow__(values) + def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" if is_scalar(value): @@ -313,7 +323,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = 
BooleanDtype().__from_arrow__(result) + result = self._result_converter(result, na=na) if not isna(na): result[isna(result)] = bool(na) return result @@ -322,7 +332,7 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -331,7 +341,7 @@ def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -369,39 +379,39 @@ def _str_fullmatch( def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdigit(self): result = pc.utf8_is_digit(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_islower(self): result = pc.utf8_is_lower(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isnumeric(self): result = pc.utf8_is_numeric(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isspace(self): result = pc.utf8_is_space(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_istitle(self): result = pc.utf8_is_title(self._pa_array) - return 
BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_len(self): result = pc.utf8_length(self._pa_array) @@ -433,3 +443,114 @@ def _str_rstrip(self, to_strip=None): else: result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _storage = "pyarrow_numpy" + + @classmethod + def _result_converter(cls, values, na=None): + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + + def __getattribute__(self, item): + # ArrowStringArray and we both inherit from ArrowExtensionArray, which + # creates inheritance problems (Diamond inheritance) + if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array": + return partial(getattr(ArrowStringArrayMixin, item), self) + return super().__getattribute__(item) + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + na_value = np.nan + else: + na_value = False + try: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), # type: ignore[arg-type] + ) + return result + + except ValueError: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + ) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. 
StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _convert_int_dtype(self, result): + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + result = pc.count_substring_regex(self._pa_array, pat).to_numpy() + return self._convert_int_dtype(result) + + def _str_len(self): + result = pc.utf8_length(self._pa_array).to_numpy() + return self._convert_int_dtype(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start > 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + offset_result = pc.add(result, start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return self._convert_int_dtype(result.to_numpy()) + + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + return result.to_numpy(np.bool_, na_value=False) + + def value_counts(self, dropna: bool = True): + from pandas import Series + + result = super().value_counts(dropna) + return Series( + result._values.to_numpy(), index=result.index, name=result.name, copy=False + ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 27e9bf8958ab0..745689ab1fcc8 100644 
--- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -500,7 +500,7 @@ def use_inf_as_na_cb(key) -> None: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), ) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index becf9b47b3af1..124ca546c4583 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -145,7 +145,9 @@ def _map_and_wrap(name: str | None, docstring: str | None): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): result = getattr(self._data.array, f"_str_{name}")() - return self._wrap_result(result) + return self._wrap_result( + result, returns_string=name not in ("isnumeric", "isdecimal") + ) wrapper.__doc__ = docstring return wrapper diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index de93e89ecacd5..e29a72e1a5338 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -5,12 +5,16 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under12p0 + from pandas.core.dtypes.common import is_dtype_equal import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringArray -from pandas.util.version import Version +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) @pytest.fixture @@ -33,7 +37,12 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + elif dtype.storage == "pyarrow_numpy": + arr_name = "ArrowStringArrayNumpySemantics" + else: + arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" 
assert repr(df.A.array) == expected @@ -195,19 +204,34 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = ( + pd.array(expected, dtype="boolean") + .to_numpy(na_value=False) + .astype(np.bool_) + ) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_pd_na(comparison_op, dtype): op_name = f"__{comparison_op.__name__}__" a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_not_string(comparison_op, dtype): @@ -223,12 +247,21 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): return result = getattr(a, 
op_name)(other) - expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ - op_name - ] - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array(expected_data, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected_data = { + "__eq__": [False, False, False], + "__ne__": [True, False, True], + }[op_name] + expected = np.array(expected_data) + tm.assert_numpy_array_equal(result, expected) + else: + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array(expected_data, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_array(comparison_op, dtype): @@ -237,15 +270,25 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.full(len(a), fill_value=None, dtype="object") - expected[-1] = getattr(other[-1], op_name)(a[-1]) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False]) + expected[-1] = getattr(other[-1], op_name)(a[-1]) + tm.assert_numpy_array_equal(result, expected) - result = getattr(a, op_name)(pd.NA) - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + result = getattr(a, op_name)(pd.NA) + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.full(len(a), fill_value=None, dtype="object") 
+ expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_constructor_raises(cls): @@ -297,7 +340,7 @@ def test_from_sequence_no_mutate(copy, cls, request): result = cls._from_sequence(nan_arr, copy=copy) - if cls is ArrowStringArray: + if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) @@ -412,7 +455,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype.storage == "pyarrow" and Version(pa.__version__) <= Version("11.0.0"): + if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) assert arr.equals(expected) @@ -455,6 +498,8 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = "int64" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -470,6 +515,8 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "double[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = np.float64 else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 9eee2e0bea687..20530e37116f2 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -324,7 +324,12 
@@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): ): arr.searchsorted("foo") - arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray" + if string_storage == "python": + arr_type = "StringArray" + elif string_storage == "pyarrow": + arr_type = "ArrowStringArray" + else: + arr_type = "ArrowStringArrayNumpySemantics" with pd.option_context("string_storage", string_storage): with pytest.raises( diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 2dd62a4ca7538..16059155a7a8f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -70,6 +70,9 @@ def test_value_counts_with_normalize(self, data): ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") + elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": + # TODO: avoid special-casing + expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 4e142eb6e14b8..069d53aeb248f 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -184,6 +184,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # attribute "storage" if dtype.storage == "pyarrow": # type: ignore[union-attr] cast_to = "boolean[pyarrow]" + elif dtype.storage == "pyarrow_numpy": # type: ignore[union-attr] + cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" return pointwise_result.astype(cast_to) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 170e2f61e7d4a..701bfe3767db4 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -234,3 +234,19 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): 
return request.param[0] + + +@pytest.fixture( + params=[ + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + ] +) +def string_storage(request): + """ + Parametrized fixture for pd.options.mode.string_storage. + + * 'python' + * 'pyarrow' + """ + return request.param diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 496a2d095d85b..bf119f2721ed4 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -2,7 +2,7 @@ import pandas as pd -object_pyarrow_numpy = ("object",) +object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") def _convert_na_value(ser, expected): diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index ced941187f548..1dee25e631648 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -278,6 +278,11 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): + if any_string_dtype == "string[pyarrow_numpy]": + pytest.skip( + "Arrow logic is different, " + "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", + ) s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index d5017b1c47d85..78f0730d730e8 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -22,7 +22,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]",) + return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") def test_contains(any_string_dtype): @@ -223,6 +223,8 @@ def test_contains_nan(any_string_dtype): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype == "string[pyarrow_numpy]": + expected = Series([True, 
True, True], dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) @@ -807,7 +809,7 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype)