From d1366df478f7c198c72daa3257b1595e35c8e65c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 22:04:47 +0200 Subject: [PATCH 1/2] REF: Move methods that can be shared with new string dtype --- pandas/core/arrays/_arrow_string_mixins.py | 77 ++++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 68 ++----------------- 2 files changed, 82 insertions(+), 63 deletions(-) create mode 100644 pandas/core/arrays/_arrow_string_mixins.py diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py new file mode 100644 index 0000000000000..29ea4c6dc221d --- /dev/null +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from typing import Literal + +import numpy as np + +from pandas.compat import pa_version_under7p0 + +if not pa_version_under7p0: + import pyarrow as pa + import pyarrow.compute as pc + + +class ArrowStringArrayMixin: + def _str_pad( + self, + width: int, + side: Literal["left", "right", "both"] = "left", + fillchar: str = " ", + ): + if side == "left": + pa_pad = pc.utf8_lpad + elif side == "right": + pa_pad = pc.utf8_rpad + elif side == "both": + pa_pad = pc.utf8_center + else: + raise ValueError( + f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" + ) + return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) + + def _str_get(self, i: int): + lengths = pc.utf8_length(self._pa_array) + if i >= 0: + out_of_bounds = pc.greater_equal(i, lengths) + start = i + stop = i + 1 + step = 1 + else: + out_of_bounds = pc.greater(-i, lengths) + start = i + stop = i - 1 + step = -1 + not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) + selected = pc.utf8_slice_codeunits( + self._pa_array, start=start, stop=stop, step=step + ) + null_value = pa.scalar(None, type=self._pa_array.type) + result = pc.if_else(not_out_of_bounds, selected, null_value) + return type(self)(result) + + def _str_slice_replace( + self, start: int | None = None, stop: int | None = None, repl: str | None = None + ): + if repl is None: + repl = "" + if start is None: + start = 0 + if stop is None: + stop = np.iinfo(np.int64).max + return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + + def _str_capitalize(self): + return type(self)(pc.utf8_capitalize(self._pa_array)) + + def _str_title(self): + return type(self)(pc.utf8_title(self._pa_array)) + + def _str_swapcase(self): + return type(self)(pc.utf8_swapcase(self._pa_array)) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 88695f11fba59..5f39b86c1dc3e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -42,6 +42,7 @@ from pandas.core import roperator from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.base import ( ExtensionArray, ExtensionArraySupportsAnyAll, @@ -184,7 +185,10 @@ def to_pyarrow_type( class ArrowExtensionArray( - OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods + OpsMixin, + ExtensionArraySupportsAnyAll, + ArrowStringArrayMixin, + BaseStringArrayMethods, ): """ Pandas ExtensionArray backed by a PyArrow ChunkedArray. @@ -1987,24 +1991,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_pad( - self, - width: int, - side: Literal["left", "right", "both"] = "left", - fillchar: str = " ", - ): - if side == "left": - pa_pad = pc.utf8_lpad - elif side == "right": - pa_pad = pc.utf8_rpad - elif side == "both": - pa_pad = pc.utf8_center - else: - raise ValueError( - f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" - ) - return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True ): @@ -2089,26 +2075,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): ) return type(self)(result) - def _str_get(self, i: int): - lengths = pc.utf8_length(self._pa_array) - if i >= 0: - out_of_bounds = pc.greater_equal(i, lengths) - start = i - stop = i + 1 - step = 1 - else: - out_of_bounds = pc.greater(-i, lengths) - start = i - stop = i - 1 - step = -1 - not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) - selected = pc.utf8_slice_codeunits( - self._pa_array, start=start, stop=stop, step=step - ) - null_value = pa.scalar(None, type=self._pa_array.type) - result = pc.if_else(not_out_of_bounds, selected, null_value) - return type(self)(result) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type): result = self._apply_elementwise(list) @@ -2138,15 +2104,6 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_slice_replace( - self, start: int | None = None, stop: int | None = None, repl: str | None = None - ): - if repl is None: - repl = "" - if start is None: - start = 0 - return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) - def _str_isalnum(self): return type(self)(pc.utf8_is_alnum(self._pa_array)) @@ -2171,18 +2128,9 @@ def _str_isspace(self): def _str_istitle(self): return type(self)(pc.utf8_is_title(self._pa_array)) - def _str_capitalize(self): - return type(self)(pc.utf8_capitalize(self._pa_array)) - - def _str_title(self): - return type(self)(pc.utf8_title(self._pa_array)) - def _str_isupper(self): return type(self)(pc.utf8_is_upper(self._pa_array)) - def _str_swapcase(self): - return type(self)(pc.utf8_swapcase(self._pa_array)) - def _str_len(self): return type(self)(pc.utf8_length(self._pa_array)) @@ -2223,12 +2171,6 @@ def _str_removeprefix(self, prefix: str): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_casefold(self): predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) From 51ce0847f16e47205a8441244e5d718b539c74dd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 22:55:14 +0200 Subject: [PATCH 2/2] Fix typing --- pandas/core/arrays/_arrow_string_mixins.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 29ea4c6dc221d..63db03340683b 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -12,6 +12,11 @@ class ArrowStringArrayMixin: + _pa_array = None + + def __init__(self, *args, **kwargs) -> None: + raise NotImplementedError + def _str_pad( self, width: int, @@ -46,7 +51,9 @@ def _str_get(self, i: int): selected = pc.utf8_slice_codeunits( self._pa_array, start=start, stop=stop, step=step ) - null_value = pa.scalar(None, type=self._pa_array.type) + null_value = pa.scalar( + None, type=self._pa_array.type # type: ignore[attr-defined] + ) result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result)