From d1366df478f7c198c72daa3257b1595e35c8e65c Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 13 Aug 2023 22:04:47 +0200
Subject: [PATCH 1/2] REF: Move methods that can be shared with new string
 dtype

---
 pandas/core/arrays/_arrow_string_mixins.py | 77 ++++++++++++++++++++++
 pandas/core/arrays/arrow/array.py          | 68 ++-----------------
 2 files changed, 82 insertions(+), 63 deletions(-)
 create mode 100644 pandas/core/arrays/_arrow_string_mixins.py

diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
new file mode 100644
index 0000000000000..29ea4c6dc221d
--- /dev/null
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+from typing import Literal
+
+import numpy as np
+
+from pandas.compat import pa_version_under7p0
+
+if not pa_version_under7p0:
+    import pyarrow as pa
+    import pyarrow.compute as pc
+
+
+class ArrowStringArrayMixin:
+    def _str_pad(
+        self,
+        width: int,
+        side: Literal["left", "right", "both"] = "left",
+        fillchar: str = " ",
+    ):
+        if side == "left":
+            pa_pad = pc.utf8_lpad
+        elif side == "right":
+            pa_pad = pc.utf8_rpad
+        elif side == "both":
+            pa_pad = pc.utf8_center
+        else:
+            raise ValueError(
+                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
+            )
+        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
+
+    def _str_get(self, i: int):
+        lengths = pc.utf8_length(self._pa_array)
+        if i >= 0:
+            out_of_bounds = pc.greater_equal(i, lengths)
+            start = i
+            stop = i + 1
+            step = 1
+        else:
+            out_of_bounds = pc.greater(-i, lengths)
+            start = i
+            stop = i - 1
+            step = -1
+        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
+        selected = pc.utf8_slice_codeunits(
+            self._pa_array, start=start, stop=stop, step=step
+        )
+        null_value = pa.scalar(None, type=self._pa_array.type)
+        result = pc.if_else(not_out_of_bounds, selected, null_value)
+        return type(self)(result)
+
+    def _str_slice_replace(
+        self, start: int | None = None, stop: int | None = None, repl: str | None = None
+    ):
+        if repl is None:
+            repl = ""
+        if start is None:
+            start = 0
+        if stop is None:
+            stop = np.iinfo(np.int64).max
+        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
+
+    def _str_capitalize(self):
+        return type(self)(pc.utf8_capitalize(self._pa_array))
+
+    def _str_title(self):
+        return type(self)(pc.utf8_title(self._pa_array))
+
+    def _str_swapcase(self):
+        return type(self)(pc.utf8_swapcase(self._pa_array))
+
+    def _str_removesuffix(self, suffix: str):
+        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
+        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
+        result = pc.if_else(ends_with, removed, self._pa_array)
+        return type(self)(result)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 88695f11fba59..5f39b86c1dc3e 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -42,6 +42,7 @@
 
 from pandas.core import roperator
 from pandas.core.arraylike import OpsMixin
+from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
 from pandas.core.arrays.base import (
     ExtensionArray,
     ExtensionArraySupportsAnyAll,
@@ -184,7 +185,10 @@ def to_pyarrow_type(
 
 
 class ArrowExtensionArray(
-    OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods
+    OpsMixin,
+    ExtensionArraySupportsAnyAll,
+    ArrowStringArrayMixin,
+    BaseStringArrayMethods,
 ):
     """
     Pandas ExtensionArray backed by a PyArrow ChunkedArray.
@@ -1987,24 +1991,6 @@ def _str_count(self, pat: str, flags: int = 0):
             raise NotImplementedError(f"count not implemented with {flags=}")
         return type(self)(pc.count_substring_regex(self._pa_array, pat))
 
-    def _str_pad(
-        self,
-        width: int,
-        side: Literal["left", "right", "both"] = "left",
-        fillchar: str = " ",
-    ):
-        if side == "left":
-            pa_pad = pc.utf8_lpad
-        elif side == "right":
-            pa_pad = pc.utf8_rpad
-        elif side == "both":
-            pa_pad = pc.utf8_center
-        else:
-            raise ValueError(
-                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
-            )
-        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
-
     def _str_contains(
         self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
     ):
@@ -2089,26 +2075,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
             )
         return type(self)(result)
 
-    def _str_get(self, i: int):
-        lengths = pc.utf8_length(self._pa_array)
-        if i >= 0:
-            out_of_bounds = pc.greater_equal(i, lengths)
-            start = i
-            stop = i + 1
-            step = 1
-        else:
-            out_of_bounds = pc.greater(-i, lengths)
-            start = i
-            stop = i - 1
-            step = -1
-        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
-        selected = pc.utf8_slice_codeunits(
-            self._pa_array, start=start, stop=stop, step=step
-        )
-        null_value = pa.scalar(None, type=self._pa_array.type)
-        result = pc.if_else(not_out_of_bounds, selected, null_value)
-        return type(self)(result)
-
     def _str_join(self, sep: str):
         if pa.types.is_string(self._pa_array.type):
             result = self._apply_elementwise(list)
@@ -2138,15 +2104,6 @@ def _str_slice(
             pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
         )
 
-    def _str_slice_replace(
-        self, start: int | None = None, stop: int | None = None, repl: str | None = None
-    ):
-        if repl is None:
-            repl = ""
-        if start is None:
-            start = 0
-        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
-
     def _str_isalnum(self):
         return type(self)(pc.utf8_is_alnum(self._pa_array))
 
@@ -2171,18 +2128,9 @@ def _str_isspace(self):
     def _str_istitle(self):
         return type(self)(pc.utf8_is_title(self._pa_array))
 
-    def _str_capitalize(self):
-        return type(self)(pc.utf8_capitalize(self._pa_array))
-
-    def _str_title(self):
-        return type(self)(pc.utf8_title(self._pa_array))
-
     def _str_isupper(self):
         return type(self)(pc.utf8_is_upper(self._pa_array))
 
-    def _str_swapcase(self):
-        return type(self)(pc.utf8_swapcase(self._pa_array))
-
     def _str_len(self):
         return type(self)(pc.utf8_length(self._pa_array))
 
@@ -2223,12 +2171,6 @@ def _str_removeprefix(self, prefix: str):
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))
 
-    def _str_removesuffix(self, suffix: str):
-        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
-        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
-        result = pc.if_else(ends_with, removed, self._pa_array)
-        return type(self)(result)
-
     def _str_casefold(self):
         predicate = lambda val: val.casefold()
         result = self._apply_elementwise(predicate)

From 51ce0847f16e47205a8441244e5d718b539c74dd Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 13 Aug 2023 22:55:14 +0200
Subject: [PATCH 2/2] Fix typing

---
 pandas/core/arrays/_arrow_string_mixins.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
index 29ea4c6dc221d..63db03340683b 100644
--- a/pandas/core/arrays/_arrow_string_mixins.py
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -12,6 +12,11 @@
 
 
 class ArrowStringArrayMixin:
+    _pa_array = None
+
+    def __init__(self, *args, **kwargs) -> None:
+        raise NotImplementedError
+
     def _str_pad(
         self,
         width: int,
@@ -46,7 +51,9 @@ def _str_get(self, i: int):
         selected = pc.utf8_slice_codeunits(
             self._pa_array, start=start, stop=stop, step=step
         )
-        null_value = pa.scalar(None, type=self._pa_array.type)
+        null_value = pa.scalar(
+            None, type=self._pa_array.type  # type: ignore[attr-defined]
+        )
         result = pc.if_else(not_out_of_bounds, selected, null_value)
         return type(self)(result)