From 2fdb16b347fc34f78213868a8a973447ac79ab2d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 31 Oct 2024 11:16:04 +0100
Subject: [PATCH] String dtype: implement sum reduction (#59853)

---
 doc/source/whatsnew/v2.3.0.rst                |  2 +-
 pandas/core/array_algos/masked_reductions.py  |  4 ++
 pandas/core/arrays/arrow/array.py             | 32 ++++++++++
 pandas/core/arrays/string_.py                 | 18 +++++-
 pandas/core/arrays/string_arrow.py            |  6 +-
 pandas/tests/apply/test_frame_apply.py        | 10 ---
 pandas/tests/apply/test_invalid_arg.py        | 39 ++++++------
 pandas/tests/arrays/string_/test_string.py    |  2 -
 pandas/tests/extension/test_arrow.py          | 25 ++------
 pandas/tests/extension/test_string.py         |  2 +-
 pandas/tests/frame/test_reductions.py         | 61 ++++++-------------
 pandas/tests/groupby/aggregate/test_cython.py |  1 -
 pandas/tests/groupby/test_groupby.py          | 15 +----
 pandas/tests/groupby/test_raises.py           |  4 +-
 .../tests/groupby/transform/test_transform.py | 11 +---
 pandas/tests/series/test_reductions.py        | 39 ++++--------
 16 files changed, 120 insertions(+), 151 deletions(-)

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 01c2ed3821d7a..64486c5a3e3ba 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -32,7 +32,7 @@ enhancement1
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
--
+- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py
index f2a32fbe2b0e5..bdf88f2e9fa07 100644
--- a/pandas/core/array_algos/masked_reductions.py
+++ b/pandas/core/array_algos/masked_reductions.py
@@ -62,6 +62,10 @@ def _reductions(
         ):
             return libmissing.NA
 
+        if values.dtype == np.dtype(object):
+            # object dtype does not support `where` without passing an initial
+            values = values[~mask]
+            return func(values, axis=axis, **kwargs)
         return func(values, where=~mask, axis=axis, **kwargs)
 
 
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 619e7b3ccfb4f..53f703b701217 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -68,6 +68,7 @@
     unpack_tuple_and_ellipses,
     validate_indices,
 )
+from pandas.core.nanops import check_below_min_count
 from pandas.core.strings.base import BaseStringArrayMethods
 
 from pandas.io._util import _arrow_dtype_mapping
@@ -1705,6 +1706,37 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
                 denominator = pc.sqrt_checked(pc.count(self._pa_array))
                 return pc.divide_checked(numerator, denominator)
 
+        elif name == "sum" and (
+            pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type)
+        ):
+
+            def pyarrow_meth(data, skip_nulls, min_count=0):  # type: ignore[misc]
+                mask = pc.is_null(data) if data.null_count > 0 else None
+                if skip_nulls:
+                    if min_count > 0 and check_below_min_count(
+                        (len(data),),
+                        None if mask is None else mask.to_numpy(),
+                        min_count,
+                    ):
+                        return pa.scalar(None, type=data.type)
+                    if data.null_count > 0:
+                        # binary_join returns null if there is any null ->
+                        # have to filter out any nulls
+                        data = data.filter(pc.invert(mask))
+                else:
+                    if mask is not None or check_below_min_count(
+                        (len(data),), None, min_count
+                    ):
+                        return pa.scalar(None, type=data.type)
+
+                if pa.types.is_large_string(data.type):
+                    # binary_join only supports string, not large_string
+                    data = data.cast(pa.string())
+                data_list = pa.ListArray.from_arrays(
+                    [0, len(data)], data.combine_chunks()
+                )[0]
+                return pc.binary_join(data_list, "")
+
         else:
             pyarrow_name = {
                 "median": "quantile",
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index f20c4c8625475..4af26858cb131 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -812,8 +812,8 @@ def _reduce(
             else:
                 return nanops.nanall(self._ndarray, skipna=skipna)
 
-        if name in ["min", "max"]:
-            result = getattr(self, name)(skipna=skipna, axis=axis)
+        if name in ["min", "max", "sum"]:
+            result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs)
             if keepdims:
                 return self._from_sequence([result], dtype=self.dtype)
             return result
@@ -840,6 +840,20 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
         )
         return self._wrap_reduction_result(axis, result)
 
+    def sum(
+        self,
+        *,
+        axis: AxisInt | None = None,
+        skipna: bool = True,
+        min_count: int = 0,
+        **kwargs,
+    ) -> Scalar:
+        nv.validate_sum((), kwargs)
+        result = masked_reductions.sum(
+            values=self._ndarray, mask=self.isna(), skipna=skipna
+        )
+        return self._wrap_reduction_result(axis, result)
+
     def value_counts(self, dropna: bool = True) -> Series:
         from pandas.core.algorithms import value_counts_internal as value_counts
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 9e9893ecbbd97..cde39c7f4dc6a 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -435,7 +435,11 @@ def _reduce(
                 return result.astype(np.bool_)
             return result
 
-        result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
+        if name in ("min", "max", "sum", "argmin", "argmax"):
+            result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
+        else:
+            raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
+
         if name in ("argmin", "argmax") and isinstance(result, pa.Array):
             return self._convert_int_result(result)
         elif isinstance(result, pa.Array):
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index f0ab01e9e960e..ed7eae4502a64 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -4,10 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
-
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
 import pandas as pd
@@ -1218,7 +1214,6 @@ def test_agg_with_name_as_column_name():
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_agg_multiple_mixed():
     # GH 20909
     mdf = DataFrame(
@@ -1247,9 +1242,6 @@ def test_agg_multiple_mixed():
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
 def test_agg_multiple_mixed_raises():
     # GH 20909
     mdf = DataFrame(
@@ -1347,7 +1339,6 @@ def test_named_agg_reduce_axis1_raises(float_frame):
             float_frame.agg(row1=(name1, "sum"), row2=(name2, "max"), axis=axis)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_nuiscance_columns():
     # GH 15015
     df = DataFrame(
@@ -1524,7 +1515,6 @@ def test_apply_datetime_tz_issue(engine, request):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})])
 @pytest.mark.parametrize("method", ["min", "max", "sum"])
 def test_mixed_column_raises(df, method, using_infer_string):
diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
index ba970e328ae40..e19c21f81b3e1 100644
--- a/pandas/tests/apply/test_invalid_arg.py
+++ b/pandas/tests/apply/test_invalid_arg.py
@@ -12,9 +12,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
 from pandas.errors import SpecificationError
 
 from pandas import (
@@ -212,10 +209,6 @@ def transform(row):
         data.apply(transform, axis=1)
 
 
-# we should raise a proper TypeError instead of propagating the pyarrow error
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
 @pytest.mark.parametrize(
     "df, func, expected",
     tm.get_cython_table_params(
@@ -225,21 +218,25 @@ def transform(row):
 def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string):
     # GH 21224
     if using_infer_string:
-        import pyarrow as pa
+        if df.dtypes.iloc[0].storage == "pyarrow":
+            import pyarrow as pa
 
-        expected = (expected, pa.lib.ArrowNotImplementedError)
+            # TODO(infer_string)
+            # should raise a proper TypeError instead of propagating the pyarrow error
 
-    msg = "can't multiply sequence by non-int of type 'str'|has no kernel"
+            expected = (expected, pa.lib.ArrowNotImplementedError)
+        else:
+            expected = (expected, NotImplementedError)
+
+    msg = (
+        "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform"
+    )
     warn = None if isinstance(func, str) else FutureWarning
     with pytest.raises(expected, match=msg):
         with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"):
             df.agg(func, axis=axis)
 
 
-# we should raise a proper TypeError instead of propagating the pyarrow error
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
 @pytest.mark.parametrize(
     "series, func, expected",
     chain(
@@ -263,11 +260,15 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri
         msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"
 
     if using_infer_string:
-        import pyarrow as pa
-
-        expected = (expected, pa.lib.ArrowNotImplementedError)
-
-    msg = msg + "|does not support|has no kernel"
+        if series.dtype.storage == "pyarrow":
+            import pyarrow as pa
+
+            # TODO(infer_string)
+            # should raise a proper TypeError instead of propagating the pyarrow error
+            expected = (expected, pa.lib.ArrowNotImplementedError)
+        else:
+            expected = (expected, NotImplementedError)
+    msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform"
     warn = None if isinstance(func, str) else FutureWarning
 
     with pytest.raises(expected, match=msg):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 33708be497f31..7856cf390127e 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -444,14 +444,12 @@ def test_astype_float(dtype, any_float_dtype):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(reason="Not implemented StringArray.sum")
 def test_reduce(skipna, dtype):
     arr = pd.Series(["a", "b", "c"], dtype=dtype)
     result = arr.sum(skipna=skipna)
     assert result == "abc"
 
 
-@pytest.mark.xfail(reason="Not implemented StringArray.sum")
 def test_reduce_missing(skipna, dtype):
     arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype)
     result = arr.sum(skipna=skipna)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index f56094dfd47ca..f0ff11e5fa3f7 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -461,10 +461,11 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
                 pass
             else:
                 return False
+        elif pa.types.is_binary(pa_dtype) and op_name == "sum":
+            return False
         elif (
             pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
         ) and op_name in [
-            "sum",
             "mean",
             "median",
             "prod",
@@ -563,6 +564,8 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
             cmp_dtype = "float64[pyarrow]"
         elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type):
             cmp_dtype = "uint64[pyarrow]"
+        elif op_name == "sum" and pa.types.is_string(pa_type):
+            cmp_dtype = arr.dtype
         else:
             cmp_dtype = {
                 "i": "int64[pyarrow]",
@@ -594,26 +597,6 @@ def test_median_not_approximate(self, typ):
         result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median()
         assert result == 1.5
 
-    def test_in_numeric_groupby(self, data_for_grouping):
-        dtype = data_for_grouping.dtype
-        if is_string_dtype(dtype):
-            df = pd.DataFrame(
-                {
-                    "A": [1, 1, 2, 2, 3, 3, 1, 4],
-                    "B": data_for_grouping,
-                    "C": [1, 1, 1, 1, 1, 1, 1, 1],
-                }
-            )
-
-            expected = pd.Index(["C"])
-            msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}")
-            with pytest.raises(TypeError, match=msg):
-                df.groupby("A").sum()
-            result = df.groupby("A").sum(numeric_only=True).columns
-            tm.assert_index_equal(result, expected)
-        else:
-            super().test_in_numeric_groupby(data_for_grouping)
-
     def test_construct_from_string_own_name(self, dtype, request):
         pa_dtype = dtype.pyarrow_dtype
         if pa.types.is_decimal(pa_dtype):
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 509ae653e4793..57710d9caad4d 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -188,7 +188,7 @@ def _get_expected_exception(
 
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
         return (
-            op_name in ["min", "max"]
+            op_name in ["min", "max", "sum"]
             or ser.dtype.na_value is np.nan  # type: ignore[union-attr]
             and op_name in ("any", "all")
         )
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 1d667d35db253..05bb603f5c462 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -226,7 +226,6 @@ def float_frame_with_na():
 class TestDataFrameAnalytics:
     # ---------------------------------------------------------------------
     # Reductions
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("axis", [0, 1])
     @pytest.mark.parametrize(
         "opname",
@@ -246,17 +245,11 @@ class TestDataFrameAnalytics:
             pytest.param("kurt", marks=td.skip_if_no("scipy")),
         ],
     )
-    def test_stat_op_api_float_string_frame(
-        self, float_string_frame, axis, opname, using_infer_string
-    ):
-        if (
-            (opname in ("sum", "min", "max") and axis == 0)
-            or opname
-            in (
-                "count",
-                "nunique",
-            )
-        ) and not (using_infer_string and opname == "sum"):
+    def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
+        if (opname in ("sum", "min", "max") and axis == 0) or opname in (
+            "count",
+            "nunique",
+        ):
             getattr(float_string_frame, opname)(axis=axis)
         else:
             if opname in ["var", "std", "sem", "skew", "kurt"]:
@@ -283,10 +276,11 @@ def test_stat_op_api_float_string_frame(
                 msg = "'[><]=' not supported between instances of 'float' and 'str'"
             elif opname == "median":
                 msg = re.compile(
-                    r"Cannot convert \[.*\] to numeric|does not support", flags=re.S
+                    r"Cannot convert \[.*\] to numeric|does not support|Cannot perform",
+                    flags=re.S,
                 )
             if not isinstance(msg, re.Pattern):
-                msg = msg + "|does not support"
+                msg = msg + "|does not support|Cannot perform reduction"
             with pytest.raises(TypeError, match=msg):
                 getattr(float_string_frame, opname)(axis=axis)
         if opname != "nunique":
@@ -432,7 +426,6 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis):
             expected[expected.isna()] = None
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
     def test_mixed_ops(self, op):
         # GH#16116
@@ -449,26 +442,16 @@ def test_mixed_ops(self, op):
                 "could not convert",
                 "can't multiply sequence by non-int",
                 "does not support",
+                "Cannot perform",
             ]
         )
         with pytest.raises(TypeError, match=msg):
             getattr(df, op)()
 
         with pd.option_context("use_bottleneck", False):
-            msg = "|".join(
-                [
-                    "Could not convert",
-                    "could not convert",
-                    "can't multiply sequence by non-int",
-                    "does not support",
-                ]
-            )
             with pytest.raises(TypeError, match=msg):
                 getattr(df, op)()
 
-    @pytest.mark.xfail(
-        using_string_dtype(), reason="sum doesn't work for arrow strings"
-    )
     def test_reduce_mixed_frame(self):
         # GH 6806
         df = DataFrame(
@@ -608,7 +591,6 @@ def test_sem(self, datetime_frame):
             result = nanops.nansem(arr, axis=0)
             assert not (result < 0).any()
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "dropna, expected",
         [
@@ -630,7 +612,7 @@ def test_sem(self, datetime_frame):
                     "A": [12],
                     "B": [10.0],
                     "C": [np.nan],
-                    "D": np.array([np.nan], dtype=object),
+                    "D": Series([np.nan], dtype="str"),
                     "E": Categorical([np.nan], categories=["a"]),
                     "F": DatetimeIndex([pd.NaT], dtype="M8[ns]"),
                     "G": to_timedelta([pd.NaT]),
@@ -672,7 +654,7 @@ def test_mode_dropna(self, dropna, expected):
                 "A": [12, 12, 19, 11],
                 "B": [10, 10, np.nan, 3],
                 "C": [1, np.nan, np.nan, np.nan],
-                "D": Series([np.nan, np.nan, "a", np.nan], dtype=object),
+                "D": Series([np.nan, np.nan, "a", np.nan], dtype="str"),
                 "E": Categorical([np.nan, np.nan, "a", np.nan]),
                 "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"),
                 "G": to_timedelta(["1 days", "nan", "nan", "nan"]),
@@ -692,7 +674,6 @@ def test_mode_dropna(self, dropna, expected):
         expected = DataFrame(expected)
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_mode_sortwarning(self, using_infer_string):
         # Check for the warning that is raised when the mode
         # results cannot be sorted
@@ -700,7 +681,12 @@ def test_mode_sortwarning(self, using_infer_string):
         df = DataFrame({"A": [np.nan, np.nan, "a", "a"]})
         expected = DataFrame({"A": ["a", np.nan]})
 
-        warning = None if using_infer_string else UserWarning
+        # TODO(infer_string) avoid this UserWarning for python storage
+        warning = (
+            None
+            if using_infer_string and df.A.dtype.storage == "pyarrow"
+            else UserWarning
+        )
         with tm.assert_produces_warning(warning, match="Unable to sort modes"):
             result = df.mode(dropna=False)
             result = result.sort_values(by="A").reset_index(drop=True)
@@ -1354,11 +1340,8 @@ def test_any_all_extra(self):
         result = df[["C"]].all(axis=None).item()
         assert result is True
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("axis", [0, 1])
-    def test_any_all_object_dtype(
-        self, axis, all_boolean_reductions, skipna, using_infer_string
-    ):
+    def test_any_all_object_dtype(self, axis, all_boolean_reductions, skipna):
         # GH#35450
         df = DataFrame(
             data=[
@@ -1368,13 +1351,8 @@ def test_any_all_object_dtype(
                 [np.nan, np.nan, "5", np.nan],
             ]
         )
-        if using_infer_string:
-            # na in object is True while in string pyarrow numpy it's false
-            val = not axis == 0 and not skipna and all_boolean_reductions == "all"
-        else:
-            val = True
         result = getattr(df, all_boolean_reductions)(axis=axis, skipna=skipna)
-        expected = Series([True, True, val, True])
+        expected = Series([True, True, True, True])
         tm.assert_series_equal(result, expected)
 
     def test_any_datetime(self):
@@ -1939,7 +1917,6 @@ def test_sum_timedelta64_skipna_false():
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="sum doesn't work with arrow strings")
 def test_mixed_frame_with_integer_sum():
     # https://github.com/pandas-dev/pandas/issues/34520
     df = DataFrame([["a", 1]], columns=list("ab"))
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index 4a4f5882b7e85..d28eb227314c7 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -146,7 +146,6 @@ def test_cython_agg_return_dict():
     tm.assert_series_equal(ts, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_cython_fail_agg():
     dr = bdate_range("1/1/2000", periods=50)
     ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 6393468fb8ccd..0d13db79835ba 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -8,12 +8,9 @@
 
 from pandas._config import using_string_dtype
 
-from pandas.compat import HAS_PYARROW
 from pandas.errors import SpecificationError
 import pandas.util._test_decorators as td
 
-from pandas.core.dtypes.common import is_string_dtype
-
 import pandas as pd
 from pandas import (
     Categorical,
@@ -1408,23 +1405,15 @@ def g(group):
     tm.assert_series_equal(result, expected)
 
 
-# TODO harmonize error messages
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
 @pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
-def test_set_group_name(df, grouper, using_infer_string):
+def test_set_group_name(df, grouper):
     def f(group):
         assert group.name is not None
         return group
 
     def freduce(group):
         assert group.name is not None
-        if using_infer_string and grouper == "A" and is_string_dtype(group.dtype):
-            with pytest.raises(TypeError, match="does not support"):
-                group.sum()
-        else:
-            return group.sum()
+        return group.sum()
 
     def freducex(x):
         return freduce(x)
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
index f28967fa81ddb..38b4abfddda1e 100644
--- a/pandas/tests/groupby/test_raises.py
+++ b/pandas/tests/groupby/test_raises.py
@@ -208,7 +208,6 @@ def func(x):
         getattr(gb, how)(func)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("how", ["agg", "transform"])
 @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
 def test_groupby_raises_string_np(
@@ -225,7 +224,8 @@ def test_groupby_raises_string_np(
         np.sum: (None, ""),
         np.mean: (
             TypeError,
-            "Could not convert string .* to numeric",
+            "Could not convert string .* to numeric|"
+            "Cannot perform reduction 'mean' with string dtype",
         ),
     }[groupby_func_np]
     _call_and_check(klass, msg, how, gb, groupby_func_np, ())
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 053dda0629571..5b8fa96291c9f 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -6,7 +6,6 @@
 from pandas._config import using_string_dtype
 
 from pandas._libs import lib
-from pandas.compat import HAS_PYARROW
 
 from pandas.core.dtypes.common import ensure_platform_int
 
@@ -385,10 +384,7 @@ def test_transform_nuisance_raises(df, using_infer_string):
     gbc = grouped["B"]
     msg = "Could not convert"
     if using_infer_string:
-        if df.columns.dtype.storage == "pyarrow":
-            msg = "with dtype str does not support operation 'mean'"
-        else:
-            msg = "Cannot perform reduction 'mean' with string dtype"
+        msg = "Cannot perform reduction 'mean' with string dtype"
     with pytest.raises(TypeError, match=msg):
         gbc.transform(lambda x: np.mean(x))
 
@@ -483,10 +479,7 @@ def test_groupby_transform_with_int(using_infer_string):
     )
     msg = "Could not convert"
     if using_infer_string:
-        if HAS_PYARROW:
-            msg = "with dtype str does not support operation 'mean'"
-        else:
-            msg = "Cannot perform reduction 'mean' with string dtype"
+        msg = "Cannot perform reduction 'mean' with string dtype"
     with np.errstate(all="ignore"):
         with pytest.raises(TypeError, match=msg):
             df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py
index 7bbb902e14a36..86ce60b1fc12b 100644
--- a/pandas/tests/series/test_reductions.py
+++ b/pandas/tests/series/test_reductions.py
@@ -1,10 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
-
 import pandas as pd
 from pandas import Series
 import pandas._testing as tm
@@ -166,60 +162,49 @@ def test_validate_stat_keepdims():
         np.sum(ser, keepdims=True)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
-def test_mean_with_convertible_string_raises(using_infer_string):
+def test_mean_with_convertible_string_raises():
     # GH#44008
     ser = Series(["1", "2"])
-    if using_infer_string:
-        msg = "does not support"
-        with pytest.raises(TypeError, match=msg):
-            ser.sum()
-    else:
-        assert ser.sum() == "12"
-    msg = "Could not convert string '12' to numeric|does not support"
+    assert ser.sum() == "12"
+
+    msg = "Could not convert string '12' to numeric|does not support|Cannot perform"
     with pytest.raises(TypeError, match=msg):
         ser.mean()
 
     df = ser.to_frame()
-    msg = r"Could not convert \['12'\] to numeric|does not support"
+    msg = r"Could not convert \['12'\] to numeric|does not support|Cannot perform"
     with pytest.raises(TypeError, match=msg):
         df.mean()
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
 def test_mean_dont_convert_j_to_complex():
     # GH#36703
     df = pd.DataFrame([{"db": "J", "numeric": 123}])
-    msg = r"Could not convert \['J'\] to numeric|does not support"
+    msg = r"Could not convert \['J'\] to numeric|does not support|Cannot perform"
     with pytest.raises(TypeError, match=msg):
         df.mean()
 
     with pytest.raises(TypeError, match=msg):
         df.agg("mean")
 
-    msg = "Could not convert string 'J' to numeric|does not support"
+    msg = "Could not convert string 'J' to numeric|does not support|Cannot perform"
     with pytest.raises(TypeError, match=msg):
         df["db"].mean()
-    msg = "Could not convert string 'J' to numeric|ufunc 'divide'"
+    msg = "Could not convert string 'J' to numeric|ufunc 'divide'|Cannot perform"
     with pytest.raises(TypeError, match=msg):
         np.mean(df["db"].astype("string").array)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
 def test_median_with_convertible_string_raises():
     # GH#34671 this _could_ return a string "2", but definitely not float 2.0
-    msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support"
+    msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support|Cannot perform"
     ser = Series(["1", "2", "3"])
     with pytest.raises(TypeError, match=msg):
         ser.median()
 
-    msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support"
+    msg = (
+        r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support|Cannot perform"
+    )
     df = ser.to_frame()
     with pytest.raises(TypeError, match=msg):
         df.median()