From 2fdb16b347fc34f78213868a8a973447ac79ab2d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 31 Oct 2024 11:16:04 +0100 Subject: [PATCH] String dtype: implement sum reduction (#59853) --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/array_algos/masked_reductions.py | 4 ++ pandas/core/arrays/arrow/array.py | 32 ++++++++++ pandas/core/arrays/string_.py | 18 +++++- pandas/core/arrays/string_arrow.py | 6 +- pandas/tests/apply/test_frame_apply.py | 10 --- pandas/tests/apply/test_invalid_arg.py | 39 ++++++------ pandas/tests/arrays/string_/test_string.py | 2 - pandas/tests/extension/test_arrow.py | 25 ++------ pandas/tests/extension/test_string.py | 2 +- pandas/tests/frame/test_reductions.py | 61 ++++++------------- pandas/tests/groupby/aggregate/test_cython.py | 1 - pandas/tests/groupby/test_groupby.py | 15 +---- pandas/tests/groupby/test_raises.py | 4 +- .../tests/groupby/transform/test_transform.py | 11 +--- pandas/tests/series/test_reductions.py | 39 ++++-------- 16 files changed, 120 insertions(+), 151 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 01c2ed3821d7a..64486c5a3e3ba 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,7 +32,7 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index f2a32fbe2b0e5..bdf88f2e9fa07 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -62,6 +62,10 @@ def _reductions( ): return libmissing.NA + if values.dtype == np.dtype(object): + # object dtype does not support `where` without passing an initial + values = values[~mask] + return func(values, axis=axis, **kwargs) return func(values, where=~mask, axis=axis, **kwargs) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 619e7b3ccfb4f..53f703b701217 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -68,6 +68,7 @@ unpack_tuple_and_ellipses, validate_indices, ) +from pandas.core.nanops import check_below_min_count from pandas.core.strings.base import BaseStringArrayMethods from pandas.io._util import _arrow_dtype_mapping @@ -1705,6 +1706,37 @@ def pyarrow_meth(data, skip_nulls, **kwargs): denominator = pc.sqrt_checked(pc.count(self._pa_array)) return pc.divide_checked(numerator, denominator) + elif name == "sum" and ( + pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type) + ): + + def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] + mask = pc.is_null(data) if data.null_count > 0 else None + if skip_nulls: + if min_count > 0 and check_below_min_count( + (len(data),), + None if mask is None else mask.to_numpy(), + min_count, + ): + return pa.scalar(None, type=data.type) + if data.null_count > 0: + # binary_join returns null if there is any null -> + # have to filter out any nulls + data = data.filter(pc.invert(mask)) + else: + if mask is not None or check_below_min_count( + (len(data),), None, min_count + ): + return pa.scalar(None, type=data.type) + + if pa.types.is_large_string(data.type): + # binary_join only supports string, not large_string + data = data.cast(pa.string()) + data_list = pa.ListArray.from_arrays( + [0, len(data)], data.combine_chunks() + )[0] + return pc.binary_join(data_list, "") + else: pyarrow_name = { "median": "quantile", diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f20c4c8625475..4af26858cb131 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -812,8 +812,8 @@ def _reduce( else: return nanops.nanall(self._ndarray, skipna=skipna) - if name in ["min", "max"]: - result = getattr(self, name)(skipna=skipna, axis=axis) + if name in ["min", "max", "sum"]: + result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs) if keepdims: return self._from_sequence([result], dtype=self.dtype) return result @@ -840,6 +840,20 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: ) return self._wrap_reduction_result(axis, result) + def sum( + self, + *, + axis: AxisInt | None = None, + skipna: bool = True, + min_count: int = 0, + **kwargs, + ) -> Scalar: + nv.validate_sum((), kwargs) + result = masked_reductions.sum( + values=self._ndarray, mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9e9893ecbbd97..cde39c7f4dc6a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -435,7 +435,11 @@ def _reduce( return result.astype(np.bool_) return result - result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if name in ("min", "max", "sum", "argmin", "argmax"): + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + else: + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + if name in ("argmin", "argmax") and isinstance(result, pa.Array): return self._convert_int_result(result) elif isinstance(result, pa.Array): diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index f0ab01e9e960e..ed7eae4502a64 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -1218,7 +1214,6 @@ def test_agg_with_name_as_column_name(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_agg_multiple_mixed(): # GH 20909 mdf = DataFrame( @@ -1247,9 +1242,6 @@ def test_agg_multiple_mixed(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_agg_multiple_mixed_raises(): # GH 20909 mdf = DataFrame( @@ -1347,7 +1339,6 @@ def test_named_agg_reduce_axis1_raises(float_frame): float_frame.agg(row1=(name1, "sum"), row2=(name2, "max"), axis=axis) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nuiscance_columns(): # GH 15015 df = DataFrame( @@ -1524,7 +1515,6 @@ def test_apply_datetime_tz_issue(engine, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) def test_mixed_column_raises(df, method, using_infer_string): diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index ba970e328ae40..e19c21f81b3e1 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -12,9 +12,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW from pandas.errors import SpecificationError from pandas import ( @@ -212,10 +209,6 @@ def transform(row): data.apply(transform, axis=1) -# we should raise a proper TypeError instead of propagating the pyarrow error -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) @pytest.mark.parametrize( "df, func, expected", tm.get_cython_table_params( @@ -225,21 +218,25 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - import pyarrow as pa + if df.dtypes.iloc[0].storage == "pyarrow": + import pyarrow as pa - expected = (expected, pa.lib.ArrowNotImplementedError) + # TODO(infer_string) + # should raise a proper TypeError instead of propagating the pyarrow error - msg = "can't multiply sequence by non-int of type 'str'|has no kernel" + expected = (expected, pa.lib.ArrowNotImplementedError) + else: + expected = (expected, NotImplementedError) + + msg = ( + "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"): df.agg(func, axis=axis) -# we should raise a proper TypeError instead of propagating the pyarrow error -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) @pytest.mark.parametrize( "series, func, expected", chain( @@ -263,11 +260,15 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" if using_infer_string: - import pyarrow as pa - - expected = (expected, pa.lib.ArrowNotImplementedError) - - msg = msg + "|does not support|has no kernel" + if series.dtype.storage == "pyarrow": + import pyarrow as pa + + # TODO(infer_string) + # should raise a proper TypeError instead of propagating the pyarrow error + expected = (expected, pa.lib.ArrowNotImplementedError) + else: + expected = (expected, NotImplementedError) + msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform" warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 33708be497f31..7856cf390127e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -444,14 +444,12 @@ def test_astype_float(dtype, any_float_dtype): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna, dtype): arr = pd.Series(["a", "b", "c"], dtype=dtype) result = arr.sum(skipna=skipna) assert result == "abc" -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce_missing(skipna, dtype): arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) result = arr.sum(skipna=skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f56094dfd47ca..f0ff11e5fa3f7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -461,10 +461,11 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: pass else: return False + elif pa.types.is_binary(pa_dtype) and op_name == "sum": + return False elif ( pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ) and op_name in [ - "sum", "mean", "median", "prod", @@ -563,6 +564,8 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): cmp_dtype = "float64[pyarrow]" elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type): cmp_dtype = "uint64[pyarrow]" + elif op_name == "sum" and pa.types.is_string(pa_type): + cmp_dtype = arr.dtype else: cmp_dtype = { "i": "int64[pyarrow]", @@ -594,26 +597,6 @@ def test_median_not_approximate(self, typ): result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median() assert result == 1.5 - def test_in_numeric_groupby(self, data_for_grouping): - dtype = data_for_grouping.dtype - if is_string_dtype(dtype): - df = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping, - "C": [1, 1, 1, 1, 1, 1, 1, 1], - } - ) - - expected = pd.Index(["C"]) - msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}") - with pytest.raises(TypeError, match=msg): - df.groupby("A").sum() - result = df.groupby("A").sum(numeric_only=True).columns - tm.assert_index_equal(result, expected) - else: - super().test_in_numeric_groupby(data_for_grouping) - def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 509ae653e4793..57710d9caad4d 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -188,7 +188,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( - op_name in ["min", "max"] + op_name in ["min", "max", "sum"] or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 1d667d35db253..05bb603f5c462 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -226,7 +226,6 @@ def float_frame_with_na(): class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "opname", @@ -246,17 +245,11 @@ class TestDataFrameAnalytics: pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) - def test_stat_op_api_float_string_frame( - self, float_string_frame, axis, opname, using_infer_string - ): - if ( - (opname in ("sum", "min", "max") and axis == 0) - or opname - in ( - "count", - "nunique", - ) - ) and not (using_infer_string and opname == "sum"): + def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): + if (opname in ("sum", "min", "max") and axis == 0) or opname in ( + "count", + "nunique", + ): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -283,10 +276,11 @@ def test_stat_op_api_float_string_frame( msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": msg = re.compile( - r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + r"Cannot convert \[.*\] to numeric|does not support|Cannot perform", + flags=re.S, ) if not isinstance(msg, re.Pattern): - msg = msg + "|does not support" + msg = msg + "|does not support|Cannot perform reduction" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -432,7 +426,6 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis): expected[expected.isna()] = None tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 @@ -449,26 +442,16 @@ def test_mixed_ops(self, op): "could not convert", "can't multiply sequence by non-int", "does not support", + "Cannot perform", ] ) with pytest.raises(TypeError, match=msg): getattr(df, op)() with pd.option_context("use_bottleneck", False): - msg = "|".join( - [ - "Could not convert", - "could not convert", - "can't multiply sequence by non-int", - "does not support", - ] - ) with pytest.raises(TypeError, match=msg): getattr(df, op)() - @pytest.mark.xfail( - using_string_dtype(), reason="sum doesn't work for arrow strings" - ) def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -608,7 +591,6 @@ def test_sem(self, datetime_frame): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, expected", [ @@ -630,7 +612,7 @@ def test_sem(self, datetime_frame): "A": [12], "B": [10.0], "C": [np.nan], - "D": np.array([np.nan], dtype=object), + "D": Series([np.nan], dtype="str"), "E": Categorical([np.nan], categories=["a"]), "F": DatetimeIndex([pd.NaT], dtype="M8[ns]"), "G": to_timedelta([pd.NaT]), @@ -672,7 +654,7 @@ def test_mode_dropna(self, dropna, expected): "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), + "D": Series([np.nan, np.nan, "a", np.nan], dtype="str"), "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), @@ -692,7 +674,6 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted @@ -700,7 +681,12 @@ def test_mode_sortwarning(self, using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - warning = None if using_infer_string else UserWarning + # TODO(infer_string) avoid this UserWarning for python storage + warning = ( + None + if using_infer_string and df.A.dtype.storage == "pyarrow" + else UserWarning + ) with tm.assert_produces_warning(warning, match="Unable to sort modes"): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) @@ -1354,11 +1340,8 @@ def test_any_all_extra(self): result = df[["C"]].all(axis=None).item() assert result is True - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) - def test_any_all_object_dtype( - self, axis, all_boolean_reductions, skipna, using_infer_string - ): + def test_any_all_object_dtype(self, axis, all_boolean_reductions, skipna): # GH#35450 df = DataFrame( data=[ @@ -1368,13 +1351,8 @@ def test_any_all_object_dtype( [np.nan, np.nan, "5", np.nan], ] ) - if using_infer_string: - # na in object is True while in string pyarrow numpy it's false - val = not axis == 0 and not skipna and all_boolean_reductions == "all" - else: - val = True result = getattr(df, all_boolean_reductions)(axis=axis, skipna=skipna) - expected = Series([True, True, val, True]) + expected = Series([True, True, True, True]) tm.assert_series_equal(result, expected) def test_any_datetime(self): @@ -1939,7 +1917,6 @@ def test_sum_timedelta64_skipna_false(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="sum doesn't work with arrow strings") def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 4a4f5882b7e85..d28eb227314c7 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -146,7 +146,6 @@ def test_cython_agg_return_dict(): tm.assert_series_equal(ts, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6393468fb8ccd..0d13db79835ba 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -8,12 +8,9 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW from pandas.errors import SpecificationError import pandas.util._test_decorators as td -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -1408,23 +1405,15 @@ def g(group): tm.assert_series_equal(result, expected) -# TODO harmonize error messages -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper, using_infer_string): +def test_set_group_name(df, grouper): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): - with pytest.raises(TypeError, match="does not support"): - group.sum() - else: - return group.sum() + return group.sum() def freducex(x): return freduce(x) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f28967fa81ddb..38b4abfddda1e 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -208,7 +208,6 @@ def func(x): getattr(gb, how)(func) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( @@ -225,7 +224,8 @@ def test_groupby_raises_string_np( np.sum: (None, ""), np.mean: ( TypeError, - "Could not convert string .* to numeric", + "Could not convert string .* to numeric|" + "Cannot perform reduction 'mean' with string dtype", ), }[groupby_func_np] _call_and_check(klass, msg, how, gb, groupby_func_np, ()) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 053dda0629571..5b8fa96291c9f 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -6,7 +6,6 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ensure_platform_int @@ -385,10 +384,7 @@ def test_transform_nuisance_raises(df, using_infer_string): gbc = grouped["B"] msg = "Could not convert" if using_infer_string: - if df.columns.dtype.storage == "pyarrow": - msg = "with dtype str does not support operation 'mean'" - else: - msg = "Cannot perform reduction 'mean' with string dtype" + msg = "Cannot perform reduction 'mean' with string dtype" with pytest.raises(TypeError, match=msg): gbc.transform(lambda x: np.mean(x)) @@ -483,10 +479,7 @@ def test_groupby_transform_with_int(using_infer_string): ) msg = "Could not convert" if using_infer_string: - if HAS_PYARROW: - msg = "with dtype str does not support operation 'mean'" - else: - msg = "Cannot perform reduction 'mean' with string dtype" + msg = "Cannot perform reduction 'mean' with string dtype" with np.errstate(all="ignore"): with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 7bbb902e14a36..86ce60b1fc12b 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import Series import pandas._testing as tm @@ -166,60 +162,49 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) -def test_mean_with_convertible_string_raises(using_infer_string): +def test_mean_with_convertible_string_raises(): # GH#44008 ser = Series(["1", "2"]) - if using_infer_string: - msg = "does not support" - with pytest.raises(TypeError, match=msg): - ser.sum() - else: - assert ser.sum() == "12" - msg = "Could not convert string '12' to numeric|does not support" + assert ser.sum() == "12" + + msg = "Could not convert string '12' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): ser.mean() df = ser.to_frame() - msg = r"Could not convert \['12'\] to numeric|does not support" + msg = r"Could not convert \['12'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_mean_dont_convert_j_to_complex(): # GH#36703 df = pd.DataFrame([{"db": "J", "numeric": 123}]) - msg = r"Could not convert \['J'\] to numeric|does not support" + msg = r"Could not convert \['J'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric|does not support" + msg = "Could not convert string 'J' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df["db"].mean() - msg = "Could not convert string 'J' to numeric|ufunc 'divide'" + msg = "Could not convert string 'J' to numeric|ufunc 'divide'|Cannot perform" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_median_with_convertible_string_raises(): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 - msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" + msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support|Cannot perform" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" + msg = ( + r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support|Cannot perform" + ) df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median()