diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 46c27849356b5..64220f1d3d5b4 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError from pandas.core.dtypes.common import is_integer_dtype @@ -296,12 +294,11 @@ def aggfun_1(ser): assert len(result) == 0 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) def func(ser): - if ser.dtype == object: + if ser.dtype == object or ser.dtype == "string": raise TypeError("Test error message") return ser.sum() @@ -1117,7 +1114,6 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_aggregate_mixed_types(): # GH 16916 df = DataFrame( @@ -1129,7 +1125,7 @@ def test_aggregate_mixed_types(): expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), - columns=Index(["X", "Y", "Z"], dtype="object"), + columns=Index(["X", "Y", "Z"]), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index b937e7dcc8136..a706ea795a0e2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -92,7 +90,6 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_agg_nothing_to_agg(): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} @@ -108,7 +105,9 @@ def test_cython_agg_nothing_to_agg(): result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) expected = DataFrame( - [], index=frame["a"].sort_values().drop_duplicates(), columns=[] + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype="str"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 835cad0d13078..ce78b58e5d8f4 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas as pd @@ -308,7 +306,6 @@ def test_series_agg_multikey(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_series_agg_multi_pure_python(): data = DataFrame( { @@ -358,7 +355,8 @@ def test_series_agg_multi_pure_python(): ) def bad(x): - assert len(x.values.base) > 0 + if isinstance(x.values, np.ndarray): + assert len(x.values.base) > 0 return "foo" result = data.groupby(["A", "B"]).agg(bad) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 4a8ad65200caa..28cb25b515ed2 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -158,11 +156,10 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - msg = "dtype 'object' does not support operation 'quantile'" + msg = "dtype '(object|str)' does not support operation 'quantile'" with pytest.raises(TypeError, match=msg): df.groupby("key").quantile() diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 91200f53e36bd..2dc89bc75746f 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -76,6 +76,8 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) +# TODO(infer_string) in case the column is object dtype, it should preserve that dtype +# for the result's index @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_size_strings(any_string_dtype): # GH#55627 diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1e86b5401ee09..6d84dae1d25d8 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -322,8 +320,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_observed(observed): +def test_observed(request, using_infer_string, observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -331,6 +328,10 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 + if using_infer_string and not observed: + # TODO(infer_string) this fails with filling the string column with 0 + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3305b48a4dcdc..702bbfef2be3b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1281,7 +1281,6 @@ def test_groupby_two_group_keys_all_nan(): assert result == {} -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] @@ -1290,7 +1289,7 @@ def test_groupby_2d_malformed(): d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean(numeric_only=True) res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) - tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object)) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -2345,7 +2344,6 @@ def test_groupby_all_nan_groups_drop(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_empty_multi_column(as_index, numeric_only): # GH 15106 & GH 41998 @@ -2354,7 +2352,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): result = gb.sum(numeric_only=numeric_only) if as_index: index = MultiIndex([[], []], [[], []], names=["A", "B"]) - columns = ["C"] if not numeric_only else [] + columns = ["C"] if not numeric_only else Index([], dtype="str") else: index = RangeIndex(0) columns = ["A", "B", "C"] if not numeric_only else ["A", "B"] @@ -2362,7 +2360,6 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_aggregation_non_numeric_dtype(): # GH #43108 df = DataFrame( @@ -2373,7 +2370,7 @@ def test_groupby_aggregation_non_numeric_dtype(): { "v": [[1, 1], [10, 20]], }, - index=Index(["M", "W"], dtype="object", name="MW"), + index=Index(["M", "W"], name="MW"), ) gb = df.groupby(by=["MW"]) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d42aa06d6bbfe..060a8b7fd3824 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -99,7 +97,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, idx, outputs", [ @@ -126,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", dropna=dropna).sum() - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a")) tm.assert_frame_equal(grouped, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 6bb2eaf89b5d7..366eb59ee226a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas as pd @@ -807,7 +805,6 @@ def test_groupby_empty(self): expected = ["name"] assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_level_index_value_all_na(self): # issue 20519 df = DataFrame( @@ -817,7 +814,7 @@ def test_groupby_level_index_value_all_na(self): expected = DataFrame( data=[], index=MultiIndex( - levels=[Index(["x"], dtype="object"), Index([], dtype="float64")], + levels=[Index(["x"], dtype="str"), Index([], dtype="float64")], codes=[[], []], names=["A", "B"], ), @@ -981,12 +978,13 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[]) + exp = DataFrame( + index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str") + ) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 1044c83e3e56b..ee59a93695bcf 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -1,7 +1,4 @@ import numpy as np -import pytest - -from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -11,7 +8,6 @@ import pandas._testing as tm -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_pipe(): # Test the pipe method of DataFrameGroupBy. # Issue #17871 @@ -39,7 +35,7 @@ def square(srs): # NDFrame.pipe methods result = df.groupby("A").pipe(f).pipe(square) - index = Index(["bar", "foo"], dtype="object", name="A") + index = Index(["bar", "foo"], name="A") expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index a6ea1502103c5..51c7eab2bfa82 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.core.dtypes.common import pandas_dtype @@ -470,8 +468,7 @@ def test_max_min_non_numeric(): assert "ss" in result -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_max_min_object_multiple_columns(): +def test_max_min_object_multiple_columns(using_infer_string): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with # DataFrame._reduce @@ -484,7 +481,7 @@ def test_max_min_object_multiple_columns(): } ) df._consolidate_inplace() # should already be consolidate, but double-check - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 3 if using_infer_string else 2 gb = df.groupby("A") diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index ee4973cbf18af..a7712d9dc6586 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -76,6 +76,8 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: + # TODO(infer_string) resample sum introduces 0's + # https://github.com/pandas-dev/pandas/issues/60229 @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_timegrouper(self): # GH 4161 diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 5b8fa96291c9f..022d3d51ded4e 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.core.dtypes.common import ensure_platform_int @@ -1034,20 +1032,19 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_dtype(): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) - expected1 = Series(["+1.35"], name="val", dtype="object") + expected1 = Series(["+1.35"], name="val") tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) - expected2 = Series(["+(1.35)"], name="val", dtype="object") + expected2 = Series(["+(1.35)"], name="val") tm.assert_series_equal(result, expected2) df["val"] = df["val"].astype(object)