Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST (string dtype): resolve all easy xfails in pandas/tests/groupby #60314

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import SpecificationError

from pandas.core.dtypes.common import is_integer_dtype
Expand Down Expand Up @@ -296,12 +294,11 @@ def aggfun_1(ser):
assert len(result) == 0


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_wrap_agg_out(three_group):
grouped = three_group.groupby(["A", "B"])

def func(ser):
if ser.dtype == object:
if ser.dtype == object or ser.dtype == "string":
raise TypeError("Test error message")
return ser.sum()

Expand Down Expand Up @@ -1117,7 +1114,6 @@ def test_lambda_named_agg(func):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_aggregate_mixed_types():
# GH 16916
df = DataFrame(
Expand All @@ -1129,7 +1125,7 @@ def test_aggregate_mixed_types():
expected = DataFrame(
expected_data,
index=Index([2, "group 1"], dtype="object", name="grouping"),
columns=Index(["X", "Y", "Z"], dtype="object"),
columns=Index(["X", "Y", "Z"]),
)
tm.assert_frame_equal(result, expected)

Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -92,7 +90,6 @@ def test_cython_agg_boolean():
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_cython_agg_nothing_to_agg():
frame = DataFrame(
{"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
Expand All @@ -108,7 +105,9 @@ def test_cython_agg_nothing_to_agg():

result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
expected = DataFrame(
[], index=frame["a"].sort_values().drop_duplicates(), columns=[]
[],
index=frame["a"].sort_values().drop_duplicates(),
columns=Index([], dtype="str"),
)
tm.assert_frame_equal(result, expected)

Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import SpecificationError

import pandas as pd
Expand Down Expand Up @@ -308,7 +306,6 @@ def test_series_agg_multikey():
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_series_agg_multi_pure_python():
data = DataFrame(
{
Expand Down Expand Up @@ -358,7 +355,8 @@ def test_series_agg_multi_pure_python():
)

def bad(x):
assert len(x.values.base) > 0
if isinstance(x.values, np.ndarray):
assert len(x.values.base) > 0
return "foo"

result = data.groupby(["A", "B"]).agg(bad)
Comment on lines 357 to 362
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I am not entirely sure about the purpose of this test. Essentially, it compares passing `lambda x: "foo"` to `agg` against passing a plain function that returns `"foo"`; the only difference is this assert inside the function (but I don't think that should alter how it is executed?).

This was introduced in commit 71e9046.

Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -158,11 +156,10 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

msg = "dtype 'object' does not support operation 'quantile'"
msg = "dtype '(object|str)' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("key").quantile()

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/groupby/methods/test_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ def test_size_series_masked_type_returns_Int64(dtype):
tm.assert_series_equal(result, expected)


# TODO(infer_string) in case the column is object dtype, it should preserve that dtype
# for the result's index
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_size_strings(any_string_dtype):
# GH#55627
Expand Down
9 changes: 5 additions & 4 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
Categorical,
Expand Down Expand Up @@ -322,15 +320,18 @@ def test_apply(ordered):
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_observed(observed):
def test_observed(request, using_infer_string, observed):
# multiple groupers, don't re-expand the output space
# of the grouper
# gh-14942 (implement)
# gh-10132 (back-compat)
# gh-8138 (back-compat)
# gh-8869

if using_infer_string and not observed:
# TODO(infer_string) this fails with filling the string column with 0
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))

cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
Expand Down
9 changes: 3 additions & 6 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1281,7 +1281,6 @@ def test_groupby_two_group_keys_all_nan():
assert result == {}


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_2d_malformed():
d = DataFrame(index=range(2))
d["group"] = ["g1", "g2"]
Expand All @@ -1290,7 +1289,7 @@ def test_groupby_2d_malformed():
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean(numeric_only=True)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
tm.assert_numpy_array_equal(tmp.values, res_values)


Expand Down Expand Up @@ -2345,7 +2344,6 @@ def test_groupby_all_nan_groups_drop():
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_empty_multi_column(as_index, numeric_only):
# GH 15106 & GH 41998
Expand All @@ -2354,15 +2352,14 @@ def test_groupby_empty_multi_column(as_index, numeric_only):
result = gb.sum(numeric_only=numeric_only)
if as_index:
index = MultiIndex([[], []], [[], []], names=["A", "B"])
columns = ["C"] if not numeric_only else []
columns = ["C"] if not numeric_only else Index([], dtype="str")
else:
index = RangeIndex(0)
columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
expected = DataFrame([], columns=columns, index=index)
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_aggregation_non_numeric_dtype():
# GH #43108
df = DataFrame(
Expand All @@ -2373,7 +2370,7 @@ def test_groupby_aggregation_non_numeric_dtype():
{
"v": [[1, 1], [10, 20]],
},
index=Index(["M", "W"], dtype="object", name="MW"),
index=Index(["M", "W"], name="MW"),
)

gb = df.groupby(by=["MW"])
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat.pyarrow import pa_version_under10p1

from pandas.core.dtypes.missing import na_value_for_dtype
Expand Down Expand Up @@ -99,7 +97,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
tm.assert_frame_equal(grouped, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"dropna, idx, outputs",
[
Expand All @@ -126,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
grouped = df.groupby("a", dropna=dropna).sum()

expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))
expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a"))

tm.assert_frame_equal(grouped, expected)

Expand Down
10 changes: 4 additions & 6 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import SpecificationError

import pandas as pd
Expand Down Expand Up @@ -807,7 +805,6 @@ def test_groupby_empty(self):
expected = ["name"]
assert result == expected

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_level_index_value_all_na(self):
# issue 20519
df = DataFrame(
Expand All @@ -817,7 +814,7 @@ def test_groupby_level_index_value_all_na(self):
expected = DataFrame(
data=[],
index=MultiIndex(
levels=[Index(["x"], dtype="object"), Index([], dtype="float64")],
levels=[Index(["x"], dtype="str"), Index([], dtype="float64")],
codes=[[], []],
names=["A", "B"],
),
Expand Down Expand Up @@ -981,12 +978,13 @@ def test_groupby_with_empty(self):
grouped = series.groupby(grouper)
assert next(iter(grouped), None) is None

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_with_single_column(self):
df = DataFrame({"a": list("abssbab")})
tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
# GH 13530
exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[])
exp = DataFrame(
index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str")
)
tm.assert_frame_equal(df.groupby("a").count(), exp)
tm.assert_frame_equal(df.groupby("a").sum(), exp)

Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/groupby/test_pipe.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
Expand All @@ -11,7 +8,6 @@
import pandas._testing as tm


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_pipe():
# Test the pipe method of DataFrameGroupBy.
# Issue #17871
Expand Down Expand Up @@ -39,7 +35,7 @@ def square(srs):
# NDFrame.pipe methods
result = df.groupby("A").pipe(f).pipe(square)

index = Index(["bar", "foo"], dtype="object", name="A")
index = Index(["bar", "foo"], name="A")
expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)

tm.assert_series_equal(expected, result)
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.tslibs import iNaT

from pandas.core.dtypes.common import pandas_dtype
Expand Down Expand Up @@ -470,8 +468,7 @@ def test_max_min_non_numeric():
assert "ss" in result


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_max_min_object_multiple_columns():
def test_max_min_object_multiple_columns(using_infer_string):
# GH#41111 case where the aggregation is valid for some columns but not
# others; we split object blocks column-wise, consistent with
# DataFrame._reduce
Expand All @@ -484,7 +481,7 @@ def test_max_min_object_multiple_columns():
}
)
df._consolidate_inplace() # should already be consolidate, but double-check
assert len(df._mgr.blocks) == 2
# Parenthesise the conditional expression: unparenthesised, this parses as
# `(len(...) == 3) if using_infer_string else 2`, which asserts the truthy
# constant 2 (i.e. checks nothing) whenever using_infer_string is False.
assert len(df._mgr.blocks) == (3 if using_infer_string else 2)

gb = df.groupby("A")

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):


class TestGroupBy:
# TODO(infer_string) resample sum introduces 0's
# https://github.com/pandas-dev/pandas/issues/60229
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_with_timegrouper(self):
# GH 4161
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs import lib

from pandas.core.dtypes.common import ensure_platform_int
Expand Down Expand Up @@ -1034,20 +1032,19 @@ def test_groupby_transform_with_datetimes(func, values):
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_transform_dtype():
# GH 22243
df = DataFrame({"a": [1], "val": [1.35]})

result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
expected1 = Series(["+1.35"], name="val", dtype="object")
expected1 = Series(["+1.35"], name="val")
tm.assert_series_equal(result, expected1)

result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
tm.assert_series_equal(result, expected1)

result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})"))
expected2 = Series(["+(1.35)"], name="val", dtype="object")
expected2 = Series(["+(1.35)"], name="val")
tm.assert_series_equal(result, expected2)

df["val"] = df["val"].astype(object)
Expand Down
Loading