Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjust groupby tests for string option #56414

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def test_wrap_agg_out(three_group):
grouped = three_group.groupby(["A", "B"])

def func(ser):
if ser.dtype == object:
if ser.dtype in [object, pd.StringDtype("pyarrow_numpy")]:
raise TypeError("Test error message")
return ser.sum()

Expand Down Expand Up @@ -1089,18 +1089,19 @@ def test_lambda_named_agg(func):
tm.assert_frame_equal(result, expected)


def test_aggregate_mixed_types():
def test_aggregate_mixed_types(using_infer_string):
# GH 16916
df = DataFrame(
data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc")
)
df["grouping"] = ["group 1", "group 1", 2]
result = df.groupby("grouping").aggregate(lambda x: x.tolist())
expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]]
dtype = "string[pyarrow_numpy]" if using_infer_string else object
expected = DataFrame(
expected_data,
index=Index([2, "group 1"], dtype="object", name="grouping"),
columns=Index(["X", "Y", "Z"], dtype="object"),
columns=Index(["X", "Y", "Z"], dtype=dtype),
)
tm.assert_frame_equal(result, expected)

Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_cython_agg_boolean():
tm.assert_series_equal(result, expected)


def test_cython_agg_nothing_to_agg():
def test_cython_agg_nothing_to_agg(using_infer_string):
frame = DataFrame(
{"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
)
Expand All @@ -104,8 +104,12 @@ def test_cython_agg_nothing_to_agg():
)

result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
dtype = "string[pyarrow_numpy]" if using_infer_string else object

expected = DataFrame(
[], index=frame["a"].sort_values().drop_duplicates(), columns=[]
[],
index=frame["a"].sort_values().drop_duplicates(),
columns=Index([], dtype=dtype),
)
tm.assert_frame_equal(result, expected)

Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,8 @@ def test_series_agg_multi_pure_python():
)

def bad(x):
assert len(x.values.base) > 0
if x.dtype == object:
assert len(x.values.base) > 0
return "foo"

result = data.groupby(["A", "B"]).agg(bad)
Expand Down
12 changes: 11 additions & 1 deletion pandas/tests/groupby/methods/test_nth.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from decimal import Decimal

import numpy as np
import pytest

Expand Down Expand Up @@ -680,7 +682,15 @@ def test_first_multi_key_groupby_categorical():
@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
# GH29645
expected = Series(["y"])
if nulls_fixture is not pd.NA and (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the `nulls_fixture is not pd.NA` check is unnecessary

nulls_fixture is pd.NaT
or isinstance(nulls_fixture, Decimal)
and Decimal.is_nan(nulls_fixture)
Comment on lines +687 to +688
Copy link
Member

@rhshadrach rhshadrach Dec 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add parentheses around the last two conditions here for clarity? (I think it's the same behavior.)

):
dtype = object
else:
dtype = None
expected = Series(["y"], dtype=dtype)
data = Series(
[nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
index=[0, 0, 0, 0, 0],
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,9 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
with pytest.raises(
TypeError, match="cannot be performed against 'object' dtypes|No matching"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the full error message for the No matching case?

):
df.groupby("key").quantile()


Expand Down Expand Up @@ -248,7 +250,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(
TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
TypeError,
match="'quantile' cannot be performed against 'object' dtypes!|No matching",
):
df.groupby("a").quantile(q, numeric_only=numeric_only)

Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def test_apply_index_date_object(using_infer_string):
1.40750,
1.40649,
]
dtype = "string[pyarrow_numpy]" if using_infer_string else object
dtype = "string[pyarrow_numpy]" if using_infer_string else None
exp_idx = Index(
["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date"
)
Expand Down Expand Up @@ -1243,9 +1243,7 @@ def test_apply_dropna_with_indexed_same(dropna):
[
[
False,
DataFrame(
[[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object)
),
DataFrame([[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None])),
],
[
True,
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,14 +312,18 @@ def test_apply(ordered):
tm.assert_series_equal(result, expected)


def test_observed(observed):
def test_observed(observed, using_infer_string, request):
# multiple groupers, don't re-expand the output space
# of the grouper
# gh-14942 (implement)
# gh-10132 (back-compat)
# gh-8138 (back-compat)
# gh-8869

if not observed and using_infer_string:
mark = pytest.mark.xfail(reason="fill_value=0 invalid for string dtype")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this test be fixed in the future? If so, okay to xfail - otherwise I'd prefer to test for the exception with pytest.raises. xfails have a perf impact on running the tests.

request.applymarker(mark)

cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
Expand Down
21 changes: 15 additions & 6 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2315,14 +2315,18 @@ def test_groupby_all_nan_groups_drop():


@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_empty_multi_column(as_index, numeric_only):
def test_groupby_empty_multi_column(as_index, numeric_only, using_infer_string):
# GH 15106 & GH 41998
df = DataFrame(data=[], columns=["A", "B", "C"])
gb = df.groupby(["A", "B"], as_index=as_index)
result = gb.sum(numeric_only=numeric_only)
if as_index:
index = MultiIndex([[], []], [[], []], names=["A", "B"])
columns = ["C"] if not numeric_only else []
if using_infer_string:
dtype = "string[pyarrow_numpy]"
else:
dtype = object
columns = ["C"] if not numeric_only else Index([], dtype=dtype)
else:
index = RangeIndex(0)
columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
Expand All @@ -2340,7 +2344,7 @@ def test_groupby_aggregation_non_numeric_dtype():
{
"v": [[1, 1], [10, 20]],
},
index=Index(["M", "W"], dtype="object", name="MW"),
index=Index(["M", "W"], name="MW"),
)

gb = df.groupby(by=["MW"])
Expand Down Expand Up @@ -2487,11 +2491,16 @@ def test_groupby_none_in_first_mi_level():
tm.assert_series_equal(result, expected)


def test_groupby_none_column_name():
def test_groupby_none_column_name(using_infer_string):
# GH#47348
df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]})
result = df.groupby(by=[None]).sum()
expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None))
if using_infer_string:
result = df.groupby(by=[np.nan]).sum()
name = np.nan
else:
result = df.groupby(by=[None]).sum()
name = None
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=name))
tm.assert_frame_equal(result, expected)


Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
),
],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
def test_groupby_dropna_normal_index_dataframe(
dropna, idx, outputs, using_infer_string
):
# GH 3729
df_list = [
["B", 12, 12, 12],
Expand All @@ -123,7 +125,9 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
grouped = df.groupby("a", dropna=dropna).sum()

expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))
dtype = "string[pyarrow_numpy]" if using_infer_string else object

expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype=dtype, name="a"))

tm.assert_frame_equal(grouped, expected)

Expand Down
11 changes: 7 additions & 4 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,7 +789,7 @@ def test_groupby_empty(self):
expected = ["name"]
assert result == expected

def test_groupby_level_index_value_all_na(self):
def test_groupby_level_index_value_all_na(self, using_infer_string):
# issue 20519
df = DataFrame(
[["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
Expand All @@ -805,7 +805,7 @@ def test_groupby_level_index_value_all_na(self):
columns=["C"],
dtype="int64",
)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected, check_index_type=not using_infer_string)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you just change dtype="object" on L840?


def test_groupby_multiindex_level_empty(self):
# https://github.com/pandas-dev/pandas/issues/31670
Expand Down Expand Up @@ -933,11 +933,14 @@ def test_groupby_with_empty(self):
grouped = series.groupby(grouper)
assert next(iter(grouped), None) is None

def test_groupby_with_single_column(self):
def test_groupby_with_single_column(self, using_infer_string):
df = DataFrame({"a": list("abssbab")})
tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
# GH 13530
exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[])
dtype = "string[pyarrow_numpy]" if using_infer_string else object
exp = DataFrame(
index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype=dtype)
)
tm.assert_frame_equal(df.groupby("a").count(), exp)
tm.assert_frame_equal(df.groupby("a").sum(), exp)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def square(srs):
# NDFrame.pipe methods
result = df.groupby("A").pipe(f).pipe(square)

index = Index(["bar", "foo"], dtype="object", name="A")
index = Index(["bar", "foo"], name="A")
expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)

tm.assert_series_equal(expected, result)
Expand Down
38 changes: 26 additions & 12 deletions pandas/tests/groupby/test_raises.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""):

@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_string(
how, by, groupby_series, groupby_func, df_with_string_col
how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string
):
df = df_with_string_col
args = get_groupby_method_args(groupby_func, df)
Expand All @@ -119,30 +119,41 @@ def test_groupby_raises_string(
assert not hasattr(gb, "corrwith")
return

if using_infer_string:
import pyarrow as pa

errs = (TypeError, pa.lib.ArrowNotImplementedError)
else:
errs = TypeError

klass, msg = {
"all": (None, ""),
"any": (None, ""),
"bfill": (None, ""),
"corrwith": (TypeError, "Could not convert"),
"corrwith": (errs, "Could not convert|has no kernel"),
"count": (None, ""),
"cumcount": (None, ""),
"cummax": (
(NotImplementedError, TypeError),
"(function|cummax) is not (implemented|supported) for (this|object) dtype",
"(function|cummax) is not (implemented|supported) "
"for (this|object|string) dtype",
),
"cummin": (
(NotImplementedError, TypeError),
"(function|cummin) is not (implemented|supported) for (this|object) dtype",
"(function|cummin) is not (implemented|supported) "
"for (this|object|string) dtype",
),
"cumprod": (
(NotImplementedError, TypeError),
"(function|cumprod) is not (implemented|supported) for (this|object) dtype",
"(function|cumprod) is not (implemented|supported) "
"for (this|object|string) dtype",
),
"cumsum": (
(NotImplementedError, TypeError),
"(function|cumsum) is not (implemented|supported) for (this|object) dtype",
"(function|cumsum) is not (implemented|supported) "
"for (this|object|string) dtype",
),
"diff": (TypeError, "unsupported operand type"),
"diff": (errs, "unsupported operand type|has no kernel"),
"ffill": (None, ""),
"fillna": (None, ""),
"first": (None, ""),
Expand All @@ -152,21 +163,24 @@ def test_groupby_raises_string(
"max": (None, ""),
"mean": (
TypeError,
re.escape("agg function failed [how->mean,dtype->object]"),
re.escape("agg function failed [how->mean,dtype->"),
),
"median": (
TypeError,
re.escape("agg function failed [how->median,dtype->object]"),
re.escape("agg function failed [how->median,dtype->"),
),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (TypeError, "unsupported operand type"),
"pct_change": (errs, "unsupported operand type|has no kernel"),
"prod": (
TypeError,
re.escape("agg function failed [how->prod,dtype->object]"),
re.escape("agg function failed [how->prod,dtype->"),
),
"quantile": (
TypeError,
"cannot be performed against 'object' dtypes!|No matching signature",
),
"quantile": (TypeError, "cannot be performed against 'object' dtypes!"),
"rank": (None, ""),
"sem": (ValueError, "could not convert string to float"),
"shift": (None, ""),
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ def test_max_min_non_numeric():
assert "ss" in result


def test_max_min_object_multiple_columns():
def test_max_min_object_multiple_columns(using_infer_string):
# GH#41111 case where the aggregation is valid for some columns but not
# others; we split object blocks column-wise, consistent with
# DataFrame._reduce
Expand All @@ -481,7 +481,10 @@ def test_max_min_object_multiple_columns():
}
)
df._consolidate_inplace() # should already be consolidate, but double-check
assert len(df._mgr.blocks) == 2
if using_infer_string:
assert len(df._mgr.blocks) == 3
else:
assert len(df._mgr.blocks) == 2

gb = df.groupby("A")

Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):


class TestGroupBy:
def test_groupby_with_timegrouper(self):
def test_groupby_with_timegrouper(self, using_infer_string):
# GH 4161
# TimeGrouper requires a sorted index
# also verifies that the resultant index has the correct name
Expand Down Expand Up @@ -112,7 +112,9 @@ def test_groupby_with_timegrouper(self):
index=exp_dti,
)
# Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
expected = expected.astype({"Buyer": object})
dtype = "string[pyarrow_numpy]" if using_infer_string else object

expected = expected.astype({"Buyer": dtype})
expected.iloc[0, 0] = "CarlCarlCarl"
expected.iloc[6, 0] = "CarlCarl"
expected.iloc[18, 0] = "Joe"
Expand Down
Loading
Loading