Skip to content

Commit

Permalink
CLN: Enforce deprecation of groupby with as_index=False excluding out…
Browse files Browse the repository at this point in the history
…-of-axis groupings (#57741)

* CLN: Enforce deprecation of groupby with as_index=False excluding out-of-axis groupings

* type annotation fixup
  • Loading branch information
rhshadrach authored Mar 7, 2024
1 parent b89b2f1 commit fe2ef37
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 72 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ Removal of prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude groupings that are not columns of the input :class:`DataFrame` (e.g. a function or :class:`Series` grouper) from the result (:issue:`49519`)
- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
- All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
Expand Down
62 changes: 35 additions & 27 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,34 +1286,43 @@ def _set_result_index_ordered(
return result

@final
def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
def _insert_inaxis_grouper(
self, result: Series | DataFrame, qs: npt.NDArray[np.float64] | None = None
) -> DataFrame:
if isinstance(result, Series):
result = result.to_frame()

n_groupings = len(self._grouper.groupings)

if qs is not None:
result.insert(
0, f"level_{n_groupings}", np.tile(qs, len(result) // len(qs))
)

# zip in reverse so we can always insert at loc 0
columns = result.columns
for name, lev, in_axis in zip(
reversed(self._grouper.names),
reversed(self._grouper.get_group_levels()),
reversed([grp.in_axis for grp in self._grouper.groupings]),
for level, (name, lev, in_axis) in enumerate(
zip(
reversed(self._grouper.names),
reversed(self._grouper.get_group_levels()),
reversed([grp.in_axis for grp in self._grouper.groupings]),
)
):
if name is None:
# Behave the same as .reset_index() when a level is unnamed
name = (
"index"
if n_groupings == 1 and qs is None
else f"level_{n_groupings - level - 1}"
)

# GH #28549
# When using .apply(-), name will be in columns already
if name not in columns:
if in_axis:
if name not in result.columns:
# if in_axis:
if qs is None:
result.insert(0, name, lev)
else:
msg = (
"A grouping was used that is not in the columns of the "
"DataFrame and so was excluded from the result. This grouping "
"will be included in a future version of pandas. Add the "
"grouping as a column of the DataFrame to silence this warning."
)
warnings.warn(
message=msg,
category=FutureWarning,
stacklevel=find_stack_level(),
)
result.insert(0, name, Index(np.repeat(lev, len(qs))))

return result

Expand All @@ -1340,18 +1349,17 @@ def _wrap_aggregated_output(
if not self.as_index:
# `not self.as_index` is only relevant for DataFrameGroupBy,
# enforced in __init__
result = self._insert_inaxis_grouper(result)
result = self._insert_inaxis_grouper(result, qs=qs)
result = result._consolidate()
index = Index(range(self._grouper.ngroups))
result.index = RangeIndex(len(result))

else:
index = self._grouper.result_index

if qs is not None:
# We get here with len(qs) != 1 and not self.as_index
# in test_pass_args_kwargs
index = _insert_quantile_level(index, qs)
result.index = index
if qs is not None:
# We get here with len(qs) != 1 and not self.as_index
# in test_pass_args_kwargs
index = _insert_quantile_level(index, qs)
result.index = index

return result

Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1248,18 +1248,15 @@ def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
tsframe.columns = ["A", "B", "A", "C"]
gb = tsframe.groupby(lambda x: x.month, as_index=as_index)

warn = None if as_index else FutureWarning
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(warn, match=msg):
res = gb.agg(np.percentile, 80, axis=0)
res = gb.agg(np.percentile, 80, axis=0)

ex_data = {
1: tsframe[tsframe.index.month == 1].quantile(0.8),
2: tsframe[tsframe.index.month == 2].quantile(0.8),
}
expected = DataFrame(ex_data).T
if not as_index:
# TODO: try to get this more consistent?
expected.insert(0, "index", [1, 2])
expected.index = Index(range(2))

tm.assert_frame_equal(res, expected)
Expand Down
19 changes: 11 additions & 8 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,24 +779,27 @@ def test_as_index():

# function grouper
f = lambda r: df.loc[r, "A"]
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby(["cat", f], as_index=False, observed=True).sum()
result = df.groupby(["cat", f], as_index=False, observed=True).sum()
expected = DataFrame(
{
"cat": Categorical([1, 2], categories=df.cat.cat.categories),
"level_1": [10, 11],
"A": [10, 22],
"B": [101, 205],
},
columns=["cat", "A", "B"],
)
tm.assert_frame_equal(result, expected)

# another not in-axis grouper (conflicting names in index)
s = Series(["a", "b", "b"], name="cat")
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby(["cat", s], as_index=False, observed=True).sum()
result = df.groupby(["cat", s], as_index=False, observed=True).sum()
expected = DataFrame(
{
"cat": ["a", "b"],
"A": [10, 22],
"B": [101, 205],
},
)
tm.assert_frame_equal(result, expected)

# is original index dropped?
Expand Down Expand Up @@ -1852,7 +1855,7 @@ def test_category_order_reducer(
request, as_index, sort, observed, reduction_func, index_kind, ordered
):
# GH#48749
if reduction_func == "corrwith" and not as_index:
if reduction_func == "corrwith" and not as_index and index_kind != "single":
msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
request.applymarker(pytest.mark.xfail(reason=msg))
elif index_kind != "range" and not as_index:
Expand Down
24 changes: 10 additions & 14 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,26 +103,22 @@ def f(x, q=None, axis=0):
# DataFrame
for as_index in [True, False]:
df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index)
warn = None if as_index else FutureWarning
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(warn, match=msg):
agg_result = df_grouped.agg(np.percentile, 80, axis=0)
with tm.assert_produces_warning(warn, match=msg):
apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
with tm.assert_produces_warning(warn, match=msg):
expected = df_grouped.quantile(0.8)
agg_result = df_grouped.agg(np.percentile, 80, axis=0)
apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
expected = df_grouped.quantile(0.8)
tm.assert_frame_equal(apply_result, expected, check_names=False)
tm.assert_frame_equal(agg_result, expected)

apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8])
with tm.assert_produces_warning(warn, match=msg):
expected_seq = df_grouped.quantile([0.4, 0.8])
expected_seq = df_grouped.quantile([0.4, 0.8])
if not as_index:
# apply treats the op as a transform; .quantile knows it's a reduction
apply_result = apply_result.reset_index()
apply_result["level_0"] = [1, 1, 2, 2]
tm.assert_frame_equal(apply_result, expected_seq, check_names=False)

with tm.assert_produces_warning(warn, match=msg):
agg_result = df_grouped.agg(f, q=80)
with tm.assert_produces_warning(warn, match=msg):
apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
agg_result = df_grouped.agg(f, q=80)
apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
tm.assert_frame_equal(agg_result, expected)
tm.assert_frame_equal(apply_result, expected, check_names=False)

Expand Down
13 changes: 1 addition & 12 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,11 +552,6 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki
expected = expected.set_index(["x", "x2"])
else:
expected = expected.set_index("x")
elif index_kind != "range" and reduction_func != "size":
# size, unlike other methods, has the desired behavior in GH#49519
expected = expected.drop(columns="x")
if index_kind == "multi":
expected = expected.drop(columns="x2")
if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
# expected was computed with a RangeIndex; need to translate to index values
values = expected["y"].values.tolist()
Expand All @@ -572,13 +567,7 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki
if as_index:
expected = expected["size"].rename(None)

if as_index or index_kind == "range" or reduction_func == "size":
warn = None
else:
warn = FutureWarning
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(warn, match=msg):
result = getattr(gb_keepna, reduction_func)(*args)
result = getattr(gb_keepna, reduction_func)(*args)

# size will return a Series, others are DataFrame
tm.assert_equal(result, expected)
Expand Down
8 changes: 2 additions & 6 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1125,12 +1125,8 @@ def test_grouping_by_key_is_in_axis():
assert not gb._grouper.groupings[0].in_axis
assert gb._grouper.groupings[1].in_axis

# Currently only in-axis groupings are including in the result when as_index=False;
# This is likely to change in the future.
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = gb.sum()
expected = DataFrame({"b": [1, 2], "c": [7, 5]})
result = gb.sum()
expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]})
tm.assert_frame_equal(result, expected)


Expand Down

0 comments on commit fe2ef37

Please sign in to comment.