DEPR: Enforce deprecation of include_groups in groupby.apply #60566

Merged · 4 commits · Dec 17, 2024
Changes from 3 commits
4 changes: 2 additions & 2 deletions doc/source/user_guide/cookbook.rst
@@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
df

# List the size of the animals with the highest weight.
df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False)
df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()])

`Using get_group
<https://stackoverflow.com/questions/14734533/how-to-access-pandas-groupby-dataframe-by-key>`__
@@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"])


expected_df = gb.apply(GrowUp, include_groups=False)
expected_df = gb.apply(GrowUp)
expected_df

`Expanding apply
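To make the cookbook change concrete, here is a minimal sketch of the updated call, using a toy animal/size/weight frame rather than the cookbook's exact data: with the keyword gone, the sub-DataFrame handed to the callable simply never contains the ``animal`` grouping column.

```python
import pandas as pd

# Hypothetical data standing in for the cookbook's animal frame.
df = pd.DataFrame(
    {
        "animal": ["cat", "dog", "cat", "dog"],
        "size": ["S", "S", "M", "L"],
        "weight": [8, 10, 11, 20],
    }
)

# The grouping column "animal" is excluded from each sub-DataFrame, so the
# callable only sees "size" and "weight"; no include_groups keyword is needed.
sizes = df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()])
print(sizes)
# animal
# cat    M
# dog    L
# dtype: object
```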
8 changes: 4 additions & 4 deletions doc/source/user_guide/groupby.rst
@@ -1074,7 +1074,7 @@ missing values with the ``ffill()`` method.
).set_index("date")
df_re

df_re.groupby("group").resample("1D", include_groups=False).ffill()
df_re.groupby("group").resample("1D").ffill()

.. _groupby.filter:

@@ -1252,13 +1252,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare

.. ipython:: python

df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False)
df.groupby("A", group_keys=True).apply(lambda x: x)

with

.. ipython:: python

df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False)
df.groupby("A", group_keys=False).apply(lambda x: x)


Numba accelerated routines
@@ -1742,7 +1742,7 @@ column index name will be used as the name of the inserted column:
result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
return pd.Series(result, name="metrics")

result = df.groupby("a").apply(compute_metrics, include_groups=False)
result = df.groupby("a").apply(compute_metrics)

result

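As a side note on the ``group_keys`` comparison above, here is a small sketch (with a made-up two-column frame, not the user guide's) of what actually differs once the grouping column is no longer passed through:

```python
import pandas as pd

# Hypothetical frame; the user guide's df has more columns than this.
df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})

# group_keys=True prepends the group key as an extra index level on the
# concatenated result ...
with_keys = df.groupby("A", group_keys=True).apply(lambda x: x)

# ... while group_keys=False leaves the original index untouched.
without_keys = df.groupby("A", group_keys=False).apply(lambda x: x)

print(with_keys.index.nlevels)     # 2 ("A" plus the original index)
print(without_keys.index.nlevels)  # 1

# In both cases the sub-frames passed to the lambda contain only "B":
# the grouping column "A" is excluded now that include_groups is gone.
```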
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -554,6 +554,7 @@ Other Removals
- Removed the ``method`` keyword in ``ExtensionArray.fillna``, implement ``ExtensionArray._pad_or_backfill`` instead (:issue:`53621`)
- Removed the attribute ``dtypes`` from :class:`.DataFrameGroupBy` (:issue:`51997`)
- Enforced deprecation of ``argmin``, ``argmax``, ``idxmin``, and ``idxmax`` returning a result when ``skipna=False`` and an NA value is encountered or all values are NA values; these operations will now raise in such cases (:issue:`33941`, :issue:`51276`)
- Removed the ability to specify ``include_groups=True`` in :meth:`.DataFrameGroupBy.apply` and :meth:`.Resampler.apply` (:issue:`7155`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.performance:
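For users hit by this removal, the migration path matches what the old deprecation message suggested: explicitly select the grouping column(s) after ``groupby`` if ``func`` still needs them. A hedged sketch with a made-up frame:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})  # hypothetical data

# include_groups=True is gone; to keep "a" visible inside func, select the
# columns (including the grouping column) explicitly after groupby.
result = df.groupby("a")[["a", "b"]].apply(lambda x: x["a"].iloc[0] + x["b"].sum())
print(result)
# a
# 1    8
# 2    7
# dtype: int64
```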
89 changes: 23 additions & 66 deletions pandas/core/groupby/groupby.py
@@ -1393,7 +1393,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
# -----------------------------------------------------------------
# apply/agg/transform

def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT:
def apply(self, func, *args, include_groups: bool = False, **kwargs) -> NDFrameT:
"""
Apply function ``func`` group-wise and combine the results together.

@@ -1419,18 +1419,17 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT:
*args : tuple
Optional positional arguments to pass to ``func``.

include_groups : bool, default True
include_groups : bool, default False
When True, will attempt to apply ``func`` to the groupings in
the case that they are columns of the DataFrame. If this raises a
TypeError, the result will be computed with the groupings excluded.
When False, the groupings will be excluded when applying ``func``.

.. versionadded:: 2.2.0

.. deprecated:: 2.2.0
.. versionchanged:: 3.0.0

Setting include_groups to True is deprecated. Only the value
False will be allowed in a future version of pandas.
The default changed from True to False, and True is no longer allowed.

**kwargs : dict
Optional keyword arguments to pass to ``func``.
@@ -1520,7 +1519,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT:
each group together into a Series, including setting the index as
appropriate:

>>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False)
>>> g1.apply(lambda x: x.C.max() - x.B.min())
A
a 5
b 2
@@ -1529,11 +1528,13 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT:
Example 4: The function passed to ``apply`` returns ``None`` for one of the
groups. This group is filtered from the result:

>>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x, include_groups=False)
>>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x)
B C
0 1 4
1 2 6
"""
if include_groups:
raise ValueError("include_groups=True is no longer allowed.")
if isinstance(func, str):
if hasattr(self, func):
res = getattr(self, func)
@@ -1560,33 +1561,7 @@ def f(g):
else:
f = func

if not include_groups:
return self._python_apply_general(f, self._obj_with_exclusions)

try:
result = self._python_apply_general(f, self._selected_obj)
if (
not isinstance(self.obj, Series)
and self._selection is None
and self._selected_obj.shape != self._obj_with_exclusions.shape
):
warnings.warn(
message=_apply_groupings_depr.format(type(self).__name__, "apply"),
category=DeprecationWarning,
stacklevel=find_stack_level(),
)
except TypeError:
# gh-20949
# try again, with .apply acting as a filtering
# operation, by excluding the grouping column
# This would normally not be triggered
# except if the udf is trying an operation that
# fails on *some* columns, e.g. a numeric operation
# on a string grouper column

return self._python_apply_general(f, self._obj_with_exclusions)

return result
return self._python_apply_general(f, self._obj_with_exclusions)
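A quick sketch of the enforced behaviour above (toy data, for illustration only): any truthy ``include_groups`` now raises immediately instead of emitting the old ``DeprecationWarning``, and the default path always operates on ``_obj_with_exclusions``, so ``func`` never sees the grouping column.

```python
import pandas as pd

df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})  # hypothetical frame

# Default (include_groups=False): func only sees column "B".
df.groupby("A").apply(lambda x: x.sum())

# Passing True now raises rather than warning.
try:
    df.groupby("A").apply(lambda x: x.sum(), include_groups=True)
except ValueError as err:
    print(err)  # include_groups=True is no longer allowed.
```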

@final
def _python_apply_general(
@@ -3424,7 +3399,9 @@ def describe(
return result

@final
def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler:
def resample(
self, rule, *args, include_groups: bool = False, **kwargs
) -> Resampler:
"""
Provide resampling when using a TimeGrouper.

@@ -3449,10 +3426,9 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp

.. versionadded:: 2.2.0

.. deprecated:: 2.2.0
.. versionchanged:: 3.0

Setting include_groups to True is deprecated. Only the value
False will be allowed in a future version of pandas.
The default was changed to False, and True is no longer allowed.

**kwargs
Possible arguments are `how`, `fill_method`, `limit`, `kind` and
@@ -3485,7 +3461,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp
Downsample the DataFrame into 3 minute bins and sum the values of
the timestamps falling into a bin.

>>> df.groupby("a").resample("3min", include_groups=False).sum()
>>> df.groupby("a").resample("3min").sum()
b
a
0 2000-01-01 00:00:00 2
@@ -3494,7 +3470,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp

Upsample the series into 30 second bins.

>>> df.groupby("a").resample("30s", include_groups=False).sum()
>>> df.groupby("a").resample("30s").sum()
b
a
0 2000-01-01 00:00:00 1
@@ -3508,7 +3484,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp

Resample by month. Values are assigned to the month of the period.

>>> df.groupby("a").resample("ME", include_groups=False).sum()
>>> df.groupby("a").resample("ME").sum()
b
a
0 2000-01-31 3
@@ -3517,11 +3493,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp
Downsample the series into 3 minute bins as above, but close the right
side of the bin interval.

>>> (
... df.groupby("a")
... .resample("3min", closed="right", include_groups=False)
... .sum()
... )
>>> (df.groupby("a").resample("3min", closed="right").sum())
b
a
0 1999-12-31 23:57:00 1
@@ -3532,11 +3504,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp
the bin interval, but label each bin using the right edge instead of
the left.

>>> (
... df.groupby("a")
... .resample("3min", closed="right", label="right", include_groups=False)
... .sum()
... )
>>> (df.groupby("a").resample("3min", closed="right", label="right").sum())
b
a
0 2000-01-01 00:00:00 1
@@ -3545,11 +3513,10 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp
"""
from pandas.core.resample import get_resampler_for_grouping

# mypy flags that include_groups could be specified via `*args` or `**kwargs`
# GH#54961 would resolve.
return get_resampler_for_grouping( # type: ignore[misc]
self, rule, *args, include_groups=include_groups, **kwargs
)
if include_groups:
raise ValueError("include_groups=True is no longer allowed.")

return get_resampler_for_grouping(self, rule, *args, **kwargs)
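The same enforcement applies to the grouped ``resample`` path, sketched here with a small datetime-indexed frame in the spirit of the docstring example (not the exact data):

```python
import pandas as pd

idx = pd.date_range("2000-01-01", periods=4, freq="min")
df = pd.DataFrame({"a": [0, 0, 0, 5], "b": [1, 1, 2, 2]}, index=idx)

# Works: include_groups defaults to False, so only "b" is resampled.
df.groupby("a").resample("3min").sum()

# Raises: True is no longer a valid value.
try:
    df.groupby("a").resample("3min", include_groups=True).sum()
except ValueError as err:
    print(err)  # include_groups=True is no longer allowed.
```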

@final
def rolling(
@@ -5561,13 +5528,3 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde
mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None])

return mi


# GH#7155
_apply_groupings_depr = (
"{}.{} operated on the grouping columns. This behavior is deprecated, "
"and in a future version of pandas the grouping columns will be excluded "
"from the operation. Either pass `include_groups=False` to exclude the "
"groupings or explicitly select the grouping columns after groupby to silence "
"this warning."
)