Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Some Grouper and Grouping attributes #56149

Merged
merged 4 commits into from
Nov 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ Other Deprecations
- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
- Deprecated the :class:`.BaseGrouper` attributes ``group_keys_seq`` and ``reconstructed_codes``; these will be removed in a future version of pandas (:issue:`56148`)
- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
- Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,9 +819,9 @@ def value_counts(
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

# multi-index components
codes = self.grouper.reconstructed_codes
codes = self.grouper._reconstructed_codes
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
levels = [ping._group_index for ping in self.grouper.groupings] + [lev]

if dropna:
mask = codes[-1] != -1
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2820,7 +2820,7 @@ def _value_counts(
and not grouping._observed
for grouping in groupings
):
levels_list = [ping.result_index for ping in groupings]
levels_list = [ping._result_index for ping in groupings]
multi_index = MultiIndex.from_product(
levels_list, names=[ping.name for ping in groupings]
)
Expand Down Expand Up @@ -5573,7 +5573,7 @@ def _reindex_output(
):
return output

levels_list = [ping.group_index for ping in groupings]
levels_list = [ping._group_index for ping in groupings]
names = self.grouper.names
if qs is not None:
# error: Argument 1 to "append" of "list" has incompatible type
Expand Down Expand Up @@ -5795,7 +5795,7 @@ def _idxmax_idxmin(
ping._passed_categorical for ping in self.grouper.groupings
):
expected_len = np.prod(
[len(ping.group_index) for ping in self.grouper.groupings]
[len(ping._group_index) for ping in self.grouper.groupings]
)
if len(self.grouper.groupings) == 1:
result_len = len(self.grouper.groupings[0].grouping_vector.unique())
Expand Down
53 changes: 43 additions & 10 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,6 @@ class Grouping:
"""

_codes: npt.NDArray[np.signedinteger] | None = None
_group_index: Index | None = None
_all_grouper: Categorical | None
_orig_cats: Index | None
_index: Index
Expand Down Expand Up @@ -679,7 +678,7 @@ def _ilevel(self) -> int | None:

@property
def ngroups(self) -> int:
return len(self.group_index)
return len(self._group_index)

@cache_readonly
def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
Expand All @@ -695,34 +694,58 @@ def codes(self) -> npt.NDArray[np.signedinteger]:
return self._codes_and_uniques[0]

@cache_readonly
def group_arraylike(self) -> ArrayLike:
def _group_arraylike(self) -> ArrayLike:
"""
Analogous to result_index, but holding an ArrayLike to ensure
we can retain ExtensionDtypes.
"""
if self._all_grouper is not None:
# retain dtype for categories, including unobserved ones
return self.result_index._values
return self._result_index._values

elif self._passed_categorical:
return self.group_index._values
return self._group_index._values

return self._codes_and_uniques[1]

@property
def group_arraylike(self) -> ArrayLike:
    """
    Deprecated public accessor for ``_group_arraylike``.

    Analogous to result_index, but holding an ArrayLike to ensure
    we can retain ExtensionDtypes.  Emits a ``FutureWarning``; internal
    code should use the private ``_group_arraylike`` instead.
    """
    # GH#56148: warn on external access, then delegate to the private impl.
    depr_msg = (
        "group_arraylike is deprecated and will be removed in a future "
        "version of pandas"
    )
    warnings.warn(depr_msg, category=FutureWarning, stacklevel=find_stack_level())
    return self._group_arraylike

@cache_readonly
def result_index(self) -> Index:
def _result_index(self) -> Index:
# result_index retains dtype for categories, including unobserved ones,
# which group_index does not
if self._all_grouper is not None:
group_idx = self.group_index
group_idx = self._group_index
assert isinstance(group_idx, CategoricalIndex)
cats = self._orig_cats
# set_categories is dynamically added
return group_idx.set_categories(cats) # type: ignore[attr-defined]
return self.group_index
return self._group_index

@property
def result_index(self) -> Index:
    """
    Deprecated public accessor for ``_result_index``.

    Emits a ``FutureWarning``; internal code should use the private
    ``_result_index`` instead.
    """
    # GH#56148: warn on external access, then delegate to the private impl.
    depr_msg = (
        "result_index is deprecated and will be removed in a future "
        "version of pandas"
    )
    warnings.warn(depr_msg, category=FutureWarning, stacklevel=find_stack_level())
    return self._result_index

@cache_readonly
def group_index(self) -> Index:
def _group_index(self) -> Index:
codes, uniques = self._codes_and_uniques
if not self._dropna and self._passed_categorical:
assert isinstance(uniques, Categorical)
Expand All @@ -744,6 +767,16 @@ def group_index(self) -> Index:
)
return Index._with_infer(uniques, name=self.name)

@property
def group_index(self) -> Index:
    """
    Deprecated public accessor for ``_group_index``.

    Emits a ``FutureWarning``; internal code should use the private
    ``_group_index`` instead.
    """
    # GH#56148: warn on external access, then delegate to the private impl.
    depr_msg = (
        "group_index is deprecated and will be removed in a future "
        "version of pandas"
    )
    warnings.warn(depr_msg, category=FutureWarning, stacklevel=find_stack_level())
    return self._group_index

@cache_readonly
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
uniques: ArrayLike
Expand Down Expand Up @@ -809,7 +842,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:

@cache_readonly
def groups(self) -> dict[Hashable, np.ndarray]:
cats = Categorical.from_codes(self.codes, self.group_index, validate=False)
cats = Categorical.from_codes(self.codes, self._group_index, validate=False)
return self._index.groupby(cats)


Expand Down
50 changes: 36 additions & 14 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
Generic,
final,
)
import warnings

import numpy as np

Expand All @@ -32,6 +33,7 @@
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -616,7 +618,7 @@ def get_iterator(
for each group
"""
splitter = self._get_splitter(data, axis=axis)
keys = self.group_keys_seq
keys = self._group_keys_seq
yield from zip(keys, splitter)

@final
Expand All @@ -638,7 +640,7 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter:

@final
@cache_readonly
def group_keys_seq(self):
def _group_keys_seq(self):
if len(self.groupings) == 1:
return self.levels[0]
else:
Expand All @@ -647,14 +649,24 @@ def group_keys_seq(self):
# provide "flattened" iterator for multi-group setting
return get_flattened_list(ids, ngroups, self.levels, self.codes)

@property
def group_keys_seq(self):
    """
    Deprecated public accessor for ``_group_keys_seq``.

    Emits a ``FutureWarning``; internal code should use the private
    ``_group_keys_seq`` instead.
    """
    # GH#56148: warn on external access, then delegate to the private impl.
    depr_msg = (
        "group_keys_seq is deprecated and will be removed in a future "
        "version of pandas"
    )
    warnings.warn(depr_msg, category=FutureWarning, stacklevel=find_stack_level())
    return self._group_keys_seq

@cache_readonly
def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
"""dict {group name -> group indices}"""
if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex):
# This shows unused categories in indices GH#38642
return self.groupings[0].indices
codes_list = [ping.codes for ping in self.groupings]
keys = [ping.group_index for ping in self.groupings]
keys = [ping._group_index for ping in self.groupings]
return get_indexer_dict(codes_list, keys)

@final
Expand Down Expand Up @@ -691,7 +703,7 @@ def codes(self) -> list[npt.NDArray[np.signedinteger]]:

@property
def levels(self) -> list[Index]:
return [ping.group_index for ping in self.groupings]
return [ping._group_index for ping in self.groupings]

@property
def names(self) -> list[Hashable]:
Expand Down Expand Up @@ -766,26 +778,36 @@ def _get_compressed_codes(
# FIXME: compress_group_index's second return value is int64, not intp

ping = self.groupings[0]
return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
return ping.codes, np.arange(len(ping._group_index), dtype=np.intp)

@final
@cache_readonly
def ngroups(self) -> int:
return len(self.result_index)

@property
def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
def _reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
codes = self.codes
ids, obs_ids, _ = self.group_info
return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)

@property
def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
    """
    Deprecated public accessor for ``_reconstructed_codes``.

    Emits a ``FutureWarning``; internal code should use the private
    ``_reconstructed_codes`` instead.
    """
    # GH#56148: warn on external access, then delegate to the private impl.
    depr_msg = (
        "reconstructed_codes is deprecated and will be removed in a future "
        "version of pandas"
    )
    warnings.warn(depr_msg, category=FutureWarning, stacklevel=find_stack_level())
    return self._reconstructed_codes

@cache_readonly
def result_index(self) -> Index:
if len(self.groupings) == 1:
return self.groupings[0].result_index.rename(self.names[0])
return self.groupings[0]._result_index.rename(self.names[0])

codes = self.reconstructed_codes
levels = [ping.result_index for ping in self.groupings]
codes = self._reconstructed_codes
levels = [ping._result_index for ping in self.groupings]
return MultiIndex(
levels=levels, codes=codes, verify_integrity=False, names=self.names
)
Expand All @@ -795,12 +817,12 @@ def get_group_levels(self) -> list[ArrayLike]:
# Note: only called from _insert_inaxis_grouper, which
# is only called for BaseGrouper, never for BinGrouper
if len(self.groupings) == 1:
return [self.groupings[0].group_arraylike]
return [self.groupings[0]._group_arraylike]

name_list = []
for ping, codes in zip(self.groupings, self.reconstructed_codes):
for ping, codes in zip(self.groupings, self._reconstructed_codes):
codes = ensure_platform_int(codes)
levels = ping.group_arraylike.take(codes)
levels = ping._group_arraylike.take(codes)

name_list.append(levels)

Expand Down Expand Up @@ -907,7 +929,7 @@ def apply_groupwise(
) -> tuple[list, bool]:
mutated = False
splitter = self._get_splitter(data, axis=axis)
group_keys = self.group_keys_seq
group_keys = self._group_keys_seq
result_values = []

# This calls DataSplitter.__iter__
Expand Down Expand Up @@ -1087,7 +1109,7 @@ def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
)

@cache_readonly
def reconstructed_codes(self) -> list[np.ndarray]:
def _reconstructed_codes(self) -> list[np.ndarray]:
# get unique result indices, and prepend 0 as groupby starts from the first
return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]]

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3303,3 +3303,13 @@ def test_groupby_ffill_with_duplicated_index():
result = df.groupby(level=0).ffill()
expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
tm.assert_frame_equal(result, expected, check_dtype=False)


@pytest.mark.parametrize("attr", ["group_keys_seq", "reconstructed_codes"])
def test_depr_grouper_attrs(attr):
    # GH#56148: accessing these BaseGrouper attributes should warn.
    frame = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
    grouped = frame.groupby("a")
    warn_msg = f"{attr} is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=warn_msg):
        getattr(grouped.grouper, attr)
10 changes: 10 additions & 0 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,3 +1211,13 @@ def test_grouper_groups():
msg = "Grouper.indexer is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
grper.indexer


@pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"])
def test_depr_grouping_attrs(attr):
    # GH#56148: accessing these Grouping attributes should warn.
    frame = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
    grouped = frame.groupby("a")
    warn_msg = f"{attr} is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=warn_msg):
        getattr(grouped.grouper.groupings[0], attr)
4 changes: 3 additions & 1 deletion pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
gb = df.groupby(tdg)

# check we're testing the case we're interested in
assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
msg = "group_keys_seq is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)

return gb

Expand Down
Loading