DEPR: Some Grouper and Grouping attributes (#56149)
* DEPR: Some Grouper and Grouping attributes

* GH#

* GH#

* Rework _group_index
rhshadrach authored Nov 26, 2023
1 parent c07563e commit bd27a3e
Showing 8 changed files with 109 additions and 30 deletions.
doc/source/whatsnew/v2.2.0.rst: 2 additions, 0 deletions
@@ -393,6 +393,8 @@ Other Deprecations
- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
- Deprecated the :class:`.BaseGrouper` attributes ``group_keys_seq`` and ``reconstructed_codes``; these will be removed in a future version of pandas (:issue:`56148`)
- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
- Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`)
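A minimal sketch of what the whatsnew entries above mean from user code, assuming a toy DataFrame (the frame and the catch_warnings scaffolding are illustrative, not part of the commit):

import warnings

import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
gb = df.groupby("a")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    gb.grouper.group_keys_seq            # deprecated BaseGrouper attribute
    gb.grouper.groupings[0].group_index  # deprecated Grouping attribute

# Both accesses now emit a FutureWarning naming the public attribute.
assert all(issubclass(w.category, FutureWarning) for w in caught)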
pandas/core/groupby/generic.py: 2 additions, 2 deletions
@@ -819,9 +819,9 @@ def value_counts(
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

# multi-index components
codes = self.grouper.reconstructed_codes
codes = self.grouper._reconstructed_codes
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
levels = [ping._group_index for ping in self.grouper.groupings] + [lev]

if dropna:
mask = codes[-1] != -1
pandas/core/groupby/groupby.py: 3 additions, 3 deletions
@@ -2820,7 +2820,7 @@ def _value_counts(
and not grouping._observed
for grouping in groupings
):
levels_list = [ping.result_index for ping in groupings]
levels_list = [ping._result_index for ping in groupings]
multi_index = MultiIndex.from_product(
levels_list, names=[ping.name for ping in groupings]
)
@@ -5573,7 +5573,7 @@ def _reindex_output(
):
return output

levels_list = [ping.group_index for ping in groupings]
levels_list = [ping._group_index for ping in groupings]
names = self.grouper.names
if qs is not None:
# error: Argument 1 to "append" of "list" has incompatible type
@@ -5795,7 +5795,7 @@ def _idxmax_idxmin(
ping._passed_categorical for ping in self.grouper.groupings
):
expected_len = np.prod(
[len(ping.group_index) for ping in self.grouper.groupings]
[len(ping._group_index) for ping in self.grouper.groupings]
)
if len(self.grouper.groupings) == 1:
result_len = len(self.grouper.groupings[0].grouping_vector.unique())
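The generic.py and groupby.py hunks above only switch pandas' own call sites to the underscore-prefixed names, so ordinary groupby operations keep working without tripping the new warnings. A quick sanity check along those lines (the DataFrame and the assertion are illustrative):

import warnings

import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df.groupby("a")["b"].value_counts()  # ordinary groupby operations
    df.groupby("a").idxmax()

# None of the recorded warnings should name the deprecated attributes.
deprecated = ("group_keys_seq", "reconstructed_codes", "group_index",
              "result_index", "group_arraylike")
assert not any(str(w.message).startswith(deprecated) for w in caught)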
pandas/core/groupby/grouper.py: 43 additions, 10 deletions
@@ -523,7 +523,6 @@ class Grouping:
"""

_codes: npt.NDArray[np.signedinteger] | None = None
_group_index: Index | None = None
_all_grouper: Categorical | None
_orig_cats: Index | None
_index: Index
@@ -679,7 +678,7 @@ def _ilevel(self) -> int | None:

@property
def ngroups(self) -> int:
return len(self.group_index)
return len(self._group_index)

@cache_readonly
def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
@@ -695,34 +694,58 @@ def codes(self) -> npt.NDArray[np.signedinteger]:
return self._codes_and_uniques[0]

@cache_readonly
def group_arraylike(self) -> ArrayLike:
def _group_arraylike(self) -> ArrayLike:
"""
Analogous to result_index, but holding an ArrayLike to ensure
we can retain ExtensionDtypes.
"""
if self._all_grouper is not None:
# retain dtype for categories, including unobserved ones
return self.result_index._values
return self._result_index._values

elif self._passed_categorical:
return self.group_index._values
return self._group_index._values

return self._codes_and_uniques[1]

@property
def group_arraylike(self) -> ArrayLike:
"""
Analogous to result_index, but holding an ArrayLike to ensure
we can retain ExtensionDtypes.
"""
warnings.warn(
"group_arraylike is deprecated and will be removed in a future "
"version of pandas",
category=FutureWarning,
stacklevel=find_stack_level(),
)
return self._group_arraylike

@cache_readonly
def result_index(self) -> Index:
def _result_index(self) -> Index:
# result_index retains dtype for categories, including unobserved ones,
# which group_index does not
if self._all_grouper is not None:
group_idx = self.group_index
group_idx = self._group_index
assert isinstance(group_idx, CategoricalIndex)
cats = self._orig_cats
# set_categories is dynamically added
return group_idx.set_categories(cats) # type: ignore[attr-defined]
return self.group_index
return self._group_index

@property
def result_index(self) -> Index:
warnings.warn(
"result_index is deprecated and will be removed in a future "
"version of pandas",
category=FutureWarning,
stacklevel=find_stack_level(),
)
return self._result_index

@cache_readonly
def group_index(self) -> Index:
def _group_index(self) -> Index:
codes, uniques = self._codes_and_uniques
if not self._dropna and self._passed_categorical:
assert isinstance(uniques, Categorical)
@@ -744,6 +767,16 @@ def group_index(self) -> Index:
)
return Index._with_infer(uniques, name=self.name)

@property
def group_index(self) -> Index:
warnings.warn(
"group_index is deprecated and will be removed in a future "
"version of pandas",
category=FutureWarning,
stacklevel=find_stack_level(),
)
return self._group_index

@cache_readonly
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
uniques: ArrayLike
@@ -809,7 +842,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:

@cache_readonly
def groups(self) -> dict[Hashable, np.ndarray]:
cats = Categorical.from_codes(self.codes, self.group_index, validate=False)
cats = Categorical.from_codes(self.codes, self._group_index, validate=False)
return self._index.groupby(cats)


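The grouper.py hunks above all follow one shim pattern: the cached implementation moves to an underscore-prefixed cache_readonly, and the old public name becomes a plain property that warns and delegates, so the warning fires on every external access while the underlying computation stays cached. A stripped-down sketch of that pattern (the class is illustrative, not the real Grouping):

import warnings

from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level


class ShimDemo:
    @cache_readonly
    def _group_index(self):
        # the real work lives (and is cached) under the private name
        return ["computed", "once"]

    @property
    def group_index(self):
        # public name: warn on every access, then delegate to the private attribute
        warnings.warn(
            "group_index is deprecated and will be removed in a future "
            "version of pandas",
            category=FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._group_index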
pandas/core/groupby/ops.py: 36 additions, 14 deletions
@@ -15,6 +15,7 @@
Generic,
final,
)
import warnings

import numpy as np

@@ -32,6 +33,7 @@
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import (
@@ -616,7 +618,7 @@ def get_iterator(
for each group
"""
splitter = self._get_splitter(data, axis=axis)
keys = self.group_keys_seq
keys = self._group_keys_seq
yield from zip(keys, splitter)

@final
Expand All @@ -638,7 +640,7 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter:

@final
@cache_readonly
def group_keys_seq(self):
def _group_keys_seq(self):
if len(self.groupings) == 1:
return self.levels[0]
else:
@@ -647,14 +649,24 @@ def group_keys_seq(self):
# provide "flattened" iterator for multi-group setting
return get_flattened_list(ids, ngroups, self.levels, self.codes)

@property
def group_keys_seq(self):
warnings.warn(
"group_keys_seq is deprecated and will be removed in a future "
"version of pandas",
category=FutureWarning,
stacklevel=find_stack_level(),
)
return self._group_keys_seq

@cache_readonly
def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
"""dict {group name -> group indices}"""
if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex):
# This shows unused categories in indices GH#38642
return self.groupings[0].indices
codes_list = [ping.codes for ping in self.groupings]
keys = [ping.group_index for ping in self.groupings]
keys = [ping._group_index for ping in self.groupings]
return get_indexer_dict(codes_list, keys)

@final
@@ -691,7 +703,7 @@ def codes(self) -> list[npt.NDArray[np.signedinteger]]:

@property
def levels(self) -> list[Index]:
return [ping.group_index for ping in self.groupings]
return [ping._group_index for ping in self.groupings]

@property
def names(self) -> list[Hashable]:
@@ -766,26 +778,36 @@ def _get_compressed_codes(
# FIXME: compress_group_index's second return value is int64, not intp

ping = self.groupings[0]
return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
return ping.codes, np.arange(len(ping._group_index), dtype=np.intp)

@final
@cache_readonly
def ngroups(self) -> int:
return len(self.result_index)

@property
def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
def _reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
codes = self.codes
ids, obs_ids, _ = self.group_info
return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)

@property
def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
warnings.warn(
"reconstructed_codes is deprecated and will be removed in a future "
"version of pandas",
category=FutureWarning,
stacklevel=find_stack_level(),
)
return self._reconstructed_codes

@cache_readonly
def result_index(self) -> Index:
if len(self.groupings) == 1:
return self.groupings[0].result_index.rename(self.names[0])
return self.groupings[0]._result_index.rename(self.names[0])

codes = self.reconstructed_codes
levels = [ping.result_index for ping in self.groupings]
codes = self._reconstructed_codes
levels = [ping._result_index for ping in self.groupings]
return MultiIndex(
levels=levels, codes=codes, verify_integrity=False, names=self.names
)
@@ -795,12 +817,12 @@ def get_group_levels(self) -> list[ArrayLike]:
# Note: only called from _insert_inaxis_grouper, which
# is only called for BaseGrouper, never for BinGrouper
if len(self.groupings) == 1:
return [self.groupings[0].group_arraylike]
return [self.groupings[0]._group_arraylike]

name_list = []
for ping, codes in zip(self.groupings, self.reconstructed_codes):
for ping, codes in zip(self.groupings, self._reconstructed_codes):
codes = ensure_platform_int(codes)
levels = ping.group_arraylike.take(codes)
levels = ping._group_arraylike.take(codes)

name_list.append(levels)

@@ -907,7 +929,7 @@ def apply_groupwise(
) -> tuple[list, bool]:
mutated = False
splitter = self._get_splitter(data, axis=axis)
group_keys = self.group_keys_seq
group_keys = self._group_keys_seq
result_values = []

# This calls DataSplitter.__iter__
@@ -1087,7 +1109,7 @@ def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
)

@cache_readonly
def reconstructed_codes(self) -> list[np.ndarray]:
def _reconstructed_codes(self) -> list[np.ndarray]:
# get unique result indices, and prepend 0 as groupby starts from the first
return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]]

pandas/tests/groupby/test_groupby.py: 10 additions, 0 deletions
@@ -3303,3 +3303,13 @@ def test_groupby_ffill_with_duplicated_index():
result = df.groupby(level=0).ffill()
expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
tm.assert_frame_equal(result, expected, check_dtype=False)


@pytest.mark.parametrize("attr", ["group_keys_seq", "reconstructed_codes"])
def test_depr_grouper_attrs(attr):
# GH#56148
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
gb = df.groupby("a")
msg = f"{attr} is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
getattr(gb.grouper, attr)
pandas/tests/groupby/test_grouping.py: 10 additions, 0 deletions
@@ -1211,3 +1211,13 @@ def test_grouper_groups():
msg = "Grouper.indexer is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
grper.indexer


@pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"])
def test_depr_grouping_attrs(attr):
# GH#56148
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
gb = df.groupby("a")
msg = f"{attr} is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
getattr(gb.grouper.groupings[0], attr)
pandas/tests/groupby/test_timegrouper.py: 3 additions, 1 deletion
@@ -67,7 +67,9 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
gb = df.groupby(tdg)

# check we're testing the case we're interested in
assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
msg = "group_keys_seq is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)

return gb

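For downstream code that still reads these internals, the warnings can be silenced explicitly while migrating off the attributes; a hedged example using only the standard warnings filters (the attributes remain slated for removal, so this only buys time):

import warnings

import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
gb = df.groupby("a")

with warnings.catch_warnings():
    # the message filter is a regex matched against the start of the warning text
    warnings.filterwarnings(
        "ignore",
        message="group_keys_seq is deprecated",
        category=FutureWarning,
    )
    keys = gb.grouper.group_keys_seq  # no FutureWarning inside this block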
