From 87dfe6ff254bf402be173365e5be60c769c2a5a4 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 16 Dec 2023 11:05:23 -0500 Subject: [PATCH 1/4] DEPR: groupby.grouper --- pandas/core/frame.py | 2 +- pandas/core/groupby/generic.py | 64 +++---- pandas/core/groupby/groupby.py | 161 ++++++++++-------- pandas/core/groupby/ops.py | 36 +--- pandas/core/resample.py | 16 +- pandas/core/reshape/merge.py | 2 +- pandas/core/reshape/pivot.py | 2 +- pandas/tests/extension/base/groupby.py | 7 +- .../tests/groupby/aggregate/test_aggregate.py | 11 +- .../methods/test_groupby_shift_diff.py | 2 +- pandas/tests/groupby/test_groupby.py | 14 +- pandas/tests/groupby/test_groupby_dropna.py | 4 +- pandas/tests/groupby/test_grouping.py | 62 ++++--- pandas/tests/groupby/test_timegrouper.py | 12 +- .../tests/groupby/transform/test_transform.py | 4 +- pandas/tests/test_sorting.py | 2 +- 16 files changed, 209 insertions(+), 192 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8ba9926c054ba..3a105c1c468dd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7464,7 +7464,7 @@ def value_counts( subset = self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby(subset, dropna=dropna, observed=False).grouper.size() + counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size() counts.name = name if sort: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c4cc30f9631ea..8b450c80b225b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -283,11 +283,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return self.obj._constructor( [], name=self.obj.name, - index=self.grouper.result_index, + index=self._grouper.result_index, dtype=obj.dtype, ) - if self.grouper.nkeys > 1: + if self._grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) try: @@ -309,7 +309,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) ) # result is a dict whose keys are the elements of result_index - result = Series(result, index=self.grouper.result_index) + result = Series(result, index=self._grouper.result_index) result = self._wrap_aggregated_output(result) return result @@ -324,7 +324,7 @@ def _python_agg_general(self, func, *args, **kwargs): f = lambda x: func(x, *args, **kwargs) obj = self._obj_with_exclusions - result = self.grouper.agg_series(obj, f) + result = self._grouper.agg_series(obj, f) res = obj._constructor(result, name=obj.name) return self._wrap_aggregated_output(res) @@ -404,7 +404,7 @@ def _wrap_applied_output( # GH#47787 see test_group_on_empty_multiindex res_index = data.index else: - res_index = self.grouper.result_index + res_index = self._grouper.result_index return self.obj._constructor( [], @@ -416,7 +416,7 @@ def _wrap_applied_output( if isinstance(values[0], dict): # GH #823 #24880 - index = self.grouper.result_index + index = self._grouper.result_index res_df = self.obj._constructor_expanddim(values, index=index) res_df = self._reindex_output(res_df) # if self.observed is False, @@ -439,7 +439,7 @@ def _wrap_applied_output( else: # GH #6265 #24880 result = self.obj._constructor( - data=values, index=self.grouper.result_index, name=self.obj.name + data=values, index=self._grouper.result_index, name=self.obj.name ) if not self.as_index: result = self._insert_inaxis_grouper(result) @@ -452,7 +452,7 @@ def _aggregate_named(self, func, *args, **kwargs): result = {} initialized = False - for 
name, group in self.grouper.get_iterator( + for name, group in self._grouper.get_iterator( self._obj_with_exclusions, axis=self.axis ): # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations @@ -526,7 +526,7 @@ def _cython_transform( obj = self._obj_with_exclusions try: - result = self.grouper._cython_operation( + result = self._grouper._cython_operation( "transform", obj._values, how, axis, **kwargs ) except NotImplementedError as err: @@ -549,7 +549,7 @@ def _transform_general( klass = type(self.obj) results = [] - for name, group in self.grouper.get_iterator( + for name, group in self._grouper.get_iterator( self._obj_with_exclusions, axis=self.axis ): # this setattr is needed for test_transform_lambda_with_datetimetz @@ -621,7 +621,7 @@ def true_and_notna(x) -> bool: try: indices = [ self._get_index(name) - for name, group in self.grouper.get_iterator( + for name, group in self._grouper.get_iterator( self._obj_with_exclusions, axis=self.axis ) if true_and_notna(group) @@ -673,11 +673,11 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: 2023-02-01 1 Freq: MS, dtype: int64 """ - ids, _, ngroups = self.grouper.group_info + ids, _, ngroups = self._grouper.group_info val = self.obj._values codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) - if self.grouper.has_dropped_na: + if self._grouper.has_dropped_na: mask = ids >= 0 ids = ids[mask] codes = codes[mask] @@ -699,7 +699,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: res = np.bincount(ids[~mask], minlength=ngroups) res = ensure_int64(res) - ri = self.grouper.result_index + ri = self._grouper.result_index result: Series | DataFrame = self.obj._constructor( res, index=ri, name=self.obj.name ) @@ -734,10 +734,10 @@ def value_counts( from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut - ids, _, _ = self.grouper.group_info + ids, _, _ = self._grouper.group_info val = self.obj._values - index_names = self.grouper.names + [self.obj.name] + index_names = self._grouper.names + [self.obj.name] if isinstance(val.dtype, CategoricalDtype) or ( bins is not None and not np.iterable(bins) @@ -804,9 +804,9 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - codes = self.grouper._reconstructed_codes + codes = self._grouper.reconstructed_codes codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping._group_index for ping in self.grouper.groupings] + [lev] + levels = [ping._group_index for ping in self._grouper.groupings] + [lev] if dropna: mask = codes[-1] != -1 @@ -1460,7 +1460,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) func, *args, engine_kwargs=engine_kwargs, **kwargs ) # grouper specific aggregations - if self.grouper.nkeys > 1: + if self._grouper.nkeys > 1: # test_groupby_as_index_series_scalar gets here with 'not self.as_index' return self._python_agg_general(func, *args, **kwargs) elif args or kwargs: @@ -1528,7 +1528,7 @@ def _python_agg_general(self, func, *args, **kwargs): output: dict[int, ArrayLike] = {} for idx, (name, ser) in enumerate(obj.items()): - result = self.grouper.agg_series(ser, f) + result = self._grouper.agg_series(ser, f) output[idx] = result res = self.obj._constructor(output) @@ -1536,17 +1536,17 @@ def _python_agg_general(self, func, *args, **kwargs): return self._wrap_aggregated_output(res) def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: - if 
self.grouper.nkeys != 1: + if self._grouper.nkeys != 1: raise AssertionError("Number of keys must be 1") obj = self._obj_with_exclusions result: dict[Hashable, NDFrame | np.ndarray] = {} - for name, grp_df in self.grouper.get_iterator(obj, self.axis): + for name, grp_df in self._grouper.get_iterator(obj, self.axis): fres = func(grp_df, *args, **kwargs) result[name] = fres - result_index = self.grouper.result_index + result_index = self._grouper.result_index other_ax = obj.axes[1 - self.axis] out = self.obj._constructor(result, index=other_ax, columns=result_index) if self.axis == 0: @@ -1566,7 +1566,7 @@ def _wrap_applied_output( # GH#47787 see test_group_on_empty_multiindex res_index = data.index else: - res_index = self.grouper.result_index + res_index = self._grouper.result_index result = self.obj._constructor(index=res_index, columns=data.columns) result = result.astype(data.dtypes, copy=False) @@ -1586,7 +1586,7 @@ def _wrap_applied_output( is_transform=is_transform, ) - key_index = self.grouper.result_index if self.as_index else None + key_index = self._grouper.result_index if self.as_index else None if isinstance(first_not_none, (np.ndarray, Index)): # GH#1738: values is list of arrays of unequal lengths @@ -1692,7 +1692,7 @@ def _cython_transform( ) def arr_func(bvalues: ArrayLike) -> ArrayLike: - return self.grouper._cython_operation( + return self._grouper._cython_operation( "transform", bvalues, how, 1, **kwargs ) @@ -1714,7 +1714,7 @@ def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): applied = [] obj = self._obj_with_exclusions - gen = self.grouper.get_iterator(obj, axis=self.axis) + gen = self._grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) # Determine whether to use slow or fast path by evaluating on the first group. 
@@ -1908,7 +1908,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): indices = [] obj = self._selected_obj - gen = self.grouper.get_iterator(obj, axis=self.axis) + gen = self._grouper.get_iterator(obj, axis=self.axis) for name, group in gen: # 2023-02-27 no tests are broken this pinning, but it is documented in the @@ -1970,7 +1970,7 @@ def _gotitem(self, key, ndim: int, subset=None): self.keys, axis=self.axis, level=self.level, - grouper=self.grouper, + grouper=self._grouper, exclusions=self.exclusions, selection=key, as_index=self.as_index, @@ -1986,7 +1986,7 @@ def _gotitem(self, key, ndim: int, subset=None): subset, self.keys, level=self.level, - grouper=self.grouper, + grouper=self._grouper, exclusions=self.exclusions, selection=key, as_index=self.as_index, @@ -2023,7 +2023,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: SeriesGroupBy( obj.iloc[:, i], selection=colname, - grouper=self.grouper, + grouper=self._grouper, exclusions=self.exclusions, observed=self.observed, ) @@ -2033,7 +2033,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: if not len(results): # concat would raise - res_df = DataFrame([], columns=columns, index=self.grouper.result_index) + res_df = DataFrame([], columns=columns, index=self._grouper.result_index) else: res_df = concat(results, keys=columns, axis=1) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index aa78bbe1c3ec1..9a79488b07bce 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -776,7 +776,7 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): } axis: AxisInt - grouper: ops.BaseGrouper + _grouper: ops.BaseGrouper keys: _KeysArgType | None = None level: IndexLabel | None = None group_keys: bool @@ -790,6 +790,17 @@ def __repr__(self) -> str: # TODO: Better repr for GroupBy object return object.__repr__(self) + @final + @property + def grouper(self) -> ops.BaseGrouper: + warnings.warn( + f"{type(self).__name__}.grouper is deprecated and will be removed in a " + "future version of pandas.", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._grouper + @final @property def groups(self) -> dict[Hashable, np.ndarray]: @@ -836,12 +847,12 @@ def groups(self) -> dict[Hashable, np.ndarray]: >>> ser.resample('MS').groups {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} """ - return self.grouper.groups + return self._grouper.groups @final @property def ngroups(self) -> int: - return self.grouper.ngroups + return self._grouper.ngroups @final @property @@ -891,7 +902,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: defaultdict(, {Timestamp('2023-01-01 00:00:00'): [0, 1], Timestamp('2023-02-01 00:00:00'): [2, 3]}) """ - return self.grouper.indices + return self._grouper.indices @final def _get_indices(self, names): @@ -1188,7 +1199,7 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: """ keys = self.keys level = self.level - result = self.grouper.get_iterator(self._selected_obj, axis=self.axis) + result = self._grouper.get_iterator(self._selected_obj, axis=self.axis) # error: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized" if is_list_like(level) and len(level) == 1: # type: ignore[arg-type] # GH 51583 @@ -1278,7 +1289,7 @@ class GroupBy(BaseGroupBy[NDFrameT]): more """ - grouper: ops.BaseGrouper + _grouper: ops.BaseGrouper as_index: bool @final @@ -1339,7 +1350,7 @@ def __init__( self.obj = obj self.axis = obj._get_axis_number(axis) - 
self.grouper = grouper + self._grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() def __getattr__(self, attr: str): @@ -1417,7 +1428,7 @@ def curried(x): not_indexed_same=not is_transform, ) - if self.grouper.has_dropped_na and is_transform: + if self._grouper.has_dropped_na and is_transform: # result will have dropped rows due to nans, fill with null # and ensure index is ordered same as the input result = self._set_result_index_ordered(result) @@ -1438,9 +1449,9 @@ def _concat_objects( if self.group_keys and not is_transform: if self.as_index: # possible MI return case - group_keys = self.grouper.result_index - group_levels = self.grouper.levels - group_names = self.grouper.names + group_keys = self._grouper.result_index + group_levels = self._grouper.levels + group_names = self._grouper.names result = concat( values, @@ -1461,7 +1472,7 @@ def _concat_objects( ax = self._selected_obj._get_axis(self.axis) if self.dropna: - labels = self.grouper.group_info[0] + labels = self._grouper.group_info[0] mask = labels != -1 ax = ax[mask] @@ -1503,16 +1514,16 @@ def _set_result_index_ordered( obj_axis = self.obj._get_axis(self.axis) - if self.grouper.is_monotonic and not self.grouper.has_dropped_na: + if self._grouper.is_monotonic and not self._grouper.has_dropped_na: # shortcut if we have an already ordered grouper result = result.set_axis(obj_axis, axis=self.axis, copy=False) return result # row order is scrambled => sort the rows by position in original index - original_positions = Index(self.grouper.result_ilocs()) + original_positions = Index(self._grouper.result_ilocs()) result = result.set_axis(original_positions, axis=self.axis, copy=False) result = result.sort_index(axis=self.axis) - if self.grouper.has_dropped_na: + if self._grouper.has_dropped_na: # Add back in any missing rows due to dropna - index here is integral # with values referring to the row of the input so can use RangeIndex result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) @@ -1528,9 +1539,9 @@ def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: # zip in reverse so we can always insert at loc 0 columns = result.columns for name, lev, in_axis in zip( - reversed(self.grouper.names), - reversed(self.grouper.get_group_levels()), - reversed([grp.in_axis for grp in self.grouper.groupings]), + reversed(self._grouper.names), + reversed(self._grouper.get_group_levels()), + reversed([grp.in_axis for grp in self._grouper.groupings]), ): # GH #28549 # When using .apply(-), name will be in columns already @@ -1588,10 +1599,10 @@ def _wrap_aggregated_output( # enforced in __init__ result = self._insert_inaxis_grouper(result) result = result._consolidate() - index = Index(range(self.grouper.ngroups)) + index = Index(range(self._grouper.ngroups)) else: - index = self.grouper.result_index + index = self._grouper.result_index if qs is not None: # We get here with len(qs) != 1 and not self.as_index @@ -1619,20 +1630,20 @@ def _wrap_applied_output( @final def _numba_prep(self, data: DataFrame): - ids, _, ngroups = self.grouper.group_info - sorted_index = self.grouper._sort_idx - sorted_ids = self.grouper._sorted_ids + ids, _, ngroups = self._grouper.group_info + sorted_index = self._grouper._sort_idx + sorted_ids = self._grouper._sorted_ids sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() # GH 46867 index_data = data.index if isinstance(index_data, MultiIndex): - if len(self.grouper.groupings) > 1: + if len(self._grouper.groupings) > 1: raise 
NotImplementedError( "Grouping with more than 1 grouping labels and " "a MultiIndex is not supported with engine='numba'" ) - group_key = self.grouper.groupings[0].name + group_key = self._grouper.groupings[0].name index_data = index_data.get_level_values(group_key) sorted_index_data = index_data.take(sorted_index).to_numpy() @@ -1673,13 +1684,13 @@ def _numba_agg_general( ) # Pass group ids to kernel directly if it can handle it # (This is faster since it doesn't require a sort) - ids, _, _ = self.grouper.group_info - ngroups = self.grouper.ngroups + ids, _, _ = self._grouper.group_info + ngroups = self._grouper.ngroups res_mgr = df._mgr.apply( aggregator, labels=ids, ngroups=ngroups, **aggregator_kwargs ) - res_mgr.axes[1] = self.grouper.result_index + res_mgr.axes[1] = self._grouper.result_index result = df._constructor_from_mgr(res_mgr, axes=res_mgr.axes) if data.ndim == 1: @@ -1750,7 +1761,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): len(df.columns), *args, ) - index = self.grouper.result_index + index = self._grouper.result_index if data.ndim == 1: result_kwargs = {"name": data.name} result = result.ravel() @@ -1870,7 +1881,7 @@ def _python_apply_general( Series or DataFrame data after applying f """ - values, mutated = self.grouper.apply_groupwise(f, data, self.axis) + values, mutated = self._grouper.apply_groupwise(f, data, self.axis) if not_indexed_same is None: not_indexed_same = mutated @@ -1927,7 +1938,7 @@ def _agg_py_fallback( # should always be preserved by the implemented aggregations # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype? try: - res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) + res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True) except Exception as err: msg = f"agg function failed [how->{how},dtype->{ser.dtype}]" # preserve the kind of exception that raised @@ -1958,7 +1969,7 @@ def _cython_agg_general( def array_func(values: ArrayLike) -> ArrayLike: try: - result = self.grouper._cython_operation( + result = self._grouper._cython_operation( "aggregate", values, how, @@ -2047,8 +2058,8 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: obj = self._obj_with_exclusions # for each col, reshape to size of original frame by take operation - ids, _, _ = self.grouper.group_info - result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False) + ids, _, _ = self._grouper.group_info + result = result.reindex(self._grouper.result_index, axis=self.axis, copy=False) if self.obj.ndim == 1: # i.e. 
SeriesGroupBy @@ -2100,7 +2111,7 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, _, ngroups = self.grouper.group_info + ids, _, ngroups = self._grouper.group_info sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2116,7 +2127,7 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: else: out = np.repeat(out[np.r_[run[1:], True]], rep) - out - if self.grouper.has_dropped_na: + if self._grouper.has_dropped_na: out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False)) else: out = out.astype(np.int64, copy=False) @@ -2309,7 +2320,7 @@ def count(self) -> NDFrameT: Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, _, ngroups = self.grouper.group_info + ids, _, ngroups = self._grouper.group_info mask = ids != -1 is_series = data.ndim == 1 @@ -2764,7 +2775,7 @@ def _value_counts( obj = self._obj_with_exclusions in_axis_names = { - grouping.name for grouping in self.grouper.groupings if grouping.in_axis + grouping.name for grouping in self._grouper.groupings if grouping.in_axis } if isinstance(obj, Series): _name = obj.name @@ -2795,7 +2806,7 @@ def _value_counts( if _name not in in_axis_names and _name in subsetted ] - groupings = list(self.grouper.groupings) + groupings = list(self._grouper.groupings) for key in keys: grouper, _, _ = get_grouper( df, @@ -2840,7 +2851,7 @@ def _value_counts( names = result_series.index.names # GH#55951 - Temporarily replace names in case they are integers result_series.index.names = range(len(names)) - index_level = list(range(len(self.grouper.groupings))) + index_level = list(range(len(self._grouper.groupings))) result_series = result_series.sort_index( level=index_level, sort_remaining=False ) @@ -2851,7 +2862,7 @@ def _value_counts( # We are guaranteed to have the first N levels be the # user-requested grouping. 
levels = list( - range(len(self.grouper.groupings), result_series.index.nlevels) + range(len(self._grouper.groupings), result_series.index.nlevels) ) indexed_group_size = result_series.groupby( result_series.index.droplevel(levels), @@ -2877,7 +2888,7 @@ def _value_counts( result_series.name = name result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() - orig_dtype = self.grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] + orig_dtype = self._grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] cols = Index(columns, dtype=orig_dtype).insert(len(columns), name) result_frame.columns = cols result = result_frame @@ -3027,7 +3038,7 @@ def size(self) -> DataFrame | Series: 2023-02-01 1 Freq: MS, dtype: int64 """ - result = self.grouper.size() + result = self._grouper.size() dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): @@ -3527,13 +3538,13 @@ def ohlc(self) -> DataFrame: if not is_numeric: raise DataError("No numeric types to aggregate") - res_values = self.grouper._cython_operation( + res_values = self._grouper._cython_operation( "aggregate", obj._values, "ohlc", axis=0, min_count=-1 ) agg_names = ["open", "high", "low", "close"] result = self.obj._constructor_expanddim( - res_values, index=self.grouper.result_index, columns=agg_names + res_values, index=self._grouper.result_index, columns=agg_names ) return self._reindex_output(result) @@ -3846,7 +3857,7 @@ def rolling(self, *args, **kwargs) -> RollingGroupby: return RollingGroupby( self._selected_obj, *args, - _grouper=self.grouper, + _grouper=self._grouper, _as_index=self.as_index, **kwargs, ) @@ -3868,7 +3879,7 @@ def expanding(self, *args, **kwargs) -> ExpandingGroupby: return ExpandingGroupby( self._selected_obj, *args, - _grouper=self.grouper, + _grouper=self._grouper, **kwargs, ) @@ -3888,7 +3899,7 @@ def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: return ExponentialMovingWindowGroupby( self._selected_obj, *args, - _grouper=self.grouper, + _grouper=self._grouper, **kwargs, ) @@ -3920,7 +3931,7 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): if limit is None: limit = -1 - ids, _, _ = self.grouper.group_info + ids, _, _ = self._grouper.group_info sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) if direction == "bfill": sorted_labels = sorted_labels[::-1] @@ -3945,7 +3956,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: # np.take_along_axis if isinstance(values, np.ndarray): dtype = values.dtype - if self.grouper.has_dropped_na: + if self._grouper.has_dropped_na: # dropped null groups give rise to nan in the result dtype = ensure_dtype_can_hold_na(values.dtype) out = np.empty(values.shape, dtype=dtype) @@ -4251,7 +4262,7 @@ def _nth( if not dropna: mask = self._make_mask_from_positional_indexer(n) - ids, _, _ = self.grouper.group_info + ids, _, _ = self._grouper.group_info # Drop NA values in grouping mask = mask & (ids != -1) @@ -4280,14 +4291,14 @@ def _nth( grouper: np.ndarray | Index | ops.BaseGrouper if len(dropped) == len(self._selected_obj): # Nothing was dropped, can use the same grouper - grouper = self.grouper + grouper = self._grouper else: # we don't have the grouper info available # (e.g. 
we have selected out # a column that is not in the current object) - axis = self.grouper.axis - grouper = self.grouper.codes_info[axis.isin(dropped.index)] - if self.grouper.has_dropped_na: + axis = self._grouper.axis + grouper = self._grouper.codes_info[axis.isin(dropped.index)] + if self._grouper.has_dropped_na: # Null groups need to still be encoded as -1 when passed to groupby nulls = grouper == -1 # error: No overload variant of "where" matches argument types @@ -4352,10 +4363,10 @@ def quantile( mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile") obj = self._wrap_agged_manager(mgr) if self.axis == 1: - splitter = self.grouper._get_splitter(obj.T, axis=self.axis) + splitter = self._grouper._get_splitter(obj.T, axis=self.axis) sdata = splitter._sorted_data.T else: - splitter = self.grouper._get_splitter(obj, axis=self.axis) + splitter = self._grouper._get_splitter(obj, axis=self.axis) sdata = splitter._sorted_data starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) @@ -4462,7 +4473,7 @@ def post_processor( qs = np.array([q], dtype=np.float64) pass_qs = None - ids, _, ngroups = self.grouper.group_info + ids, _, ngroups = self._grouper.group_info nqs = len(qs) func = partial( @@ -4595,16 +4606,16 @@ def ngroup(self, ascending: bool = True): """ obj = self._obj_with_exclusions index = obj._get_axis(self.axis) - comp_ids = self.grouper.group_info[0] + comp_ids = self._grouper.group_info[0] dtype: type - if self.grouper.has_dropped_na: + if self._grouper.has_dropped_na: comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) dtype = np.float64 else: dtype = np.int64 - if any(ping._passed_categorical for ping in self.grouper.groupings): + if any(ping._passed_categorical for ping in self._grouper.groupings): # comp_ids reflect non-observed groups, we need only observed comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 @@ -5182,7 +5193,7 @@ def shift( else: if fill_value is lib.no_default: fill_value = None - ids, _, ngroups = self.grouper.group_info + ids, _, ngroups = self._grouper.group_info res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) @@ -5417,9 +5428,9 @@ def pct_change( limit = 0 filled = getattr(self, fill_method)(limit=limit) if self.axis == 0: - fill_grp = filled.groupby(self.grouper.codes, group_keys=self.group_keys) + fill_grp = filled.groupby(self._grouper.codes, group_keys=self.group_keys) else: - fill_grp = filled.T.groupby(self.grouper.codes, group_keys=self.group_keys) + fill_grp = filled.T.groupby(self._grouper.codes, group_keys=self.group_keys) shifted = fill_grp.shift(periods=periods, freq=freq) if self.axis == 1: shifted = shifted.T @@ -5521,7 +5532,7 @@ def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: Series or DataFrame Filtered _selected_obj. """ - ids = self.grouper.group_info[0] + ids = self._grouper.group_info[0] mask = mask & (ids != -1) if self.axis == 0: @@ -5561,7 +5572,7 @@ def _reindex_output( Series or DataFrame Object (potentially) re-indexed to include all possible groups. 
""" - groupings = self.grouper.groupings + groupings = self._grouper.groupings if len(groupings) == 1: return output @@ -5578,7 +5589,7 @@ def _reindex_output( return output levels_list = [ping._group_index for ping in groupings] - names = self.grouper.names + names = self._grouper.names if qs is not None: # error: Argument 1 to "append" of "list" has incompatible type # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" @@ -5600,7 +5611,7 @@ def _reindex_output( # GH 13204 # Here, the categorical in-axis groupers, which need to be fully # expanded, are columns in `output`. An idea is to do: - # output = output.set_index(self.grouper.names) + # output = output.set_index(self._grouper.names) # .reindex(index).reset_index() # but special care has to be taken because of possible not-in-axis # groupers. @@ -5616,7 +5627,7 @@ def _reindex_output( output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex( + output = output.set_index(self._grouper.result_index).reindex( index, copy=False, fill_value=fill_value ) @@ -5732,7 +5743,7 @@ def sample( random_state = com.random_state(random_state) - group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) + group_iterator = self._grouper.get_iterator(self._selected_obj, self.axis) sampled_indices = [] for labels, obj in group_iterator: @@ -5796,16 +5807,16 @@ def _idxmax_idxmin( axis = self.axis if not self.observed and any( - ping._passed_categorical for ping in self.grouper.groupings + ping._passed_categorical for ping in self._grouper.groupings ): expected_len = np.prod( - [len(ping._group_index) for ping in self.grouper.groupings] + [len(ping._group_index) for ping in self._grouper.groupings] ) - if len(self.grouper.groupings) == 1: - result_len = len(self.grouper.groupings[0].grouping_vector.unique()) + if len(self._grouper.groupings) == 1: + result_len = len(self._grouper.groupings[0].grouping_vector.unique()) else: # result_index only contains observed groups in this case - result_len = len(self.grouper.result_index) + result_len = len(self._grouper.result_index) assert result_len <= expected_len has_unobserved = result_len < expected_len diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f3579e6c13a19..001b045fd7a55 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -15,7 +15,6 @@ Generic, final, ) -import warnings import numpy as np @@ -33,7 +32,6 @@ ) from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( @@ -618,7 +616,7 @@ def get_iterator( for each group """ splitter = self._get_splitter(data, axis=axis) - keys = self._group_keys_seq + keys = self.group_keys_seq yield from zip(keys, splitter) @final @@ -640,7 +638,7 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter: @final @cache_readonly - def _group_keys_seq(self): + def group_keys_seq(self): if len(self.groupings) == 1: return self.levels[0] else: @@ -649,16 +647,6 @@ def _group_keys_seq(self): # provide "flattened" iterator for multi-group setting return get_flattened_list(ids, ngroups, self.levels, self.codes) - @property - def group_keys_seq(self): - warnings.warn( - "group_keys_seq is deprecated and will be removed in a future " - "version of pandas", - category=FutureWarning, - 
stacklevel=find_stack_level(), - ) - return self._group_keys_seq - @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" @@ -786,27 +774,17 @@ def ngroups(self) -> int: return len(self.result_index) @property - def _reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: + def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: codes = self.codes ids, obs_ids, _ = self.group_info return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) - @property - def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: - warnings.warn( - "reconstructed_codes is deprecated and will be removed in a future " - "version of pandas", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - return self._reconstructed_codes - @cache_readonly def result_index(self) -> Index: if len(self.groupings) == 1: return self.groupings[0]._result_index.rename(self.names[0]) - codes = self._reconstructed_codes + codes = self.reconstructed_codes levels = [ping._result_index for ping in self.groupings] return MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names @@ -820,7 +798,7 @@ def get_group_levels(self) -> list[ArrayLike]: return [self.groupings[0]._group_arraylike] name_list = [] - for ping, codes in zip(self.groupings, self._reconstructed_codes): + for ping, codes in zip(self.groupings, self.reconstructed_codes): codes = ensure_platform_int(codes) levels = ping._group_arraylike.take(codes) @@ -929,7 +907,7 @@ def apply_groupwise( ) -> tuple[list, bool]: mutated = False splitter = self._get_splitter(data, axis=axis) - group_keys = self._group_keys_seq + group_keys = self.group_keys_seq result_values = [] # This calls DataSplitter.__iter__ @@ -1109,7 +1087,7 @@ def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: ) @cache_readonly - def _reconstructed_codes(self) -> list[np.ndarray]: + def reconstructed_codes(self) -> list[np.ndarray]: # get unique result indices, and prepend 0 as groupby starts from the first return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] diff --git a/pandas/core/resample.py b/pandas/core/resample.py index d54f6d31f6144..1804d75e453b6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -142,7 +142,7 @@ class Resampler(BaseGroupBy, PandasObject): After resampling, see aggregate, apply, and transform functions. """ - grouper: BinGrouper + _grouper: BinGrouper _timegrouper: TimeGrouper binner: DatetimeIndex | TimedeltaIndex | PeriodIndex # depends on subclass exclusions: frozenset[Hashable] = frozenset() # for SelectionMixin compat @@ -184,7 +184,7 @@ def __init__( self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index ) - self.binner, self.grouper = self._get_binner() + self.binner, self._grouper = self._get_binner() self._selection = selection if self._timegrouper.key is not None: self.exclusions = frozenset([self._timegrouper.key]) @@ -414,7 +414,7 @@ def _gotitem(self, key, ndim: int, subset=None): subset : object, default None subset to act on """ - grouper = self.grouper + grouper = self._grouper if subset is None: subset = self.obj if key is not None: @@ -434,7 +434,7 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): """ Re-evaluate the obj with a groupby aggregation. 
""" - grouper = self.grouper + grouper = self._grouper # Excludes `on` column when provided obj = self._obj_with_exclusions @@ -1764,7 +1764,7 @@ def _downsample(self, how, **kwargs): # error: Item "None" of "Optional[Any]" has no attribute "binlabels" if ( (ax.freq is not None or ax.inferred_freq is not None) - and len(self.grouper.binlabels) > len(ax) + and len(self._grouper.binlabels) > len(ax) and how is None ): # let's do an asfreq @@ -1773,10 +1773,10 @@ def _downsample(self, how, **kwargs): # we are downsampling # we want to call the actual grouper method here if self.axis == 0: - result = obj.groupby(self.grouper).aggregate(how, **kwargs) + result = obj.groupby(self._grouper).aggregate(how, **kwargs) else: # test_resample_axis1 - result = obj.T.groupby(self.grouper).aggregate(how, **kwargs).T + result = obj.T.groupby(self._grouper).aggregate(how, **kwargs).T return self._wrap_result(result) @@ -2253,7 +2253,7 @@ def _get_grouper( ) -> tuple[BinGrouper, NDFrameT]: # create the resampler and return our binner r = self._get_resampler(obj) - return r.grouper, cast(NDFrameT, r.obj) + return r._grouper, cast(NDFrameT, r.obj) def _get_time_bins(self, ax: DatetimeIndex): if not isinstance(ax, DatetimeIndex): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c43c16cded852..f498ad3567ff2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -265,7 +265,7 @@ def _groupby_and_merge( if all(item in right.columns for item in by): rby = right.groupby(by, sort=False) - for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis): + for key, lhs in lby._grouper.get_iterator(lby._selected_obj, axis=lby.axis): if rby is None: rhs = right else: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 82718d4c43a65..b2a915589cba7 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -171,7 +171,7 @@ def __internal_pivot_table( observed_bool = False if observed is lib.no_default else observed grouped = data.groupby(keys, observed=observed_bool, sort=sort, dropna=dropna) if observed is lib.no_default and any( - ping._passed_categorical for ping in grouped.grouper.groupings + ping._passed_categorical for ping in grouped._grouper.groupings ): warnings.warn( "The default value of observed=False is deprecated and will change " diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 4e8221f67a74d..8a14fc0af75c1 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -28,8 +28,11 @@ def test_grouping_grouper(self, data_for_grouping): "B": data_for_grouping, } ) - gr1 = df.groupby("A").grouper.groupings[0] - gr2 = df.groupby("B").grouper.groupings[0] + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gr1 = df.groupby("A").grouper.groupings[0] + with tm.assert_produces_warning(FutureWarning, match=msg): + gr2 = df.groupby("B").grouper.groupings[0] tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values) tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index c3bcd30796e63..e33660ccf0df8 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -168,14 +168,19 @@ def test_agg_grouping_is_list_tuple(ts): ) grouped = df.groupby(lambda x: x.year) - grouper = 
grouped.grouper.groupings[0].grouping_vector - grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper)) + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouper = grouped.grouper.groupings[0].grouping_vector + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper)) result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) - grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) result = grouped.agg("mean") expected = grouped.mean() diff --git a/pandas/tests/groupby/methods/test_groupby_shift_diff.py b/pandas/tests/groupby/methods/test_groupby_shift_diff.py index 0ce6a6462a5d8..94e672d4892fe 100644 --- a/pandas/tests/groupby/methods/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/methods/test_groupby_shift_diff.py @@ -18,7 +18,7 @@ def test_group_shift_with_null_key(): # Generate a moderately large dataframe with occasional missing # values in column `B`, and then group by [`A`, `B`]. This should - # force `-1` in `labels` array of `g.grouper.group_info` exactly + # force `-1` in `labels` array of `g._grouper.group_info` exactly # at those places, where the group-by key is partially missing. df = DataFrame( [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 802cae9ff65f0..9a0cbaf376008 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1537,7 +1537,9 @@ def test_groupby_nat_exclude(): tm.assert_index_equal(grouped.groups[k], e) # confirm obj is not filtered - tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) assert grouped.ngroups == 2 expected = { @@ -3303,13 +3305,3 @@ def test_groupby_ffill_with_duplicated_index(): result = df.groupby(level=0).ffill() expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2]) tm.assert_frame_equal(result, expected, check_dtype=False) - - -@pytest.mark.parametrize("attr", ["group_keys_seq", "reconstructed_codes"]) -def test_depr_grouper_attrs(attr): - # GH#56148 - df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) - gb = df.groupby("a") - msg = f"{attr} is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - getattr(gb.grouper, attr) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 4f54621b19b64..05d56ab45b0cd 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -172,7 +172,9 @@ def test_grouper_dropna_propagation(dropna): # GH 36604 df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) gb = df.groupby("A", dropna=dropna) - assert gb.grouper.dropna == dropna + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert gb.grouper.dropna == dropna @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index c401762dace23..ab8e3c39b55b2 100644 --- 
a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -428,7 +428,9 @@ def test_grouper_getting_correct_binner(self): tm.assert_frame_equal(result, expected) def test_grouper_iter(self, df): - assert sorted(df.groupby("A").grouper) == ["bar", "foo"] + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert sorted(df.groupby("A").grouper) == ["bar", "foo"] def test_empty_groups(self, df): # see gh-1048 @@ -438,7 +440,9 @@ def test_empty_groups(self, df): def test_groupby_grouper(self, df): grouped = df.groupby("A") - result = df.groupby(grouped.grouper).mean(numeric_only=True) + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(grouped.grouper).mean(numeric_only=True) expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) @@ -748,14 +752,16 @@ def test_level_preserve_order(self, sort, labels, multiindex_dataframe_random_da # GH 17537 grouped = multiindex_dataframe_random_data.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) - tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) + tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels) def test_grouping_labels(self, multiindex_dataframe_random_data): grouped = multiindex_dataframe_random_data.groupby( multiindex_dataframe_random_data.index.get_level_values(0) ) exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) - tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) def test_list_grouper_with_nat(self): # GH 14715 @@ -814,19 +820,26 @@ def test_groupby_empty(self): tm.assert_series_equal(result, expected) # check group properties - assert len(gr.grouper.groupings) == 1 - tm.assert_numpy_array_equal( - gr.grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) - ) + msg = "SeriesGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert len(gr.grouper.groupings) == 1 + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_numpy_array_equal( + gr.grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) + ) - tm.assert_numpy_array_equal( - gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_numpy_array_equal( + gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) + ) - assert gr.grouper.group_info[2] == 0 + with tm.assert_produces_warning(FutureWarning, match=msg): + assert gr.grouper.group_info[2] == 0 # check name - assert s.groupby(s).grouper.names == ["name"] + msg = "SeriesGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert s.groupby(s).grouper.names == ["name"] def test_groupby_level_index_value_all_na(self): # issue 20519 @@ -1024,8 +1037,10 @@ def test_grouping_is_iterable(self, tsframe): grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year]) # test it works - for g in grouped.grouper.groupings[0]: - pass + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + for g in grouped.grouper.groupings[0]: + pass def test_multi_iter(self): s = Series(np.arange(6)) @@ -1161,7 +1176,9 @@ def test_grouping_string_repr(self): df = 
DataFrame([[1, 2, 3]], columns=mi) gr = df.groupby(df[("A", "a")]) - result = gr.grouper.groupings[0].__repr__() + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gr.grouper.groupings[0].__repr__() expected = "Grouping(('A', 'a'))" assert result == expected @@ -1170,8 +1187,11 @@ def test_grouping_by_key_is_in_axis(): # GH#50413 - Groupers specified by key are in-axis df = DataFrame({"a": [1, 1, 2], "b": [1, 1, 2], "c": [3, 4, 5]}).set_index("a") gb = df.groupby([Grouper(level="a"), Grouper(key="b")], as_index=False) - assert not gb.grouper.groupings[0].in_axis - assert gb.grouper.groupings[1].in_axis + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not gb.grouper.groupings[0].in_axis + with tm.assert_produces_warning(FutureWarning, match=msg): + assert gb.grouper.groupings[1].in_axis # Currently only in-axis groupings are including in the result when as_index=False; # This is likely to change in the future. @@ -1196,7 +1216,9 @@ def test_grouper_groups(): msg = "Use GroupBy.grouper instead" with tm.assert_produces_warning(FutureWarning, match=msg): res = grper.grouper - assert res is gb.grouper + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert res is gb.grouper msg = "Grouper.obj is deprecated and will be removed" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -1219,4 +1241,4 @@ def test_depr_grouping_attrs(attr): gb = df.groupby("a") msg = f"{attr} is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - getattr(gb.grouper.groupings[0], attr) + getattr(gb._grouper.groupings[0], attr) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a5ac5b09bfd34..f889bac8d1a41 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -52,8 +52,8 @@ def frame_for_truncated_bingrouper(): @pytest.fixture def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): """ - GroupBy object such that gb.grouper is a BinGrouper and - len(gb.grouper.result_index) < len(gb.grouper.group_keys_seq) + GroupBy object such that gb._grouper is a BinGrouper and + len(gb._grouper.result_index) < len(gb._grouper.group_keys_seq) Aggregations on this groupby should have @@ -67,7 +67,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): gb = df.groupby(tdg) # check we're testing the case we're interested in - msg = "group_keys_seq is deprecated" + msg = "DataFrameGroupBy.grouper is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq) @@ -157,7 +157,9 @@ def test_groupby_with_timegrouper_methods(self, should_sort): g = df.groupby(Grouper(freq="6ME")) assert g.group_keys - assert isinstance(g.grouper, BinGrouper) + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert isinstance(g.grouper, BinGrouper) groups = g.groups assert isinstance(groups, dict) assert len(groups) == 3 @@ -880,7 +882,7 @@ def test_grouper_period_index(self): def test_groupby_apply_timegrouper_with_nat_dict_returns( self, groupby_with_truncated_bingrouper ): - # GH#43500 case where gb.grouper.result_index and gb.grouper.group_keys_seq + # GH#43500 case where gb._grouper.result_index and gb._grouper.group_keys_seq # have different lengths that 
goes through the `isinstance(values[0], dict)` # path gb = groupby_with_truncated_bingrouper diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 35699fe9647d7..b1b8348fb63dd 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1353,7 +1353,9 @@ def func(grp): # Check that the fastpath raises, see _transform_general obj = gb._obj_with_exclusions - gen = gb.grouper.get_iterator(obj, axis=gb.axis) + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gen = gb.grouper.get_iterator(obj, axis=gb.axis) fast_path, slow_path = gb._define_paths(func) _, group = next(gen) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 046a4749925d8..285f240028152 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -106,7 +106,7 @@ def test_int64_overflow_groupby_large_df_shuffled(self, agg): gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! - assert is_int64_overflow_possible(gr.grouper.shape) + assert is_int64_overflow_possible(gr._grouper.shape) mi = MultiIndex.from_arrays( [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)], From 6157a0f95e06a9523ae15983d8532aa5fb8e646d Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 16 Dec 2023 11:10:12 -0500 Subject: [PATCH 2/4] DEPR: groupby.grouper --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8209525721b98..33d0164ab3882 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -472,7 +472,7 @@ Other Deprecations - Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) - Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) -- Deprecated the :class:`.BaseGrouper` attributes ``group_keys_seq`` and ``reconstructed_codes``; these will be removed in a future version of pandas (:issue:`56148`) +- Deprecated the :class:`.DataFrameGroupBy.grouper` and :class:`SeriesGroupBy.grouper`; these will be removed in a future version of pandas (:issue:`56521`) - Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`) - Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`) - Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`) From 0362778f1cb55f46dfc71cd4c220864527d24c02 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 18 Dec 2023 08:15:45 -0500 Subject: [PATCH 3/4] fix whatsnew, tests --- doc/source/whatsnew/v2.2.0.rst | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 11 +--- pandas/tests/groupby/test_groupby.py | 4 +- pandas/tests/groupby/test_groupby_dropna.py | 4 +- pandas/tests/groupby/test_grouping.py | 62 ++++++++----------- pandas/tests/groupby/test_timegrouper.py | 8 +-- .../tests/groupby/transform/test_transform.py | 4 +- 7 files changed, 36 insertions(+), 59 
deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 33d0164ab3882..2c3066ef8ea22 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -472,7 +472,7 @@ Other Deprecations - Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) - Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) -- Deprecated the :class:`.DataFrameGroupBy.grouper` and :class:`SeriesGroupBy.grouper`; these will be removed in a future version of pandas (:issue:`56521`) +- Deprecated the :attr:`.DataFrameGroupBy.grouper` and :attr:`SeriesGroupBy.grouper`; these attributes will be removed in a future version of pandas (:issue:`56521`) - Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`) - Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`) - Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e33660ccf0df8..6223a153df358 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -168,19 +168,14 @@ def test_agg_grouping_is_list_tuple(ts): ) grouped = df.groupby(lambda x: x.year) - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouper = grouped.grouper.groupings[0].grouping_vector - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper)) + grouper = grouped._grouper.groupings[0].grouping_vector + grouped._grouper.groupings[0] = Grouping(ts.index, list(grouper)) result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) + grouped._grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) result = grouped.agg("mean") expected = grouped.mean() diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9a0cbaf376008..94263df8a93a0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1537,9 +1537,7 @@ def test_groupby_nat_exclude(): tm.assert_index_equal(grouped.groups[k], e) # confirm obj is not filtered - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) + tm.assert_frame_equal(grouped._grouper.groupings[0].obj, df) assert grouped.ngroups == 2 expected = { diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 05d56ab45b0cd..73638eba0a3b3 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -172,9 +172,7 @@ def test_grouper_dropna_propagation(dropna): # GH 36604 df = 
pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) gb = df.groupby("A", dropna=dropna) - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert gb.grouper.dropna == dropna + assert gb._grouper.dropna == dropna @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index ab8e3c39b55b2..363ff883385db 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -428,9 +428,13 @@ def test_grouper_getting_correct_binner(self): tm.assert_frame_equal(result, expected) def test_grouper_iter(self, df): + gb = df.groupby("A") msg = "DataFrameGroupBy.grouper is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - assert sorted(df.groupby("A").grouper) == ["bar", "foo"] + grouper = gb.grouper + result = sorted(grouper) + expected = ["bar", "foo"] + assert result == expected def test_empty_groups(self, df): # see gh-1048 @@ -439,10 +443,10 @@ def test_empty_groups(self, df): def test_groupby_grouper(self, df): grouped = df.groupby("A") - msg = "DataFrameGroupBy.grouper is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(grouped.grouper).mean(numeric_only=True) + grouper = grouped.grouper + result = df.groupby(grouper).mean(numeric_only=True) expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) @@ -759,9 +763,7 @@ def test_grouping_labels(self, multiindex_dataframe_random_data): multiindex_dataframe_random_data.index.get_level_values(0) ) exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) + tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels) def test_list_grouper_with_nat(self): # GH 14715 @@ -820,26 +822,25 @@ def test_groupby_empty(self): tm.assert_series_equal(result, expected) # check group properties - msg = "SeriesGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert len(gr.grouper.groupings) == 1 - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_numpy_array_equal( - gr.grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) - ) + assert len(gr._grouper.groupings) == 1 + tm.assert_numpy_array_equal( + gr._grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) + ) - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_numpy_array_equal( - gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) - ) + tm.assert_numpy_array_equal( + gr._grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) + ) - with tm.assert_produces_warning(FutureWarning, match=msg): - assert gr.grouper.group_info[2] == 0 + assert gr._grouper.group_info[2] == 0 # check name + gb = s.groupby(s) msg = "SeriesGroupBy.grouper is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - assert s.groupby(s).grouper.names == ["name"] + grouper = gb.grouper + result = grouper.names + expected = ["name"] + assert result == expected def test_groupby_level_index_value_all_na(self): # issue 20519 @@ -1037,10 +1038,8 @@ def test_grouping_is_iterable(self, tsframe): grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year]) # test it works - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - for 
g in grouped.grouper.groupings[0]: - pass + for g in grouped._grouper.groupings[0]: + pass def test_multi_iter(self): s = Series(np.arange(6)) @@ -1176,9 +1175,7 @@ def test_grouping_string_repr(self): df = DataFrame([[1, 2, 3]], columns=mi) gr = df.groupby(df[("A", "a")]) - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gr.grouper.groupings[0].__repr__() + result = gr._grouper.groupings[0].__repr__() expected = "Grouping(('A', 'a'))" assert result == expected @@ -1187,11 +1184,8 @@ def test_grouping_by_key_is_in_axis(): # GH#50413 - Groupers specified by key are in-axis df = DataFrame({"a": [1, 1, 2], "b": [1, 1, 2], "c": [3, 4, 5]}).set_index("a") gb = df.groupby([Grouper(level="a"), Grouper(key="b")], as_index=False) - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert not gb.grouper.groupings[0].in_axis - with tm.assert_produces_warning(FutureWarning, match=msg): - assert gb.grouper.groupings[1].in_axis + assert not gb._grouper.groupings[0].in_axis + assert gb._grouper.groupings[1].in_axis # Currently only in-axis groupings are including in the result when as_index=False; # This is likely to change in the future. @@ -1216,9 +1210,7 @@ def test_grouper_groups(): msg = "Use GroupBy.grouper instead" with tm.assert_produces_warning(FutureWarning, match=msg): res = grper.grouper - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert res is gb.grouper + assert res is gb._grouper msg = "Grouper.obj is deprecated and will be removed" with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index f889bac8d1a41..d357a65e79796 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -67,9 +67,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): gb = df.groupby(tdg) # check we're testing the case we're interested in - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq) + assert len(gb._grouper.result_index) != len(gb._grouper.group_keys_seq) return gb @@ -157,9 +155,7 @@ def test_groupby_with_timegrouper_methods(self, should_sort): g = df.groupby(Grouper(freq="6ME")) assert g.group_keys - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert isinstance(g.grouper, BinGrouper) + assert isinstance(g._grouper, BinGrouper) groups = g.groups assert isinstance(groups, dict) assert len(groups) == 3 diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index b1b8348fb63dd..a2ecd6c65db60 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1353,9 +1353,7 @@ def func(grp): # Check that the fastpath raises, see _transform_general obj = gb._obj_with_exclusions - msg = "DataFrameGroupBy.grouper is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gen = gb.grouper.get_iterator(obj, axis=gb.axis) + gen = gb._grouper.get_iterator(obj, axis=gb.axis) fast_path, slow_path = gb._define_paths(func) _, group = next(gen) From 6509fcd54d75e84735cf45a9dfe9530f31043289 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: 
Mon, 18 Dec 2023 08:47:58 -0500
Subject: [PATCH 4/4] Restore test

---
 pandas/tests/groupby/test_groupby.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 94263df8a93a0..fce7caa90cce4 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+import decimal
 from decimal import Decimal
 import re

@@ -3303,3 +3304,23 @@ def test_groupby_ffill_with_duplicated_index():
     result = df.groupby(level=0).ffill()
     expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
     tm.assert_frame_equal(result, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+def test_decimal_na_sort(test_series):
+    # GH#54847
+    # We catch both TypeError and decimal.InvalidOperation exceptions in safe_sort.
+    # If this next assert raises, we can just catch TypeError
+    assert not isinstance(decimal.InvalidOperation, TypeError)
+    df = DataFrame(
+        {
+            "key": [Decimal(1), Decimal(1), None, None],
+            "value": [Decimal(2), Decimal(3), Decimal(4), Decimal(5)],
+        }
+    )
+    gb = df.groupby("key", dropna=False)
+    if test_series:
+        gb = gb["value"]
+    result = gb._grouper.result_index
+    expected = Index([Decimal(1), None], name="key")
+    tm.assert_index_equal(result, expected)
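
Reviewer note: a minimal sketch of the user-facing effect of this series, assuming a build with these four patches applied. The frame and the `df`/`gb` names below are illustrative only; the warning text and the `_grouper` attribute are taken from PATCH 1/4.

```python
import warnings

import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
gb = df.groupby("a")

# Touching the public accessor now warns; the message comes from the new
# BaseGroupBy.grouper property added in PATCH 1/4.
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    grouper = gb.grouper
assert any(
    issubclass(x.category, FutureWarning)
    and "DataFrameGroupBy.grouper is deprecated" in str(x.message)
    for x in w
)

# The deprecated property still returns the internal ops.BaseGrouper, so
# downstream code keeps working until the attribute is removed.
assert grouper is gb._grouper
```

Downstream code that reached for `.grouper` only to get at `groups`, `indices`, or `ngroups` can use those public GroupBy properties directly; PATCH 1/4 reroutes them through the private `_grouper` internally.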