From 30013ee067e7ebc03ce4e0fe6c0d44c44205f3e4 Mon Sep 17 00:00:00 2001 From: Kei Date: Sat, 20 Apr 2024 16:38:15 +0800 Subject: [PATCH] Move calculation of observed grouper to when initialising groupby --- pandas/core/groupby/generic.py | 6 ++-- pandas/core/groupby/groupby.py | 56 +++++++++++++--------------------- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bbfc35b8e30b7..d298139d72554 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2040,12 +2040,13 @@ def _gotitem(self, key, ndim: int, subset=None): group_keys=self.group_keys, observed=self.observed, dropna=self.dropna, + observed_grouper=self.observed_grouper, + observed_exclusions=self.observed_exclusions, ) elif ndim == 1: if subset is None: subset = self.obj[key] - orig_obj = self.orig_obj if not self.observed else None return SeriesGroupBy( subset, self.keys, @@ -2058,7 +2059,8 @@ def _gotitem(self, key, ndim: int, subset=None): group_keys=self.group_keys, observed=self.observed, dropna=self.dropna, - orig_obj=orig_obj, + observed_grouper=self.observed_grouper, + observed_exclusions=self.observed_exclusions, ) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7661902a65579..c34c3a3861255 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -586,9 +586,10 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): "keys", "level", "obj", - "orig_obj", "observed", "sort", + "observed_grouper", + "observed_exclusions", } _grouper: ops.BaseGrouper @@ -1107,7 +1108,8 @@ def __init__( group_keys: bool = True, observed: bool = False, dropna: bool = True, - orig_obj: NDFrameT | None = None, + observed_grouper: ops.BaseGrouper | None = None, + observed_exclusions: frozenset[Hashable] | None = None, ) -> None: self._selection = selection @@ -1119,8 +1121,8 @@ def __init__( self.sort = sort self.group_keys = group_keys self.dropna = dropna - self.orig_obj = obj if orig_obj is None else orig_obj + orig_obj = obj if grouper is None: grouper, exclusions, obj = get_grouper( obj, @@ -1136,6 +1138,21 @@ def __init__( self._grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() + if not observed and observed_grouper is None: + observed_grouper, observed_exclusions, _ = get_grouper( + orig_obj, + self.keys, + level=self.level, + sort=self.sort, + observed=True, + dropna=self.dropna, + ) + + self.observed_grouper = observed_grouper + self.observed_exclusions = ( + frozenset(observed_exclusions) if observed_exclusions else frozenset() + ) + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) @@ -1887,44 +1904,15 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) - grouper, exclusions, obj = get_grouper( - self.orig_obj, - self.keys, - level=self.level, - sort=self.sort, - observed=True, - dropna=self.dropna, - ) - exclusions = frozenset(exclusions) if exclusions else frozenset() - obj_has_not_changed = self.orig_obj.equals(self.obj) - with ( com.temp_setattr(self, "observed", True), - com.temp_setattr(self, "_grouper", grouper), - com.temp_setattr(self, "exclusions", exclusions), - com.temp_setattr(self, "obj", obj, condition=obj_has_not_changed), + com.temp_setattr(self, "_grouper", self.observed_grouper), + com.temp_setattr(self, "exclusions", self.observed_exclusions), ): return self._reduction_kernel_transform( func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) - # with com.temp_setattr(self, "as_index", True): - # # GH#49834 - result needs groups in the index for - # # _wrap_transform_fast_result - # if func in ["idxmin", "idxmax"]: - # func = cast(Literal["idxmin", "idxmax"], func) - # result = self._idxmax_idxmin(func, True, *args, **kwargs) - # else: - # if engine is not None: - # kwargs["engine"] = engine - # kwargs["engine_kwargs"] = engine_kwargs - # result = getattr(self, func)(*args, **kwargs) - - # print("result with observed = False\n", result.to_string()) - # r = self._wrap_transform_fast_result(result) - # print("reindexed result", r.to_string()) - # return r - @final def _reduction_kernel_transform( self, func, *args, engine=None, engine_kwargs=None, **kwargs