Skip to content

Commit

Permalink
BUG/PERF: groupby.transform with unobserved categories (#58084)
Browse files Browse the repository at this point in the history
  • Loading branch information
undermyumbrella1 authored May 8, 2024
1 parent ca55d77 commit 8d543ba
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 19 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)


Reshaping
Expand Down
48 changes: 32 additions & 16 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1875,24 +1875,40 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):

else:
# i.e. func in base.reduction_kernels
if self.observed:
return self._reduction_kernel_transform(
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
)

# GH#30918 Use _transform_fast only when we know func is an aggregation
# If func is a reduction, we need to broadcast the
# result to the whole group. Compute func result
# and deal with possible broadcasting below.
with com.temp_setattr(self, "as_index", True):
# GH#49834 - result needs groups in the index for
# _wrap_transform_fast_result
if func in ["idxmin", "idxmax"]:
func = cast(Literal["idxmin", "idxmax"], func)
result = self._idxmax_idxmin(func, True, *args, **kwargs)
else:
if engine is not None:
kwargs["engine"] = engine
kwargs["engine_kwargs"] = engine_kwargs
result = getattr(self, func)(*args, **kwargs)
with (
com.temp_setattr(self, "observed", True),
com.temp_setattr(self, "_grouper", self._grouper.observed_grouper),
):
return self._reduction_kernel_transform(
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
)

@final
def _reduction_kernel_transform(
    self, func, *args, engine=None, engine_kwargs=None, **kwargs
):
    """
    Transform using a reduction kernel: compute the per-group reduction,
    then broadcast the result back to the rows of each group.

    Parameters
    ----------
    func : str
        Name of the reduction kernel (e.g. "sum", "min", "idxmax").
    engine, engine_kwargs
        Optional numba engine settings, forwarded to the reduction call.
    *args, **kwargs
        Forwarded to the reduction.
    """
    # GH#30918 Use _transform_fast only when we know func is an aggregation
    # If func is a reduction, we need to broadcast the
    # result to the whole group. Compute func result
    # and deal with possible broadcasting below.
    with com.temp_setattr(self, "as_index", True):
        # GH#49834 - result needs groups in the index for
        # _wrap_transform_fast_result
        if func in ["idxmin", "idxmax"]:
            func = cast(Literal["idxmin", "idxmax"], func)
            result = self._idxmax_idxmin(func, True, *args, **kwargs)
        else:
            if engine is not None:
                kwargs["engine"] = engine
                kwargs["engine_kwargs"] = engine_kwargs
            result = getattr(self, func)(*args, **kwargs)

    # NOTE: the scraped diff showed this return line twice (removed old
    # line + added new line); the method has exactly one return, outside
    # the temp_setattr context so as_index is restored first.
    return self._wrap_transform_fast_result(result)

@final
def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
Expand Down
22 changes: 22 additions & 0 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,28 @@ def groups(self) -> dict[Hashable, Index]:
cats = Categorical.from_codes(codes, uniques, validate=False)
return self._index.groupby(cats)

@property
def observed_grouping(self) -> Grouping:
    """
    Return this grouping with observed semantics.

    A grouping that is already observed is returned as-is; otherwise a
    cached ``observed=True`` twin is returned.
    """
    return self if self._observed else self._observed_grouping

@cache_readonly
def _observed_grouping(self) -> Grouping:
    # Build (once, via cache_readonly) the observed=True twin of this
    # grouping; every other construction parameter is carried over.
    return Grouping(
        self._index,
        self._orig_grouper,
        obj=self.obj,
        level=self.level,
        sort=self._sort,
        observed=True,
        in_axis=self.in_axis,
        dropna=self._dropna,
        uniques=self._uniques,
    )


def get_grouper(
obj: NDFrameT,
Expand Down
17 changes: 17 additions & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,6 +823,19 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:

return result_index, ids

@property
def observed_grouper(self) -> BaseGrouper:
    """
    Return a grouper whose groupings are all observed.

    If any grouping is unobserved, a cached observed twin is returned;
    otherwise this grouper already qualifies and is returned unchanged.
    """
    if any(not ping._observed for ping in self.groupings):
        return self._observed_grouper
    return self

@cache_readonly
def _observed_grouper(self) -> BaseGrouper:
    # Cached observed twin: convert each grouping via observed_grouping,
    # keeping this grouper's axis / sort / dropna settings.
    observed_groupings = [gping.observed_grouping for gping in self.groupings]
    return BaseGrouper(
        self.axis, observed_groupings, sort=self._sort, dropna=self.dropna
    )

def _ob_index_and_ids(
self,
levels: list[Index],
Expand Down Expand Up @@ -1154,6 +1167,10 @@ def groupings(self) -> list[grouper.Grouping]:
)
return [ping]

@property
def observed_grouper(self) -> BinGrouper:
    # A BinGrouper groups by bins, not by categoricals, so there are no
    # unobserved categories to drop — it is already "observed".
    return self


def _is_indexed_like(obj, axes) -> bool:
if isinstance(obj, Series):
Expand Down
51 changes: 48 additions & 3 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -1232,9 +1232,9 @@ def test_categorical_and_not_categorical_key(observed):
tm.assert_frame_equal(result, expected_explicit)

# Series case
result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform(
"sum"
)
gb = df_with_categorical.groupby(["A", "C"], observed=observed)
gbp = gb["B"]
result = gbp.transform("sum")
expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum")
tm.assert_series_equal(result, expected)
expected_explicit = Series([4, 2, 4], name="B")
Expand Down Expand Up @@ -1535,3 +1535,48 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels():
result = df.groupby(series, as_index=False).transform("sum")
expected = DataFrame({"X": [-93203.0, -93203.0, np.nan]})
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["int32", "float32"])
def test_min_one_unobserved_category_no_type_coercion(dtype):
# GH#58084
df = DataFrame({"A": Categorical([1, 1, 2], categories=[1, 2, 3]), "B": [3, 4, 5]})
df["B"] = df["B"].astype(dtype)
gb = df.groupby("A", observed=False)
result = gb.transform("min")

expected = DataFrame({"B": [3, 3, 5]}, dtype=dtype)
tm.assert_frame_equal(expected, result)


def test_min_all_empty_data_no_type_coercion():
# GH#58084
df = DataFrame(
{
"X": Categorical(
[],
categories=[1, "randomcat", 100],
),
"Y": [],
}
)
df["Y"] = df["Y"].astype("int32")

gb = df.groupby("X", observed=False)
result = gb.transform("min")

expected = DataFrame({"Y": []}, dtype="int32")
tm.assert_frame_equal(expected, result)


def test_min_one_dim_no_type_coercion():
# GH#58084
df = DataFrame({"Y": [9435, -5465765, 5055, 0, 954960]})
df["Y"] = df["Y"].astype("int32")
categories = Categorical([1, 2, 2, 5, 1], categories=[1, 2, 3, 4, 5])

gb = df.groupby(categories, observed=False)
result = gb.transform("min")

expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32")
tm.assert_frame_equal(expected, result)

0 comments on commit 8d543ba

Please sign in to comment.