diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e4dad8800d78f..a81d9b8ec3607 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -457,6 +457,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) +- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) Reshaping diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f44ef8c4dbbfa..4ebc149256336 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1875,24 +1875,40 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): else: # i.e. func in base.reduction_kernels + if self.observed: + return self._reduction_kernel_transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) - # GH#30918 Use _transform_fast only when we know func is an aggregation - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below.
- with com.temp_setattr(self, "as_index", True): - # GH#49834 - result needs groups in the index for - # _wrap_transform_fast_result - if func in ["idxmin", "idxmax"]: - func = cast(Literal["idxmin", "idxmax"], func) - result = self._idxmax_idxmin(func, True, *args, **kwargs) - else: - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - result = getattr(self, func)(*args, **kwargs) + with ( + com.temp_setattr(self, "observed", True), + com.temp_setattr(self, "_grouper", self._grouper.observed_grouper), + ): + return self._reduction_kernel_transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) + + @final + def _reduction_kernel_transform( + self, func, *args, engine=None, engine_kwargs=None, **kwargs + ): + # GH#30918 Use _transform_fast only when we know func is an aggregation + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. 
+ with com.temp_setattr(self, "as_index", True): + # GH#49834 - result needs groups in the index for + # _wrap_transform_fast_result + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin(func, True, *args, **kwargs) + else: + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) - return self._wrap_transform_fast_result(result) + return self._wrap_transform_fast_result(result) @final def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2d10bd5d00eb2..e75a5b9089f5f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -668,6 +668,28 @@ def groups(self) -> dict[Hashable, Index]: cats = Categorical.from_codes(codes, uniques, validate=False) return self._index.groupby(cats) + @property + def observed_grouping(self) -> Grouping: + if self._observed: + return self + + return self._observed_grouping + + @cache_readonly + def _observed_grouping(self) -> Grouping: + grouping = Grouping( + self._index, + self._orig_grouper, + obj=self.obj, + level=self.level, + sort=self._sort, + observed=True, + in_axis=self.in_axis, + dropna=self._dropna, + uniques=self._uniques, + ) + return grouping + def get_grouper( obj: NDFrameT, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index effa94b1606bd..4f40c4f4283f0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -823,6 +823,19 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: return result_index, ids + @property + def observed_grouper(self) -> BaseGrouper: + if all(ping._observed for ping in self.groupings): + return self + + return self._observed_grouper + + @cache_readonly + def _observed_grouper(self) -> BaseGrouper: + groupings = [ping.observed_grouping for ping in self.groupings] + grouper = 
BaseGrouper(self.axis, groupings, sort=self._sort, dropna=self.dropna) + return grouper + def _ob_index_and_ids( self, levels: list[Index], @@ -1154,6 +1167,10 @@ def groupings(self) -> list[grouper.Grouping]: ) return [ping] + @property + def observed_grouper(self) -> BinGrouper: + return self + def _is_indexed_like(obj, axes) -> bool: if isinstance(obj, Series): diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 245fb9c7babd7..d6d545a8c4834 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1232,9 +1232,9 @@ def test_categorical_and_not_categorical_key(observed): tm.assert_frame_equal(result, expected_explicit) # Series case - result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform( - "sum" - ) + gb = df_with_categorical.groupby(["A", "C"], observed=observed) + gbp = gb["B"] + result = gbp.transform("sum") expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum") tm.assert_series_equal(result, expected) expected_explicit = Series([4, 2, 4], name="B") @@ -1535,3 +1535,48 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels(): result = df.groupby(series, as_index=False).transform("sum") expected = DataFrame({"X": [-93203.0, -93203.0, np.nan]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int32", "float32"]) +def test_min_one_unobserved_category_no_type_coercion(dtype): + # GH#58084 + df = DataFrame({"A": Categorical([1, 1, 2], categories=[1, 2, 3]), "B": [3, 4, 5]}) + df["B"] = df["B"].astype(dtype) + gb = df.groupby("A", observed=False) + result = gb.transform("min") + + expected = DataFrame({"B": [3, 3, 5]}, dtype=dtype) + tm.assert_frame_equal(expected, result) + + +def test_min_all_empty_data_no_type_coercion(): + # GH#58084 + df = DataFrame( + { + "X": Categorical( + [], + categories=[1, "randomcat", 100], + ), + "Y": [], 
+ } + ) + df["Y"] = df["Y"].astype("int32") + + gb = df.groupby("X", observed=False) + result = gb.transform("min") + + expected = DataFrame({"Y": []}, dtype="int32") + tm.assert_frame_equal(expected, result) + + +def test_min_one_dim_no_type_coercion(): + # GH#58084 + df = DataFrame({"Y": [9435, -5465765, 5055, 0, 954960]}) + df["Y"] = df["Y"].astype("int32") + categories = Categorical([1, 2, 2, 5, 1], categories=[1, 2, 3, 4, 5]) + + gb = df.groupby(categories, observed=False) + result = gb.transform("min") + + expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32") + tm.assert_frame_equal(expected, result)