Skip to content

Commit

Permalink
REF: Compute correct result_index upfront in groupby (pandas-dev#55738)
Browse files Browse the repository at this point in the history
* REF: Compute correct result_index upfront in groupby

* Refinements

* Refinements

* Refinements

* Restore inferring index dtype

* Test fixups

* Refinements

* Refinements

* fixup

* fixup

* fixup

* Fix sorting and non-sorting

* Cleanup

* Call ensure_platform_int last

* fixup

* fixup

* REF: Compute correct result_index upfront in groupby

* Add test

* Remove test

* Move unobserved to the end

* cleanup

* cleanup

* cleanup

* Merge fixup

* fixup

* fixup

* Fixup and test

* whatsnew

* type ignore

* Refactor & type annotations

* Better bikeshed
  • Loading branch information
rhshadrach authored and pmhatre1 committed May 7, 2024
1 parent fb148c5 commit 3bb6568
Show file tree
Hide file tree
Showing 14 changed files with 283 additions and 421 deletions.
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,11 @@ Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` that would not respect groupby arguments ``dropna`` and ``sort`` (:issue:`55919`, :issue:`56966`, :issue:`56851`)
- Bug in :meth:`.DataFrameGroupBy.nunique` and :meth:`.SeriesGroupBy.nunique` would fail with multiple categorical groupings when ``as_index=False`` (:issue:`52848`)
- Bug in :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.any`, and :meth:`.DataFrameGroupBy.all` would result in NA values on unobserved groups; they now result in ``1``, ``False``, and ``True`` respectively (:issue:`55783`)
- Bug in :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`)
-

Reshaping
^^^^^^^^^
Expand Down
26 changes: 17 additions & 9 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,6 @@ def _wrap_applied_output(
# GH #823 #24880
index = self._grouper.result_index
res_df = self.obj._constructor_expanddim(values, index=index)
res_df = self._reindex_output(res_df)
# if self.observed is False,
# keep all-NaN rows created while re-indexing
res_ser = res_df.stack(future_stack=True)
Expand All @@ -437,7 +436,7 @@ def _wrap_applied_output(
if not self.as_index:
result = self._insert_inaxis_grouper(result)
result.index = default_index(len(result))
return self._reindex_output(result)
return result

def _aggregate_named(self, func, *args, **kwargs):
# Note: this is very similar to _aggregate_series_pure_python,
Expand Down Expand Up @@ -658,7 +657,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
2023-02-01 1
Freq: MS, dtype: int64
"""
ids, _, ngroups = self._grouper.group_info
ids, ngroups = self._grouper.group_info
val = self.obj._values
codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False)

Expand Down Expand Up @@ -691,7 +690,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
if not self.as_index:
result = self._insert_inaxis_grouper(result)
result.index = default_index(len(result))
return self._reindex_output(result, fill_value=0)
return result

@doc(Series.describe)
def describe(self, percentiles=None, include=None, exclude=None) -> Series:
Expand Down Expand Up @@ -719,7 +718,7 @@ def value_counts(
from pandas.core.reshape.merge import get_join_indexers
from pandas.core.reshape.tile import cut

ids, _, _ = self._grouper.group_info
ids, _ = self._grouper.group_info
val = self.obj._values

index_names = self._grouper.names + [self.obj.name]
Expand Down Expand Up @@ -789,9 +788,18 @@ def value_counts(
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

# multi-index components
codes = self._grouper.reconstructed_codes
if isinstance(self._grouper.result_index, MultiIndex):
codes = list(self._grouper.result_index.codes)
else:
codes = [
algorithms.factorize(
self._grouper.result_index,
sort=self._grouper._sort,
use_na_sentinel=self._grouper.dropna,
)[0]
]
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
levels = [ping._group_index for ping in self._grouper.groupings] + [lev]
levels = self._grouper.levels + [lev]

if dropna:
mask = codes[-1] != -1
Expand Down Expand Up @@ -834,7 +842,7 @@ def value_counts(
# ndarray[Any, Any]], Index, Series]]
_, idx = get_join_indexers(
left, # type: ignore[arg-type]
right, # type: ignore[arg-type]
right,
sort=False,
how="left",
)
Expand Down Expand Up @@ -1605,7 +1613,7 @@ def _wrap_applied_output_series(
if not self.as_index:
result = self._insert_inaxis_grouper(result)

return self._reindex_output(result)
return result

def _cython_transform(
self,
Expand Down
Loading

0 comments on commit 3bb6568

Please sign in to comment.