diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 3bb61e7ce6215..2c39318fa28b3 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -213,6 +213,11 @@ Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
 - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` that would not respect groupby arguments ``dropna`` and ``sort`` (:issue:`55919`, :issue:`56966`, :issue:`56851`)
+- Bug in :meth:`.DataFrameGroupBy.nunique` and :meth:`.SeriesGroupBy.nunique` that would fail with multiple categorical groupings when ``as_index=False`` (:issue:`52848`)
+- Bug in :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.any`, and :meth:`.DataFrameGroupBy.all` that would result in NA values on unobserved groups; they now result in ``1``, ``False``, and ``True`` respectively (:issue:`55783`)
+- Bug in :meth:`.DataFrameGroupBy.value_counts` that would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`)
+-
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 448d052ed9531..f68a5f605e331 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -411,7 +411,6 @@ def _wrap_applied_output(
             # GH #823 #24880
             index = self._grouper.result_index
             res_df = self.obj._constructor_expanddim(values, index=index)
-            res_df = self._reindex_output(res_df)
             # if self.observed is False,
             # keep all-NaN rows created while re-indexing
             res_ser = res_df.stack(future_stack=True)
@@ -437,7 +436,7 @@ def _wrap_applied_output(
         if not self.as_index:
             result = self._insert_inaxis_grouper(result)
             result.index = default_index(len(result))
-        return self._reindex_output(result)
+        return result
 
     def _aggregate_named(self, func, *args, **kwargs):
         # Note: this is very similar to _aggregate_series_pure_python,
@@ -658,7 +657,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
         2023-02-01    1
         Freq: MS, dtype: int64
         """
-        ids, _, ngroups = self._grouper.group_info
+        ids, ngroups = self._grouper.group_info
         val = self.obj._values
 
         codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False)
@@ -691,7 +690,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
         if not self.as_index:
             result = self._insert_inaxis_grouper(result)
             result.index = default_index(len(result))
-        return self._reindex_output(result, fill_value=0)
+        return result
 
     @doc(Series.describe)
     def describe(self, percentiles=None, include=None, exclude=None) -> Series:
@@ -719,7 +718,7 @@ def value_counts(
         from pandas.core.reshape.merge import get_join_indexers
         from pandas.core.reshape.tile import cut
 
-        ids, _, _ = self._grouper.group_info
+        ids, _ = self._grouper.group_info
         val = self.obj._values
 
         index_names = self._grouper.names + [self.obj.name]
@@ -789,9 +788,18 @@ def value_counts(
         rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
 
         # multi-index components
-        codes = self._grouper.reconstructed_codes
+        if isinstance(self._grouper.result_index, MultiIndex):
+            codes = list(self._grouper.result_index.codes)
+        else:
+            codes = [
+                algorithms.factorize(
+                    self._grouper.result_index,
+                    sort=self._grouper._sort,
+                    use_na_sentinel=self._grouper.dropna,
+                )[0]
+ ] codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping._group_index for ping in self._grouper.groupings] + [lev] + levels = self._grouper.levels + [lev] if dropna: mask = codes[-1] != -1 @@ -834,7 +842,7 @@ def value_counts( # ndarray[Any, Any]], Index, Series]] _, idx = get_join_indexers( left, # type: ignore[arg-type] - right, # type: ignore[arg-type] + right, sort=False, how="left", ) @@ -1605,7 +1613,7 @@ def _wrap_applied_output_series( if not self.as_index: result = self._insert_inaxis_grouper(result) - return self._reindex_output(result) + return result def _cython_transform( self, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 67e25531990ec..1440bd0adfd26 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -50,7 +50,6 @@ class providing the base-class of operations. NDFrameT, PositionalIndexer, RandomState, - Scalar, npt, ) from pandas.compat.numpy import function as nv @@ -98,7 +97,6 @@ class providing the base-class of operations. from pandas.core.arrays import ( ArrowExtensionArray, BaseMaskedArray, - Categorical, ExtensionArray, FloatingArray, IntegerArray, @@ -127,7 +125,6 @@ class providing the base-class of operations. GroupByNthSelector, ) from pandas.core.indexes.api import ( - CategoricalIndex, Index, MultiIndex, RangeIndex, @@ -794,7 +791,7 @@ def __repr__(self) -> str: @final @property - def groups(self) -> dict[Hashable, np.ndarray]: + def groups(self) -> dict[Hashable, Index]: """ Dict {group name -> group labels}. @@ -1459,7 +1456,7 @@ def _set_result_index_ordered( return result # row order is scrambled => sort the rows by position in original index - original_positions = Index(self._grouper.result_ilocs()) + original_positions = Index(self._grouper.result_ilocs) result = result.set_axis(original_positions, axis=0, copy=False) result = result.sort_index(axis=0) if self._grouper.has_dropped_na: @@ -1536,10 +1533,9 @@ def _wrap_aggregated_output( # We get here with len(qs) != 1 and not self.as_index # in test_pass_args_kwargs index = _insert_quantile_level(index, qs) - result.index = index - return self._reindex_output(result, qs=qs) + return result def _wrap_applied_output( self, @@ -1555,8 +1551,8 @@ def _wrap_applied_output( @final def _numba_prep(self, data: DataFrame): - ids, _, ngroups = self._grouper.group_info - sorted_index = self._grouper._sort_idx + ids, ngroups = self._grouper.group_info + sorted_index = self._grouper.result_ilocs sorted_ids = self._grouper._sorted_ids sorted_data = data.take(sorted_index, axis=0).to_numpy() @@ -1607,7 +1603,7 @@ def _numba_agg_general( ) # Pass group ids to kernel directly if it can handle it # (This is faster since it doesn't require a sort) - ids, _, _ = self._grouper.group_info + ids, _ = self._grouper.group_info ngroups = self._grouper.ngroups res_mgr = df._mgr.apply( @@ -1973,7 +1969,7 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: obj = self._obj_with_exclusions # for each col, reshape to size of original frame by take operation - ids, _, _ = self._grouper.group_info + ids = self._grouper.ids result = result.reindex(self._grouper.result_index, axis=0, copy=False) if self.obj.ndim == 1: @@ -2025,7 +2021,7 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, _, ngroups = self._grouper.group_info + ids, ngroups = self._grouper.group_info sorter = 
get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2234,7 +2230,7 @@ def count(self) -> NDFrameT: Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, _, ngroups = self._grouper.group_info + ids, ngroups = self._grouper.group_info mask = ids != -1 is_series = data.ndim == 1 @@ -2265,15 +2261,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: new_mgr = data.grouped_reduce(hfunc) new_obj = self._wrap_agged_manager(new_mgr) + result = self._wrap_aggregated_output(new_obj) - # If we are grouping on categoricals we want unobserved categories to - # return zero, rather than the default of NaN which the reindexing in - # _wrap_aggregated_output() returns. GH 35028 - # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false - with com.temp_setattr(self, "observed", True): - result = self._wrap_aggregated_output(new_obj) - - return self._reindex_output(result, fill_value=0) + return result @final @Substitution(name="groupby") @@ -2738,19 +2728,6 @@ def _value_counts( result_series = cast(Series, gb.size()) result_series.name = name - # GH-46357 Include non-observed categories - # of non-grouping columns regardless of `observed` - if any( - isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) - and not grouping._observed - for grouping in groupings - ): - levels_list = [ping._result_index for ping in groupings] - multi_index = MultiIndex.from_product( - levels_list, names=[ping.name for ping in groupings] - ) - result_series = result_series.reindex(multi_index, fill_value=0) - if sort: # Sort by the values result_series = result_series.sort_values( @@ -2977,10 +2954,6 @@ def size(self) -> DataFrame | Series: dtype_backend=dtype_backend, ) - with com.temp_setattr(self, "as_index", True): - # size already has the desired behavior in GH#49519, but this makes the - # as_index=False path of _reindex_output fail on categorical groupers. 
- result = self._reindex_output(result, fill_value=0) if not self.as_index: # error: Incompatible types in assignment (expression has # type "DataFrame", variable has type "Series") @@ -3058,7 +3031,7 @@ def sum( npfunc=np.sum, ) - return self._reindex_output(result, fill_value=0) + return result @final @doc( @@ -3476,7 +3449,7 @@ def ohlc(self) -> DataFrame: result = self.obj._constructor_expanddim( res_values, index=self._grouper.result_index, columns=agg_names ) - return self._reindex_output(result) + return result result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) return result @@ -3852,7 +3825,7 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): if limit is None: limit = -1 - ids, _, ngroups = self._grouper.group_info + ids, ngroups = self._grouper.group_info col_func = partial( libgroupby.group_fillna_indexer, @@ -4174,7 +4147,7 @@ def _nth( if not dropna: mask = self._make_mask_from_positional_indexer(n) - ids, _, _ = self._grouper.group_info + ids, _ = self._grouper.group_info # Drop NA values in grouping mask = mask & (ids != -1) @@ -4378,7 +4351,10 @@ def post_processor( qs = np.array([q], dtype=np.float64) pass_qs = None - ids, _, ngroups = self._grouper.group_info + ids, ngroups = self._grouper.group_info + if self.dropna: + # splitter drops NA groups, we need to do the same + ids = ids[ids >= 0] nqs = len(qs) func = partial( @@ -5009,7 +4985,7 @@ def shift( else: if fill_value is lib.no_default: fill_value = None - ids, _, ngroups = self._grouper.group_info + ids, ngroups = self._grouper.group_info res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) @@ -5326,99 +5302,6 @@ def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: mask = mask & (ids != -1) return self._selected_obj[mask] - @final - def _reindex_output( - self, - output: OutputFrameOrSeries, - fill_value: Scalar = np.nan, - qs: npt.NDArray[np.float64] | None = None, - ) -> OutputFrameOrSeries: - """ - If we have categorical groupers, then we might want to make sure that - we have a fully re-indexed output to the levels. This means expanding - the output space to accommodate all values in the cartesian product of - our groups, regardless of whether they were observed in the data or - not. This will expand the output space if there are missing groups. - - The method returns early without modifying the input if the number of - groupings is less than 2, self.observed == True or none of the groupers - are categorical. - - Parameters - ---------- - output : Series or DataFrame - Object resulting from grouping and applying an operation. - fill_value : scalar, default np.nan - Value to use for unobserved categories if self.observed is False. - qs : np.ndarray[float64] or None, default None - quantile values, only relevant for quantile. - - Returns - ------- - Series or DataFrame - Object (potentially) re-indexed to include all possible groups. 
- """ - groupings = self._grouper.groupings - if len(groupings) == 1: - return output - - # if we only care about the observed values - # we are done - elif self.observed: - return output - - # reindexing only applies to a Categorical grouper - elif not any( - isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) - for ping in groupings - ): - return output - - levels_list = [ping._group_index for ping in groupings] - names = self._grouper.names - if qs is not None: - # error: Argument 1 to "append" of "list" has incompatible type - # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" - levels_list.append(qs) # type: ignore[arg-type] - names = names + [None] - index = MultiIndex.from_product(levels_list, names=names) - if self.sort: - index = index.sort_values() - - if self.as_index: - # Always holds for SeriesGroupBy unless GH#36507 is implemented - return output.reindex(index=index, copy=False, fill_value=fill_value) - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `output`. An idea is to do: - # output = output.set_index(self._grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `output`, and then reset the in-axis grouper columns. - - # Select in-axis groupers - in_axis_grps = [ - (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis - ] - if len(in_axis_grps) > 0: - g_nums, g_names = zip(*in_axis_grps) - output = output.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - output = output.set_index(self._grouper.result_index).reindex( - index, copy=False, fill_value=fill_value - ) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - if len(in_axis_grps) > 0: - output = output.reset_index(level=g_nums) - - return output.reset_index(drop=True) - @final def sample( self, @@ -5576,14 +5459,10 @@ def _idxmax_idxmin( if not self.observed and any( ping._passed_categorical for ping in self._grouper.groupings ): - expected_len = np.prod( - [len(ping._group_index) for ping in self._grouper.groupings] - ) - if len(self._grouper.groupings) == 1: - result_len = len(self._grouper.groupings[0].grouping_vector.unique()) - else: - # result_index only contains observed groups in this case - result_len = len(self._grouper.result_index) + expected_len = len(self._grouper.result_index) + # TODO: Better way to find # of observed groups? 
+ group_sizes = self._grouper.size() + result_len = group_sizes[group_sizes > 0].shape[0] assert result_len <= expected_len has_unobserved = result_len < expected_len diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 1e6658e5dfd39..827c44736c6c0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -35,7 +35,6 @@ from pandas.core.groupby import ops from pandas.core.groupby.categorical import recode_for_groupby from pandas.core.indexes.api import ( - CategoricalIndex, Index, MultiIndex, ) @@ -639,7 +638,7 @@ def _ilevel(self) -> int | None: @property def ngroups(self) -> int: - return len(self._group_index) + return len(self.uniques) @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @@ -654,89 +653,9 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: def codes(self) -> npt.NDArray[np.signedinteger]: return self._codes_and_uniques[0] - @cache_readonly - def _group_arraylike(self) -> ArrayLike: - """ - Analogous to result_index, but holding an ArrayLike to ensure - we can retain ExtensionDtypes. - """ - if self._all_grouper is not None: - # retain dtype for categories, including unobserved ones - return self._result_index._values - - elif self._passed_categorical: - return self._group_index._values - - return self._codes_and_uniques[1] - - @property - def group_arraylike(self) -> ArrayLike: - """ - Analogous to result_index, but holding an ArrayLike to ensure - we can retain ExtensionDtypes. - """ - warnings.warn( - "group_arraylike is deprecated and will be removed in a future " - "version of pandas", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - return self._group_arraylike - - @cache_readonly - def _result_index(self) -> Index: - # result_index retains dtype for categories, including unobserved ones, - # which group_index does not - if self._all_grouper is not None: - group_idx = self._group_index - assert isinstance(group_idx, CategoricalIndex) - cats = self._orig_cats - # set_categories is dynamically added - return group_idx.set_categories(cats) # type: ignore[attr-defined] - return self._group_index - - @property - def result_index(self) -> Index: - warnings.warn( - "result_index is deprecated and will be removed in a future " - "version of pandas", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - return self._result_index - - @cache_readonly - def _group_index(self) -> Index: - codes, uniques = self._codes_and_uniques - if not self._dropna and self._passed_categorical: - assert isinstance(uniques, Categorical) - if self._sort and (codes == len(uniques)).any(): - # Add NA value on the end when sorting - uniques = Categorical.from_codes( - np.append(uniques.codes, [-1]), uniques.categories, validate=False - ) - elif len(codes) > 0: - # Need to determine proper placement of NA value when not sorting - cat = self.grouping_vector - na_idx = (cat.codes < 0).argmax() - if cat.codes[na_idx] < 0: - # count number of unique codes that comes before the nan value - na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx]) - new_codes = np.insert(uniques.codes, na_unique_idx, -1) - uniques = Categorical.from_codes( - new_codes, uniques.categories, validate=False - ) - return Index._with_infer(uniques, name=self.name) - @property - def group_index(self) -> Index: - warnings.warn( - "group_index is deprecated and will be removed in a future " - "version of pandas", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - return self._group_index + def 
uniques(self) -> ArrayLike: + return self._codes_and_uniques[1] @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: @@ -756,29 +675,31 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: else: ucodes = np.arange(len(categories)) - uniques = Categorical.from_codes( - codes=ucodes, categories=categories, ordered=cat.ordered, validate=False - ) - - codes = cat.codes + has_dropped_na = False if not self._dropna: - na_mask = codes < 0 + na_mask = cat.isna() if np.any(na_mask): + has_dropped_na = True if self._sort: - # Replace NA codes with `largest code + 1` + # NA goes at the end, gets `largest non-NA code + 1` na_code = len(categories) - codes = np.where(na_mask, na_code, codes) else: - # Insert NA code into the codes based on first appearance - # A negative code must exist, no need to check codes[na_idx] < 0 + # Insert NA in result based on first appearance, need + # the number of unique codes prior na_idx = na_mask.argmax() - # count number of unique codes that comes before the nan value - na_code = algorithms.nunique_ints(codes[:na_idx]) - codes = np.where(codes >= na_code, codes + 1, codes) - codes = np.where(na_mask, na_code, codes) + na_code = algorithms.nunique_ints(cat.codes[:na_idx]) + ucodes = np.insert(ucodes, na_code, -1) + + uniques = Categorical.from_codes( + codes=ucodes, categories=categories, ordered=cat.ordered, validate=False + ) + codes = cat.codes - if not self._observed: - uniques = uniques.reorder_categories(self._orig_cats) + if has_dropped_na: + if not self._sort: + # NA code is based on first appearance, increment higher codes + codes = np.where(codes >= na_code, codes + 1, codes) + codes = np.where(na_mask, na_code, codes) return codes, uniques @@ -802,8 +723,10 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: return codes, uniques @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: - cats = Categorical.from_codes(self.codes, self._group_index, validate=False) + def groups(self) -> dict[Hashable, Index]: + codes, uniques = self._codes_and_uniques + uniques = Index._with_infer(uniques, name=self.name) + cats = Categorical.from_codes(codes, uniques, validate=False) return self._index.groupby(cats) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 632ff7356d1c7..46ef0f38706bc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -49,6 +49,7 @@ maybe_fill, ) +from pandas.core.arrays import Categorical from pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -61,7 +62,6 @@ from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, - get_flattened_list, get_group_index, get_group_index_sorter, get_indexer_dict, @@ -614,7 +614,8 @@ def get_iterator(self, data: NDFrameT) -> Iterator[tuple[Hashable, NDFrameT]]: for each group """ splitter = self._get_splitter(data) - keys = self.group_keys_seq + # TODO: Would be more efficient to skip unobserved for transforms + keys = self.result_index yield from zip(keys, splitter) @final @@ -624,26 +625,15 @@ def _get_splitter(self, data: NDFrame) -> DataSplitter: ------- Generator yielding subsetted objects """ - ids, _, ngroups = self.group_info + ids, ngroups = self.group_info return _get_splitter( data, ids, ngroups, sorted_ids=self._sorted_ids, - sort_idx=self._sort_idx, + sort_idx=self.result_ilocs, ) - @final - @cache_readonly - def group_keys_seq(self): - if len(self.groupings) 
== 1: - return self.levels[0] - else: - ids, _, ngroups = self.group_info - - # provide "flattened" iterator for multi-group setting - return get_flattened_list(ids, ngroups, self.levels, self.codes) - @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" @@ -651,10 +641,10 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: # This shows unused categories in indices GH#38642 return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] - keys = [ping._group_index for ping in self.groupings] - return get_indexer_dict(codes_list, keys) + return get_indexer_dict(codes_list, self.levels) @final + @cache_readonly def result_ilocs(self) -> npt.NDArray[np.intp]: """ Get the original integer locations of result_index in the input. @@ -662,18 +652,15 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: # Original indices are where group_index would go via sorting. # But when dropna is true, we need to remove null values while accounting for # any gaps that then occur because of them. - group_index = get_group_index( - self.codes, self.shape, sort=self._sort, xnull=True - ) - group_index, _ = compress_group_index(group_index, sort=self._sort) + ids = self.ids if self.has_dropped_na: - mask = np.where(group_index >= 0) + mask = np.where(ids >= 0) # Count how many gaps are caused by previous null values for each position - null_gaps = np.cumsum(group_index == -1)[mask] - group_index = group_index[mask] + null_gaps = np.cumsum(ids == -1)[mask] + ids = ids[mask] - result = get_group_index_sorter(group_index, self.ngroups) + result = get_group_index_sorter(ids, self.ngroups) if self.has_dropped_na: # Shift by the number of prior null gaps @@ -681,14 +668,17 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: return result - @final @property def codes(self) -> list[npt.NDArray[np.signedinteger]]: return [ping.codes for ping in self.groupings] @property def levels(self) -> list[Index]: - return [ping._group_index for ping in self.groupings] + if len(self.groupings) > 1: + # mypy doesn't know result_index must be a MultiIndex + return list(self.result_index.levels) # type: ignore[attr-defined] + else: + return [self.result_index] @property def names(self) -> list[Hashable]: @@ -699,7 +689,7 @@ def size(self) -> Series: """ Compute group sizes. 
""" - ids, _, ngroups = self.group_info + ids, ngroups = self.group_info out: np.ndarray | list if ngroups: out = np.bincount(ids[ids != -1], minlength=ngroups) @@ -708,20 +698,19 @@ def size(self) -> Series: return Series(out, index=self.result_index, dtype="int64", copy=False) @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: + def groups(self) -> dict[Hashable, Index]: """dict {group name -> group labels}""" if len(self.groupings) == 1: return self.groupings[0].groups - else: - to_groupby = [] - for ping in self.groupings: - gv = ping.grouping_vector - if not isinstance(gv, BaseGrouper): - to_groupby.append(gv) - else: - to_groupby.append(gv.groupings[0].grouping_vector) - index = MultiIndex.from_arrays(to_groupby) - return self.axis.groupby(index) + result_index, ids = self.result_index_and_ids + values = result_index._values + categories = Categorical(ids, categories=np.arange(len(result_index))) + result = { + # mypy is not aware that group has to be an integer + values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload] + for group, axis_ilocs in categories._reverse_indexer().items() + } + return result @final @cache_readonly @@ -735,73 +724,148 @@ def has_dropped_na(self) -> bool: """ Whether grouper has null value(s) that are dropped. """ - return bool((self.group_info[0] < 0).any()) + return bool((self.ids < 0).any()) @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - comp_ids, obs_group_ids = self._get_compressed_codes() - - ngroups = len(obs_group_ids) - comp_ids = ensure_platform_int(comp_ids) - - return comp_ids, obs_group_ids, ngroups + def group_info(self) -> tuple[npt.NDArray[np.intp], int]: + result_index, ids = self.result_index_and_ids + ngroups = len(result_index) + return ids, ngroups @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis - ids, _, _ = self.group_info + ids, _ = self.group_info return ids - @final - def _get_compressed_codes( - self, - ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: - # The first returned ndarray may have any signed integer dtype - if len(self.groupings) > 1: - group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) - return compress_group_index(group_index, sort=self._sort) - # FIXME: compress_group_index's second return value is int64, not intp - - ping = self.groupings[0] - return ping.codes, np.arange(len(ping._group_index), dtype=np.intp) - @final @cache_readonly def ngroups(self) -> int: return len(self.result_index) @property - def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: - codes = self.codes - ids, obs_ids, _ = self.group_info - return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + def result_index(self) -> Index: + return self.result_index_and_ids[0] + + @property + def ids(self) -> npt.NDArray[np.intp]: + return self.result_index_and_ids[1] @cache_readonly - def result_index(self) -> Index: + def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: + levels = [Index._with_infer(ping.uniques) for ping in self.groupings] + obs = [ + ping._observed or not ping._passed_categorical for ping in self.groupings + ] + # When passed a categorical grouping, keep all categories + for k, (ping, level) in enumerate(zip(self.groupings, levels)): + if ping._passed_categorical: + levels[k] = level.set_categories(ping._orig_cats) + if len(self.groupings) == 1: - return 
self.groupings[0]._result_index.rename(self.names[0]) + result_index = levels[0] + result_index.name = self.names[0] + ids = ensure_platform_int(self.codes[0]) + elif all(obs): + result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names) + elif not any(obs): + result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names) + else: + # Combine unobserved and observed parts + names = self.names + codes = [ping.codes for ping in self.groupings] + ob_indices = [idx for idx, ob in enumerate(obs) if ob] + unob_indices = [idx for idx, ob in enumerate(obs) if not ob] + ob_index, ob_ids = self._ob_index_and_ids( + levels=[levels[idx] for idx in ob_indices], + codes=[codes[idx] for idx in ob_indices], + names=[names[idx] for idx in ob_indices], + ) + unob_index, unob_ids = self._unob_index_and_ids( + levels=[levels[idx] for idx in unob_indices], + codes=[codes[idx] for idx in unob_indices], + names=[names[idx] for idx in unob_indices], + ) + + result_index_codes = np.concatenate( + [ + np.tile(unob_index.codes, len(ob_index)), + np.repeat(ob_index.codes, len(unob_index), axis=1), + ], + axis=0, + ) + _, index = np.unique(unob_indices + ob_indices, return_index=True) + result_index = MultiIndex( + levels=list(unob_index.levels) + list(ob_index.levels), + codes=result_index_codes, + names=list(unob_index.names) + list(ob_index.names), + ).reorder_levels(index) + ids = len(unob_index) * ob_ids + unob_ids + + if self._sort: + # Sort result_index and recode ids using the new order + sorter = result_index.argsort() + result_index = result_index.take(sorter) + _, index = np.unique(sorter, return_index=True) + ids = ensure_platform_int(ids) + ids = index.take(ids) + else: + # Recode ids and reorder result_index with observed groups up front, + # unobserved at the end + ids, uniques = compress_group_index(ids, sort=False) + ids = ensure_platform_int(ids) + taker = np.concatenate( + [uniques, np.delete(np.arange(len(result_index)), uniques)] + ) + result_index = result_index.take(taker) + + return result_index, ids - codes = self.reconstructed_codes - levels = [ping._result_index for ping in self.groupings] - return MultiIndex( - levels=levels, codes=codes, verify_integrity=False, names=self.names + def _ob_index_and_ids( + self, + levels: list[Index], + codes: list[npt.NDArray[np.intp]], + names: list[Hashable], + ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: + shape = tuple(len(level) for level in levels) + group_index = get_group_index(codes, shape, sort=True, xnull=True) + ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort) + ob_ids = ensure_platform_int(ob_ids) + ob_index_codes = decons_obs_group_ids( + ob_ids, obs_group_ids, shape, codes, xnull=True + ) + ob_index = MultiIndex( + levels=levels, + codes=ob_index_codes, + names=names, + verify_integrity=False, ) + ob_ids = ensure_platform_int(ob_ids) + return ob_index, ob_ids + + def _unob_index_and_ids( + self, + levels: list[Index], + codes: list[npt.NDArray[np.intp]], + names: list[Hashable], + ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: + shape = tuple(len(level) for level in levels) + unob_ids = get_group_index(codes, shape, sort=True, xnull=True) + unob_index = MultiIndex.from_product(levels, names=names) + unob_ids = ensure_platform_int(unob_ids) + return unob_index, unob_ids @final - def get_group_levels(self) -> list[ArrayLike]: + def get_group_levels(self) -> list[Index]: # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper + 
result_index = self.result_index if len(self.groupings) == 1: - return [self.groupings[0]._group_arraylike] - - name_list = [] - for ping, codes in zip(self.groupings, self.reconstructed_codes): - codes = ensure_platform_int(codes) - levels = ping._group_arraylike.take(codes) - - name_list.append(levels) - - return name_list + return [result_index] + return [ + result_index.get_level_values(level) + for level in range(result_index.nlevels) + ] # ------------------------------------------------------------ # Aggregation functions @@ -823,14 +887,12 @@ def _cython_operation( cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na) - ids, _, _ = self.group_info - ngroups = self.ngroups return cy_op.cython_operation( values=values, axis=axis, min_count=min_count, - comp_ids=ids, - ngroups=ngroups, + comp_ids=self.ids, + ngroups=self.ngroups, **kwargs, ) @@ -871,7 +933,7 @@ def agg_series( def _aggregate_series_pure_python( self, obj: Series, func: Callable ) -> npt.NDArray[np.object_]: - _, _, ngroups = self.group_info + _, ngroups = self.group_info result = np.empty(ngroups, dtype="O") initialized = False @@ -897,7 +959,7 @@ def apply_groupwise( ) -> tuple[list, bool]: mutated = False splitter = self._get_splitter(data) - group_keys = self.group_keys_seq + group_keys = self.result_index result_values = [] # This calls DataSplitter.__iter__ @@ -933,18 +995,14 @@ def apply_groupwise( # ------------------------------------------------------------ # Methods for sorting subsets of our GroupBy's object - @final - @cache_readonly - def _sort_idx(self) -> npt.NDArray[np.intp]: - # Counting sort indexer - ids, _, ngroups = self.group_info - return get_group_index_sorter(ids, ngroups) - @final @cache_readonly def _sorted_ids(self) -> npt.NDArray[np.intp]: - ids, _, _ = self.group_info - return ids.take(self._sort_idx) + result = self.ids.take(self.result_ilocs) + if getattr(self, "dropna", True): + # BinGrouper has no dropna + result = result[result >= 0] + return result class BinGrouper(BaseGrouper): @@ -1015,7 +1073,7 @@ def nkeys(self) -> int: @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis - ids, _, _ = self.group_info + ids, _ = self.group_info if self.indexer is not None: sorter = np.lexsort((ids, self.indexer)) ids = ids[sorter] @@ -1054,9 +1112,8 @@ def indices(self): return indices @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + def group_info(self) -> tuple[npt.NDArray[np.intp], int]: ngroups = self.ngroups - obs_group_ids = np.arange(ngroups, dtype=np.intp) rep = np.diff(np.r_[0, self.bins]) rep = ensure_platform_int(rep) @@ -1065,16 +1122,7 @@ def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: else: comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) - return ( - ensure_platform_int(comp_ids), - obs_group_ids, - ngroups, - ) - - @cache_readonly - def reconstructed_codes(self) -> list[np.ndarray]: - # get unique result indices, and prepend 0 as groupby starts from the first - return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] + return (ensure_platform_int(comp_ids), ngroups) @cache_readonly def result_index(self) -> Index: @@ -1083,6 +1131,14 @@ def result_index(self) -> Index: return self.binlabels + @cache_readonly + def codes(self) -> list[npt.NDArray[np.intp]]: + return [self.group_info[0]] + + @cache_readonly + def result_index_and_ids(self): + return self.result_index, self.group_info[0] + 
@property def levels(self) -> list[Index]: return [self.binlabels] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0a0d6243e8414..e87ecb1b6011c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6460,7 +6460,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return True @final - def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]: + def groupby(self, values) -> PrettyDict[Hashable, Index]: """ Group the index labels by a given array of values. diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 927f2305045ae..51d91e4113c4e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -734,7 +734,7 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, - observed=False, + observed=dropna, **kwargs, # type: ignore[arg-type] ) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index bfe7d8075f430..24990e64bb51c 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -1204,7 +1204,7 @@ def test_value_counts_sort_categorical(sort, vc_sort, normalize): elif not sort and vc_sort: taker = [0, 2, 1, 3] else: - taker = [2, 3, 0, 1] + taker = [2, 1, 0, 3] expected = expected.take(taker) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 727a77f52fe48..7df1b822e516a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -42,8 +42,8 @@ def f(a): # These expected values can be used across several tests (i.e. they are # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be # hardcoded in one place. - "all": np.nan, - "any": np.nan, + "all": True, + "any": False, "count": 0, "corrwith": np.nan, "first": np.nan, @@ -56,7 +56,7 @@ def f(a): "min": np.nan, "nth": np.nan, "nunique": 0, - "prod": np.nan, + "prod": 1, "quantile": np.nan, "sem": np.nan, "size": 0, @@ -1275,11 +1275,7 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): names=["A", "B"], ).sortlevel() - expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") - if operation == "agg": - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = expected.fillna(0, downcast="infer") + expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] msg = "using SeriesGroupBy.sum" if operation == "agg" else "using np.sum" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -1452,18 +1448,21 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( result = agg(*args) - zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] + missing_fillin = _results_for_groupbys_with_missing_categories[reduction_func] for idx in unobserved: val = result.loc[idx] - assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + assert (pd.isna(missing_fillin) and pd.isna(val)) or (val == missing_fillin) # If we expect unobserved values to be zero, we also expect the dtype to be int. # Except for .sum(). If the observed categories sum to dtype=float (i.e. their # sums have decimals), then the zeros for the missing categories should also be # floats. 
- if zero_or_nan == 0 and reduction_func != "sum": - assert np.issubdtype(result.dtype, np.integer) + if missing_fillin == 0: + if reduction_func in ["count", "nunique", "size"]: + assert np.issubdtype(result.dtype, np.integer) + else: + assert reduction_func in ["sum", "any"] def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func): @@ -2111,15 +2110,6 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys elif reduction_func == "corrwith": msg = "GH#32293: attempts to call SeriesGroupBy.corrwith" request.applymarker(pytest.mark.xfail(reason=msg)) - elif ( - reduction_func == "nunique" - and not test_series - and len(keys) != 1 - and not observed - and not as_index - ): - msg = "GH#52848 - raises a ValueError" - request.applymarker(pytest.mark.xfail(reason=msg)) df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}) df = df.astype({"a1": "category", "a2": "category"}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a06d104e7e44c..db45067162bc3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -149,7 +149,7 @@ def test_len_nan_group(): df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]}) assert len(df.groupby("a")) == 0 assert len(df.groupby("b")) == 3 - assert len(df.groupby(["a", "b"])) == 3 + assert len(df.groupby(["a", "b"])) == 0 def test_basic_regression(): @@ -1929,6 +1929,33 @@ def test_groupby_groups_in_BaseGrouper(): assert result.groups == expected.groups +def test_groups_sort_dropna(sort, dropna): + # GH#56966, GH#56851 + df = DataFrame([[2.0, 1.0], [np.nan, 4.0], [0.0, 3.0]]) + keys = [(2.0, 1.0), (np.nan, 4.0), (0.0, 3.0)] + values = [ + Index([0], dtype="int64"), + Index([1], dtype="int64"), + Index([2], dtype="int64"), + ] + if sort: + taker = [2, 0] if dropna else [2, 0, 1] + else: + taker = [0, 2] if dropna else [0, 1, 2] + expected = {keys[idx]: values[idx] for idx in taker} + + gb = df.groupby([0, 1], sort=sort, dropna=dropna) + result = gb.groups + + for result_key, expected_key in zip(result.keys(), expected.keys()): + # Compare as NumPy arrays to handle np.nan + result_key = np.array(result_key) + expected_key = np.array(expected_key) + tm.assert_numpy_array_equal(result_key, expected_key) + for result_value, expected_value in zip(result.values(), expected.values()): + tm.assert_index_equal(result_value, expected_value) + + @pytest.mark.parametrize( "op, expected", [ diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index ee1df1242442f..27b4508feb314 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -708,7 +708,7 @@ def test_level_preserve_order(self, sort, labels, multiindex_dataframe_random_da # GH 17537 grouped = multiindex_dataframe_random_data.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) - tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels) + tm.assert_almost_equal(grouped._grouper.ids, exp_labels) def test_grouping_labels(self, multiindex_dataframe_random_data): grouped = multiindex_dataframe_random_data.groupby( @@ -779,11 +779,7 @@ def test_groupby_empty(self): gr._grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) ) - tm.assert_numpy_array_equal( - gr._grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) - ) - - assert gr._grouper.group_info[2] == 0 + assert gr._grouper.group_info[1] == 0 # check name gb = s.groupby(s) @@ -1161,13 +1157,3 @@ def 
test_grouper_groups(): msg = "Grouper.indexer is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): grper.indexer - - -@pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"]) -def test_depr_grouping_attrs(attr): - # GH#56148 - df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) - gb = df.groupby("a") - msg = f"{attr} is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - getattr(gb._grouper.groupings[0], attr) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index ee5c61794e96d..e5af61618882c 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -576,16 +576,6 @@ def test_groupby_raises_category_on_category( return empty_groups = not observed and any(group.empty for group in gb.groups.values()) - if ( - not observed - and how != "transform" - and isinstance(by, list) - and isinstance(by[0], str) - and by == ["a", "b"] - ): - assert not empty_groups - # TODO: empty_groups should be true due to unobserved categorical combinations - empty_groups = True if how == "transform": # empty groups will be ignored empty_groups = False diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index b8891da388695..aba3b2f27c633 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -67,7 +67,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): gb = df.groupby(tdg) # check we're testing the case we're interested in - assert len(gb._grouper.result_index) != len(gb._grouper.group_keys_seq) + assert len(gb._grouper.result_index) != len(gb._grouper.codes) return gb diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f020fd45c87d9..11e3b5e205bdc 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -258,9 +258,7 @@ def test_pivot_with_non_observable_dropna(self, dropna): expected = DataFrame( {"B": values}, index=Index( - Categorical.from_codes( - codes, categories=["low", "high"], ordered=dropna - ), + Categorical.from_codes(codes, categories=["low", "high"], ordered=True), name="A", ), )
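
The whatsnew entry for :issue:`55783` above describes the new fill values for unobserved categorical groups. Below is a minimal sketch of the intended behaviour, assuming the patch is applied; the expected values are taken from the whatsnew text, not captured from a run of this branch.

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": pd.Categorical(["a", "a"], categories=["a", "b"]),
            "val": [True, False],
        }
    )
    gb = df.groupby("key", observed=False)["val"]

    # Category "b" is unobserved; previously these reductions returned NaN for it.
    gb.prod()  # expected: 0 for "a", 1 for "b" (multiplicative identity)
    gb.any()   # expected: True for "a", False for "b"
    gb.all()   # expected: False for "a", True for "b"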
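
Similarly, the :issue:`55919` / :issue:`56966` / :issue:`56851` entries and the new ``test_groups_sort_dropna`` test cover ``.groups`` honouring ``sort`` and ``dropna``. A small sketch mirroring that test; the expected key order is inferred from the test rather than from an actual run.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame([[2.0, 1.0], [np.nan, 4.0], [0.0, 3.0]])

    # With sort=True and dropna=True the NaN key is dropped and keys are sorted.
    list(df.groupby([0, 1], sort=True, dropna=True).groups)
    # expected: [(0.0, 3.0), (2.0, 1.0)]

    # With sort=False and dropna=False keys appear in order of first appearance.
    list(df.groupby([0, 1], sort=False, dropna=False).groups)
    # expected: [(2.0, 1.0), (nan, 4.0), (0.0, 3.0)]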