Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: groupby.idxmax/idxmin consistently raise on unobserved categorical #55268

Merged
merged 11 commits into from
Oct 8, 2023
2 changes: 1 addition & 1 deletion .github/workflows/code-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ jobs:
run: |
cd asv_bench
asv machine --yes
asv run --quick --dry-run --durations=30 --python=same
asv run --quick --dry-run --durations=30 --python=same --show-stderr
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved

build_docker_dev_environment:
name: Build Docker Dev Environment
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ Plotting

Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
-
- Bug in :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax`, and :meth:`SeriesGroupBy.idxmin` would not consistently raise when grouping with ``observed=False`` and unobserved categoricals (:issue:`10694`)
-

Reshaping
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11896,7 +11896,7 @@ def _logical_func(

def any(
self,
axis: Axis = 0,
axis: Axis | None = 0,
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
bool_only: bool_t = False,
skipna: bool_t = True,
**kwargs,
Expand Down
34 changes: 16 additions & 18 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1185,15 +1185,23 @@ def nsmallest(
def idxmin(
self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
) -> Series:
result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
return result.astype(self.obj.index.dtype) if result.empty else result
if axis is not lib.no_default:
axis = self.obj._get_axis_number(axis)
self._deprecate_axis(axis, "idxmin")
else:
axis = 0
return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna)

@doc(Series.idxmax.__doc__)
def idxmax(
self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
) -> Series:
result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
return result.astype(self.obj.index.dtype) if result.empty else result
if axis is not lib.no_default:
axis = self.obj._get_axis_number(axis)
self._deprecate_axis(axis, "idxmax")
else:
axis = 0
return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna)

@doc(Series.corr.__doc__)
def corr(
Expand Down Expand Up @@ -2195,14 +2203,9 @@ def idxmax(
else:
axis = self.axis

def func(df):
return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)

func.__name__ = "idxmax"
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
return self._idxmax_idxmin(
"idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna
)
return result.astype(self.obj.index.dtype) if result.empty else result

def idxmin(
self,
Expand Down Expand Up @@ -2290,14 +2293,9 @@ def idxmin(
else:
axis = self.axis

def func(df):
return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)

func.__name__ = "idxmin"
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
return self._idxmax_idxmin(
"idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna
)
return result.astype(self.obj.index.dtype) if result.empty else result

boxplot = boxplot_frame_groupby

Expand Down
113 changes: 109 additions & 4 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2015,10 +2015,16 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
with com.temp_setattr(self, "as_index", True):
# GH#49834 - result needs groups in the index for
# _wrap_transform_fast_result
if engine is not None:
kwargs["engine"] = engine
kwargs["engine_kwargs"] = engine_kwargs
result = getattr(self, func)(*args, **kwargs)
if func in ["idxmin", "idxmax"]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it not possible to stick with the same engine pattern that is in place for this? At first glance I'm wondering what makes these different from min/max that requires branching here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the same issue as I asked about somewhere else: _idxmax_idxmin accepts ignore_unobserved while idxmin/idxmax do not.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main difference in behavior is that min/max do not raise on unobserved categories, while idxmin and idxmax do.

Example - agg
df = pd.DataFrame({'a': pd.Categorical([1, 1, 2], categories=[1, 2, 3]), 'b': [3, 4, 5]})
gb = df.groupby('a', observed=False)
result = gb.min()
print(result)
#      b
# a     
# 1  3.0
# 2  5.0
# 3  NaN

However, the fact that we don't do something special for min/max means that transform unnecessarily coerces to float:

Example - transform
df = pd.DataFrame({'a': pd.Categorical([1, 1, 2], categories=[1, 2, 3]), 'b': [3, 4, 5]})
gb = df.groupby('a', observed=False)
result = gb.transform('min')
print(result)
#      b
# 0  3.0
# 1  3.0
# 2  5.0

I consider this a bug in min/max with transform.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've opened #55326 to track

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah OK thanks - that is helpful context. So do you see the solution for min/max being the same as what you have for idxmin/idxmax here?

I think the broader issue is that we've wanted over time to move away from branching for function specializations within groupby. If that still holds true then I wonder what prevents us from sticking with the existing kwargs interface to solve both this PR and eventually solve min/max's issue

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to explore different solutions for min/max and other aggregations, but I don't know what that could be at this time.

I don't understand what you're suggesting with kwargs.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I thought the engine / engine_kwargs were specialized arguments for each function implementation, but I see now those are meant for numba.

The numba functions are UDFs right? I'm assuming from the branch here that we would never want to pass numba arguments to _idxmax_idxmin

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct - we never get here when using numba.

func = cast(Literal["idxmin", "idxmax"], func)
result = self._idxmax_idxmin(
func, ignore_unobserved=True, axis=self.axis, **kwargs
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We definitely don't need *args here right? Seems like something could get discarded compared to the other branch

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ack - thanks! Fixed and test added.

)
else:
if engine is not None:
kwargs["engine"] = engine
kwargs["engine_kwargs"] = engine_kwargs
result = getattr(self, func)(*args, **kwargs)

return self._wrap_transform_fast_result(result)

Expand Down Expand Up @@ -5720,6 +5726,105 @@ def sample(
sampled_indices = np.concatenate(sampled_indices)
return self._selected_obj.take(sampled_indices, axis=self.axis)

def _idxmax_idxmin(
self,
how: Literal["idxmax", "idxmin"],
axis: Axis = 0,
numeric_only: bool = False,
skipna: bool = True,
ignore_unobserved: bool = False,
):
"""Compute idxmax/idxmin.

Parameters
----------
how: {"idxmin", "idxmax"}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: I think there is a missing space between "how" and the colon.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, usually (but not always) we have single quotes inside these — no idea why, and I genuinely don't care, especially since this is private.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks - fixed in #54234

Whether to compute idxmin or idxmax.
axis : {{0 or 'index', 1 or 'columns'}}, default None
The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
If axis is not provided, grouper's axis is used.
numeric_only : bool, default False
Include only float, int, boolean columns.
skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.
ignore_unobserved : bool, default False
        When True and an unobserved group is encountered, do not raise. This is
        used for transform, where unobserved groups do not have an impact on the result.

Returns
-------
Series or DataFrame
idxmax or idxmin for the groupby operation.
"""
if not self.observed and any(
ping._passed_categorical for ping in self.grouper.groupings
):
expected_len = np.prod(
[len(ping.group_index) for ping in self.grouper.groupings]
)
if len(self.grouper.groupings) == 1:
result_len = len(self.grouper.groupings[0].grouping_vector.unique())
else:
# result_index only contains observed groups in this case
result_len = len(self.grouper.result_index)
assert result_len <= expected_len
has_unobserved = result_len < expected_len

raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved
# Only raise an error if there are columns to compute; otherwise we return
# an empty DataFrame with an index (possibly including unobserved) but no
# columns
data = self._obj_with_exclusions
if raise_err and isinstance(data, DataFrame):
if numeric_only:
data = data._get_numeric_data()
raise_err = len(data.columns) > 0
else:
raise_err = False
if raise_err:
raise ValueError(
f"Can't get {how} of an empty group due to unobserved categories. "
"Specify observed=True in groupby instead."
)

try:
if self.obj.ndim == 1:
result = self._op_via_apply(how, skipna=skipna)
else:

def func(df):
method = getattr(df, how)
return method(axis=axis, skipna=skipna, numeric_only=numeric_only)

func.__name__ = how
result = self._python_apply_general(
func, self._obj_with_exclusions, not_indexed_same=True
)
except ValueError as err:
name = "argmax" if how == "idxmax" else "argmin"
if f"attempt to get {name} of an empty sequence" in str(err):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would this be simpler if we just changed "arg" to "idx" in the Cython method, with a comment as to why we are using an apparently-wrong message?

Copy link
Member Author

@rhshadrach rhshadrach Oct 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This arises from a call to NumPy's argmin in nanops.nanargmin:

pd.Series().idxmin()
# ValueError: attempt to get argmin of an empty sequence

In #54234, this remains only for the axis=1 case, and so once that deprecation is enforced, this code will be removed entirely.

raise ValueError(
f"Can't get {how} of an empty group due to unobserved categories. "
"Specify observed=True in groupby instead."
) from None
raise

result = result.astype(self.obj.index.dtype) if result.empty else result

if not skipna:
has_na_value = result.isnull().any(axis=None)
if has_na_value:
warnings.warn(
f"The behavior of {type(self).__name__}.{how} with all-NA "
"values, or any-NA and skipna=False, is deprecated. In a future "
"version this will raise ValueError",
FutureWarning,
stacklevel=find_stack_level(),
)

return result


@doc(GroupBy)
def get_groupby(
Expand Down
52 changes: 44 additions & 8 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1416,6 +1416,15 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed):
return

agg = getattr(series_groupby, reduction_func)

if not observed and reduction_func in ["idxmin", "idxmax"]:
# idxmin and idxmax are designed to fail on empty inputs
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
agg(*args)
return

result = agg(*args)

assert len(result) == expected_length
Expand Down Expand Up @@ -1448,6 +1457,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(

series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
agg = getattr(series_groupby, reduction_func)

if reduction_func in ["idxmin", "idxmax"]:
# idxmin and idxmax are designed to fail on empty inputs
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
agg(*args)
return

result = agg(*args)

zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]
Expand Down Expand Up @@ -1514,6 +1532,15 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)

args = get_groupby_method_args(reduction_func, df)

if not observed and reduction_func in ["idxmin", "idxmax"]:
# idxmin and idxmax are designed to fail on empty inputs
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
getattr(df_grp, reduction_func)(*args)
return

res = getattr(df_grp, reduction_func)(*args)

expected = _results_for_groupbys_with_missing_categories[reduction_func]
Expand Down Expand Up @@ -1883,14 +1910,7 @@ def test_category_order_reducer(
request, as_index, sort, observed, reduction_func, index_kind, ordered
):
# GH#48749
if (
reduction_func in ("idxmax", "idxmin")
and not observed
and index_kind != "multi"
):
msg = "GH#10694 - idxmax/min fail with unused categories"
request.node.add_marker(pytest.mark.xfail(reason=msg))
elif reduction_func == "corrwith" and not as_index:
if reduction_func == "corrwith" and not as_index:
msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
request.node.add_marker(pytest.mark.xfail(reason=msg))
elif index_kind != "range" and not as_index:
Expand All @@ -1912,6 +1932,15 @@ def test_category_order_reducer(
df = df.set_index(keys)
args = get_groupby_method_args(reduction_func, df)
gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)

if not observed and reduction_func in ["idxmin", "idxmax"]:
# idxmin and idxmax are designed to fail on empty inputs
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
getattr(gb, reduction_func)(*args)
return

op_result = getattr(gb, reduction_func)(*args)
if as_index:
result = op_result.index.get_level_values("a").categories
Expand Down Expand Up @@ -2114,6 +2143,13 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys
gb = gb["b"]
args = get_groupby_method_args(reduction_func, df)

if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]:
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
gb.agg([reduction_func], *args)
return

result = gb.agg([reduction_func], *args)
expected = getattr(gb, reduction_func)(*args)

Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,39 @@ def test_idxmin_idxmax_axis1():
gb2.idxmax(axis=1)


@pytest.mark.parametrize(
"func, values, expected_values, warn",
[
("idxmin", [0, 1, 2], [0, 2], None),
("idxmax", [0, 1, 2], [1, 2], None),
("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning),
("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning),
("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning),
("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning),
],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series):
# GH#54234
df = DataFrame(
{
"a": [1, 1, 2],
"b": values,
}
)
gb = df.groupby("a")
index = Index([1, 2], name="a")
expected = DataFrame({"b": expected_values}, index=index)
if test_series:
gb = gb["b"]
expected = expected["b"]
klass = "Series" if test_series else "DataFrame"
msg = f"The behavior of {klass}GroupBy.{func} with all-NA values"
with tm.assert_produces_warning(warn, match=msg):
result = getattr(gb, func)(skipna=False)
tm.assert_equal(result, expected)


@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only):
if groupby_func in ("idxmax", "idxmin"):
Expand Down
31 changes: 14 additions & 17 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2001,22 +2001,10 @@ def test_pivot_table_values_key_error():
@pytest.mark.parametrize(
"op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"]
)
def test_empty_groupby(
columns, keys, values, method, op, request, using_array_manager, dropna
):
def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna):
# GH8093 & GH26411
override_dtype = None

if (
isinstance(values, Categorical)
and len(keys) == 1
and op in ["idxmax", "idxmin"]
):
mark = pytest.mark.xfail(
raises=ValueError, match="attempt to get arg(min|max) of an empty sequence"
)
request.node.add_marker(mark)

if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
# We expect to get Int64 back for these
override_dtype = "Int64"
Expand Down Expand Up @@ -2061,12 +2049,21 @@ def get_categorical_invalid_expected():
is_dt64 = df.dtypes.iloc[0].kind == "M"
is_cat = isinstance(values, Categorical)

if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]:
msg = f"Cannot perform {op} with non-ordered Categorical"
with pytest.raises(TypeError, match=msg):
if (
isinstance(values, Categorical)
and not values.ordered
and op in ["min", "max", "idxmin", "idxmax"]
):
if op in ["min", "max"]:
msg = f"Cannot perform {op} with non-ordered Categorical"
klass = TypeError
else:
msg = f"Can't get {op} of an empty group due to unobserved categories"
klass = ValueError
with pytest.raises(klass, match=msg):
get_result()

if isinstance(columns, list):
if op in ["min", "max"] and isinstance(columns, list):
# i.e. DataframeGroupBy, not SeriesGroupBy
result = get_result(numeric_only=True)
expected = get_categorical_invalid_expected()
Expand Down
Loading