Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove Ambiguous Behavior of Tuple as Grouping #29755

Merged
merged 7 commits into from
Nov 25, 2019
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
- Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`)
- Changed the the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to False (:issue:`27600`)
- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`18529`)
- A tuple passed to :meth:`DataFrame.groupby` is now exclusively treated as a single key (:issue:`18314`)
simonjayhawkins marked this conversation as resolved.
Show resolved Hide resolved
- Removed :meth:`Series.from_array` (:issue:`18258`)
- Removed :meth:`DataFrame.from_items` (:issue:`18458`)
- Removed :meth:`DataFrame.as_matrix`, :meth:`Series.as_matrix` (:issue:`18458`)
Expand Down
17 changes: 15 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ class providing the base-class of operations.
import re
import types
from typing import (
Callable,
Dict,
FrozenSet,
Hashable,
Iterable,
List,
Mapping,
Expand Down Expand Up @@ -343,14 +345,25 @@ def _group_selection_context(groupby):
groupby._reset_group_selection()


KeysArgType = Optional[
simonjayhawkins marked this conversation as resolved.
Show resolved Hide resolved
Union[
Hashable,
List[Hashable],
Callable[[Hashable], Hashable],
List[Callable[[Hashable], Hashable]],
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pretty large signature and maybe worth trimming down / cleaning up (see #22278) but documenting as-is state

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yikes. maybe a comment pointing to the relevant issue?

Mapping[Hashable, Hashable],
]
]


class _GroupBy(PandasObject, SelectionMixin):
_group_selection = None
_apply_whitelist = frozenset() # type: FrozenSet[str]

def __init__(
self,
obj: NDFrame,
keys=None,
keys: KeysArgType = None,
axis: int = 0,
level=None,
grouper: "Optional[ops.BaseGrouper]" = None,
Expand Down Expand Up @@ -2504,7 +2517,7 @@ def _reindex_output(
@Appender(GroupBy.__doc__)
def get_groupby(
obj: NDFrame,
by=None,
by: KeysArgType = None,
axis: int = 0,
level=None,
grouper: "Optional[ops.BaseGrouper]" = None,
Expand Down
24 changes: 0 additions & 24 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"""

from typing import Hashable, List, Optional, Tuple
import warnings

import numpy as np

Expand All @@ -14,7 +13,6 @@
ensure_categorical,
is_categorical_dtype,
is_datetime64_dtype,
is_hashable,
is_list_like,
is_scalar,
is_timedelta64_dtype,
Expand Down Expand Up @@ -514,28 +512,6 @@ def get_grouper(
elif isinstance(key, ops.BaseGrouper):
return key, [], obj

# In the future, a tuple key will always mean an actual key,
# not an iterable of keys. In the meantime, we attempt to provide
# a warning. We can assume that the user wanted a list of keys when
# the key is not in the index. We just have to be careful with
# unhashable elements of `key`. Any unhashable elements implies that
# they wanted a list of keys.
# https://github.com/pandas-dev/pandas/issues/18314
if isinstance(key, tuple):
all_hashable = is_hashable(key)
if (
all_hashable and key not in obj and set(key).issubset(obj)
) or not all_hashable:
# column names ('a', 'b') -> ['a', 'b']
# arrays like (a, b) -> [a, b]
msg = (
"Interpreting tuple 'by' as a list of keys, rather than "
"a single key. Use 'by=[...]' instead of 'by=(...)'. In "
"the future, a tuple will always mean a single key."
)
warnings.warn(msg, FutureWarning, stacklevel=5)
key = list(key)

if not isinstance(key, list):
keys = [key]
match_axis_length = False
Expand Down
29 changes: 9 additions & 20 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1734,34 +1734,23 @@ def test_empty_dataframe_groupby():
tm.assert_frame_equal(result, expected)


def test_tuple_warns():
def test_tuple_as_grouping():
# https://github.com/pandas-dev/pandas/issues/18314
df = pd.DataFrame(
{
("a", "b"): [1, 1, 2, 2],
"a": [1, 1, 1, 2],
"b": [1, 2, 2, 2],
("a", "b"): [1, 1, 1, 1],
"a": [2, 2, 2, 2],
"b": [2, 2, 2, 2],
"c": [1, 1, 1, 1],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why does this dataframe need to change? the rest of the test change makes sense

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't need to just figured easier to disambiguate between this and the respective a / b columns if they have entirely different values

}
)
with tm.assert_produces_warning(FutureWarning) as w:
df[["a", "b", "c"]].groupby(("a", "b")).c.mean()

assert "Interpreting tuple 'by' as a list" in str(w[0].message)
with pytest.raises(KeyError):
df[["a", "b", "c"]].groupby(("a", "b"))

with tm.assert_produces_warning(None):
df.groupby(("a", "b")).c.mean()


def test_tuple_warns_unhashable():
# https://github.com/pandas-dev/pandas/issues/18314
business_dates = date_range(start="4/1/2014", end="6/30/2014", freq="B")
df = DataFrame(1, index=business_dates, columns=["a", "b"])

with tm.assert_produces_warning(FutureWarning) as w:
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])

assert "Interpreting tuple 'by' as a list" in str(w[0].message)
result = df.groupby(("a", "b"))["c"].sum()
expected = pd.Series([4], name="c", index=pd.Index([1], name=("a", "b")))
tm.assert_series_equal(result, expected)


def test_tuple_correct_keyerror():
Expand Down