Skip to content

Commit

Permalink
BUG: GroupBy.value_counts sorting order (#56016)
Browse files Browse the repository at this point in the history
* BUG: GroupBy.value_counts sorting order

* Whatsnew

* cleanup

* Fix categorical and add test

* cleanup
  • Loading branch information
rhshadrach authored Nov 17, 2023
1 parent c95e943 commit dbf8aaf
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 19 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,9 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_count` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`56007`)
- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_count` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`56007`)
- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_count` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`56007`)

Reshaping
^^^^^^^^^
Expand Down
27 changes: 18 additions & 9 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2821,11 +2821,27 @@ def _value_counts(
for grouping in groupings
):
levels_list = [ping.result_index for ping in groupings]
multi_index, _ = MultiIndex.from_product(
multi_index = MultiIndex.from_product(
levels_list, names=[ping.name for ping in groupings]
).sortlevel()
)
result_series = result_series.reindex(multi_index, fill_value=0)

if sort:
# Sort by the values
result_series = result_series.sort_values(
ascending=ascending, kind="stable"
)
if self.sort:
# Sort by the groupings
names = result_series.index.names
# GH#56007 - Temporarily replace names in case they are integers
result_series.index.names = range(len(names))
index_level = list(range(len(self.grouper.groupings)))
result_series = result_series.sort_index(
level=index_level, sort_remaining=False
)
result_series.index.names = names

if normalize:
# Normalize the results by dividing by the original group sizes.
# We are guaranteed to have the first N levels be the
Expand All @@ -2845,13 +2861,6 @@ def _value_counts(
# Handle groups of non-observed categories
result_series = result_series.fillna(0.0)

if sort:
# Sort the values and then resort by the main grouping
index_level = range(len(self.grouper.groupings))
result_series = result_series.sort_values(ascending=ascending).sort_index(
level=index_level, sort_remaining=False
)

result: Series | DataFrame
if self.as_index:
result = result_series
Expand Down
82 changes: 72 additions & 10 deletions pandas/tests/groupby/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,8 +385,8 @@ def test_against_frame_and_seriesgroupby(
"sort, ascending, expected_rows, expected_count, expected_group_size",
[
(False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
(True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
(True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
(True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]),
(True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]),
],
)
def test_compound(
Expand Down Expand Up @@ -811,19 +811,19 @@ def test_categorical_single_grouper_observed_false(
("FR", "female", "high"),
("FR", "male", "medium"),
("FR", "female", "low"),
("FR", "male", "high"),
("FR", "female", "medium"),
("FR", "male", "high"),
("US", "female", "high"),
("US", "male", "low"),
("US", "male", "medium"),
("US", "male", "high"),
("US", "female", "medium"),
("US", "female", "low"),
("ASIA", "male", "low"),
("ASIA", "male", "high"),
("ASIA", "female", "medium"),
("ASIA", "female", "low"),
("US", "female", "medium"),
("US", "male", "high"),
("US", "male", "medium"),
("ASIA", "female", "high"),
("ASIA", "female", "low"),
("ASIA", "female", "medium"),
("ASIA", "male", "high"),
("ASIA", "male", "low"),
("ASIA", "male", "medium"),
]

Expand Down Expand Up @@ -1177,3 +1177,65 @@ def test_value_counts_integer_columns():
{1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1}
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("vc_sort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
def test_value_counts_sort(sort, vc_sort, normalize):
# GH#55951
df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]})
gb = df.groupby("a", sort=sort)
result = gb.value_counts(sort=vc_sort, normalize=normalize)

if normalize:
values = [2 / 3, 1 / 3, 1.0]
else:
values = [2, 1, 1]
index = MultiIndex(
levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0]
)
expected = Series(values, index=index, name="proportion" if normalize else "count")
if sort and vc_sort:
taker = [0, 1, 2]
elif sort and not vc_sort:
taker = [0, 1, 2]
elif not sort and vc_sort:
taker = [0, 2, 1]
else:
taker = [2, 1, 0]
expected = expected.take(taker)

tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("vc_sort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
def test_value_counts_sort_categorical(sort, vc_sort, normalize):
# GH#55951
df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category")
gb = df.groupby("a", sort=sort, observed=True)
result = gb.value_counts(sort=vc_sort, normalize=normalize)

if normalize:
values = [2 / 3, 1 / 3, 1.0, 0.0]
else:
values = [2, 1, 1, 0]
name = "proportion" if normalize else "count"
expected = DataFrame(
{
"a": Categorical([1, 1, 2, 2]),
0: Categorical([3, 4, 3, 4]),
name: values,
}
).set_index(["a", 0])[name]
if sort and vc_sort:
taker = [0, 1, 2, 3]
elif sort and not vc_sort:
taker = [0, 1, 2, 3]
elif not sort and vc_sort:
taker = [0, 2, 1, 3]
else:
taker = [2, 3, 0, 1]
expected = expected.take(taker)

tm.assert_series_equal(result, expected)

0 comments on commit dbf8aaf

Please sign in to comment.