From c118e1f26f49005f1a03e3b4a5f9b3a30b122928 Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 23 Feb 2024 18:19:43 -0500 Subject: [PATCH 1/2] PERF: groupby(...).__len__ --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/conftest.py | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/test_groupby.py | 23 +++++++++++++++++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d44824183f05f..4dcbe74c5d868 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -178,6 +178,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``) +- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:``) .. --------------------------------------------------------------------------- .. _whatsnew_300.bug_fixes: diff --git a/pandas/conftest.py b/pandas/conftest.py index 3add32293ea06..730b84fdc6201 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -271,7 +271,7 @@ def axis(request): return request.param -@pytest.fixture(params=[True, False, None]) +@pytest.fixture(params=[True, False]) def observed(request): """ Pass in the observed keyword to groupby for [True, False] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 668c9863612d8..9768a932f73f6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -596,7 +596,7 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): @final def __len__(self) -> int: - return len(self.groups) + return self._grouper.ngroups @final def __repr__(self) -> str: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 46a307e2f28d9..dade1a985257d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -149,6 +149,29 @@ def test_len_nan_group(): assert len(df.groupby(["a", "b"])) == 0 +@pytest.mark.parametrize("keys", [["a"], ["a", "b"]]) +def test_len_categorical(dropna, observed, keys): + # GH#?? + df = DataFrame( + { + "a": Categorical([1, 1, 2, np.nan], categories=[1, 2, 3]), + "b": Categorical([1, 1, 2, np.nan], categories=[1, 2, 3]), + "c": 1, + } + ) + gb = df.groupby(keys, observed=observed, dropna=dropna) + result = len(gb) + if observed and dropna: + expected = 2 + elif observed and not dropna: + expected = 3 + elif len(keys) == 1: + expected = 3 if dropna else 4 + else: + expected = 9 if dropna else 16 + assert result == expected, f"{result} vs {expected}" + + def test_basic_regression(): # regression result = Series([1.0 * x for x in list(range(1, 10)) * 10]) From 496763b9bad2769b98868e4623a314b1f8fd8432 Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 23 Feb 2024 18:21:33 -0500 Subject: [PATCH 2/2] GH# --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4dcbe74c5d868..35c88d544c985 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -178,7 +178,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``) -- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:``) +- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) .. --------------------------------------------------------------------------- .. _whatsnew_300.bug_fixes: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index dade1a985257d..d02e22c29159f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -151,7 +151,7 @@ def test_len_nan_group(): @pytest.mark.parametrize("keys", [["a"], ["a", "b"]]) def test_len_categorical(dropna, observed, keys): - # GH#?? + # GH#57595 df = DataFrame( { "a": Categorical([1, 1, 2, np.nan], categories=[1, 2, 3]),