diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index a33322aebab34..16cfc3efc1024 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -321,6 +321,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`) - Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) - Performance improvement in :meth:`Series.str` methods (:issue:`55736`) +- Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`) - Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - Performance improvement when indexing into a non-unique index (:issue:`55816`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 19acd4acbdee7..336af306d410f 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -404,12 +404,13 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): cdef: ndarray[htfunc_t] keys ndarray[htfunc_t] modes + ndarray[uint8_t] res_mask = None int64_t[::1] counts int64_t count, _, max_count = -1 - Py_ssize_t nkeys, k, j = 0 + Py_ssize_t nkeys, k, na_counter, j = 0 - keys, counts, _ = value_count(values, dropna, mask=mask) + keys, counts, na_counter = value_count(values, dropna, mask=mask) nkeys = len(keys) modes = np.empty(nkeys, dtype=values.dtype) @@ -440,7 +441,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): modes[j] = keys[k] - return modes[:j + 1] + if na_counter > 0: + res_mask = np.zeros(j+1, dtype=np.bool_) + res_mask[j] = True + return modes[:j + 1], res_mask {{py: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d1fa95ed63d09..b38a27a9f6d0a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1034,7 +1034,10 @@ def mode( values = _ensure_data(values) - npresult = htable.mode(values, dropna=dropna, mask=mask) + npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask) + if res_mask is not None: + return npresult, res_mask # type: ignore[return-value] + try: npresult = np.sort(npresult) except TypeError as err: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c8447397c7bfe..58909643ed46a 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -69,6 +69,7 @@ from pandas.core.algorithms import ( factorize_array, isin, + mode, take, ) from pandas.core.array_algos import ( @@ -1069,6 +1070,15 @@ def value_counts(self, dropna: bool = True) -> Series: ) return Series(arr, index=index, name="count", copy=False) + def _mode(self, dropna: bool = True) -> Self: + if dropna: + result = mode(self._data, dropna=dropna, mask=self._mask) + res_mask = np.zeros(result.shape, dtype=np.bool_) + else: + result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) + result = type(self)(result, res_mask) # type: ignore[arg-type] + return result[result.argsort()] + @doc(ExtensionArray.equals) def equals(self, other) -> bool: if type(self) != type(other): diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 2c8f4c4149528..e54764f9ac4a6 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -644,13 +644,13 @@ def test_mode(self, dtype, writable): values = np.repeat(np.arange(N).astype(dtype), 5) values[0] = 42 values.flags.writeable = writable - result = ht.mode(values, False) + result = ht.mode(values, False)[0] assert result == 42 def test_mode_stable(self, dtype, writable): values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) values.flags.writeable = writable - keys = ht.mode(values, False) + keys = ht.mode(values, False)[0] tm.assert_numpy_array_equal(keys, values) @@ -658,7 +658,7 @@ def test_modes_with_nans(): # GH42688, nans aren't mangled nulls = [pd.NA, np.nan, pd.NaT, None] values = np.array([True] + nulls * 2, dtype=np.object_) - modes = ht.mode(values, False) + modes = ht.mode(values, False)[0] assert modes.size == len(nulls) @@ -724,8 +724,8 @@ def test_ismember_no(self, dtype): def test_mode(self, dtype): values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype) - assert ht.mode(values, True) == 42 - assert np.isnan(ht.mode(values, False)) + assert ht.mode(values, True)[0] == 42 + assert np.isnan(ht.mode(values, False)[0]) def test_ismember_tuple_with_nans(): diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index fbdf843a998bb..f79e58427688b 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -29,6 +29,28 @@ def test_mode_extension_dtype(as_period): tm.assert_series_equal(res, ser) +def test_mode_nullable_dtype(any_numeric_ea_dtype): + # GH#55340 + ser = Series([1, 3, 2, pd.NA, 3, 2, pd.NA], dtype=any_numeric_ea_dtype) + result = ser.mode(dropna=False) + expected = Series([2, 3, pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + result = ser.mode(dropna=True) + expected = Series([2, 3], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser[-1] = pd.NA + + result = ser.mode(dropna=True) + expected = Series([2, 3], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + result = ser.mode(dropna=False) + expected = Series([pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + def test_reductions_td64_with_nat(): # GH#8617 ser = Series([0, pd.NaT], dtype="m8[ns]")