Skip to content

Commit

Permalink
Implement masked algorithm for mode (#55340)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Nov 18, 2023
1 parent 47a596e commit 1969079
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 9 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ Performance improvements
- Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`)
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
- Performance improvement in :meth:`Series.str` methods (:issue:`55736`)
- Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`)
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
- Performance improvement when indexing into a non-unique index (:issue:`55816`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
Expand Down
10 changes: 7 additions & 3 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -404,12 +404,13 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
cdef:
ndarray[htfunc_t] keys
ndarray[htfunc_t] modes
ndarray[uint8_t] res_mask = None

int64_t[::1] counts
int64_t count, _, max_count = -1
Py_ssize_t nkeys, k, j = 0
Py_ssize_t nkeys, k, na_counter, j = 0

keys, counts, _ = value_count(values, dropna, mask=mask)
keys, counts, na_counter = value_count(values, dropna, mask=mask)
nkeys = len(keys)

modes = np.empty(nkeys, dtype=values.dtype)
Expand Down Expand Up @@ -440,7 +441,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):

modes[j] = keys[k]

return modes[:j + 1]
if na_counter > 0:
res_mask = np.zeros(j+1, dtype=np.bool_)
res_mask[j] = True
return modes[:j + 1], res_mask


{{py:
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1034,7 +1034,10 @@ def mode(

values = _ensure_data(values)

npresult = htable.mode(values, dropna=dropna, mask=mask)
npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask)
if res_mask is not None:
return npresult, res_mask # type: ignore[return-value]

try:
npresult = np.sort(npresult)
except TypeError as err:
Expand Down
10 changes: 10 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
from pandas.core.algorithms import (
factorize_array,
isin,
mode,
take,
)
from pandas.core.array_algos import (
Expand Down Expand Up @@ -1069,6 +1070,15 @@ def value_counts(self, dropna: bool = True) -> Series:
)
return Series(arr, index=index, name="count", copy=False)

def _mode(self, dropna: bool = True) -> Self:
    """
    Compute the mode(s) of this masked array.

    Parameters
    ----------
    dropna : bool, default True
        Forwarded unchanged to ``mode`` together with the array's mask.

    Returns
    -------
    Self
        A new array of ``type(self)`` built from the mode values and a
        boolean mask, reordered by its own ``argsort``.
    """
    outcome = mode(self._data, dropna=dropna, mask=self._mask)
    if isinstance(outcome, tuple):
        # ``mode`` supplied a result mask alongside the values.
        values, res_mask = outcome
    else:
        # ``mode`` only returns a ``(values, mask)`` pair when it has a
        # result mask to report; otherwise a bare array comes back (e.g.
        # ``dropna=False`` on data without missing entries). Unpacking
        # that array into two names would fail, or silently mis-assign
        # when there are exactly two modes, so nothing is masked here.
        values = outcome
        res_mask = np.zeros(values.shape, dtype=np.bool_)
    result = type(self)(values, res_mask)  # type: ignore[arg-type]
    return result[result.argsort()]

@doc(ExtensionArray.equals)
def equals(self, other) -> bool:
if type(self) != type(other):
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,21 +644,21 @@ def test_mode(self, dtype, writable):
values = np.repeat(np.arange(N).astype(dtype), 5)
values[0] = 42
values.flags.writeable = writable
result = ht.mode(values, False)
result = ht.mode(values, False)[0]
assert result == 42

def test_mode_stable(self, dtype, writable):
values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
values.flags.writeable = writable
keys = ht.mode(values, False)
keys = ht.mode(values, False)[0]
tm.assert_numpy_array_equal(keys, values)


def test_modes_with_nans():
# GH42688, nans aren't mangled
nulls = [pd.NA, np.nan, pd.NaT, None]
values = np.array([True] + nulls * 2, dtype=np.object_)
modes = ht.mode(values, False)
modes = ht.mode(values, False)[0]
assert modes.size == len(nulls)


Expand Down Expand Up @@ -724,8 +724,8 @@ def test_ismember_no(self, dtype):

def test_mode(self, dtype):
values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype)
assert ht.mode(values, True) == 42
assert np.isnan(ht.mode(values, False))
assert ht.mode(values, True)[0] == 42
assert np.isnan(ht.mode(values, False)[0])


def test_ismember_tuple_with_nans():
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/series/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,28 @@ def test_mode_extension_dtype(as_period):
tm.assert_series_equal(res, ser)


def test_mode_nullable_dtype(any_numeric_ea_dtype):
    """Series.mode on nullable numeric dtypes, with and without dropna."""
    # GH#55340
    dtype = any_numeric_ea_dtype

    def check_mode(obj, dropna, wanted):
        # Compare ``obj.mode(dropna=...)`` with a Series of the same dtype.
        observed = obj.mode(dropna=dropna)
        tm.assert_series_equal(observed, Series(wanted, dtype=dtype))

    ser = Series([1, 3, 2, pd.NA, 3, 2, pd.NA], dtype=dtype)

    # 2, 3 and NA each occur twice, so all three tie when NA is kept.
    check_mode(ser, False, [2, 3, pd.NA])
    check_mode(ser, True, [2, 3])

    # NOTE(review): presumably -1 is taken as a new label on the default
    # index, which appends a third NA rather than overwriting the last
    # element -- the expectations below need NA to outnumber 2 and 3.
    ser[-1] = pd.NA

    check_mode(ser, True, [2, 3])
    check_mode(ser, False, [pd.NA])


def test_reductions_td64_with_nat():
# GH#8617
ser = Series([0, pd.NaT], dtype="m8[ns]")
Expand Down

0 comments on commit 1969079

Please sign in to comment.