From 6f0cd8d9ed251aa80f7415c565176fc33487110a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:18:59 +0100 Subject: [PATCH] ENH: Implement masked algorithm for value_counts (#54984) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/hashtable.pyi | 2 +- pandas/_libs/hashtable_func_helper.pxi.in | 41 +++++++++++-------- pandas/core/algorithms.py | 8 ++-- pandas/core/arrays/masked.py | 32 ++++++--------- pandas/core/arrays/sparse/array.py | 2 +- pandas/tests/libs/test_hashtable.py | 19 +++++++-- .../tests/series/methods/test_value_counts.py | 19 +++++++++ 8 files changed, 78 insertions(+), 46 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 6ddb1613076ac..9dc095e6de6ff 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -77,6 +77,7 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) +- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 2bc6d74fe6aee..0ac914e86f699 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -240,7 +240,7 @@ def value_count( values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ..., -) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values] +) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... # np.ndarray[same-as-values] # arr and values should have same dtype def ismember( diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index b9cf6011481af..19acd4acbdee7 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -36,7 +36,7 @@ cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None): {{endif}} cdef: - Py_ssize_t i = 0 + Py_ssize_t i = 0, na_counter = 0, na_add = 0 Py_ssize_t n = len(values) kh_{{ttype}}_t *table @@ -49,9 +49,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 bint uses_mask = mask is not None bint isna_entry = False - if uses_mask and not dropna: - raise NotImplementedError("uses_mask not implemented with dropna=False") - # we track the order in which keys are first seen (GH39009), # khash-map isn't insertion-ordered, thus: # table maps keys to counts @@ -82,25 +79,31 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 for i in range(n): val = {{to_c_type}}(values[i]) + if uses_mask: + isna_entry = mask[i] + if dropna: - if uses_mask: - isna_entry = mask[i] - else: + if not uses_mask: isna_entry = is_nan_{{c_type}}(val) if not dropna or not isna_entry: - k = kh_get_{{ttype}}(table, val) - if k != table.n_buckets: - table.vals[k] += 1 + if uses_mask and isna_entry: + na_counter += 1 else: - k = kh_put_{{ttype}}(table, val, &ret) - table.vals[k] = 1 - result_keys.append(val) + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = 1 + result_keys.append(val) {{endif}} # collect counts in the order corresponding to result_keys: + if na_counter > 0: + na_add = 1 cdef: - int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64) + int64_t[::1] result_counts = np.empty(table.size + na_add, dtype=np.int64) for i in range(table.size): {{if dtype == 'object'}} @@ -110,9 +113,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 {{endif}} result_counts[i] = table.vals[k] + if na_counter > 0: + result_counts[table.size] = na_counter + result_keys.append(val) + kh_destroy_{{ttype}}(table) - return result_keys.to_array(), result_counts.base + return result_keys.to_array(), result_counts.base, na_counter @cython.wraparound(False) @@ -399,10 +406,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): ndarray[htfunc_t] modes int64_t[::1] counts - int64_t count, max_count = -1 + int64_t count, _, max_count = -1 Py_ssize_t nkeys, k, j = 0 - keys, counts = value_count(values, dropna, mask=mask) + keys, counts, _ = value_count(values, dropna, mask=mask) nkeys = len(keys) modes = np.empty(nkeys, dtype=values.dtype) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1d74bb8b83e4e..c952178f4c998 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -924,7 +924,7 @@ def value_counts_internal( else: values = _ensure_arraylike(values, func_name="value_counts") - keys, counts = value_counts_arraylike(values, dropna) + keys, counts, _ = value_counts_arraylike(values, dropna) if keys.dtype == np.float16: keys = keys.astype(np.float32) @@ -949,7 +949,7 @@ def value_counts_internal( # Called once from SparseArray, otherwise could be private def value_counts_arraylike( values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None -) -> tuple[ArrayLike, npt.NDArray[np.int64]]: +) -> tuple[ArrayLike, npt.NDArray[np.int64], int]: """ Parameters ---------- @@ -965,7 +965,7 @@ def value_counts_arraylike( original = values values = _ensure_data(values) - keys, counts = htable.value_count(values, dropna, mask=mask) + keys, counts, na_counter = htable.value_count(values, dropna, mask=mask) if needs_i8_conversion(original.dtype): # datetime, timedelta, or period @@ -975,7 +975,7 @@ def value_counts_arraylike( keys, counts = keys[mask], counts[mask] res_keys = _reconstruct_data(keys, original.dtype, original) - return res_keys, counts + return res_keys, counts, na_counter def duplicated( diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index fdcbe67bbc371..9b85fb0477e6f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1044,28 +1044,22 @@ def value_counts(self, dropna: bool = True) -> Series: ) from pandas.arrays import IntegerArray - keys, value_counts = algos.value_counts_arraylike( - self._data, dropna=True, mask=self._mask + keys, value_counts, na_counter = algos.value_counts_arraylike( + self._data, dropna=dropna, mask=self._mask ) + mask_index = np.zeros((len(value_counts),), dtype=np.bool_) + mask = mask_index.copy() - if dropna: - res = Series(value_counts, index=keys, name="count", copy=False) - res.index = res.index.astype(self.dtype) - res = res.astype("Int64") - return res + if na_counter > 0: + mask_index[-1] = True - # if we want nans, count the mask - counts = np.empty(len(value_counts) + 1, dtype="int64") - counts[:-1] = value_counts - counts[-1] = self._mask.sum() - - index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value) - index = index.astype(self.dtype) - - mask = np.zeros(len(counts), dtype="bool") - counts_array = IntegerArray(counts, mask) - - return Series(counts_array, index=index, name="count", copy=False) + arr = IntegerArray(value_counts, mask) + index = Index( + self.dtype.construct_array_type()( + keys, mask_index # type: ignore[arg-type] + ) + ) + return Series(arr, index=index, name="count", copy=False) @doc(ExtensionArray.equals) def equals(self, other) -> bool: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 00cbe1286c195..4d5eef960293f 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -881,7 +881,7 @@ def value_counts(self, dropna: bool = True) -> Series: Series, ) - keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) + keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0 and (not self._null_fill_value or not dropna): mask = isna(keys) if self._null_fill_value else keys == self.fill_value diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index b78e6426ca17f..2c8f4c4149528 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -586,15 +586,26 @@ def test_value_count(self, dtype, writable): expected = (np.arange(N) + N).astype(dtype) values = np.repeat(expected, 5) values.flags.writeable = writable - keys, counts = ht.value_count(values, False) + keys, counts, _ = ht.value_count(values, False) tm.assert_numpy_array_equal(np.sort(keys), expected) assert np.all(counts == 5) + def test_value_count_mask(self, dtype): + if dtype == np.object_: + pytest.skip("mask not implemented for object dtype") + values = np.array([1] * 5, dtype=dtype) + mask = np.zeros((5,), dtype=np.bool_) + mask[1] = True + mask[4] = True + keys, counts, na_counter = ht.value_count(values, False, mask=mask) + assert len(keys) == 2 + assert na_counter == 2 + def test_value_count_stable(self, dtype, writable): # GH12679 values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) values.flags.writeable = writable - keys, counts = ht.value_count(values, False) + keys, counts, _ = ht.value_count(values, False) tm.assert_numpy_array_equal(keys, values) assert np.all(counts == 1) @@ -685,9 +696,9 @@ def test_unique_label_indices(): class TestHelpFunctionsWithNans: def test_value_count(self, dtype): values = np.array([np.nan, np.nan, np.nan], dtype=dtype) - keys, counts = ht.value_count(values, True) + keys, counts, _ = ht.value_count(values, True) assert len(keys) == 0 - keys, counts = ht.value_count(values, False) + keys, counts, _ = ht.value_count(values, False) assert len(keys) == 1 and np.all(np.isnan(keys)) assert counts[0] == 3 diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index f54489ac8a8b4..bde9902fec6e9 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -250,3 +250,22 @@ def test_value_counts_complex_numbers(self, input_array, expected): # GH 17927 result = Series(input_array).value_counts() tm.assert_series_equal(result, expected) + + def test_value_counts_masked(self): + # GH#54984 + dtype = "Int64" + ser = Series([1, 2, None, 2, None, 3], dtype=dtype) + result = ser.value_counts(dropna=False) + expected = Series( + [2, 2, 1, 1], + index=Index([2, None, 1, 3], dtype=dtype), + dtype=dtype, + name="count", + ) + tm.assert_series_equal(result, expected) + + result = ser.value_counts(dropna=True) + expected = Series( + [2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count" + ) + tm.assert_series_equal(result, expected)