Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: check monotonicity inside _can_use_libjoin #55342

Merged
merged 12 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ from pandas._libs.missing cimport (
is_matching_na,
)

from decimal import InvalidOperation

# Defines shift of MultiIndex codes to avoid negative codes (missing values)
multiindex_nulls_shift = 2

Expand Down Expand Up @@ -248,6 +250,10 @@ cdef class IndexEngine:

@property
def is_unique(self) -> bool:
# for why we check is_monotonic_increasing here, see
# https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
if self.need_monotonic_check:
self.is_monotonic_increasing
if self.need_unique_check:
self._do_unique_check()

Expand Down Expand Up @@ -281,7 +287,7 @@ cdef class IndexEngine:
values = self.values
self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \
self._call_monotonic(values)
except TypeError:
except (TypeError, InvalidOperation):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added to catch the Decimal("NaN") < Decimal("NaN") case

self.monotonic_inc = 0
self.monotonic_dec = 0
is_strict_monotonic = 0
Expand Down Expand Up @@ -843,6 +849,10 @@ cdef class SharedEngine:

@property
def is_unique(self) -> bool:
# for why we check is_monotonic_increasing here, see
# https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
if self.need_monotonic_check:
self.is_monotonic_increasing
if self.need_unique_check:
arr = self.values.unique()
self.unique = len(arr) == len(self.values)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3711,7 +3711,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
many repeated values.

>>> df['object'].astype('category').memory_usage(deep=True)
5244
5136
"""
result = self._constructor_sliced(
[c.memory_usage(index=False, deep=deep) for col, c in self.items()],
Expand Down
20 changes: 10 additions & 10 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3382,9 +3382,7 @@ def _union(self, other: Index, sort: bool | None):

if (
sort in (None, True)
and self.is_monotonic_increasing
and other.is_monotonic_increasing
and not (self.has_duplicates and other.has_duplicates)
and (self.is_unique or other.is_unique)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the order here can make a difference. is_monotonic_increasing will provide is_unique "for free" if the index is both monotonic and unique and it hasn't already been cached.

import pandas as pd
import numpy as np

arr = np.arange(1_000_000)

%timeit idx=pd.Index(arr); idx.is_unique and idx.is_monotonic_increasing
# 68.7 ms ± 5.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

%timeit idx=pd.Index(arr); idx.is_monotonic_increasing and idx.is_unique
# 2.91 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. I had expected the cache for all of these to get populated at the same time inside IndexEngine; I will take a closer look.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, looks like engine.is_monotonic_increasing will populate the engine.is_unique cache but not vice versa. All of the paths within the engine that check is_unique do that check after checking self.is_monotonic_increasing. I'm inclined to update the engine code so that the unique check always populates the is_monotonic_increasing cache in order to 1) make the perf not dependent on the order these are accessed and 2) avoid populating engine.mapping in these cases (which allocates memory)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm inclined to update the engine code so that the unique check always populates the is_monotonic_increasing cache in order to 1) make the perf not dependent on the order these are accessed and 2) avoid populating engine.mapping in these cases (which allocates memory)

that sounds good to me

and self._can_use_libjoin
and other._can_use_libjoin
):
Expand Down Expand Up @@ -3536,12 +3534,7 @@ def _intersection(self, other: Index, sort: bool = False):
"""
intersection specialized to the case with matching dtypes.
"""
if (
self.is_monotonic_increasing
and other.is_monotonic_increasing
and self._can_use_libjoin
and other._can_use_libjoin
):
if self._can_use_libjoin and other._can_use_libjoin:
try:
res_indexer, indexer, _ = self._inner_indexer(other)
except TypeError:
Expand Down Expand Up @@ -4980,7 +4973,10 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]:
def _join_monotonic(
self, other: Index, how: JoinHow = "left"
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
# We only get here with matching dtypes and both monotonic increasing
# We only get here with (caller is responsible for ensuring):
# 1) matching dtypes
# 2) both monotonic increasing
# 3) other.is_unique or self.is_unique
assert other.dtype == self.dtype
assert self._can_use_libjoin and other._can_use_libjoin

Expand Down Expand Up @@ -5062,6 +5058,10 @@ def _can_use_libjoin(self) -> bool:
making a copy. If we cannot, this negates the performance benefit
of using libjoin.
"""
if not self.is_monotonic_increasing:
# The libjoin functions all assume monotonicity.
return False

if type(self) is Index:
# excludes EAs, but include masks, we get here with monotonic
# values only, meaning no NA
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,6 @@ def data_for_grouping():


class TestCategorical(base.ExtensionTests):
@pytest.mark.xfail(reason="Memory usage doesn't match")
def test_memory_usage(self, data):
# TODO: Is this deliberate?
super().test_memory_usage(data)

def test_contains(self, data, data_missing):
# GH-37867
# na value handling in Categorical.__contains__ is deprecated.
Expand Down
Loading