-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
REF: check monotonicity inside _can_use_libjoin #55342
Changes from all commits
a28e2c6
d9cfa91
851f725
5ff3239
a4b702b
6fee89e
6bf14b5
4f89583
81e74a9
6f1b081
94ebbf6
73acca0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3382,9 +3382,7 @@ def _union(self, other: Index, sort: bool | None): | |
|
||
if ( | ||
sort in (None, True) | ||
and self.is_monotonic_increasing | ||
and other.is_monotonic_increasing | ||
and not (self.has_duplicates and other.has_duplicates) | ||
and (self.is_unique or other.is_unique) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the order of these checks can make a difference.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good catch. I had expected the cache for all of these to get populated at the same time inside IndexEngine; will take a closer look. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, looks like engine.is_monotonic_increasing will populate the engine.is_unique cache but not vice versa. All of the paths within the engine that check is_unique do that check after checking self.is_monotonic_increasing. I'm inclined to update the engine code so that the unique check always populates the is_monotonic_increasing cache in order to 1) make the perf not dependent on the order these are accessed and 2) avoid populating There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
that sounds good to me |
||
and self._can_use_libjoin | ||
and other._can_use_libjoin | ||
): | ||
|
@@ -3536,12 +3534,7 @@ def _intersection(self, other: Index, sort: bool = False): | |
""" | ||
intersection specialized to the case with matching dtypes. | ||
""" | ||
if ( | ||
self.is_monotonic_increasing | ||
and other.is_monotonic_increasing | ||
and self._can_use_libjoin | ||
and other._can_use_libjoin | ||
): | ||
if self._can_use_libjoin and other._can_use_libjoin: | ||
try: | ||
res_indexer, indexer, _ = self._inner_indexer(other) | ||
except TypeError: | ||
|
@@ -4980,7 +4973,10 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]: | |
def _join_monotonic( | ||
self, other: Index, how: JoinHow = "left" | ||
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: | ||
# We only get here with matching dtypes and both monotonic increasing | ||
# We only get here with (caller is responsible for ensuring): | ||
# 1) matching dtypes | ||
# 2) both monotonic increasing | ||
# 3) other.is_unique or self.is_unique | ||
assert other.dtype == self.dtype | ||
assert self._can_use_libjoin and other._can_use_libjoin | ||
|
||
|
@@ -5062,6 +5058,10 @@ def _can_use_libjoin(self) -> bool: | |
making a copy. If we cannot, this negates the performance benefit | ||
of using libjoin. | ||
""" | ||
if not self.is_monotonic_increasing: | ||
# The libjoin functions all assume monotonicity. | ||
return False | ||
|
||
if type(self) is Index: | ||
# excludes EAs, but include masks, we get here with monotonic | ||
# values only, meaning no NA | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added to catch the
Decimal("NaN") < Decimal("NaN")
case