From 23c20deb7a6e7857b57abba72c15a21d8dbdfe7c Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 9 Dec 2023 14:00:17 -0500 Subject: [PATCH] BUG: outer join on equal indexes not sorting (#56426) * outer join on equal indexes to sort by default * whatsnew * fix test * remove Index._join_precedence --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/computation/align.py | 2 +- pandas/core/indexes/base.py | 31 +++++++++--------------- pandas/core/indexes/datetimelike.py | 2 -- pandas/core/reshape/merge.py | 5 +--- pandas/tests/indexes/multi/test_join.py | 13 ++++------ pandas/tests/indexes/test_base.py | 11 +++++---- pandas/tests/indexes/test_old_base.py | 6 ++++- pandas/tests/reshape/merge/test_merge.py | 20 +++++++-------- pandas/tests/series/test_arithmetic.py | 3 +++ 10 files changed, 43 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 486440af46b79..49c28e3dc5d85 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -246,7 +246,7 @@ These are bug fixes that might have notable behavior changes. In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not always return a result that followed the documented sort behavior. pandas now -follows the documented sort behavior in merge and join operations (:issue:`54611`). +follows the documented sort behavior in merge and join operations (:issue:`54611`, :issue:`56426`). As documented, ``sort=True`` sorts the join keys lexicographically in the resulting :class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 85d412d044ba8..cd852ba9249cf 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -110,7 +110,7 @@ def _align_core(terms): ax, itm = axis, items if not axes[ax].is_(itm): - axes[ax] = axes[ax].join(itm, how="outer") + axes[ax] = axes[ax].union(itm) for i, ndim in ndims.items(): for axis, items in zip(range(ndim), axes): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f9f42b9788a25..9d998b46dbeed 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -368,9 +368,6 @@ class Index(IndexOpsMixin, PandasObject): Index([1, 2, 3], dtype='uint8') """ - # To hand over control to subclasses - _join_precedence = 1 - # similar to __array_priority__, positions Index after Series and DataFrame # but before ExtensionArray. Should NOT be overridden by subclasses. __pandas_priority__ = 2000 @@ -4564,6 +4561,7 @@ def join( Index([1, 2, 3, 4, 5, 6], dtype='int64') """ other = ensure_index(other) + sort = sort or how == "outer" if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex): if (self.tz is None) ^ (other.tz is None): @@ -4614,15 +4612,6 @@ def join( rindexer = np.array([]) return join_index, None, rindexer - if self._join_precedence < other._join_precedence: - flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"} - how = flip.get(how, how) - join_index, lidx, ridx = other.join( - self, how=how, level=level, return_indexers=True - ) - lidx, ridx = ridx, lidx - return join_index, lidx, ridx - if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) this = self.astype(dtype, copy=False) @@ -4666,18 +4655,20 @@ def _join_via_get_indexer( # Note: at this point we have checked matching dtypes if how == "left": - join_index = self + join_index = self.sort_values() if sort else self elif how == "right": - join_index = other + join_index = other.sort_values() if sort else other elif how == "inner": join_index = self.intersection(other, sort=sort) elif how == "outer": - # TODO: sort=True here for backwards compat. It may - # be better to use the sort parameter passed into join - join_index = self.union(other) - - if sort and how in ["left", "right"]: - join_index = join_index.sort_values() + try: + join_index = self.union(other, sort=sort) + except TypeError: + join_index = self.union(other) + try: + join_index = _maybe_try_sort(join_index, sort) + except TypeError: + pass if join_index is self: lindexer = None diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 264ca8aa11495..2b03a64236128 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -442,8 +442,6 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, ABC): _is_monotonic_decreasing = Index.is_monotonic_decreasing _is_unique = Index.is_unique - _join_precedence = 10 - @property def unit(self) -> str: return self._data.unit diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0756b25adedcd..f07c4fb8f7d5f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -759,7 +759,7 @@ def __init__( self.on = com.maybe_make_list(on) self.suffixes = suffixes - self.sort = sort + self.sort = sort or how == "outer" self.left_index = left_index self.right_index = right_index @@ -1694,9 +1694,6 @@ def get_join_indexers( elif not sort and how in ["left", "outer"]: return _get_no_sort_one_missing_indexer(left_n, False) - if not sort and how == "outer": - sort = True - # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort) diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 700af142958b3..edd0feaaa1159 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -51,8 +51,11 @@ def test_join_level_corner_case(idx): def test_join_self(idx, join_type): - joined = idx.join(idx, how=join_type) - tm.assert_index_equal(joined, idx) + result = idx.join(idx, how=join_type) + expected = idx + if join_type == "outer": + expected = expected.sort_values() + tm.assert_index_equal(result, expected) def test_join_multi(): @@ -89,12 +92,6 @@ def test_join_multi(): tm.assert_numpy_array_equal(ridx, exp_ridx) -def test_join_self_unique(idx, join_type): - if idx.is_unique: - joined = idx.join(idx, how=join_type) - assert (idx == joined).all() - - def test_join_multi_wrong_order(): # GH 25760 # GH 28956 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index bb8822f047330..2b0bb884f0706 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -987,8 +987,11 @@ def test_slice_keep_name(self): indirect=True, ) def test_join_self(self, index, join_type): - joined = index.join(index, how=join_type) - assert index is joined + result = index.join(index, how=join_type) + expected = index + if join_type == "outer": + expected = expected.sort_values() + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("method", ["strip", "rstrip", "lstrip"]) def test_str_attribute(self, method): @@ -1072,10 +1075,8 @@ def test_outer_join_sort(self): with tm.assert_produces_warning(RuntimeWarning): result = left_index.join(right_index, how="outer") - # right_index in this case because DatetimeIndex has join precedence - # over int64 Index with tm.assert_produces_warning(RuntimeWarning): - expected = right_index.astype(object).union(left_index.astype(object)) + expected = left_index.astype(object).union(right_index.astype(object)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index b467e93b75e96..fd0d984053bd6 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -30,6 +30,7 @@ period_range, ) import pandas._testing as tm +import pandas.core.algorithms as algos from pandas.core.arrays import BaseMaskedArray @@ -653,7 +654,10 @@ def test_join_self_unique(self, join_type, simple_index): idx = simple_index if idx.is_unique: joined = idx.join(idx, how=join_type) - assert (idx == joined).all() + expected = simple_index + if join_type == "outer": + expected = algos.safe_sort(expected) + tm.assert_index_equal(joined, expected) def test_map(self, simple_index): # callable diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7538894bbf1c9..d7a343ae9f152 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1838,6 +1838,9 @@ def test_merge_empty(self, left_empty, how, exp): elif exp == "empty_cross": expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64") + if how == "outer": + expected = expected.sort_values("A", ignore_index=True) + tm.assert_frame_equal(result, expected) @@ -2913,16 +2916,13 @@ def test_merge_combinations( expected = expected["key"].repeat(repeats.values) expected = expected.to_frame() elif how == "outer": - if on_index and left_unique and left["key"].equals(right["key"]): - expected = DataFrame({"key": left["key"]}) - else: - left_counts = left["key"].value_counts() - right_counts = right["key"].value_counts() - expected_counts = left_counts.mul(right_counts, fill_value=1) - expected_counts = expected_counts.astype(np.intp) - expected = expected_counts.index.values.repeat(expected_counts.values) - expected = DataFrame({"key": expected}) - expected = expected.sort_values("key") + left_counts = left["key"].value_counts() + right_counts = right["key"].value_counts() + expected_counts = left_counts.mul(right_counts, fill_value=1) + expected_counts = expected_counts.astype(np.intp) + expected = expected_counts.index.values.repeat(expected_counts.values) + expected = DataFrame({"key": expected}) + expected = expected.sort_values("key") if on_index: expected = expected.set_index("key") diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 75237be212030..b40e2e99dae2e 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -754,6 +754,9 @@ def test_series_add_tz_mismatch_converts_to_utc(self): uts2 = ser2.tz_convert("utc") expected = uts1 + uts2 + # sort since input indexes are not equal + expected = expected.sort_index() + assert result.index.tz is timezone.utc tm.assert_series_equal(result, expected)