From 1c1bb854cce61a8d653c468bcce70db84ac07cf9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 3 Oct 2023 20:29:44 -0400 Subject: [PATCH] BUG: DataFrame.join reordering index levels when joining on subset of levels (#55370) * BUG: DataFrame.join reordering index levels when joining on subset of levels * update more tests * move whatsnew to notable section --- doc/source/whatsnew/v2.2.0.rst | 33 ++++++++++++++++++++--- pandas/core/indexes/base.py | 7 +++++ pandas/tests/reshape/merge/test_join.py | 16 +++++++---- pandas/tests/reshape/merge/test_multi.py | 28 +++++++++++-------- pandas/tests/series/methods/test_align.py | 4 +-- pandas/tests/series/test_arithmetic.py | 4 +-- 6 files changed, 69 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d498c84358448..7a177344a42c7 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -133,10 +133,36 @@ and ``sort=False``: result -.. _whatsnew_220.notable_bug_fixes.notable_bug_fix2: +.. _whatsnew_220.notable_bug_fixes.multiindex_join_different_levels: -notable_bug_fix2 -^^^^^^^^^^^^^^^^ +:func:`merge` and :meth:`DataFrame.join` no longer reorder levels when levels differ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` would reorder +index levels when joining on two indexes with different levels (:issue:`34133`). + +.. ipython:: python + + left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) + right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + result = left.join(right) + +*Old Behavior* + +.. code-block:: ipython + + In [5]: result + Out[5]: + left right + B A C + 1 x 1 1 2 + 2 x 2 1 2 + +*New Behavior* + +.. ipython:: python + + result .. 
--------------------------------------------------------------------------- .. _whatsnew_220.api_breaking: @@ -342,6 +368,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) +- Bug in :meth:`DataFrame.join` reordering index levels when joining on a subset of levels (:issue:`34133`) Sparse ^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9017ff121976b..11f2cc8ebf1ff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4725,6 +4725,13 @@ def _join_multi(self, other: Index, how: JoinHow): multi_join_idx = multi_join_idx.remove_unused_levels() + # maintain the order of the index levels + if how == "right": + level_order = other_names_list + ldrop_names + else: + level_order = self_names_list + rdrop_names + multi_join_idx = multi_join_idx.reorder_levels(level_order) + return multi_join_idx, lidx, ridx jl = next(iter(overlap)) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 20daa388c2c88..c630ba6a43cb1 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -902,7 +902,7 @@ def test_join_inner_multiindex_deterministic_order(): result = left.join(right, how="inner") expected = DataFrame( {"e": [5], "f": [6]}, - index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")), + index=MultiIndex.from_tuples([(1, 2, 4, 3)], names=("a", "b", "d", "c")), ) tm.assert_frame_equal(result, expected) @@ -926,10 +926,16 @@ def test_join_multiindex_one_level(join_type): ) right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",))) result = left.join(right, how=join_type) - expected = DataFrame( - {"c": [3], "d": [4]}, - index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]), - ) + if join_type == "right": + expected = DataFrame( + {"c": [3], "d": [4]}, + index=MultiIndex.from_tuples([(2, 1)], names=["b",
"a"]), + ) + else: + expected = DataFrame( + {"c": [3], "d": [4]}, + index=MultiIndex.from_tuples([(1, 2)], names=["a", "b"]), + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index ab010bdb909f1..c029acf0c8938 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -69,11 +69,6 @@ def on_cols_multi(): return ["Origin", "Destination", "Period"] -@pytest.fixture -def idx_cols_multi(): - return ["Origin", "Destination", "Period", "TripPurp", "LinkType"] - - class TestMergeMulti: def test_merge_on_multikey(self, left, right, join_type): on_cols = ["key1", "key2"] @@ -815,9 +810,13 @@ def test_join_multi_levels2(self): class TestJoinMultiMulti: - def test_join_multi_multi( - self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi - ): + def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi): + left_names = left_multi.index.names + right_names = right_multi.index.names + if join_type == "right": + level_order = right_names + left_names.difference(right_names) + else: + level_order = left_names + right_names.difference(left_names) # Multi-index join tests expected = ( merge( @@ -826,7 +825,7 @@ def test_join_multi_multi( how=join_type, on=on_cols_multi, ) - .set_index(idx_cols_multi) + .set_index(level_order) .sort_index() ) @@ -834,11 +833,18 @@ def test_join_multi_multi( tm.assert_frame_equal(result, expected) def test_join_multi_empty_frames( - self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi + self, left_multi, right_multi, join_type, on_cols_multi ): left_multi = left_multi.drop(columns=left_multi.columns) right_multi = right_multi.drop(columns=right_multi.columns) + left_names = left_multi.index.names + right_names = right_multi.index.names + if join_type == "right": + level_order = right_names + left_names.difference(right_names) + else: + level_order = 
left_names + right_names.difference(left_names) + expected = ( merge( left_multi.reset_index(), @@ -846,7 +852,7 @@ def test_join_multi_empty_frames( how=join_type, on=on_cols_multi, ) - .set_index(idx_cols_multi) + .set_index(level_order) .sort_index() ) diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index e1b3dd4888ef5..d36fd5335bdfc 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -240,10 +240,10 @@ def test_align_left_different_named_levels(): result_left, result_right = left.align(right) expected_left = Series( - [2], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"]) + [2], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) ) expected_right = Series( - [1], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"]) + [1], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) ) tm.assert_series_equal(result_left, expected_left) tm.assert_series_equal(result_right, expected_right) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 8547fd6988791..d40ff6c139653 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -941,8 +941,8 @@ def test_series_varied_multiindex_alignment(): expected = Series( [1000, 2001, 3002, 4003], index=pd.MultiIndex.from_tuples( - [("x", 1, "a"), ("x", 2, "a"), ("y", 1, "a"), ("y", 2, "a")], - names=["xy", "num", "ab"], + [("a", "x", 1), ("a", "x", 2), ("a", "y", 1), ("a", "y", 2)], + names=["ab", "xy", "num"], ), ) tm.assert_series_equal(result, expected)