From d450c0e392473c7d4728703033ebfbaf1635a7c7 Mon Sep 17 00:00:00 2001
From: Renan
Date: Wed, 6 Dec 2023 01:58:56 -0300
Subject: [PATCH] BUG: unstack with sort=False fails when used with the level parameter (#54987)

Assign new codes to labels when sort=False. This is done so that the data appears to be already sorted, fixing the bug.
---
 pandas/core/reshape/reshape.py           | 37 +++++++++++++++++-------
 pandas/tests/frame/test_stack_unstack.py | 15 ++++++++++
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index d6922ba58d2b95..f8d4f278454b21 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -153,6 +153,16 @@ def __init__(
 
         self._make_selectors()
 
+    @cache_readonly
+    def sorted_labels(self) -> list[np.ndarray]:
+        if self.sort:
+            return self.labels
+
+        v = self.level
+        codes = list(self.index.codes)
+        to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
+        return to_sort
+
     @cache_readonly
     def _indexer_and_to_sort(
         self,
@@ -162,8 +172,17 @@ def _indexer_and_to_sort(
     ]:
         v = self.level
 
-        codes = list(self.index.codes)
         levs = list(self.index.levels)
+        codes = list(self.index.codes)
+
+        if not self.sort:
+            codes = [list(code) for code in codes]
+            ids_code = [
+                ({y: x for x, y in enumerate(sorted(set(code), key=code.index))}, code)
+                for code in codes
+            ]
+            codes = [np.array([d[x] for x in code]) for d, code in ids_code]
+
         to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
         sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
 
@@ -174,25 +193,23 @@ def _indexer_and_to_sort(
         return indexer, to_sort
 
     @cache_readonly
-    def sorted_labels(self) -> list[np.ndarray]:
+    def labels(self) -> list[np.ndarray]:
         indexer, to_sort = self._indexer_and_to_sort
         if self.sort:
             return [line.take(indexer) for line in to_sort]
         return to_sort
 
     def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
-        if self.sort:
-            indexer, _ = self._indexer_and_to_sort
-
-            sorted_values = algos.take_nd(values, indexer, axis=0)
-            return sorted_values
-        return values
+        indexer, _ = self._indexer_and_to_sort
+        sorted_values = algos.take_nd(values, indexer, axis=0)
+        return sorted_values
 
     def _make_selectors(self):
         new_levels = self.new_index_levels
 
         # make the mask
-        remaining_labels = self.sorted_labels[:-1]
+        remaining_labels = self.labels[:-1]
+        choosen_labels = self.labels[-1]
         level_sizes = tuple(len(x) for x in new_levels)
 
         comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
@@ -202,7 +219,7 @@ def _make_selectors(self):
         stride = self.index.levshape[self.level] + self.lift
         self.full_shape = ngroups, stride
 
-        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
+        selector = choosen_labels + stride * comp_index + self.lift
         mask = np.zeros(np.prod(self.full_shape), dtype=bool)
         mask.put(selector, True)
 
diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
index 2e7e8eba270c0c..4a8de95c258b81 100644
--- a/pandas/tests/frame/test_stack_unstack.py
+++ b/pandas/tests/frame/test_stack_unstack.py
@@ -1318,6 +1318,21 @@ def test_unstack_sort_false(frame_or_series, dtype):
         [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")]
     )
     obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype)
+
+    result = obj.unstack(level=0, sort=False)
+
+    if frame_or_series is DataFrame:
+        expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")])
+    else:
+        expected_columns = ["two", "one"]
+    expected = DataFrame(
+        [[1.0, 3.0], [2.0, 4.0]],
+        index=MultiIndex.from_tuples([('z', 'b'), ('y', 'a')]),
+        columns=expected_columns,
+        dtype=dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
     result = obj.unstack(level=-1, sort=False)
 
     if frame_or_series is DataFrame: