From 3bd4fdf36354abccc55aa4ca1518f1819fe3058c Mon Sep 17 00:00:00 2001 From: Renan Date: Wed, 6 Dec 2023 01:58:56 -0300 Subject: [PATCH 1/5] BUG: unstack with sort=False fails when used with the level parameter (#54987) Assign new codes to labels when sort=False. This is done so that the data appears to be already sorted, fixing the bug. --- pandas/core/reshape/reshape.py | 32 +++++++++++++++++------- pandas/tests/frame/test_stack_unstack.py | 15 +++++++++++ 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d6922ba58d2b9..ba1bcd063f9a6 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,5 +1,6 @@ from __future__ import annotations +import collections import itertools from typing import ( TYPE_CHECKING, @@ -162,8 +163,13 @@ def _indexer_and_to_sort( ]: v = self.level - codes = list(self.index.codes) levs = list(self.index.levels) + codes = list(self.index.codes) + + if not self.sort: + # Create new codes considering that labels are already sorted + codes = [np.array(factorize(code)[0], dtype=code.dtype) for code in codes] + to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) @@ -174,25 +180,33 @@ def _indexer_and_to_sort( return indexer, to_sort @cache_readonly - def sorted_labels(self) -> list[np.ndarray]: + def labels(self) -> list[np.ndarray]: indexer, to_sort = self._indexer_and_to_sort if self.sort: return [line.take(indexer) for line in to_sort] return to_sort - def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: + @cache_readonly + def sorted_labels(self) -> list[np.ndarray]: if self.sort: - indexer, _ = self._indexer_and_to_sort + return self.labels - sorted_values = algos.take_nd(values, indexer, axis=0) - return sorted_values - return values + v = self.level + codes = list(self.index.codes) + to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] + return to_sort + + def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: + indexer, _ = self._indexer_and_to_sort + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values def _make_selectors(self): new_levels = self.new_index_levels # make the mask - remaining_labels = self.sorted_labels[:-1] + remaining_labels = self.labels[:-1] + choosen_labels = self.labels[-1] level_sizes = tuple(len(x) for x in new_levels) comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) @@ -202,7 +216,7 @@ def _make_selectors(self): stride = self.index.levshape[self.level] + self.lift self.full_shape = ngroups, stride - selector = self.sorted_labels[-1] + stride * comp_index + self.lift + selector = choosen_labels + stride * comp_index + self.lift mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 2e7e8eba270c0..5c7fde826faf3 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1318,6 +1318,21 @@ def test_unstack_sort_false(frame_or_series, dtype): [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")] ) obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype) + + result = obj.unstack(level=0, sort=False) + + if frame_or_series is DataFrame: + expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")]) + else: + expected_columns = ["two", "one"] + expected = DataFrame( + [[1.0, 3.0], [2.0, 4.0]], + index=MultiIndex.from_tuples([("z", "b"), ("y", "a")]), + columns=expected_columns, + dtype=dtype, + ) + tm.assert_frame_equal(result, expected) + result = obj.unstack(level=-1, sort=False) if frame_or_series is DataFrame: From 86f7017ccbe799c6f508141aca77dd9d3ff0861a Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 31 Mar 2024 08:39:16 -0400 Subject: [PATCH 2/5] Minor refactor and cleanup --- pandas/core/reshape/reshape.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c436307eb5657..7704c96e584e2 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,6 +1,5 @@ from __future__ import annotations -import collections import itertools from typing import ( TYPE_CHECKING, @@ -171,7 +170,7 @@ def _indexer_and_to_sort( if not self.sort: # Create new codes considering that labels are already sorted - codes = [np.array(factorize(code)[0], dtype=code.dtype) for code in codes] + codes = [factorize(code)[0] for code in codes] to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) @@ -183,22 +182,12 @@ def _indexer_and_to_sort( return indexer, to_sort @cache_readonly - def labels(self) -> list[np.ndarray]: + def sorted_labels(self) -> list[np.ndarray]: indexer, to_sort = self._indexer_and_to_sort if self.sort: return [line.take(indexer) for line in to_sort] return to_sort - @cache_readonly - def sorted_labels(self) -> list[np.ndarray]: - if self.sort: - return self.labels - - v = self.level - codes = list(self.index.codes) - to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] - return to_sort - def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: indexer, _ = self._indexer_and_to_sort sorted_values = algos.take_nd(values, indexer, axis=0) @@ -208,8 +197,7 @@ def _make_selectors(self) -> None: new_levels = self.new_index_levels # make the mask - remaining_labels = self.labels[:-1] - choosen_labels = self.labels[-1] + remaining_labels = self.sorted_labels[:-1] level_sizes = tuple(len(x) for x in new_levels) comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) @@ -219,7 +207,7 @@ def _make_selectors(self) -> None: stride = self.index.levshape[self.level] + self.lift self.full_shape = ngroups, stride - selector = choosen_labels + stride * comp_index + self.lift + selector = self.sorted_labels[-1] + stride * comp_index + self.lift mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) @@ -389,7 +377,13 @@ def _repeater(self) -> np.ndarray: @cache_readonly def new_index(self) -> MultiIndex | Index: # Does not depend on values or value_columns - result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] + if self.sort: + labels = self.sorted_labels[:-1] + else: + v = self.level + codes = list(self.index.codes) + labels = codes[:v] + codes[v + 1 :] + result_codes = [lab.take(self.compressor) for lab in labels] # construct the new index if len(self.new_index_levels) == 1: From 345eb4f0b4cc6eff103947b0a9ebe1d6d47eca85 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 2 May 2024 18:35:39 -0400 Subject: [PATCH 3/5] Cleanup & remove test --- pandas/core/reshape/reshape.py | 4 +--- pandas/tests/reshape/test_pivot.py | 13 ------------- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 864674b7fc089..4a4080a9dc57b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -167,13 +167,11 @@ def _indexer_and_to_sort( ]: v = self.level - levs = list(self.index.levels) codes = list(self.index.codes) - if not self.sort: # Create new codes considering that labels are already sorted codes = [factorize(code)[0] for code in codes] - + levs = list(self.index.levels) to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 97f06b0e379f4..03c89d89b30f8 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2703,16 +2703,3 @@ def test_pivot_table_with_margins_and_numeric_column_names(self): index=Index(["a", "b", "All"], name=0), ) tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("m", [1, 10]) - def test_unstack_shares_memory(self, m): - # GH#56633 - levels = np.arange(m) - index = MultiIndex.from_product([levels] * 2) - values = np.arange(m * m * 100).reshape(m * m, 100) - df = DataFrame(values, index, np.arange(100)) - df_orig = df.copy() - result = df.unstack(sort=False) - assert np.shares_memory(df._values, result._values) is (m == 1) - result.iloc[0, 0] = -1 - tm.assert_frame_equal(df, df_orig) From cff156b9f4d771dd4ec5ab52254baccfdaca8807 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 4 May 2024 07:58:26 -0400 Subject: [PATCH 4/5] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9e7349a061295..14690cb246a2b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -455,7 +455,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) -- +- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) Sparse ^^^^^^ From 436fc8f8dfb41d813ae7be17dda2eaf4f52c86b6 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 20 May 2024 21:36:43 -0400 Subject: [PATCH 5/5] Revert test removal --- pandas/tests/reshape/test_pivot.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 03c89d89b30f8..4a13c1f5e1167 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2703,3 +2703,15 @@ def test_pivot_table_with_margins_and_numeric_column_names(self): index=Index(["a", "b", "All"], name=0), ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("m", [1, 10]) + def test_unstack_copy(self, m): + # GH#56633 + levels = np.arange(m) + index = MultiIndex.from_product([levels] * 2) + values = np.arange(m * m * 100).reshape(m * m, 100) + df = DataFrame(values, index, np.arange(100)) + df_orig = df.copy() + result = df.unstack(sort=False) + result.iloc[0, 0] = -1 + tm.assert_frame_equal(df, df_orig)