From 248c96671a945b623e7a2a7f993b0c957d4f4cf0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 11 Aug 2023 11:17:09 +0200 Subject: [PATCH] CoW: Avoid tracking references in delete if not necessary (#54111) --- pandas/core/internals/blocks.py | 5 +++-- pandas/core/internals/managers.py | 9 +++++++-- .../copy_view/test_core_functionalities.py | 18 ++++++++++++++---- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d0de1ed77c3a6..ecb9cd47d7995 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -318,7 +318,7 @@ def take_block_columns(self, indices: npt.NDArray[np.intp]) -> Self: @final def getitem_block_columns( - self, slicer: slice, new_mgr_locs: BlockPlacement + self, slicer: slice, new_mgr_locs: BlockPlacement, ref_inplace_op: bool = False ) -> Self: """ Perform __getitem__-like, return result as block. @@ -326,7 +326,8 @@ def getitem_block_columns( Only supports slices that preserve dimensionality. """ new_values = self._slice(slicer) - return type(self)(new_values, new_mgr_locs, self.ndim, refs=self.refs) + refs = self.refs if not ref_inplace_op or self.refs.has_reference() else None + return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) @final def _can_hold_element(self, element: Any) -> bool: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1533dc77321cb..2a404f46dcc47 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -673,6 +673,7 @@ def _slice_take_blocks_ax0( only_slice: bool = False, *, use_na_proxy: bool = False, + ref_inplace_op: bool = False, ) -> list[Block]: """ Slice/take blocks along axis=0. @@ -688,6 +689,8 @@ def _slice_take_blocks_ax0( This is used when called from ops.blockwise.operate_blockwise. use_na_proxy : bool, default False Whether to use a np.void ndarray for newly introduced columns. + ref_inplace_op: bool, default False + Don't track refs if True because we operate inplace Returns ------- @@ -718,7 +721,9 @@ def _slice_take_blocks_ax0( # views instead of copies blocks = [ blk.getitem_block_columns( - slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i) + slice(ml, ml + 1), + new_mgr_locs=BlockPlacement(i), + ref_inplace_op=ref_inplace_op, ) for i, ml in enumerate(slobj) ] @@ -1380,7 +1385,7 @@ def idelete(self, indexer) -> BlockManager: is_deleted[indexer] = True taker = (~is_deleted).nonzero()[0] - nbs = self._slice_take_blocks_ax0(taker, only_slice=True) + nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True) new_columns = self.items[~is_deleted] axes = [new_columns, self.axes[1]] return type(self)(tuple(nbs), axes, verify_integrity=False) diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py index 08e49aa3813d8..5c177465d2fa4 100644 --- a/pandas/tests/copy_view/test_core_functionalities.py +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -80,11 +80,21 @@ def test_delete(using_copy_on_write): ) del df["b"] if using_copy_on_write: - # TODO: This should not have references, delete makes a shallow copy - # but keeps the blocks alive - assert df._mgr.blocks[0].refs.has_reference() - assert df._mgr.blocks[1].refs.has_reference() + assert not df._mgr.blocks[0].refs.has_reference() + assert not df._mgr.blocks[1].refs.has_reference() df = df[["a"]] if using_copy_on_write: assert not df._mgr.blocks[0].refs.has_reference() + + +def test_delete_reference(using_copy_on_write): + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 3)), columns=["a", "b", "c"] + ) + x = df[:] + del df["b"] + if using_copy_on_write: + assert df._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[1].refs.has_reference() + assert x._mgr.blocks[0].refs.has_reference()