From 6eb63ceef6377ecfa752eed4bfb554887aca874d Mon Sep 17 00:00:00 2001 From: Willian Wang Date: Sun, 15 Oct 2023 21:11:00 -0300 Subject: [PATCH] CoW: Use weakref callbacks to track dead references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Lucas Silva Mayer --- pandas/_libs/internals.pyx | 45 +++++++++++------------- pandas/tests/copy_view/test_internals.py | 15 ++++---- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index fdfb8e1c99f6e6..90db90d5e14222 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -890,29 +890,29 @@ cdef class BlockValuesRefs: """ cdef: public list referenced_blocks - public int clear_counter + public int dead_counter + object __weakref__ + object _weakref_cb def __cinit__(self, blk: Block | None = None) -> None: + def _weakref_cb(item: weakref.ref, selfref: weakref.ref = weakref.ref(self)) -> None: + self = selfref() + if self is not None: + self.dead_counter += 1 + if self.dead_counter > 256 and self.dead_counter > len(self.referenced_blocks) // 2: + self._clear_dead_references() + self._weakref_cb = _weakref_cb if blk is not None: - self.referenced_blocks = [weakref.ref(blk)] + self.referenced_blocks = [weakref.ref(blk, self._weakref_cb)] else: self.referenced_blocks = [] - self.clear_counter = 500 # set reasonably high - - def _clear_dead_references(self, force=False) -> None: - # Use exponential backoff to decide when we want to clear references - # if force=False. Clearing for every insertion causes slowdowns if - # all these objects stay alive, e.g. df.items() for wide DataFrames - # see GH#55245 and GH#55008 - if force or len(self.referenced_blocks) > self.clear_counter: - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] - nr_of_refs = len(self.referenced_blocks) - if nr_of_refs < self.clear_counter // 2: - self.clear_counter = max(self.clear_counter // 2, 500) - elif nr_of_refs > self.clear_counter: - self.clear_counter = max(self.clear_counter * 2, nr_of_refs) + + def _clear_dead_references(self) -> None: + old_len = len(self.referenced_blocks) + self.referenced_blocks = [ + ref for ref in self.referenced_blocks if ref() is not None + ] + self.dead_counter = self.dead_counter - (old_len - len(self.referenced_blocks)) def add_reference(self, blk: Block) -> None: """Adds a new reference to our reference collection. @@ -922,8 +922,7 @@ cdef class BlockValuesRefs: blk : Block The block that the new references should point to. """ - self._clear_dead_references() - self.referenced_blocks.append(weakref.ref(blk)) + self.referenced_blocks.append(weakref.ref(blk, self._weakref_cb)) def add_index_reference(self, index: object) -> None: """Adds a new reference to our reference collection when creating an index. @@ -933,8 +932,7 @@ cdef class BlockValuesRefs: index : Index The index that the new reference should point to. """ - self._clear_dead_references() - self.referenced_blocks.append(weakref.ref(index)) + self.referenced_blocks.append(weakref.ref(index, self._weakref_cb)) def has_reference(self) -> bool: """Checks if block has foreign references. @@ -946,6 +944,5 @@ cdef class BlockValuesRefs: ------- bool """ - self._clear_dead_references(force=True) # Checking for more references than block pointing to itself - return len(self.referenced_blocks) > 1 + return len(self.referenced_blocks) - self.dead_counter > 1 diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index a727331307d7e9..56e14be806da6a 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -121,31 +121,28 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): assert not np.shares_memory(get_array(df, col), get_array(df2, col)) -def test_exponential_backoff(): - # GH#55518 +def test_clear_dead_references(): + # GH#55539 df = DataFrame({"a": [1, 2, 3]}) for i in range(490): df.copy(deep=False) - assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491 + assert len(df._mgr.blocks[0].refs.referenced_blocks) - df._mgr.blocks[0].refs.dead_counter == 1 df = DataFrame({"a": [1, 2, 3]}) dfs = [df.copy(deep=False) for i in range(510)] for i in range(20): df.copy(deep=False) - assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531 - assert df._mgr.blocks[0].refs.clear_counter == 1000 + assert len(df._mgr.blocks[0].refs.referenced_blocks) - df._mgr.blocks[0].refs.dead_counter == 511 for i in range(500): df.copy(deep=False) - # Don't reduce since we still have over 500 objects alive - assert df._mgr.blocks[0].refs.clear_counter == 1000 + assert len(df._mgr.blocks[0].refs.referenced_blocks) - df._mgr.blocks[0].refs.dead_counter == 511 dfs = dfs[:300] for i in range(500): df.copy(deep=False) - # Reduce since there are less than 500 objects alive - assert df._mgr.blocks[0].refs.clear_counter == 500 + assert len(df._mgr.blocks[0].refs.referenced_blocks) - df._mgr.blocks[0].refs.dead_counter == 301