From 336240940edc4392490e00d3178dfd3c9f1e0ee0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 22 Oct 2023 15:41:13 +0200 Subject: [PATCH] Backport PR #55518: CoW: Use exponential backoff when clearing dead references --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/_libs/internals.pyx | 24 ++++++++++++++----- pandas/tests/copy_view/test_internals.py | 30 ++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..ef69e874fdaf3 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) - Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) +- Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 859ebce36eaa8..10b0f4776e0d1 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -944,17 +944,29 @@ cdef class BlockValuesRefs: """ cdef: public list referenced_blocks + public int clear_counter def __cinit__(self, blk: SharedBlock | None = None) -> None: if blk is not None: self.referenced_blocks = [weakref.ref(blk)] else: self.referenced_blocks = [] - - def _clear_dead_references(self) -> None: - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] + self.clear_counter = 500 # set reasonably high + + def _clear_dead_references(self, force=False) -> None: + # Use exponential backoff to decide when we want to clear references + # if force=False. Clearing for every insertion causes slowdowns if + # all these objects stay alive, e.g. df.items() for wide DataFrames + # see GH#55245 and GH#55008 + if force or len(self.referenced_blocks) > self.clear_counter: + self.referenced_blocks = [ + ref for ref in self.referenced_blocks if ref() is not None + ] + nr_of_refs = len(self.referenced_blocks) + if nr_of_refs < self.clear_counter // 2: + self.clear_counter = max(self.clear_counter // 2, 500) + elif nr_of_refs > self.clear_counter: + self.clear_counter = max(self.clear_counter * 2, nr_of_refs) def add_reference(self, blk: SharedBlock) -> None: """Adds a new reference to our reference collection. @@ -988,6 +1000,6 @@ cdef class BlockValuesRefs: ------- bool """ - self._clear_dead_references() + self._clear_dead_references(force=True) # Checking for more references than block pointing to itself return len(self.referenced_blocks) > 1 diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 9180bd5a3a426..a727331307d7e 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -119,3 +119,33 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): else: for col in df.columns: assert not np.shares_memory(get_array(df, col), get_array(df2, col)) + + +def test_exponential_backoff(): + # GH#55518 + df = DataFrame({"a": [1, 2, 3]}) + for i in range(490): + df.copy(deep=False) + + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491 + + df = DataFrame({"a": [1, 2, 3]}) + dfs = [df.copy(deep=False) for i in range(510)] + + for i in range(20): + df.copy(deep=False) + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531 + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + for i in range(500): + df.copy(deep=False) + + # Don't reduce since we still have over 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + dfs = dfs[:300] + for i in range(500): + df.copy(deep=False) + + # Reduce since there are less than 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 500