From 79c663f00a3b1e4610a6bb45380830e6c554bfac Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 14 Oct 2023 17:11:13 +0200 Subject: [PATCH 1/8] CoW: Use exponential backoff when clearing dead references --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/_libs/internals.pyx | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index cc51e22265d7c..26fc64c9b93f3 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) - Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) +- Fixed performance regression in Copy-on-Write mechanism (:issue:`55256`, :issue:`55245`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 3b1a6bc7436c3..e7ca22e8d0c46 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -890,17 +890,25 @@ cdef class BlockValuesRefs: """ cdef: public list referenced_blocks + public int clear_counter def __cinit__(self, blk: Block | None = None) -> None: if blk is not None: self.referenced_blocks = [weakref.ref(blk)] else: self.referenced_blocks = [] + self.clear_counter = 500 # set reasonably high - def _clear_dead_references(self) -> None: - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] + def _clear_dead_references(self, force=False) -> None: + if force or len(self.referenced_blocks) > self.clear_counter: + self.referenced_blocks = [ + ref for ref in self.referenced_blocks if ref() is not None + ] + nr_of_refs = len(self.referenced_blocks) + if nr_of_refs < self.clear_counter // 2: + self.clear_counter = self.clear_counter // 2 + elif nr_of_refs > self.clear_counter: + self.clear_counter = self.clear_counter * 2 def add_reference(self, blk: Block) -> None: """Adds a new reference to our reference collection. @@ -934,6 +942,6 @@ cdef class BlockValuesRefs: ------- bool """ - self._clear_dead_references() + self._clear_dead_references(force=True) # Checking for more references than block pointing to itself return len(self.referenced_blocks) > 1 From a2f81a4793e4557e1f770ce77c676a4aa29a63ee Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 14 Oct 2023 17:14:19 +0200 Subject: [PATCH 2/8] Adjust slightly --- pandas/_libs/internals.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index e7ca22e8d0c46..438e2db74fb5c 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -908,7 +908,7 @@ cdef class BlockValuesRefs: if nr_of_refs < self.clear_counter // 2: self.clear_counter = self.clear_counter // 2 elif nr_of_refs > self.clear_counter: - self.clear_counter = self.clear_counter * 2 + self.clear_counter = min(self.clear_counter * 2, nr_of_refs) def add_reference(self, blk: Block) -> None: """Adds a new reference to our reference collection. From 4423d19e27da9967c6eecd9acab5f9c77a64623c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 14 Oct 2023 17:22:01 +0200 Subject: [PATCH 3/8] Add max --- pandas/_libs/internals.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 438e2db74fb5c..8a8da6a054e27 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -906,7 +906,7 @@ cdef class BlockValuesRefs: ] nr_of_refs = len(self.referenced_blocks) if nr_of_refs < self.clear_counter // 2: - self.clear_counter = self.clear_counter // 2 + self.clear_counter = max(self.clear_counter // 2, 500) elif nr_of_refs > self.clear_counter: self.clear_counter = min(self.clear_counter * 2, nr_of_refs) From d4c159b54e3b28c4867a10c3fafb8799166fd120 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 14 Oct 2023 17:23:33 +0200 Subject: [PATCH 4/8] Add max --- pandas/_libs/internals.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8a8da6a054e27..17f1c8da03f9f 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -908,7 +908,7 @@ cdef class BlockValuesRefs: if nr_of_refs < self.clear_counter // 2: self.clear_counter = max(self.clear_counter // 2, 500) elif nr_of_refs > self.clear_counter: - self.clear_counter = min(self.clear_counter * 2, nr_of_refs) + self.clear_counter = max(self.clear_counter * 2, nr_of_refs) def add_reference(self, blk: Block) -> None: """Adds a new reference to our reference collection. From da639c87f4e8e6fb2eea02c38845d4242255018f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 14 Oct 2023 18:13:27 +0200 Subject: [PATCH 5/8] Add comment --- pandas/_libs/internals.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 17f1c8da03f9f..fdfb8e1c99f6e 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -900,6 +900,10 @@ cdef class BlockValuesRefs: self.clear_counter = 500 # set reasonably high def _clear_dead_references(self, force=False) -> None: + # Use exponential backoff to decide when we want to clear references + # if force=False. Clearing for every insertion causes slowdowns if + # all these objects stay alive, e.g. df.items() for wide DataFrames + # see GH#55245 and GH#55008 if force or len(self.referenced_blocks) > self.clear_counter: self.referenced_blocks = [ ref for ref in self.referenced_blocks if ref() is not None From 4d8c8fbf46c177c545e269653065d0d52856f7cc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 14 Oct 2023 19:39:54 +0200 Subject: [PATCH 6/8] Add test for exponential backoff --- pandas/tests/copy_view/test_internals.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 9180bd5a3a426..77841e7cdc357 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -119,3 +119,32 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): else: for col in df.columns: assert not np.shares_memory(get_array(df, col), get_array(df2, col)) + + +def test_exponential_backoff(): + df = DataFrame({"a": [1, 2, 3]}) + for i in range(490): + df.copy(deep=False) + + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491 + + df = DataFrame({"a": [1, 2, 3]}) + dfs = [df.copy(deep=False) for i in range(510)] + + for i in range(20): + df.copy(deep=False) + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531 + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + for i in range(500): + df.copy(deep=False) + + # Don't reduce since we still have over 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + dfs = dfs[:300] + for i in range(500): + df.copy(deep=False) + + # Reduce since there are less than 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 500 From dd202a664276b6cdbb659a3177b74ef4e643a41f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 14 Oct 2023 20:13:09 +0200 Subject: [PATCH 7/8] Update doc/source/whatsnew/v2.1.2.rst Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 26fc64c9b93f3..3f2c506554880 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) - Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) -- Fixed performance regression in Copy-on-Write mechanism (:issue:`55256`, :issue:`55245`) +- Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: From 4664c52cbfbc79186cf76c39f6265d6845e1b61a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 21 Oct 2023 20:20:45 +0200 Subject: [PATCH 8/8] Update test_internals.py --- pandas/tests/copy_view/test_internals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 77841e7cdc357..a727331307d7e 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -122,6 +122,7 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): def test_exponential_backoff(): + # GH#55518 df = DataFrame({"a": [1, 2, 3]}) for i in range(490): df.copy(deep=False)