
Commit 66ec371

Backport PR #55518 on branch 2.1.x (CoW: Use exponential backoff when clearing dead references) (#55625)
Backport PR #55518: CoW: Use exponential backoff when clearing dead references

Co-authored-by: Patrick Hoefler <[email protected]>
1 parent c26935c commit 66ec371

File tree

3 files changed: +49, -6 lines

  doc/source/whatsnew/v2.1.2.rst
  pandas/_libs/internals.pyx
  pandas/tests/copy_view/test_internals.py

doc/source/whatsnew/v2.1.2.rst (+1)

@@ -17,6 +17,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)
 - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`)
 - Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`)
+- Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_212.bug_fixes:

pandas/_libs/internals.pyx (+18, -6)

@@ -944,17 +944,29 @@ cdef class BlockValuesRefs:
     """
     cdef:
         public list referenced_blocks
+        public int clear_counter
 
     def __cinit__(self, blk: SharedBlock | None = None) -> None:
         if blk is not None:
             self.referenced_blocks = [weakref.ref(blk)]
         else:
             self.referenced_blocks = []
-
-    def _clear_dead_references(self) -> None:
-        self.referenced_blocks = [
-            ref for ref in self.referenced_blocks if ref() is not None
-        ]
+        self.clear_counter = 500  # set reasonably high
+
+    def _clear_dead_references(self, force=False) -> None:
+        # Use exponential backoff to decide when we want to clear references
+        # if force=False. Clearing for every insertion causes slowdowns if
+        # all these objects stay alive, e.g. df.items() for wide DataFrames
+        # see GH#55245 and GH#55008
+        if force or len(self.referenced_blocks) > self.clear_counter:
+            self.referenced_blocks = [
+                ref for ref in self.referenced_blocks if ref() is not None
+            ]
+            nr_of_refs = len(self.referenced_blocks)
+            if nr_of_refs < self.clear_counter // 2:
+                self.clear_counter = max(self.clear_counter // 2, 500)
+            elif nr_of_refs > self.clear_counter:
+                self.clear_counter = max(self.clear_counter * 2, nr_of_refs)
 
     def add_reference(self, blk: SharedBlock) -> None:
         """Adds a new reference to our reference collection.

@@ -988,6 +1000,6 @@ cdef class BlockValuesRefs:
         -------
         bool
         """
-        self._clear_dead_references()
+        self._clear_dead_references(force=True)
         # Checking for more references than block pointing to itself
         return len(self.referenced_blocks) > 1
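To make the diff above easier to follow, here is a minimal pure-Python sketch of the backoff rule it implements. Tracker and Block are illustrative stand-ins, not pandas API; the thresholds (start at 500, halve but never drop below 500, double when most references are still alive) mirror the values in the diff.

import weakref


class Block:
    """Stand-in for a pandas block; any weak-referenceable object works."""


class Tracker:
    """Pure-Python model of the backoff rule in BlockValuesRefs above."""

    def __init__(self):
        self.referenced_blocks = []
        self.clear_counter = 500  # same starting threshold as the diff

    def _clear_dead_references(self, force=False):
        # Only pay for the O(n) scan when forced or when the list has
        # grown past the current threshold.
        if force or len(self.referenced_blocks) > self.clear_counter:
            self.referenced_blocks = [
                ref for ref in self.referenced_blocks if ref() is not None
            ]
            nr_of_refs = len(self.referenced_blocks)
            if nr_of_refs < self.clear_counter // 2:
                # Few survivors: shrink the threshold, but never below 500.
                self.clear_counter = max(self.clear_counter // 2, 500)
            elif nr_of_refs > self.clear_counter:
                # Mostly live references: double the threshold so the next
                # scan happens much later (the exponential backoff).
                self.clear_counter = max(self.clear_counter * 2, nr_of_refs)

    def add_reference(self, blk):
        self._clear_dead_references()
        self.referenced_blocks.append(weakref.ref(blk))


tracker = Tracker()
blocks = [Block() for _ in range(10_000)]  # all stay alive, as in df.items()
for blk in blocks:
    tracker.add_reference(blk)
print(len(tracker.referenced_blocks), tracker.clear_counter)  # 10000 16000

Run against 10,000 objects that all stay alive, the list is only scanned a handful of times (once each time the threshold doubles) instead of once per insertion, which is the slowdown described in GH#55245.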

pandas/tests/copy_view/test_internals.py (+30)

@@ -119,3 +119,33 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
     else:
         for col in df.columns:
             assert not np.shares_memory(get_array(df, col), get_array(df2, col))
+
+
+def test_exponential_backoff():
+    # GH#55518
+    df = DataFrame({"a": [1, 2, 3]})
+    for i in range(490):
+        df.copy(deep=False)
+
+    assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491
+
+    df = DataFrame({"a": [1, 2, 3]})
+    dfs = [df.copy(deep=False) for i in range(510)]
+
+    for i in range(20):
+        df.copy(deep=False)
+    assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531
+    assert df._mgr.blocks[0].refs.clear_counter == 1000
+
+    for i in range(500):
+        df.copy(deep=False)
+
+    # Don't reduce since we still have over 500 objects alive
+    assert df._mgr.blocks[0].refs.clear_counter == 1000
+
+    dfs = dfs[:300]
+    for i in range(500):
+        df.copy(deep=False)
+
+    # Reduce since there are less than 500 objects alive
+    assert df._mgr.blocks[0].refs.clear_counter == 500
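As a usage note, the counters exercised by the new test can also be inspected interactively. The sketch below assumes a pandas build that contains this backport (2.1.2 or later) and pokes at the same internal, non-public attributes the test uses (df._mgr.blocks[0].refs); the expected numbers are an assumption based on the test's accounting (one reference for the frame itself plus one per shallow copy).

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
# Keep the shallow copies alive so their weak references cannot be cleared.
views = [df.copy(deep=False) for _ in range(600)]

refs = df._mgr.blocks[0].refs
# More than 500 live references have been tracked, so the threshold should
# have doubled (500 -> 1000) instead of triggering a scan on every copy.
print(len(refs.referenced_blocks))  # expected 601: df itself + 600 copies
print(refs.clear_counter)           # expected 1000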
