From d968cb3cd952751a5f2f29c28fe03ec39fb8852e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Feb 2023 11:10:26 +0100 Subject: [PATCH 01/22] Add ref tracking to Block --- pandas/_libs/internals.pyx | 60 +++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 3333ac1115177..3ad2c80994dbd 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -580,9 +580,16 @@ cdef class SharedBlock: """ cdef: public BlockPlacement _mgr_locs + public BlockValuesRefs refs readonly int ndim - def __cinit__(self, values, placement: BlockPlacement, ndim: int): + def __cinit__( + self, + values, + placement: BlockPlacement, + ndim: int, + refs: BlockValuesRefs | None = None, + ): """ Parameters ---------- @@ -594,6 +601,10 @@ cdef class SharedBlock: """ self._mgr_locs = placement self.ndim = ndim + if refs is None: + self.refs = BlockValuesRefs(self) + else: + self.refs = refs cpdef __reduce__(self): args = (self.values, self.mgr_locs.indexer, self.ndim) @@ -619,9 +630,15 @@ cdef class NumpyBlock(SharedBlock): cdef: public ndarray values - def __cinit__(self, ndarray values, BlockPlacement placement, int ndim): + def __cinit__( + self, + ndarray values, + BlockPlacement placement, + int ndim, + refs: BlockValuesRefs | None = None, + ): # set values here; the (implicit) call to SharedBlock.__cinit__ will - # set placement and ndim + # set placement, ndim and refs self.values = values cpdef NumpyBlock getitem_block_index(self, slice slicer): @@ -641,9 +658,15 @@ cdef class NDArrayBackedBlock(SharedBlock): cdef public: NDArrayBacked values - def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim): + def __cinit__( + self, + NDArrayBacked values, + BlockPlacement placement, + int ndim, + refs: BlockValuesRefs | None = None, + ): # set values here; the (implicit) call to SharedBlock.__cinit__ will - # set placement and ndim + # set placement, ndim and refs self.values = values cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer): @@ -660,9 +683,15 @@ cdef class Block(SharedBlock): cdef: public object values - def __cinit__(self, object values, BlockPlacement placement, int ndim): + def __cinit__( + self, + object values, + BlockPlacement placement, + int ndim, + refs: BlockValuesRefs | None = None, + ): # set values here; the (implicit) call to SharedBlock.__cinit__ will - # set placement and ndim + # set placement, ndim and refs self.values = values @@ -839,3 +868,20 @@ cdef class BlockManager: return type(self)( tuple(new_blocks), new_axes, new_refs, parent=self, verify_integrity=False ) + + +cdef class BlockValuesRefs: + cdef: + public list referenced_blocks + + def __cinit__(self, blk: SharedBlock) -> None: + self.referenced_blocks = [weakref.ref(blk)] + + cdef void add_reference(self, blk: SharedBlock): + self.referenced_blocks.append(weakref.ref(blk)) + + cdef bint has_reference(self): + self.referenced_blocks = [ + obj for obj in self.referenced_blocks if obj() is not None + ] + return len(self.referenced_blocks) > 0 From 6a41cdb9dbe4837a18fcb6b63cc480962a28f238 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Feb 2023 14:04:18 +0100 Subject: [PATCH 02/22] Add typing and poc in copy --- pandas/_libs/internals.pyi | 14 +++++++++++++- pandas/_libs/internals.pyx | 8 +++++--- pandas/core/internals/blocks.py | 12 ++++++++++-- pandas/core/internals/managers.py | 15 +++------------ pandas/tests/copy_view/test_methods.py | 3 ++- 5 files changed, 33 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 79bdbea71e4d8..5dfcc3726c84f 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -4,6 +4,7 @@ from typing import ( final, overload, ) +import weakref import numpy as np @@ -59,8 +60,13 @@ class SharedBlock: _mgr_locs: BlockPlacement ndim: int values: ArrayLike + refs: BlockValuesRefs def __init__( - self, values: ArrayLike, placement: BlockPlacement, ndim: int + self, + values: ArrayLike, + placement: BlockPlacement, + ndim: int, + refs: BlockValuesRefs | None = ..., ) -> None: ... class NumpyBlock(SharedBlock): @@ -87,3 +93,9 @@ class BlockManager: ) -> None: ... def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ... def _rebuild_blknos_and_blklocs(self) -> None: ... + +class BlockValuesRefs: + referenced_blocks: list[weakref.ref] + def __init__(self, blk: SharedBlock) -> None: ... + def add_reference(self, blk: SharedBlock) -> None: ... + def has_reference(self) -> bool: ... diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 3ad2c80994dbd..8ffddc8e6394a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -604,6 +604,7 @@ cdef class SharedBlock: if refs is None: self.refs = BlockValuesRefs(self) else: + refs.add_reference(self) self.refs = refs cpdef __reduce__(self): @@ -877,11 +878,12 @@ cdef class BlockValuesRefs: def __cinit__(self, blk: SharedBlock) -> None: self.referenced_blocks = [weakref.ref(blk)] - cdef void add_reference(self, blk: SharedBlock): + def add_reference(self, blk: SharedBlock) -> None: self.referenced_blocks.append(weakref.ref(blk)) - cdef bint has_reference(self): + def has_reference(self) -> bool: self.referenced_blocks = [ obj for obj in self.referenced_blocks if obj() is not None ] - return len(self.referenced_blocks) > 0 + # Checking for more references than block pointing to itself + return len(self.referenced_blocks) > 1 diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index bb555690b867a..7b520f06a3b8b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -20,7 +20,10 @@ lib, writers, ) -from pandas._libs.internals import BlockPlacement +from pandas._libs.internals import ( + BlockPlacement, + BlockValuesRefs, +) from pandas._libs.missing import NA from pandas._libs.tslibs import IncompatibleFrequency from pandas._typing import ( @@ -144,6 +147,7 @@ class Block(PandasObject): values: np.ndarray | ExtensionArray ndim: int + refs: BlockValuesRefs __init__: Callable __slots__ = () @@ -506,9 +510,13 @@ def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block: def copy(self, deep: bool = True) -> Block: """copy constructor""" values = self.values + refs: BlockValuesRefs | None if deep: values = values.copy() - return type(self)(values, placement=self._mgr_locs, ndim=self.ndim) + refs = None + else: + refs = self.refs + return type(self)(values, placement=self._mgr_locs, ndim=self.ndim, refs=refs) # --------------------------------------------------------------------- # Replace diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 81c5810d29456..2b04a079f5ffb 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -256,8 +256,9 @@ def _has_no_reference_block(self, blkno: int) -> bool: """ # TODO(CoW) include `or self.refs[blkno]() is None` ? return ( - self.refs is None or self.refs[blkno] is None - ) and weakref.getweakrefcount(self.blocks[blkno]) == 0 + (self.refs is None or self.refs[blkno] is None) + and weakref.getweakrefcount(self.blocks[blkno]) == 0 + ) and not self.blocks[blkno].refs.has_reference() def _clear_reference_block(self, blkno: int) -> None: """ @@ -629,17 +630,7 @@ def copy_func(ax): new_axes = list(self.axes) res = self.apply("copy", deep=deep) - new_refs: list[weakref.ref | None] | None - if deep: - new_refs = None - parent = None - else: - new_refs = [weakref.ref(blk) for blk in self.blocks] - parent = self - res.axes = new_axes - res.refs = new_refs - res.parent = parent if self.ndim > 1: # Avoid needing to re-compute these diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index fbd4bbfb38c27..0e7f361257a61 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -36,7 +36,8 @@ def test_copy_shallow(using_copy_on_write): # the shallow copy still shares memory assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: - assert df_copy._mgr.refs is not None + assert df_copy._mgr.blocks[0].refs.has_reference() + assert df_copy._mgr.blocks[1].refs.has_reference() if using_copy_on_write: # mutating shallow copy doesn't mutate original From b4779d240aba645172f765b525f1df888ca40e87 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Feb 2023 16:18:16 +0100 Subject: [PATCH 03/22] Change reference tracking --- pandas/_libs/internals.pyx | 22 +- pandas/core/generic.py | 4 +- pandas/core/internals/blocks.py | 21 +- pandas/core/internals/concat.py | 18 +- pandas/core/internals/managers.py | 213 +++++--------------- pandas/core/internals/ops.py | 2 +- pandas/core/reshape/concat.py | 16 +- pandas/tests/copy_view/test_constructors.py | 4 +- pandas/tests/copy_view/test_internals.py | 34 +--- pandas/tests/copy_view/test_methods.py | 3 +- 10 files changed, 87 insertions(+), 250 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8ffddc8e6394a..3ce0e2bb94eef 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -649,7 +649,7 @@ cdef class NumpyBlock(SharedBlock): Assumes self.ndim == 2 """ new_values = self.values[..., slicer] - return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) cdef class NDArrayBackedBlock(SharedBlock): @@ -677,7 +677,7 @@ cdef class NDArrayBackedBlock(SharedBlock): Assumes self.ndim == 2 """ new_values = self.values[..., slicer] - return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) cdef class Block(SharedBlock): @@ -703,15 +703,11 @@ cdef class BlockManager: public list axes public bint _known_consolidated, _is_consolidated public ndarray _blknos, _blklocs - public list refs - public object parent def __cinit__( self, blocks=None, axes=None, - refs=None, - parent=None, verify_integrity=True, ): # None as defaults for unpickling GH#42345 @@ -725,8 +721,6 @@ cdef class BlockManager: self.blocks = blocks self.axes = axes.copy() # copy to make sure we are not remotely-mutable - self.refs = refs - self.parent = parent # Populate known_consolidate, blknos, and blklocs lazily self._known_consolidated = False @@ -835,16 +829,12 @@ cdef class BlockManager: ndarray blknos, blklocs nbs = [] - nrefs = [] for blk in self.blocks: nb = blk.getitem_block_index(slobj) nbs.append(nb) - nrefs.append(weakref.ref(blk)) new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] - mgr = type(self)( - tuple(nbs), new_axes, nrefs, parent=self, verify_integrity=False - ) + mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False) # We can avoid having to rebuild blklocs/blknos blklocs = self._blklocs @@ -857,7 +847,7 @@ cdef class BlockManager: def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: if axis == 0: - new_blocks, new_refs = self._slice_take_blocks_ax0(slobj) + new_blocks = self._slice_take_blocks_ax0(slobj) elif axis == 1: return self._get_index_slice(slobj) else: @@ -866,9 +856,7 @@ cdef class BlockManager: new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - return type(self)( - tuple(new_blocks), new_axes, new_refs, parent=self, verify_integrity=False - ) + return type(self)(tuple(new_blocks), new_axes, verify_integrity=False) cdef class BlockValuesRefs: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8fa86e80e1a44..2720d064d6f1c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -768,8 +768,8 @@ def swapaxes( ) assert isinstance(new_mgr, BlockManager) assert isinstance(self._mgr, BlockManager) - new_mgr.parent = self._mgr - new_mgr.refs = [weakref.ref(self._mgr.blocks[0])] + new_mgr.blocks[0].refs = self._mgr.blocks[0].refs + new_mgr.blocks[0].refs.add_reference(new_mgr.blocks[0]) return self._constructor(new_mgr).__finalize__(self, method="swapaxes") elif (copy or copy is None) and self._mgr.is_single_block: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7b520f06a3b8b..e941a40357228 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -269,7 +269,8 @@ def getitem_block(self, slicer: slice | npt.NDArray[np.intp]) -> Block: if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") - return type(self)(new_values, new_mgr_locs, self.ndim) + refs = self.refs if isinstance(slicer, slice) else None + return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) @final def getitem_block_columns( @@ -285,7 +286,7 @@ def getitem_block_columns( if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") - return type(self)(new_values, new_mgr_locs, self.ndim) + return type(self)(new_values, new_mgr_locs, self.ndim, refs=self.refs) @final def _can_hold_element(self, element: Any) -> bool: @@ -1352,6 +1353,10 @@ def delete(self, loc) -> list[Block]: new_blocks: list[Block] = [] previous_loc = -1 + # TODO(CoW): This is tricky, if parent block goes out of scope + # all split blocks are referencing each other even though they + # don't share data + refs = self.refs if self.refs.has_reference() else None for idx in loc: if idx == previous_loc + 1: @@ -1362,7 +1367,9 @@ def delete(self, loc) -> list[Block]: # argument type "Tuple[slice, slice]" values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] # noqa locs = mgr_locs_arr[previous_loc + 1 : idx] - nb = type(self)(values, placement=BlockPlacement(locs), ndim=self.ndim) + nb = type(self)( + values, placement=BlockPlacement(locs), ndim=self.ndim, refs=refs + ) new_blocks.append(nb) previous_loc = idx @@ -1818,7 +1825,7 @@ def getitem_block_index(self, slicer: slice) -> ExtensionBlock: # GH#42787 in principle this is equivalent to values[..., slicer], but we don't # require subclasses of ExtensionArray to support that form (for now). new_values = self.values[slicer] - return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) def diff(self, n: int, axis: AxisInt = 1) -> list[Block]: # only reached with ndim == 2 and axis == 1 @@ -2119,7 +2126,9 @@ def new_block_2d(values: ArrayLike, placement: BlockPlacement): return klass(values, ndim=2, placement=placement) -def new_block(values, placement, *, ndim: int) -> Block: +def new_block( + values, placement, *, ndim: int, refs: BlockValuesRefs | None = None +) -> Block: # caller is responsible for ensuring values is NOT a PandasArray if not isinstance(placement, BlockPlacement): @@ -2130,7 +2139,7 @@ def new_block(values, placement, *, ndim: int) -> Block: klass = get_block_type(values.dtype) values = maybe_coerce_values(values) - return klass(values, ndim=ndim, placement=placement) + return klass(values, ndim=ndim, placement=placement, refs=refs) def check_ndim(values, placement: BlockPlacement, ndim: int) -> None: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d46b51a2ee954..bedd4d92a1ea3 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -7,7 +7,6 @@ Sequence, cast, ) -import weakref import numpy as np @@ -62,10 +61,7 @@ ensure_block_shape, new_block_2d, ) -from pandas.core.internals.managers import ( - BlockManager, - using_copy_on_write, -) +from pandas.core.internals.managers import BlockManager if TYPE_CHECKING: from pandas import Index @@ -271,8 +267,6 @@ def _concat_managers_axis0( offset = 0 blocks = [] - refs: list[weakref.ref | None] = [] - parents: list = [] for i, mgr in enumerate(mgrs): # If we already reindexed, then we definitely don't need another copy made_copy = had_reindexers[i] @@ -289,17 +283,9 @@ def _concat_managers_axis0( nb._mgr_locs = nb._mgr_locs.add(offset) blocks.append(nb) - if not made_copy and not copy and using_copy_on_write(): - refs.extend([weakref.ref(blk) for blk in mgr.blocks]) - parents.append(mgr) - elif using_copy_on_write(): - refs.extend([None] * len(mgr.blocks)) - offset += len(mgr.items) - result_parents = parents if parents else None - result_ref = refs if refs else None - result = BlockManager(tuple(blocks), axes, parent=result_parents, refs=result_ref) + result = BlockManager(tuple(blocks), axes) return result diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2b04a079f5ffb..66b77b13905d2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -11,7 +11,6 @@ cast, ) import warnings -import weakref import numpy as np @@ -146,8 +145,6 @@ class BaseBlockManager(DataManager): _blklocs: npt.NDArray[np.intp] blocks: tuple[Block, ...] axes: list[Index] - refs: list[weakref.ref | None] | None - parent: object @property def ndim(self) -> int: @@ -156,17 +153,11 @@ def ndim(self) -> int: _known_consolidated: bool _is_consolidated: bool - def __init__(self, blocks, axes, refs=None, verify_integrity: bool = True) -> None: + def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: raise NotImplementedError @classmethod - def from_blocks( - cls: type_t[T], - blocks: list[Block], - axes: list[Index], - refs: list[weakref.ref | None] | None = None, - parent: object = None, - ) -> T: + def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T: raise NotImplementedError @property @@ -254,20 +245,7 @@ def _has_no_reference_block(self, blkno: int) -> bool: (whether it references another array or is itself being referenced) Returns True if the block has no references. """ - # TODO(CoW) include `or self.refs[blkno]() is None` ? - return ( - (self.refs is None or self.refs[blkno] is None) - and weakref.getweakrefcount(self.blocks[blkno]) == 0 - ) and not self.blocks[blkno].refs.has_reference() - - def _clear_reference_block(self, blkno: int) -> None: - """ - Clear any reference for column `i`. - """ - if self.refs is not None: - self.refs[blkno] = None - if com.all_none(*self.refs): - self.parent = None + return not self.blocks[blkno].refs.has_reference() def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) @@ -574,23 +552,17 @@ def _combine( # TODO(CoW) we could optimize here if we know that the passed blocks # are fully "owned" (eg created from an operation, not coming from # an existing manager) - new_refs: list[weakref.ref | None] | None = None if copy else [] for b in blocks: nb = b.copy(deep=copy) nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) new_blocks.append(nb) - if not copy: - # None has no attribute "append" - new_refs.append(weakref.ref(b)) # type: ignore[union-attr] axes = list(self.axes) if index is not None: axes[-1] = index axes[0] = self.items.take(indexer) - return type(self).from_blocks( - new_blocks, axes, new_refs, parent=None if copy else self - ) + return type(self).from_blocks(new_blocks, axes) @property def nblocks(self) -> int: @@ -654,7 +626,7 @@ def consolidate(self: T) -> T: if self.is_consolidated(): return self - bm = type(self)(self.blocks, self.axes, self.refs, verify_integrity=False) + bm = type(self)(self.blocks, self.axes, verify_integrity=False) bm._is_consolidated = False bm._consolidate_inplace() return bm @@ -716,14 +688,14 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks, new_refs = self._slice_take_blocks_ax0( + new_blocks = self._slice_take_blocks_ax0( indexer, fill_value=fill_value, only_slice=only_slice, use_na_proxy=use_na_proxy, ) - parent = None if com.all_none(*new_refs) else self else: + print(self.blocks) new_blocks = [ blk.take_nd( indexer, @@ -734,13 +706,11 @@ def reindex_indexer( ) for blk in self.blocks ] - new_refs = None - parent = None new_axes = list(self.axes) new_axes[axis] = new_axis - new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs, parent=parent) + new_mgr = type(self).from_blocks(new_blocks, new_axes) if axis == 1: # We can avoid the need to rebuild these new_mgr._blknos = self.blknos.copy() @@ -754,7 +724,7 @@ def _slice_take_blocks_ax0( only_slice: bool = False, *, use_na_proxy: bool = False, - ) -> tuple[list[Block], list[weakref.ref | None]]: + ) -> list[Block]: """ Slice/take blocks along axis=0. @@ -789,9 +759,7 @@ def _slice_take_blocks_ax0( if sllen == 0: return [], [] bp = BlockPlacement(slice(0, sllen)) - return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)], [ - weakref.ref(blk) - ] + return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -805,9 +773,7 @@ def _slice_take_blocks_ax0( ) for i, ml in enumerate(slobj) ] - # We have - # all(np.shares_memory(nb.values, blk.values) for nb in blocks) - return blocks, [weakref.ref(blk)] * len(blocks) + return blocks else: bp = BlockPlacement(slice(0, sllen)) return [ @@ -817,7 +783,7 @@ def _slice_take_blocks_ax0( new_mgr_locs=bp, fill_value=fill_value, ) - ], [None] + ] if sl_type == "slice": blknos = self.blknos[slobj] @@ -833,7 +799,6 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). blocks = [] - refs: list[weakref.ref | None] = [] group = not only_slice for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): if blkno == -1: @@ -846,7 +811,6 @@ def _slice_take_blocks_ax0( use_na_proxy=use_na_proxy, ) ) - refs.append(None) else: blk = self.blocks[blkno] @@ -860,7 +824,6 @@ def _slice_take_blocks_ax0( newblk = blk.copy(deep=False) newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) blocks.append(newblk) - refs.append(weakref.ref(blk)) else: # GH#32779 to avoid the performance penalty of copying, @@ -873,7 +836,6 @@ def _slice_take_blocks_ax0( if isinstance(taker, slice): nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) blocks.append(nb) - refs.append(weakref.ref(blk)) elif only_slice: # GH#33597 slice instead of take, so we get # views instead of copies @@ -883,13 +845,11 @@ def _slice_take_blocks_ax0( nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) - refs.append(weakref.ref(blk)) else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) blocks.append(nb) - refs.append(None) - return blocks, refs + return blocks def _make_na_block( self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False @@ -974,8 +934,6 @@ def __init__( self, blocks: Sequence[Block], axes: Sequence[Index], - refs: list[weakref.ref | None] | None = None, - parent: object = None, verify_integrity: bool = True, ) -> None: @@ -1007,28 +965,13 @@ def _verify_integrity(self) -> None: f"block items\n# manager items: {len(self.items)}, # " f"tot_items: {tot_items}" ) - if self.refs is not None: - if len(self.refs) != len(self.blocks): - raise AssertionError( - "Number of passed refs must equal the number of blocks: " - f"{len(self.refs)} refs vs {len(self.blocks)} blocks." - "\nIf you see this error, please report a bug at " - "https://github.com/pandas-dev/pandas/issues" - ) @classmethod - def from_blocks( - cls, - blocks: list[Block], - axes: list[Index], - refs: list[weakref.ref | None] | None = None, - parent: object = None, - ) -> BlockManager: + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. """ - parent = parent if using_copy_on_write() else None - return cls(blocks, axes, refs, parent, verify_integrity=False) + return cls(blocks, axes, verify_integrity=False) # ---------------------------------------------------------------- # Indexing @@ -1047,10 +990,14 @@ def fast_xs(self, loc: int) -> SingleBlockManager: """ if len(self.blocks) == 1: result = self.blocks[0].iget((slice(None), loc)) - block = new_block(result, placement=slice(0, len(result)), ndim=1) # in the case of a single block, the new block is a view - ref = weakref.ref(self.blocks[0]) - return SingleBlockManager(block, self.axes[0], [ref], parent=self) + block = new_block( + result, + placement=slice(0, len(result)), + ndim=1, + refs=self.blocks[0].refs, + ) + return SingleBlockManager(block, self.axes[0]) dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) @@ -1093,10 +1040,10 @@ def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: # shortcut for select a single-dim from a 2-dim BM bp = BlockPlacement(slice(0, len(values))) - nb = type(block)(values, placement=bp, ndim=1) - ref = weakref.ref(block) if track_ref else None - parent = self if track_ref else None - return SingleBlockManager(nb, self.axes[1], [ref], parent) + nb = type(block)( + values, placement=bp, ndim=1, refs=block.refs if track_ref else None + ) + return SingleBlockManager(nb, self.axes[1]) def iget_values(self, i: int) -> ArrayLike: """ @@ -1228,7 +1175,7 @@ def value_getitem(placement): self._iset_split_block(blkno_l, blk_locs) if len(removed_blknos): - # Remove blocks & update blknos and refs accordingly + # Remove blocks & update blknos accordingly is_deleted = np.zeros(self.nblocks, dtype=np.bool_) is_deleted[removed_blknos] = True @@ -1239,18 +1186,14 @@ def value_getitem(placement): self.blocks = tuple( blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) ) - if self.refs is not None: - self.refs = [ - ref - for i, ref in enumerate(self.refs) - if i not in set(removed_blknos) - ] if unfit_val_locs: unfit_idxr = np.concatenate(unfit_mgr_locs) unfit_count = len(unfit_idxr) new_blocks: list[Block] = [] + # TODO(CoW) is this always correct to assume that the new_blocks + # are not referencing anything else? if value_is_extension_type: # This code (ab-)uses the fact that EA blocks contain only # one item. @@ -1281,10 +1224,6 @@ def value_getitem(placement): self._blklocs[unfit_idxr] = np.arange(unfit_count) self.blocks += tuple(new_blocks) - # TODO(CoW) is this always correct to assume that the new_blocks - # are not referencing anything else? - if self.refs is not None: - self.refs = list(self.refs) + [None] * len(new_blocks) # Newly created block's dtype may already be present. self._known_consolidated = False @@ -1321,13 +1260,6 @@ def _iset_split_block( first_nb = nbs_tup[0] nbs_tup = tuple(nbs_tup[1:]) - if self.refs is not None: - self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) - - if value is not None: - # Only clear if we set new values - self._clear_reference_block(blkno_l) - nr_blocks = len(self.blocks) blocks_tup = ( self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup @@ -1357,7 +1289,6 @@ def _iset_single( if using_copy_on_write() and not self._has_no_reference_block(blkno): # perform Copy-on-Write and clear the reference copy = True - self._clear_reference_block(blkno) iloc = self.blklocs[loc] blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) return @@ -1366,7 +1297,6 @@ def _iset_single( old_blocks = self.blocks new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] self.blocks = new_blocks - self._clear_reference_block(blkno) return def column_setitem( @@ -1384,7 +1314,6 @@ def column_setitem( blocks = list(self.blocks) blocks[blkno] = blocks[blkno].copy() self.blocks = tuple(blocks) - self._clear_reference_block(blkno) # this manager is only created temporarily to mutate the values in place # so don't track references, otherwise the `setitem` would perform CoW again @@ -1418,6 +1347,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: value = ensure_block_shape(value, ndim=self.ndim) bp = BlockPlacement(slice(loc, loc + 1)) + # TODO(CoW) do we always "own" the passed `value`? block = new_block_2d(values=value, placement=bp) if not len(self.blocks): @@ -1430,9 +1360,6 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: self.axes[0] = new_axis self.blocks += (block,) - # TODO(CoW) do we always "own" the passed `value`? - if self.refs is not None: - self.refs += [None] self._known_consolidated = False @@ -1486,12 +1413,10 @@ def idelete(self, indexer) -> BlockManager: is_deleted[indexer] = True taker = (~is_deleted).nonzero()[0] - nbs, new_refs = self._slice_take_blocks_ax0(taker, only_slice=True) + nbs = self._slice_take_blocks_ax0(taker, only_slice=True) new_columns = self.items[~is_deleted] axes = [new_columns, self.axes[1]] - # TODO this might not be needed (can a delete ever be done in chained manner?) - parent = None if com.all_none(*new_refs) else self - return type(self)(tuple(nbs), axes, new_refs, parent, verify_integrity=False) + return type(self)(tuple(nbs), axes, verify_integrity=False) # ---------------------------------------------------------------- # Block-wise Operation @@ -1837,10 +1762,7 @@ def _consolidate_inplace(self) -> None: # the DataFrame's _item_cache. The exception is for newly-created # BlockManager objects not yet attached to a DataFrame. if not self.is_consolidated(): - if self.refs is None: - self.blocks = _consolidate(self.blocks) - else: - self.blocks, self.refs = _consolidate_with_refs(self.blocks, self.refs) + self.blocks = _consolidate(self.blocks) self._is_consolidated = True self._known_consolidated = True self._rebuild_blknos_and_blklocs() @@ -1862,8 +1784,6 @@ def __init__( self, block: Block, axis: Index, - refs: list[weakref.ref | None] | None = None, - parent: object = None, verify_integrity: bool = False, ) -> None: # Assertions disabled for performance @@ -1872,25 +1792,19 @@ def __init__( self.axes = [axis] self.blocks = (block,) - self.refs = refs - self.parent = parent if using_copy_on_write() else None @classmethod def from_blocks( cls, blocks: list[Block], axes: list[Index], - refs: list[weakref.ref | None] | None = None, - parent: object = None, ) -> SingleBlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. """ assert len(blocks) == 1 assert len(axes) == 1 - if refs is not None: - assert len(refs) == 1 - return cls(blocks[0], axes[0], refs, parent, verify_integrity=False) + return cls(blocks[0], axes[0], verify_integrity=False) @classmethod def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: @@ -1907,13 +1821,9 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: blk = self.blocks[0] arr = ensure_block_shape(blk.values, ndim=2) bp = BlockPlacement(0) - new_blk = type(blk)(arr, placement=bp, ndim=2) + new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs) axes = [columns, self.axes[0]] - refs: list[weakref.ref | None] = [weakref.ref(blk)] - parent = self if using_copy_on_write() else None - return BlockManager( - [new_blk], axes=axes, refs=refs, parent=parent, verify_integrity=False - ) + return BlockManager([new_blk], axes=axes, verify_integrity=False) def _has_no_reference(self, i: int = 0) -> bool: """ @@ -1921,9 +1831,7 @@ def _has_no_reference(self, i: int = 0) -> bool: (whether it references another array or is itself being referenced) Returns True if the column has no references. """ - return (self.refs is None or self.refs[0] is None) and weakref.getweakrefcount( - self.blocks[0] - ) == 0 + return not self.blocks[0].refs.has_reference() def __getstate__(self): block_values = [b.values for b in self.blocks] @@ -1991,19 +1899,18 @@ def getitem_mgr(self, indexer: slice | np.ndarray) -> SingleBlockManager: and com.is_bool_indexer(indexer) and indexer.all() ): - return type(self)(blk, self.index, [weakref.ref(blk)], parent=self) + return type(self)(blk.copy(deep=False), self.index) array = blk._slice(indexer) if array.ndim > 1: # This will be caught by Series._get_values raise ValueError("dimension-expanding indexing not allowed") bp = BlockPlacement(slice(0, len(array))) - block = type(blk)(array, placement=bp, ndim=1) + # TODO(CoW) in theory only need to track reference if new_array is a view + block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) new_idx = self.index[indexer] - # TODO(CoW) in theory only need to track reference if new_array is a view - ref = weakref.ref(blk) - return type(self)(block, new_idx, [ref], parent=self) + return type(self)(block, new_idx) def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: # Assertion disabled for performance @@ -2014,11 +1921,11 @@ def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: blk = self._block array = blk._slice(slobj) bp = BlockPlacement(slice(0, len(array))) - block = type(blk)(array, placement=bp, ndim=1) - new_index = self.index._getitem_slice(slobj) # TODO this method is only used in groupby SeriesSplitter at the moment, - # so passing refs / parent is not yet covered by the tests - return type(self)(block, new_index, [weakref.ref(blk)], parent=self) + # so passing refs is not yet covered by the tests + block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) + new_index = self.index._getitem_slice(slobj) + return type(self)(block, new_index) @property def index(self) -> Index: @@ -2064,8 +1971,6 @@ def setitem_inplace(self, indexer, value) -> None: """ if using_copy_on_write() and not self._has_no_reference(0): self.blocks = (self._block.copy(),) - self.refs = None - self.parent = None self._cache.clear() super().setitem_inplace(indexer, value) @@ -2080,9 +1985,6 @@ def idelete(self, indexer) -> SingleBlockManager: self.blocks = (nb,) self.axes[0] = self.axes[0].delete(indexer) self._cache.clear() - # clear reference since delete always results in a new array - self.refs = None - self.parent = None return self def fast_xs(self, loc): @@ -2302,31 +2204,6 @@ def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]: return tuple(new_blocks) -def _consolidate_with_refs( - blocks: tuple[Block, ...], refs -) -> tuple[tuple[Block, ...], list[weakref.ref | None]]: - """ - Merge blocks having same dtype, exclude non-consolidating blocks, handling - refs - """ - gkey = lambda x: x[0]._consolidate_key - grouper = itertools.groupby(sorted(zip(blocks, refs), key=gkey), gkey) - - new_blocks: list[Block] = [] - new_refs: list[weakref.ref | None] = [] - for (_can_consolidate, dtype), group_blocks_refs in grouper: - group_blocks, group_refs = list(zip(*list(group_blocks_refs))) - merged_blocks, consolidated = _merge_blocks( - list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate - ) - new_blocks = extend_blocks(merged_blocks, new_blocks) - if consolidated: - new_refs.extend([None]) - else: - new_refs.extend(group_refs) - return tuple(new_blocks), new_refs - - def _merge_blocks( blocks: list[Block], dtype: DtypeObj, can_consolidate: bool ) -> tuple[list[Block], bool]: diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 477dc98aa2b2b..24fc51a96d9df 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -36,7 +36,7 @@ def _iter_block_pairs( left_ea = blk_vals.ndim == 1 - rblks, _ = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) + rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) # Assertions are disabled for performance, but should hold: # if left_ea: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index f8220649bf890..f37b0604628c8 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -14,7 +14,6 @@ cast, overload, ) -import weakref import numpy as np @@ -551,8 +550,9 @@ def __init__( obj = sample._constructor(obj, columns=[name], copy=False) if using_copy_on_write(): # TODO(CoW): Remove when ref tracking in constructors works - obj._mgr.parent = original_obj # type: ignore[union-attr] - obj._mgr.refs = [weakref.ref(original_obj._mgr.blocks[0])] # type: ignore[union-attr] # noqa: E501 + for i in range(len(original_obj._mgr.blocks)): + obj._mgr.blocks[i].refs = original_obj._mgr.blocks[i].refs + obj._mgr.blocks[i].refs.add_reference(obj._mgr.blocks[i]) obj.columns = [new_name] @@ -612,13 +612,9 @@ def get_result(self): typ=get_option("mode.data_manager"), ) if using_copy_on_write() and not self.copy: - parents = [obj._mgr for obj in self.objs] - mgr.parent = parents # type: ignore[union-attr] - refs = [ - weakref.ref(obj._mgr.blocks[0]) # type: ignore[union-attr] - for obj in self.objs - ] - mgr.refs = refs # type: ignore[union-attr] + for i in range(len(self.objs)): + mgr.blocks[i].refs = self.objs[i]._mgr.blocks[0].refs + mgr.blocks[i].refs.add_reference(mgr.blocks[i]) df = cons(mgr, copy=False) df.columns = columns return df.__finalize__(self, method="concat") diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index c04c733e5ee1d..f5805455e326f 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -18,7 +18,7 @@ def test_series_from_series(using_copy_on_write): assert np.shares_memory(ser.values, result.values) if using_copy_on_write: - assert result._mgr.refs is not None + assert result._mgr.blocks[0].refs.has_reference() if using_copy_on_write: # mutating new series copy doesn't mutate original @@ -72,4 +72,4 @@ def test_series_from_series_with_reindex(using_copy_on_write): result = Series(ser, index=[0, 1, 2, 3]) assert not np.shares_memory(ser.values, result.values) if using_copy_on_write: - assert result._mgr.refs is None or result._mgr.refs[0] is None + assert not result._mgr.blocks[0].refs.has_reference() diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 506ae7d3465c5..67022e533dbc4 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -20,52 +20,32 @@ def test_consolidate(using_copy_on_write): subset = df[:] # each block of subset references a block of df - assert subset._mgr.refs is not None and all( - ref is not None for ref in subset._mgr.refs - ) + assert all(blk.refs.has_reference() for blk in subset._mgr.blocks) # consolidate the two int64 blocks subset._consolidate_inplace() # the float64 block still references the parent one because it still a view - assert subset._mgr.refs[0] is not None + assert subset._mgr.blocks[0].refs.has_reference() # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values) # but avoids caching df["b"] assert np.shares_memory(get_array(df, "b"), get_array(subset, "b")) # the new consolidated int64 block does not reference another - assert subset._mgr.refs[1] is None + assert not subset._mgr.blocks[1].refs.has_reference() # the parent dataframe now also only is linked for the float column - assert df._mgr._has_no_reference(0) - assert not df._mgr._has_no_reference(1) - assert df._mgr._has_no_reference(2) + assert not df._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[1].refs.has_reference() + assert not df._mgr.blocks[2].refs.has_reference() # and modifying subset still doesn't modify parent if using_copy_on_write: subset.iloc[0, 1] = 0.0 - assert df._mgr._has_no_reference(1) + assert not df._mgr.blocks[1].refs.has_reference() assert df.loc[0, "b"] == 0.1 -@td.skip_array_manager_invalid_test -def test_clear_parent(using_copy_on_write): - # ensure to clear parent reference if we are no longer viewing data from parent - if not using_copy_on_write: - pytest.skip("test only relevant when using copy-on-write") - - df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) - subset = df[:] - assert subset._mgr.parent is not None - - # replacing existing columns loses the references to the parent df - subset["a"] = 0 - assert subset._mgr.parent is not None - # when losing the last reference, also the parent should be reset - subset["b"] = 0 - assert subset._mgr.parent is None - - @pytest.mark.single_cpu @td.skip_array_manager_invalid_test def test_switch_options(): diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 0e7f361257a61..dc5711f1b2ddd 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -22,7 +22,8 @@ def test_copy(using_copy_on_write): # the deep copy doesn't share memory assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: - assert df_copy._mgr.refs is None + assert not df_copy._mgr.blocks[0].refs.has_reference() + assert not df_copy._mgr.blocks[1].refs.has_reference() # mutating copy doesn't mutate original df_copy.iloc[0, 0] = 0 From c7fe560459f1dc7412a3a55279e1c6d2af7b7450 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Feb 2023 16:50:03 +0100 Subject: [PATCH 04/22] Fix mypy --- pandas/compat/pickle_compat.py | 2 +- pandas/core/generic.py | 4 +++- pandas/core/internals/managers.py | 3 +-- pandas/core/reshape/concat.py | 10 +++++----- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 2fbd3a6cdb046..bfee616d52aac 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -168,7 +168,7 @@ def load_newobj(self) -> None: arr = np.array([], dtype="m8[ns]") obj = cls.__new__(cls, arr, arr.dtype) elif cls is BlockManager and not args: - obj = cls.__new__(cls, (), [], None, False) + obj = cls.__new__(cls, (), [], False) else: obj = cls.__new__(cls, *args) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2720d064d6f1c..2fc5c25dac039 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -769,7 +769,9 @@ def swapaxes( assert isinstance(new_mgr, BlockManager) assert isinstance(self._mgr, BlockManager) new_mgr.blocks[0].refs = self._mgr.blocks[0].refs - new_mgr.blocks[0].refs.add_reference(new_mgr.blocks[0]) + new_mgr.blocks[0].refs.add_reference( + new_mgr.blocks[0] # type: ignore[arg-type] + ) return self._constructor(new_mgr).__finalize__(self, method="swapaxes") elif (copy or copy is None) and self._mgr.is_single_block: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 66b77b13905d2..88d00e47ce694 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -695,7 +695,6 @@ def reindex_indexer( use_na_proxy=use_na_proxy, ) else: - print(self.blocks) new_blocks = [ blk.take_nd( indexer, @@ -757,7 +756,7 @@ def _slice_take_blocks_ax0( # GH#32959 EABlock would fail since we can't make 0-width # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: - return [], [] + return [] bp = BlockPlacement(slice(0, sllen)) return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] elif not allow_fill or self.ndim == 1: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index f37b0604628c8..042ea42407dff 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -550,9 +550,9 @@ def __init__( obj = sample._constructor(obj, columns=[name], copy=False) if using_copy_on_write(): # TODO(CoW): Remove when ref tracking in constructors works - for i in range(len(original_obj._mgr.blocks)): - obj._mgr.blocks[i].refs = original_obj._mgr.blocks[i].refs - obj._mgr.blocks[i].refs.add_reference(obj._mgr.blocks[i]) + for i in range(len(original_obj._mgr.blocks)): # type: ignore[union-attr] # noqa + obj._mgr.blocks[i].refs = original_obj._mgr.blocks[i].refs # type: ignore[union-attr] # noqa + obj._mgr.blocks[i].refs.add_reference(obj._mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa obj.columns = [new_name] @@ -613,8 +613,8 @@ def get_result(self): ) if using_copy_on_write() and not self.copy: for i in range(len(self.objs)): - mgr.blocks[i].refs = self.objs[i]._mgr.blocks[0].refs - mgr.blocks[i].refs.add_reference(mgr.blocks[i]) + mgr.blocks[i].refs = self.objs[i]._mgr.blocks[0].refs # type: ignore[union-attr] # noqa + mgr.blocks[i].refs.add_reference(mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa df = cons(mgr, copy=False) df.columns = columns return df.__finalize__(self, method="concat") From 47269ce34906f53b363472902ffd4c6b8902140a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Feb 2023 16:59:20 +0100 Subject: [PATCH 05/22] Add docs --- pandas/_libs/internals.pyx | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 3ce0e2bb94eef..021c229fe64c9 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -860,6 +860,11 @@ cdef class BlockManager: cdef class BlockValuesRefs: + """Tracks all references to a given array. + + Keeps track of all blocks (through weak references) that reference the same + data. + """ cdef: public list referenced_blocks @@ -867,9 +872,25 @@ cdef class BlockValuesRefs: self.referenced_blocks = [weakref.ref(blk)] def add_reference(self, blk: SharedBlock) -> None: + """Adds a new reference to our reference collection. + + Parameters + ---------- + blk: SharedBlock + The block that the new references should point to. + """ self.referenced_blocks.append(weakref.ref(blk)) def has_reference(self) -> bool: + """Checks if block has foreign references. + + A reference is only relevant if it is still alive. The reference to + ourselves does not count. + + Returns + ------- + bool + """ self.referenced_blocks = [ obj for obj in self.referenced_blocks if obj() is not None ] From 60340539a7b5823d7720a766352bce8a348db566 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Feb 2023 17:12:26 +0100 Subject: [PATCH 06/22] Add tests --- pandas/tests/copy_view/test_methods.py | 31 ++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index dc5711f1b2ddd..cf316f678fe71 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1216,3 +1216,34 @@ def test_isetitem(using_copy_on_write): assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) else: assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + + +def test_assigning_to_same_variable_removes_references(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df = df.reset_index() + arr = get_array(df, "a") + df.iloc[0, 1] = 100 # Write into a + + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_setitem_dont_track_unnecessary_references(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + + df["b"] = 100 + arr = get_array(df, "a") + df.iloc[0, 0] = 100 # Check that we correctly track reference + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_setitem_with_view_copies(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + view = df[:] + expected = df.copy() + + df["b"] = 100 + arr = get_array(df, "a") + df.iloc[0, 0] = 100 # Check that we correctly track reference + if using_copy_on_write: + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(view, expected) From 4d4f85622dcce8cdd929e346fc15275335978cee Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Feb 2023 19:14:49 +0100 Subject: [PATCH 07/22] Fix pylint --- pandas/core/reshape/concat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 042ea42407dff..43e0b77a90a85 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -550,8 +550,8 @@ def __init__( obj = sample._constructor(obj, columns=[name], copy=False) if using_copy_on_write(): # TODO(CoW): Remove when ref tracking in constructors works - for i in range(len(original_obj._mgr.blocks)): # type: ignore[union-attr] # noqa - obj._mgr.blocks[i].refs = original_obj._mgr.blocks[i].refs # type: ignore[union-attr] # noqa + for i, block in enumerate(original_obj._mgr.blocks): # type: ignore[union-attr] # noqa + obj._mgr.blocks[i].refs = block.refs # type: ignore[union-attr] # noqa obj._mgr.blocks[i].refs.add_reference(obj._mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa obj.columns = [new_name] @@ -612,8 +612,8 @@ def get_result(self): typ=get_option("mode.data_manager"), ) if using_copy_on_write() and not self.copy: - for i in range(len(self.objs)): - mgr.blocks[i].refs = self.objs[i]._mgr.blocks[0].refs # type: ignore[union-attr] # noqa + for i, obj in enumerate(self.objs): + mgr.blocks[i].refs = obj._mgr.blocks[0].refs # type: ignore[union-attr] # noqa mgr.blocks[i].refs.add_reference(mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa df = cons(mgr, copy=False) df.columns = columns From 71013221f0181f7f9ca154da59509a55d0c90804 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Feb 2023 19:16:02 +0100 Subject: [PATCH 08/22] Add dev docs for CoW --- doc/source/development/copy_on_write.rst | 32 ++++++++++++++++++++++++ doc/source/development/index.rst | 1 + 2 files changed, 33 insertions(+) create mode 100644 doc/source/development/copy_on_write.rst diff --git a/doc/source/development/copy_on_write.rst b/doc/source/development/copy_on_write.rst new file mode 100644 index 0000000000000..133f27391dfbf --- /dev/null +++ b/doc/source/development/copy_on_write.rst @@ -0,0 +1,32 @@ +.. _copy_on_write:: + +{{ header }} + +************* +Copy on write +************* + +Copy on Write is a mechanism to simplify the indexing API and improve +performance through avoiding copies if possible. +CoW means that any DataFrame or Series derived from another in any way always +behaves as a copy. + +Reference tracking +------------------ + +To be able to determine, if we have to make a copy when writing into a DataFrame, +we have to be aware, if the values are shared with another DataFrame. pandas +keeps track of all ``Blocks`` that share values with another block internally to +be able to tell when a copy needs to be triggered. The reference tracking +mechanism is implemented on the Block level. + +We use a custom reference tracker object, e.g. ``BlockValuesRefs`` that keeps +track of every block, whose values share memory with each other. The reference +is held through a weak-reference. Every two blocks that share some memory should +point to the same ``BlockValuesRefs`` object. If one block goes out of +scope, the reference to this block dies. As a consequence, the reference tracker +object always knows how many blocks are alive and share memory. + +We can ask the reference tracking object if there is another block alive that shares +data with us before writing into the values. We can trigger a copy before +writing if there is in fact another block alive. diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index c741441cf67a1..69f04494a271c 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -18,6 +18,7 @@ Development contributing_codebase maintaining internals + copy_on_write debugging_extensions extending developer From 189c7eac0553d47667559260937a2bd32cec0072 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 3 Feb 2023 20:41:31 +0100 Subject: [PATCH 09/22] Update copy_on_write.rst --- doc/source/development/copy_on_write.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/copy_on_write.rst b/doc/source/development/copy_on_write.rst index 133f27391dfbf..2c7a54c60cbc7 100644 --- a/doc/source/development/copy_on_write.rst +++ b/doc/source/development/copy_on_write.rst @@ -1,4 +1,4 @@ -.. _copy_on_write:: +.. _copy_on_write: {{ header }} From b79686926eac07c9c3b9d8e9c7e24be2b8d9e801 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Feb 2023 22:12:26 +0100 Subject: [PATCH 10/22] Add tests --- .../copy_view/test_core_functionalities.py | 77 +++++++++++++++++++ pandas/tests/copy_view/test_methods.py | 31 -------- 2 files changed, 77 insertions(+), 31 deletions(-) create mode 100644 pandas/tests/copy_view/test_core_functionalities.py diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py new file mode 100644 index 0000000000000..4461ff3758e89 --- /dev/null +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -0,0 +1,77 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_assigning_to_same_variable_removes_references(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df = df.reset_index() + arr = get_array(df, "a") + df.iloc[0, 1] = 100 # Write into a + + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_setitem_dont_track_unnecessary_references(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + + df["b"] = 100 + arr = get_array(df, "a") + df.iloc[0, 0] = 100 # Check that we correctly removed the reference + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_setitem_with_view_copies(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + view = df[:] + expected = df.copy() + + df["b"] = 100 + arr = get_array(df, "a") + df.iloc[0, 0] = 100 # Check that we correctly track reference + if using_copy_on_write: + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(view, expected) + + +def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + view = df[:] + + df["b"] = 100 + arr = get_array(df, "a") + view = None # noqa + df.iloc[0, 0] = 100 # Check that we correctly release the reference + if using_copy_on_write: + pytest.mark.xfail("blk.delete does not track references correctly") + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_out_of_scope(using_copy_on_write): + def func(): + df = DataFrame({"a": [1, 2], "b": 1.5, "c": 1}) + # create some subset + result = df[["a", "b"]] + return result + + result = func() + if using_copy_on_write: + assert not result._mgr.blocks[0].refs.has_reference() + assert not result._mgr.blocks[1].refs.has_reference() + + +def test_delete(using_copy_on_write): + df = DataFrame(np.random.randn(4, 3), columns=["a", "b", "c"]) + del df["b"] + if using_copy_on_write: + # TODO: This should not have references, delete makes a shallow copy + # but keeps the blocks alive + assert df._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[1].refs.has_reference() + + df = df[["a"]] + if using_copy_on_write: + assert not df._mgr.blocks[0].refs.has_reference() diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index cf316f678fe71..dc5711f1b2ddd 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1216,34 +1216,3 @@ def test_isetitem(using_copy_on_write): assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) else: assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - - -def test_assigning_to_same_variable_removes_references(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3]}) - df = df.reset_index() - arr = get_array(df, "a") - df.iloc[0, 1] = 100 # Write into a - - assert np.shares_memory(arr, get_array(df, "a")) - - -def test_setitem_dont_track_unnecessary_references(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) - - df["b"] = 100 - arr = get_array(df, "a") - df.iloc[0, 0] = 100 # Check that we correctly track reference - assert np.shares_memory(arr, get_array(df, "a")) - - -def test_setitem_with_view_copies(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) - view = df[:] - expected = df.copy() - - df["b"] = 100 - arr = get_array(df, "a") - df.iloc[0, 0] = 100 # Check that we correctly track reference - if using_copy_on_write: - assert not np.shares_memory(arr, get_array(df, "a")) - tm.assert_frame_equal(view, expected) From 4712c0b5a9d129ae92e9164045c63011e53fae7b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 4 Feb 2023 00:57:45 +0100 Subject: [PATCH 11/22] Fix xfail --- pandas/conftest.py | 1 + pandas/tests/copy_view/test_core_functionalities.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 64a8f0f9efc1d..08c6df08131c0 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1933,6 +1933,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py index 4461ff3758e89..f92bf26a31029 100644 --- a/pandas/tests/copy_view/test_core_functionalities.py +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -37,7 +37,7 @@ def test_setitem_with_view_copies(using_copy_on_write): tm.assert_frame_equal(view, expected) -def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write): +def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write, request): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) view = df[:] @@ -46,7 +46,10 @@ def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write): view = None # noqa df.iloc[0, 0] = 100 # Check that we correctly release the reference if using_copy_on_write: - pytest.mark.xfail("blk.delete does not track references correctly") + mark = pytest.mark.xfail( + reason="blk.delete does not track references correctly" + ) + request.node.add_marker(mark) assert np.shares_memory(arr, get_array(df, "a")) From f3f425c5911574a2e24c977f7bf21467ef1dd994 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 4 Feb 2023 00:57:50 +0100 Subject: [PATCH 12/22] Fix xfail --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 08c6df08131c0..64a8f0f9efc1d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1933,7 +1933,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" From 1c65bd8401d1944970791a0b34eb9fe3d8368134 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 6 Feb 2023 22:15:12 +0100 Subject: [PATCH 13/22] Update pandas/_libs/internals.pyx Co-authored-by: Joris Van den Bossche --- pandas/_libs/internals.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 021c229fe64c9..d821b9308c500 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -602,6 +602,9 @@ cdef class SharedBlock: self._mgr_locs = placement self.ndim = ndim if refs is None: + # if no refs are passed, that means we are creating a Block from + # new values that it uniquely owns -> start a new BlockValuesRefs + # object that only references this block self.refs = BlockValuesRefs(self) else: refs.add_reference(self) From 896b6a87485f5f3e4db4d53b00d78df436dde7b7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 6 Feb 2023 22:15:35 +0100 Subject: [PATCH 14/22] Update pandas/_libs/internals.pyx Co-authored-by: Joris Van den Bossche --- pandas/_libs/internals.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index d821b9308c500..d86ebb3f32465 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -607,6 +607,9 @@ cdef class SharedBlock: # object that only references this block self.refs = BlockValuesRefs(self) else: + # if refs are passed, this is the BlockValuesRefs object that is shared + # with the parent blocks which share the values, and a reference to this + # new block is added refs.add_reference(self) self.refs = refs From 1122d0cc191dda406499be93b9ea60cc0f708692 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 6 Feb 2023 22:15:44 +0100 Subject: [PATCH 15/22] Update doc/source/development/copy_on_write.rst Co-authored-by: Joris Van den Bossche --- doc/source/development/copy_on_write.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/copy_on_write.rst b/doc/source/development/copy_on_write.rst index 2c7a54c60cbc7..9d39c1a7ed42b 100644 --- a/doc/source/development/copy_on_write.rst +++ b/doc/source/development/copy_on_write.rst @@ -20,7 +20,7 @@ keeps track of all ``Blocks`` that share values with another block internally to be able to tell when a copy needs to be triggered. The reference tracking mechanism is implemented on the Block level. -We use a custom reference tracker object, e.g. ``BlockValuesRefs`` that keeps +We use a custom reference tracker object, ``BlockValuesRefs``, that keeps track of every block, whose values share memory with each other. The reference is held through a weak-reference. Every two blocks that share some memory should point to the same ``BlockValuesRefs`` object. If one block goes out of From cc1ad65d208430fde47d2a6ba6c3793c1faa9c45 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 6 Feb 2023 22:16:18 +0100 Subject: [PATCH 16/22] Update pandas/_libs/internals.pyx Co-authored-by: Joris Van den Bossche --- pandas/_libs/internals.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index d86ebb3f32465..a1bdc748f9c03 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -895,7 +895,7 @@ cdef class BlockValuesRefs: Returns ------- - bool + bool """ self.referenced_blocks = [ obj for obj in self.referenced_blocks if obj() is not None From 8c656a85b5b4fd0fc8cf9801a6d5ca338d8ea5ea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 6 Feb 2023 22:28:36 +0100 Subject: [PATCH 17/22] Add comment --- pandas/tests/copy_view/test_core_functionalities.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py index f92bf26a31029..8d22433e3a193 100644 --- a/pandas/tests/copy_view/test_core_functionalities.py +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -9,6 +9,7 @@ def test_assigning_to_same_variable_removes_references(using_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) df = df.reset_index() + assert df._mgr._has_no_reference(1) arr = get_array(df, "a") df.iloc[0, 1] = 100 # Write into a @@ -20,7 +21,9 @@ def test_setitem_dont_track_unnecessary_references(using_copy_on_write): df["b"] = 100 arr = get_array(df, "a") - df.iloc[0, 0] = 100 # Check that we correctly removed the reference + # We split the block in setitem, if we are not careful the new blocks will + # reference each other triggering a copy + df.iloc[0, 0] = 100 assert np.shares_memory(arr, get_array(df, "a")) @@ -44,8 +47,12 @@ def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write, reques df["b"] = 100 arr = get_array(df, "a") view = None # noqa - df.iloc[0, 0] = 100 # Check that we correctly release the reference + df.iloc[0, 0] = 100 if using_copy_on_write: + # Setitem split the block. Since the old block shared data with view + # all the new blocks are referencing view and each other. When view + # goes out of scope, they don't share data with any other block, + # so we should not trigger a copy mark = pytest.mark.xfail( reason="blk.delete does not track references correctly" ) From f44d9f262e9ad18133ef7046ef8d8881ecbaef6c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 6 Feb 2023 22:43:25 +0100 Subject: [PATCH 18/22] Adjust docstring --- pandas/_libs/internals.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index a1bdc748f9c03..c08e21d02aa6d 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -598,6 +598,8 @@ cdef class SharedBlock: placement : BlockPlacement ndim : int 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame + refs: BlockValuesRefs, optional + Ref tracking object or None if block does not have any refs. """ self._mgr_locs = placement self.ndim = ndim From a6fbcb9e6f8765f54fa9ce8768cf348377b9d449 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 20:28:31 +0100 Subject: [PATCH 19/22] Fix array manager test --- pandas/tests/copy_view/test_core_functionalities.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py index 8d22433e3a193..204e26b35d680 100644 --- a/pandas/tests/copy_view/test_core_functionalities.py +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -9,7 +9,8 @@ def test_assigning_to_same_variable_removes_references(using_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) df = df.reset_index() - assert df._mgr._has_no_reference(1) + if using_copy_on_write: + assert df._mgr._has_no_reference(1) arr = get_array(df, "a") df.iloc[0, 1] = 100 # Write into a From fc71adda341a78afc4b3c5a545160a6a17611b3a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 22:36:31 +0100 Subject: [PATCH 20/22] Switch to WeakSet --- pandas/_libs/internals.pyx | 9 +++------ pandas/conftest.py | 1 + 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index c08e21d02aa6d..b5ff69d92492f 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -874,10 +874,10 @@ cdef class BlockValuesRefs: data. """ cdef: - public list referenced_blocks + public object referenced_blocks def __cinit__(self, blk: SharedBlock) -> None: - self.referenced_blocks = [weakref.ref(blk)] + self.referenced_blocks = weakref.WeakSet([blk]) def add_reference(self, blk: SharedBlock) -> None: """Adds a new reference to our reference collection. @@ -887,7 +887,7 @@ cdef class BlockValuesRefs: blk: SharedBlock The block that the new references should point to. """ - self.referenced_blocks.append(weakref.ref(blk)) + self.referenced_blocks.add(blk) def has_reference(self) -> bool: """Checks if block has foreign references. @@ -899,8 +899,5 @@ cdef class BlockValuesRefs: ------- bool """ - self.referenced_blocks = [ - obj for obj in self.referenced_blocks if obj() is not None - ] # Checking for more references than block pointing to itself return len(self.referenced_blocks) > 1 diff --git a/pandas/conftest.py b/pandas/conftest.py index 888205366f9e6..6e6b00e91dfbb 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1949,6 +1949,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" From a45a70a582536f5fe6a2792ea32e83d79427b148 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 22:37:22 +0100 Subject: [PATCH 21/22] Switch to WeakSet --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 6e6b00e91dfbb..888205366f9e6 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1949,7 +1949,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" From b05d3aaff1c5047156a7945fa8d55c7875edad77 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 23:05:31 +0100 Subject: [PATCH 22/22] Add doc explanation about weakset --- doc/source/development/copy_on_write.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/source/development/copy_on_write.rst b/doc/source/development/copy_on_write.rst index 9d39c1a7ed42b..34625ed645615 100644 --- a/doc/source/development/copy_on_write.rst +++ b/doc/source/development/copy_on_write.rst @@ -27,6 +27,15 @@ point to the same ``BlockValuesRefs`` object. If one block goes out of scope, the reference to this block dies. As a consequence, the reference tracker object always knows how many blocks are alive and share memory. +Whenever a :class:`DataFrame` or :class:`Series` object is sharing data with another +object, it is required that each of those objects have its own BlockManager and Block +objects. Thus, in other words, one Block instance (that is held by a DataFrame, not +necessarily for intermediate objects) should always be uniquely used for only +a single DataFrame/Series object. For example, when you want to use the same +Block for another object, you can create a shallow copy of the Block instance +with ``block.copy(deep=False)`` (which will create a new Block instance with +the same underlying values and which will correctly set up the references). + We can ask the reference tracking object if there is another block alive that shares data with us before writing into the values. We can trigger a copy before writing if there is in fact another block alive.