CoW: Push reference tracking down to the block level #51144

Merged · 24 commits · Feb 8, 2023
41 changes: 41 additions & 0 deletions doc/source/development/copy_on_write.rst
@@ -0,0 +1,41 @@
.. _copy_on_write:

{{ header }}

*************
Copy on write
*************

Copy on Write is a mechanism to simplify the indexing API and improve
performance by avoiding copies where possible.
CoW means that any DataFrame or Series derived from another in any way always
behaves as a copy.

Reference tracking
------------------

To be able to determine, if we have to make a copy when writing into a DataFrame,
we have to be aware, if the values are shared with another DataFrame. pandas
Review comment (Member):

comma after "aware" is unnecessary. also after "determine" on L17

internally keeps track of all ``Blocks`` that share values with another block, so
that it can tell when a copy needs to be triggered. The reference tracking
mechanism is implemented at the Block level.

We use a custom reference tracker object, ``BlockValuesRefs``, that keeps
track of every block whose values share memory with each other. Each reference
is held through a weak reference. Every two blocks that share some memory should
Review comment (Member):

two block -> pair of blocks

Reply (Member Author):

thx for the comments. opened #51552

point to the same ``BlockValuesRefs`` object. If one block goes out of
scope, the reference to this block dies. As a consequence, the reference tracker
object always knows how many blocks are alive and share memory.

Whenever a :class:`DataFrame` or :class:`Series` object shares data with another
object, each of those objects must have its own BlockManager and Block
objects. In other words, one Block instance (that is held by a DataFrame, not
necessarily for intermediate objects) should only ever be used for a single
DataFrame/Series object. For example, when you want to use the same
Block for another object, you can create a shallow copy of the Block instance
with ``block.copy(deep=False)``, which will create a new Block instance with
the same underlying values and correctly set up the references.

We can ask the reference tracking object if there is another block alive that shares
data with us before writing into the values. We can trigger a copy before
writing if there is in fact another block alive.
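
As a rough illustration of the mechanism described above, here is a minimal, hypothetical Python sketch (plain stand-ins, not the actual Cython ``Block``/``BlockValuesRefs`` classes added in this PR) showing how weak references let the tracker know how many sharing blocks are still alive:

    import weakref


    class RefTracker:
        """Hypothetical, simplified stand-in for the BlockValuesRefs idea."""

        def __init__(self, blk) -> None:
            # Weak references only: a block that goes out of scope drops out
            # of the set automatically (in CPython, as soon as it is collected).
            self.referenced_blocks = weakref.WeakSet([blk])

        def add_reference(self, blk) -> None:
            self.referenced_blocks.add(blk)

        def has_reference(self) -> bool:
            # More than the one block pointing to itself -> values are shared.
            return len(self.referenced_blocks) > 1


    class ToyBlock:
        """Hypothetical stand-in for a Block; only wires up reference tracking."""

        def __init__(self, refs=None) -> None:
            if refs is None:
                # New values that this block uniquely owns: start a new tracker.
                self.refs = RefTracker(self)
            else:
                # Shared values: register this block with the existing tracker.
                refs.add_reference(self)
                self.refs = refs


    parent = ToyBlock()
    assert not parent.refs.has_reference()  # uniquely owned, safe to write

    child = ToyBlock(refs=parent.refs)      # e.g. a zero-copy slice of parent
    assert parent.refs.has_reference()      # two live blocks share the values

    del child                               # the derived block goes out of scope
    assert not parent.refs.has_reference()  # writing no longer requires a copy

The real implementation lives in ``pandas/_libs/internals.pyx`` (see the diff further down).
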
1 change: 1 addition & 0 deletions doc/source/development/index.rst
@@ -18,6 +18,7 @@ Development
contributing_codebase
maintaining
internals
copy_on_write
debugging_extensions
extending
developer
14 changes: 13 additions & 1 deletion pandas/_libs/internals.pyi
@@ -4,6 +4,7 @@ from typing import (
final,
overload,
)
import weakref

import numpy as np

@@ -59,8 +60,13 @@ class SharedBlock:
_mgr_locs: BlockPlacement
ndim: int
values: ArrayLike
refs: BlockValuesRefs
Review comment (Member):
Minor naming nit: if we want to simplify this (so it "speaks" better), could also be "BlockRefs", since referencing the block vs the values is kind of equivalent?

Reply (@jorisvandenbossche, Member, Feb 3, 2023):
Actually, that's maybe a good question: is it equivalent?
With the BlockManager tracking, the rule was that each DataFrame had its own unique manager, so one manager was never shared between different dataframes. I assume similarly here, we always create new Blocks for new Managers/DataFrames (potentially just shallow copies). So if certain values are shared (referenced) by 3 DataFrames, are there 3 unique Block objects that hold the values? Or stated differently, does the referenced_blocks list always contain weakrefs to unique blocks, or can there ever be two weakrefs to the same Block instance? Now that this is done at the Block level, that could actually be OK to have?

Reply (Member Author):
Theoretically this is possible (having two references to the same block). I'd prefer using shallow copies though: the way it is implemented, tracking the new refs works automatically when you create a shallow copy, while otherwise you would have to explicitly update the references with your own block again. I think this would look a bit hacky.

Reply (Member Author):
My thinking was, we actually want to reference the values, e.g. same values living in different blocks, hence this name. But not too happy with it either, so not opposed to renaming. BlockRefs did not capture everything to me, that's why I ended up with this name

Reply (@jorisvandenbossche, Member, Feb 6, 2023):
> Theoretically this is possible (having two references to the same block). I'd prefer using shallow copies though,

Yes, agreed that we certainly intend to always use shallow copies (new blocks)

> My thinking was, we actually want to reference the values, e.g. same values living in different blocks, hence this name. But not too happy with it either, so not opposed to renaming

Makes sense. Then another alternative is to leave out the Block and Values altogether, and do something with "Refs" (like the RefTracker). But that is very much name bikeshedding at this point and not too important ;)
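
A small hypothetical sketch of this point, using a bare ``weakref.WeakSet`` rather than the real Block API: with the WeakSet-based tracker added in this diff, registering the same block instance twice is a no-op, so sharing only becomes visible to the tracker when a new (e.g. shallowly copied) block object is created.

    import weakref


    class ToyBlock:  # hypothetical stand-in, not the real pandas Block
        pass


    a = ToyBlock()
    referenced_blocks = weakref.WeakSet([a])

    # Re-registering the same block instance is deduplicated by the WeakSet,
    # so a "shared values" check like len(referenced_blocks) > 1 stays False.
    referenced_blocks.add(a)
    assert len(referenced_blocks) == 1

    # A new block object (standing in for a.copy(deep=False)) registers as a
    # second live reference, so the tracker can now see the values are shared.
    b = ToyBlock()
    referenced_blocks.add(b)
    assert len(referenced_blocks) == 2
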

def __init__(
self, values: ArrayLike, placement: BlockPlacement, ndim: int
self,
values: ArrayLike,
placement: BlockPlacement,
ndim: int,
refs: BlockValuesRefs | None = ...,
) -> None: ...

class NumpyBlock(SharedBlock):
@@ -87,3 +93,9 @@ class BlockManager:
) -> None: ...
def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ...
def _rebuild_blknos_and_blklocs(self) -> None: ...

class BlockValuesRefs:
referenced_blocks: list[weakref.ref]
def __init__(self, blk: SharedBlock) -> None: ...
def add_reference(self, blk: SharedBlock) -> None: ...
def has_reference(self) -> bool: ...
110 changes: 86 additions & 24 deletions pandas/_libs/internals.pyx
@@ -580,9 +580,16 @@ cdef class SharedBlock:
"""
cdef:
public BlockPlacement _mgr_locs
public BlockValuesRefs refs
readonly int ndim

def __cinit__(self, values, placement: BlockPlacement, ndim: int):
def __cinit__(
self,
values,
placement: BlockPlacement,
ndim: int,
refs: BlockValuesRefs | None = None,
):
"""
Parameters
----------
@@ -591,9 +598,22 @@
placement : BlockPlacement
ndim : int
1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
refs: BlockValuesRefs, optional
Ref tracking object or None if block does not have any refs.
"""
self._mgr_locs = placement
self.ndim = ndim
if refs is None:
# if no refs are passed, that means we are creating a Block from
# new values that it uniquely owns -> start a new BlockValuesRefs
# object that only references this block
self.refs = BlockValuesRefs(self)
else:
# if refs are passed, this is the BlockValuesRefs object that is shared
# with the parent blocks which share the values, and a reference to this
# new block is added
refs.add_reference(self)
self.refs = refs

cpdef __reduce__(self):
args = (self.values, self.mgr_locs.indexer, self.ndim)
@@ -619,9 +639,15 @@ cdef class NumpyBlock(SharedBlock):
cdef:
public ndarray values

def __cinit__(self, ndarray values, BlockPlacement placement, int ndim):
def __cinit__(
self,
ndarray values,
BlockPlacement placement,
int ndim,
refs: BlockValuesRefs | None = None,
):
# set values here; the (implicit) call to SharedBlock.__cinit__ will
# set placement and ndim
# set placement, ndim and refs
self.values = values

cpdef NumpyBlock getitem_block_index(self, slice slicer):
@@ -631,7 +657,7 @@
Assumes self.ndim == 2
"""
new_values = self.values[..., slicer]
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)


cdef class NDArrayBackedBlock(SharedBlock):
@@ -641,9 +667,15 @@ cdef class NDArrayBackedBlock(SharedBlock):
cdef public:
NDArrayBacked values

def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim):
def __cinit__(
self,
NDArrayBacked values,
BlockPlacement placement,
int ndim,
refs: BlockValuesRefs | None = None,
):
# set values here; the (implicit) call to SharedBlock.__cinit__ will
# set placement and ndim
# set placement, ndim and refs
self.values = values

cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer):
@@ -653,16 +685,22 @@
Assumes self.ndim == 2
"""
new_values = self.values[..., slicer]
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)


cdef class Block(SharedBlock):
cdef:
public object values

def __cinit__(self, object values, BlockPlacement placement, int ndim):
def __cinit__(
self,
object values,
BlockPlacement placement,
int ndim,
refs: BlockValuesRefs | None = None,
):
# set values here; the (implicit) call to SharedBlock.__cinit__ will
# set placement and ndim
# set placement, ndim and refs
self.values = values


@@ -673,15 +711,11 @@ cdef class BlockManager:
public list axes
public bint _known_consolidated, _is_consolidated
public ndarray _blknos, _blklocs
public list refs
public object parent

def __cinit__(
self,
blocks=None,
axes=None,
refs=None,
parent=None,
verify_integrity=True,
):
# None as defaults for unpickling GH#42345
@@ -695,8 +729,6 @@

self.blocks = blocks
self.axes = axes.copy() # copy to make sure we are not remotely-mutable
self.refs = refs
self.parent = parent

# Populate known_consolidate, blknos, and blklocs lazily
self._known_consolidated = False
@@ -805,16 +837,12 @@
ndarray blknos, blklocs

nbs = []
nrefs = []
for blk in self.blocks:
nb = blk.getitem_block_index(slobj)
nbs.append(nb)
nrefs.append(weakref.ref(blk))

new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
mgr = type(self)(
tuple(nbs), new_axes, nrefs, parent=self, verify_integrity=False
)
mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)

# We can avoid having to rebuild blklocs/blknos
blklocs = self._blklocs
@@ -827,7 +855,7 @@
def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:

if axis == 0:
new_blocks, new_refs = self._slice_take_blocks_ax0(slobj)
new_blocks = self._slice_take_blocks_ax0(slobj)
elif axis == 1:
return self._get_index_slice(slobj)
else:
@@ -836,6 +864,40 @@
new_axes = list(self.axes)
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)

return type(self)(
tuple(new_blocks), new_axes, new_refs, parent=self, verify_integrity=False
)
return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)


cdef class BlockValuesRefs:
"""Tracks all references to a given array.

Keeps track of all blocks (through weak references) that reference the same
data.
"""
cdef:
public object referenced_blocks

def __cinit__(self, blk: SharedBlock) -> None:
self.referenced_blocks = weakref.WeakSet([blk])

def add_reference(self, blk: SharedBlock) -> None:
"""Adds a new reference to our reference collection.

Parameters
----------
blk: SharedBlock
The block that the new references should point to.
"""
self.referenced_blocks.add(blk)

def has_reference(self) -> bool:
"""Checks if block has foreign references.

A reference is only relevant if it is still alive. The reference to
ourselves does not count.

Returns
-------
bool
"""
# Checking for more references than block pointing to itself
return len(self.referenced_blocks) > 1
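
Taken together, the changes above make sharing observable through the new ``refs`` attribute. A hedged end-to-end illustration (it relies on internal attributes added in this diff and on CoW behaviour as implemented here; exact behaviour and attribute names may differ between pandas versions):

    import pandas as pd

    pd.set_option("mode.copy_on_write", True)  # enable CoW mode

    df = pd.DataFrame({"a": [1, 2, 3]})
    blk = df._mgr.blocks[0]                  # internal API, illustration only
    print(blk.refs.has_reference())          # expected: False, values uniquely owned

    view = df[:]                             # row slice: new Block, shared values
    print(blk.refs.has_reference())          # expected: True, two live blocks

    del view                                 # the derived object goes out of scope
    print(blk.refs.has_reference())          # expected: False, writing needs no copy
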
2 changes: 1 addition & 1 deletion pandas/compat/pickle_compat.py
@@ -168,7 +168,7 @@ def load_newobj(self) -> None:
arr = np.array([], dtype="m8[ns]")
obj = cls.__new__(cls, arr, arr.dtype)
elif cls is BlockManager and not args:
obj = cls.__new__(cls, (), [], None, False)
obj = cls.__new__(cls, (), [], False)
else:
obj = cls.__new__(cls, *args)

6 changes: 4 additions & 2 deletions pandas/core/generic.py
@@ -768,8 +768,10 @@ def swapaxes(
)
assert isinstance(new_mgr, BlockManager)
assert isinstance(self._mgr, BlockManager)
new_mgr.parent = self._mgr
new_mgr.refs = [weakref.ref(self._mgr.blocks[0])]
new_mgr.blocks[0].refs = self._mgr.blocks[0].refs
new_mgr.blocks[0].refs.add_reference(
new_mgr.blocks[0] # type: ignore[arg-type]
)
return self._constructor(new_mgr).__finalize__(self, method="swapaxes")

elif (copy or copy is None) and self._mgr.is_single_block: