Skip to content

Commit 5d17d73

Browse files
authored
CoW: Push reference tracking down to the block level (#51144)
1 parent fcb8b80 commit 5d17d73

15 files changed

+326
-272
lines changed
+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
.. _copy_on_write:
2+
3+
{{ header }}
4+
5+
*************
6+
Copy on write
7+
*************
8+
9+
Copy on Write is a mechanism to simplify the indexing API and improve
10+
performance through avoiding copies if possible.
11+
CoW means that any DataFrame or Series derived from another in any way always
12+
behaves as a copy.
13+
14+
Reference tracking
15+
------------------
16+
17+
To be able to determine, if we have to make a copy when writing into a DataFrame,
18+
we have to be aware, if the values are shared with another DataFrame. pandas
19+
keeps track of all ``Blocks`` that share values with another block internally to
20+
be able to tell when a copy needs to be triggered. The reference tracking
21+
mechanism is implemented on the Block level.
22+
23+
We use a custom reference tracker object, ``BlockValuesRefs``, that keeps
24+
track of every block, whose values share memory with each other. The reference
25+
is held through a weak-reference. Every two blocks that share some memory should
26+
point to the same ``BlockValuesRefs`` object. If one block goes out of
27+
scope, the reference to this block dies. As a consequence, the reference tracker
28+
object always knows how many blocks are alive and share memory.
29+
30+
Whenever a :class:`DataFrame` or :class:`Series` object is sharing data with another
31+
object, it is required that each of those objects have its own BlockManager and Block
32+
objects. Thus, in other words, one Block instance (that is held by a DataFrame, not
33+
necessarily for intermediate objects) should always be uniquely used for only
34+
a single DataFrame/Series object. For example, when you want to use the same
35+
Block for another object, you can create a shallow copy of the Block instance
36+
with ``block.copy(deep=False)`` (which will create a new Block instance with
37+
the same underlying values and which will correctly set up the references).
38+
39+
We can ask the reference tracking object if there is another block alive that shares
40+
data with us before writing into the values. We can trigger a copy before
41+
writing if there is in fact another block alive.

doc/source/development/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ Development
1818
contributing_codebase
1919
maintaining
2020
internals
21+
copy_on_write
2122
debugging_extensions
2223
extending
2324
developer

pandas/_libs/internals.pyi

+13-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ from typing import (
44
final,
55
overload,
66
)
7+
import weakref
78

89
import numpy as np
910

@@ -59,8 +60,13 @@ class SharedBlock:
5960
_mgr_locs: BlockPlacement
6061
ndim: int
6162
values: ArrayLike
63+
refs: BlockValuesRefs
6264
def __init__(
63-
self, values: ArrayLike, placement: BlockPlacement, ndim: int
65+
self,
66+
values: ArrayLike,
67+
placement: BlockPlacement,
68+
ndim: int,
69+
refs: BlockValuesRefs | None = ...,
6470
) -> None: ...
6571

6672
class NumpyBlock(SharedBlock):
@@ -87,3 +93,9 @@ class BlockManager:
8793
) -> None: ...
8894
def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ...
8995
def _rebuild_blknos_and_blklocs(self) -> None: ...
96+
97+
class BlockValuesRefs:
98+
referenced_blocks: list[weakref.ref]
99+
def __init__(self, blk: SharedBlock) -> None: ...
100+
def add_reference(self, blk: SharedBlock) -> None: ...
101+
def has_reference(self) -> bool: ...

pandas/_libs/internals.pyx

+86-24
Original file line numberDiff line numberDiff line change
@@ -580,9 +580,16 @@ cdef class SharedBlock:
580580
"""
581581
cdef:
582582
public BlockPlacement _mgr_locs
583+
public BlockValuesRefs refs
583584
readonly int ndim
584585

585-
def __cinit__(self, values, placement: BlockPlacement, ndim: int):
586+
def __cinit__(
587+
self,
588+
values,
589+
placement: BlockPlacement,
590+
ndim: int,
591+
refs: BlockValuesRefs | None = None,
592+
):
586593
"""
587594
Parameters
588595
----------
@@ -591,9 +598,22 @@ cdef class SharedBlock:
591598
placement : BlockPlacement
592599
ndim : int
593600
1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
601+
refs: BlockValuesRefs, optional
602+
Ref tracking object or None if block does not have any refs.
594603
"""
595604
self._mgr_locs = placement
596605
self.ndim = ndim
606+
if refs is None:
607+
# if no refs are passed, that means we are creating a Block from
608+
# new values that it uniquely owns -> start a new BlockValuesRefs
609+
# object that only references this block
610+
self.refs = BlockValuesRefs(self)
611+
else:
612+
# if refs are passed, this is the BlockValuesRefs object that is shared
613+
# with the parent blocks which share the values, and a reference to this
614+
# new block is added
615+
refs.add_reference(self)
616+
self.refs = refs
597617

598618
cpdef __reduce__(self):
599619
args = (self.values, self.mgr_locs.indexer, self.ndim)
@@ -619,9 +639,15 @@ cdef class NumpyBlock(SharedBlock):
619639
cdef:
620640
public ndarray values
621641

622-
def __cinit__(self, ndarray values, BlockPlacement placement, int ndim):
642+
def __cinit__(
643+
self,
644+
ndarray values,
645+
BlockPlacement placement,
646+
int ndim,
647+
refs: BlockValuesRefs | None = None,
648+
):
623649
# set values here; the (implicit) call to SharedBlock.__cinit__ will
624-
# set placement and ndim
650+
# set placement, ndim and refs
625651
self.values = values
626652

627653
cpdef NumpyBlock getitem_block_index(self, slice slicer):
@@ -631,7 +657,7 @@ cdef class NumpyBlock(SharedBlock):
631657
Assumes self.ndim == 2
632658
"""
633659
new_values = self.values[..., slicer]
634-
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
660+
return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)
635661

636662

637663
cdef class NDArrayBackedBlock(SharedBlock):
@@ -641,9 +667,15 @@ cdef class NDArrayBackedBlock(SharedBlock):
641667
cdef public:
642668
NDArrayBacked values
643669

644-
def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim):
670+
def __cinit__(
671+
self,
672+
NDArrayBacked values,
673+
BlockPlacement placement,
674+
int ndim,
675+
refs: BlockValuesRefs | None = None,
676+
):
645677
# set values here; the (implicit) call to SharedBlock.__cinit__ will
646-
# set placement and ndim
678+
# set placement, ndim and refs
647679
self.values = values
648680

649681
cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer):
@@ -653,16 +685,22 @@ cdef class NDArrayBackedBlock(SharedBlock):
653685
Assumes self.ndim == 2
654686
"""
655687
new_values = self.values[..., slicer]
656-
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
688+
return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)
657689

658690

659691
cdef class Block(SharedBlock):
660692
cdef:
661693
public object values
662694

663-
def __cinit__(self, object values, BlockPlacement placement, int ndim):
695+
def __cinit__(
696+
self,
697+
object values,
698+
BlockPlacement placement,
699+
int ndim,
700+
refs: BlockValuesRefs | None = None,
701+
):
664702
# set values here; the (implicit) call to SharedBlock.__cinit__ will
665-
# set placement and ndim
703+
# set placement, ndim and refs
666704
self.values = values
667705

668706

@@ -673,15 +711,11 @@ cdef class BlockManager:
673711
public list axes
674712
public bint _known_consolidated, _is_consolidated
675713
public ndarray _blknos, _blklocs
676-
public list refs
677-
public object parent
678714

679715
def __cinit__(
680716
self,
681717
blocks=None,
682718
axes=None,
683-
refs=None,
684-
parent=None,
685719
verify_integrity=True,
686720
):
687721
# None as defaults for unpickling GH#42345
@@ -695,8 +729,6 @@ cdef class BlockManager:
695729

696730
self.blocks = blocks
697731
self.axes = axes.copy() # copy to make sure we are not remotely-mutable
698-
self.refs = refs
699-
self.parent = parent
700732

701733
# Populate known_consolidate, blknos, and blklocs lazily
702734
self._known_consolidated = False
@@ -805,16 +837,12 @@ cdef class BlockManager:
805837
ndarray blknos, blklocs
806838

807839
nbs = []
808-
nrefs = []
809840
for blk in self.blocks:
810841
nb = blk.getitem_block_index(slobj)
811842
nbs.append(nb)
812-
nrefs.append(weakref.ref(blk))
813843

814844
new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
815-
mgr = type(self)(
816-
tuple(nbs), new_axes, nrefs, parent=self, verify_integrity=False
817-
)
845+
mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)
818846

819847
# We can avoid having to rebuild blklocs/blknos
820848
blklocs = self._blklocs
@@ -827,7 +855,7 @@ cdef class BlockManager:
827855
def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
828856

829857
if axis == 0:
830-
new_blocks, new_refs = self._slice_take_blocks_ax0(slobj)
858+
new_blocks = self._slice_take_blocks_ax0(slobj)
831859
elif axis == 1:
832860
return self._get_index_slice(slobj)
833861
else:
@@ -836,6 +864,40 @@ cdef class BlockManager:
836864
new_axes = list(self.axes)
837865
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
838866

839-
return type(self)(
840-
tuple(new_blocks), new_axes, new_refs, parent=self, verify_integrity=False
841-
)
867+
return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)
868+
869+
870+
cdef class BlockValuesRefs:
871+
"""Tracks all references to a given array.
872+
873+
Keeps track of all blocks (through weak references) that reference the same
874+
data.
875+
"""
876+
cdef:
877+
public object referenced_blocks
878+
879+
def __cinit__(self, blk: SharedBlock) -> None:
880+
self.referenced_blocks = weakref.WeakSet([blk])
881+
882+
def add_reference(self, blk: SharedBlock) -> None:
883+
"""Adds a new reference to our reference collection.
884+
885+
Parameters
886+
----------
887+
blk: SharedBlock
888+
The block that the new references should point to.
889+
"""
890+
self.referenced_blocks.add(blk)
891+
892+
def has_reference(self) -> bool:
893+
"""Checks if block has foreign references.
894+
895+
A reference is only relevant if it is still alive. The reference to
896+
ourselves does not count.
897+
898+
Returns
899+
-------
900+
bool
901+
"""
902+
# Checking for more references than block pointing to itself
903+
return len(self.referenced_blocks) > 1

pandas/compat/pickle_compat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def load_newobj(self) -> None:
168168
arr = np.array([], dtype="m8[ns]")
169169
obj = cls.__new__(cls, arr, arr.dtype)
170170
elif cls is BlockManager and not args:
171-
obj = cls.__new__(cls, (), [], None, False)
171+
obj = cls.__new__(cls, (), [], False)
172172
else:
173173
obj = cls.__new__(cls, *args)
174174

pandas/core/generic.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -768,8 +768,10 @@ def swapaxes(
768768
)
769769
assert isinstance(new_mgr, BlockManager)
770770
assert isinstance(self._mgr, BlockManager)
771-
new_mgr.parent = self._mgr
772-
new_mgr.refs = [weakref.ref(self._mgr.blocks[0])]
771+
new_mgr.blocks[0].refs = self._mgr.blocks[0].refs
772+
new_mgr.blocks[0].refs.add_reference(
773+
new_mgr.blocks[0] # type: ignore[arg-type]
774+
)
773775
return self._constructor(new_mgr).__finalize__(self, method="swapaxes")
774776

775777
elif (copy or copy is None) and self._mgr.is_single_block:

0 commit comments

Comments
 (0)