Skip to content

Commit 65c0d2b

Browse files
BUG: CoW - correctly track references for chained operations (pandas-dev#48996)
Co-authored-by: Patrick Hoefler <[email protected]>
1 parent 273f0fe commit 65c0d2b

File tree

6 files changed

+270
-19
lines changed

6 files changed

+270
-19
lines changed

doc/source/whatsnew/v1.5.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ Fixed regressions
2424

2525
Bug fixes
2626
~~~~~~~~~
27-
-
27+
- Bug in the Copy-on-Write implementation losing track of views in certain chained indexing cases (:issue:`48996`)
2828
-
2929

3030
.. ---------------------------------------------------------------------------

pandas/_libs/internals.pyx

+9-3
Original file line numberDiff line numberDiff line change
@@ -676,8 +676,9 @@ cdef class BlockManager:
676676
public bint _known_consolidated, _is_consolidated
677677
public ndarray _blknos, _blklocs
678678
public list refs
679+
public object parent
679680

680-
def __cinit__(self, blocks=None, axes=None, refs=None, verify_integrity=True):
681+
def __cinit__(self, blocks=None, axes=None, refs=None, parent=None, verify_integrity=True):
681682
# None as defaults for unpickling GH#42345
682683
if blocks is None:
683684
# This adds 1-2 microseconds to DataFrame(np.array([]))
@@ -690,6 +691,7 @@ cdef class BlockManager:
690691
self.blocks = blocks
691692
self.axes = axes.copy() # copy to make sure we are not remotely-mutable
692693
self.refs = refs
694+
self.parent = parent
693695

694696
# Populate known_consolidate, blknos, and blklocs lazily
695697
self._known_consolidated = False
@@ -805,7 +807,9 @@ cdef class BlockManager:
805807
nrefs.append(weakref.ref(blk))
806808

807809
new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
808-
mgr = type(self)(tuple(nbs), new_axes, nrefs, verify_integrity=False)
810+
mgr = type(self)(
811+
tuple(nbs), new_axes, nrefs, parent=self, verify_integrity=False
812+
)
809813

810814
# We can avoid having to rebuild blklocs/blknos
811815
blklocs = self._blklocs
@@ -827,4 +831,6 @@ cdef class BlockManager:
827831
new_axes = list(self.axes)
828832
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
829833

830-
return type(self)(tuple(new_blocks), new_axes, new_refs, verify_integrity=False)
834+
return type(self)(
835+
tuple(new_blocks), new_axes, new_refs, parent=self, verify_integrity=False
836+
)

pandas/core/internals/managers.py

+43-12
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
import pandas.core.algorithms as algos
5858
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
5959
from pandas.core.arrays.sparse import SparseDtype
60+
import pandas.core.common as com
6061
from pandas.core.construction import (
6162
ensure_wrapped_if_datetimelike,
6263
extract_array,
@@ -148,6 +149,7 @@ class BaseBlockManager(DataManager):
148149
blocks: tuple[Block, ...]
149150
axes: list[Index]
150151
refs: list[weakref.ref | None] | None
152+
parent: object
151153

152154
@property
153155
def ndim(self) -> int:
@@ -165,6 +167,7 @@ def from_blocks(
165167
blocks: list[Block],
166168
axes: list[Index],
167169
refs: list[weakref.ref | None] | None = None,
170+
parent: object = None,
168171
) -> T:
169172
raise NotImplementedError
170173

@@ -264,6 +267,8 @@ def _clear_reference_block(self, blkno: int) -> None:
264267
"""
265268
if self.refs is not None:
266269
self.refs[blkno] = None
270+
if com.all_none(*self.refs):
271+
self.parent = None
267272

268273
def get_dtypes(self):
269274
dtypes = np.array([blk.dtype for blk in self.blocks])
@@ -605,7 +610,9 @@ def _combine(
605610
axes[-1] = index
606611
axes[0] = self.items.take(indexer)
607612

608-
return type(self).from_blocks(new_blocks, axes, new_refs)
613+
return type(self).from_blocks(
614+
new_blocks, axes, new_refs, parent=None if copy else self
615+
)
609616

610617
@property
611618
def nblocks(self) -> int:
@@ -648,11 +655,14 @@ def copy_func(ax):
648655
new_refs: list[weakref.ref | None] | None
649656
if deep:
650657
new_refs = None
658+
parent = None
651659
else:
652660
new_refs = [weakref.ref(blk) for blk in self.blocks]
661+
parent = self
653662

654663
res.axes = new_axes
655664
res.refs = new_refs
665+
res.parent = parent
656666

657667
if self.ndim > 1:
658668
# Avoid needing to re-compute these
@@ -744,6 +754,7 @@ def reindex_indexer(
744754
only_slice=only_slice,
745755
use_na_proxy=use_na_proxy,
746756
)
757+
parent = None if com.all_none(*new_refs) else self
747758
else:
748759
new_blocks = [
749760
blk.take_nd(
@@ -756,11 +767,12 @@ def reindex_indexer(
756767
for blk in self.blocks
757768
]
758769
new_refs = None
770+
parent = None
759771

760772
new_axes = list(self.axes)
761773
new_axes[axis] = new_axis
762774

763-
new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs)
775+
new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs, parent=parent)
764776
if axis == 1:
765777
# We can avoid the need to rebuild these
766778
new_mgr._blknos = self.blknos.copy()
@@ -995,6 +1007,7 @@ def __init__(
9951007
blocks: Sequence[Block],
9961008
axes: Sequence[Index],
9971009
refs: list[weakref.ref | None] | None = None,
1010+
parent: object = None,
9981011
verify_integrity: bool = True,
9991012
) -> None:
10001013

@@ -1059,11 +1072,13 @@ def from_blocks(
10591072
blocks: list[Block],
10601073
axes: list[Index],
10611074
refs: list[weakref.ref | None] | None = None,
1075+
parent: object = None,
10621076
) -> BlockManager:
10631077
"""
10641078
Constructor for BlockManager and SingleBlockManager with same signature.
10651079
"""
1066-
return cls(blocks, axes, refs, verify_integrity=False)
1080+
parent = parent if _using_copy_on_write() else None
1081+
return cls(blocks, axes, refs, parent, verify_integrity=False)
10671082

10681083
# ----------------------------------------------------------------
10691084
# Indexing
@@ -1085,7 +1100,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
10851100
block = new_block(result, placement=slice(0, len(result)), ndim=1)
10861101
# in the case of a single block, the new block is a view
10871102
ref = weakref.ref(self.blocks[0])
1088-
return SingleBlockManager(block, self.axes[0], [ref])
1103+
return SingleBlockManager(block, self.axes[0], [ref], parent=self)
10891104

10901105
dtype = interleaved_dtype([blk.dtype for blk in self.blocks])
10911106

@@ -1119,7 +1134,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
11191134
block = new_block(result, placement=slice(0, len(result)), ndim=1)
11201135
return SingleBlockManager(block, self.axes[0])
11211136

1122-
def iget(self, i: int) -> SingleBlockManager:
1137+
def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
11231138
"""
11241139
Return the data as a SingleBlockManager.
11251140
"""
@@ -1129,7 +1144,9 @@ def iget(self, i: int) -> SingleBlockManager:
11291144
# shortcut for select a single-dim from a 2-dim BM
11301145
bp = BlockPlacement(slice(0, len(values)))
11311146
nb = type(block)(values, placement=bp, ndim=1)
1132-
return SingleBlockManager(nb, self.axes[1], [weakref.ref(block)])
1147+
ref = weakref.ref(block) if track_ref else None
1148+
parent = self if track_ref else None
1149+
return SingleBlockManager(nb, self.axes[1], [ref], parent)
11331150

11341151
def iget_values(self, i: int) -> ArrayLike:
11351152
"""
@@ -1371,7 +1388,9 @@ def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value) -> None
13711388
self.blocks = tuple(blocks)
13721389
self._clear_reference_block(blkno)
13731390

1374-
col_mgr = self.iget(loc)
1391+
# this manager is only created temporarily to mutate the values in place
1392+
# so don't track references, otherwise the `setitem` would perform CoW again
1393+
col_mgr = self.iget(loc, track_ref=False)
13751394
new_mgr = col_mgr.setitem((idx,), value)
13761395
self.iset(loc, new_mgr._block.values, inplace=True)
13771396

@@ -1469,7 +1488,9 @@ def idelete(self, indexer) -> BlockManager:
14691488
nbs, new_refs = self._slice_take_blocks_ax0(taker, only_slice=True)
14701489
new_columns = self.items[~is_deleted]
14711490
axes = [new_columns, self.axes[1]]
1472-
return type(self)(tuple(nbs), axes, new_refs, verify_integrity=False)
1491+
# TODO this might not be needed (can a delete ever be done in chained manner?)
1492+
parent = None if com.all_none(*new_refs) else self
1493+
return type(self)(tuple(nbs), axes, new_refs, parent, verify_integrity=False)
14731494

14741495
# ----------------------------------------------------------------
14751496
# Block-wise Operation
@@ -1875,6 +1896,7 @@ def __init__(
18751896
block: Block,
18761897
axis: Index,
18771898
refs: list[weakref.ref | None] | None = None,
1899+
parent: object = None,
18781900
verify_integrity: bool = False,
18791901
fastpath=lib.no_default,
18801902
) -> None:
@@ -1893,13 +1915,15 @@ def __init__(
18931915
self.axes = [axis]
18941916
self.blocks = (block,)
18951917
self.refs = refs
1918+
self.parent = parent if _using_copy_on_write() else None
18961919

18971920
@classmethod
18981921
def from_blocks(
18991922
cls,
19001923
blocks: list[Block],
19011924
axes: list[Index],
19021925
refs: list[weakref.ref | None] | None = None,
1926+
parent: object = None,
19031927
) -> SingleBlockManager:
19041928
"""
19051929
Constructor for BlockManager and SingleBlockManager with same signature.
@@ -1908,7 +1932,7 @@ def from_blocks(
19081932
assert len(axes) == 1
19091933
if refs is not None:
19101934
assert len(refs) == 1
1911-
return cls(blocks[0], axes[0], refs, verify_integrity=False)
1935+
return cls(blocks[0], axes[0], refs, parent, verify_integrity=False)
19121936

19131937
@classmethod
19141938
def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
@@ -1928,7 +1952,10 @@ def to_2d_mgr(self, columns: Index) -> BlockManager:
19281952
new_blk = type(blk)(arr, placement=bp, ndim=2)
19291953
axes = [columns, self.axes[0]]
19301954
refs: list[weakref.ref | None] = [weakref.ref(blk)]
1931-
return BlockManager([new_blk], axes=axes, refs=refs, verify_integrity=False)
1955+
parent = self if _using_copy_on_write() else None
1956+
return BlockManager(
1957+
[new_blk], axes=axes, refs=refs, parent=parent, verify_integrity=False
1958+
)
19321959

19331960
def _has_no_reference(self, i: int = 0) -> bool:
19341961
"""
@@ -2010,7 +2037,7 @@ def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockMana
20102037
new_idx = self.index[indexer]
20112038
# TODO(CoW) in theory only need to track reference if new_array is a view
20122039
ref = weakref.ref(blk)
2013-
return type(self)(block, new_idx, [ref])
2040+
return type(self)(block, new_idx, [ref], parent=self)
20142041

20152042
def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager:
20162043
# Assertion disabled for performance
@@ -2023,7 +2050,9 @@ def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager:
20232050
bp = BlockPlacement(slice(0, len(array)))
20242051
block = type(blk)(array, placement=bp, ndim=1)
20252052
new_index = self.index._getitem_slice(slobj)
2026-
return type(self)(block, new_index, [weakref.ref(blk)])
2053+
# TODO this method is only used in groupby SeriesSplitter at the moment,
2054+
# so passing refs / parent is not yet covered by the tests
2055+
return type(self)(block, new_index, [weakref.ref(blk)], parent=self)
20272056

20282057
@property
20292058
def index(self) -> Index:
@@ -2070,6 +2099,7 @@ def setitem_inplace(self, indexer, value) -> None:
20702099
if _using_copy_on_write() and not self._has_no_reference(0):
20712100
self.blocks = (self._block.copy(),)
20722101
self.refs = None
2102+
self.parent = None
20732103
self._cache.clear()
20742104

20752105
super().setitem_inplace(indexer, value)
@@ -2086,6 +2116,7 @@ def idelete(self, indexer) -> SingleBlockManager:
20862116
self._cache.clear()
20872117
# clear reference since delete always results in a new array
20882118
self.refs = None
2119+
self.parent = None
20892120
return self
20902121

20912122
def fast_xs(self, loc):

0 commit comments

Comments
 (0)