Skip to content

Commit eb7a024

Browse files
jorisvandenbosschemeeseeksmachine
authored andcommitted
Backport PR pandas-dev#48996: BUG: CoW - correctly track references for chained operations
1 parent 250954b commit eb7a024

File tree

6 files changed

+270
-19
lines changed

6 files changed

+270
-19
lines changed

doc/source/whatsnew/v1.5.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ Fixed regressions
2121

2222
Bug fixes
2323
~~~~~~~~~
24-
-
24+
- Bug in the Copy-on-Write implementation losing track of views in certain chained indexing cases (:issue:`48996`)
2525
-
2626

2727
.. ---------------------------------------------------------------------------

pandas/_libs/internals.pyx

+9-3
Original file line numberDiff line numberDiff line change
@@ -676,8 +676,9 @@ cdef class BlockManager:
676676
public bint _known_consolidated, _is_consolidated
677677
public ndarray _blknos, _blklocs
678678
public list refs
679+
public object parent
679680

680-
def __cinit__(self, blocks=None, axes=None, refs=None, verify_integrity=True):
681+
def __cinit__(self, blocks=None, axes=None, refs=None, parent=None, verify_integrity=True):
681682
# None as defaults for unpickling GH#42345
682683
if blocks is None:
683684
# This adds 1-2 microseconds to DataFrame(np.array([]))
@@ -690,6 +691,7 @@ cdef class BlockManager:
690691
self.blocks = blocks
691692
self.axes = axes.copy() # copy to make sure we are not remotely-mutable
692693
self.refs = refs
694+
self.parent = parent
693695

694696
# Populate known_consolidate, blknos, and blklocs lazily
695697
self._known_consolidated = False
@@ -805,7 +807,9 @@ cdef class BlockManager:
805807
nrefs.append(weakref.ref(blk))
806808

807809
new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
808-
mgr = type(self)(tuple(nbs), new_axes, nrefs, verify_integrity=False)
810+
mgr = type(self)(
811+
tuple(nbs), new_axes, nrefs, parent=self, verify_integrity=False
812+
)
809813

810814
# We can avoid having to rebuild blklocs/blknos
811815
blklocs = self._blklocs
@@ -827,4 +831,6 @@ cdef class BlockManager:
827831
new_axes = list(self.axes)
828832
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
829833

830-
return type(self)(tuple(new_blocks), new_axes, new_refs, verify_integrity=False)
834+
return type(self)(
835+
tuple(new_blocks), new_axes, new_refs, parent=self, verify_integrity=False
836+
)

pandas/core/internals/managers.py

+43-12
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
import pandas.core.algorithms as algos
5656
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
5757
from pandas.core.arrays.sparse import SparseDtype
58+
import pandas.core.common as com
5859
from pandas.core.construction import (
5960
ensure_wrapped_if_datetimelike,
6061
extract_array,
@@ -146,6 +147,7 @@ class BaseBlockManager(DataManager):
146147
blocks: tuple[Block, ...]
147148
axes: list[Index]
148149
refs: list[weakref.ref | None] | None
150+
parent: object
149151

150152
@property
151153
def ndim(self) -> int:
@@ -163,6 +165,7 @@ def from_blocks(
163165
blocks: list[Block],
164166
axes: list[Index],
165167
refs: list[weakref.ref | None] | None = None,
168+
parent: object = None,
166169
) -> T:
167170
raise NotImplementedError
168171

@@ -262,6 +265,8 @@ def _clear_reference_block(self, blkno: int) -> None:
262265
"""
263266
if self.refs is not None:
264267
self.refs[blkno] = None
268+
if com.all_none(*self.refs):
269+
self.parent = None
265270

266271
def get_dtypes(self):
267272
dtypes = np.array([blk.dtype for blk in self.blocks])
@@ -602,7 +607,9 @@ def _combine(
602607
axes[-1] = index
603608
axes[0] = self.items.take(indexer)
604609

605-
return type(self).from_blocks(new_blocks, axes, new_refs)
610+
return type(self).from_blocks(
611+
new_blocks, axes, new_refs, parent=None if copy else self
612+
)
606613

607614
@property
608615
def nblocks(self) -> int:
@@ -645,11 +652,14 @@ def copy_func(ax):
645652
new_refs: list[weakref.ref | None] | None
646653
if deep:
647654
new_refs = None
655+
parent = None
648656
else:
649657
new_refs = [weakref.ref(blk) for blk in self.blocks]
658+
parent = self
650659

651660
res.axes = new_axes
652661
res.refs = new_refs
662+
res.parent = parent
653663

654664
if self.ndim > 1:
655665
# Avoid needing to re-compute these
@@ -738,6 +748,7 @@ def reindex_indexer(
738748
only_slice=only_slice,
739749
use_na_proxy=use_na_proxy,
740750
)
751+
parent = None if com.all_none(*new_refs) else self
741752
else:
742753
new_blocks = [
743754
blk.take_nd(
@@ -750,11 +761,12 @@ def reindex_indexer(
750761
for blk in self.blocks
751762
]
752763
new_refs = None
764+
parent = None
753765

754766
new_axes = list(self.axes)
755767
new_axes[axis] = new_axis
756768

757-
new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs)
769+
new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs, parent=parent)
758770
if axis == 1:
759771
# We can avoid the need to rebuild these
760772
new_mgr._blknos = self.blknos.copy()
@@ -989,6 +1001,7 @@ def __init__(
9891001
blocks: Sequence[Block],
9901002
axes: Sequence[Index],
9911003
refs: list[weakref.ref | None] | None = None,
1004+
parent: object = None,
9921005
verify_integrity: bool = True,
9931006
) -> None:
9941007

@@ -1053,11 +1066,13 @@ def from_blocks(
10531066
blocks: list[Block],
10541067
axes: list[Index],
10551068
refs: list[weakref.ref | None] | None = None,
1069+
parent: object = None,
10561070
) -> BlockManager:
10571071
"""
10581072
Constructor for BlockManager and SingleBlockManager with same signature.
10591073
"""
1060-
return cls(blocks, axes, refs, verify_integrity=False)
1074+
parent = parent if _using_copy_on_write() else None
1075+
return cls(blocks, axes, refs, parent, verify_integrity=False)
10611076

10621077
# ----------------------------------------------------------------
10631078
# Indexing
@@ -1079,7 +1094,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
10791094
block = new_block(result, placement=slice(0, len(result)), ndim=1)
10801095
# in the case of a single block, the new block is a view
10811096
ref = weakref.ref(self.blocks[0])
1082-
return SingleBlockManager(block, self.axes[0], [ref])
1097+
return SingleBlockManager(block, self.axes[0], [ref], parent=self)
10831098

10841099
dtype = interleaved_dtype([blk.dtype for blk in self.blocks])
10851100

@@ -1113,7 +1128,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
11131128
block = new_block(result, placement=slice(0, len(result)), ndim=1)
11141129
return SingleBlockManager(block, self.axes[0])
11151130

1116-
def iget(self, i: int) -> SingleBlockManager:
1131+
def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
11171132
"""
11181133
Return the data as a SingleBlockManager.
11191134
"""
@@ -1123,7 +1138,9 @@ def iget(self, i: int) -> SingleBlockManager:
11231138
# shortcut for select a single-dim from a 2-dim BM
11241139
bp = BlockPlacement(slice(0, len(values)))
11251140
nb = type(block)(values, placement=bp, ndim=1)
1126-
return SingleBlockManager(nb, self.axes[1], [weakref.ref(block)])
1141+
ref = weakref.ref(block) if track_ref else None
1142+
parent = self if track_ref else None
1143+
return SingleBlockManager(nb, self.axes[1], [ref], parent)
11271144

11281145
def iget_values(self, i: int) -> ArrayLike:
11291146
"""
@@ -1365,7 +1382,9 @@ def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value) -> None
13651382
self.blocks = tuple(blocks)
13661383
self._clear_reference_block(blkno)
13671384

1368-
col_mgr = self.iget(loc)
1385+
# this manager is only created temporarily to mutate the values in place
1386+
# so don't track references, otherwise the `setitem` would perform CoW again
1387+
col_mgr = self.iget(loc, track_ref=False)
13691388
new_mgr = col_mgr.setitem((idx,), value)
13701389
self.iset(loc, new_mgr._block.values, inplace=True)
13711390

@@ -1463,7 +1482,9 @@ def idelete(self, indexer) -> BlockManager:
14631482
nbs, new_refs = self._slice_take_blocks_ax0(taker, only_slice=True)
14641483
new_columns = self.items[~is_deleted]
14651484
axes = [new_columns, self.axes[1]]
1466-
return type(self)(tuple(nbs), axes, new_refs, verify_integrity=False)
1485+
# TODO this might not be needed (can a delete ever be done in chained manner?)
1486+
parent = None if com.all_none(*new_refs) else self
1487+
return type(self)(tuple(nbs), axes, new_refs, parent, verify_integrity=False)
14671488

14681489
# ----------------------------------------------------------------
14691490
# Block-wise Operation
@@ -1869,6 +1890,7 @@ def __init__(
18691890
block: Block,
18701891
axis: Index,
18711892
refs: list[weakref.ref | None] | None = None,
1893+
parent: object = None,
18721894
verify_integrity: bool = False,
18731895
fastpath=lib.no_default,
18741896
) -> None:
@@ -1887,13 +1909,15 @@ def __init__(
18871909
self.axes = [axis]
18881910
self.blocks = (block,)
18891911
self.refs = refs
1912+
self.parent = parent if _using_copy_on_write() else None
18901913

18911914
@classmethod
18921915
def from_blocks(
18931916
cls,
18941917
blocks: list[Block],
18951918
axes: list[Index],
18961919
refs: list[weakref.ref | None] | None = None,
1920+
parent: object = None,
18971921
) -> SingleBlockManager:
18981922
"""
18991923
Constructor for BlockManager and SingleBlockManager with same signature.
@@ -1902,7 +1926,7 @@ def from_blocks(
19021926
assert len(axes) == 1
19031927
if refs is not None:
19041928
assert len(refs) == 1
1905-
return cls(blocks[0], axes[0], refs, verify_integrity=False)
1929+
return cls(blocks[0], axes[0], refs, parent, verify_integrity=False)
19061930

19071931
@classmethod
19081932
def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
@@ -1922,7 +1946,10 @@ def to_2d_mgr(self, columns: Index) -> BlockManager:
19221946
new_blk = type(blk)(arr, placement=bp, ndim=2)
19231947
axes = [columns, self.axes[0]]
19241948
refs: list[weakref.ref | None] = [weakref.ref(blk)]
1925-
return BlockManager([new_blk], axes=axes, refs=refs, verify_integrity=False)
1949+
parent = self if _using_copy_on_write() else None
1950+
return BlockManager(
1951+
[new_blk], axes=axes, refs=refs, parent=parent, verify_integrity=False
1952+
)
19261953

19271954
def _has_no_reference(self, i: int = 0) -> bool:
19281955
"""
@@ -2004,7 +2031,7 @@ def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockMana
20042031
new_idx = self.index[indexer]
20052032
# TODO(CoW) in theory only need to track reference if new_array is a view
20062033
ref = weakref.ref(blk)
2007-
return type(self)(block, new_idx, [ref])
2034+
return type(self)(block, new_idx, [ref], parent=self)
20082035

20092036
def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
20102037
# Assertion disabled for performance
@@ -2017,7 +2044,9 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
20172044
bp = BlockPlacement(slice(0, len(array)))
20182045
block = type(blk)(array, placement=bp, ndim=1)
20192046
new_index = self.index._getitem_slice(slobj)
2020-
return type(self)(block, new_index, [weakref.ref(blk)])
2047+
# TODO this method is only used in groupby SeriesSplitter at the moment,
2048+
# so passing refs / parent is not yet covered by the tests
2049+
return type(self)(block, new_index, [weakref.ref(blk)], parent=self)
20212050

20222051
@property
20232052
def index(self) -> Index:
@@ -2064,6 +2093,7 @@ def setitem_inplace(self, indexer, value) -> None:
20642093
if _using_copy_on_write() and not self._has_no_reference(0):
20652094
self.blocks = (self._block.copy(),)
20662095
self.refs = None
2096+
self.parent = None
20672097
self._cache.clear()
20682098

20692099
super().setitem_inplace(indexer, value)
@@ -2080,6 +2110,7 @@ def idelete(self, indexer) -> SingleBlockManager:
20802110
self._cache.clear()
20812111
# clear reference since delete always results in a new array
20822112
self.refs = None
2113+
self.parent = None
20832114
return self
20842115

20852116
def fast_xs(self, loc):

0 commit comments

Comments
 (0)