Skip to content

Commit 221f636

Browse files
API: New copy / view semantics using Copy-on-Write (#46958)
* API: New copy / view semantics using Copy-on-Write * fix more tests * Handle CoW in BM.iset * Handle CoW in xs * add bunch of todo comments and usage warnings * Insert None ref in BM.insert * Ensure to not assume copy_on_write is set in case of ArrayManager * Handle refs in BM._combine / test CoW in select_dtypes * handle get_numeric_data for single block manager * fix test_internals (get_numeric_data now uses CoW) * handle refs in consolidation * fix deep=None for ArrayManager * update copy/view tests from other PR * clean-up fast_xs workarounds now it returns a SingleBlockManager * tracks refs in to_frame * fixup after updating main and column_setitem + iloc inplace setitem changes (gh-45333) * fix inplace fillna + fixup new tests * address comments + update some todo comments * Update pandas/core/internals/managers.py Co-authored-by: Matthew Roeschke <[email protected]> * fixup linting * update new copy_view tests to use get_array helper * add comment to setitem * switch default to False, ensure CoW copies only happen when enabled + add additional test build with CoW * update type annotations * Fix stata issue to avoid SettingWithCopyWarning in read_stata * update type + option comment * fixup new rename test Co-authored-by: Matthew Roeschke <[email protected]>
1 parent ae47909 commit 221f636

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+1019
-291
lines changed

.github/workflows/ubuntu.yml

+5
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ jobs:
5252
extra_apt: "language-pack-zh-hans"
5353
lang: "zh_CN.utf8"
5454
lc_all: "zh_CN.utf8"
55+
- name: "Copy-on-Write"
56+
env_file: actions-310.yaml
57+
pattern: "not slow and not network and not single_cpu"
58+
pandas_copy_on_write: "1"
5559
- name: "Data Manager"
5660
env_file: actions-38.yaml
5761
pattern: "not slow and not network and not single_cpu"
@@ -84,6 +88,7 @@ jobs:
8488
LC_ALL: ${{ matrix.lc_all || '' }}
8589
PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }}
8690
PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }}
91+
PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
8792
TEST_ARGS: ${{ matrix.test_args || '' }}
8893
PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }}
8994
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}

pandas/_libs/internals.pyx

+9-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from collections import defaultdict
2+
import weakref
23

34
cimport cython
45
from cpython.slice cimport PySlice_GetIndicesEx
@@ -674,8 +675,9 @@ cdef class BlockManager:
674675
public list axes
675676
public bint _known_consolidated, _is_consolidated
676677
public ndarray _blknos, _blklocs
678+
public list refs
677679

678-
def __cinit__(self, blocks=None, axes=None, verify_integrity=True):
680+
def __cinit__(self, blocks=None, axes=None, refs=None, verify_integrity=True):
679681
# None as defaults for unpickling GH#42345
680682
if blocks is None:
681683
# This adds 1-2 microseconds to DataFrame(np.array([]))
@@ -687,6 +689,7 @@ cdef class BlockManager:
687689

688690
self.blocks = blocks
689691
self.axes = axes.copy() # copy to make sure we are not remotely-mutable
692+
self.refs = refs
690693

691694
# Populate known_consolidate, blknos, and blklocs lazily
692695
self._known_consolidated = False
@@ -795,12 +798,14 @@ cdef class BlockManager:
795798
ndarray blknos, blklocs
796799

797800
nbs = []
801+
nrefs = []
798802
for blk in self.blocks:
799803
nb = blk.getitem_block_index(slobj)
800804
nbs.append(nb)
805+
nrefs.append(weakref.ref(blk))
801806

802807
new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
803-
mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)
808+
mgr = type(self)(tuple(nbs), new_axes, nrefs, verify_integrity=False)
804809

805810
# We can avoid having to rebuild blklocs/blknos
806811
blklocs = self._blklocs
@@ -813,7 +818,7 @@ cdef class BlockManager:
813818
def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
814819

815820
if axis == 0:
816-
new_blocks = self._slice_take_blocks_ax0(slobj)
821+
new_blocks, new_refs = self._slice_take_blocks_ax0(slobj)
817822
elif axis == 1:
818823
return self._get_index_slice(slobj)
819824
else:
@@ -822,4 +827,4 @@ cdef class BlockManager:
822827
new_axes = list(self.axes)
823828
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
824829

825-
return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)
830+
return type(self)(tuple(new_blocks), new_axes, new_refs, verify_integrity=False)

pandas/compat/pickle_compat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ def load_newobj(self):
224224
arr = np.array([], dtype="m8[ns]")
225225
obj = cls.__new__(cls, arr, arr.dtype)
226226
elif cls is BlockManager and not args:
227-
obj = cls.__new__(cls, (), [], False)
227+
obj = cls.__new__(cls, (), [], None, False)
228228
else:
229229
obj = cls.__new__(cls, *args)
230230

pandas/conftest.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,7 @@
2929
from decimal import Decimal
3030
import operator
3131
import os
32-
from typing import (
33-
Callable,
34-
Literal,
35-
)
32+
from typing import Callable
3633

3734
from dateutil.tz import (
3835
tzlocal,
@@ -1838,8 +1835,8 @@ def using_array_manager():
18381835

18391836

18401837
@pytest.fixture
1841-
def using_copy_on_write() -> Literal[False]:
1838+
def using_copy_on_write() -> bool:
18421839
"""
18431840
Fixture to check if Copy-on-Write is enabled.
18441841
"""
1845-
return False
1842+
return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block"

pandas/core/config_init.py

+20
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,26 @@ def use_inf_as_na_cb(key) -> None:
540540
)
541541

542542

543+
# TODO better name?
544+
copy_on_write_doc = """
545+
: bool
546+
Use new copy-view behaviour using Copy-on-Write. Defaults to False,
547+
unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable
548+
(if set to "1" for True, needs to be set before pandas is imported).
549+
"""
550+
551+
552+
with cf.config_prefix("mode"):
553+
cf.register_option(
554+
"copy_on_write",
555+
# Get the default from an environment variable, if set, otherwise defaults
556+
# to False. This environment variable can be set for testing.
557+
os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
558+
copy_on_write_doc,
559+
validator=is_bool,
560+
)
561+
562+
543563
# user warnings
544564
chained_assignment = """
545565
: string

pandas/core/frame.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -3708,13 +3708,19 @@ def _get_column_array(self, i: int) -> ArrayLike:
37083708
"""
37093709
Get the values of the i'th column (ndarray or ExtensionArray, as stored
37103710
in the Block)
3711+
3712+
Warning! The returned array is a view but doesn't handle Copy-on-Write,
3713+
so this should be used with caution (for read-only purposes).
37113714
"""
37123715
return self._mgr.iget_values(i)
37133716

37143717
def _iter_column_arrays(self) -> Iterator[ArrayLike]:
37153718
"""
37163719
Iterate over the arrays of all columns in order.
37173720
This returns the values as stored in the Block (ndarray or ExtensionArray).
3721+
3722+
Warning! The yielded arrays are views but don't handle Copy-on-Write,
3723+
so this should be used with caution (for read-only purposes).
37183724
"""
37193725
for i in range(len(self.columns)):
37203726
yield self._get_column_array(i)
@@ -5147,7 +5153,7 @@ def set_axis(
51475153
"labels",
51485154
[
51495155
("method", None),
5150-
("copy", True),
5156+
("copy", None),
51515157
("level", None),
51525158
("fill_value", np.nan),
51535159
("limit", None),
@@ -5372,7 +5378,7 @@ def rename(
53725378
index: Renamer | None = ...,
53735379
columns: Renamer | None = ...,
53745380
axis: Axis | None = ...,
5375-
copy: bool = ...,
5381+
copy: bool | None = ...,
53765382
inplace: Literal[True],
53775383
level: Level = ...,
53785384
errors: IgnoreRaise = ...,
@@ -5387,7 +5393,7 @@ def rename(
53875393
index: Renamer | None = ...,
53885394
columns: Renamer | None = ...,
53895395
axis: Axis | None = ...,
5390-
copy: bool = ...,
5396+
copy: bool | None = ...,
53915397
inplace: Literal[False] = ...,
53925398
level: Level = ...,
53935399
errors: IgnoreRaise = ...,
@@ -5402,7 +5408,7 @@ def rename(
54025408
index: Renamer | None = ...,
54035409
columns: Renamer | None = ...,
54045410
axis: Axis | None = ...,
5405-
copy: bool = ...,
5411+
copy: bool | None = ...,
54065412
inplace: bool = ...,
54075413
level: Level = ...,
54085414
errors: IgnoreRaise = ...,
@@ -5416,7 +5422,7 @@ def rename(
54165422
index: Renamer | None = None,
54175423
columns: Renamer | None = None,
54185424
axis: Axis | None = None,
5419-
copy: bool = True,
5425+
copy: bool | None = None,
54205426
inplace: bool = False,
54215427
level: Level = None,
54225428
errors: IgnoreRaise = "ignore",
@@ -6290,7 +6296,7 @@ class max type
62906296
if inplace:
62916297
new_obj = self
62926298
else:
6293-
new_obj = self.copy()
6299+
new_obj = self.copy(deep=None)
62946300
if allow_duplicates is not lib.no_default:
62956301
allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
62966302

pandas/core/generic.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -1046,7 +1046,7 @@ def _rename(
10461046
index: Renamer | None = None,
10471047
columns: Renamer | None = None,
10481048
axis: Axis | None = None,
1049-
copy: bool_t = True,
1049+
copy: bool_t | None = None,
10501050
inplace: bool_t = False,
10511051
level: Level | None = None,
10521052
errors: str = "ignore",
@@ -4161,6 +4161,12 @@ def _check_setitem_copy(self, t="setting", force=False):
41614161
df.iloc[0:5]['group'] = 'a'
41624162
41634163
"""
4164+
if (
4165+
config.get_option("mode.copy_on_write")
4166+
and config.get_option("mode.data_manager") == "block"
4167+
):
4168+
return
4169+
41644170
# return early if the check is not needed
41654171
if not (force or self._is_copy):
41664172
return
@@ -5261,7 +5267,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT:
52615267
axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
52625268
method = missing.clean_reindex_fill_method(kwargs.pop("method", None))
52635269
level = kwargs.pop("level", None)
5264-
copy = kwargs.pop("copy", True)
5270+
copy = kwargs.pop("copy", None)
52655271
limit = kwargs.pop("limit", None)
52665272
tolerance = kwargs.pop("tolerance", None)
52675273
fill_value = kwargs.pop("fill_value", None)
@@ -5286,9 +5292,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT:
52865292
for axis, ax in axes.items()
52875293
if ax is not None
52885294
):
5289-
if copy:
5290-
return self.copy()
5291-
return self
5295+
return self.copy(deep=copy)
52925296

52935297
# check if we are a multi reindex
52945298
if self._needs_reindex_multi(axes, method, level):
@@ -6265,7 +6269,7 @@ def astype(
62656269
return cast(NDFrameT, result)
62666270

62676271
@final
6268-
def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
6272+
def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT:
62696273
"""
62706274
Make a copy of this object's indices and data.
62716275

pandas/core/internals/array_manager.py

+10
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,11 @@ def copy(self: T, deep=True) -> T:
527527
-------
528528
BlockManager
529529
"""
530+
if deep is None:
531+
# ArrayManager does not yet support CoW, so deep=None always means
532+
# deep=True for now
533+
deep = True
534+
530535
# this preserves the notion of view copying of axes
531536
if deep:
532537
# hit in e.g. tests.io.json.test_pandas
@@ -591,6 +596,11 @@ def _reindex_indexer(
591596
592597
pandas-indexer with -1's only.
593598
"""
599+
if copy is None:
600+
# ArrayManager does not yet support CoW, so copy=None always means
601+
# copy=True for now
602+
copy = True
603+
594604
if indexer is None:
595605
if new_axis is self._axes[axis] and not copy:
596606
return self

pandas/core/internals/blocks.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -839,17 +839,22 @@ def _slice(
839839

840840
return self.values[slicer]
841841

842-
def set_inplace(self, locs, values: ArrayLike) -> None:
842+
def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
843843
"""
844844
Modify block values in-place with new item value.
845845
846+
If copy=True, first replace the underlying values with a copy before
847+
modifying them (for Copy-on-Write).
848+
846849
Notes
847850
-----
848851
`set_inplace` never creates a new array or new Block, whereas `setitem`
849852
_may_ create a new array and always creates a new Block.
850853
851854
Caller is responsible for checking values.dtype == self.dtype.
852855
"""
856+
if copy:
857+
self.values = self.values.copy()
853858
self.values[locs] = values
854859

855860
def take_nd(
@@ -1665,9 +1670,11 @@ def iget(self, i: int | tuple[int, int] | tuple[slice, int]):
16651670
raise IndexError(f"{self} only contains one item")
16661671
return self.values
16671672

1668-
def set_inplace(self, locs, values: ArrayLike) -> None:
1673+
def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
16691674
# When an ndarray, we should have locs.tolist() == [0]
16701675
# When a BlockPlacement we should have list(locs) == [0]
1676+
if copy:
1677+
self.values = self.values.copy()
16711678
self.values[:] = values
16721679

16731680
def _maybe_squeeze_arg(self, arg):

0 commit comments

Comments
 (0)