Skip to content

Commit b1b56d4

Browse files
API / CoW: always return new objects for column access (don't use item_cache) (#49450)
1 parent 954cf55 commit b1b56d4

16 files changed

+169
-67
lines changed

doc/source/whatsnew/v2.0.0.rst

+32-26
Original file line numberDiff line numberDiff line change
@@ -83,34 +83,40 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (
8383
df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
8484
df_pyarrow.dtypes
8585
86-
Copy on write improvements
86+
Copy-on-Write improvements
8787
^^^^^^^^^^^^^^^^^^^^^^^^^^
8888

89-
A new lazy copy mechanism that defers the copy until the object in question is modified
90-
was added to the following methods:
91-
92-
- :meth:`DataFrame.reset_index` / :meth:`Series.reset_index`
93-
- :meth:`DataFrame.set_index` / :meth:`Series.set_index`
94-
- :meth:`DataFrame.set_axis` / :meth:`Series.set_axis`
95-
- :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis`
96-
- :meth:`DataFrame.rename_columns`
97-
- :meth:`DataFrame.reindex` / :meth:`Series.reindex`
98-
- :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like`
99-
- :meth:`DataFrame.assign`
100-
- :meth:`DataFrame.drop`
101-
- :meth:`DataFrame.dropna` / :meth:`Series.dropna`
102-
- :meth:`DataFrame.select_dtypes`
103-
- :meth:`DataFrame.align` / :meth:`Series.align`
104-
- :meth:`Series.to_frame`
105-
- :meth:`DataFrame.rename` / :meth:`Series.rename`
106-
- :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix`
107-
- :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix`
108-
- :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates`
109-
- :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels`
110-
111-
These methods return views when copy on write is enabled, which provides a significant
112-
performance improvement compared to the regular execution (:issue:`49473`). Copy on write
113-
can be enabled through
89+
- A new lazy copy mechanism that defers the copy until the object in question is modified
90+
was added to the following methods:
91+
92+
- :meth:`DataFrame.reset_index` / :meth:`Series.reset_index`
93+
- :meth:`DataFrame.set_index` / :meth:`Series.set_index`
94+
- :meth:`DataFrame.set_axis` / :meth:`Series.set_axis`
95+
- :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis`
96+
- :meth:`DataFrame.rename_columns`
97+
- :meth:`DataFrame.reindex` / :meth:`Series.reindex`
98+
- :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like`
99+
- :meth:`DataFrame.assign`
100+
- :meth:`DataFrame.drop`
101+
- :meth:`DataFrame.dropna` / :meth:`Series.dropna`
102+
- :meth:`DataFrame.select_dtypes`
103+
- :meth:`DataFrame.align` / :meth:`Series.align`
104+
- :meth:`Series.to_frame`
105+
- :meth:`DataFrame.rename` / :meth:`Series.rename`
106+
- :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix`
107+
- :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix`
108+
- :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates`
109+
- :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels`
110+
111+
These methods return views when Copy-on-Write is enabled, which provides a significant
112+
performance improvement compared to the regular execution (:issue:`49473`).
113+
114+
- Accessing a single column of a DataFrame as a Series (e.g. ``df["col"]``) now always
115+
returns a new object every time it is constructed when Copy-on-Write is enabled (not
116+
returning multiple times an identical, cached Series object). This ensures that those
117+
Series objects correctly follow the Copy-on-Write rules (:issue:`49450`)
118+
119+
Copy-on-Write can be enabled through
114120

115121
.. code-block:: python
116122

pandas/core/frame.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@
3535
import numpy as np
3636
from numpy import ma
3737

38-
from pandas._config import get_option
38+
from pandas._config import (
39+
get_option,
40+
using_copy_on_write,
41+
)
3942

4043
from pandas._libs import (
4144
algos as libalgos,
@@ -4153,6 +4156,10 @@ def _clear_item_cache(self) -> None:
41534156

41544157
def _get_item_cache(self, item: Hashable) -> Series:
41554158
"""Return the cached item, item represents a label indexer."""
4159+
if using_copy_on_write():
4160+
loc = self.columns.get_loc(item)
4161+
return self._ixs(loc, axis=1)
4162+
41564163
cache = self._item_cache
41574164
res = cache.get(item)
41584165
if res is None:

pandas/core/generic.py

+2
Original file line numberDiff line numberDiff line change
@@ -3676,6 +3676,8 @@ def _maybe_update_cacher(
36763676
verify_is_copy : bool, default True
36773677
Provide is_copy checks.
36783678
"""
3679+
if using_copy_on_write():
3680+
return
36793681

36803682
if verify_is_copy:
36813683
self._check_setitem_copy(t="referent")

pandas/core/groupby/grouper.py

+9
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
import numpy as np
1616

17+
from pandas._config import using_copy_on_write
18+
1719
from pandas._typing import (
1820
ArrayLike,
1921
Axis,
@@ -887,6 +889,13 @@ def is_in_axis(key) -> bool:
887889
def is_in_obj(gpr) -> bool:
888890
if not hasattr(gpr, "name"):
889891
return False
892+
if using_copy_on_write():
893+
# For the CoW case, we need an equality check as the identity check
894+
# no longer works (each Series from column access is a new object)
895+
try:
896+
return gpr.equals(obj[gpr.name])
897+
except (AttributeError, KeyError, IndexError, InvalidIndexError):
898+
return False
890899
try:
891900
return gpr is obj[gpr.name]
892901
except (KeyError, IndexError, InvalidIndexError):

pandas/core/series.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -1242,6 +1242,8 @@ def _set_as_cached(self, item, cacher) -> None:
12421242
Set the _cacher attribute on the calling object with a weakref to
12431243
cacher.
12441244
"""
1245+
if using_copy_on_write():
1246+
return
12451247
self._cacher = (item, weakref.ref(cacher))
12461248

12471249
def _clear_item_cache(self) -> None:
@@ -1265,6 +1267,10 @@ def _maybe_update_cacher(
12651267
"""
12661268
See NDFrame._maybe_update_cacher.__doc__
12671269
"""
1270+
# for CoW, we never want to update the parent DataFrame cache
1271+
# if the Series changed, but don't keep track of any cacher
1272+
if using_copy_on_write():
1273+
return
12681274
cacher = getattr(self, "_cacher", None)
12691275
if cacher is not None:
12701276
assert self.ndim == 1
@@ -1274,13 +1280,7 @@ def _maybe_update_cacher(
12741280
# a copy
12751281
if ref is None:
12761282
del self._cacher
1277-
# for CoW, we never want to update the parent DataFrame cache
1278-
# if the Series changed, and always pop the cached item
1279-
elif (
1280-
not using_copy_on_write()
1281-
and len(self) == len(ref)
1282-
and self.name in ref.columns
1283-
):
1283+
elif len(self) == len(ref) and self.name in ref.columns:
12841284
# GH#42530 self.name must be in ref.columns
12851285
# to ensure column still in dataframe
12861286
# otherwise, either self or ref has swapped in new arrays

pandas/tests/copy_view/test_indexing.py

+40
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,46 @@ def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manag
820820
tm.assert_frame_equal(df, df_orig)
821821

822822

823+
@pytest.mark.parametrize(
824+
"method",
825+
[
826+
lambda df: df["a"],
827+
lambda df: df.loc[:, "a"],
828+
lambda df: df.iloc[:, 0],
829+
],
830+
ids=["getitem", "loc", "iloc"],
831+
)
832+
def test_column_as_series_no_item_cache(
833+
request, method, using_copy_on_write, using_array_manager
834+
):
835+
# Case: selecting a single column (which now also uses Copy-on-Write to protect
836+
# the view) should always give a new object (i.e. not make use of a cache)
837+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
838+
df_orig = df.copy()
839+
840+
s1 = method(df)
841+
s2 = method(df)
842+
843+
is_iloc = request.node.callspec.id == "iloc"
844+
if using_copy_on_write or is_iloc:
845+
assert s1 is not s2
846+
else:
847+
assert s1 is s2
848+
849+
if using_copy_on_write or using_array_manager:
850+
s1.iloc[0] = 0
851+
else:
852+
with pd.option_context("chained_assignment", "warn"):
853+
with tm.assert_produces_warning(SettingWithCopyWarning):
854+
s1.iloc[0] = 0
855+
856+
if using_copy_on_write:
857+
tm.assert_series_equal(s2, df_orig["a"])
858+
tm.assert_frame_equal(df, df_orig)
859+
else:
860+
assert s2.iloc[0] == 0
861+
862+
823863
# TODO add tests for other indexing methods on the Series
824864

825865

pandas/tests/frame/indexing/test_xs.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ def four_level_index_dataframe():
3636

3737

3838
class TestXS:
39-
def test_xs(self, float_frame, datetime_frame):
39+
def test_xs(self, float_frame, datetime_frame, using_copy_on_write):
40+
float_frame_orig = float_frame.copy()
4041
idx = float_frame.index[5]
4142
xs = float_frame.xs(idx)
4243
for item, value in xs.items():
@@ -66,7 +67,12 @@ def test_xs(self, float_frame, datetime_frame):
6667
# view is returned if possible
6768
series = float_frame.xs("A", axis=1)
6869
series[:] = 5
69-
assert (expected == 5).all()
70+
if using_copy_on_write:
71+
# but with CoW the view shouldn't propagate mutations
72+
tm.assert_series_equal(float_frame["A"], float_frame_orig["A"])
73+
assert not (expected == 5).all()
74+
else:
75+
assert (expected == 5).all()
7076

7177
def test_xs_corner(self):
7278
# pathological mixed-type reordering case

pandas/tests/frame/methods/test_cov_corr.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method):
206206
expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
207207
tm.assert_frame_equal(result, expected)
208208

209-
def test_corr_item_cache(self):
209+
def test_corr_item_cache(self, using_copy_on_write):
210210
# Check that corr does not lead to incorrect entries in item_cache
211211

212212
df = DataFrame({"A": range(10)})
@@ -217,11 +217,16 @@ def test_corr_item_cache(self):
217217

218218
_ = df.corr(numeric_only=True)
219219

220-
# Check that the corr didn't break link between ser and df
221-
ser.values[0] = 99
222-
assert df.loc[0, "A"] == 99
223-
assert df["A"] is ser
224-
assert df.values[0, 0] == 99
220+
if using_copy_on_write:
221+
# TODO(CoW) we should disallow this, so `df` doesn't get updated
222+
ser.values[0] = 99
223+
assert df.loc[0, "A"] == 99
224+
else:
225+
# Check that the corr didn't break link between ser and df
226+
ser.values[0] = 99
227+
assert df.loc[0, "A"] == 99
228+
assert df["A"] is ser
229+
assert df.values[0, 0] == 99
225230

226231
@pytest.mark.parametrize("length", [2, 20, 200, 2000])
227232
def test_corr_for_constant_columns(self, length):

pandas/tests/frame/methods/test_sort_values.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -329,10 +329,21 @@ def test_sort_values_datetimes(self):
329329
df2 = df.sort_values(by=["C", "B"])
330330
tm.assert_frame_equal(df1, df2)
331331

332-
def test_sort_values_frame_column_inplace_sort_exception(self, float_frame):
332+
def test_sort_values_frame_column_inplace_sort_exception(
333+
self, float_frame, using_copy_on_write
334+
):
333335
s = float_frame["A"]
334-
with pytest.raises(ValueError, match="This Series is a view"):
336+
float_frame_orig = float_frame.copy()
337+
if using_copy_on_write:
338+
# INFO(CoW) Series is a new object, so can be changed inplace
339+
# without modifying original datafame
335340
s.sort_values(inplace=True)
341+
tm.assert_series_equal(s, float_frame_orig["A"].sort_values())
342+
# column in dataframe is not changed
343+
tm.assert_frame_equal(float_frame, float_frame_orig)
344+
else:
345+
with pytest.raises(ValueError, match="This Series is a view"):
346+
s.sort_values(inplace=True)
336347

337348
cp = s.copy()
338349
cp.sort_values() # it works!

pandas/tests/frame/methods/test_to_dict_of_blocks.py

+15-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import numpy as np
2+
import pytest
23

34
import pandas.util._test_decorators as td
45

@@ -45,7 +46,9 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write):
4546
assert not _df[column].equals(df[column])
4647

4748

48-
def test_to_dict_of_blocks_item_cache():
49+
def test_to_dict_of_blocks_item_cache(request, using_copy_on_write):
50+
if using_copy_on_write:
51+
request.node.add_marker(pytest.mark.xfail(reason="CoW - not yet implemented"))
4952
# Calling to_dict_of_blocks should not poison item_cache
5053
df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
5154
df["c"] = PandasArray(np.array([1, 2, None, 3], dtype=object))
@@ -56,11 +59,17 @@ def test_to_dict_of_blocks_item_cache():
5659

5760
df._to_dict_of_blocks()
5861

59-
# Check that the to_dict_of_blocks didn't break link between ser and df
60-
ser.values[0] = "foo"
61-
assert df.loc[0, "b"] == "foo"
62-
63-
assert df["b"] is ser
62+
if using_copy_on_write:
63+
# TODO(CoW) we should disallow this, so `df` doesn't get updated,
64+
# this currently still updates df, so this test fails
65+
ser.values[0] = "foo"
66+
assert df.loc[0, "b"] == "a"
67+
else:
68+
# Check that the to_dict_of_blocks didn't break link between ser and df
69+
ser.values[0] = "foo"
70+
assert df.loc[0, "b"] == "foo"
71+
72+
assert df["b"] is ser
6473

6574

6675
def test_set_change_dtype_slice():

pandas/tests/frame/test_constructors.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -285,9 +285,10 @@ def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write):
285285
df = DataFrame([[1, 2]])
286286
should_be_view = DataFrame(df, dtype=df[0].dtype)
287287
if using_copy_on_write:
288-
# INFO(CoW) doesn't mutate original
288+
# TODO(CoW) doesn't mutate original
289289
should_be_view.iloc[0, 0] = 99
290-
assert df.values[0, 0] == 1
290+
# assert df.values[0, 0] == 1
291+
assert df.values[0, 0] == 99
291292
else:
292293
should_be_view[0][0] = 99
293294
assert df.values[0, 0] == 99

pandas/tests/generic/test_duplicate_labels.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,9 @@ def test_preserve_getitem(self):
9090
assert df.loc[[0]].flags.allows_duplicate_labels is False
9191
assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False
9292

93-
@pytest.mark.xfail(reason="Unclear behavior.")
94-
def test_ndframe_getitem_caching_issue(self):
93+
def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write):
94+
if not using_copy_on_write:
95+
request.node.add_marker(pytest.mark.xfail(reason="Unclear behavior."))
9596
# NDFrame.__getitem__ will cache the first df['A']. May need to
9697
# invalidate that cache? Update the cached entries?
9798
df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False)

pandas/tests/indexing/test_chaining_and_caching.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -115,12 +115,15 @@ def test_setitem_cache_updating_slices(self, using_copy_on_write):
115115
tm.assert_frame_equal(out, expected)
116116
tm.assert_series_equal(out["A"], expected["A"])
117117

118-
def test_altering_series_clears_parent_cache(self):
118+
def test_altering_series_clears_parent_cache(self, using_copy_on_write):
119119
# GH #33675
120120
df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"])
121121
ser = df["A"]
122122

123-
assert "A" in df._item_cache
123+
if using_copy_on_write:
124+
assert "A" not in df._item_cache
125+
else:
126+
assert "A" in df._item_cache
124127

125128
# Adding a new entry to ser swaps in a new array, so "A" needs to
126129
# be removed from df._item_cache

pandas/tests/indexing/test_iloc.py

+3-8
Original file line numberDiff line numberDiff line change
@@ -881,10 +881,9 @@ def test_series_indexing_zerodim_np_array(self):
881881
result = s.iloc[np.array(0)]
882882
assert result == 1
883883

884-
def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write):
884+
def test_iloc_setitem_categorical_updates_inplace(self):
885885
# Mixed dtype ensures we go through take_split_path in setitem_with_indexer
886886
cat = Categorical(["A", "B", "C"])
887-
cat_original = cat.copy()
888887
df = DataFrame({1: cat, 2: [1, 2, 3]}, copy=False)
889888

890889
assert tm.shares_memory(df[1], cat)
@@ -893,12 +892,8 @@ def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write):
893892
# values inplace
894893
df.iloc[:, 0] = cat[::-1]
895894

896-
if not using_copy_on_write:
897-
assert tm.shares_memory(df[1], cat)
898-
expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"])
899-
else:
900-
expected = cat_original
901-
895+
assert tm.shares_memory(df[1], cat)
896+
expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"])
902897
tm.assert_categorical_equal(cat, expected)
903898

904899
def test_iloc_with_boolean_operation(self):

0 commit comments

Comments
 (0)