Skip to content

API / CoW: always return new objects for column access (don't use item_cache) #49450

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4219,6 +4219,13 @@ def _clear_item_cache(self) -> None:

def _get_item_cache(self, item: Hashable) -> Series:
"""Return the cached item, item represents a label indexer."""
if (
get_option("mode.copy_on_write")
and get_option("mode.data_manager") == "block"
):
loc = self.columns.get_loc(item)
return self._ixs(loc, axis=1)

cache = self._item_cache
res = cache.get(item)
if res is None:
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3694,6 +3694,11 @@ def _maybe_update_cacher(
verify_is_copy : bool, default True
Provide is_copy checks.
"""
if (
config.get_option("mode.copy_on_write")
and config.get_option("mode.data_manager") == "block"
):
return

if verify_is_copy:
self._check_setitem_copy(t="referent")
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

import numpy as np

from pandas._config import get_option

from pandas._typing import (
ArrayLike,
Axis,
Expand Down Expand Up @@ -844,6 +846,16 @@ def is_in_axis(key) -> bool:
def is_in_obj(gpr) -> bool:
if not hasattr(gpr, "name"):
return False
if (
get_option("mode.copy_on_write")
and get_option("mode.data_manager") == "block"
):
# For the CoW case, we need an equality check as the identity check
# no longer works (each Series from column access is a new object)
try:
return gpr.equals(obj[gpr.name])
except (AttributeError, KeyError, IndexError, InvalidIndexError):
return False
try:
return gpr is obj[gpr.name]
except (KeyError, IndexError, InvalidIndexError):
Expand Down
23 changes: 13 additions & 10 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1236,6 +1236,11 @@ def _set_as_cached(self, item, cacher) -> None:
Set the _cacher attribute on the calling object with a weakref to
cacher.
"""
if (
get_option("mode.copy_on_write")
and get_option("mode.data_manager") == "block"
):
return
self._cacher = (item, weakref.ref(cacher))

def _clear_item_cache(self) -> None:
Expand All @@ -1259,6 +1264,13 @@ def _maybe_update_cacher(
"""
See NDFrame._maybe_update_cacher.__doc__
"""
# for CoW, we never want to update the parent DataFrame cache
# if the Series changed, but don't keep track of any cacher
if (
get_option("mode.copy_on_write")
and get_option("mode.data_manager") == "block"
):
return
cacher = getattr(self, "_cacher", None)
if cacher is not None:
assert self.ndim == 1
Expand All @@ -1268,16 +1280,7 @@ def _maybe_update_cacher(
# a copy
if ref is None:
del self._cacher
# for CoW, we never want to update the parent DataFrame cache
# if the Series changed, and always pop the cached item
elif (
not (
get_option("mode.copy_on_write")
and get_option("mode.data_manager") == "block"
)
and len(self) == len(ref)
and self.name in ref.columns
):
elif len(self) == len(ref) and self.name in ref.columns:
# GH#42530 self.name must be in ref.columns
# to ensure column still in dataframe
# otherwise, either self or ref has swapped in new arrays
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,34 @@ def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manag
tm.assert_frame_equal(df, df_orig)


def test_column_as_series_no_item_cache(using_copy_on_write, using_array_manager):
# Case: selecting a single column (which now also uses Copy-on-Write to protect
# the view) should always give a new object (i.e. not make use of a cache)
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_orig = df.copy()

s1 = df["a"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth testing the other indexing methods that will return a columns as a Series (loc/iloc)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call, loc seems to use the same caching, so good to cover that in the test as well. Updated.

s2 = df["a"]

if using_copy_on_write:
assert s1 is not s2
else:
assert s1 is s2

if using_copy_on_write or using_array_manager:
s1.iloc[0] = 0
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(SettingWithCopyWarning):
s1.iloc[0] = 0

if using_copy_on_write:
tm.assert_series_equal(s2, df_orig["a"])
tm.assert_series_equal(df["a"], df_orig["a"])
else:
assert s2.iloc[0] == 0


# TODO add tests for other indexing methods on the Series


Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/frame/indexing/test_xs.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ def four_level_index_dataframe():


class TestXS:
def test_xs(self, float_frame, datetime_frame):
def test_xs(self, float_frame, datetime_frame, using_copy_on_write):
float_frame_orig = float_frame.copy()
idx = float_frame.index[5]
xs = float_frame.xs(idx)
for item, value in xs.items():
Expand Down Expand Up @@ -66,7 +67,12 @@ def test_xs(self, float_frame, datetime_frame):
# view is returned if possible
series = float_frame.xs("A", axis=1)
series[:] = 5
assert (expected == 5).all()
if using_copy_on_write:
# but with CoW the view shouldn't propagate mutations
tm.assert_series_equal(float_frame["A"], float_frame_orig["A"])
assert not (expected == 5).all()
else:
assert (expected == 5).all()

def test_xs_corner(self):
# pathological mixed-type reordering case
Expand Down
17 changes: 11 additions & 6 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method):
expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
tm.assert_frame_equal(result, expected)

def test_corr_item_cache(self):
def test_corr_item_cache(self, using_copy_on_write):
# Check that corr does not lead to incorrect entries in item_cache

df = DataFrame({"A": range(10)})
Expand All @@ -220,11 +220,16 @@ def test_corr_item_cache(self):

_ = df.corr()

# Check that the corr didn't break link between ser and df
ser.values[0] = 99
assert df.loc[0, "A"] == 99
assert df["A"] is ser
assert df.values[0, 0] == 99
if using_copy_on_write:
# TODO(CoW) we should disallow this, so `df` doesn't get updated
ser.values[0] = 99
assert df.loc[0, "A"] == 99
else:
# Check that the corr didn't break link between ser and df
ser.values[0] = 99
assert df.loc[0, "A"] == 99
assert df["A"] is ser
assert df.values[0, 0] == 99

@pytest.mark.parametrize("length", [2, 20, 200, 2000])
def test_corr_for_constant_columns(self, length):
Expand Down
15 changes: 13 additions & 2 deletions pandas/tests/frame/methods/test_sort_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,10 +330,21 @@ def test_sort_values_datetimes(self):
df2 = df.sort_values(by=["C", "B"])
tm.assert_frame_equal(df1, df2)

def test_sort_values_frame_column_inplace_sort_exception(self, float_frame):
def test_sort_values_frame_column_inplace_sort_exception(
self, float_frame, using_copy_on_write
):
s = float_frame["A"]
with pytest.raises(ValueError, match="This Series is a view"):
float_frame_orig = float_frame.copy()
if using_copy_on_write:
# INFO(CoW) Series is a new object, so can be changed inplace
# without modifying original datafame
s.sort_values(inplace=True)
tm.assert_series_equal(s, float_frame_orig["A"].sort_values())
# column in dataframe is not changed
tm.assert_frame_equal(float_frame, float_frame_orig)
else:
with pytest.raises(ValueError, match="This Series is a view"):
s.sort_values(inplace=True)

cp = s.copy()
cp.sort_values() # it works!
Expand Down
21 changes: 15 additions & 6 deletions pandas/tests/frame/methods/test_to_dict_of_blocks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

Expand Down Expand Up @@ -45,7 +46,9 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write):
assert not _df[column].equals(df[column])


def test_to_dict_of_blocks_item_cache():
def test_to_dict_of_blocks_item_cache(request, using_copy_on_write):
if using_copy_on_write:
request.node.add_marker(pytest.mark.xfail(reason="CoW - not yet implemented"))
# Calling to_dict_of_blocks should not poison item_cache
df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
df["c"] = PandasArray(np.array([1, 2, None, 3], dtype=object))
Expand All @@ -56,11 +59,17 @@ def test_to_dict_of_blocks_item_cache():

df._to_dict_of_blocks()

# Check that the to_dict_of_blocks didn't break link between ser and df
ser.values[0] = "foo"
assert df.loc[0, "b"] == "foo"

assert df["b"] is ser
if using_copy_on_write:
# TODO(CoW) we should disallow this, so `df` doesn't get updated,
# this currently still updates df, so this test fails
ser.values[0] = "foo"
assert df.loc[0, "b"] == "a"
else:
# Check that the to_dict_of_blocks didn't break link between ser and df
ser.values[0] = "foo"
assert df.loc[0, "b"] == "foo"

assert df["b"] is ser


def test_set_change_dtype_slice():
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,10 @@ def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write):
df = DataFrame([[1, 2]])
should_be_view = DataFrame(df, dtype=df[0].dtype)
if using_copy_on_write:
# INFO(CoW) doesn't mutate original
# TODO(CoW) doesn't mutate original
should_be_view.iloc[0, 0] = 99
assert df.values[0, 0] == 1
# assert df.values[0, 0] == 1
assert df.values[0, 0] == 99
else:
should_be_view[0][0] = 99
assert df.values[0, 0] == 99
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/generic/test_duplicate_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ def test_preserve_getitem(self):
assert df.loc[[0]].flags.allows_duplicate_labels is False
assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False

@pytest.mark.xfail(reason="Unclear behavior.")
def test_ndframe_getitem_caching_issue(self):
def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write):
if not using_copy_on_write:
request.node.add_marker(pytest.mark.xfail(reason="Unclear behavior."))
# NDFrame.__getitem__ will cache the first df['A']. May need to
# invalidate that cache? Update the cached entries?
df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False)
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/indexing/test_chaining_and_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,15 @@ def test_setitem_cache_updating_slices(self, using_copy_on_write):
tm.assert_frame_equal(out, expected)
tm.assert_series_equal(out["A"], expected["A"])

def test_altering_series_clears_parent_cache(self):
def test_altering_series_clears_parent_cache(self, using_copy_on_write):
# GH #33675
df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"])
ser = df["A"]

assert "A" in df._item_cache
if using_copy_on_write:
assert "A" not in df._item_cache
else:
assert "A" in df._item_cache

# Adding a new entry to ser swaps in a new array, so "A" needs to
# be removed from df._item_cache
Expand Down
11 changes: 3 additions & 8 deletions pandas/tests/indexing/test_iloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,10 +900,9 @@ def test_series_indexing_zerodim_np_array(self):
assert result == 1

@td.skip_array_manager_not_yet_implemented
def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write):
def test_iloc_setitem_categorical_updates_inplace(self):
# Mixed dtype ensures we go through take_split_path in setitem_with_indexer
cat = Categorical(["A", "B", "C"])
cat_original = cat.copy()
df = DataFrame({1: cat, 2: [1, 2, 3]}, copy=False)

assert tm.shares_memory(df[1], cat)
Expand All @@ -913,12 +912,8 @@ def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write):
with tm.assert_produces_warning(FutureWarning, match=msg):
df.iloc[:, 0] = cat[::-1]

if not using_copy_on_write:
assert tm.shares_memory(df[1], cat)
expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"])
else:
expected = cat_original

assert tm.shares_memory(df[1], cat)
expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"])
tm.assert_categorical_equal(cat, expected)

def test_iloc_with_boolean_operation(self):
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1114,10 +1114,13 @@ def test_identity_slice_returns_new_object(
else:
assert (sliced_df["a"] == 4).all()

# These should not return copies
# These should not return copies (but still new objects for CoW)
assert original_df is original_df.loc[:, :]
df = DataFrame(np.random.randn(10, 4))
assert df[0] is df.loc[:, 0]
if using_copy_on_write:
assert df[0] is not df.loc[:, 0]
else:
assert df[0] is df.loc[:, 0]

# Same tests for Series
original_series = Series([1, 2, 3, 4, 5, 6])
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/series/methods/test_sort_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class TestSeriesSortValues:
def test_sort_values(self, datetime_series):
def test_sort_values(self, datetime_series, using_copy_on_write):

# check indexes are reordered corresponding with the values
ser = Series([3, 2, 4, 1], ["A", "B", "C", "D"])
Expand Down Expand Up @@ -85,8 +85,12 @@ def test_sort_values(self, datetime_series):
"This Series is a view of some other array, to sort in-place "
"you must create a copy"
)
with pytest.raises(ValueError, match=msg):
if using_copy_on_write:
s.sort_values(inplace=True)
tm.assert_series_equal(s, df.iloc[:, 0].sort_values())
else:
with pytest.raises(ValueError, match=msg):
s.sort_values(inplace=True)

def test_sort_values_categorical(self):

Expand Down