Skip to content

API / CoW: detect and raise error for chained assignment under Copy-on-Write #49467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jan 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ jobs:
env_file: actions-pypy-38.yaml
pattern: "not slow and not network and not single_cpu"
test_args: "--max-worker-restart 0"
error_on_warnings: "0"
- name: "Numpy Dev"
env_file: actions-310-numpydev.yaml
pattern: "not slow and not network and not single_cpu"
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/testing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Exceptions and warnings
errors.AccessorRegistrationWarning
errors.AttributeConflictWarning
errors.CategoricalConversionWarning
errors.ChainedAssignmentError
errors.ClosedFileError
errors.CSSWarning
errors.DatabaseError
Expand Down
8 changes: 8 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@ Copy-on-Write improvements
a modification to the data happens) when constructing a Series from an existing
Series with the default of ``copy=False`` (:issue:`50471`)

- Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``)
will now always raise an exception when Copy-on-Write is enabled. In this mode,
chained assignment can never work because we are always setting into a temporary
object that is the result of an indexing operation (getitem), which under
Copy-on-Write always behaves as a copy. Thus, assigning through a chain
can never update the original Series or DataFrame. Therefore, an informative
error is raised to the user instead of silently doing nothing (:issue:`49467`)

Copy-on-Write can be enabled through

.. code-block:: python
Expand Down
2 changes: 2 additions & 0 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
decompress_file,
ensure_clean,
ensure_safe_environment_variables,
raises_chained_assignment_error,
set_timezone,
use_numexpr,
with_csv_dialect,
Expand Down Expand Up @@ -1125,6 +1126,7 @@ def shares_memory(left, right) -> bool:
"rands",
"reset_display_options",
"RNGContext",
"raises_chained_assignment_error",
"round_trip_localpath",
"round_trip_pathlib",
"round_trip_pickle",
Expand Down
21 changes: 21 additions & 0 deletions pandas/_testing/contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

import numpy as np

from pandas.compat import PYPY
from pandas.errors import ChainedAssignmentError

from pandas import set_option

from pandas.io.common import get_handle
Expand Down Expand Up @@ -227,3 +230,21 @@ def __exit__(
) -> None:

np.random.set_state(self.start_state)


def raises_chained_assignment_error():

if PYPY:
from contextlib import nullcontext

return nullcontext()
else:
import pytest

return pytest.raises(
ChainedAssignmentError,
match=(
"A value is trying to be set on a copy of a DataFrame or Series "
"through chained assignment"
),
)
12 changes: 11 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import functools
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -91,12 +92,17 @@
WriteBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import (
function as nv,
np_percentile_argname,
)
from pandas.errors import InvalidIndexError
from pandas.errors import (
ChainedAssignmentError,
InvalidIndexError,
_chained_assignment_msg,
)
from pandas.util._decorators import (
Appender,
Substitution,
Expand Down Expand Up @@ -3862,6 +3868,10 @@ def isetitem(self, loc, value) -> None:
self._iset_item_mgr(loc, arraylike, inplace=False)

def __setitem__(self, key, value):
if not PYPY and using_copy_on_write():
if sys.getrefcount(self) <= 3:
raise ChainedAssignmentError(_chained_assignment_msg)

key = com.apply_if_callable(key, self)

# see if we can slice the rows
Expand Down
10 changes: 10 additions & 0 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from contextlib import suppress
import sys
from typing import (
TYPE_CHECKING,
Hashable,
Expand All @@ -12,17 +13,22 @@

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs.indexing import NDFrameIndexerBase
from pandas._libs.lib import item_from_zerodim
from pandas._typing import (
Axis,
AxisInt,
)
from pandas.compat import PYPY
from pandas.errors import (
AbstractMethodError,
ChainedAssignmentError,
IndexingError,
InvalidIndexError,
LossySetitemError,
_chained_assignment_msg,
)
from pandas.util._decorators import doc

Expand Down Expand Up @@ -830,6 +836,10 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None:

@final
def __setitem__(self, key, value) -> None:
if not PYPY and using_copy_on_write():
if sys.getrefcount(self.obj) <= 2:
raise ChainedAssignmentError(_chained_assignment_msg)

check_dict_or_set_indexers(key)
if isinstance(key, tuple):
key = tuple(list(x) if is_iterator(x) else x for x in key)
Expand Down
12 changes: 11 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
from __future__ import annotations

import sys
from textwrap import dedent
from typing import (
IO,
Expand Down Expand Up @@ -67,8 +68,13 @@
WriteBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat.numpy import function as nv
from pandas.errors import InvalidIndexError
from pandas.errors import (
ChainedAssignmentError,
InvalidIndexError,
_chained_assignment_msg,
)
from pandas.util._decorators import (
Appender,
Substitution,
Expand Down Expand Up @@ -1074,6 +1080,10 @@ def _get_value(self, label, takeable: bool = False):
return self.iloc[loc]

def __setitem__(self, key, value) -> None:
if not PYPY and using_copy_on_write():
if sys.getrefcount(self) <= 3:
raise ChainedAssignmentError(_chained_assignment_msg)

check_dict_or_set_indexers(key)
key = com.apply_if_callable(key, self)
cacher_needs_updating = self._check_is_chained_assignment_possible()
Expand Down
36 changes: 36 additions & 0 deletions pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,42 @@ class SettingWithCopyWarning(Warning):
"""


class ChainedAssignmentError(ValueError):
"""
Exception raised when trying to set using chained assignment.

When the ``mode.copy_on_write`` option is enabled, chained assignment can
never work. In such a situation, we are always setting into a temporary
object that is the result of an indexing operation (getitem), which under
Copy-on-Write always behaves as a copy. Thus, assigning through a chain
can never update the original Series or DataFrame.

For more information on view vs. copy,
see :ref:`the user guide<indexing.view_versus_copy>`.

Examples
--------
>>> pd.options.mode.copy_on_write = True
>>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A'])
>>> df["A"][0:3] = 10 # doctest: +SKIP
... # ChainedAssignmentError: ...
"""


_chained_assignment_msg = (
"A value is trying to be set on a copy of a DataFrame or Series "
"through chained assignment.\n"
"When using the Copy-on-Write mode, such chained assignment never works "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

never updates the original....

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's on the next line?

Copy link
Member

@phofl phofl Jan 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah sorry, should have been clearer:

I would reword to:

When using the Copy-on-Write mode, such chained assignment never updates the original DataFrame or Series, ...

This sounds better to me

"to update the original DataFrame or Series, because the intermediate "
"object on which we are setting values always behaves as a copy.\n\n"
"Try using '.loc[row_indexer, col_indexer] = value' instead, to perform "
"the assignment in a single step.\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)


class NumExprClobberingError(NameError):
"""
Exception raised when trying to use a built-in numexpr name as a variable name.
Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1245,12 +1245,15 @@ def test_setitem_column_update_inplace(self, using_copy_on_write):
df = DataFrame({col: np.zeros(len(labels)) for col in labels}, index=labels)
values = df._mgr.blocks[0].values

for label in df.columns:
df[label][label] = 1

if not using_copy_on_write:
for label in df.columns:
df[label][label] = 1

# diagonal values all updated
assert np.all(values[np.arange(10), np.arange(10)] == 1)
else:
with tm.raises_chained_assignment_error():
for label in df.columns:
df[label][label] = 1
# original dataframe not updated
assert np.all(values[np.arange(10), np.arange(10)] == 0)
3 changes: 2 additions & 1 deletion pandas/tests/frame/indexing/test_xs.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ def test_xs_view(self, using_array_manager, using_copy_on_write):
df_orig = dm.copy()

if using_copy_on_write:
dm.xs(2)[:] = 20
with tm.raises_chained_assignment_error():
dm.xs(2)[:] = 20
tm.assert_frame_equal(dm, df_orig)
elif using_array_manager:
# INFO(ArrayManager) with ArrayManager getting a row as a view is
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,11 @@ def test_stale_cached_series_bug_473(self, using_copy_on_write):
)
repr(Y)
Y["e"] = Y["e"].astype("object")
Y["g"]["c"] = np.NaN
if using_copy_on_write:
with tm.raises_chained_assignment_error():
Y["g"]["c"] = np.NaN
else:
Y["g"]["c"] = np.NaN
repr(Y)
result = Y.sum() # noqa
exp = Y["g"].sum() # noqa
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/indexing/multiindex/test_chaining_and_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,13 @@ def test_cache_updating(using_copy_on_write):

# setting via chained assignment
# but actually works, since everything is a view
df.loc[0]["z"].iloc[0] = 1.0
result = df.loc[(0, 0), "z"]
if using_copy_on_write:
assert result == df_original.loc[0, "z"]
with tm.raises_chained_assignment_error():
df.loc[0]["z"].iloc[0] = 1.0
assert df.loc[(0, 0), "z"] == df_original.loc[0, "z"]
else:
df.loc[0]["z"].iloc[0] = 1.0
result = df.loc[(0, 0), "z"]
assert result == 1

# correct setting
Expand Down
14 changes: 10 additions & 4 deletions pandas/tests/indexing/multiindex/test_partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,20 +128,26 @@ def test_partial_set(
exp.iloc[65:85] = 0
tm.assert_frame_equal(df, exp)

df["A"].loc[2000, 4] = 1
if not using_copy_on_write:
exp["A"].loc[2000, 4].values[:] = 1
if using_copy_on_write:
with tm.raises_chained_assignment_error():
df["A"].loc[2000, 4] = 1
df.loc[(2000, 4), "A"] = 1
else:
df["A"].loc[2000, 4] = 1
exp.iloc[65:85, 0] = 1
tm.assert_frame_equal(df, exp)

df.loc[2000] = 5
exp.iloc[:100] = 5
tm.assert_frame_equal(df, exp)

# this works...for now
df["A"].iloc[14] = 5
if using_copy_on_write:
with tm.raises_chained_assignment_error():
df["A"].iloc[14] = 5
df["A"].iloc[14] == exp["A"].iloc[14]
else:
df["A"].iloc[14] = 5
assert df["A"].iloc[14] == 5

@pytest.mark.parametrize("dtype", [int, float])
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/indexing/multiindex/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,8 +501,8 @@ def test_frame_setitem_copy_raises(
# will raise/warn as its chained assignment
df = multiindex_dataframe_random_data.T
if using_copy_on_write:
# TODO(CoW) it would be nice if this could still warn/raise
df["foo"]["one"] = 2
with tm.raises_chained_assignment_error():
df["foo"]["one"] = 2
else:
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(SettingWithCopyError, match=msg):
Expand All @@ -516,7 +516,8 @@ def test_frame_setitem_copy_no_write(
expected = frame
df = frame.copy()
if using_copy_on_write:
df["foo"]["one"] = 2
with tm.raises_chained_assignment_error():
df["foo"]["one"] = 2
else:
msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(SettingWithCopyError, match=msg):
Expand Down
Loading