Skip to content

CoW: Avoid warning in apply for mixed dtype frame #56212

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from pandas._config import option_context

from pandas._libs import lib
from pandas._libs.internals import BlockValuesRefs
from pandas._typing import (
AggFuncType,
AggFuncTypeBase,
Expand Down Expand Up @@ -1254,6 +1255,8 @@ def series_generator(self) -> Generator[Series, None, None]:
ser = self.obj._ixs(0, axis=0)
mgr = ser._mgr

is_view = mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]

if isinstance(ser.dtype, ExtensionDtype):
# values will be incorrect for this block
# TODO(EA2D): special case would be unnecessary with 2D EAs
Expand All @@ -1267,6 +1270,14 @@ def series_generator(self) -> Generator[Series, None, None]:
ser._mgr = mgr
mgr.set_values(arr)
object.__setattr__(ser, "_name", name)
Comment on lines 1270 to 1272
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I don't understand how this part adds an extra ref after the first row (i.e. the reason you need to reset refs to have one ref to only itself)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I assume that's because we create a shallow copy of the result:

pandas/pandas/core/apply.py

Lines 1078 to 1084 in 7b528c9

for i, v in enumerate(series_gen):
# ignore SettingWithCopy here in case the user mutates
results[i] = self.func(v, *self.args, **self.kwargs)
if isinstance(results[i], ABCSeries):
# If we have a view on v, we need to make a copy because
# series_generator will swap out the underlying data
results[i] = results[i].copy(deep=False)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep exactly

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be cleaner to create a shallow "copy" of the Series that does not keep a reference? (I don't think we have any helper function to do that right now? except for Series(results[i]._values, copy=False, index=.., name=..), like what you did in the eval PR to create Series objects without a reference)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My assumption was that this is optimized for performance and I didn’t want to screw with that. And keeping the ugly parts in one method seemed like a fine compromise

that said, the udf could also set up references that could screw us over

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn’t want to screw with that

It's already taking a shallow copy always (even when not needed), so having a specialized shallow copy without tracking refs should at best slightly improve things.

The logic about requiring this shallow copy also already lives where the series_generator is used, so I think I find it slightly clearer to consolidate that logic there

Now, actually implementing a Series._shallow_copy_without_refs() is not fully trivial because AFAIK we can't just reuse the existing mgr and block copy methods and reset the refs after the fact, as that then already appended the refs object which is shared with the series generator series with a new ref.
So let's consider that for a follow-up PR if I want to pursue that ;)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am just going to update some comments here, then

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I'd rather not do too many helpers that make this easy since you really shouldn't be doing this

if not is_view:
# In apply_series_generator we store the a shallow copy of the
# result, which potentially increases the ref count of this reused
# `ser` object (depending on the result of the applied function)
# -> if that happened and `ser` is already a copy, then we reset
# the refs here to avoid triggering a unnecessary CoW inside the
# applied function (https://github.com/pandas-dev/pandas/pull/56212)
mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) # type: ignore[union-attr]
yield ser

@staticmethod
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2071,11 +2071,11 @@ def set_values(self, values: ArrayLike) -> None:
Set the values of the single block in place.

Use at your own risk! This does not check if the passed values are
valid for the current Block/SingleBlockManager (length, dtype, etc).
valid for the current Block/SingleBlockManager (length, dtype, etc),
and this does not properly keep track of references.
"""
# TODO(CoW) do we need to handle copy on write here? Currently this is
# only used for FrameColumnApply.series_generator (what if apply is
# mutating inplace?)
# NOTE(CoW) Currently this is only used for FrameColumnApply.series_generator
# which handles CoW by setting the refs manually if necessary
self.blocks[0].values = values
self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values)))

Expand Down
13 changes: 2 additions & 11 deletions pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
DataFrame,
Series,
date_range,
notna,
)
import pandas._testing as tm

Expand Down Expand Up @@ -150,9 +149,7 @@ def test_transform_axis_1_raises():
Series([1]).transform("sum", axis=1)


# TODO(CoW-warn) should not need to warn
@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
def test_apply_modify_traceback(warn_copy_on_write):
def test_apply_modify_traceback():
data = DataFrame(
{
"A": [
Expand Down Expand Up @@ -207,15 +204,9 @@ def transform(row):
row["D"] = 7
return row

def transform2(row):
if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo":
row["D"] = 7
return row

msg = "'float' object has no attribute 'startswith'"
with pytest.raises(AttributeError, match=msg):
with tm.assert_cow_warning(warn_copy_on_write):
data.apply(transform, axis=1)
data.apply(transform, axis=1)


@pytest.mark.parametrize(
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -2013,3 +2013,31 @@ def test_eval_inplace(using_copy_on_write, warn_copy_on_write):
df.iloc[0, 0] = 100
if using_copy_on_write:
tm.assert_frame_equal(df_view, df_orig)


def test_apply_modify_row(using_copy_on_write, warn_copy_on_write):
# Case: applying a function on each row as a Series object, where the
# function mutates the row object (which needs to trigger CoW if row is a view)
df = DataFrame({"A": [1, 2], "B": [3, 4]})
df_orig = df.copy()

def transform(row):
row["B"] = 100
return row

with tm.assert_cow_warning(warn_copy_on_write):
df.apply(transform, axis=1)

if using_copy_on_write:
tm.assert_frame_equal(df, df_orig)
else:
assert df.loc[0, "B"] == 100

# row Series is a copy
df = DataFrame({"A": [1, 2], "B": ["b", "c"]})
df_orig = df.copy()

with tm.assert_produces_warning(None):
df.apply(transform, axis=1)

tm.assert_frame_equal(df, df_orig)