Skip to content

ENH: Make shallow copy for align nocopy with CoW #50917

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5429,6 +5429,8 @@ def _reindex_with_indexers(

if (copy or copy is None) and new_data is self._mgr:
new_data = new_data.copy(deep=copy)
elif using_copy_on_write() and new_data is self._mgr:
new_data = new_data.copy(deep=copy)
Comment on lines 5430 to +5433
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (copy or copy is None) and new_data is self._mgr:
new_data = new_data.copy(deep=copy)
elif using_copy_on_write() and new_data is self._mgr:
new_data = new_data.copy(deep=copy)
if (copy or copy is None or using_copy_on_write()) and new_data is self._mgr:
new_data = new_data.copy(deep=copy)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you also add an explicit test that covers this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer my version, this is easier to read imo. The conditions are nicely aligned.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The align copy false test covers this

Copy link
Member

@jorisvandenbossche jorisvandenbossche Jan 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you also add an explicit test that covers this?

Sorry, that wasn't clear ;) This change is in _reindex_with_indexers, which is used by align, but is also used through reindex? (I assumed that align would use it through reindex, but that's actually not the case I see now, it is using it directly) And so I was wondering if this change is also needed for correct behaviour of reindex (and so if we missed something there), and if so, it would be good to also have a test for reindex that covers this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah got you.
We can't get there through reindex, because we have early exits preventing us from reaching _reindex_with_indexers if both axes are equal to the object that is reindexed.

I also looked into avoiding this function call when using align, but there is nothing to be gained there. It makes more sense covering this here.


return self._constructor(new_data).__finalize__(self)

Expand Down Expand Up @@ -9469,6 +9471,7 @@ def _align_series(
limit=None,
fill_axis: Axis = 0,
):
uses_cow = using_copy_on_write()

is_series = isinstance(self, ABCSeries)

Expand All @@ -9492,7 +9495,10 @@ def _align_series(
if is_series:
left = self._reindex_indexer(join_index, lidx, copy)
elif lidx is None or join_index is None:
left = self.copy(deep=copy) if copy or copy is None else self
if uses_cow:
left = self.copy(deep=copy)
else:
left = self.copy(deep=copy) if copy or copy is None else self
Comment on lines +9498 to +9501
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should consider just doing left = self.copy(deep=copy) in both cases. In general, we always return new objects from methods even with copy=False (with swapaxes being another exception).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I agree, but would prefer to keep things separate. We have a couple of other methods where we return self in case of len(index)==0 for example, would like to change them all together if we want to.

Can put up a pr for this first before merging here if you prefer

else:
left = self._constructor(
self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy)
Expand Down Expand Up @@ -9521,7 +9527,10 @@ def _align_series(
left = self._constructor(fdata)

if ridx is None:
right = other.copy(deep=copy) if copy or copy is None else other
if uses_cow:
right = other.copy(deep=copy)
else:
right = other.copy(deep=copy) if copy or copy is None else other
else:
right = other.reindex(join_index, level=level)

Expand Down
2 changes: 2 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4643,6 +4643,8 @@ def _reindex_indexer(
if indexer is None and (
new_index is None or new_index.names == self.index.names
):
if using_copy_on_write():
return self.copy(deep=copy)
if copy or copy is None:
return self.copy(deep=copy)
return self
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,41 @@ def test_align_series(using_copy_on_write):
tm.assert_series_equal(ser_other, ser_orig)


def test_align_copy_false(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df_orig = df.copy()
df2, df3 = df.align(df, copy=False)

assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

if using_copy_on_write:
df2.loc[0, "a"] = 0
tm.assert_frame_equal(df, df_orig) # Original is unchanged

df3.loc[0, "a"] = 0
tm.assert_frame_equal(df, df_orig) # Original is unchanged


def test_align_with_series_copy_false(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
ser = Series([1, 2, 3], name="x")
ser_orig = ser.copy()
df_orig = df.copy()
df2, ser2 = df.align(ser, copy=False, axis=0)

assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
assert np.shares_memory(get_array(ser, "x"), get_array(ser2, "x"))

if using_copy_on_write:
df2.loc[0, "a"] = 0
tm.assert_frame_equal(df, df_orig) # Original is unchanged

ser2.loc[0] = 0
tm.assert_series_equal(ser, ser_orig) # Original is unchanged


def test_to_frame(using_copy_on_write):
# Case: converting a Series to a DataFrame with to_frame
ser = Series([1, 2, 3])
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/frame/methods/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,15 @@ def test_frame_align_aware(self):
assert new1.index.tz is timezone.utc
assert new2.index.tz is timezone.utc

def test_align_float(self, float_frame):
def test_align_float(self, float_frame, using_copy_on_write):
af, bf = float_frame.align(float_frame)
assert af._mgr is not float_frame._mgr

af, bf = float_frame.align(float_frame, copy=False)
assert af._mgr is float_frame._mgr
if not using_copy_on_write:
assert af._mgr is float_frame._mgr
else:
assert af._mgr is not float_frame._mgr

# axis = 0
other = float_frame.iloc[:-5, :3]
Expand Down
12 changes: 9 additions & 3 deletions pandas/tests/series/methods/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_align_fill_method(
tm.assert_series_equal(ab, eb)


def test_align_nocopy(datetime_series):
def test_align_nocopy(datetime_series, using_copy_on_write):
b = datetime_series[:5].copy()

# do copy
Expand All @@ -95,7 +95,10 @@ def test_align_nocopy(datetime_series):
a = datetime_series.copy()
ra, _ = a.align(b, join="left", copy=False)
ra[:5] = 5
assert (a[:5] == 5).all()
if using_copy_on_write:
assert not (a[:5] == 5).any()
else:
assert (a[:5] == 5).all()

# do copy
a = datetime_series.copy()
Expand All @@ -109,7 +112,10 @@ def test_align_nocopy(datetime_series):
b = datetime_series[:5].copy()
_, rb = a.align(b, join="right", copy=False)
rb[:2] = 5
assert (b[:2] == 5).all()
if using_copy_on_write:
assert not (b[:2] == 5).any()
else:
assert (b[:2] == 5).all()


def test_align_same_index(datetime_series):
Expand Down