Skip to content

ENH: Add lazy copy to align #50432

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4879,7 +4879,7 @@ def align(
join: AlignJoin = "outer",
axis: Axis | None = None,
level: Level = None,
copy: bool = True,
copy: bool | None = None,
fill_value=None,
method: FillnaOptions | None = None,
limit: int | None = None,
Expand Down
16 changes: 8 additions & 8 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5254,7 +5254,7 @@ def _reindex_with_indexers(
self: NDFrameT,
reindexers,
fill_value=None,
copy: bool_t = False,
copy: bool_t | None = False,
allow_dups: bool_t = False,
) -> NDFrameT:
"""allow_dups indicates an internal call here"""
Expand Down Expand Up @@ -5283,8 +5283,8 @@ def _reindex_with_indexers(
# If we've made a copy once, no need to make another one
copy = False

if copy and new_data is self._mgr:
new_data = new_data.copy()
if (copy or copy is None) and new_data is self._mgr:
new_data = new_data.copy(deep=copy)

return self._constructor(new_data).__finalize__(self)

Expand Down Expand Up @@ -9058,7 +9058,7 @@ def align(
join: AlignJoin = "outer",
axis: Axis | None = None,
level: Level = None,
copy: bool_t = True,
copy: bool_t | None = None,
fill_value: Hashable = None,
method: FillnaOptions | None = None,
limit: int | None = None,
Expand Down Expand Up @@ -9251,7 +9251,7 @@ def _align_frame(
join: AlignJoin = "outer",
axis: Axis | None = None,
level=None,
copy: bool_t = True,
copy: bool_t | None = None,
fill_value=None,
method=None,
limit=None,
Expand Down Expand Up @@ -9315,7 +9315,7 @@ def _align_series(
join: AlignJoin = "outer",
axis: Axis | None = None,
level=None,
copy: bool_t = True,
copy: bool_t | None = None,
fill_value=None,
method=None,
limit=None,
Expand Down Expand Up @@ -9344,7 +9344,7 @@ def _align_series(
if is_series:
left = self._reindex_indexer(join_index, lidx, copy)
elif lidx is None or join_index is None:
left = self.copy() if copy else self
left = self.copy(deep=copy) if copy or copy is None else self
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean there are still cases where self is being returned? (if you manually specify copy=False)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In [1]: s1 = pd.Series([1, 2, 3])

In [2]: s2 = pd.Series([4, 5, 6])

In [3]: s11, s22 = s1.align(s2, copy=False)

In [4]: s11 is s1
Out[4]: True

In [5]: s11.iloc[0] = 10

In [6]: s1
Out[6]: 
0    10
1     2
2     3
dtype: int64

Of course, this is also because copy is already an existing keyword, so we don't just do self = self.copy(deep=None) under the hood as in other methods that in the past always returned a copy.
But for other methods that already have a copy keyword, I think for now we used copy=False to also mean shallow/lazy copy like copy=None with CoW enabled.
(it's also something we still need to discuss in general what to do with those copy keywords)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But for other methods that already have a copy keyword, I think for now we used copy=False to also mean shallow/lazy copy like copy=None with CoW enabled.

Actually, that's not fully correct. It seems this depends on case by case depending on how it is implemented, and we didn't consistently cover this case. Will open a dedicated issue for this topic: #50535

else:
left = self._constructor(
self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy)
Expand Down Expand Up @@ -9373,7 +9373,7 @@ def _align_series(
left = self._constructor(fdata)

if ridx is None:
right = other
right = other.copy(deep=copy) if copy or copy is None else other
else:
right = other.reindex(join_index, level=level)

Expand Down
11 changes: 7 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4591,15 +4591,18 @@ def _reduce(
return op(delegate, skipna=skipna, **kwds)

def _reindex_indexer(
self, new_index: Index | None, indexer: npt.NDArray[np.intp] | None, copy: bool
self,
new_index: Index | None,
indexer: npt.NDArray[np.intp] | None,
copy: bool | None,
) -> Series:
# Note: new_index is None iff indexer is None
# if not None, indexer is np.intp
if indexer is None and (
new_index is None or new_index.names == self.index.names
):
if copy:
return self.copy()
if copy or copy is None:
return self.copy(deep=copy)
return self
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here


new_values = algorithms.take_nd(
Expand All @@ -4626,7 +4629,7 @@ def align(
join: AlignJoin = "outer",
axis: Axis | None = None,
level: Level = None,
copy: bool = True,
copy: bool | None = None,
fill_value: Hashable = None,
method: FillnaOptions | None = None,
limit: int | None = None,
Expand Down
47 changes: 47 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,53 @@ def test_select_dtypes(using_copy_on_write):
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize(
"func",
[
lambda x, y: x.align(y),
lambda x, y: x.align(y.a, axis=0),
lambda x, y: x.align(y.a.iloc[slice(0, 1)], axis=1),
],
)
def test_align_frame(using_copy_on_write, func):
df = DataFrame({"a": [1, 2, 3], "b": "a"})
df_orig = df.copy()
df_changed = df[["b", "a"]].copy()
df2, _ = func(df, df_changed)

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

df2.iloc[0, 0] = 0
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
tm.assert_frame_equal(df, df_orig)


def test_align_series(using_copy_on_write):
ser = Series([1, 2])
ser_orig = ser.copy()
ser_other = ser.copy()
ser2, ser_other_result = ser.align(ser_other)

if using_copy_on_write:
assert np.shares_memory(ser2.values, ser.values)
assert np.shares_memory(ser_other_result.values, ser_other.values)
else:
assert not np.shares_memory(ser2.values, ser.values)
assert not np.shares_memory(ser_other_result.values, ser_other.values)

ser2.iloc[0] = 0
ser_other_result.iloc[0] = 0
if using_copy_on_write:
assert not np.shares_memory(ser2.values, ser.values)
assert not np.shares_memory(ser_other_result.values, ser_other.values)
tm.assert_series_equal(ser, ser_orig)
tm.assert_series_equal(ser_other, ser_orig)


def test_to_frame(using_copy_on_write):
# Case: converting a Series to a DataFrame with to_frame
ser = Series([1, 2, 3])
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/frame/methods/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,3 +402,12 @@ def _check_align_fill(self, frame, kind, meth, ax, fax):
self._check_align(
empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1
)

def test_align_series_check_copy(self):
# GH#
df = DataFrame({0: [1, 2]})
ser = Series([1], name=0)
expected = ser.copy()
result, other = df.align(ser, axis=1)
ser.iloc[0] = 100
tm.assert_series_equal(other, expected)