Skip to content

ENH: Add lazy copy to concat and round #50501

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
Jan 17, 2023
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,7 @@ def __init__(
columns,
dtype=dtype,
typ=manager,
copy=copy,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is necessary to avoid a copy when transforming from a Series to a DataFrame in the mixed case. Should we hide this behind the CoW flag?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose this will change behaviour (for the default non-CoW mode) if you pass copy=False ? (and so actually for the default copy=None, as that is translated to copy=False in such a case). Currently that still does make a copy always:

In [9]: s = pd.Series([1, 2, 3], name="a")

In [10]: df = pd.DataFrame(s, copy=False)

In [11]: s.iloc[0] = 100

In [12]: df
Out[12]: 
   a
0  1
1  2
2  3

One could consider that a bug (but then we also need to add tests for that case here), but given that we probably want to change that in the future to actually behave as a copy (i.e. the current behaviour), I would maybe not fix the "bug".
So, yes, then I would hide this behind a using_copy_on_write() check.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the option check

)
else:
mgr = ndarray_to_mgr(
Expand Down
20 changes: 18 additions & 2 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
Sequence,
cast,
)
import weakref

import numpy as np

Expand Down Expand Up @@ -61,7 +62,10 @@
ensure_block_shape,
new_block_2d,
)
from pandas.core.internals.managers import BlockManager
from pandas.core.internals.managers import (
BlockManager,
using_copy_on_write,
)

if TYPE_CHECKING:
from pandas import Index
Expand Down Expand Up @@ -267,6 +271,8 @@ def _concat_managers_axis0(

offset = 0
blocks = []
refs: list[weakref.ref | None] = []
parents: list = []
for i, mgr in enumerate(mgrs):
# If we already reindexed, then we definitely don't need another copy
made_copy = had_reindexers[i]
Expand All @@ -283,8 +289,18 @@ def _concat_managers_axis0(
nb._mgr_locs = nb._mgr_locs.add(offset)
blocks.append(nb)

if not made_copy and not copy and using_copy_on_write():
refs.extend([weakref.ref(blk) for blk in mgr.blocks])
parents.append(mgr)
elif using_copy_on_write():
refs.extend([None] * len(mgr.blocks))

offset += len(mgr.items)
return BlockManager(tuple(blocks), axes)

result_parents = parents if parents else None
result_ref = refs if refs else None
result = BlockManager(tuple(blocks), axes, parent=result_parents, refs=result_ref)
return result


def _maybe_reindex_columns_na_proxy(
Expand Down
43 changes: 36 additions & 7 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,15 @@
cast,
overload,
)
import weakref

import numpy as np

from pandas._config import (
get_option,
using_copy_on_write,
)

from pandas._typing import (
Axis,
AxisInt,
Expand Down Expand Up @@ -47,6 +53,7 @@
get_unanimous_names,
)
from pandas.core.internals import concatenate_managers
from pandas.core.internals.construction import dict_to_mgr

if TYPE_CHECKING:
from pandas import (
Expand Down Expand Up @@ -155,7 +162,7 @@ def concat(
names=None,
verify_integrity: bool = False,
sort: bool = False,
copy: bool = True,
copy: bool | None = None,
) -> DataFrame | Series:
"""
Concatenate pandas objects along a particular axis.
Expand Down Expand Up @@ -363,6 +370,12 @@ def concat(
0 1 2
1 3 4
"""
if copy is None:
if using_copy_on_write():
copy = False
else:
copy = True

op = _Concatenator(
objs,
axis=axis,
Expand Down Expand Up @@ -523,18 +536,19 @@ def __init__(
)

else:
name = getattr(obj, "name", None)
name = new_name = getattr(obj, "name", None)
if ignore_index or name is None:
name = current_column
new_name = current_column
current_column += 1

# doing a row-wise concatenation so need everything
# to line up
if self._is_frame and axis == 1:
name = 0
new_name = 0
# mypy needs to know sample is not an NDFrame
sample = cast("DataFrame | Series", sample)
obj = sample._constructor({name: obj})
obj = sample._constructor(obj, columns=[name], copy=False)
obj.columns = [new_name]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have a test that covers this mixed case for checking that the CoW works correctly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep the chained series case covers this, but will add an explicit one as well after hiding the change behind the cow flag


self.objs.append(obj)

Expand Down Expand Up @@ -584,7 +598,22 @@ def get_result(self):
cons = sample._constructor_expanddim

index, columns = self.new_axes
df = cons(data, index=index, copy=self.copy)
mgr = dict_to_mgr(
data,
index,
None,
copy=self.copy,
typ=get_option("mode.data_manager"),
)
if using_copy_on_write() and not self.copy:
parents = [obj._mgr for obj in self.objs]
mgr.parent = parents # type: ignore[union-attr]
refs = [
weakref.ref(obj._mgr.blocks[0]) # type: ignore[union-attr]
for obj in self.objs
]
mgr.refs = refs # type: ignore[union-attr]
df = cons(mgr, copy=False)
df.columns = columns
return df.__finalize__(self, method="concat")

Expand All @@ -611,7 +640,7 @@ def get_result(self):
new_data = concatenate_managers(
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
)
if not self.copy:
if not self.copy and not using_copy_on_write():
new_data._consolidate_inplace()

cons = sample._constructor
Expand Down
154 changes: 154 additions & 0 deletions pandas/tests/copy_view/test_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import numpy as np

from pandas import (
DataFrame,
Series,
concat,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


def test_concat_frames(using_copy_on_write):
    # Column-wise concat should lazily reuse the input buffers under CoW,
    # and a write into the result must detach only the touched column.
    df = DataFrame({"b": ["a"] * 3})
    df2 = DataFrame({"a": ["a"] * 3})
    df_orig = df.copy()
    result = concat([df, df2], axis=1)

    def shares(col, source):
        return np.shares_memory(get_array(result, col), get_array(source, col))

    if using_copy_on_write:
        assert shares("b", df)
        assert shares("a", df2)
    else:
        # without CoW concat always copies
        assert not shares("b", df)
        assert not shares("a", df2)

    result.iloc[0, 0] = "d"
    if using_copy_on_write:
        assert not shares("b", df)
        assert shares("a", df2)

    result.iloc[0, 1] = "d"
    if using_copy_on_write:
        assert not shares("a", df2)
    # the parent frame is never modified through the result
    tm.assert_frame_equal(df, df_orig)


def test_concat_frames_updating_input(using_copy_on_write):
    # Mutating an *input* frame after concat must not leak into the result.
    df = DataFrame({"b": ["a"] * 3})
    df2 = DataFrame({"a": ["a"] * 3})
    result = concat([df, df2], axis=1)

    def shares(col, source):
        return np.shares_memory(get_array(result, col), get_array(source, col))

    if using_copy_on_write:
        assert shares("b", df)
        assert shares("a", df2)
    else:
        # without CoW concat always copies
        assert not shares("b", df)
        assert not shares("a", df2)

    expected = result.copy()
    df.iloc[0, 0] = "d"
    if using_copy_on_write:
        # only the column backed by df gets detached
        assert not shares("b", df)
        assert shares("a", df2)

    df2.iloc[0, 0] = "d"
    if using_copy_on_write:
        assert not shares("a", df2)
    tm.assert_frame_equal(result, expected)


def test_concat_series(using_copy_on_write):
    # Axis=1 concat of two Series: under CoW the result columns are backed
    # by the original Series buffers until the result is written to.
    ser = Series([1, 2], name="a")
    ser2 = Series([3, 4], name="b")
    ser_orig = ser.copy()
    ser2_orig = ser2.copy()
    result = concat([ser, ser2], axis=1)

    def shares(col, source):
        return np.shares_memory(get_array(result, col), source.values)

    if using_copy_on_write:
        assert shares("a", ser)
        assert shares("b", ser2)
    else:
        # without CoW concat always copies
        assert not shares("a", ser)
        assert not shares("b", ser2)

    result.iloc[0, 0] = 100
    if using_copy_on_write:
        # only the written column is detached
        assert not shares("a", ser)
        assert shares("b", ser2)

    result.iloc[0, 1] = 1000
    if using_copy_on_write:
        assert not shares("b", ser2)
    # the input Series are never modified through the result
    tm.assert_series_equal(ser, ser_orig)
    tm.assert_series_equal(ser2, ser2_orig)


def test_concat_frames_chained(using_copy_on_write):
    # Nested concat: CoW references must survive through the intermediate
    # result so the final frame still shares memory with the originals.
    df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    df2 = DataFrame({"c": [4, 5, 6]})
    df3 = DataFrame({"d": [4, 5, 6]})
    result = concat([concat([df1, df2], axis=1), df3], axis=1)
    expected = result.copy()

    for col, source in (("a", df1), ("c", df2), ("d", df3)):
        shared = np.shares_memory(get_array(result, col), get_array(source, col))
        # sharing is expected exactly when CoW is enabled
        assert shared == using_copy_on_write

    df1.iloc[0, 0] = 100
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))

    # mutating an input never shows up in the result
    tm.assert_frame_equal(result, expected)


def test_concat_series_chained(using_copy_on_write):
    # Nested concat of Series: the intermediate frame must propagate its
    # CoW references into the final result.
    ser1 = Series([1, 2, 3], name="a")
    ser2 = Series([4, 5, 6], name="c")
    ser3 = Series([4, 5, 6], name="d")
    result = concat([concat([ser1, ser2], axis=1), ser3], axis=1)
    expected = result.copy()

    for col, source in (("a", ser1), ("c", ser2), ("d", ser3)):
        shared = np.shares_memory(get_array(result, col), get_array(source, col))
        # sharing is expected exactly when CoW is enabled
        assert shared == using_copy_on_write

    ser1.iloc[0] = 100
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a"))

    # mutating an input never shows up in the result
    tm.assert_frame_equal(result, expected)


def test_concat_series_updating_input(using_copy_on_write):
    # Mutating an *input* Series after concat must not leak into the result.
    ser = Series([1, 2], name="a")
    ser2 = Series([3, 4], name="b")
    expected = DataFrame({"a": [1, 2], "b": [3, 4]})
    result = concat([ser, ser2], axis=1)

    def shares(col, source):
        return np.shares_memory(get_array(result, col), get_array(source, col))

    if using_copy_on_write:
        assert shares("a", ser)
        assert shares("b", ser2)
    else:
        # without CoW concat always copies
        assert not shares("a", ser)
        assert not shares("b", ser2)

    ser.iloc[0] = 100
    if using_copy_on_write:
        # only the column backed by ser gets detached
        assert not shares("a", ser)
        assert shares("b", ser2)
    tm.assert_frame_equal(result, expected)

    ser2.iloc[0] = 1000
    if using_copy_on_write:
        assert not shares("b", ser2)
    tm.assert_frame_equal(result, expected)
17 changes: 17 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,23 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manag
assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))


def test_round(using_copy_on_write):
    # DataFrame.round should be lazy under CoW: columns can be reused as-is
    # until the result is written to.
    df = DataFrame({"a": [1, 2], "b": "c"})
    df2 = df.round()
    df_orig = df.copy()

    def shares(col):
        return np.shares_memory(get_array(df2, col), get_array(df, col))

    if using_copy_on_write:
        assert shares("b")
        assert shares("a")
    else:
        # without CoW round always copies the object column
        assert not shares("b")

    df2.iloc[0, 1] = "d"
    if using_copy_on_write:
        # the written column is detached from the parent
        assert not shares("b")
    # the parent frame is never modified through the result
    tm.assert_frame_equal(df, df_orig)


def test_reorder_levels(using_copy_on_write):
index = MultiIndex.from_tuples(
[(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"]
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/io/pytables/test_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import numpy as np
import pytest

from pandas._config import using_copy_on_write

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -997,6 +999,7 @@ def test_to_hdf_with_object_column_names(tmp_path, setup_path):
assert len(result)


@pytest.mark.skipif(using_copy_on_write(), reason="strides buggy with cow")
def test_hdfstore_strides(setup_path):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this caused by the concat changes?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the strides changed from 8 to 16; this is really weird.

Copy link
Member

@jorisvandenbossche jorisvandenbossche Jan 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking into this a little bit, I think the issue is that in

pandas/pandas/io/pytables.py

Lines 3207 to 3212 in a0071f9

if len(dfs) > 0:
out = concat(dfs, axis=1)
out = out.reindex(columns=items, copy=False)
return out
return DataFrame(columns=axes[0], index=axes[1])

where we are creating the DataFrame, with the changes in this PR that concat won't copy if CoW is enabled. But it seems that the data from the HDF store always come back as F contiguous, while in pandas we want it as C contiguous for optimal performance.
Maybe we should just add a copy=True in the concat call in the mentioned snippet. That's already the default for non-CoW so won't change anything, and for CoW enabled also ensures we get the desired memory layout (at the expense of an extra copy while reading in, but to fix that, that can be a follow-up optimization investigating why the HDF store always returns F contiguous arrays)

Also commented about that on the related issue (that triggered adding this test): #22073

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah this seems to fix it

# GH22073
df = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_append_concat(self):
assert isinstance(result.index, PeriodIndex)
assert result.index[0] == s1.index[0]

def test_concat_copy(self, using_array_manager):
def test_concat_copy(self, using_array_manager, using_copy_on_write):
df = DataFrame(np.random.randn(4, 3))
df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1))
df3 = DataFrame({5: "foo"}, index=range(4))
Expand Down Expand Up @@ -82,7 +82,7 @@ def test_concat_copy(self, using_array_manager):
result = concat([df, df2, df3, df4], axis=1, copy=False)
for arr in result._mgr.arrays:
if arr.dtype.kind == "f":
if using_array_manager:
if using_array_manager or using_copy_on_write:
# this is a view on some array in either df or df4
assert any(
np.shares_memory(arr, other)
Expand Down