-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add lazy copy to concat and round #50501
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 25 commits
94c982e
7fbba4a
0db1c39
fc564f8
a7165c2
20a62bb
bca2123
3c4abd2
7734979
8bc3272
79cedd4
6c8a704
de678ab
dddc9f0
d18416d
5ccf23e
7648b23
99da573
a6afa8b
3d0772c
8b49210
48a4f7e
690f072
8ba35fe
2047d5c
e959c53
891e07c
04adc17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -731,6 +731,7 @@ def __init__( | |
columns, | ||
dtype=dtype, | ||
typ=manager, | ||
copy=copy, | ||
) | ||
else: | ||
mgr = ndarray_to_mgr( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,9 +14,15 @@ | |
cast, | ||
overload, | ||
) | ||
import weakref | ||
|
||
import numpy as np | ||
|
||
from pandas._config import ( | ||
get_option, | ||
using_copy_on_write, | ||
) | ||
|
||
from pandas._typing import ( | ||
Axis, | ||
AxisInt, | ||
|
@@ -47,6 +53,7 @@ | |
get_unanimous_names, | ||
) | ||
from pandas.core.internals import concatenate_managers | ||
from pandas.core.internals.construction import dict_to_mgr | ||
|
||
if TYPE_CHECKING: | ||
from pandas import ( | ||
|
@@ -155,7 +162,7 @@ def concat( | |
names=None, | ||
verify_integrity: bool = False, | ||
sort: bool = False, | ||
copy: bool = True, | ||
copy: bool | None = None, | ||
) -> DataFrame | Series: | ||
""" | ||
Concatenate pandas objects along a particular axis. | ||
|
@@ -363,6 +370,12 @@ def concat( | |
0 1 2 | ||
1 3 4 | ||
""" | ||
if copy is None: | ||
if using_copy_on_write(): | ||
copy = False | ||
else: | ||
copy = True | ||
|
||
op = _Concatenator( | ||
objs, | ||
axis=axis, | ||
|
@@ -523,18 +536,19 @@ def __init__( | |
) | ||
|
||
else: | ||
name = getattr(obj, "name", None) | ||
name = new_name = getattr(obj, "name", None) | ||
if ignore_index or name is None: | ||
name = current_column | ||
new_name = current_column | ||
current_column += 1 | ||
|
||
# doing a row-wise concatenation so need everything | ||
# to line up | ||
if self._is_frame and axis == 1: | ||
name = 0 | ||
new_name = 0 | ||
# mypy needs to know sample is not an NDFrame | ||
sample = cast("DataFrame | Series", sample) | ||
obj = sample._constructor({name: obj}) | ||
obj = sample._constructor(obj, columns=[name], copy=False) | ||
obj.columns = [new_name] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we have a test that covers this mixed case for checking that the CoW works correctly? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep the chained series case covers this, but will add an explicit one as well after hiding the change behind the cow flag |
||
|
||
self.objs.append(obj) | ||
|
||
|
@@ -584,7 +598,22 @@ def get_result(self): | |
cons = sample._constructor_expanddim | ||
|
||
index, columns = self.new_axes | ||
df = cons(data, index=index, copy=self.copy) | ||
mgr = dict_to_mgr( | ||
data, | ||
index, | ||
None, | ||
copy=self.copy, | ||
typ=get_option("mode.data_manager"), | ||
) | ||
if using_copy_on_write() and not self.copy: | ||
parents = [obj._mgr for obj in self.objs] | ||
mgr.parent = parents # type: ignore[union-attr] | ||
refs = [ | ||
weakref.ref(obj._mgr.blocks[0]) # type: ignore[union-attr] | ||
for obj in self.objs | ||
] | ||
mgr.refs = refs # type: ignore[union-attr] | ||
df = cons(mgr, copy=False) | ||
df.columns = columns | ||
return df.__finalize__(self, method="concat") | ||
|
||
|
@@ -611,7 +640,7 @@ def get_result(self): | |
new_data = concatenate_managers( | ||
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy | ||
) | ||
if not self.copy: | ||
if not self.copy and not using_copy_on_write(): | ||
new_data._consolidate_inplace() | ||
|
||
cons = sample._constructor | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
import numpy as np | ||
|
||
from pandas import ( | ||
DataFrame, | ||
Series, | ||
concat, | ||
) | ||
import pandas._testing as tm | ||
from pandas.tests.copy_view.util import get_array | ||
|
||
|
||
def test_concat_frames(using_copy_on_write): | ||
df = DataFrame({"b": ["a"] * 3}) | ||
df2 = DataFrame({"a": ["a"] * 3}) | ||
df_orig = df.copy() | ||
result = concat([df, df2], axis=1) | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) | ||
assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
else: | ||
assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) | ||
assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
|
||
result.iloc[0, 0] = "d" | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) | ||
assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
|
||
result.iloc[0, 1] = "d" | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
tm.assert_frame_equal(df, df_orig) | ||
|
||
|
||
def test_concat_frames_updating_input(using_copy_on_write): | ||
df = DataFrame({"b": ["a"] * 3}) | ||
df2 = DataFrame({"a": ["a"] * 3}) | ||
result = concat([df, df2], axis=1) | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) | ||
assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
else: | ||
assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) | ||
assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
|
||
expected = result.copy() | ||
df.iloc[0, 0] = "d" | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) | ||
assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
|
||
df2.iloc[0, 0] = "d" | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_concat_series(using_copy_on_write): | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ser = Series([1, 2], name="a") | ||
ser2 = Series([3, 4], name="b") | ||
ser_orig = ser.copy() | ||
ser2_orig = ser2.copy() | ||
result = concat([ser, ser2], axis=1) | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(get_array(result, "a"), ser.values) | ||
assert np.shares_memory(get_array(result, "b"), ser2.values) | ||
else: | ||
assert not np.shares_memory(get_array(result, "a"), ser.values) | ||
assert not np.shares_memory(get_array(result, "b"), ser2.values) | ||
|
||
result.iloc[0, 0] = 100 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "a"), ser.values) | ||
assert np.shares_memory(get_array(result, "b"), ser2.values) | ||
|
||
result.iloc[0, 1] = 1000 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "b"), ser2.values) | ||
tm.assert_series_equal(ser, ser_orig) | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
tm.assert_series_equal(ser2, ser2_orig) | ||
|
||
|
||
def test_concat_frames_chained(using_copy_on_write): | ||
df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) | ||
df2 = DataFrame({"c": [4, 5, 6]}) | ||
df3 = DataFrame({"d": [4, 5, 6]}) | ||
result = concat([concat([df1, df2], axis=1), df3], axis=1) | ||
expected = result.copy() | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) | ||
assert np.shares_memory(get_array(result, "c"), get_array(df2, "c")) | ||
assert np.shares_memory(get_array(result, "d"), get_array(df3, "d")) | ||
else: | ||
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) | ||
assert not np.shares_memory(get_array(result, "c"), get_array(df2, "c")) | ||
assert not np.shares_memory(get_array(result, "d"), get_array(df3, "d")) | ||
|
||
df1.iloc[0, 0] = 100 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_concat_series_chained(using_copy_on_write): | ||
ser1 = Series([1, 2, 3], name="a") | ||
ser2 = Series([4, 5, 6], name="c") | ||
ser3 = Series([4, 5, 6], name="d") | ||
result = concat([concat([ser1, ser2], axis=1), ser3], axis=1) | ||
expected = result.copy() | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) | ||
assert np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) | ||
assert np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) | ||
else: | ||
assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) | ||
assert not np.shares_memory(get_array(result, "c"), get_array(ser2, "c")) | ||
assert not np.shares_memory(get_array(result, "d"), get_array(ser3, "d")) | ||
|
||
ser1.iloc[0] = 100 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a")) | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_concat_series_updating_input(using_copy_on_write): | ||
ser = Series([1, 2], name="a") | ||
ser2 = Series([3, 4], name="b") | ||
expected = DataFrame({"a": [1, 2], "b": [3, 4]}) | ||
result = concat([ser, ser2], axis=1) | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(get_array(result, "a"), get_array(ser, "a")) | ||
assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) | ||
else: | ||
assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) | ||
assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) | ||
|
||
ser.iloc[0] = 100 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a")) | ||
assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
ser2.iloc[0] = 1000 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b")) | ||
tm.assert_frame_equal(result, expected) |
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -11,6 +11,8 @@ | |||||||||||||
import numpy as np | ||||||||||||||
import pytest | ||||||||||||||
|
||||||||||||||
from pandas._config import using_copy_on_write | ||||||||||||||
|
||||||||||||||
import pandas as pd | ||||||||||||||
from pandas import ( | ||||||||||||||
DataFrame, | ||||||||||||||
|
@@ -997,6 +999,7 @@ def test_to_hdf_with_object_column_names(tmp_path, setup_path): | |||||||||||||
assert len(result) | ||||||||||||||
|
||||||||||||||
|
||||||||||||||
@pytest.mark.skipif(using_copy_on_write(), reason="strides buggy with cow") | ||||||||||||||
def test_hdfstore_strides(setup_path): | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this caused by the concat changes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah strides changed from 8 to 16, this is really weird There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking into this a little bit, I think the issue is that in Lines 3207 to 3212 in a0071f9
where we are creating the DataFrame, with the changes in this PR that Also commented about that on the related issue (that triggered adding this test): #22073 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah this seems to fix it |
||||||||||||||
# GH22073 | ||||||||||||||
df = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) | ||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is necessary to avoid a copy when transforming from series to df in mixed case. Should we hide this behind the CoW flag?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suppose this will change behaviour (for the default non-CoW mode) if you pass
copy=False
? (and so actually for the default copy=None, as that is translated to copy=False in such a case). Currently that still does make a copy always:One could consider that a bug (but then we also need to add testes for that case here), but given that we probably want to change that in the future to actually behave as a copy (i.e. the current behaviour), I would maybe not fix the "bug".
So, yes, then I would hide this behind a
using_copy_on_write()
check.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added the option check