-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Add lazy copy to concat and round #50501
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
94c982e
7fbba4a
0db1c39
fc564f8
a7165c2
20a62bb
bca2123
3c4abd2
7734979
8bc3272
79cedd4
6c8a704
de678ab
dddc9f0
d18416d
5ccf23e
7648b23
99da573
a6afa8b
3d0772c
8b49210
48a4f7e
690f072
8ba35fe
2047d5c
e959c53
891e07c
04adc17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,8 @@ | |
|
||
import numpy as np | ||
|
||
from pandas._config import get_option | ||
|
||
from pandas._typing import ( | ||
Axis, | ||
AxisInt, | ||
|
@@ -47,6 +49,8 @@ | |
get_unanimous_names, | ||
) | ||
from pandas.core.internals import concatenate_managers | ||
from pandas.core.internals.construction import dict_to_mgr | ||
from pandas.core.internals.managers import _using_copy_on_write | ||
|
||
if TYPE_CHECKING: | ||
from pandas import ( | ||
|
@@ -155,7 +159,7 @@ def concat( | |
names=None, | ||
verify_integrity: bool = False, | ||
sort: bool = False, | ||
copy: bool = True, | ||
copy: bool | None = None, | ||
) -> DataFrame | Series: | ||
""" | ||
Concatenate pandas objects along a particular axis. | ||
|
@@ -363,6 +367,12 @@ def concat( | |
0 1 2 | ||
1 3 4 | ||
""" | ||
if copy is None: | ||
if _using_copy_on_write(): | ||
copy = False | ||
else: | ||
copy = True | ||
|
||
op = _Concatenator( | ||
objs, | ||
axis=axis, | ||
|
@@ -584,7 +594,16 @@ def get_result(self): | |
cons = sample._constructor_expanddim | ||
|
||
index, columns = self.new_axes | ||
df = cons(data, index=index, copy=self.copy) | ||
mgr = dict_to_mgr( | ||
data, | ||
index, | ||
None, | ||
copy=self.copy, | ||
typ=get_option("mode.data_manager"), | ||
) | ||
if _using_copy_on_write() and not self.copy: | ||
mgr = mgr.copy(deep=False) | ||
df = cons(mgr, copy=self.copy) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This might be doing a second copy now? (in case of I am also wondering if we could do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, set copy to false here |
||
df.columns = columns | ||
return df.__finalize__(self, method="concat") | ||
|
||
|
@@ -611,8 +630,10 @@ def get_result(self): | |
new_data = concatenate_managers( | ||
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy | ||
) | ||
if not self.copy: | ||
if not self.copy and not _using_copy_on_write(): | ||
new_data._consolidate_inplace() | ||
elif _using_copy_on_write() and not self.copy: | ||
new_data = new_data.copy(deep=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would expect that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might also not be sufficient this way for keeping track of the reference from parent to child. Because There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Checking out this branch, I can confirm that is indeed an issue with the current implementation:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pushed the logic down and added your case as test. Works now |
||
|
||
cons = sample._constructor | ||
return cons(new_data).__finalize__(self, method="concat") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import numpy as np | ||
|
||
from pandas import ( | ||
DataFrame, | ||
Series, | ||
concat, | ||
) | ||
import pandas._testing as tm | ||
from pandas.tests.copy_view.util import get_array | ||
|
||
|
||
def test_concat_frames(using_copy_on_write): | ||
df = DataFrame({"b": ["a"] * 3}) | ||
df2 = DataFrame({"a": ["a"] * 3}) | ||
df_orig = df.copy() | ||
result = concat([df, df2], axis=1) | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(get_array(result, "b"), get_array(df, "b")) | ||
assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
else: | ||
assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) | ||
assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
|
||
result.iloc[0, 0] = "d" | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "b"), get_array(df, "b")) | ||
assert np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
|
||
result.iloc[0, 1] = "d" | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a")) | ||
tm.assert_frame_equal(df, df_orig) | ||
|
||
|
||
def test_concat_series(using_copy_on_write): | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ser = Series([1, 2], name="a") | ||
ser2 = Series([3, 4], name="b") | ||
ser_orig = ser.copy() | ||
result = concat([ser, ser2], axis=1) | ||
|
||
if using_copy_on_write: | ||
assert np.shares_memory(get_array(result, "a"), ser.values) | ||
assert np.shares_memory(get_array(result, "b"), ser2.values) | ||
else: | ||
assert not np.shares_memory(get_array(result, "a"), ser.values) | ||
assert not np.shares_memory(get_array(result, "b"), ser2.values) | ||
|
||
result.iloc[0, 0] = 100 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "a"), ser.values) | ||
assert np.shares_memory(get_array(result, "b"), ser2.values) | ||
|
||
result.iloc[0, 1] = 1000 | ||
if using_copy_on_write: | ||
assert not np.shares_memory(get_array(result, "b"), ser2.values) | ||
tm.assert_series_equal(ser, ser_orig) | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -23,6 +23,7 @@ | |||||||||||||
timedelta_range, | ||||||||||||||
) | ||||||||||||||
import pandas._testing as tm | ||||||||||||||
from pandas.core.internals.managers import _using_copy_on_write | ||||||||||||||
from pandas.tests.io.pytables.common import ( | ||||||||||||||
_maybe_remove, | ||||||||||||||
ensure_clean_store, | ||||||||||||||
|
@@ -1009,6 +1010,7 @@ def test_to_hdf_with_object_column_names(tmp_path, setup_path): | |||||||||||||
assert len(result) | ||||||||||||||
|
||||||||||||||
|
||||||||||||||
@pytest.mark.skipif(_using_copy_on_write(), reason="strides buggy with cow") | ||||||||||||||
def test_hdfstore_strides(setup_path): | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this caused by the concat changes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah strides changed from 8 to 16, this is really weird There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking into this a little bit, I think the issue is that in Lines 3207 to 3212 in a0071f9
where we are creating the DataFrame, with the changes in this PR that Also commented about that on the related issue (that triggered adding this test): #22073 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah this seems to fix it |
||||||||||||||
# GH22073 | ||||||||||||||
df = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) | ||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah this is annoying, I thought that ci should fail, but makes sense that the import continues to work through there...