Skip to content

Commit c426dc0

Browse files
ENH: Add lazy copy to concat and round (#50501)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent b0c0d8a commit c426dc0

File tree

7 files changed

+261
-12
lines changed

7 files changed

+261
-12
lines changed

pandas/core/frame.py

+2
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,7 @@ def __init__(
723723
)
724724
elif getattr(data, "name", None) is not None:
725725
# i.e. Series/Index with non-None name
726+
_copy = copy if using_copy_on_write() else True
726727
mgr = dict_to_mgr(
727728
# error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
728729
# attribute "name"
@@ -731,6 +732,7 @@ def __init__(
731732
columns,
732733
dtype=dtype,
733734
typ=manager,
735+
copy=_copy,
734736
)
735737
else:
736738
mgr = ndarray_to_mgr(

pandas/core/internals/concat.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
Sequence,
88
cast,
99
)
10+
import weakref
1011

1112
import numpy as np
1213

@@ -61,7 +62,10 @@
6162
ensure_block_shape,
6263
new_block_2d,
6364
)
64-
from pandas.core.internals.managers import BlockManager
65+
from pandas.core.internals.managers import (
66+
BlockManager,
67+
using_copy_on_write,
68+
)
6569

6670
if TYPE_CHECKING:
6771
from pandas import Index
@@ -267,6 +271,8 @@ def _concat_managers_axis0(
267271

268272
offset = 0
269273
blocks = []
274+
refs: list[weakref.ref | None] = []
275+
parents: list = []
270276
for i, mgr in enumerate(mgrs):
271277
# If we already reindexed, then we definitely don't need another copy
272278
made_copy = had_reindexers[i]
@@ -283,8 +289,18 @@ def _concat_managers_axis0(
283289
nb._mgr_locs = nb._mgr_locs.add(offset)
284290
blocks.append(nb)
285291

292+
if not made_copy and not copy and using_copy_on_write():
293+
refs.extend([weakref.ref(blk) for blk in mgr.blocks])
294+
parents.append(mgr)
295+
elif using_copy_on_write():
296+
refs.extend([None] * len(mgr.blocks))
297+
286298
offset += len(mgr.items)
287-
return BlockManager(tuple(blocks), axes)
299+
300+
result_parents = parents if parents else None
301+
result_ref = refs if refs else None
302+
result = BlockManager(tuple(blocks), axes, parent=result_parents, refs=result_ref)
303+
return result
288304

289305

290306
def _maybe_reindex_columns_na_proxy(

pandas/core/reshape/concat.py

+42-7
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,15 @@
1414
cast,
1515
overload,
1616
)
17+
import weakref
1718

1819
import numpy as np
1920

21+
from pandas._config import (
22+
get_option,
23+
using_copy_on_write,
24+
)
25+
2026
from pandas._typing import (
2127
Axis,
2228
AxisInt,
@@ -47,6 +53,7 @@
4753
get_unanimous_names,
4854
)
4955
from pandas.core.internals import concatenate_managers
56+
from pandas.core.internals.construction import dict_to_mgr
5057

5158
if TYPE_CHECKING:
5259
from pandas import (
@@ -155,7 +162,7 @@ def concat(
155162
names=None,
156163
verify_integrity: bool = False,
157164
sort: bool = False,
158-
copy: bool = True,
165+
copy: bool | None = None,
159166
) -> DataFrame | Series:
160167
"""
161168
Concatenate pandas objects along a particular axis.
@@ -363,6 +370,12 @@ def concat(
363370
0 1 2
364371
1 3 4
365372
"""
373+
if copy is None:
374+
if using_copy_on_write():
375+
copy = False
376+
else:
377+
copy = True
378+
366379
op = _Concatenator(
367380
objs,
368381
axis=axis,
@@ -523,18 +536,25 @@ def __init__(
523536
)
524537

525538
else:
526-
name = getattr(obj, "name", None)
539+
original_obj = obj
540+
name = new_name = getattr(obj, "name", None)
527541
if ignore_index or name is None:
528-
name = current_column
542+
new_name = current_column
529543
current_column += 1
530544

531545
# doing a row-wise concatenation so need everything
532546
# to line up
533547
if self._is_frame and axis == 1:
534-
name = 0
548+
new_name = 0
535549
# mypy needs to know sample is not an NDFrame
536550
sample = cast("DataFrame | Series", sample)
537-
obj = sample._constructor({name: obj})
551+
obj = sample._constructor(obj, columns=[name], copy=False)
552+
if using_copy_on_write():
553+
# TODO(CoW): Remove when ref tracking in constructors works
554+
obj._mgr.parent = original_obj # type: ignore[union-attr]
555+
obj._mgr.refs = [weakref.ref(original_obj._mgr.blocks[0])] # type: ignore[union-attr] # noqa: E501
556+
557+
obj.columns = [new_name]
538558

539559
self.objs.append(obj)
540560

@@ -584,7 +604,22 @@ def get_result(self):
584604
cons = sample._constructor_expanddim
585605

586606
index, columns = self.new_axes
587-
df = cons(data, index=index, copy=self.copy)
607+
mgr = dict_to_mgr(
608+
data,
609+
index,
610+
None,
611+
copy=self.copy,
612+
typ=get_option("mode.data_manager"),
613+
)
614+
if using_copy_on_write() and not self.copy:
615+
parents = [obj._mgr for obj in self.objs]
616+
mgr.parent = parents # type: ignore[union-attr]
617+
refs = [
618+
weakref.ref(obj._mgr.blocks[0]) # type: ignore[union-attr]
619+
for obj in self.objs
620+
]
621+
mgr.refs = refs # type: ignore[union-attr]
622+
df = cons(mgr, copy=False)
588623
df.columns = columns
589624
return df.__finalize__(self, method="concat")
590625

@@ -611,7 +646,7 @@ def get_result(self):
611646
new_data = concatenate_managers(
612647
mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
613648
)
614-
if not self.copy:
649+
if not self.copy and not using_copy_on_write():
615650
new_data._consolidate_inplace()
616651

617652
cons = sample._constructor

pandas/io/pytables.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3207,7 +3207,7 @@ def read(
32073207
dfs.append(df)
32083208

32093209
if len(dfs) > 0:
3210-
out = concat(dfs, axis=1)
3210+
out = concat(dfs, axis=1, copy=True)
32113211
out = out.reindex(columns=items, copy=False)
32123212
return out
32133213

+179
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
import numpy as np
2+
3+
from pandas import (
4+
DataFrame,
5+
Series,
6+
concat,
7+
)
8+
import pandas._testing as tm
9+
from pandas.tests.copy_view.util import get_array
10+
11+
12+
def test_concat_frames(using_copy_on_write):
    """Concatenate two frames column-wise, then write into the result.

    Asserts that the result's columns share memory with the inputs only when
    ``using_copy_on_write`` is true, that each write to the result un-shares
    just the written column, and that ``df`` is left unmodified.
    """
    df = DataFrame({"b": ["a"] * 3})
    df2 = DataFrame({"a": ["a"] * 3})
    df_orig = df.copy()
    result = concat([df, df2], axis=1)

    def shares(source, col):
        # Looked up at call time so it reflects the current state of `result`.
        return np.shares_memory(get_array(result, col), get_array(source, col))

    # Right after concat: shared iff the copy-on-write flag is set.
    for source, col in ((df, "b"), (df2, "a")):
        shared = shares(source, col)
        assert shared if using_copy_on_write else not shared

    # Writing into the first column must only detach that column.
    result.iloc[0, 0] = "d"
    if using_copy_on_write:
        assert not shares(df, "b")
        assert shares(df2, "a")

    # Writing into the second column detaches it as well.
    result.iloc[0, 1] = "d"
    if using_copy_on_write:
        assert not shares(df2, "a")
    tm.assert_frame_equal(df, df_orig)
34+
35+
36+
def test_concat_frames_updating_input(using_copy_on_write):
    """Concatenate two frames column-wise, then write into the inputs.

    Asserts that the result's columns share memory with the inputs only when
    ``using_copy_on_write`` is true, that writing to an input un-shares the
    matching result column, and that the result keeps its original values.
    """
    df = DataFrame({"b": ["a"] * 3})
    df2 = DataFrame({"a": ["a"] * 3})
    result = concat([df, df2], axis=1)

    def shares(source, col):
        # Looked up at call time so it reflects the current state of the inputs.
        return np.shares_memory(get_array(result, col), get_array(source, col))

    # Right after concat: shared iff the copy-on-write flag is set.
    for source, col in ((df, "b"), (df2, "a")):
        shared = shares(source, col)
        assert shared if using_copy_on_write else not shared

    expected = result.copy()

    # Mutating the first input must only detach its own column.
    df.iloc[0, 0] = "d"
    if using_copy_on_write:
        assert not shares(df, "b")
        assert shares(df2, "a")

    # Mutating the second input detaches the remaining column.
    df2.iloc[0, 0] = "d"
    if using_copy_on_write:
        assert not shares(df2, "a")
    tm.assert_frame_equal(result, expected)
58+
59+
60+
def test_concat_series(using_copy_on_write):
    """Concatenate two named Series into a frame, then write into the result.

    Asserts that the result's columns share memory with the Series values only
    when ``using_copy_on_write`` is true, that each write to the result
    un-shares just the written column, and that both Series are unmodified.
    """
    ser = Series([1, 2], name="a")
    ser2 = Series([3, 4], name="b")
    ser_orig = ser.copy()
    ser2_orig = ser2.copy()
    result = concat([ser, ser2], axis=1)

    def shares(col, source):
        # Compares against `.values` of the Series, as fetched at call time.
        return np.shares_memory(get_array(result, col), source.values)

    # Right after concat: shared iff the copy-on-write flag is set.
    for col, source in (("a", ser), ("b", ser2)):
        shared = shares(col, source)
        assert shared if using_copy_on_write else not shared

    # Writing into the first column must only detach that column.
    result.iloc[0, 0] = 100
    if using_copy_on_write:
        assert not shares("a", ser)
        assert shares("b", ser2)

    # Writing into the second column detaches it as well.
    result.iloc[0, 1] = 1000
    if using_copy_on_write:
        assert not shares("b", ser2)
    tm.assert_series_equal(ser, ser_orig)
    tm.assert_series_equal(ser2, ser2_orig)
84+
85+
86+
def test_concat_frames_chained(using_copy_on_write):
87+
df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
88+
df2 = DataFrame({"c": [4, 5, 6]})
89+
df3 = DataFrame({"d": [4, 5, 6]})
90+
result = concat([concat([df1, df2], axis=1), df3], axis=1)
91+
expected = result.copy()
92+
93+
if using_copy_on_write:
94+
assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
95+
assert np.shares_memory(get_array(result, "c"), get_array(df2, "c"))
96+
assert np.shares_memory(get_array(result, "d"), get_array(df3, "d"))
97+
else:
98+
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
99+
assert not np.shares_memory(get_array(result, "c"), get_array(df2, "c"))
100+
assert not np.shares_memory(get_array(result, "d"), get_array(df3, "d"))
101+
102+
df1.iloc[0, 0] = 100
103+
if using_copy_on_write:
104+
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
105+
106+
tm.assert_frame_equal(result, expected)
107+
108+
109+
def test_concat_series_chained(using_copy_on_write):
110+
ser1 = Series([1, 2, 3], name="a")
111+
ser2 = Series([4, 5, 6], name="c")
112+
ser3 = Series([4, 5, 6], name="d")
113+
result = concat([concat([ser1, ser2], axis=1), ser3], axis=1)
114+
expected = result.copy()
115+
116+
if using_copy_on_write:
117+
assert np.shares_memory(get_array(result, "a"), get_array(ser1, "a"))
118+
assert np.shares_memory(get_array(result, "c"), get_array(ser2, "c"))
119+
assert np.shares_memory(get_array(result, "d"), get_array(ser3, "d"))
120+
else:
121+
assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a"))
122+
assert not np.shares_memory(get_array(result, "c"), get_array(ser2, "c"))
123+
assert not np.shares_memory(get_array(result, "d"), get_array(ser3, "d"))
124+
125+
ser1.iloc[0] = 100
126+
if using_copy_on_write:
127+
assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a"))
128+
129+
tm.assert_frame_equal(result, expected)
130+
131+
132+
def test_concat_series_updating_input(using_copy_on_write):
133+
ser = Series([1, 2], name="a")
134+
ser2 = Series([3, 4], name="b")
135+
expected = DataFrame({"a": [1, 2], "b": [3, 4]})
136+
result = concat([ser, ser2], axis=1)
137+
138+
if using_copy_on_write:
139+
assert np.shares_memory(get_array(result, "a"), get_array(ser, "a"))
140+
assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
141+
else:
142+
assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a"))
143+
assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
144+
145+
ser.iloc[0] = 100
146+
if using_copy_on_write:
147+
assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a"))
148+
assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
149+
tm.assert_frame_equal(result, expected)
150+
151+
ser2.iloc[0] = 1000
152+
if using_copy_on_write:
153+
assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
154+
tm.assert_frame_equal(result, expected)
155+
156+
157+
def test_concat_mixed_series_frame(using_copy_on_write):
158+
df = DataFrame({"a": [1, 2, 3], "c": 1})
159+
ser = Series([4, 5, 6], name="d")
160+
result = concat([df, ser], axis=1)
161+
expected = result.copy()
162+
163+
if using_copy_on_write:
164+
assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
165+
assert np.shares_memory(get_array(result, "c"), get_array(df, "c"))
166+
assert np.shares_memory(get_array(result, "d"), get_array(ser, "d"))
167+
else:
168+
assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
169+
assert not np.shares_memory(get_array(result, "c"), get_array(df, "c"))
170+
assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d"))
171+
172+
ser.iloc[0] = 100
173+
if using_copy_on_write:
174+
assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d"))
175+
176+
df.iloc[0, 0] = 100
177+
if using_copy_on_write:
178+
assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
179+
tm.assert_frame_equal(result, expected)

pandas/tests/copy_view/test_methods.py

+17
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,23 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manag
788788
assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))
789789

790790

791+
def test_round(using_copy_on_write):
    """Round a mixed-dtype frame, then write into the rounded result.

    Asserts that, when ``using_copy_on_write`` is true, both columns of the
    rounded frame share memory with the original until column "b" is written;
    otherwise column "b" is not shared. The original frame must be unmodified.
    """
    df = DataFrame({"a": [1, 2], "b": "c"})
    df2 = df.round()
    df_orig = df.copy()

    def shares(col):
        # Looked up at call time so it reflects the current state of `df2`.
        return np.shares_memory(get_array(df2, col), get_array(df, col))

    if not using_copy_on_write:
        assert not shares("b")
    else:
        assert shares("b")
        assert shares("a")

    # Writing into column "b" of the rounded frame must detach it.
    df2.iloc[0, 1] = "d"
    if using_copy_on_write:
        assert not shares("b")
    tm.assert_frame_equal(df, df_orig)
806+
807+
791808
def test_reorder_levels(using_copy_on_write):
792809
index = MultiIndex.from_tuples(
793810
[(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"]

0 commit comments

Comments
 (0)