Skip to content

Commit d458291

Browse files
API / CoW: DataFrame(<dict of Series>, copy=False) constructor now gives lazy copy (#50777)
1 parent c74d057 commit d458291

File tree

6 files changed

+130
-45
lines changed

6 files changed

+130
-45
lines changed

doc/source/whatsnew/v2.0.0.rst

+4
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,10 @@ Copy-on-Write improvements
243243
a modification to the data happens) when constructing a Series from an existing
244244
Series with the default of ``copy=False`` (:issue:`50471`)
245245

246+
- The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary
247+
of Series objects and specifying ``copy=False``, will now use a lazy copy
248+
of those Series objects for the columns of the DataFrame (:issue:`50777`)
249+
246250
- Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``)
247251
will now always raise an exception when Copy-on-Write is enabled. In this mode,
248252
chained assignment can never work because we are always setting into a temporary

pandas/core/internals/blocks.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -2210,15 +2210,17 @@ def get_block_type(dtype: DtypeObj):
22102210
return cls
22112211

22122212

2213-
def new_block_2d(values: ArrayLike, placement: BlockPlacement):
2213+
def new_block_2d(
2214+
values: ArrayLike, placement: BlockPlacement, refs: BlockValuesRefs | None = None
2215+
):
22142216
# new_block specialized to case with
22152217
# ndim=2
22162218
# isinstance(placement, BlockPlacement)
22172219
# check_ndim/ensure_block_shape already checked
22182220
klass = get_block_type(values.dtype)
22192221

22202222
values = maybe_coerce_values(values)
2221-
return klass(values, ndim=2, placement=placement)
2223+
return klass(values, ndim=2, placement=placement, refs=refs)
22222224

22232225

22242226
def new_block(

pandas/core/internals/construction.py

+16-6
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def arrays_to_mgr(
116116
index = ensure_index(index)
117117

118118
# don't force copy because getting jammed in an ndarray anyway
119-
arrays = _homogenize(arrays, index, dtype)
119+
arrays, refs = _homogenize(arrays, index, dtype)
120120
# _homogenize ensures
121121
# - all(len(x) == len(index) for x in arrays)
122122
# - all(x.ndim == 1 for x in arrays)
@@ -126,8 +126,10 @@ def arrays_to_mgr(
126126
else:
127127
index = ensure_index(index)
128128
arrays = [extract_array(x, extract_numpy=True) for x in arrays]
129+
# with _from_arrays, the passed arrays should never be Series objects
130+
refs = [None] * len(arrays)
129131

130-
# Reached via DataFrame._from_arrays; we do validation here
132+
# Reached via DataFrame._from_arrays; we do minimal validation here
131133
for arr in arrays:
132134
if (
133135
not isinstance(arr, (np.ndarray, ExtensionArray))
@@ -148,7 +150,7 @@ def arrays_to_mgr(
148150

149151
if typ == "block":
150152
return create_block_manager_from_column_arrays(
151-
arrays, axes, consolidate=consolidate
153+
arrays, axes, consolidate=consolidate, refs=refs
152154
)
153155
elif typ == "array":
154156
return ArrayManager(arrays, [index, columns])
@@ -547,9 +549,13 @@ def _ensure_2d(values: np.ndarray) -> np.ndarray:
547549
return values
548550

549551

550-
def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
552+
def _homogenize(
553+
data, index: Index, dtype: DtypeObj | None
554+
) -> tuple[list[ArrayLike], list[Any]]:
551555
oindex = None
552556
homogenized = []
557+
# if the original array-like in `data` is a Series, keep track of this Series' refs
558+
refs: list[Any] = []
553559

554560
for val in data:
555561
if isinstance(val, ABCSeries):
@@ -559,7 +565,10 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
559565
# Forces alignment. No need to copy data since we
560566
# are putting it into an ndarray later
561567
val = val.reindex(index, copy=False)
562-
568+
if isinstance(val._mgr, SingleBlockManager):
569+
refs.append(val._mgr._block.refs)
570+
else:
571+
refs.append(None)
563572
val = val._values
564573
else:
565574
if isinstance(val, dict):
@@ -578,10 +587,11 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
578587

579588
val = sanitize_array(val, index, dtype=dtype, copy=False)
580589
com.require_length_match(val, index)
590+
refs.append(None)
581591

582592
homogenized.append(val)
583593

584-
return homogenized
594+
return homogenized, refs
585595

586596

587597
def _extract_index(data) -> Index:

pandas/core/internals/managers.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -2081,7 +2081,8 @@ def create_block_manager_from_blocks(
20812081
def create_block_manager_from_column_arrays(
20822082
arrays: list[ArrayLike],
20832083
axes: list[Index],
2084-
consolidate: bool = True,
2084+
consolidate: bool,
2085+
refs: list,
20852086
) -> BlockManager:
20862087
# Assertions disabled for performance (caller is responsible for verifying)
20872088
# assert isinstance(axes, list)
@@ -2095,7 +2096,7 @@ def create_block_manager_from_column_arrays(
20952096
# verify_integrity=False below.
20962097

20972098
try:
2098-
blocks = _form_blocks(arrays, consolidate)
2099+
blocks = _form_blocks(arrays, consolidate, refs)
20992100
mgr = BlockManager(blocks, axes, verify_integrity=False)
21002101
except ValueError as e:
21012102
raise_construction_error(len(arrays), arrays[0].shape, axes, e)
@@ -2149,13 +2150,17 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]:
21492150
return sep, isinstance(dtype, np.dtype), dtype
21502151

21512152

2152-
def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]:
2153+
def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
21532154
tuples = list(enumerate(arrays))
21542155

21552156
if not consolidate:
2156-
nbs = _tuples_to_blocks_no_consolidate(tuples)
2157+
nbs = _tuples_to_blocks_no_consolidate(tuples, refs)
21572158
return nbs
21582159

2160+
# when consolidating, we can ignore refs (either stacking always copies,
2161+
# or the EA is already copied in the calling dict_to_mgr)
2162+
# TODO(CoW) check if this is also valid for rec_array_to_mgr
2163+
21592164
# group by dtype
21602165
grouper = itertools.groupby(tuples, _grouping_func)
21612166

@@ -2193,11 +2198,13 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]:
21932198
return nbs
21942199

21952200

2196-
def _tuples_to_blocks_no_consolidate(tuples) -> list[Block]:
2201+
def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]:
21972202
# tuples produced within _form_blocks are of the form (placement, array)
21982203
return [
2199-
new_block_2d(ensure_block_shape(x[1], ndim=2), placement=BlockPlacement(x[0]))
2200-
for x in tuples
2204+
new_block_2d(
2205+
ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref
2206+
)
2207+
for ((i, arr), ref) in zip(tuples, refs)
22012208
]
22022209

22032210

pandas/core/reshape/concat.py

+6-29
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,7 @@
1717

1818
import numpy as np
1919

20-
from pandas._config import (
21-
get_option,
22-
using_copy_on_write,
23-
)
20+
from pandas._config import using_copy_on_write
2421

2522
from pandas._typing import (
2623
Axis,
@@ -52,7 +49,6 @@
5249
get_unanimous_names,
5350
)
5451
from pandas.core.internals import concatenate_managers
55-
from pandas.core.internals.construction import dict_to_mgr
5652

5753
if TYPE_CHECKING:
5854
from pandas import (
@@ -535,26 +531,18 @@ def __init__(
535531
)
536532

537533
else:
538-
original_obj = obj
539-
name = new_name = getattr(obj, "name", None)
534+
name = getattr(obj, "name", None)
540535
if ignore_index or name is None:
541-
new_name = current_column
536+
name = current_column
542537
current_column += 1
543538

544539
# doing a row-wise concatenation so need everything
545540
# to line up
546541
if self._is_frame and axis == 1:
547-
new_name = 0
542+
name = 0
548543
# mypy needs to know sample is not an NDFrame
549544
sample = cast("DataFrame | Series", sample)
550-
obj = sample._constructor(obj, columns=[name], copy=False)
551-
if using_copy_on_write():
552-
# TODO(CoW): Remove when ref tracking in constructors works
553-
for i, block in enumerate(original_obj._mgr.blocks): # type: ignore[union-attr] # noqa
554-
obj._mgr.blocks[i].refs = block.refs # type: ignore[union-attr] # noqa
555-
obj._mgr.blocks[i].refs.add_reference(obj._mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa
556-
557-
obj.columns = [new_name]
545+
obj = sample._constructor({name: obj}, copy=False)
558546

559547
self.objs.append(obj)
560548

@@ -604,18 +592,7 @@ def get_result(self):
604592
cons = sample._constructor_expanddim
605593

606594
index, columns = self.new_axes
607-
mgr = dict_to_mgr(
608-
data,
609-
index,
610-
None,
611-
copy=self.copy,
612-
typ=get_option("mode.data_manager"),
613-
)
614-
if using_copy_on_write() and not self.copy:
615-
for i, obj in enumerate(self.objs):
616-
mgr.blocks[i].refs = obj._mgr.blocks[0].refs # type: ignore[union-attr] # noqa
617-
mgr.blocks[i].refs.add_reference(mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa
618-
df = cons(mgr, copy=False)
595+
df = cons(data, index=index, copy=self.copy)
619596
df.columns = columns
620597
return df.__finalize__(self, method="concat")
621598

pandas/tests/copy_view/test_constructors.py

+86-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
import numpy as np
22
import pytest
33

4-
from pandas import Series
4+
from pandas import (
5+
DataFrame,
6+
Series,
7+
)
8+
import pandas._testing as tm
9+
from pandas.tests.copy_view.util import get_array
510

611
# -----------------------------------------------------------------------------
712
# Copy/view behaviour for Series / DataFrame constructors
@@ -75,3 +80,83 @@ def test_series_from_series_with_reindex(using_copy_on_write):
7580
assert not np.shares_memory(ser.values, result.values)
7681
if using_copy_on_write:
7782
assert not result._mgr.blocks[0].refs.has_reference()
83+
84+
85+
@pytest.mark.parametrize("dtype", [None, "int64", "Int64"])
86+
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
87+
@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
88+
def test_dataframe_from_dict_of_series(
89+
request, using_copy_on_write, columns, index, dtype
90+
):
91+
# Case: constructing a DataFrame from Series objects with copy=False
92+
# has to do a lazy copy following CoW rules
93+
# (the default for DataFrame(dict) is still to copy to ensure consolidation)
94+
s1 = Series([1, 2, 3])
95+
s2 = Series([4, 5, 6])
96+
s1_orig = s1.copy()
97+
expected = DataFrame(
98+
{"a": [1, 2, 3], "b": [4, 5, 6]}, index=index, columns=columns, dtype=dtype
99+
)
100+
101+
result = DataFrame(
102+
{"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
103+
)
104+
105+
# the shallow copy still shares memory
106+
assert np.shares_memory(get_array(result, "a"), get_array(s1))
107+
108+
# mutating the new dataframe doesn't mutate original
109+
result.iloc[0, 0] = 10
110+
if using_copy_on_write:
111+
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
112+
tm.assert_series_equal(s1, s1_orig)
113+
else:
114+
assert s1.iloc[0] == 10
115+
116+
# the same when modifying the parent series
117+
s1 = Series([1, 2, 3])
118+
s2 = Series([4, 5, 6])
119+
result = DataFrame(
120+
{"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
121+
)
122+
s1.iloc[0] = 10
123+
if using_copy_on_write:
124+
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
125+
tm.assert_frame_equal(result, expected)
126+
else:
127+
assert result.iloc[0, 0] == 10
128+
129+
130+
@pytest.mark.parametrize("dtype", [None, "int64"])
131+
def test_dataframe_from_dict_of_series_with_reindex(dtype):
132+
# Case: constructing a DataFrame from Series objects with copy=False
133+
# and passing an index that requires an actual (no-view) reindex -> need
134+
# to ensure the result doesn't have refs set up to unnecessarily trigger
135+
# a copy on write
136+
s1 = Series([1, 2, 3])
137+
s2 = Series([4, 5, 6])
138+
df = DataFrame({"a": s1, "b": s2}, index=[1, 2, 3], dtype=dtype, copy=False)
139+
140+
# df should own its memory, so mutating shouldn't trigger a copy
141+
arr_before = get_array(df, "a")
142+
assert not np.shares_memory(arr_before, get_array(s1))
143+
df.iloc[0, 0] = 100
144+
arr_after = get_array(df, "a")
145+
assert np.shares_memory(arr_before, arr_after)
146+
147+
148+
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
149+
def test_dataframe_from_dict_of_series_with_dtype(index):
150+
# Variant of above, but now passing a dtype that causes a copy
151+
# -> need to ensure the result doesn't have refs set up to unnecessarily
152+
# trigger a copy on write
153+
s1 = Series([1.0, 2.0, 3.0])
154+
s2 = Series([4, 5, 6])
155+
df = DataFrame({"a": s1, "b": s2}, index=index, dtype="int64", copy=False)
156+
157+
# df should own its memory, so mutating shouldn't trigger a copy
158+
arr_before = get_array(df, "a")
159+
assert not np.shares_memory(arr_before, get_array(s1))
160+
df.iloc[0, 0] = 100
161+
arr_after = get_array(df, "a")
162+
assert np.shares_memory(arr_before, arr_after)

0 commit comments

Comments
 (0)