Skip to content

Commit d458291

Browse files
API / CoW: DataFrame(<dict of Series>, copy=False) constructor now gives lazy copy (#50777)
1 parent c74d057 commit d458291

File tree

6 files changed

+130
-45
lines changed

6 files changed

+130
-45
lines changed

doc/source/whatsnew/v2.0.0.rst

+4
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,10 @@ Copy-on-Write improvements
243243
a modification to the data happens) when constructing a Series from an existing
244244
Series with the default of ``copy=False`` (:issue:`50471`)
245245

246+
- The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary
247+
of Series objects and specifying ``copy=False``, will now use a lazy copy
248+
of those Series objects for the columns of the DataFrame (:issue:`50777`)
249+
246250
- Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``)
247251
will now always raise an exception when Copy-on-Write is enabled. In this mode,
248252
chained assignment can never work because we are always setting into a temporary

pandas/core/internals/blocks.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -2210,15 +2210,17 @@ def get_block_type(dtype: DtypeObj):
22102210
return cls
22112211

22122212

2213-
def new_block_2d(values: ArrayLike, placement: BlockPlacement):
2213+
def new_block_2d(
2214+
values: ArrayLike, placement: BlockPlacement, refs: BlockValuesRefs | None = None
2215+
):
22142216
# new_block specialized to case with
22152217
# ndim=2
22162218
# isinstance(placement, BlockPlacement)
22172219
# check_ndim/ensure_block_shape already checked
22182220
klass = get_block_type(values.dtype)
22192221

22202222
values = maybe_coerce_values(values)
2221-
return klass(values, ndim=2, placement=placement)
2223+
return klass(values, ndim=2, placement=placement, refs=refs)
22222224

22232225

22242226
def new_block(

pandas/core/internals/construction.py

+16-6
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def arrays_to_mgr(
116116
index = ensure_index(index)
117117

118118
# don't force copy because getting jammed in an ndarray anyway
119-
arrays = _homogenize(arrays, index, dtype)
119+
arrays, refs = _homogenize(arrays, index, dtype)
120120
# _homogenize ensures
121121
# - all(len(x) == len(index) for x in arrays)
122122
# - all(x.ndim == 1 for x in arrays)
@@ -126,8 +126,10 @@ def arrays_to_mgr(
126126
else:
127127
index = ensure_index(index)
128128
arrays = [extract_array(x, extract_numpy=True) for x in arrays]
129+
# with _from_arrays, the passed arrays should never be Series objects
130+
refs = [None] * len(arrays)
129131

130-
# Reached via DataFrame._from_arrays; we do validation here
132+
# Reached via DataFrame._from_arrays; we do minimal validation here
131133
for arr in arrays:
132134
if (
133135
not isinstance(arr, (np.ndarray, ExtensionArray))
@@ -148,7 +150,7 @@ def arrays_to_mgr(
148150

149151
if typ == "block":
150152
return create_block_manager_from_column_arrays(
151-
arrays, axes, consolidate=consolidate
153+
arrays, axes, consolidate=consolidate, refs=refs
152154
)
153155
elif typ == "array":
154156
return ArrayManager(arrays, [index, columns])
@@ -547,9 +549,13 @@ def _ensure_2d(values: np.ndarray) -> np.ndarray:
547549
return values
548550

549551

550-
def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
552+
def _homogenize(
553+
data, index: Index, dtype: DtypeObj | None
554+
) -> tuple[list[ArrayLike], list[Any]]:
551555
oindex = None
552556
homogenized = []
557+
# if the original array-like in `data` is a Series, keep track of this Series' refs
558+
refs: list[Any] = []
553559

554560
for val in data:
555561
if isinstance(val, ABCSeries):
@@ -559,7 +565,10 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
559565
# Forces alignment. No need to copy data since we
560566
# are putting it into an ndarray later
561567
val = val.reindex(index, copy=False)
562-
568+
if isinstance(val._mgr, SingleBlockManager):
569+
refs.append(val._mgr._block.refs)
570+
else:
571+
refs.append(None)
563572
val = val._values
564573
else:
565574
if isinstance(val, dict):
@@ -578,10 +587,11 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
578587

579588
val = sanitize_array(val, index, dtype=dtype, copy=False)
580589
com.require_length_match(val, index)
590+
refs.append(None)
581591

582592
homogenized.append(val)
583593

584-
return homogenized
594+
return homogenized, refs
585595

586596

587597
def _extract_index(data) -> Index:

pandas/core/internals/managers.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -2081,7 +2081,8 @@ def create_block_manager_from_blocks(
20812081
def create_block_manager_from_column_arrays(
20822082
arrays: list[ArrayLike],
20832083
axes: list[Index],
2084-
consolidate: bool = True,
2084+
consolidate: bool,
2085+
refs: list,
20852086
) -> BlockManager:
20862087
# Assertions disabled for performance (caller is responsible for verifying)
20872088
# assert isinstance(axes, list)
@@ -2095,7 +2096,7 @@ def create_block_manager_from_column_arrays(
20952096
# verify_integrity=False below.
20962097

20972098
try:
2098-
blocks = _form_blocks(arrays, consolidate)
2099+
blocks = _form_blocks(arrays, consolidate, refs)
20992100
mgr = BlockManager(blocks, axes, verify_integrity=False)
21002101
except ValueError as e:
21012102
raise_construction_error(len(arrays), arrays[0].shape, axes, e)
@@ -2149,13 +2150,17 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]:
21492150
return sep, isinstance(dtype, np.dtype), dtype
21502151

21512152

2152-
def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]:
2153+
def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
21532154
tuples = list(enumerate(arrays))
21542155

21552156
if not consolidate:
2156-
nbs = _tuples_to_blocks_no_consolidate(tuples)
2157+
nbs = _tuples_to_blocks_no_consolidate(tuples, refs)
21572158
return nbs
21582159

2160+
# when consolidating, we can ignore refs (either stacking always copies,
2161+
# or the EA is already copied in the calling dict_to_mgr)
2162+
# TODO(CoW) check if this is also valid for rec_array_to_mgr
2163+
21592164
# group by dtype
21602165
grouper = itertools.groupby(tuples, _grouping_func)
21612166

@@ -2193,11 +2198,13 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]:
21932198
return nbs
21942199

21952200

2196-
def _tuples_to_blocks_no_consolidate(tuples) -> list[Block]:
2201+
def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]:
21972202
# tuples produced within _form_blocks are of the form (placement, array)
21982203
return [
2199-
new_block_2d(ensure_block_shape(x[1], ndim=2), placement=BlockPlacement(x[0]))
2200-
for x in tuples
2204+
new_block_2d(
2205+
ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref
2206+
)
2207+
for ((i, arr), ref) in zip(tuples, refs)
22012208
]
22022209

22032210

pandas/core/reshape/concat.py

+6-29
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,7 @@
1717

1818
import numpy as np
1919

20-
from pandas._config import (
21-
get_option,
22-
using_copy_on_write,
23-
)
20+
from pandas._config import using_copy_on_write
2421

2522
from pandas._typing import (
2623
Axis,
@@ -52,7 +49,6 @@
5249
get_unanimous_names,
5350
)
5451
from pandas.core.internals import concatenate_managers
55-
from pandas.core.internals.construction import dict_to_mgr
5652

5753
if TYPE_CHECKING:
5854
from pandas import (
@@ -535,26 +531,18 @@ def __init__(
535531
)
536532

537533
else:
538-
original_obj = obj
539-
name = new_name = getattr(obj, "name", None)
534+
name = getattr(obj, "name", None)
540535
if ignore_index or name is None:
541-
new_name = current_column
536+
name = current_column
542537
current_column += 1
543538

544539
# doing a row-wise concatenation so need everything
545540
# to line up
546541
if self._is_frame and axis == 1:
547-
new_name = 0
542+
name = 0
548543
# mypy needs to know sample is not an NDFrame
549544
sample = cast("DataFrame | Series", sample)
550-
obj = sample._constructor(obj, columns=[name], copy=False)
551-
if using_copy_on_write():
552-
# TODO(CoW): Remove when ref tracking in constructors works
553-
for i, block in enumerate(original_obj._mgr.blocks): # type: ignore[union-attr] # noqa
554-
obj._mgr.blocks[i].refs = block.refs # type: ignore[union-attr] # noqa
555-
obj._mgr.blocks[i].refs.add_reference(obj._mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa
556-
557-
obj.columns = [new_name]
545+
obj = sample._constructor({name: obj}, copy=False)
558546

559547
self.objs.append(obj)
560548

@@ -604,18 +592,7 @@ def get_result(self):
604592
cons = sample._constructor_expanddim
605593

606594
index, columns = self.new_axes
607-
mgr = dict_to_mgr(
608-
data,
609-
index,
610-
None,
611-
copy=self.copy,
612-
typ=get_option("mode.data_manager"),
613-
)
614-
if using_copy_on_write() and not self.copy:
615-
for i, obj in enumerate(self.objs):
616-
mgr.blocks[i].refs = obj._mgr.blocks[0].refs # type: ignore[union-attr] # noqa
617-
mgr.blocks[i].refs.add_reference(mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa
618-
df = cons(mgr, copy=False)
595+
df = cons(data, index=index, copy=self.copy)
619596
df.columns = columns
620597
return df.__finalize__(self, method="concat")
621598

pandas/tests/copy_view/test_constructors.py

+86-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
import numpy as np
22
import pytest
33

4-
from pandas import Series
4+
from pandas import (
5+
DataFrame,
6+
Series,
7+
)
8+
import pandas._testing as tm
9+
from pandas.tests.copy_view.util import get_array
510

611
# -----------------------------------------------------------------------------
712
# Copy/view behaviour for Series / DataFrame constructors
@@ -75,3 +80,83 @@ def test_series_from_series_with_reindex(using_copy_on_write):
7580
assert not np.shares_memory(ser.values, result.values)
7681
if using_copy_on_write:
7782
assert not result._mgr.blocks[0].refs.has_reference()
83+
84+
85+
@pytest.mark.parametrize("dtype", [None, "int64", "Int64"])
86+
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
87+
@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
88+
def test_dataframe_from_dict_of_series(
89+
request, using_copy_on_write, columns, index, dtype
90+
):
91+
# Case: constructing a DataFrame from Series objects with copy=False
92+
# has to do a lazy copy following CoW rules
93+
# (the default for DataFrame(dict) is still to copy to ensure consolidation)
94+
s1 = Series([1, 2, 3])
95+
s2 = Series([4, 5, 6])
96+
s1_orig = s1.copy()
97+
expected = DataFrame(
98+
{"a": [1, 2, 3], "b": [4, 5, 6]}, index=index, columns=columns, dtype=dtype
99+
)
100+
101+
result = DataFrame(
102+
{"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
103+
)
104+
105+
# the shallow copy still shares memory
106+
assert np.shares_memory(get_array(result, "a"), get_array(s1))
107+
108+
# mutating the new dataframe doesn't mutate original
109+
result.iloc[0, 0] = 10
110+
if using_copy_on_write:
111+
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
112+
tm.assert_series_equal(s1, s1_orig)
113+
else:
114+
assert s1.iloc[0] == 10
115+
116+
# the same when modifying the parent series
117+
s1 = Series([1, 2, 3])
118+
s2 = Series([4, 5, 6])
119+
result = DataFrame(
120+
{"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
121+
)
122+
s1.iloc[0] = 10
123+
if using_copy_on_write:
124+
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
125+
tm.assert_frame_equal(result, expected)
126+
else:
127+
assert result.iloc[0, 0] == 10
128+
129+
130+
@pytest.mark.parametrize("dtype", [None, "int64"])
131+
def test_dataframe_from_dict_of_series_with_reindex(dtype):
132+
# Case: constructing a DataFrame from Series objects with copy=False
133+
# and passing an index that requires an actual (no-view) reindex -> need
134+
# to ensure the result doesn't have refs set up to unnecessarily trigger
135+
# a copy on write
136+
s1 = Series([1, 2, 3])
137+
s2 = Series([4, 5, 6])
138+
df = DataFrame({"a": s1, "b": s2}, index=[1, 2, 3], dtype=dtype, copy=False)
139+
140+
# df should own its memory, so mutating shouldn't trigger a copy
141+
arr_before = get_array(df, "a")
142+
assert not np.shares_memory(arr_before, get_array(s1))
143+
df.iloc[0, 0] = 100
144+
arr_after = get_array(df, "a")
145+
assert np.shares_memory(arr_before, arr_after)
146+
147+
148+
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
149+
def test_dataframe_from_dict_of_series_with_dtype(index):
150+
# Variant of above, but now passing a dtype that causes a copy
151+
# -> need to ensure the result doesn't have refs set up to unnecessarily
152+
# trigger a copy on write
153+
s1 = Series([1.0, 2.0, 3.0])
154+
s2 = Series([4, 5, 6])
155+
df = DataFrame({"a": s1, "b": s2}, index=index, dtype="int64", copy=False)
156+
157+
# df should own its memory, so mutating shouldn't trigger a copy
158+
arr_before = get_array(df, "a")
159+
assert not np.shares_memory(arr_before, get_array(s1))
160+
df.iloc[0, 0] = 100
161+
arr_after = get_array(df, "a")
162+
assert np.shares_memory(arr_before, arr_after)

0 commit comments

Comments
 (0)