pandas-dev · jorisvandenbossche · Feb 10, 2023 · Jan 16, 2023 · Jan 16, 2023 · Jan 16, 2023
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -10,10 +10,13 @@
     Hashable,
     Sequence,
 )
+import weakref
 
 import numpy as np
 from numpy import ma
 
+from pandas._config import using_copy_on_write
+
 from pandas._libs import lib
 from pandas._typing import (
     ArrayLike,
@@ -56,6 +59,7 @@
     ExtensionArray,
     FloatingArray,
     IntegerArray,
+    PandasArray,
 )
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import (
@@ -116,7 +120,7 @@ def arrays_to_mgr(
             index = ensure_index(index)
 
         # don't force copy because getting jammed in an ndarray anyway
-        arrays = _homogenize(arrays, index, dtype)
+        arrays, parents = _homogenize(arrays, index, dtype)
         # _homogenize ensures
         #  - all(len(x) == len(index) for x in arrays)
         #  - all(x.ndim == 1 for x in arrays)
@@ -125,7 +129,10 @@ def arrays_to_mgr(
 
     else:
         index = ensure_index(index)
-        arrays = [extract_array(x, extract_numpy=True) for x in arrays]
+        arrays = [
+            arr.to_numpy() if isinstance(arr, PandasArray) else arr for arr in arrays
+        ]
+        parents = None
 
         # Reached via DataFrame._from_arrays; we do validation here
         for arr in arrays:
@@ -148,7 +155,10 @@ def arrays_to_mgr(
 
     if typ == "block":
         return create_block_manager_from_column_arrays(
-            arrays, axes, consolidate=consolidate
+            arrays,
+            axes,
+            consolidate=consolidate,
+            parents=parents,
         )
     elif typ == "array":
         return ArrayManager(arrays, [index, columns])
@@ -547,20 +557,32 @@ def _ensure_2d(values: np.ndarray) -> np.ndarray:
     return values
 
 
-def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
+def _homogenize(
+    data, index: Index, dtype: DtypeObj | None
+) -> tuple[list[ArrayLike], list[Any] | None]:
     oindex = None
     homogenized = []
+    # if the original array-like in `data` is a Series, keep track of this Series
+    parents: list[Any] = []
 
     for val in data:
         if isinstance(val, ABCSeries):
+            hval = val
             if dtype is not None:
-                val = val.astype(dtype, copy=False)
-            if val.index is not index:
+                hval = hval.astype(dtype, copy=False)
+                if using_copy_on_write() and hval.values is val.values:
+                    # TODO(CoW) remove when astype() has implemented CoW
+                    refs = [weakref.ref(val._mgr._block)]  # type: ignore[union-attr]
+                    hval._mgr.refs = refs  # type: ignore[union-attr]
+                    hval._mgr.parent = val._mgr  # type: ignore[union-attr]
+            if hval.index is not index:
                 # Forces alignment. No need to copy data since we
                 # are putting it into an ndarray later
-                val = val.reindex(index, copy=False)
-
-            val = val._values
+                hval = hval.reindex(index, copy=False)
+            if hval is val:
+                hval = val.copy(deep=False)
+            homogenized.append(hval._values)
+            parents.append(hval)
         else:
             if isinstance(val, dict):
                 # GH#41785 this _should_ be equivalent to (but faster than)
@@ -578,10 +600,10 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
 
             val = sanitize_array(val, index, dtype=dtype, copy=False)
             com.require_length_match(val, index)
+            homogenized.append(val)
+            parents.append(None)
 
-        homogenized.append(val)
-
-    return homogenized
+    return homogenized, None if com.all_none(*parents) else parents
 
 
 def _extract_index(data) -> Index:

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -2132,6 +2132,7 @@ def create_block_manager_from_column_arrays(
     arrays: list[ArrayLike],
     axes: list[Index],
     consolidate: bool = True,
+    parents: list | None = None,
 ) -> BlockManager:
     # Assertions disabled for performance (caller is responsible for verifying)
     # assert isinstance(axes, list)
@@ -2146,7 +2147,30 @@ def create_block_manager_from_column_arrays(
 
     try:
         blocks = _form_blocks(arrays, consolidate)
-        mgr = BlockManager(blocks, axes, verify_integrity=False)
+        refs = None
+        parent = None
+        if parents is not None and using_copy_on_write():
+            # elements in `parents` are Series objects *if* the original input
+            # for the column was a Series, or otherwise None
+            # -> in case of a Series, keep track of its refs if it has those
+            # (this Series is already a view on the original one, so we can
+            # directly use its ref instead of creating a new ref to this Series)
+            refs = []
+            parent = []
+            for ser in parents:
+                if (
+                    ser is not None
+                    and ser._mgr.refs is not None
+                    and (ref := ser._mgr.refs[0]) is not None
+                ):
+                    refs.append(ref)
+                    parent.append(ser)
+                else:
+                    refs.append(None)
+
+        mgr = BlockManager(
+            blocks, axes, refs=refs, parent=parent, verify_integrity=False
+        )
     except ValueError as e:
         raise_construction_error(len(arrays), arrays[0].shape, axes, e)
     if consolidate:

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -14,14 +14,10 @@
     cast,
     overload,
 )
-import weakref
 
 import numpy as np
 
-from pandas._config import (
-    get_option,
-    using_copy_on_write,
-)
+from pandas._config import using_copy_on_write
 
 from pandas._typing import (
     Axis,
@@ -53,7 +49,6 @@
     get_unanimous_names,
 )
 from pandas.core.internals import concatenate_managers
-from pandas.core.internals.construction import dict_to_mgr
 
 if TYPE_CHECKING:
     from pandas import (
@@ -536,25 +531,18 @@ def __init__(
                     )
 
                 else:
-                    original_obj = obj
-                    name = new_name = getattr(obj, "name", None)
+                    name = getattr(obj, "name", None)
                     if ignore_index or name is None:
-                        new_name = current_column
+                        name = current_column
                         current_column += 1
 
                     # doing a row-wise concatenation so need everything
                     # to line up
                     if self._is_frame and axis == 1:
-                        new_name = 0
+                        name = 0
                     # mypy needs to know sample is not an NDFrame
                     sample = cast("DataFrame | Series", sample)
-                    obj = sample._constructor(obj, columns=[name], copy=False)
-                    if using_copy_on_write():
-                        # TODO(CoW): Remove when ref tracking in constructors works
-                        obj._mgr.parent = original_obj  # type: ignore[union-attr]
-                        obj._mgr.refs = [weakref.ref(original_obj._mgr.blocks[0])]  # type: ignore[union-attr]  # noqa: E501
-
-                    obj.columns = [new_name]
+                    obj = sample._constructor({name: obj}, copy=False)
 
                 self.objs.append(obj)
 
@@ -604,22 +592,7 @@ def get_result(self):
                 cons = sample._constructor_expanddim
 
                 index, columns = self.new_axes
-                mgr = dict_to_mgr(
-                    data,
-                    index,
-                    None,
-                    copy=self.copy,
-                    typ=get_option("mode.data_manager"),
-                )
-                if using_copy_on_write() and not self.copy:
-                    parents = [obj._mgr for obj in self.objs]
-                    mgr.parent = parents  # type: ignore[union-attr]
-                    refs = [
-                        weakref.ref(obj._mgr.blocks[0])  # type: ignore[union-attr]
-                        for obj in self.objs
-                    ]
-                    mgr.refs = refs  # type: ignore[union-attr]
-                df = cons(mgr, copy=False)
+                df = cons(data, index=index, copy=self.copy)
                 df.columns = columns
                 return df.__finalize__(self, method="concat")
 

diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
@@ -1,6 +1,12 @@
 import numpy as np
+import pytest
 
-from pandas import Series
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
 
 # -----------------------------------------------------------------------------
 # Copy/view behaviour for Series / DataFrame constructors
@@ -73,3 +79,37 @@ def test_series_from_series_with_reindex(using_copy_on_write):
     assert not np.shares_memory(ser.values, result.values)
     if using_copy_on_write:
         assert result._mgr.refs is None or result._mgr.refs[0] is None
+
+
+@pytest.mark.parametrize("dtype", [None, "int64"])
+@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
+def test_dataframe_from_dict_of_series(using_copy_on_write, columns, dtype):
+    # Case: constructing a DataFrame from Series objects with copy=False
+    # has to do a lazy copy following CoW rules
+    # (the default for DataFrame(dict) is still to copy to ensure consolidation)
+    s1 = Series([1, 2, 3])
+    s2 = Series([4, 5, 6])
+    s1_orig = s1.copy()
+    expected = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=dtype, columns=columns)
+
+    result = DataFrame({"a": s1, "b": s2}, columns=columns, dtype=dtype, copy=False)
+
+    # the shallow copy still shares memory
+    assert np.shares_memory(get_array(result, "a"), s1.values)
+
+    # mutating the new dataframe doesn't mutate original
+    result.iloc[0, 0] = 10
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(result, "a"), s1.values)
+        tm.assert_series_equal(s1, s1_orig)
+    else:
+        assert s1.iloc[0] == 10
+
+    # the same when modifying the parent series
+    result = DataFrame({"a": s1, "b": s2}, columns=columns, dtype=dtype, copy=False)
+    s1.iloc[0] = 10
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(result, "a"), s1.values)
+        tm.assert_frame_equal(result, expected)
+    else:
+        assert result.iloc[0, 0] == 10