API: Honor copy for dict-input in DataFrame

TomAugspurger · TomAugspurger · commit 7b892bd22d1f · 2020-06-19T10:35:16.000-05:00
Closes pandas-dev#32960
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -359,7 +359,12 @@ class DataFrame(NDFrame):
     dtype : dtype, default None
         Data type to force. Only a single dtype is allowed. If None, infer.
     copy : bool, default False
-        Copy data from inputs. Only affects DataFrame / 2d ndarray input.
+        Copy data from inputs. This only applies to specific cases.
+
+        * `data` is a DataFrame or 2D NumPy array
+        * `data` is a dict with at most one column per NumPy dtype.
+
+        For all other cases, zero-copy construction cannot be ensured.
 
     See Also
     --------
@@ -456,7 +461,7 @@ def __init__(
             )
 
         elif isinstance(data, dict):
-            mgr = init_dict(data, index, columns, dtype=dtype)
+            mgr = init_dict(data, index, columns, dtype=dtype, copy=copy)
         elif isinstance(data, ma.MaskedArray):
             import numpy.ma.mrecords as mrecords
 
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -64,6 +64,7 @@ def arrays_to_mgr(
     columns,
     dtype: Optional[DtypeObj] = None,
     verify_integrity: bool = True,
+    copy: bool = False,
 ):
     """
     Segregate Series based on type and coerce into matrices.
@@ -80,7 +81,7 @@ def arrays_to_mgr(
             index = ensure_index(index)
 
         # don't force copy because getting jammed in an ndarray anyway
-        arrays = _homogenize(arrays, index, dtype)
+        arrays = _homogenize(arrays, index, dtype, copy=copy)
 
         columns = ensure_index(columns)
     else:
@@ -234,7 +235,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
     return create_block_manager_from_blocks(block_values, [columns, index])
 
 
-def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
+def init_dict(
+    data: Dict, index, columns, dtype: Optional[DtypeObj] = None, copy: bool = False
+):
     """
     Segregate Series based on type and coerce into matrices.
     Needs to handle a lot of exceptional cases.
@@ -272,6 +275,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
         keys = list(data.keys())
         columns = data_names = Index(keys)
         arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
+        # NOTE: `copy` is not applied here; it is forwarded to arrays_to_mgr below.
         # GH#24096 need copy to be deep for datetime64tz case
         # TODO: See if we can avoid these copies
         arrays = [
@@ -280,7 +284,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
         arrays = [
             arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
         ]
-    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
+    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, copy=copy)
 
 
 # ---------------------------------------------------------------------
@@ -326,14 +330,16 @@ def convert(v):
     return values
 
 
-def _homogenize(data, index, dtype: Optional[DtypeObj]):
+def _homogenize(data, index, dtype: Optional[DtypeObj], copy: bool = False):
     oindex = None
     homogenized = []
 
     for val in data:
         if isinstance(val, ABCSeries):
             if dtype is not None:
-                val = val.astype(dtype)
+                val = val.astype(dtype, copy=copy)
+            elif copy:
+                val = val.copy()
             if val.index is not index:
                 # Forces alignment. No need to copy data since we
                 # are putting it into an ndarray later
@@ -349,7 +355,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]):
                     val = dict(val)
                 val = lib.fast_multiget(val, oindex._values, default=np.nan)
             val = sanitize_array(
-                val, index, dtype=dtype, copy=False, raise_cast_failure=False
+                val, index, dtype=dtype, copy=copy, raise_cast_failure=False
             )
 
         homogenized.append(val)
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1817,10 +1817,13 @@ def _shape_compat(x):
 
     first = arrays[0]
     shape = (len(arrays),) + _shape_compat(first)
-
-    stacked = np.empty(shape, dtype=dtype)
-    for i, arr in enumerate(arrays):
-        stacked[i] = _asarray_compat(arr)
+    if len(arrays) == 1:
+        # allow for 0-copy construction from dict
+        stacked = _asarray_compat(first).reshape(shape)
+    else:
+        stacked = np.empty(shape, dtype=dtype)
+        for i, arr in enumerate(arrays):
+            stacked[i] = _asarray_compat(arr)
 
     return stacked, placement
 
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -1909,12 +1909,16 @@ def test_constructor_ndarray_copy(self, float_frame):
         assert not (df.values[6] == 6).all()
 
     def test_constructor_series_copy(self, float_frame):
-        series = float_frame._series
+        series = float_frame._series.copy()
+
+        df = DataFrame({"A": series["A"]}, copy=True)
+        df["A"][:] = 5
+        assert not (series["A"] == 5).all()
 
         df = DataFrame({"A": series["A"]})
         df["A"][:] = 5
 
-        assert not (series["A"] == 5).all()
+        assert (series["A"] == 5).all()
 
     def test_constructor_with_nas(self):
         # GH 5016
@@ -2679,3 +2683,19 @@ def test_construction_from_set_raises(self):
         msg = "Set type is unordered"
         with pytest.raises(TypeError, match=msg):
             pd.DataFrame({"a": {1, 2, 3}})
+
+
+@pytest.mark.parametrize("copy", [False, True])
+def test_dict_nocopy(copy):
+    a = np.array([1, 2])
+    b = pd.array([1, 2])
+    df = pd.DataFrame({"a": a, "b": b}, copy=copy)
+    df.iloc[0, 0] = 0
+    df.iloc[0, 1] = 0
+
+    if copy:
+        assert a[0] == 1
+        assert b[0] == 1
+    else:
+        assert a[0] == 0
+        assert b[0] == 0