
API: Honor copy for dict-input in DataFrame #34872


Closed · wants to merge 11 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -287,6 +287,7 @@ Other enhancements
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable; pass `sort=False` to suppress this warning (:issue:`33015`)
- :class:`Series.dt` and :class:`DatetimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`).
- The :class:`DataFrame` constructor now uses ``copy`` for dict-inputs to control whether copies of the arrays are made, rather than ignoring it (:issue:`32960`)
Contributor:

Can you move this to 1.2?

- The :meth:`DataFrame.to_feather` method now supports additional keyword
arguments (e.g. to set the compression) that are added in pyarrow 0.17
(:issue:`33422`).
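A minimal sketch of the semantics the new whatsnew entry describes: passing ``copy=True`` guarantees the constructed frame does not alias the dict's arrays (this particular case already holds on released pandas, since dict-of-ndarray input is copied by default).

```python
import numpy as np
import pandas as pd

a = np.array([1, 2, 3])

# With copy=True, the constructor must copy the input array, so
# mutating the frame leaves `a` untouched.
df = pd.DataFrame({"A": a}, copy=True)
df.iloc[0, 0] = 99

print(a[0])  # 1 -> the original array is unchanged
```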
13 changes: 12 additions & 1 deletion pandas/core/construction.py
@@ -390,7 +390,7 @@ def sanitize_array(
data,
index: Optional["Index"],
dtype: Optional[DtypeObj] = None,
copy: bool = False,
copy: Optional[bool] = False,
raise_cast_failure: bool = False,
) -> ArrayLike:
"""
@@ -412,6 +412,9 @@ def sanitize_array(

# GH#846
if isinstance(data, np.ndarray):
if copy is None:
Contributor Author:

So if we ever wanted to reconcile the behavior between ndarray and extension array, we'd change this or L434.

IMO that's not worth the noise.

# copy by default for DataFrame({"A": ndarray})
copy = True
Contributor:

So you are changing the default here? This certainly needs a deprecation.

Contributor Author:

#34872 (comment) has the best summary. There's no API-breaking changes here.

Contributor:

OK, but why are we copying ndarrays here? The default is False; I'm happy to take a view for ndarrays, but doing opposite things for ndarrays and EAs seems strange.

Contributor Author:

I'm not sure. I don't think we should change it here.

If we want to deprecate the default behavior for ndarray or EA then that can be discussed as a followup. It should be relatively simple compared to this PR.

Contributor:

I do not think we should do this; it is horribly divergent behavior. We are deprecating tiny little things, and this is a major change. I am -1 unless this has the same behavior for EAs and ndarrays.

Contributor Author (@TomAugspurger, Jul 12, 2020):

We can't make API breaking changes in a minor release. And I don't see a cleaner way to solve #32960 than making copy apply to dict inputs too.

I'm fine with aligning the default behavior between ndarray and EA, but that will have to wait for 2.0. And I think that the PR implementing the deprecation should be a followup since it'll introduce a bunch of uninformative changes in the tests making the PR harder to review.

Contributor:

If you make this False I would be OK with it; True is a breaking change.

Contributor Author:

On master, we copy for dict of ndarrays

In [4]: import numpy as np

In [5]: import pandas as pd
In [6]: a = np.array([1, 2])

In [7]: df = pd.DataFrame({"A": a})

In [8]: df.iloc[0, 0] = 10

In [9]: a
Out[9]: array([1, 2])

Perhaps it's not clear from the diff, but this section is hit only for dict inputs. For direct ndarray data we continue not to copy by default (on this branch):

In [4]: a = np.array([1, 2])

In [5]: df = pd.DataFrame({"A": a})

In [6]: df.iloc[0, 0] = 10

In [7]: a
Out[7]: array([1, 2])

In [8]: b = np.ones((4, 4))

In [9]: df = pd.DataFrame(b)

In [10]: df.iloc[0, 0] = 10

In [11]: b
Out[11]:
array([[10.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.]])
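The aliasing shown in the sessions above can be checked directly with ``np.shares_memory`` (a sketch; the 2D-ndarray case reflects the view behavior demonstrated above, which newer pandas versions may change):

```python
import numpy as np
import pandas as pd

a = np.array([1, 2])
b = np.ones((4, 4))

# dict input: NumPy-dtype arrays are copied by default
df_dict = pd.DataFrame({"A": a})
print(np.shares_memory(a, df_dict["A"].to_numpy()))  # False: a copy was made

# 2D ndarray input: no copy by default in the sessions above, so the
# frame views the input (newer pandas versions may copy here)
df_2d = pd.DataFrame(b)
print(np.shares_memory(b, df_2d.to_numpy()))
```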

Contributor Author:

Although, now that I see your objection, other uses of sanitize_array will need to be checked. I've only updated DataFrame.__init__, but I'll need to ensure other places use copy=False if they were relying on the default.


if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
# possibility of nan -> garbage
@@ -428,15 +431,20 @@

elif isinstance(data, ABCExtensionArray):
# it is already ensured above this is not a PandasArray
# no copy by default for DataFrame({"A": extension_array})
if copy is None:
copy = False
subarr = data

if dtype is not None:
subarr = subarr.astype(dtype, copy=copy)
elif copy:
# no copy by default from DataFrame.__init__
subarr = subarr.copy()
return subarr

elif isinstance(data, (list, tuple)) and len(data) > 0:
copy = bool(copy) # None -> False
if dtype is not None:
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
else:
@@ -446,16 +454,19 @@

elif isinstance(data, range):
# GH#16804
copy = bool(copy) # None -> False
arr = np.arange(data.start, data.stop, data.step, dtype="int64")
subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
elif isinstance(data, abc.Set):
raise TypeError("Set type is unordered")
elif lib.is_scalar(data) and index is not None and dtype is not None:
copy = bool(copy) # None -> False
data = maybe_cast_to_datetime(data, dtype)
if not lib.is_scalar(data):
data = data[0]
subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype)
else:
copy = bool(copy) # None -> False
subarr = _try_cast(data, dtype, copy, raise_cast_failure)

# scalar like, GH
28 changes: 24 additions & 4 deletions pandas/core/frame.py
@@ -364,8 +364,22 @@ class DataFrame(NDFrame):
RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
dtype : dtype, default None
Data type to force. Only a single dtype is allowed. If None, infer.
copy : bool, default False
Copy data from inputs. Only affects DataFrame / 2d ndarray input.
copy : bool, optional
Copy data from inputs. This only applies for specific types of `data`
and the default behavior depends on `data`.

* `data` is a DataFrame or 2D NumPy array: *no* copy by default.
Specifying ``copy=True`` will copy the data.
* `data` is a dict:
  By default, arrays in `data` with NumPy dtypes are copied, while
  extension types are not. Specifying ``copy=True`` will copy all of
  the values, and ``copy=False`` will attempt to not copy the data.
  Note that if `data` has more than one value with the same NumPy
  dtype, the data will be copied regardless of the value of ``copy``.

For all other types of `data`, zero-copy construction cannot be ensured
and `copy` has no effect.

See Also
--------
@@ -441,7 +455,7 @@ def __init__(
index: Optional[Axes] = None,
columns: Optional[Axes] = None,
dtype: Optional[Dtype] = None,
copy: bool = False,
copy: Optional[bool] = None,
):
if data is None:
data = {}
@@ -452,6 +466,7 @@
data = data._mgr

if isinstance(data, BlockManager):
copy = bool(copy) # None -> False
if index is None and columns is None and dtype is None and copy is False:
# GH#33357 fastpath
NDFrame.__init__(self, data)
@@ -462,10 +477,12 @@
)

elif isinstance(data, dict):
mgr = init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(data, index, columns, dtype=dtype, copy=copy)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords

copy = bool(copy) # None -> False

# masked recarray
if isinstance(data, mrecords.MaskedRecords):
mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
@@ -482,6 +499,7 @@
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

elif isinstance(data, (np.ndarray, Series, Index)):
copy = bool(copy) # None -> False
if data.dtype.names:
data_columns = list(data.dtype.names)
data = {k: data[k] for k in data_columns}
@@ -495,6 +513,7 @@

# For data is list-like, or Iterable (will consume into list)
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
copy = bool(copy) # None -> False
if not isinstance(data, (abc.Sequence, ExtensionArray)):
data = list(data)
if len(data) > 0:
@@ -538,6 +557,7 @@ def __init__(
mgr = arrays_to_mgr(values, columns, index, columns, dtype=None)
else:
# Attempt to coerce to a numpy array
copy = bool(copy) # None -> False
try:
arr = np.array(data, dtype=dtype, copy=copy)
except (ValueError, TypeError) as err:
21 changes: 15 additions & 6 deletions pandas/core/internals/construction.py
@@ -64,6 +64,7 @@ def arrays_to_mgr(
columns,
dtype: Optional[DtypeObj] = None,
verify_integrity: bool = True,
copy: Optional[bool] = False,
):
"""
Segregate Series based on type and coerce into matrices.
@@ -80,7 +81,7 @@
index = ensure_index(index)

# don't force copy because getting jammed in an ndarray anyway
arrays = _homogenize(arrays, index, dtype)
arrays = _homogenize(arrays, index, dtype, copy=copy)

columns = ensure_index(columns)
else:
@@ -234,7 +235,13 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
return create_block_manager_from_blocks(block_values, [columns, index])


def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
def init_dict(
data: Dict,
index,
columns,
dtype: Optional[DtypeObj] = None,
copy: Optional[bool] = False,
):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
@@ -280,7 +287,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
arrays = [
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
]
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, copy=copy)


# ---------------------------------------------------------------------
@@ -326,14 +333,16 @@ def convert(v):
return values


def _homogenize(data, index, dtype: Optional[DtypeObj]):
def _homogenize(data, index, dtype: Optional[DtypeObj], copy: Optional[bool] = False):
oindex = None
homogenized = []

for val in data:
if isinstance(val, ABCSeries):
if dtype is not None:
val = val.astype(dtype)
val = val.astype(dtype, copy=copy)
elif copy:
val = val.copy()
if val.index is not index:
# Forces alignment. No need to copy data since we
# are putting it into an ndarray later
@@ -349,7 +358,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]):
val = dict(val)
val = lib.fast_multiget(val, oindex._values, default=np.nan)
val = sanitize_array(
val, index, dtype=dtype, copy=False, raise_cast_failure=False
val, index, dtype=dtype, copy=copy, raise_cast_failure=False
)

homogenized.append(val)
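The `copy` threaded through ``val.astype(dtype, copy=copy)`` in the hunk above follows the same convention as NumPy's `astype`: ``copy=False`` is best-effort, returning the input unchanged when no conversion is needed. A small NumPy illustration (plain NumPy, not pandas internals):

```python
import numpy as np

a = np.array([1.0, 2.0])

# copy=False is best-effort: when the dtype already matches, astype
# can return the input array itself (no copy at all)
same = a.astype(a.dtype, copy=False)
assert same is a

# copy=True (the default) always allocates fresh memory
dup = a.astype(a.dtype, copy=True)
assert dup is not a
assert not np.shares_memory(dup, a)
```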
11 changes: 7 additions & 4 deletions pandas/core/internals/managers.py
@@ -1822,10 +1822,13 @@ def _shape_compat(x):

first = arrays[0]
shape = (len(arrays),) + _shape_compat(first)

stacked = np.empty(shape, dtype=dtype)
for i, arr in enumerate(arrays):
stacked[i] = _asarray_compat(arr)
if len(arrays) == 1:
# allow for 0-copy construction from dict
stacked = _asarray_compat(first).reshape(shape)
else:
stacked = np.empty(shape, dtype=dtype)
for i, arr in enumerate(arrays):
Contributor:

Why can't you use the original loop? I think that would also allow 0-copy construction, no?

Contributor Author:

Sorry, I don't follow what you mean by original loop.

The original loop I see is allocating memory and assigning into it, which doesn't allow 0-copy.

stacked[i] = _asarray_compat(arr)
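The distinction being discussed can be shown with plain NumPy (a sketch using a bare array in place of `_asarray_compat`): allocating with ``np.empty`` and assigning always copies, while reshaping the single input array yields a view.

```python
import numpy as np

arr = np.arange(4)
arrays = [arr]
shape = (len(arrays),) + arr.shape

# original loop: allocate a buffer, then assign into it -> always copies
stacked = np.empty(shape, dtype=arr.dtype)
for i, a in enumerate(arrays):
    stacked[i] = a
print(np.shares_memory(stacked, arr))  # False: the data was copied

# single-array fast path: reshape returns a view -> 0-copy construction
view = arrays[0].reshape(shape)
print(np.shares_memory(view, arr))  # True
```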

return stacked, placement

26 changes: 25 additions & 1 deletion pandas/tests/frame/test_constructors.py
@@ -1941,11 +1941,15 @@ def test_constructor_ndarray_copy(self, float_frame):
def test_constructor_series_copy(self, float_frame):
series = float_frame._series

df = DataFrame({"A": series["A"]})
df = DataFrame({"A": series["A"]}, copy=True)
df["A"][:] = 5

assert not (series["A"] == 5).all()

df = DataFrame({"A": series["A"]}) # no copy by default
df["A"][:] = 5
assert (series["A"] == 5).all()

def test_constructor_with_nas(self):
# GH 5016
# na's in indices
@@ -2746,3 +2750,23 @@ def test_construction_from_set_raises(self):
msg = "Set type is unordered"
with pytest.raises(TypeError, match=msg):
pd.DataFrame({"a": {1, 2, 3}})

@pytest.mark.parametrize("copy", [None, False, True])
def test_dict_nocopy(self, copy):
a = np.array([1, 2])
b = pd.array([1, 2])
df = pd.DataFrame({"a": a, "b": b}, copy=copy)
df.iloc[0, 0] = 0
df.iloc[0, 1] = 0

if copy is None:
# copy for ndarray, no copy for EA
assert a[0] == 1
assert b[0] == 0

elif copy:
assert a[0] == 1
assert b[0] == 1
else:
assert a[0] == 0
assert b[0] == 0
Member:

@TomAugspurger I've just about gotten this working, but this last assertion is still failing. The trouble is that the df.iloc[0, 1] = 0 line ends up calling ExtensionBlock.set_inplace, which incorrectly sets an entire new array on the block (i.e. #35417)

Member:

Looks like this also breaks down if we add a second column with the same dtype as a, since it consolidates on the iloc.__setitem__.