API: honor copy=True when passing dict to DataFrame #38939
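For context, a minimal sketch of the constructor behavior this PR targets. The exact default for dict input has shifted across pandas versions, so treat the copy semantics shown as the post-PR contract rather than a universal guarantee:

```python
import numpy as np
import pandas as pd

arr = np.array([1.0, 2.0, 3.0])

# With copy=True honored, the frame gets its own buffer;
# copy=False instead aims to share the caller's arrays (and
# leaves the frame unconsolidated, one block per column).
df = pd.DataFrame({"a": arr}, copy=True)
df.iloc[0, 0] = 99.0
print(arr[0])  # 1.0 -- the caller's array is untouched
```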
Changes from 12 commits
@@ -650,7 +650,7 @@ def getPeriodData(nper=None):
 # make frame
 def makeTimeDataFrame(nper=None, freq="B"):
     data = getTimeSeriesData(nper, freq)
-    return DataFrame(data)
+    return DataFrame(data)._consolidate()
 
 
 def makeDataFrame():
Review comment: This change should in theory no longer be needed? (Assuming this was done to ensure a consolidated DataFrame when the default was changed to not copy, and thus resulted in a non-consolidated DataFrame; the same applies to the two cases just below.)
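Consolidation is what the `_consolidate()` call above guarantees: same-dtype columns get packed into a single 2-D block. A small sketch of how to observe this, using the internal `_mgr.nblocks` attribute (an implementation detail that may change between pandas versions):

```python
import pandas._testing as tm

df = tm.makeTimeDataFrame()  # all-float frame

# nblocks == 1 means the float columns share one consolidated 2-D block;
# one block per column would mean the frame is unconsolidated.
print(df._mgr.nblocks)
print(df._consolidate()._mgr.nblocks)  # always 1 after consolidation
```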
@@ -16,7 +16,7 @@
 
 import numpy as np
 
-from pandas._libs import internals as libinternals, lib
+from pandas._libs import NaT, internals as libinternals, lib
 from pandas._typing import ArrayLike, DtypeObj, Label, Shape
 from pandas.errors import PerformanceWarning
 from pandas.util._validators import validate_bool_kwarg
@@ -956,7 +956,15 @@ def fast_xs(self, loc: int) -> ArrayLike:
             # Such assignment may incorrectly coerce NaT to None
             # result[blk.mgr_locs] = blk._slice((slice(None), loc))
             for i, rl in enumerate(blk.mgr_locs):
-                result[rl] = blk.iget((i, loc))
+                out = blk.iget((i, loc))
+                if is_dtype_equal(blk.dtype, dtype) and dtype == "m8[ns]":
+                    # FIXME: kludge for NaT -> tdnat
+                    # TODO: need a test like test_sum_nanops_timedelta
+                    #  where initial DataFrame is not consolidated
+                    if out is NaT:
+                        result[rl] = np.timedelta64("NaT", "ns")
+                        continue
+                result[rl] = out
 
         if isinstance(dtype, ExtensionDtype):
             result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)

Review thread:
- Reviewer: I know you mention it, but this is very kludgy.
- Author: Yeah, I'm still looking for an alternative here.
- Author: De-kludged. I need to double-check, but I think this may solve a separate silent bug; if so, I'll make a separate PR for that.
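The kludge exists because `pd.NaT` is a single sentinel shared by datetime and timedelta results, and its class is datetime-like, so it cannot be dropped as-is into a `timedelta64[ns]` result. An illustration of the underlying behavior (plain NumPy and the public NaT singleton, not the pandas internals themselves):

```python
from datetime import datetime

import numpy as np
import pandas as pd

# pd.NaT subclasses datetime, so naive coercion treats it as datetime-like
print(isinstance(pd.NaT, datetime))  # True

# For a timedelta64 result, an explicitly typed NaT is unambiguous
result = np.empty(1, dtype="m8[ns]")
result[0] = np.timedelta64("NaT", "ns")
print(np.isnat(result[0]))  # True
```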
@@ -1662,7 +1670,9 @@ def fast_xs(self, loc):
 # Constructor Helpers
 
 
-def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
+def create_block_manager_from_blocks(
+    blocks, axes: List[Index], consolidate: bool = True
+) -> BlockManager:
     try:
         if len(blocks) == 1 and not isinstance(blocks[0], Block):
             # if blocks[0] is of length 0, return empty blocks
@@ -1679,7 +1689,8 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
         ]
 
         mgr = BlockManager(blocks, axes)
-        mgr._consolidate_inplace()
+        if consolidate:
+            mgr._consolidate_inplace()
         return mgr
 
     except ValueError as e:
@@ -1689,7 +1700,10 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
 
 
 def create_block_manager_from_arrays(
-    arrays, names: Index, axes: List[Index]
+    arrays,
+    names: Index,
+    axes: List[Index],
+    consolidate: bool = True,
 ) -> BlockManager:
     assert isinstance(names, Index)
     assert isinstance(axes, list)
@@ -1699,12 +1713,13 @@ def create_block_manager_from_arrays(
     # Note: just calling extract_array breaks tests that patch PandasArray._typ.
     arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays]
     try:
-        blocks = _form_blocks(arrays, names, axes)
+        blocks = _form_blocks(arrays, names, axes, consolidate)
         mgr = BlockManager(blocks, axes)
-        mgr._consolidate_inplace()
-        return mgr
     except ValueError as e:
         raise construction_error(len(arrays), arrays[0].shape, axes, e)
+    if consolidate:
+        mgr._consolidate_inplace()
+    return mgr
 
 
 def construction_error(tot_items, block_shape, axes, e=None):
@@ -1731,7 +1746,7 @@ def construction_error(tot_items, block_shape, axes, e=None):
 # -----------------------------------------------------------------------
 
 
-def _form_blocks(arrays, names: Index, axes) -> List[Block]:
+def _form_blocks(arrays, names: Index, axes, consolidate: bool) -> List[Block]:
     # put "leftover" items in float bucket, where else?
     # generalize?
     items_dict: DefaultDict[str, List] = defaultdict(list)
@@ -1757,23 +1772,31 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]:
 
     blocks: List[Block] = []
     if len(items_dict["FloatBlock"]):
-        float_blocks = _multi_blockify(items_dict["FloatBlock"])
+        float_blocks = _multi_blockify(
+            items_dict["FloatBlock"], consolidate=consolidate
+        )
         blocks.extend(float_blocks)
 
     if len(items_dict["ComplexBlock"]):
-        complex_blocks = _multi_blockify(items_dict["ComplexBlock"])
+        complex_blocks = _multi_blockify(
+            items_dict["ComplexBlock"], consolidate=consolidate
+        )
         blocks.extend(complex_blocks)
 
     if len(items_dict["TimeDeltaBlock"]):
-        timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"])
+        timedelta_blocks = _multi_blockify(
+            items_dict["TimeDeltaBlock"], consolidate=consolidate
+        )
         blocks.extend(timedelta_blocks)
 
     if len(items_dict["IntBlock"]):
-        int_blocks = _multi_blockify(items_dict["IntBlock"])
+        int_blocks = _multi_blockify(items_dict["IntBlock"], consolidate=consolidate)
         blocks.extend(int_blocks)
 
     if len(items_dict["DatetimeBlock"]):
-        datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE)
+        datetime_blocks = _simple_blockify(
+            items_dict["DatetimeBlock"], DT64NS_DTYPE, consolidate=consolidate
+        )
         blocks.extend(datetime_blocks)
 
     if len(items_dict["DatetimeTZBlock"]):
@@ -1784,11 +1807,15 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]:
         blocks.extend(dttz_blocks)
 
     if len(items_dict["BoolBlock"]):
-        bool_blocks = _simple_blockify(items_dict["BoolBlock"], np.bool_)
+        bool_blocks = _simple_blockify(
+            items_dict["BoolBlock"], np.bool_, consolidate=consolidate
+        )
         blocks.extend(bool_blocks)
 
     if len(items_dict["ObjectBlock"]) > 0:
-        object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
+        object_blocks = _simple_blockify(
+            items_dict["ObjectBlock"], np.object_, consolidate=consolidate
+        )
         blocks.extend(object_blocks)
 
     if len(items_dict["CategoricalBlock"]) > 0:
@@ -1827,11 +1854,14 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]:
     return blocks
 
 
-def _simple_blockify(tuples, dtype) -> List[Block]:
+def _simple_blockify(tuples, dtype, consolidate: bool) -> List[Block]:
     """
     return a single array of a block that has a single dtype; if dtype is
     not None, coerce to this dtype
     """
+    if not consolidate:
+        return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype)
+
     values, placement = _stack_arrays(tuples, dtype)
 
     # TODO: CHECK DTYPE?
@@ -1842,8 +1872,12 @@ def _simple_blockify(tuples, dtype) -> List[Block]:
     return [block]
 
 
-def _multi_blockify(tuples, dtype=None):
+def _multi_blockify(tuples, dtype=None, consolidate: bool = True):
     """ return an array of blocks that potentially have different dtypes """
 
+    if not consolidate:
+        return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype)
+
     # group by dtype
     grouper = itertools.groupby(tuples, lambda x: x[2].dtype)
@@ -1858,6 +1892,18 @@ def _multi_blockify(tuples, dtype=None):
     return new_blocks
 
 
+def _tuples_to_blocks_no_consolidate(tuples, dtype: Optional[DtypeObj]) -> List[Block]:
+    # tuples produced within _form_blocks are of the form (placement, whatever, array)
+    if dtype is not None:
+        return [
+            make_block(
+                np.atleast_2d(x[2].astype(dtype, copy=False)), placement=x[0], ndim=2
+            )
+            for x in tuples
+        ]
+    return [make_block(np.atleast_2d(x[2]), placement=x[0], ndim=2) for x in tuples]
+
+
 def _stack_arrays(tuples, dtype):
 
     # fml
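The no-consolidate path above keeps each input array as its own block instead of stacking same-dtype arrays into one buffer. Two NumPy idioms it leans on are worth noting; a small illustration in plain NumPy, outside the pandas internals:

```python
import numpy as np

arr = np.arange(3, dtype="f8")

# astype(..., copy=False) returns the input itself when no cast is needed,
# so already-correct arrays pass through without being copied
same = arr.astype("f8", copy=False)
print(same is arr)  # True

# atleast_2d wraps a 1-D array as a (1, n) view rather than copying it
two_d = np.atleast_2d(arr)
print(two_d.base is arr, two_d.shape)  # True (1, 3)
```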
@@ -1297,7 +1297,7 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne)
             f(df, 0)
 
     def test_comparison_protected_from_errstate(self):
-        missing_df = tm.makeDataFrame()
+        missing_df = tm.makeDataFrame()._consolidate()
         missing_df.iloc[0]["A"] = np.nan
         with np.errstate(invalid="ignore"):
             expected = missing_df.values < 0

Review thread:
- Reviewer: You are already doing this on creation (I understand, but find this fragile).
- Author: I don't think so.
- Author: I agree. Silver lining: finding the existing fragility.
- Reviewer: Actually, you are doing this on creation; maybe you recently added it. Prefer NOT to do this in the tests proper (in pandas/testing is OK).
- Author: Updated to do the consolidation in tm.makeDataFrame.