Skip to content

Commit 5e7f7d8

Browse files
jbrockmendel and vladu
authored and committed
API: honor copy=True when passing dict to DataFrame (pandas-dev#38939)
1 parent dcb01bc commit 5e7f7d8

File tree

12 files changed

+241
-41
lines changed

12 files changed

+241
-41
lines changed

doc/source/whatsnew/v1.3.0.rst

+26
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,30 @@ both XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`)
110110
111111
For more, see :ref:`io.xml` in the user guide on IO tools.
112112

113+
.. _whatsnew_130.dataframe_honors_copy_with_dict:
114+
115+
DataFrame constructor honors ``copy=False`` with dict
116+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
117+
118+
When passing a dictionary to :class:`DataFrame` with ``copy=False``,
119+
a copy will no longer be made (:issue:`32960`)
120+
121+
.. ipython:: python
122+
123+
arr = np.array([1, 2, 3])
124+
df = pd.DataFrame({"A": arr, "B": arr.copy()}, copy=False)
125+
df
126+
127+
``df["A"]`` remains a view on ``arr``:
128+
129+
.. ipython:: python
130+
131+
arr[0] = 0
132+
assert df.iloc[0, 0] == 0
133+
134+
The default behavior when not passing ``copy`` will remain unchanged, i.e.
135+
a copy will be made.
136+
113137
.. _whatsnew_130.enhancements.other:
114138

115139
Other enhancements
@@ -546,6 +570,8 @@ Conversion
546570
- Bug in creating a :class:`DataFrame` from an empty ``np.recarray`` not retaining the original dtypes (:issue:`40121`)
547571
- Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
548572
- Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
573+
- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
574+
-
549575

550576
Strings
551577
^^^^^^^

pandas/core/frame.py

+24-16
Original file line numberDiff line numberDiff line change
@@ -476,8 +476,12 @@ class DataFrame(NDFrame, OpsMixin):
476476
RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
477477
dtype : dtype, default None
478478
Data type to force. Only a single dtype is allowed. If None, infer.
479-
copy : bool, default False
480-
Copy data from inputs. Only affects DataFrame / 2d ndarray input.
479+
copy : bool or None, default None
480+
Copy data from inputs.
481+
For dict data, the default of None behaves like ``copy=True``. For DataFrame
482+
or 2d ndarray input, the default of None behaves like ``copy=False``.
483+
484+
.. versionchanged:: 1.3.0
481485
482486
See Also
483487
--------
@@ -555,8 +559,16 @@ def __init__(
555559
index: Optional[Axes] = None,
556560
columns: Optional[Axes] = None,
557561
dtype: Optional[Dtype] = None,
558-
copy: bool = False,
562+
copy: Optional[bool] = None,
559563
):
564+
565+
if copy is None:
566+
if isinstance(data, dict) or data is None:
567+
# retain pre-GH#38939 default behavior
568+
copy = True
569+
else:
570+
copy = False
571+
560572
if data is None:
561573
data = {}
562574
if dtype is not None:
@@ -565,18 +577,13 @@ def __init__(
565577
if isinstance(data, DataFrame):
566578
data = data._mgr
567579

568-
# first check if a Manager is passed without any other arguments
569-
# -> use fastpath (without checking Manager type)
570-
if (
571-
index is None
572-
and columns is None
573-
and dtype is None
574-
and copy is False
575-
and isinstance(data, (BlockManager, ArrayManager))
576-
):
577-
# GH#33357 fastpath
578-
NDFrame.__init__(self, data)
579-
return
580+
if isinstance(data, (BlockManager, ArrayManager)):
581+
# first check if a Manager is passed without any other arguments
582+
# -> use fastpath (without checking Manager type)
583+
if index is None and columns is None and dtype is None and not copy:
584+
# GH#33357 fastpath
585+
NDFrame.__init__(self, data)
586+
return
580587

581588
manager = get_option("mode.data_manager")
582589

@@ -586,7 +593,8 @@ def __init__(
586593
)
587594

588595
elif isinstance(data, dict):
589-
mgr = dict_to_mgr(data, index, columns, dtype=dtype, typ=manager)
596+
# GH#38939 de facto copy defaults to False only in non-dict cases
597+
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
590598
elif isinstance(data, ma.MaskedArray):
591599
import numpy.ma.mrecords as mrecords
592600

pandas/core/groupby/groupby.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1807,7 +1807,9 @@ def describe(self, **kwargs):
18071807
result = self.apply(lambda x: x.describe(**kwargs))
18081808
if self.axis == 1:
18091809
return result.T
1810-
return result.unstack()
1810+
# FIXME: not being consolidated breaks
1811+
# test_describe_with_duplicate_output_column_names
1812+
return result._consolidate().unstack()
18111813

18121814
@final
18131815
def resample(self, rule, *args, **kwargs):

pandas/core/internals/construction.py

+30-4
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,11 @@ def arrays_to_mgr(
101101
arr_names,
102102
index,
103103
columns,
104+
*,
104105
dtype: Optional[DtypeObj] = None,
105106
verify_integrity: bool = True,
106107
typ: Optional[str] = None,
108+
consolidate: bool = True,
107109
) -> Manager:
108110
"""
109111
Segregate Series based on type and coerce into matrices.
@@ -131,7 +133,9 @@ def arrays_to_mgr(
131133
axes = [columns, index]
132134

133135
if typ == "block":
134-
return create_block_manager_from_arrays(arrays, arr_names, axes)
136+
return create_block_manager_from_arrays(
137+
arrays, arr_names, axes, consolidate=consolidate
138+
)
135139
elif typ == "array":
136140
if len(columns) != len(arrays):
137141
assert len(arrays) == 0
@@ -181,7 +185,7 @@ def rec_array_to_mgr(
181185
if columns is None:
182186
columns = arr_columns
183187

184-
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ)
188+
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype=dtype, typ=typ)
185189

186190
if copy:
187191
mgr = mgr.copy()
@@ -376,7 +380,13 @@ def maybe_squeeze_dt64tz(dta: ArrayLike) -> ArrayLike:
376380

377381

378382
def dict_to_mgr(
379-
data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str
383+
data: Dict,
384+
index,
385+
columns,
386+
*,
387+
dtype: Optional[DtypeObj] = None,
388+
typ: str = "block",
389+
copy: bool = True,
380390
) -> Manager:
381391
"""
382392
Segregate Series based on type and coerce into matrices.
@@ -414,6 +424,8 @@ def dict_to_mgr(
414424
val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
415425
arrays.loc[missing] = [val] * missing.sum()
416426

427+
arrays = list(arrays)
428+
417429
else:
418430
keys = list(data.keys())
419431
columns = data_names = Index(keys)
@@ -424,7 +436,21 @@ def dict_to_mgr(
424436
arrays = [
425437
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
426438
]
427-
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ)
439+
440+
if copy:
441+
# arrays_to_mgr (via form_blocks) won't make copies for EAs
442+
# dtype attr check to exclude EADtype-castable strs
443+
arrays = [
444+
x
445+
if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
446+
else x.copy()
447+
for x in arrays
448+
]
449+
# TODO: can we get rid of the dt64tz special case above?
450+
451+
return arrays_to_mgr(
452+
arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy
453+
)
428454

429455

430456
def nested_data_to_arrays(

pandas/core/internals/managers.py

+51-14
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,10 @@
5353

5454
import pandas.core.algorithms as algos
5555
from pandas.core.arrays.sparse import SparseDtype
56-
from pandas.core.construction import extract_array
56+
from pandas.core.construction import (
57+
ensure_wrapped_if_datetimelike,
58+
extract_array,
59+
)
5760
from pandas.core.indexers import maybe_convert_indices
5861
from pandas.core.indexes.api import (
5962
Float64Index,
@@ -991,6 +994,8 @@ def fast_xs(self, loc: int) -> ArrayLike:
991994
# Any]]"
992995
result = np.empty(n, dtype=dtype) # type: ignore[arg-type]
993996

997+
result = ensure_wrapped_if_datetimelike(result)
998+
994999
for blk in self.blocks:
9951000
# Such assignment may incorrectly coerce NaT to None
9961001
# result[blk.mgr_locs] = blk._slice((slice(None), loc))
@@ -1693,7 +1698,7 @@ def set_values(self, values: ArrayLike):
16931698

16941699

16951700
def create_block_manager_from_blocks(
1696-
blocks: List[Block], axes: List[Index]
1701+
blocks: List[Block], axes: List[Index], consolidate: bool = True
16971702
) -> BlockManager:
16981703
try:
16991704
mgr = BlockManager(blocks, axes)
@@ -1703,7 +1708,8 @@ def create_block_manager_from_blocks(
17031708
tot_items = sum(arr.shape[0] for arr in arrays)
17041709
raise construction_error(tot_items, arrays[0].shape[1:], axes, err)
17051710

1706-
mgr._consolidate_inplace()
1711+
if consolidate:
1712+
mgr._consolidate_inplace()
17071713
return mgr
17081714

17091715

@@ -1713,7 +1719,10 @@ def _extract_array(obj):
17131719

17141720

17151721
def create_block_manager_from_arrays(
1716-
arrays, names: Index, axes: List[Index]
1722+
arrays,
1723+
names: Index,
1724+
axes: List[Index],
1725+
consolidate: bool = True,
17171726
) -> BlockManager:
17181727
assert isinstance(names, Index)
17191728
assert isinstance(axes, list)
@@ -1722,12 +1731,13 @@ def create_block_manager_from_arrays(
17221731
arrays = [_extract_array(x) for x in arrays]
17231732

17241733
try:
1725-
blocks = _form_blocks(arrays, names, axes)
1734+
blocks = _form_blocks(arrays, names, axes, consolidate)
17261735
mgr = BlockManager(blocks, axes)
1727-
mgr._consolidate_inplace()
1728-
return mgr
17291736
except ValueError as e:
17301737
raise construction_error(len(arrays), arrays[0].shape, axes, e)
1738+
if consolidate:
1739+
mgr._consolidate_inplace()
1740+
return mgr
17311741

17321742

17331743
def construction_error(
@@ -1760,7 +1770,7 @@ def construction_error(
17601770

17611771

17621772
def _form_blocks(
1763-
arrays: List[ArrayLike], names: Index, axes: List[Index]
1773+
arrays: List[ArrayLike], names: Index, axes: List[Index], consolidate: bool
17641774
) -> List[Block]:
17651775
# put "leftover" items in float bucket, where else?
17661776
# generalize?
@@ -1786,15 +1796,21 @@ def _form_blocks(
17861796

17871797
blocks: List[Block] = []
17881798
if len(items_dict["NumericBlock"]):
1789-
numeric_blocks = _multi_blockify(items_dict["NumericBlock"])
1799+
numeric_blocks = _multi_blockify(
1800+
items_dict["NumericBlock"], consolidate=consolidate
1801+
)
17901802
blocks.extend(numeric_blocks)
17911803

17921804
if len(items_dict["TimeDeltaBlock"]):
1793-
timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"])
1805+
timedelta_blocks = _multi_blockify(
1806+
items_dict["TimeDeltaBlock"], consolidate=consolidate
1807+
)
17941808
blocks.extend(timedelta_blocks)
17951809

17961810
if len(items_dict["DatetimeBlock"]):
1797-
datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE)
1811+
datetime_blocks = _simple_blockify(
1812+
items_dict["DatetimeBlock"], DT64NS_DTYPE, consolidate=consolidate
1813+
)
17981814
blocks.extend(datetime_blocks)
17991815

18001816
if len(items_dict["DatetimeTZBlock"]):
@@ -1805,7 +1821,9 @@ def _form_blocks(
18051821
blocks.extend(dttz_blocks)
18061822

18071823
if len(items_dict["ObjectBlock"]) > 0:
1808-
object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
1824+
object_blocks = _simple_blockify(
1825+
items_dict["ObjectBlock"], np.object_, consolidate=consolidate
1826+
)
18091827
blocks.extend(object_blocks)
18101828

18111829
if len(items_dict["CategoricalBlock"]) > 0:
@@ -1844,11 +1862,14 @@ def _form_blocks(
18441862
return blocks
18451863

18461864

1847-
def _simple_blockify(tuples, dtype) -> List[Block]:
1865+
def _simple_blockify(tuples, dtype, consolidate: bool) -> List[Block]:
18481866
"""
18491867
return a single array of a block that has a single dtype; if dtype is
18501868
not None, coerce to this dtype
18511869
"""
1870+
if not consolidate:
1871+
return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype)
1872+
18521873
values, placement = _stack_arrays(tuples, dtype)
18531874

18541875
# TODO: CHECK DTYPE?
@@ -1859,8 +1880,12 @@ def _simple_blockify(tuples, dtype) -> List[Block]:
18591880
return [block]
18601881

18611882

1862-
def _multi_blockify(tuples, dtype: Optional[Dtype] = None):
1883+
def _multi_blockify(tuples, dtype: Optional[DtypeObj] = None, consolidate: bool = True):
18631884
""" return an array of blocks that potentially have different dtypes """
1885+
1886+
if not consolidate:
1887+
return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype)
1888+
18641889
# group by dtype
18651890
grouper = itertools.groupby(tuples, lambda x: x[1].dtype)
18661891

@@ -1880,6 +1905,18 @@ def _multi_blockify(tuples, dtype: Optional[Dtype] = None):
18801905
return new_blocks
18811906

18821907

1908+
def _tuples_to_blocks_no_consolidate(tuples, dtype: Optional[DtypeObj]) -> List[Block]:
1909+
# tuples produced within _form_blocks are of the form (placement, whatever, array)
1910+
if dtype is not None:
1911+
return [
1912+
new_block(
1913+
np.atleast_2d(x[1].astype(dtype, copy=False)), placement=x[0], ndim=2
1914+
)
1915+
for x in tuples
1916+
]
1917+
return [new_block(np.atleast_2d(x[1]), placement=x[0], ndim=2) for x in tuples]
1918+
1919+
18831920
def _stack_arrays(tuples, dtype: np.dtype):
18841921

18851922
placement, arrays = zip(*tuples)

pandas/tests/arithmetic/test_numeric.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,6 @@ def test_df_div_zero_series_does_not_commute(self):
538538
def test_df_mod_zero_df(self, using_array_manager):
539539
# GH#3590, modulo as ints
540540
df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})
541-
542541
# this is technically wrong, as the integer portion is coerced to float
543542
first = Series([0, 0, 0, 0])
544543
if not using_array_manager:
@@ -551,6 +550,15 @@ def test_df_mod_zero_df(self, using_array_manager):
551550
result = df % df
552551
tm.assert_frame_equal(result, expected)
553552

553+
# GH#38939 If we don't pass copy=False, df is consolidated and
554+
# result["first"] is float64 instead of int64
555+
df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}, copy=False)
556+
first = Series([0, 0, 0, 0], dtype="int64")
557+
second = Series([np.nan, np.nan, np.nan, 0])
558+
expected = pd.DataFrame({"first": first, "second": second})
559+
result = df % df
560+
tm.assert_frame_equal(result, expected)
561+
554562
def test_df_mod_zero_array(self):
555563
# GH#3590, modulo as ints
556564
df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})

pandas/tests/extension/decimal/array.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ def take(self, indexer, allow_fill=False, fill_value=None):
150150
return self._from_sequence(result)
151151

152152
def copy(self):
153-
return type(self)(self._data.copy())
153+
return type(self)(self._data.copy(), dtype=self.dtype)
154154

155155
def astype(self, dtype, copy=True):
156156
if is_dtype_equal(dtype, self._dtype):

0 commit comments

Comments (0)