From 7b892bd22d1f37e70db2ad23408251a426229143 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jun 2020 10:35:11 -0500 Subject: [PATCH 1/8] API: Honor copy for dict-input in DataFrame Closes https://github.com/pandas-dev/pandas/issues/32960 --- pandas/core/frame.py | 9 +++++++-- pandas/core/internals/construction.py | 18 ++++++++++++------ pandas/core/internals/managers.py | 11 +++++++---- pandas/tests/frame/test_constructors.py | 24 ++++++++++++++++++++++-- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 39ca7ed47f7fa..1a1f6d2f83cd8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -359,7 +359,12 @@ class DataFrame(NDFrame): dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. copy : bool, default False - Copy data from inputs. Only affects DataFrame / 2d ndarray input. + Copy data from inputs. This only applies to specific cases. + + * `data` is a DataFrame or 2D NumPy array + * `data` is a dict with at most one column per NumPy dtype. + + Or all other cases, zero-copy construction cannot be ensured. See Also -------- @@ -456,7 +461,7 @@ def __init__( ) elif isinstance(data, dict): - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d49f1f154a2c1..60e061449a25f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -64,6 +64,7 @@ def arrays_to_mgr( columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, + copy: bool = False, ): """ Segregate Series based on type and coerce into matrices. @@ -80,7 +81,7 @@ def arrays_to_mgr( index = ensure_index(index) # don't force copy because getting jammed in an ndarray anyway - arrays = _homogenize(arrays, index, dtype) + arrays = _homogenize(arrays, index, dtype, copy=copy) columns = ensure_index(columns) else: @@ -234,7 +235,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): return create_block_manager_from_blocks(block_values, [columns, index]) -def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): +def init_dict( + data: Dict, index, columns, dtype: Optional[DtypeObj] = None, copy: bool = False +): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. @@ -272,6 +275,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): keys = list(data.keys()) columns = data_names = Index(keys) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] + # breakpoint() # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies arrays = [ @@ -280,7 +284,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] - return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, copy=copy) # --------------------------------------------------------------------- @@ -326,14 +330,16 @@ def convert(v): return values -def _homogenize(data, index, dtype: Optional[DtypeObj]): +def _homogenize(data, index, dtype: Optional[DtypeObj], copy: bool = False): oindex = None homogenized = [] for val in data: if isinstance(val, ABCSeries): if dtype is not None: - val = val.astype(dtype) + val = val.astype(dtype, copy=copy) + elif copy: + val = val.copy() if val.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later @@ -349,7 +355,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): val = dict(val) val = lib.fast_multiget(val, oindex._values, default=np.nan) val = sanitize_array( - val, index, dtype=dtype, copy=False, raise_cast_failure=False + val, index, dtype=dtype, copy=copy, raise_cast_failure=False ) homogenized.append(val) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index eaf59051205d6..697d7013409cf 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1817,10 +1817,13 @@ def _shape_compat(x): first = arrays[0] shape = (len(arrays),) + _shape_compat(first) - - stacked = np.empty(shape, dtype=dtype) - for i, arr in enumerate(arrays): - stacked[i] = _asarray_compat(arr) + if len(arrays) == 1: + # allow for 0-copy construction from dict + stacked = _asarray_compat(first).reshape(shape) + else: + stacked = np.empty(shape, dtype=dtype) + for i, arr in enumerate(arrays): + stacked[i] = _asarray_compat(arr) return stacked, placement diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index baac87755c6d2..45cdae9dea1df 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1909,12 +1909,16 @@ def test_constructor_ndarray_copy(self, float_frame): assert not (df.values[6] == 6).all() def test_constructor_series_copy(self, float_frame): - series = float_frame._series + series = float_frame._series.copy() + + df = DataFrame({"A": series["A"]}, copy=True) + df["A"][:] = 5 + assert not (series["A"] == 5).all() df = DataFrame({"A": series["A"]}) df["A"][:] = 5 - assert not (series["A"] == 5).all() + assert (series["A"] == 5).all() def test_constructor_with_nas(self): # GH 5016 @@ -2679,3 +2683,19 @@ def test_construction_from_set_raises(self): msg = "Set type is unordered" with pytest.raises(TypeError, match=msg): pd.DataFrame({"a": {1, 2, 3}}) + + +@pytest.mark.parametrize("copy", [False, True]) +def test_dict_nocopy(copy): + a = np.array([1, 2]) + b = pd.array([1, 2]) + df = pd.DataFrame({"a": a, "b": b}, copy=copy) + df.iloc[0, 0] = 0 + df.iloc[0, 1] = 0 + + if copy: + assert a[0] == 1 + assert b[0] == 1 + else: + assert a[0] == 0 + assert b[0] == 0 From acf99dd7dd573970d47badaed57dbb6466e9bfd3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jun 2020 14:07:26 -0500 Subject: [PATCH 2/8] Fixups --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 25 ++++++++++++++----- pandas/core/internals/construction.py | 1 - pandas/tests/frame/test_constructors.py | 33 ++++++++++++------------- 4 files changed, 36 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a27e6e8433779..209198698bc45 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -261,6 +261,7 @@ Other enhancements - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) - :class:`Series.dt` and :class:`DatatimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). +- The :meth:`DataFrame` constructor now uses ``copy`` for dict-inputs to control whether copies of the arrays are made (:issue:`32960`) - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1a1f6d2f83cd8..1c9ab3f046752 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -358,13 +358,17 @@ class DataFrame(NDFrame): RangeIndex (0, 1, 2, ..., n) if no column labels are provided. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. - copy : bool, default False - Copy data from inputs. This only applies to specific cases. + copy : bool, optional + Copy data from inputs. This only applies for specific types of `data` + and the default behavior depends on the type of data. - * `data` is a DataFrame or 2D NumPy array - * `data` is a dict with at most one column per NumPy dtype. + * `data` is a DataFrame or 2D NumPy array: do *not* copy by default. + Specifying ``copy=True`` will copy the data. + * `data` is a dict with at most one column per NumPy dtype: copy by default. + Specifying ``copy=False`` will not copy any of the data. - Or all other cases, zero-copy construction cannot be ensured. + For all other cases, zero-copy construction cannot be ensured and `copy` + has no effect. See Also -------- @@ -440,7 +444,7 @@ def __init__( index: Optional[Axes] = None, columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, - copy: bool = False, + copy: Optional[bool] = None, ): if data is None: data = {} @@ -451,6 +455,7 @@ def __init__( data = data._mgr if isinstance(data, BlockManager): + copy = bool(copy) # None -> False if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath NDFrame.__init__(self, data) @@ -461,10 +466,15 @@ def __init__( ) elif isinstance(data, dict): + if copy is None: + # Copy by default for dict + copy = True mgr = init_dict(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords + copy = bool(copy) # None -> False + # masked recarray if isinstance(data, mrecords.MaskedRecords): mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) @@ -481,6 +491,7 @@ def __init__( mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): + copy = bool(copy) # None -> False if data.dtype.names: data_columns = list(data.dtype.names) data = {k: data[k] for k in data_columns} @@ -494,6 +505,7 @@ def __init__( # For data is list-like, or Iterable (will consume into list) elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): + copy = bool(copy) # None -> False if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: @@ -520,6 +532,7 @@ def __init__( else: mgr = init_dict({}, index, columns, dtype=dtype) else: + copy = bool(copy) # None -> False try: arr = np.array(data, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 60e061449a25f..b030cf753c785 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -275,7 +275,6 @@ def init_dict( keys = list(data.keys()) columns = data_names = Index(keys) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] - # breakpoint() # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies arrays = [ diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 45cdae9dea1df..b242f89a01468 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1909,13 +1909,13 @@ def test_constructor_ndarray_copy(self, float_frame): assert not (df.values[6] == 6).all() def test_constructor_series_copy(self, float_frame): - series = float_frame._series.copy() + series = float_frame._series df = DataFrame({"A": series["A"]}, copy=True) df["A"][:] = 5 assert not (series["A"] == 5).all() - df = DataFrame({"A": series["A"]}) + df = DataFrame({"A": series["A"]}, copy=False) df["A"][:] = 5 assert (series["A"] == 5).all() @@ -2684,18 +2684,17 @@ def test_construction_from_set_raises(self): with pytest.raises(TypeError, match=msg): pd.DataFrame({"a": {1, 2, 3}}) - -@pytest.mark.parametrize("copy", [False, True]) -def test_dict_nocopy(copy): - a = np.array([1, 2]) - b = pd.array([1, 2]) - df = pd.DataFrame({"a": a, "b": b}, copy=copy) - df.iloc[0, 0] = 0 - df.iloc[0, 1] = 0 - - if copy: - assert a[0] == 1 - assert b[0] == 1 - else: - assert a[0] == 0 - assert b[0] == 0 + @pytest.mark.parametrize("copy", [None, False, True]) + def test_dict_nocopy(self, copy): + a = np.array([1, 2]) + b = pd.array([1, 2]) + df = pd.DataFrame({"a": a, "b": b}, copy=copy) + df.iloc[0, 0] = 0 + df.iloc[0, 1] = 0 + + if copy is True or copy is None: + assert a[0] == 1 + assert b[0] == 1 + else: + assert a[0] == 0 + assert b[0] == 0 From 499080b0c89f4f59c77473287f8d5d8dabe30dd6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jun 2020 14:12:49 -0500 Subject: [PATCH 3/8] copy --- pandas/tests/frame/test_constructors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b242f89a01468..db7950e6dbe81 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1911,7 +1911,7 @@ def test_constructor_ndarray_copy(self, float_frame): def test_constructor_series_copy(self, float_frame): series = float_frame._series - df = DataFrame({"A": series["A"]}, copy=True) + df = DataFrame({"A": series["A"]}) # copy by default df["A"][:] = 5 assert not (series["A"] == 5).all() From 20c87ce10569c2247b664604c8d918144e3bee84 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jun 2020 14:49:02 -0500 Subject: [PATCH 4/8] fixup --- pandas/tests/extension/test_sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index f318934ef5e52..ac933e342bcff 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -259,7 +259,7 @@ def test_combine_le(self, data_repeated): def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) - df = pd.DataFrame({"A": arr}) + df = pd.DataFrame({"A": arr}, copy=False) filled_val = df.iloc[0, 0] result = df.fillna(filled_val) From b0b125d471ed01a86c77a714754e67b01c2479b5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jun 2020 14:56:44 -0500 Subject: [PATCH 5/8] simplify --- pandas/core/frame.py | 22 ++++++---------------- pandas/tests/extension/test_sparse.py | 2 +- pandas/tests/frame/test_constructors.py | 12 ++++++------ 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1c9ab3f046752..b2cf3781a1457 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -358,14 +358,13 @@ class DataFrame(NDFrame): RangeIndex (0, 1, 2, ..., n) if no column labels are provided. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. - copy : bool, optional - Copy data from inputs. This only applies for specific types of `data` - and the default behavior depends on the type of data. + copy : bool, default False + Copy data from inputs. This only applies for specific types of `data`. - * `data` is a DataFrame or 2D NumPy array: do *not* copy by default. + * `data` is a DataFrame or 2D NumPy array Specifying ``copy=True`` will copy the data. - * `data` is a dict with at most one column per NumPy dtype: copy by default. - Specifying ``copy=False`` will not copy any of the data. + * `data` is a dict with at most one column per NumPy dtype + Specifying ``copy=True`` will copy all of the values. For all other cases, zero-copy construction cannot be ensured and `copy` has no effect. @@ -444,7 +443,7 @@ def __init__( index: Optional[Axes] = None, columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, - copy: Optional[bool] = None, + copy: bool = False, ): if data is None: data = {} @@ -455,7 +454,6 @@ def __init__( data = data._mgr if isinstance(data, BlockManager): - copy = bool(copy) # None -> False if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath NDFrame.__init__(self, data) @@ -466,15 +464,10 @@ def __init__( ) elif isinstance(data, dict): - if copy is None: - # Copy by default for dict - copy = True mgr = init_dict(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords - copy = bool(copy) # None -> False - # masked recarray if isinstance(data, mrecords.MaskedRecords): mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) @@ -491,7 +484,6 @@ def __init__( mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): - copy = bool(copy) # None -> False if data.dtype.names: data_columns = list(data.dtype.names) data = {k: data[k] for k in data_columns} @@ -505,7 +497,6 @@ def __init__( # For data is list-like, or Iterable (will consume into list) elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): - copy = bool(copy) # None -> False if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: @@ -532,7 +523,6 @@ def __init__( else: mgr = init_dict({}, index, columns, dtype=dtype) else: - copy = bool(copy) # None -> False try: arr = np.array(data, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index ac933e342bcff..f318934ef5e52 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -259,7 +259,7 @@ def test_combine_le(self, data_repeated): def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) - df = pd.DataFrame({"A": arr}, copy=False) + df = pd.DataFrame({"A": arr}) filled_val = df.iloc[0, 0] result = df.fillna(filled_val) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index db7950e6dbe81..91be73146aba3 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1911,14 +1911,14 @@ def test_constructor_ndarray_copy(self, float_frame): def test_constructor_series_copy(self, float_frame): series = float_frame._series - df = DataFrame({"A": series["A"]}) # copy by default + df = DataFrame({"A": series["A"]}, copy=True) df["A"][:] = 5 - assert not (series["A"] == 5).all() - df = DataFrame({"A": series["A"]}, copy=False) - df["A"][:] = 5 + assert not all(series["A"] == 5).all() - assert (series["A"] == 5).all() + df = DataFrame({"A": series["A"]}) # no copy by default + df["A"][:] = 5 + assert all(series["A"] == 5).all() def test_constructor_with_nas(self): # GH 5016 @@ -2692,7 +2692,7 @@ def test_dict_nocopy(self, copy): df.iloc[0, 0] = 0 df.iloc[0, 1] = 0 - if copy is True or copy is None: + if copy: assert a[0] == 1 assert b[0] == 1 else: From f9b3f16508a22e0ff8030427eaee7a080c8b09ca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jun 2020 15:31:30 -0500 Subject: [PATCH 6/8] optional --- pandas/core/construction.py | 13 ++++++++++++- pandas/core/frame.py | 24 ++++++++++++++++++------ pandas/core/internals/construction.py | 10 +++++++--- pandas/tests/frame/test_constructors.py | 11 ++++++++--- 4 files changed, 45 insertions(+), 13 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index b110a316a76d9..5b4b0d9f91081 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -390,7 +390,7 @@ def sanitize_array( data, index: Optional["Index"], dtype: Optional[DtypeObj] = None, - copy: bool = False, + copy: Optional[bool] = False, raise_cast_failure: bool = False, ) -> ArrayLike: """ @@ -412,6 +412,9 @@ def sanitize_array( # GH#846 if isinstance(data, np.ndarray): + if copy is None: + # copy by default for DataFrame({"A": ndarray}) + copy = True if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage @@ -428,15 +431,20 @@ def sanitize_array( elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray + # no copy by default for DataFrame({"A": ndarray}) + if copy is None: + copy = False subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: + # no copy by default from DataFrame.__init__ subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: + copy = bool(copy) # None -> False if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: @@ -446,16 +454,19 @@ def sanitize_array( elif isinstance(data, range): # GH#16804 + copy = bool(copy) # None -> False arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) elif isinstance(data, abc.Set): raise TypeError("Set type is unordered") elif lib.is_scalar(data) and index is not None and dtype is not None: + copy = bool(copy) # None -> False data = maybe_cast_to_datetime(data, dtype) if not lib.is_scalar(data): data = data[0] subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) else: + copy = bool(copy) # None -> False subarr = _try_cast(data, dtype, copy, raise_cast_failure) # scalar like, GH diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b2cf3781a1457..1ad96c9c4c33f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -358,13 +358,19 @@ class DataFrame(NDFrame): RangeIndex (0, 1, 2, ..., n) if no column labels are provided. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. - copy : bool, default False - Copy data from inputs. This only applies for specific types of `data`. + copy : bool, optional + Copy data from inputs. This only applies for specific types of `data` + and the default behavior depends on `data`. - * `data` is a DataFrame or 2D NumPy array + * `data` is a DataFrame or 2D NumPy array: *no* copy by default. Specifying ``copy=True`` will copy the data. - * `data` is a dict with at most one column per NumPy dtype - Specifying ``copy=True`` will copy all of the values. + * `data` is a dict: + By default arrays in `data` with with NumPy dtypes in `data` are + copied, while extension types are not. Specifying ``copy=True`` + will copy all of the values, and ``copy=False`` will attempt to + not copy the data. Note that if `data` has more than one value with + the same NumPy dtype then then data will be copied, regardless of + the value of `copy`. For all other cases, zero-copy construction cannot be ensured and `copy` has no effect. @@ -443,7 +449,7 @@ def __init__( index: Optional[Axes] = None, columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, - copy: bool = False, + copy: Optional[bool] = None, ): if data is None: data = {} @@ -454,6 +460,7 @@ def __init__( data = data._mgr if isinstance(data, BlockManager): + copy = bool(copy) # None -> False if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath NDFrame.__init__(self, data) @@ -468,6 +475,8 @@ def __init__( elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords + copy = bool(copy) # None -> False + # masked recarray if isinstance(data, mrecords.MaskedRecords): mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) @@ -484,6 +493,7 @@ def __init__( mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): + copy = bool(copy) # None -> False if data.dtype.names: data_columns = list(data.dtype.names) data = {k: data[k] for k in data_columns} @@ -497,6 +507,7 @@ def __init__( # For data is list-like, or Iterable (will consume into list) elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): + copy = bool(copy) # None -> False if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: @@ -523,6 +534,7 @@ def __init__( else: mgr = init_dict({}, index, columns, dtype=dtype) else: + copy = bool(copy) # None -> False try: arr = np.array(data, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b030cf753c785..cfacc37389986 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -64,7 +64,7 @@ def arrays_to_mgr( columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, - copy: bool = False, + copy: Optional[bool] = False, ): """ Segregate Series based on type and coerce into matrices. @@ -236,7 +236,11 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): def init_dict( - data: Dict, index, columns, dtype: Optional[DtypeObj] = None, copy: bool = False + data: Dict, + index, + columns, + dtype: Optional[DtypeObj] = None, + copy: Optional[bool] = False, ): """ Segregate Series based on type and coerce into matrices. @@ -329,7 +333,7 @@ def convert(v): return values -def _homogenize(data, index, dtype: Optional[DtypeObj], copy: bool = False): +def _homogenize(data, index, dtype: Optional[DtypeObj], copy: Optional[bool] = False): oindex = None homogenized = [] diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 91be73146aba3..b8aa98d309942 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1914,11 +1914,11 @@ def test_constructor_series_copy(self, float_frame): df = DataFrame({"A": series["A"]}, copy=True) df["A"][:] = 5 - assert not all(series["A"] == 5).all() + assert not (series["A"] == 5).all() df = DataFrame({"A": series["A"]}) # no copy by default df["A"][:] = 5 - assert all(series["A"] == 5).all() + assert (series["A"] == 5).all() def test_constructor_with_nas(self): # GH 5016 @@ -2692,7 +2692,12 @@ def test_dict_nocopy(self, copy): df.iloc[0, 0] = 0 df.iloc[0, 1] = 0 - if copy: + if copy is None: + # copy for ndarray, no copy for EA + assert a[0] == 1 + assert b[0] == 0 + + elif copy: assert a[0] == 1 assert b[0] == 1 else: From 306d015312f49bd8e1a5c0cb67e31b5a02ca20af Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Jun 2020 11:21:12 -0500 Subject: [PATCH 7/8] Fixup --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/frame.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 858b6d01ed955..35f0b85188348 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -261,7 +261,7 @@ Other enhancements - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) - :class:`Series.dt` and :class:`DatatimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). -- The :meth:`DataFrame` constructor now uses ``copy`` for dict-inputs to control whether copies of the arrays are made (:issue:`32960`) +- The :class:`DataFrame` constructor now uses ``copy`` for dict-inputs to control whether copies of the arrays are made (:issue:`32960`) - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d638c58a978b3..dfeccebff78cf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -365,15 +365,15 @@ class DataFrame(NDFrame): * `data` is a DataFrame or 2D NumPy array: *no* copy by default. Specifying ``copy=True`` will copy the data. * `data` is a dict: - By default arrays in `data` with with NumPy dtypes in `data` are + By default arrays in `data` with NumPy dtypes in `data` are copied, while extension types are not. Specifying ``copy=True`` will copy all of the values, and ``copy=False`` will attempt to not copy the data. Note that if `data` has more than one value with the same NumPy dtype then then data will be copied, regardless of the value of `copy`. - For all other cases, zero-copy construction cannot be ensured and `copy` - has no effect. + For all other types of `data`, zero-copy construction cannot be ensured + and `copy` has no effect. See Also -------- From 9f716c829212c05eb0a5b916682c56b6f2228217 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 12 Jul 2020 15:57:06 -0500 Subject: [PATCH 8/8] fix comment --- pandas/core/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index ec40d7c44bc3e..f9a111c14a666 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -431,7 +431,7 @@ def sanitize_array( elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray - # no copy by default for DataFrame({"A": ndarray}) + # no copy by default for DataFrame({"A": extension_array}) if copy is None: copy = False subarr = data