Skip to content

API: Honor copy for dict-input in DataFrame #34872

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
9 changes: 7 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,12 @@ class DataFrame(NDFrame):
dtype : dtype, default None
Data type to force. Only a single dtype is allowed. If None, infer.
copy : bool, default False
Copy data from inputs. Only affects DataFrame / 2d ndarray input.
Copy data from inputs. This only applies to specific cases.

* `data` is a DataFrame or 2D NumPy array
* `data` is a dict with at most one column per NumPy dtype.

For all other cases, zero-copy construction cannot be ensured.

See Also
--------
Expand Down Expand Up @@ -456,7 +461,7 @@ def __init__(
)

elif isinstance(data, dict):
mgr = init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(data, index, columns, dtype=dtype, copy=copy)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords

Expand Down
18 changes: 12 additions & 6 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def arrays_to_mgr(
columns,
dtype: Optional[DtypeObj] = None,
verify_integrity: bool = True,
copy: bool = False,
):
"""
Segregate Series based on type and coerce into matrices.
Expand All @@ -80,7 +81,7 @@ def arrays_to_mgr(
index = ensure_index(index)

# don't force copy because getting jammed in an ndarray anyway
arrays = _homogenize(arrays, index, dtype)
arrays = _homogenize(arrays, index, dtype, copy=copy)

columns = ensure_index(columns)
else:
Expand Down Expand Up @@ -234,7 +235,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
return create_block_manager_from_blocks(block_values, [columns, index])


def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
def init_dict(
data: Dict, index, columns, dtype: Optional[DtypeObj] = None, copy: bool = False
):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
Expand Down Expand Up @@ -272,6 +275,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
keys = list(data.keys())
columns = data_names = Index(keys)
arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
# breakpoint()
# GH#24096 need copy to be deep for datetime64tz case
# TODO: See if we can avoid these copies
arrays = [
Expand All @@ -280,7 +284,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
arrays = [
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
]
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, copy=copy)


# ---------------------------------------------------------------------
Expand Down Expand Up @@ -326,14 +330,16 @@ def convert(v):
return values


def _homogenize(data, index, dtype: Optional[DtypeObj]):
def _homogenize(data, index, dtype: Optional[DtypeObj], copy: bool = False):
oindex = None
homogenized = []

for val in data:
if isinstance(val, ABCSeries):
if dtype is not None:
val = val.astype(dtype)
val = val.astype(dtype, copy=copy)
elif copy:
val = val.copy()
if val.index is not index:
# Forces alignment. No need to copy data since we
# are putting it into an ndarray later
Expand All @@ -349,7 +355,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]):
val = dict(val)
val = lib.fast_multiget(val, oindex._values, default=np.nan)
val = sanitize_array(
val, index, dtype=dtype, copy=False, raise_cast_failure=False
val, index, dtype=dtype, copy=copy, raise_cast_failure=False
)

homogenized.append(val)
Expand Down
11 changes: 7 additions & 4 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1817,10 +1817,13 @@ def _shape_compat(x):

first = arrays[0]
shape = (len(arrays),) + _shape_compat(first)

stacked = np.empty(shape, dtype=dtype)
for i, arr in enumerate(arrays):
stacked[i] = _asarray_compat(arr)
if len(arrays) == 1:
# allow for 0-copy construction from dict
stacked = _asarray_compat(first).reshape(shape)
else:
stacked = np.empty(shape, dtype=dtype)
for i, arr in enumerate(arrays):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't you use the original loop? I think that would also allow 0-copy construction, no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I don't follow what you mean by original loop.

The original loop I see is allocating memory and assigning into it, which doesn't allow 0-copy.

stacked[i] = _asarray_compat(arr)

return stacked, placement

Expand Down
24 changes: 22 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1909,12 +1909,16 @@ def test_constructor_ndarray_copy(self, float_frame):
assert not (df.values[6] == 6).all()

def test_constructor_series_copy(self, float_frame):
series = float_frame._series
series = float_frame._series.copy()

df = DataFrame({"A": series["A"]}, copy=True)
df["A"][:] = 5
assert not (series["A"] == 5).all()

df = DataFrame({"A": series["A"]})
df["A"][:] = 5

assert not (series["A"] == 5).all()
assert (series["A"] == 5).all()

def test_constructor_with_nas(self):
# GH 5016
Expand Down Expand Up @@ -2679,3 +2683,19 @@ def test_construction_from_set_raises(self):
msg = "Set type is unordered"
with pytest.raises(TypeError, match=msg):
pd.DataFrame({"a": {1, 2, 3}})


@pytest.mark.parametrize("copy", [False, True])
def test_dict_nocopy(copy):
    """Check that ``copy`` is honored when a DataFrame is built from a dict.

    One column comes from a NumPy array and one from ``pd.array``. The first
    row of both columns is overwritten through the frame, then the original
    inputs are inspected: with ``copy=True`` they must still hold ``1``;
    with ``copy=False`` the write must be visible in them (``0``).
    """
    ndarray_input = np.array([1, 2])
    pd_array_input = pd.array([1, 2])
    frame = pd.DataFrame({"a": ndarray_input, "b": pd_array_input}, copy=copy)

    # Write through the frame into row 0 of column "a", then column "b".
    for column_position in (0, 1):
        frame.iloc[0, column_position] = 0

    # Copied inputs keep their original value; shared inputs see the write.
    expected_first = 1 if copy else 0
    assert ndarray_input[0] == expected_first
    assert pd_array_input[0] == expected_first