Skip to content

API / CoW: Copy arrays by default in Series constructor #52022

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 29, 2023
7 changes: 4 additions & 3 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,12 +190,13 @@ Copy-on-Write improvements
of Series objects and specifying ``copy=False``, will now use a lazy copy
of those Series objects for the columns of the DataFrame (:issue:`50777`)

- The :class:`DataFrame` constructor, when constructing from a NumPy array,
will now copy the array by default to avoid mutating the :class:`DataFrame`
- The :class:`DataFrame` and :class:`Series` constructors, when constructing from
a NumPy array, will now copy the array by default to avoid mutating
the :class:`DataFrame` / :class:`Series`
when mutating the array. Specify ``copy=False`` to get the old behavior.
When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write
behavior when the NumPy array is modified after creation of the
:class:`DataFrame`.
:class:`DataFrame` / :class:`Series`.

- Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``)
will now always raise an warning when Copy-on-Write is enabled. In this mode,
Expand Down
2 changes: 1 addition & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ def _create_series(index):
"""Helper for the _series dict"""
size = len(index)
data = np.random.randn(size)
return Series(data, index=index, name="a")
return Series(data, index=index, name="a", copy=False)


_series = {
Expand Down
18 changes: 15 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
validate_percentile,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.cast import (
LossySetitemError,
convert_dtypes,
Expand Down Expand Up @@ -376,9 +377,15 @@ def __init__(
index=None,
dtype: Dtype | None = None,
name=None,
copy: bool = False,
copy: bool | None = None,
fastpath: bool = False,
) -> None:
if copy is None:
if using_copy_on_write():
copy = True
else:
copy = False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could easily move this below the first fastpath if block, when updating that to check for copy is None instead of copy is False

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough


if (
isinstance(data, (SingleBlockManager, SingleArrayManager))
and index is None
Expand All @@ -394,6 +401,11 @@ def __init__(
self.name = name
return

if isinstance(data, (ExtensionArray, np.ndarray)):
if copy and using_copy_on_write():
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
data = data.copy()

# we are called internally, so short-circuit
if fastpath:
# data is a ndarray, index is defined
Expand Down Expand Up @@ -6092,7 +6104,7 @@ def _construct_result(
# TODO: result should always be ArrayLike, but this fails for some
# JSONArray tests
dtype = getattr(result, "dtype", None)
out = self._constructor(result, index=self.index, dtype=dtype)
out = self._constructor(result, index=self.index, dtype=dtype, copy=False)
out = out.__finalize__(self)

# Set the result's name after __finalize__ is called because __finalize__
Expand All @@ -6111,7 +6123,7 @@ def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0
elif isinstance(other, (np.ndarray, list, tuple)):
if len(other) != len(self):
raise ValueError("Lengths must be equal")
other = self._constructor(other, self.index)
other = self._constructor(other, self.index, copy=False)
result = self._binop(other, op, level=level, fill_value=fill_value)
result.name = res_name
return result
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/categorical/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
result = pd.Series(cat).replace(to_replace, value)._values
result = pd.Series(cat, copy=False).replace(to_replace, value)._values

tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
with pytest.raises(AssertionError, match=expected_error_msg):
# ensure non-inplace call does not affect original
tm.assert_categorical_equal(cat, expected)

pd.Series(cat).replace(to_replace, value, inplace=True)
pd.Series(cat, copy=False).replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)


Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,11 @@ def test_astype_arrow_timestamp(using_copy_on_write):
result = df.astype("timestamp[ns][pyarrow]")
if using_copy_on_write:
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
# TODO(CoW): arrow is not setting copy=False in the Series constructor
# under the hood
assert not np.shares_memory(
get_array(df, "a"), get_array(result, "a")._pa_array
)


def test_convert_dtypes_infer_objects(using_copy_on_write):
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Expand Down Expand Up @@ -89,6 +90,38 @@ def test_series_from_series_with_reindex(using_copy_on_write):
assert not result._mgr.blocks[0].refs.has_reference()


@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
@pytest.mark.parametrize(
"arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
)
def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
if idx is None or dtype is not None:
fastpath = False
ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath)
ser_orig = ser.copy()
data = getattr(arr, "_data", arr)
if using_copy_on_write:
assert not np.shares_memory(get_array(ser), data)
else:
assert np.shares_memory(get_array(ser), data)

arr[0] = 100
if using_copy_on_write:
tm.assert_series_equal(ser, ser_orig)
else:
expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype)
tm.assert_series_equal(ser, expected)


@pytest.mark.parametrize("copy", [True, False, None])
def test_series_from_array_different_dtype(using_copy_on_write, copy):
arr = np.array([1, 2, 3], dtype="int64")
ser = Series(arr, dtype="int32", copy=copy)
assert not np.shares_memory(get_array(ser), arr)


@pytest.mark.parametrize(
"idx",
[
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_array_from_scalars(self, data):
assert isinstance(result, type(data))

def test_series_constructor(self, data):
result = pd.Series(data)
result = pd.Series(data, copy=False)
assert result.dtype == data.dtype
assert len(result) == len(data)
if hasattr(result._mgr, "blocks"):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def test_fillna_copy_frame(self, data_missing):

def test_fillna_copy_series(self, data_missing):
arr = data_missing.take([1, 1])
ser = pd.Series(arr)
ser = pd.Series(arr, copy=False)
ser_orig = ser.copy()

filled_val = ser[0]
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def test_fillna_copy_frame(self, data_missing, using_copy_on_write):

def test_fillna_copy_series(self, data_missing, using_copy_on_write):
arr = data_missing.take([1, 1])
ser = pd.Series(arr)
ser = pd.Series(arr, copy=False)

filled_val = ser[0]
result = ser.fillna(filled_val)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1364,7 +1364,7 @@ def check_can_hold_element(self, obj, elem, inplace: bool):

def check_series_setitem(self, elem, index: Index, inplace: bool):
arr = index._data.copy()
ser = Series(arr)
ser = Series(arr, copy=False)

self.check_can_hold_element(ser, elem, inplace)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/formats/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2821,7 +2821,7 @@ def __getitem__(self, ix):
def dtype(self):
return DtypeStub()

series = Series(ExtTypeStub())
series = Series(ExtTypeStub(), copy=False)
res = repr(series) # This line crashed before #33770 was fixed.
expected = "\n".join(
["0 [False True]", "1 [ True False]", "dtype: DtypeStub"]
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reductions/test_stat_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_td64_mean(self, box):
tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")

tdarr = tdi._data
obj = box(tdarr)
obj = box(tdarr, copy=False)

result = obj.mean()
expected = np.array(tdarr).mean()
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ def test_setitem_scalar_into_readonly_backing_data():

array = np.zeros(5)
array.flags.writeable = False # make the array immutable
series = Series(array)
series = Series(array, copy=False)

for n in series.index:
msg = "assignment destination is read-only"
Expand All @@ -593,7 +593,7 @@ def test_setitem_slice_into_readonly_backing_data():

array = np.zeros(5)
array.flags.writeable = False # make the array immutable
series = Series(array)
series = Series(array, copy=False)

msg = "assignment destination is read-only"
with pytest.raises(ValueError, match=msg):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ def test_categorical_sideeffects_free(self):
# however, copy is False by default
# so this WILL change values
cat = Categorical(["a", "b", "c", "a"])
s = Series(cat)
s = Series(cat, copy=False)
assert s.values is cat
s = s.cat.rename_categories([1, 2, 3])
assert s.values is not cat
Expand Down