Skip to content

API / CoW: Copy arrays by default in Series constructor #52022

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 29, 2023
7 changes: 4 additions & 3 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,12 +193,13 @@ Copy-on-Write improvements
- The :class:`DataFrame` constructor, when constructing a DataFrame from a
:class:`Series` and specifying ``copy=False``, will now respect Copy-on-Write.

- The :class:`DataFrame` constructor, when constructing from a NumPy array,
will now copy the array by default to avoid mutating the :class:`DataFrame`
- The :class:`DataFrame` and :class:`Series` constructors, when constructing from
a NumPy array, will now copy the array by default to avoid mutating
the :class:`DataFrame` / :class:`Series`
when mutating the array. Specify ``copy=False`` to get the old behavior.
When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write
behavior when the NumPy array is modified after creation of the
:class:`DataFrame`.
:class:`DataFrame` / :class:`Series`.

- The :meth:`DataFrame.from_records` will now respect Copy-on-Write when called
with a :class:`DataFrame`.
Expand Down
2 changes: 1 addition & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ def _create_series(index):
"""Helper for the _series dict"""
size = len(index)
data = np.random.randn(size)
return Series(data, index=index, name="a")
return Series(data, index=index, name="a", copy=False)


_series = {
Expand Down
16 changes: 12 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
validate_percentile,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.cast import (
LossySetitemError,
convert_dtypes,
Expand Down Expand Up @@ -373,14 +374,14 @@ def __init__(
index=None,
dtype: Dtype | None = None,
name=None,
copy: bool = False,
copy: bool | None = None,
fastpath: bool = False,
) -> None:
if (
isinstance(data, (SingleBlockManager, SingleArrayManager))
and index is None
and dtype is None
and copy is False
and (copy is False or copy is None)
):
if using_copy_on_write():
data = data.copy(deep=False)
Expand All @@ -393,6 +394,13 @@ def __init__(
self.name = name
return

if isinstance(data, (ExtensionArray, np.ndarray)):
if copy is not False and using_copy_on_write():
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
data = data.copy()
if copy is None:
copy = False

# we are called internally, so short-circuit
if fastpath:
# data is a ndarray, index is defined
Expand Down Expand Up @@ -5855,7 +5863,7 @@ def _construct_result(
# TODO: result should always be ArrayLike, but this fails for some
# JSONArray tests
dtype = getattr(result, "dtype", None)
out = self._constructor(result, index=self.index, dtype=dtype)
out = self._constructor(result, index=self.index, dtype=dtype, copy=False)
out = out.__finalize__(self)

# Set the result's name after __finalize__ is called because __finalize__
Expand All @@ -5874,7 +5882,7 @@ def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0
elif isinstance(other, (np.ndarray, list, tuple)):
if len(other) != len(self):
raise ValueError("Lengths must be equal")
other = self._constructor(other, self.index)
other = self._constructor(other, self.index, copy=False)
result = self._binop(other, op, level=level, fill_value=fill_value)
result.name = res_name
return result
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/categorical/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
result = pd.Series(cat).replace(to_replace, value)._values
result = pd.Series(cat, copy=False).replace(to_replace, value)._values

tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
with pytest.raises(AssertionError, match=expected_error_msg):
# ensure non-inplace call does not affect original
tm.assert_categorical_equal(cat, expected)

pd.Series(cat).replace(to_replace, value, inplace=True)
pd.Series(cat, copy=False).replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)


Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,11 @@ def test_astype_arrow_timestamp(using_copy_on_write):
result = df.astype("timestamp[ns][pyarrow]")
if using_copy_on_write:
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
# TODO(CoW): arrow is not setting copy=False in the Series constructor
# under the hood
assert not np.shares_memory(
get_array(df, "a"), get_array(result, "a")._pa_array
)


def test_convert_dtypes_infer_objects(using_copy_on_write):
Expand Down
38 changes: 35 additions & 3 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_series_from_series(dtype, using_copy_on_write):
result = Series(ser, dtype=dtype)

# the shallow copy still shares memory
assert np.shares_memory(ser.values, result.values)
assert np.shares_memory(get_array(ser), get_array(result))

if using_copy_on_write:
assert result._mgr.blocks[0].refs.has_reference()
Expand All @@ -40,13 +40,13 @@ def test_series_from_series(dtype, using_copy_on_write):
result.iloc[0] = 0
assert ser.iloc[0] == 1
# mutating triggered a copy-on-write -> no longer shares memory
assert not np.shares_memory(ser.values, result.values)
assert not np.shares_memory(get_array(ser), get_array(result))
else:
# mutating shallow copy does mutate original
result.iloc[0] = 0
assert ser.iloc[0] == 0
# and still shares memory
assert np.shares_memory(ser.values, result.values)
assert np.shares_memory(get_array(ser), get_array(result))

# the same when modifying the parent
result = Series(ser, dtype=dtype)
Expand Down Expand Up @@ -90,6 +90,38 @@ def test_series_from_series_with_reindex(using_copy_on_write):
assert not result._mgr.blocks[0].refs.has_reference()


@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
@pytest.mark.parametrize(
"arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
)
def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
if idx is None or dtype is not None:
fastpath = False
ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath)
ser_orig = ser.copy()
data = getattr(arr, "_data", arr)
if using_copy_on_write:
assert not np.shares_memory(get_array(ser), data)
else:
assert np.shares_memory(get_array(ser), data)

arr[0] = 100
if using_copy_on_write:
tm.assert_series_equal(ser, ser_orig)
else:
expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype)
tm.assert_series_equal(ser, expected)


@pytest.mark.parametrize("copy", [True, False, None])
def test_series_from_array_different_dtype(using_copy_on_write, copy):
arr = np.array([1, 2, 3], dtype="int64")
ser = Series(arr, dtype="int32", copy=copy)
assert not np.shares_memory(get_array(ser), arr)


@pytest.mark.parametrize(
"idx",
[
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_array_from_scalars(self, data):
assert isinstance(result, type(data))

def test_series_constructor(self, data):
result = pd.Series(data)
result = pd.Series(data, copy=False)
assert result.dtype == data.dtype
assert len(result) == len(data)
if hasattr(result._mgr, "blocks"):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def test_fillna_copy_frame(self, data_missing):

def test_fillna_copy_series(self, data_missing):
arr = data_missing.take([1, 1])
ser = pd.Series(arr)
ser = pd.Series(arr, copy=False)
ser_orig = ser.copy()

filled_val = ser[0]
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def test_fillna_copy_frame(self, data_missing, using_copy_on_write):

def test_fillna_copy_series(self, data_missing, using_copy_on_write):
arr = data_missing.take([1, 1])
ser = pd.Series(arr)
ser = pd.Series(arr, copy=False)

filled_val = ser[0]
result = ser.fillna(filled_val)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1364,7 +1364,7 @@ def check_can_hold_element(self, obj, elem, inplace: bool):

def check_series_setitem(self, elem, index: Index, inplace: bool):
arr = index._data.copy()
ser = Series(arr)
ser = Series(arr, copy=False)

self.check_can_hold_element(ser, elem, inplace)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/formats/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2821,7 +2821,7 @@ def __getitem__(self, ix):
def dtype(self):
return DtypeStub()

series = Series(ExtTypeStub())
series = Series(ExtTypeStub(), copy=False)
res = repr(series) # This line crashed before #33770 was fixed.
expected = "\n".join(
["0 [False True]", "1 [ True False]", "dtype: DtypeStub"]
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reductions/test_stat_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_td64_mean(self, box):
tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")

tdarr = tdi._data
obj = box(tdarr)
obj = box(tdarr, copy=False)

result = obj.mean()
expected = np.array(tdarr).mean()
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ def test_setitem_scalar_into_readonly_backing_data():

array = np.zeros(5)
array.flags.writeable = False # make the array immutable
series = Series(array)
series = Series(array, copy=False)

for n in series.index:
msg = "assignment destination is read-only"
Expand All @@ -593,7 +593,7 @@ def test_setitem_slice_into_readonly_backing_data():

array = np.zeros(5)
array.flags.writeable = False # make the array immutable
series = Series(array)
series = Series(array, copy=False)

msg = "assignment destination is read-only"
with pytest.raises(ValueError, match=msg):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ def test_categorical_sideeffects_free(self):
# however, copy is False by default
# so this WILL change values
cat = Categorical(["a", "b", "c", "a"])
s = Series(cat)
s = Series(cat, copy=False)
assert s.values is cat
s = s.cat.rename_categories([1, 2, 3])
assert s.values is not cat
Expand Down