Skip to content

Commit a6f9057

Browse files
committed
API / CoW: Copy arrays by default in Series constructor (pandas-dev#52022)
1 parent 75d7af3 commit a6f9057

File tree

14 files changed

+64
-21
lines changed

14 files changed

+64
-21
lines changed

doc/source/whatsnew/v2.0.0.rst

+4-3
Original file line number | Diff line number | Diff line change
@@ -193,12 +193,13 @@ Copy-on-Write improvements
193193
- The :class:`DataFrame` constructor, when constructing a DataFrame from a
194194
:class:`Series` and specifying ``copy=False``, will now respect Copy-on-Write.
195195

196-
- The :class:`DataFrame` constructor, when constructing from a NumPy array,
197-
will now copy the array by default to avoid mutating the :class:`DataFrame`
196+
- The :class:`DataFrame` and :class:`Series` constructors, when constructing from
197+
a NumPy array, will now copy the array by default to avoid mutating
198+
the :class:`DataFrame` / :class:`Series`
198199
when mutating the array. Specify ``copy=False`` to get the old behavior.
199200
When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write
200201
behavior when the NumPy array is modified after creation of the
201-
:class:`DataFrame`.
202+
:class:`DataFrame` / :class:`Series`.
202203

203204
- The :meth:`DataFrame.from_records` will now respect Copy-on-Write when called
204205
with a :class:`DataFrame`.

pandas/conftest.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -739,7 +739,7 @@ def _create_series(index):
739739
"""Helper for the _series dict"""
740740
size = len(index)
741741
data = np.random.randn(size)
742-
return Series(data, index=index, name="a")
742+
return Series(data, index=index, name="a", copy=False)
743743

744744

745745
_series = {

pandas/core/series.py

+10-2
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@
8888
validate_percentile,
8989
)
9090

91+
from pandas.core.dtypes.astype import astype_is_view
9192
from pandas.core.dtypes.cast import (
9293
LossySetitemError,
9394
convert_dtypes,
@@ -370,14 +371,14 @@ def __init__(
370371
index=None,
371372
dtype: Dtype | None = None,
372373
name=None,
373-
copy: bool = False,
374+
copy: bool | None = None,
374375
fastpath: bool = False,
375376
) -> None:
376377
if (
377378
isinstance(data, (SingleBlockManager, SingleArrayManager))
378379
and index is None
379380
and dtype is None
380-
and copy is False
381+
and (copy is False or copy is None)
381382
):
382383
if using_copy_on_write():
383384
data = data.copy(deep=False)
@@ -390,6 +391,13 @@ def __init__(
390391
self.name = name
391392
return
392393

394+
if isinstance(data, (ExtensionArray, np.ndarray)):
395+
if copy is not False and using_copy_on_write():
396+
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
397+
data = data.copy()
398+
if copy is None:
399+
copy = False
400+
393401
# we are called internally, so short-circuit
394402
if fastpath:
395403
# data is a ndarray, index is defined

pandas/tests/arrays/categorical/test_replace.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -60,15 +60,15 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
6060
# GH#26988
6161
cat = Categorical(["a", "b"])
6262
expected = Categorical(result)
63-
result = pd.Series(cat).replace(to_replace, value)._values
63+
result = pd.Series(cat, copy=False).replace(to_replace, value)._values
6464

6565
tm.assert_categorical_equal(result, expected)
6666
if to_replace == "b": # the "c" test is supposed to be unchanged
6767
with pytest.raises(AssertionError, match=expected_error_msg):
6868
# ensure non-inplace call does not affect original
6969
tm.assert_categorical_equal(cat, expected)
7070

71-
pd.Series(cat).replace(to_replace, value, inplace=True)
71+
pd.Series(cat, copy=False).replace(to_replace, value, inplace=True)
7272
tm.assert_categorical_equal(cat, expected)
7373

7474

pandas/tests/copy_view/test_astype.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -200,7 +200,9 @@ def test_astype_arrow_timestamp(using_copy_on_write):
200200
result = df.astype("timestamp[ns][pyarrow]")
201201
if using_copy_on_write:
202202
assert not result._mgr._has_no_reference(0)
203-
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._data)
203+
# TODO(CoW): arrow is not setting copy=False in the Series constructor
204+
# under the hood
205+
assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")._data)
204206

205207

206208
def test_convert_dtypes_infer_objects(using_copy_on_write):

pandas/tests/copy_view/test_constructors.py

+35-3
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def test_series_from_series(dtype, using_copy_on_write):
3030
result = Series(ser, dtype=dtype)
3131

3232
# the shallow copy still shares memory
33-
assert np.shares_memory(ser.values, result.values)
33+
assert np.shares_memory(get_array(ser), get_array(result))
3434

3535
if using_copy_on_write:
3636
assert result._mgr.blocks[0].refs.has_reference()
@@ -40,13 +40,13 @@ def test_series_from_series(dtype, using_copy_on_write):
4040
result.iloc[0] = 0
4141
assert ser.iloc[0] == 1
4242
# mutating triggered a copy-on-write -> no longer shares memory
43-
assert not np.shares_memory(ser.values, result.values)
43+
assert not np.shares_memory(get_array(ser), get_array(result))
4444
else:
4545
# mutating shallow copy does mutate original
4646
result.iloc[0] = 0
4747
assert ser.iloc[0] == 0
4848
# and still shares memory
49-
assert np.shares_memory(ser.values, result.values)
49+
assert np.shares_memory(get_array(ser), get_array(result))
5050

5151
# the same when modifying the parent
5252
result = Series(ser, dtype=dtype)
@@ -90,6 +90,38 @@ def test_series_from_series_with_reindex(using_copy_on_write):
9090
assert not result._mgr.blocks[0].refs.has_reference()
9191

9292

93+
@pytest.mark.parametrize("fastpath", [False, True])
94+
@pytest.mark.parametrize("dtype", [None, "int64"])
95+
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
96+
@pytest.mark.parametrize(
97+
"arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
98+
)
99+
def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
100+
if idx is None or dtype is not None:
101+
fastpath = False
102+
ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath)
103+
ser_orig = ser.copy()
104+
data = getattr(arr, "_data", arr)
105+
if using_copy_on_write:
106+
assert not np.shares_memory(get_array(ser), data)
107+
else:
108+
assert np.shares_memory(get_array(ser), data)
109+
110+
arr[0] = 100
111+
if using_copy_on_write:
112+
tm.assert_series_equal(ser, ser_orig)
113+
else:
114+
expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype)
115+
tm.assert_series_equal(ser, expected)
116+
117+
118+
@pytest.mark.parametrize("copy", [True, False, None])
119+
def test_series_from_array_different_dtype(using_copy_on_write, copy):
120+
arr = np.array([1, 2, 3], dtype="int64")
121+
ser = Series(arr, dtype="int32", copy=copy)
122+
assert not np.shares_memory(get_array(ser), arr)
123+
124+
93125
@pytest.mark.parametrize(
94126
"idx",
95127
[

pandas/tests/extension/base/constructors.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ def test_array_from_scalars(self, data):
2222
assert isinstance(result, type(data))
2323

2424
def test_series_constructor(self, data):
25-
result = pd.Series(data)
25+
result = pd.Series(data, copy=False)
2626
assert result.dtype == data.dtype
2727
assert len(result) == len(data)
2828
if hasattr(result._mgr, "blocks"):

pandas/tests/extension/base/methods.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -254,7 +254,7 @@ def test_fillna_copy_frame(self, data_missing):
254254

255255
def test_fillna_copy_series(self, data_missing):
256256
arr = data_missing.take([1, 1])
257-
ser = pd.Series(arr)
257+
ser = pd.Series(arr, copy=False)
258258
ser_orig = ser.copy()
259259

260260
filled_val = ser[0]

pandas/tests/extension/test_sparse.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -288,7 +288,7 @@ def test_fillna_copy_frame(self, data_missing, using_copy_on_write):
288288

289289
def test_fillna_copy_series(self, data_missing, using_copy_on_write):
290290
arr = data_missing.take([1, 1])
291-
ser = pd.Series(arr)
291+
ser = pd.Series(arr, copy=False)
292292

293293
filled_val = ser[0]
294294
result = ser.fillna(filled_val)

pandas/tests/internals/test_internals.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1362,7 +1362,7 @@ def check_can_hold_element(self, obj, elem, inplace: bool):
13621362

13631363
def check_series_setitem(self, elem, index: Index, inplace: bool):
13641364
arr = index._data.copy()
1365-
ser = Series(arr)
1365+
ser = Series(arr, copy=False)
13661366

13671367
self.check_can_hold_element(ser, elem, inplace)
13681368

pandas/tests/io/formats/test_format.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -2812,7 +2812,7 @@ def __getitem__(self, ix):
28122812
def dtype(self):
28132813
return DtypeStub()
28142814

2815-
series = Series(ExtTypeStub())
2815+
series = Series(ExtTypeStub(), copy=False)
28162816
res = repr(series) # This line crashed before #33770 was fixed.
28172817
expected = "0 [False True]\n" + "1 [ True False]\n" + "dtype: DtypeStub"
28182818
assert res == expected

pandas/tests/reductions/test_stat_reductions.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ def test_td64_mean(self, box):
7070
tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")
7171

7272
tdarr = tdi._data
73-
obj = box(tdarr)
73+
obj = box(tdarr, copy=False)
7474

7575
result = obj.mean()
7676
expected = np.array(tdarr).mean()

pandas/tests/series/indexing/test_setitem.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -578,7 +578,7 @@ def test_setitem_scalar_into_readonly_backing_data():
578578

579579
array = np.zeros(5)
580580
array.flags.writeable = False # make the array immutable
581-
series = Series(array)
581+
series = Series(array, copy=False)
582582

583583
for n in series.index:
584584
msg = "assignment destination is read-only"
@@ -593,7 +593,7 @@ def test_setitem_slice_into_readonly_backing_data():
593593

594594
array = np.zeros(5)
595595
array.flags.writeable = False # make the array immutable
596-
series = Series(array)
596+
series = Series(array, copy=False)
597597

598598
msg = "assignment destination is read-only"
599599
with pytest.raises(ValueError, match=msg):

pandas/tests/series/test_constructors.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -527,7 +527,7 @@ def test_categorical_sideeffects_free(self):
527527
# however, copy is False by default
528528
# so this WILL change values
529529
cat = Categorical(["a", "b", "c", "a"])
530-
s = Series(cat)
530+
s = Series(cat, copy=False)
531531
assert s.values is cat
532532
s = s.cat.rename_categories([1, 2, 3])
533533
assert s.values is not cat

0 commit comments

Comments
 (0)