Skip to content

Commit d95fc0b

Browse files
authored
API / CoW: Copy arrays by default in Series constructor (#52022)
1 parent beec0e8 commit d95fc0b

File tree

14 files changed, with 68 additions (+68) and 23 deletions (-23).

doc/source/whatsnew/v2.0.0.rst

+4 -3
Original file line number | Diff line number | Diff line change
@@ -193,12 +193,13 @@ Copy-on-Write improvements
193193
- The :class:`DataFrame` constructor, when constructing a DataFrame from a
194194
:class:`Series` and specifying ``copy=False``, will now respect Copy-on-Write.
195195

196-
- The :class:`DataFrame` constructor, when constructing from a NumPy array,
197-
will now copy the array by default to avoid mutating the :class:`DataFrame`
196+
- The :class:`DataFrame` and :class:`Series` constructors, when constructing from
197+
a NumPy array, will now copy the array by default to avoid mutating
198+
the :class:`DataFrame` / :class:`Series`
198199
when mutating the array. Specify ``copy=False`` to get the old behavior.
199200
When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write
200201
behavior when the NumPy array is modified after creation of the
201-
:class:`DataFrame`.
202+
:class:`DataFrame` / :class:`Series`.
202203

203204
- The :meth:`DataFrame.from_records` will now respect Copy-on-Write when called
204205
with a :class:`DataFrame`.

pandas/conftest.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -708,7 +708,7 @@ def _create_series(index):
708708
"""Helper for the _series dict"""
709709
size = len(index)
710710
data = np.random.randn(size)
711-
return Series(data, index=index, name="a")
711+
return Series(data, index=index, name="a", copy=False)
712712

713713

714714
_series = {

pandas/core/series.py

+12 -4
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@
5555
validate_percentile,
5656
)
5757

58+
from pandas.core.dtypes.astype import astype_is_view
5859
from pandas.core.dtypes.cast import (
5960
LossySetitemError,
6061
convert_dtypes,
@@ -373,14 +374,14 @@ def __init__(
373374
index=None,
374375
dtype: Dtype | None = None,
375376
name=None,
376-
copy: bool = False,
377+
copy: bool | None = None,
377378
fastpath: bool = False,
378379
) -> None:
379380
if (
380381
isinstance(data, (SingleBlockManager, SingleArrayManager))
381382
and index is None
382383
and dtype is None
383-
and copy is False
384+
and (copy is False or copy is None)
384385
):
385386
if using_copy_on_write():
386387
data = data.copy(deep=False)
@@ -393,6 +394,13 @@ def __init__(
393394
self.name = name
394395
return
395396

397+
if isinstance(data, (ExtensionArray, np.ndarray)):
398+
if copy is not False and using_copy_on_write():
399+
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
400+
data = data.copy()
401+
if copy is None:
402+
copy = False
403+
396404
# we are called internally, so short-circuit
397405
if fastpath:
398406
# data is a ndarray, index is defined
@@ -5855,7 +5863,7 @@ def _construct_result(
58555863
# TODO: result should always be ArrayLike, but this fails for some
58565864
# JSONArray tests
58575865
dtype = getattr(result, "dtype", None)
5858-
out = self._constructor(result, index=self.index, dtype=dtype)
5866+
out = self._constructor(result, index=self.index, dtype=dtype, copy=False)
58595867
out = out.__finalize__(self)
58605868

58615869
# Set the result's name after __finalize__ is called because __finalize__
@@ -5874,7 +5882,7 @@ def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0
58745882
elif isinstance(other, (np.ndarray, list, tuple)):
58755883
if len(other) != len(self):
58765884
raise ValueError("Lengths must be equal")
5877-
other = self._constructor(other, self.index)
5885+
other = self._constructor(other, self.index, copy=False)
58785886
result = self._binop(other, op, level=level, fill_value=fill_value)
58795887
result.name = res_name
58805888
return result

pandas/tests/arrays/categorical/test_replace.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -60,15 +60,15 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
6060
# GH#26988
6161
cat = Categorical(["a", "b"])
6262
expected = Categorical(result)
63-
result = pd.Series(cat).replace(to_replace, value)._values
63+
result = pd.Series(cat, copy=False).replace(to_replace, value)._values
6464

6565
tm.assert_categorical_equal(result, expected)
6666
if to_replace == "b": # the "c" test is supposed to be unchanged
6767
with pytest.raises(AssertionError, match=expected_error_msg):
6868
# ensure non-inplace call does not affect original
6969
tm.assert_categorical_equal(cat, expected)
7070

71-
pd.Series(cat).replace(to_replace, value, inplace=True)
71+
pd.Series(cat, copy=False).replace(to_replace, value, inplace=True)
7272
tm.assert_categorical_equal(cat, expected)
7373

7474

pandas/tests/copy_view/test_astype.py

+5 -1
Original file line number | Diff line number | Diff line change
@@ -200,7 +200,11 @@ def test_astype_arrow_timestamp(using_copy_on_write):
200200
result = df.astype("timestamp[ns][pyarrow]")
201201
if using_copy_on_write:
202202
assert not result._mgr._has_no_reference(0)
203-
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
203+
# TODO(CoW): arrow is not setting copy=False in the Series constructor
204+
# under the hood
205+
assert not np.shares_memory(
206+
get_array(df, "a"), get_array(result, "a")._pa_array
207+
)
204208

205209

206210
def test_convert_dtypes_infer_objects(using_copy_on_write):

pandas/tests/copy_view/test_constructors.py

+35 -3
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def test_series_from_series(dtype, using_copy_on_write):
3030
result = Series(ser, dtype=dtype)
3131

3232
# the shallow copy still shares memory
33-
assert np.shares_memory(ser.values, result.values)
33+
assert np.shares_memory(get_array(ser), get_array(result))
3434

3535
if using_copy_on_write:
3636
assert result._mgr.blocks[0].refs.has_reference()
@@ -40,13 +40,13 @@ def test_series_from_series(dtype, using_copy_on_write):
4040
result.iloc[0] = 0
4141
assert ser.iloc[0] == 1
4242
# mutating triggered a copy-on-write -> no longer shares memory
43-
assert not np.shares_memory(ser.values, result.values)
43+
assert not np.shares_memory(get_array(ser), get_array(result))
4444
else:
4545
# mutating shallow copy does mutate original
4646
result.iloc[0] = 0
4747
assert ser.iloc[0] == 0
4848
# and still shares memory
49-
assert np.shares_memory(ser.values, result.values)
49+
assert np.shares_memory(get_array(ser), get_array(result))
5050

5151
# the same when modifying the parent
5252
result = Series(ser, dtype=dtype)
@@ -90,6 +90,38 @@ def test_series_from_series_with_reindex(using_copy_on_write):
9090
assert not result._mgr.blocks[0].refs.has_reference()
9191

9292

93+
@pytest.mark.parametrize("fastpath", [False, True])
94+
@pytest.mark.parametrize("dtype", [None, "int64"])
95+
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
96+
@pytest.mark.parametrize(
97+
"arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
98+
)
99+
def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
100+
if idx is None or dtype is not None:
101+
fastpath = False
102+
ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath)
103+
ser_orig = ser.copy()
104+
data = getattr(arr, "_data", arr)
105+
if using_copy_on_write:
106+
assert not np.shares_memory(get_array(ser), data)
107+
else:
108+
assert np.shares_memory(get_array(ser), data)
109+
110+
arr[0] = 100
111+
if using_copy_on_write:
112+
tm.assert_series_equal(ser, ser_orig)
113+
else:
114+
expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype)
115+
tm.assert_series_equal(ser, expected)
116+
117+
118+
@pytest.mark.parametrize("copy", [True, False, None])
119+
def test_series_from_array_different_dtype(using_copy_on_write, copy):
120+
arr = np.array([1, 2, 3], dtype="int64")
121+
ser = Series(arr, dtype="int32", copy=copy)
122+
assert not np.shares_memory(get_array(ser), arr)
123+
124+
93125
@pytest.mark.parametrize(
94126
"idx",
95127
[

pandas/tests/extension/base/constructors.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ def test_array_from_scalars(self, data):
2222
assert isinstance(result, type(data))
2323

2424
def test_series_constructor(self, data):
25-
result = pd.Series(data)
25+
result = pd.Series(data, copy=False)
2626
assert result.dtype == data.dtype
2727
assert len(result) == len(data)
2828
if hasattr(result._mgr, "blocks"):

pandas/tests/extension/base/methods.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -271,7 +271,7 @@ def test_fillna_copy_frame(self, data_missing):
271271

272272
def test_fillna_copy_series(self, data_missing):
273273
arr = data_missing.take([1, 1])
274-
ser = pd.Series(arr)
274+
ser = pd.Series(arr, copy=False)
275275
ser_orig = ser.copy()
276276

277277
filled_val = ser[0]

pandas/tests/extension/test_sparse.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -288,7 +288,7 @@ def test_fillna_copy_frame(self, data_missing, using_copy_on_write):
288288

289289
def test_fillna_copy_series(self, data_missing, using_copy_on_write):
290290
arr = data_missing.take([1, 1])
291-
ser = pd.Series(arr)
291+
ser = pd.Series(arr, copy=False)
292292

293293
filled_val = ser[0]
294294
result = ser.fillna(filled_val)

pandas/tests/internals/test_internals.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1364,7 +1364,7 @@ def check_can_hold_element(self, obj, elem, inplace: bool):
13641364

13651365
def check_series_setitem(self, elem, index: Index, inplace: bool):
13661366
arr = index._data.copy()
1367-
ser = Series(arr)
1367+
ser = Series(arr, copy=False)
13681368

13691369
self.check_can_hold_element(ser, elem, inplace)
13701370

pandas/tests/io/formats/test_format.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -2821,7 +2821,7 @@ def __getitem__(self, ix):
28212821
def dtype(self):
28222822
return DtypeStub()
28232823

2824-
series = Series(ExtTypeStub())
2824+
series = Series(ExtTypeStub(), copy=False)
28252825
res = repr(series) # This line crashed before #33770 was fixed.
28262826
expected = "\n".join(
28272827
["0 [False True]", "1 [ True False]", "dtype: DtypeStub"]

pandas/tests/reductions/test_stat_reductions.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ def test_td64_mean(self, box):
7070
tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")
7171

7272
tdarr = tdi._data
73-
obj = box(tdarr)
73+
obj = box(tdarr, copy=False)
7474

7575
result = obj.mean()
7676
expected = np.array(tdarr).mean()

pandas/tests/series/indexing/test_setitem.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -578,7 +578,7 @@ def test_setitem_scalar_into_readonly_backing_data():
578578

579579
array = np.zeros(5)
580580
array.flags.writeable = False # make the array immutable
581-
series = Series(array)
581+
series = Series(array, copy=False)
582582

583583
for n in series.index:
584584
msg = "assignment destination is read-only"
@@ -593,7 +593,7 @@ def test_setitem_slice_into_readonly_backing_data():
593593

594594
array = np.zeros(5)
595595
array.flags.writeable = False # make the array immutable
596-
series = Series(array)
596+
series = Series(array, copy=False)
597597

598598
msg = "assignment destination is read-only"
599599
with pytest.raises(ValueError, match=msg):

pandas/tests/series/test_constructors.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -527,7 +527,7 @@ def test_categorical_sideeffects_free(self):
527527
# however, copy is False by default
528528
# so this WILL change values
529529
cat = Categorical(["a", "b", "c", "a"])
530-
s = Series(cat)
530+
s = Series(cat, copy=False)
531531
assert s.values is cat
532532
s = s.cat.rename_categories([1, 2, 3])
533533
assert s.values is not cat

0 commit comments

Comments
 (0)