Skip to content

Commit 1c5dd84

Browse files
Backport PR #51731 on branch 2.0.x (API / CoW: Copy NumPy arrays by default in DataFrame constructor) (#52047)
* API / CoW: Copy NumPy arrays by default in DataFrame constructor (#51731) Co-authored-by: Joris Van den Bossche <[email protected]> * Fix test --------- Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent a38514e commit 1c5dd84

File tree

8 files changed

+62
-16
lines changed

8 files changed

+62
-16
lines changed

doc/source/whatsnew/v2.0.0.rst

+7
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,13 @@ Copy-on-Write improvements
190190
of Series objects and specifying ``copy=False``, will now use a lazy copy
191191
of those Series objects for the columns of the DataFrame (:issue:`50777`)
192192

193+
- The :class:`DataFrame` constructor, when constructing from a NumPy array,
194+
will now copy the array by default to avoid mutating the :class:`DataFrame`
195+
when mutating the array. Specify ``copy=False`` to get the old behavior.
196+
When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write
197+
behavior when the NumPy array is modified after creation of the
198+
:class:`DataFrame`.
199+
193200
- Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``)
194201
will now always raise a warning when Copy-on-Write is enabled. In this mode,
195202
chained assignment can never work because we are always setting into a temporary

pandas/core/frame.py

+4
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,10 @@ def __init__(
685685
# INFO(ArrayManager) by default copy the 2D input array to get
686686
# contiguous 1D arrays
687687
copy = True
688+
elif using_copy_on_write() and not isinstance(
689+
data, (Index, DataFrame, Series)
690+
):
691+
copy = True
688692
else:
689693
copy = False
690694

pandas/tests/copy_view/test_constructors.py

+16
Original file line numberDiff line numberDiff line change
@@ -215,3 +215,19 @@ def test_dataframe_from_dict_of_series_with_dtype(index):
215215
df.iloc[0, 0] = 100
216216
arr_after = get_array(df, "a")
217217
assert np.shares_memory(arr_before, arr_after)
218+
219+
220+
@pytest.mark.parametrize("copy", [False, None, True])
221+
def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager):
222+
arr = np.array([[1, 2], [3, 4]])
223+
df = DataFrame(arr, copy=copy)
224+
225+
if (
226+
using_copy_on_write
227+
and copy is not False
228+
or copy is True
229+
or (using_array_manager and copy is None)
230+
):
231+
assert not np.shares_memory(get_array(df, 0), arr)
232+
else:
233+
assert np.shares_memory(get_array(df, 0), arr)

pandas/tests/frame/methods/test_fillna.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,9 @@ def test_fillna_dict_inplace_nonunique_columns(self, using_copy_on_write):
4747
def test_fillna_on_column_view(self, using_copy_on_write):
4848
# GH#46149 avoid unnecessary copies
4949
arr = np.full((40, 50), np.nan)
50-
df = DataFrame(arr)
50+
df = DataFrame(arr, copy=False)
5151

52+
# TODO(CoW): This should raise a chained assignment error
5253
df[0].fillna(-1, inplace=True)
5354
if using_copy_on_write:
5455
assert np.isnan(arr[:, 0]).all()

pandas/tests/frame/methods/test_to_numpy.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,15 @@ def test_to_numpy_dtype(self):
2323
tm.assert_numpy_array_equal(result, expected)
2424

2525
@td.skip_array_manager_invalid_test
26-
def test_to_numpy_copy(self):
26+
def test_to_numpy_copy(self, using_copy_on_write):
2727
arr = np.random.randn(4, 3)
2828
df = DataFrame(arr)
29-
assert df.values.base is arr
30-
assert df.to_numpy(copy=False).base is arr
29+
if using_copy_on_write:
30+
assert df.values.base is not arr
31+
assert df.to_numpy(copy=False).base is df.values.base
32+
else:
33+
assert df.values.base is arr
34+
assert df.to_numpy(copy=False).base is arr
3135
assert df.to_numpy(copy=True).base is not arr
3236

3337
def test_to_numpy_mixed_dtype_to_str(self):

pandas/tests/frame/methods/test_transpose.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def test_transpose_get_view(self, float_frame, using_copy_on_write):
120120
assert (float_frame.values[5:10] == 5).all()
121121

122122
@td.skip_array_manager_invalid_test
123-
def test_transpose_get_view_dt64tzget_view(self):
123+
def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write):
124124
dti = date_range("2016-01-01", periods=6, tz="US/Pacific")
125125
arr = dti._data.reshape(3, 2)
126126
df = DataFrame(arr)
@@ -130,4 +130,7 @@ def test_transpose_get_view_dt64tzget_view(self):
130130
assert result._mgr.nblocks == 1
131131

132132
rtrip = result._mgr.blocks[0].values
133-
assert np.shares_memory(arr._ndarray, rtrip._ndarray)
133+
if using_copy_on_write:
134+
assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray)
135+
else:
136+
assert np.shares_memory(arr._ndarray, rtrip._ndarray)

pandas/tests/frame/methods/test_values.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -230,29 +230,35 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
230230

231231
class TestPrivateValues:
232232
@td.skip_array_manager_invalid_test
233-
def test_private_values_dt64tz(self):
233+
def test_private_values_dt64tz(self, using_copy_on_write):
234234
dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1)
235235

236236
df = DataFrame(dta, columns=["A"])
237237
tm.assert_equal(df._values, dta)
238238

239-
# we have a view
240-
assert np.shares_memory(df._values._ndarray, dta._ndarray)
239+
if using_copy_on_write:
240+
assert not np.shares_memory(df._values._ndarray, dta._ndarray)
241+
else:
242+
# we have a view
243+
assert np.shares_memory(df._values._ndarray, dta._ndarray)
241244

242245
# TimedeltaArray
243246
tda = dta - dta
244247
df2 = df - df
245248
tm.assert_equal(df2._values, tda)
246249

247250
@td.skip_array_manager_invalid_test
248-
def test_private_values_dt64tz_multicol(self):
251+
def test_private_values_dt64tz_multicol(self, using_copy_on_write):
249252
dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2)
250253

251254
df = DataFrame(dta, columns=["A", "B"])
252255
tm.assert_equal(df._values, dta)
253256

254-
# we have a view
255-
assert np.shares_memory(df._values._ndarray, dta._ndarray)
257+
if using_copy_on_write:
258+
assert not np.shares_memory(df._values._ndarray, dta._ndarray)
259+
else:
260+
# we have a view
261+
assert np.shares_memory(df._values._ndarray, dta._ndarray)
256262

257263
# TimedeltaArray
258264
tda = dta - dta

pandas/tests/frame/test_constructors.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -309,14 +309,14 @@ def test_constructor_dtype_nocast_view_2d_array(
309309
def test_1d_object_array_does_not_copy(self):
310310
# https://github.com/pandas-dev/pandas/issues/39272
311311
arr = np.array(["a", "b"], dtype="object")
312-
df = DataFrame(arr)
312+
df = DataFrame(arr, copy=False)
313313
assert np.shares_memory(df.values, arr)
314314

315315
@td.skip_array_manager_invalid_test
316316
def test_2d_object_array_does_not_copy(self):
317317
# https://github.com/pandas-dev/pandas/issues/39272
318318
arr = np.array([["a", "b"], ["c", "d"]], dtype="object")
319-
df = DataFrame(arr)
319+
df = DataFrame(arr, copy=False)
320320
assert np.shares_memory(df.values, arr)
321321

322322
def test_constructor_dtype_list_data(self):
@@ -2107,13 +2107,18 @@ def test_constructor_frame_shallow_copy(self, float_frame):
21072107
cop.index = np.arange(len(cop))
21082108
tm.assert_frame_equal(float_frame, orig)
21092109

2110-
def test_constructor_ndarray_copy(self, float_frame, using_array_manager):
2110+
def test_constructor_ndarray_copy(
2111+
self, float_frame, using_array_manager, using_copy_on_write
2112+
):
21112113
if not using_array_manager:
21122114
arr = float_frame.values.copy()
21132115
df = DataFrame(arr)
21142116

21152117
arr[5] = 5
2116-
assert (df.values[5] == 5).all()
2118+
if using_copy_on_write:
2119+
assert not (df.values[5] == 5).all()
2120+
else:
2121+
assert (df.values[5] == 5).all()
21172122

21182123
df = DataFrame(arr, copy=True)
21192124
arr[6] = 6

0 commit comments

Comments
 (0)