Backport PR #51731 on branch 2.0.x (API / CoW: Copy NumPy arrays by default in DataFrame constructor) (#52047)

phofl · jorisvandenbossche · web-flow · commit 1c5dd8463498 · 2023-03-17T18:18:22.000+01:00
* API / CoW: Copy NumPy arrays by default in DataFrame constructor (#51731) Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com> * Fix test --------- Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -190,6 +190,13 @@ Copy-on-Write improvements
   of Series objects and specifying ``copy=False``, will now use a lazy copy
   of those Series objects for the columns of the DataFrame (:issue:`50777`)
 
+- The :class:`DataFrame` constructor, when constructing from a NumPy array,
+  will now copy the array by default to avoid mutating the :class:`DataFrame`
+  when mutating the array. Specify ``copy=False`` to get the old behavior.
+  When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write
+  behavior when the NumPy array is modified after creation of the
+  :class:`DataFrame`.
+
 - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``)
   will now always raise an warning when Copy-on-Write is enabled. In this mode,
   chained assignment can never work because we are always setting into a temporary
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -685,6 +685,10 @@ def __init__(
                 # INFO(ArrayManager) by default copy the 2D input array to get
                 # contiguous 1D arrays
                 copy = True
+            elif using_copy_on_write() and not isinstance(
+                data, (Index, DataFrame, Series)
+            ):
+                copy = True
             else:
                 copy = False
 
diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
@@ -215,3 +215,19 @@ def test_dataframe_from_dict_of_series_with_dtype(index):
     df.iloc[0, 0] = 100
     arr_after = get_array(df, "a")
     assert np.shares_memory(arr_before, arr_after)
+
+
+@pytest.mark.parametrize("copy", [False, None, True])
+def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager):
+    arr = np.array([[1, 2], [3, 4]])
+    df = DataFrame(arr, copy=copy)
+
+    if (
+        using_copy_on_write
+        and copy is not False
+        or copy is True
+        or (using_array_manager and copy is None)
+    ):
+        assert not np.shares_memory(get_array(df, 0), arr)
+    else:
+        assert np.shares_memory(get_array(df, 0), arr)
diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py
@@ -47,8 +47,9 @@ def test_fillna_dict_inplace_nonunique_columns(self, using_copy_on_write):
     def test_fillna_on_column_view(self, using_copy_on_write):
         # GH#46149 avoid unnecessary copies
         arr = np.full((40, 50), np.nan)
-        df = DataFrame(arr)
+        df = DataFrame(arr, copy=False)
 
+        # TODO(CoW): This should raise a chained assignment error
         df[0].fillna(-1, inplace=True)
         if using_copy_on_write:
             assert np.isnan(arr[:, 0]).all()
diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py
@@ -23,11 +23,15 @@ def test_to_numpy_dtype(self):
         tm.assert_numpy_array_equal(result, expected)
 
     @td.skip_array_manager_invalid_test
-    def test_to_numpy_copy(self):
+    def test_to_numpy_copy(self, using_copy_on_write):
         arr = np.random.randn(4, 3)
         df = DataFrame(arr)
-        assert df.values.base is arr
-        assert df.to_numpy(copy=False).base is arr
+        if using_copy_on_write:
+            assert df.values.base is not arr
+            assert df.to_numpy(copy=False).base is df.values.base
+        else:
+            assert df.values.base is arr
+            assert df.to_numpy(copy=False).base is arr
         assert df.to_numpy(copy=True).base is not arr
 
     def test_to_numpy_mixed_dtype_to_str(self):
diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py
@@ -120,7 +120,7 @@ def test_transpose_get_view(self, float_frame, using_copy_on_write):
             assert (float_frame.values[5:10] == 5).all()
 
     @td.skip_array_manager_invalid_test
-    def test_transpose_get_view_dt64tzget_view(self):
+    def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write):
         dti = date_range("2016-01-01", periods=6, tz="US/Pacific")
         arr = dti._data.reshape(3, 2)
         df = DataFrame(arr)
@@ -130,4 +130,7 @@ def test_transpose_get_view_dt64tzget_view(self):
         assert result._mgr.nblocks == 1
 
         rtrip = result._mgr.blocks[0].values
-        assert np.shares_memory(arr._ndarray, rtrip._ndarray)
+        if using_copy_on_write:
+            assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray)
+        else:
+            assert np.shares_memory(arr._ndarray, rtrip._ndarray)
diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py
@@ -230,29 +230,35 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
 
 class TestPrivateValues:
     @td.skip_array_manager_invalid_test
-    def test_private_values_dt64tz(self):
+    def test_private_values_dt64tz(self, using_copy_on_write):
         dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1)
 
         df = DataFrame(dta, columns=["A"])
         tm.assert_equal(df._values, dta)
 
-        # we have a view
-        assert np.shares_memory(df._values._ndarray, dta._ndarray)
+        if using_copy_on_write:
+            assert not np.shares_memory(df._values._ndarray, dta._ndarray)
+        else:
+            # we have a view
+            assert np.shares_memory(df._values._ndarray, dta._ndarray)
 
         # TimedeltaArray
         tda = dta - dta
         df2 = df - df
         tm.assert_equal(df2._values, tda)
 
     @td.skip_array_manager_invalid_test
-    def test_private_values_dt64tz_multicol(self):
+    def test_private_values_dt64tz_multicol(self, using_copy_on_write):
         dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2)
 
         df = DataFrame(dta, columns=["A", "B"])
         tm.assert_equal(df._values, dta)
 
-        # we have a view
-        assert np.shares_memory(df._values._ndarray, dta._ndarray)
+        if using_copy_on_write:
+            assert not np.shares_memory(df._values._ndarray, dta._ndarray)
+        else:
+            # we have a view
+            assert np.shares_memory(df._values._ndarray, dta._ndarray)
 
         # TimedeltaArray
         tda = dta - dta
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -309,14 +309,14 @@ def test_constructor_dtype_nocast_view_2d_array(
     def test_1d_object_array_does_not_copy(self):
         # https://github.com/pandas-dev/pandas/issues/39272
         arr = np.array(["a", "b"], dtype="object")
-        df = DataFrame(arr)
+        df = DataFrame(arr, copy=False)
         assert np.shares_memory(df.values, arr)
 
     @td.skip_array_manager_invalid_test
     def test_2d_object_array_does_not_copy(self):
         # https://github.com/pandas-dev/pandas/issues/39272
         arr = np.array([["a", "b"], ["c", "d"]], dtype="object")
-        df = DataFrame(arr)
+        df = DataFrame(arr, copy=False)
         assert np.shares_memory(df.values, arr)
 
     def test_constructor_dtype_list_data(self):
@@ -2107,13 +2107,18 @@ def test_constructor_frame_shallow_copy(self, float_frame):
         cop.index = np.arange(len(cop))
         tm.assert_frame_equal(float_frame, orig)
 
-    def test_constructor_ndarray_copy(self, float_frame, using_array_manager):
+    def test_constructor_ndarray_copy(
+        self, float_frame, using_array_manager, using_copy_on_write
+    ):
         if not using_array_manager:
             arr = float_frame.values.copy()
             df = DataFrame(arr)
 
             arr[5] = 5
-            assert (df.values[5] == 5).all()
+            if using_copy_on_write:
+                assert not (df.values[5] == 5).all()
+            else:
+                assert (df.values[5] == 5).all()
 
             df = DataFrame(arr, copy=True)
             arr[6] = 6