CoW: Return read-only array in Index.values (#53704)

phofl · jorisvandenbossche · web-flow · commit ed4ea1a20574 · 2023-06-20T13:35:04.000+02:00
Co-authored-by: Joris Van den Bossche &lt;jorisvandenbossche@gmail.com&gt;
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -19,6 +19,7 @@ Enhancements
 Copy-on-Write improvements
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+- Calling :meth:`Index.values` will now return a read-only NumPy array (:issue:`53704`)
 - Setting a :class:`Series` into a :class:`DataFrame` now creates a lazy instead of a deep copy (:issue:`53142`)
 - The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary
   of Index objects and specifying ``copy=False``, will now use a lazy copy
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -685,7 +685,7 @@ def index_with_missing(request):
     # GH 35538. Use deep copy to avoid illusive bug on np-dev
     # GHA pipeline that writes into indices_dict despite copy
     ind = indices_dict[request.param].copy(deep=True)
-    vals = ind.values
+    vals = ind.values.copy()
     if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]:
         # For setting missing values in the top level of MultiIndex
         vals = ind.tolist()
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -5058,6 +5058,12 @@ def values(self) -> ArrayLike:
         >>> idx.values
         array([1, 2, 3])
         """
+        if using_copy_on_write():
+            data = self._data
+            if isinstance(data, np.ndarray):
+                data = data.view()
+                data.flags.writeable = False
+            return data
         return self._data
 
     @cache_readonly
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -18,6 +18,8 @@
 
 import numpy as np
 
+from pandas._config import using_copy_on_write
+
 from pandas._libs import (
     NaT,
     Timedelta,
@@ -451,7 +453,11 @@ def _with_freq(self, freq):
     @property
     def values(self) -> np.ndarray:
         # NB: For Datetime64TZ this is lossy
-        return self._data._ndarray
+        data = self._data._ndarray
+        if using_copy_on_write():
+            data = data.view()
+            data.flags.writeable = False
+        return data
 
     @doc(DatetimeIndexOpsMixin.shift)
     def shift(self, periods: int = 1, freq=None) -> Self:
diff --git a/pandas/tests/copy_view/index/test_datetimeindex.py b/pandas/tests/copy_view/index/test_datetimeindex.py
@@ -54,3 +54,12 @@ def test_datetimeindex_isocalendar(using_copy_on_write):
     ser.iloc[0] = Timestamp("2020-12-31")
     if using_copy_on_write:
         tm.assert_index_equal(df.index, expected)
+
+
+def test_index_values(using_copy_on_write):
+    idx = date_range("2019-12-31", periods=3, freq="D")
+    result = idx.values
+    if using_copy_on_write:
+        assert result.flags.writeable is False
+    else:
+        assert result.flags.writeable is True
diff --git a/pandas/tests/copy_view/index/test_index.py b/pandas/tests/copy_view/index/test_index.py
@@ -167,3 +167,12 @@ def test_index_to_frame(using_copy_on_write):
 
     df.iloc[0, 0] = 100
     tm.assert_index_equal(idx, expected)
+
+
+def test_index_values(using_copy_on_write):
+    idx = Index([1, 2, 3])
+    result = idx.values
+    if using_copy_on_write:
+        assert result.flags.writeable is False
+    else:
+        assert result.flags.writeable is True
diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py
@@ -58,18 +58,12 @@ def test_set_column_with_index(using_copy_on_write):
     # the index data is copied
     assert not np.shares_memory(get_array(df, "c"), idx.values)
 
-    # and thus modifying the index does not modify the DataFrame
-    idx.values[0] = 0
-    tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
-
     idx = RangeIndex(1, 4)
     arr = idx.values
 
     df["d"] = idx
 
     assert not np.shares_memory(get_array(df, "d"), arr)
-    arr[0] = 0
-    tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d"))
 
 
 def test_set_columns_with_dataframe(using_copy_on_write):
diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py
@@ -341,7 +341,7 @@ def test_constructor(self, dtype):
         # copy
         # pass list, coerce fine
         index = index_cls([-5, 0, 1, 2], dtype=dtype)
-        arr = index.values
+        arr = index.values.copy()
         new_index = index_cls(arr, copy=True)
         tm.assert_index_equal(new_index, index, exact=True)
         val = arr[0] + 3000
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -433,8 +433,12 @@ def test_read_columns(self, engine):
             df, engine, expected=expected, read_kwargs={"columns": ["string"]}
         )
 
-    def test_write_index(self, engine):
+    def test_write_index(self, engine, using_copy_on_write, request):
         check_names = engine != "fastparquet"
+        if using_copy_on_write and engine == "fastparquet":
+            request.node.add_marker(
+                pytest.mark.xfail(reason="fastparquet write into index")
+            )
 
         df = pd.DataFrame({"A": [1, 2, 3]})
         check_round_trip(df, engine)
@@ -1213,12 +1217,14 @@ def test_error_on_using_partition_cols_and_partition_on(
                 partition_cols=partition_cols,
             )
 
+    @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
     def test_empty_dataframe(self, fp):
         # GH #27339
         df = pd.DataFrame()
         expected = df.copy()
         check_round_trip(df, fp, expected=expected)
 
+    @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
     def test_timezone_aware_index(self, fp, timezone_aware_date_list):
         idx = 5 * [timezone_aware_date_list]
 
@@ -1328,6 +1334,7 @@ def test_invalid_dtype_backend(self, engine):
             with pytest.raises(ValueError, match=msg):
                 read_parquet(path, dtype_backend="numpy")
 
+    @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
     def test_empty_columns(self, fp):
         # GH 52034
         df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))