
CoW: Return read-only array in Index.values #53704


Merged: 9 commits, Jun 20, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
@@ -19,6 +19,7 @@ Enhancements
Copy-on-Write improvements
^^^^^^^^^^^^^^^^^^^^^^^^^^

- Calling :meth:`Index.values` will now return a read-only NumPy array (:issue:`53704`)
- Setting a :class:`Series` into a :class:`DataFrame` now creates a lazy instead of a deep copy (:issue:`53142`)

.. _whatsnew_210.enhancements.enhancement2:
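For illustration, a minimal sketch of the behaviour this entry describes, assuming pandas 2.x with Copy-on-Write enabled via the mode.copy_on_write option (not part of the diff):

import pandas as pd

pd.set_option("mode.copy_on_write", True)

idx = pd.Index([1, 2, 3])
values = idx.values            # read-only view under Copy-on-Write

print(values.flags.writeable)  # False

try:
    values[0] = 0              # writing through the view is rejected
except ValueError as exc:
    print(exc)                 # "assignment destination is read-only"
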
2 changes: 1 addition & 1 deletion pandas/conftest.py
@@ -685,7 +685,7 @@ def index_with_missing(request):
# GH 35538. Use deep copy to avoid illusive bug on np-dev
# GHA pipeline that writes into indices_dict despite copy
ind = indices_dict[request.param].copy(deep=True)
vals = ind.values
vals = ind.values.copy()
if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]:
# For setting missing values in the top level of MultiIndex
vals = ind.tolist()
11 changes: 10 additions & 1 deletion pandas/core/indexes/base.py
@@ -22,7 +22,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
    get_option,
    using_copy_on_write,
)

from pandas._libs import (
NaT,
@@ -5055,6 +5058,12 @@ def values(self) -> ArrayLike:
>>> idx.values
array([1, 2, 3])
"""
if using_copy_on_write():
    data = self._data
    if isinstance(data, np.ndarray):
        data = data.view()
        data.flags.writeable = False
    return data
return self._data

@cache_readonly
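The read-only flag is set on a fresh view rather than on the stored array, so the Index's own buffer stays writable for internal use. A standalone NumPy sketch of that pattern (illustrative, not pandas code):

import numpy as np

base = np.array([1, 2, 3])

view = base.view()
view.flags.writeable = False    # freeze only the view

assert base.flags.writeable     # the parent array is unaffected
assert not view.flags.writeable

try:
    view[0] = 10
except ValueError as exc:
    print(exc)                  # "assignment destination is read-only"
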
8 changes: 7 additions & 1 deletion pandas/core/indexes/datetimelike.py
@@ -18,6 +18,8 @@

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import (
NaT,
Timedelta,
@@ -451,7 +453,11 @@ def _with_freq(self, freq):
@property
def values(self) -> np.ndarray:
# NB: For Datetime64TZ this is lossy
return self._data._ndarray
data = self._data._ndarray
if using_copy_on_write():
    data = data.view()
    data.flags.writeable = False
return data

@doc(DatetimeIndexOpsMixin.shift)
def shift(self, periods: int = 1, freq=None) -> Self:
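As the NB comment says, .values on a timezone-aware index is lossy; a short sketch of what a caller then sees under Copy-on-Write (assuming the mode.copy_on_write option is enabled):

import pandas as pd

pd.set_option("mode.copy_on_write", True)

dti = pd.date_range("2019-12-31", periods=3, freq="D", tz="UTC")
vals = dti.values

print(vals.dtype)            # datetime64[ns]; the timezone is dropped
print(vals.flags.writeable)  # False
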
9 changes: 9 additions & 0 deletions pandas/tests/copy_view/index/test_datetimeindex.py
@@ -54,3 +54,12 @@ def test_datetimeindex_isocalendar(using_copy_on_write):
ser.iloc[0] = Timestamp("2020-12-31")
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)


def test_index_values(using_copy_on_write):
    idx = date_range("2019-12-31", periods=3, freq="D")
    result = idx.values
    if using_copy_on_write:
        assert result.flags.writeable is False
    else:
        assert result.flags.writeable is True
9 changes: 9 additions & 0 deletions pandas/tests/copy_view/index/test_index.py
@@ -153,3 +153,12 @@ def test_infer_objects(using_copy_on_write):
view_.iloc[0, 0] = "aaaa"
if using_copy_on_write:
tm.assert_index_equal(idx, expected, check_names=False)


def test_index_values(using_copy_on_write):
    idx = Index([1, 2, 3])
    result = idx.values
    if using_copy_on_write:
        assert result.flags.writeable is False
    else:
        assert result.flags.writeable is True
12 changes: 6 additions & 6 deletions pandas/tests/copy_view/test_setitem.py
@@ -58,18 +58,18 @@ def test_set_column_with_index(using_copy_on_write):
# the index data is copied
assert not np.shares_memory(get_array(df, "c"), idx.values)

# and thus modifying the index does not modify the DataFrame
idx.values[0] = 0
tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))

idx = RangeIndex(1, 4)
arr = idx.values

df["d"] = idx

assert not np.shares_memory(get_array(df, "d"), arr)
arr[0] = 0
tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d"))
if using_copy_on_write:
with pytest.raises(ValueError, match="assignment"):
arr[0] = 0
else:
arr[0] = 0
tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d"))


def test_set_columns_with_dataframe(using_copy_on_write):
4 changes: 2 additions & 2 deletions pandas/tests/indexes/numeric/test_numeric.py
@@ -327,7 +327,7 @@ def test_constructor_from_list_no_dtype(self):
index = Index([1, 2, 3])
assert index.dtype == np.int64

def test_constructor(self, dtype):
def test_constructor(self, dtype, using_copy_on_write):
index_cls = Index

# scalar raise Exception
@@ -341,7 +341,7 @@ def test_constructor(self, dtype):
# copy
# pass list, coerce fine
index = index_cls([-5, 0, 1, 2], dtype=dtype)
arr = index.values
arr = index.values.copy()
new_index = index_cls(arr, copy=True)
tm.assert_index_equal(new_index, index, exact=True)
val = arr[0] + 3000
9 changes: 8 additions & 1 deletion pandas/tests/io/test_parquet.py
@@ -433,8 +433,12 @@ def test_read_columns(self, engine):
df, engine, expected=expected, read_kwargs={"columns": ["string"]}
)

def test_write_index(self, engine):
def test_write_index(self, engine, using_copy_on_write, request):
check_names = engine != "fastparquet"
if using_copy_on_write and engine == "fastparquet":
request.node.add_marker(
pytest.mark.xfail(reason="fastparquet write into index")
)

Member: What does fastparquet do exactly? It tries to write into the array it gets from the index? Do you know why? That sounds like a bug in fastparquet, as it can change the DataFrame you are writing.

Member Author: I think we can simply set the flag there; they seem to use the index to do a conversion. It's already on my todo list, but I don't want to block this PR because of that.

df = pd.DataFrame({"A": [1, 2, 3]})
check_round_trip(df, engine)
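Until fastparquet copes with read-only index arrays, a downstream writer can copy defensively before converting; a hypothetical helper sketching one such approach (the name and signature are illustrative, not fastparquet's code and not necessarily the fix the author has in mind):

import numpy as np
import pandas as pd

def writable_index_values(index: pd.Index) -> np.ndarray:
    # Under Copy-on-Write, Index.values is a read-only view of the index
    # buffer; copy before any in-place conversion so the caller's Index
    # is never modified.
    arr = index.values
    if not arr.flags.writeable:
        arr = arr.copy()
    return arr
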
@@ -1213,12 +1217,14 @@ def test_error_on_using_partition_cols_and_partition_on(
partition_cols=partition_cols,
)

@pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
def test_empty_dataframe(self, fp):
# GH #27339
df = pd.DataFrame()
expected = df.copy()
check_round_trip(df, fp, expected=expected)

@pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
def test_timezone_aware_index(self, fp, timezone_aware_date_list):
idx = 5 * [timezone_aware_date_list]

@@ -1328,6 +1334,7 @@ def test_invalid_dtype_backend(self, engine):
with pytest.raises(ValueError, match=msg):
read_parquet(path, dtype_backend="numpy")

@pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
def test_empty_columns(self, fp):
# GH 52034
df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))