Skip to content

CoW: Return read-only array in Index.values #53704

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Enhancements
Copy-on-Write improvements
^^^^^^^^^^^^^^^^^^^^^^^^^^

- Calling :meth:`Index.values` will now return a read-only NumPy array (:issue:`53704`)
- Setting a :class:`Series` into a :class:`DataFrame` now creates a lazy instead of a deep copy (:issue:`53142`)

.. _whatsnew_210.enhancements.enhancement2:
Expand Down Expand Up @@ -400,7 +401,7 @@ Strings

Interval
^^^^^^^^
-
- :meth:`pd.IntervalIndex.get_indexer` and :meth:`pd.IntervalIndex.get_indexer_nonunique` raising if ``target`` is a read-only array (:issue:`53703`)
-

Indexing
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/intervaltree.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ cdef class IntervalTree(IntervalMixin):
sort_order = self.left_sorter
return is_monotonic(sort_order, False)[0]

def get_indexer(self, scalar_t[:] target) -> np.ndarray:
def get_indexer(self, ndarray[scalar_t, ndim=1] target) -> np.ndarray:
"""Return the positions corresponding to unique intervals that overlap
with the given array of scalar targets.
"""
Expand Down Expand Up @@ -153,7 +153,7 @@ cdef class IntervalTree(IntervalMixin):
old_len = result.data.n
return result.to_array().astype('intp')

def get_indexer_non_unique(self, scalar_t[:] target):
def get_indexer_non_unique(self, ndarray[scalar_t, ndim=1] target):
"""Return the positions corresponding to intervals that overlap with
the given array of scalar targets. Non-unique positions are repeated.
"""
Expand Down
2 changes: 1 addition & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,7 @@ def index_with_missing(request):
# GH 35538. Use deep copy to avoid illusive bug on np-dev
# GHA pipeline that writes into indices_dict despite copy
ind = indices_dict[request.param].copy(deep=True)
vals = ind.values
vals = ind.values.copy()
if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]:
# For setting missing values in the top level of MultiIndex
vals = ind.tolist()
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_copy_on_write,
)

from pandas._libs import (
NaT,
Expand Down Expand Up @@ -5055,6 +5058,12 @@ def values(self) -> ArrayLike:
>>> idx.values
array([1, 2, 3])
"""
if using_copy_on_write():
data = self._data
if isinstance(data, np.ndarray):
data = data.view()
data.flags.writeable = False
return data
return self._data

@cache_readonly
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import (
NaT,
Timedelta,
Expand Down Expand Up @@ -451,7 +453,11 @@ def _with_freq(self, freq):
@property
def values(self) -> np.ndarray:
# NB: For Datetime64TZ this is lossy
return self._data._ndarray
data = self._data._ndarray
if using_copy_on_write():
data = data.view()
data.flags.writeable = False
return data

@doc(DatetimeIndexOpsMixin.shift)
def shift(self, periods: int = 1, freq=None) -> Self:
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/copy_view/index/test_datetimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,12 @@ def test_datetimeindex_isocalendar(using_copy_on_write):
ser.iloc[0] = Timestamp("2020-12-31")
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)


def test_index_values(using_copy_on_write):
idx = date_range("2019-12-31", periods=3, freq="D")
result = idx.values
if using_copy_on_write:
assert result.flags.writeable is False
else:
assert result.flags.writeable is True
9 changes: 9 additions & 0 deletions pandas/tests/copy_view/index/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,12 @@ def test_infer_objects(using_copy_on_write):
view_.iloc[0, 0] = "aaaa"
if using_copy_on_write:
tm.assert_index_equal(idx, expected, check_names=False)


def test_index_values(using_copy_on_write):
idx = Index([1, 2, 3])
result = idx.values
if using_copy_on_write:
assert result.flags.writeable is False
else:
assert result.flags.writeable is True
17 changes: 13 additions & 4 deletions pandas/tests/copy_view/test_setitem.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pytest

from pandas import (
DataFrame,
Expand Down Expand Up @@ -59,17 +60,25 @@ def test_set_column_with_index(using_copy_on_write):
assert not np.shares_memory(get_array(df, "c"), idx.values)

# and thus modifying the index does not modify the DataFrame
idx.values[0] = 0
tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
if using_copy_on_write:
with pytest.raises(ValueError, match="assignment"):
idx.values[0] = 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This defeats a bit the purpose of the original test (and I don't think we need to test here that assigning into a read-only numpy array gives an error).
So maybe just remove it? (we already check np.shares_memory) Or manually set the writeable flag to True and then assign the value.

But actually, now that we keep track of references to Index data as well, the original setitem doesn't really need to do a copy, I think? (for another issue/PR)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah let's rip it out

else:
idx.values[0] = 0
tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))

idx = RangeIndex(1, 4)
arr = idx.values

df["d"] = idx

assert not np.shares_memory(get_array(df, "d"), arr)
arr[0] = 0
tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d"))
if using_copy_on_write:
with pytest.raises(ValueError, match="assignment"):
arr[0] = 0
else:
arr[0] = 0
tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d"))


def test_set_columns_with_dataframe(using_copy_on_write):
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/indexes/interval/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,18 @@ def test_get_indexer_interval_index(self, box):
expected = np.array([-1, -1, -1], dtype=np.intp)
tm.assert_numpy_array_equal(actual, expected)

def test_get_indexer_read_only(self):
idx = interval_range(start=0, end=5)
arr = np.array([1, 2])
arr.flags.writeable = False
result = idx.get_indexer(arr)
expected = np.array([0, 1])
tm.assert_numpy_array_equal(result, expected, check_dtype=False)

result = idx.get_indexer_non_unique(arr)[0]
expected = np.array([0, 1])
tm.assert_numpy_array_equal(result, expected, check_dtype=False)


class TestSliceLocs:
def test_slice_locs_with_interval(self):
Expand Down
10 changes: 7 additions & 3 deletions pandas/tests/indexes/numeric/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def test_constructor_from_list_no_dtype(self):
index = Index([1, 2, 3])
assert index.dtype == np.int64

def test_constructor(self, dtype):
def test_constructor(self, dtype, using_copy_on_write):
index_cls = Index

# scalar raise Exception
Expand All @@ -347,8 +347,12 @@ def test_constructor(self, dtype):
val = arr[0] + 3000

# this should not change index
arr[0] = val
assert new_index[0] != val
if not using_copy_on_write:
arr[0] = val
assert new_index[0] != val
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly here, we want to test that the copy=True keyword is honored (I assume, based on the comment). So maybe add a copy() to the line creating arr above: arr = index.values.copy(). Then arr is writeable, and the test can be done as normal (ensuring that arr was actually copied when creating the Index.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense

else:
with pytest.raises(ValueError, match="assignment"):
arr[0] = val

if dtype == np.int64:
# pass list, coerce fine
Expand Down
9 changes: 8 additions & 1 deletion pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,8 +433,12 @@ def test_read_columns(self, engine):
df, engine, expected=expected, read_kwargs={"columns": ["string"]}
)

def test_write_index(self, engine):
def test_write_index(self, engine, using_copy_on_write, request):
check_names = engine != "fastparquet"
if using_copy_on_write and engine == "fastparquet":
request.node.add_marker(
pytest.mark.xfail(reason="fastparquet write into index")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does fastparquet do exactly? Does it try to write into the array it gets from the index? Do you know why? (That sounds like a bug in fastparquet, since it could change the dataframe you are writing.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can simply set the flag there; they seem to use the index to do a conversion. It's already on my todo list, but I don't want to block this PR because of that.

)

df = pd.DataFrame({"A": [1, 2, 3]})
check_round_trip(df, engine)
Expand Down Expand Up @@ -1213,12 +1217,14 @@ def test_error_on_using_partition_cols_and_partition_on(
partition_cols=partition_cols,
)

@pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
def test_empty_dataframe(self, fp):
    # GH #27339
    frame = pd.DataFrame()
    check_round_trip(frame, fp, expected=frame.copy())

@pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
def test_timezone_aware_index(self, fp, timezone_aware_date_list):
idx = 5 * [timezone_aware_date_list]

Expand Down Expand Up @@ -1328,6 +1334,7 @@ def test_invalid_dtype_backend(self, engine):
with pytest.raises(ValueError, match=msg):
read_parquet(path, dtype_backend="numpy")

@pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
def test_empty_columns(self, fp):
# GH 52034
df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
Expand Down