Skip to content

Commit bfa7e9f

Browse files
API / CoW: return read-only numpy arrays in .values/to_numpy() (#51082)
1 parent a7b9c56 commit bfa7e9f

26 files changed

+274
-53
lines changed

pandas/core/base.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020

2121
import numpy as np
2222

23+
from pandas._config import using_copy_on_write
24+
2325
from pandas._libs import lib
2426
from pandas._typing import (
2527
Axis,
@@ -589,10 +591,16 @@ def to_numpy(
589591

590592
result = np.asarray(values, dtype=dtype)
591593

592-
if copy and na_value is lib.no_default:
594+
if (copy and na_value is lib.no_default) or (
595+
not copy and using_copy_on_write()
596+
):
593597
if np.shares_memory(self._values[:2], result[:2]):
594598
# Take slices to improve performance of check
595-
result = result.copy()
599+
if using_copy_on_write() and not copy:
600+
result = result.view()
601+
result.flags.writeable = False
602+
else:
603+
result = result.copy()
596604

597605
return result
598606

pandas/core/generic.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1991,7 +1991,13 @@ def empty(self) -> bool_t:
19911991
__array_priority__: int = 1000
19921992

19931993
def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
1994-
return np.asarray(self._values, dtype=dtype)
1994+
values = self._values
1995+
arr = np.asarray(values, dtype=dtype)
1996+
if arr is values and using_copy_on_write():
1997+
# TODO(CoW) also properly handle extension dtypes
1998+
arr = arr.view()
1999+
arr.flags.writeable = False
2000+
return arr
19952001

19962002
@final
19972003
def __array_ufunc__(

pandas/core/internals/blocks.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
import numpy as np
1616

17+
from pandas._config import using_copy_on_write
18+
1719
from pandas._libs import (
1820
internals as libinternals,
1921
lib,
@@ -2592,6 +2594,12 @@ def external_values(values: ArrayLike) -> ArrayLike:
25922594
# NB: for datetime64tz this is different from np.asarray(values), since
25932595
# that returns an object-dtype ndarray of Timestamps.
25942596
# Avoid raising in .astype in casting from dt64tz to dt64
2595-
return values._ndarray
2596-
else:
2597-
return values
2597+
values = values._ndarray
2598+
2599+
if isinstance(values, np.ndarray) and using_copy_on_write():
2600+
values = values.view()
2601+
values.flags.writeable = False
2602+
2603+
# TODO(CoW) we should also mark our ExtensionArrays as read-only
2604+
2605+
return values

pandas/core/internals/managers.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -1713,13 +1713,16 @@ def as_array(
17131713
arr = np.asarray(blk.get_values())
17141714
if dtype:
17151715
arr = arr.astype(dtype, copy=False)
1716+
1717+
if copy:
1718+
arr = arr.copy()
1719+
elif using_copy_on_write():
1720+
arr = arr.view()
1721+
arr.flags.writeable = False
17161722
else:
17171723
arr = self._interleave(dtype=dtype, na_value=na_value)
1718-
# The underlying data was copied within _interleave
1719-
copy = False
1720-
1721-
if copy:
1722-
arr = arr.copy()
1724+
# The underlying data was copied within _interleave, so no need
1725+
# to further copy if copy=True or setting na_value
17231726

17241727
if na_value is not lib.no_default:
17251728
arr[isna(arr)] = na_value

pandas/core/series.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -889,7 +889,13 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
889889
array(['1999-12-31T23:00:00.000000000', ...],
890890
dtype='datetime64[ns]')
891891
"""
892-
return np.asarray(self._values, dtype)
892+
values = self._values
893+
arr = np.asarray(values, dtype=dtype)
894+
if arr is values and using_copy_on_write():
895+
# TODO(CoW) also properly handle extension dtypes
896+
arr = arr.view()
897+
arr.flags.writeable = False
898+
return arr
893899

894900
# ----------------------------------------------------------------------
895901
# Unary Methods

pandas/io/parsers/base_parser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1126,7 +1126,7 @@ def converter(*date_cols, col: Hashable):
11261126
dayfirst=dayfirst,
11271127
errors="ignore",
11281128
cache=cache_dates,
1129-
).to_numpy()
1129+
)._values
11301130
else:
11311131
try:
11321132
result = tools.to_datetime(

pandas/tests/copy_view/test_array.py

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import numpy as np
2+
import pytest
3+
4+
from pandas import (
5+
DataFrame,
6+
Series,
7+
)
8+
import pandas._testing as tm
9+
from pandas.tests.copy_view.util import get_array
10+
11+
# -----------------------------------------------------------------------------
12+
# Copy/view behaviour for accessing underlying array of Series/DataFrame
13+
14+
15+
@pytest.mark.parametrize(
16+
"method",
17+
[lambda ser: ser.values, lambda ser: np.asarray(ser)],
18+
ids=["values", "asarray"],
19+
)
20+
def test_series_values(using_copy_on_write, method):
21+
ser = Series([1, 2, 3], name="name")
22+
ser_orig = ser.copy()
23+
24+
arr = method(ser)
25+
26+
if using_copy_on_write:
27+
# .values still gives a view but is read-only
28+
assert np.shares_memory(arr, get_array(ser, "name"))
29+
assert arr.flags.writeable is False
30+
31+
# mutating series through arr therefore doesn't work
32+
with pytest.raises(ValueError, match="read-only"):
33+
arr[0] = 0
34+
tm.assert_series_equal(ser, ser_orig)
35+
36+
# mutating the series itself still works
37+
ser.iloc[0] = 0
38+
assert ser.values[0] == 0
39+
else:
40+
assert arr.flags.writeable is True
41+
arr[0] = 0
42+
assert ser.iloc[0] == 0
43+
44+
45+
@pytest.mark.parametrize(
46+
"method",
47+
[lambda df: df.values, lambda df: np.asarray(df)],
48+
ids=["values", "asarray"],
49+
)
50+
def test_dataframe_values(using_copy_on_write, using_array_manager, method):
51+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
52+
df_orig = df.copy()
53+
54+
arr = method(df)
55+
56+
if using_copy_on_write:
57+
# .values still gives a view but is read-only
58+
assert np.shares_memory(arr, get_array(df, "a"))
59+
assert arr.flags.writeable is False
60+
61+
# mutating the dataframe through arr therefore doesn't work
62+
with pytest.raises(ValueError, match="read-only"):
63+
arr[0, 0] = 0
64+
tm.assert_frame_equal(df, df_orig)
65+
66+
# mutating the dataframe itself still works
67+
df.iloc[0, 0] = 0
68+
assert df.values[0, 0] == 0
69+
else:
70+
assert arr.flags.writeable is True
71+
arr[0, 0] = 0
72+
if not using_array_manager:
73+
assert df.iloc[0, 0] == 0
74+
else:
75+
tm.assert_frame_equal(df, df_orig)
76+
77+
78+
def test_series_to_numpy(using_copy_on_write):
79+
ser = Series([1, 2, 3], name="name")
80+
ser_orig = ser.copy()
81+
82+
# default: copy=False, no dtype or NAs
83+
arr = ser.to_numpy()
84+
if using_copy_on_write:
85+
# to_numpy still gives a view but is read-only
86+
assert np.shares_memory(arr, get_array(ser, "name"))
87+
assert arr.flags.writeable is False
88+
89+
# mutating series through arr therefore doesn't work
90+
with pytest.raises(ValueError, match="read-only"):
91+
arr[0] = 0
92+
tm.assert_series_equal(ser, ser_orig)
93+
94+
# mutating the series itself still works
95+
ser.iloc[0] = 0
96+
assert ser.values[0] == 0
97+
else:
98+
assert arr.flags.writeable is True
99+
arr[0] = 0
100+
assert ser.iloc[0] == 0
101+
102+
# specifying copy=True gives a writeable array (a fresh copy that does not share memory)
103+
ser = Series([1, 2, 3], name="name")
104+
arr = ser.to_numpy(copy=True)
105+
assert not np.shares_memory(arr, get_array(ser, "name"))
106+
assert arr.flags.writeable is True
107+
108+
# specifying a dtype that already causes a copy also gives a writeable array
109+
ser = Series([1, 2, 3], name="name")
110+
arr = ser.to_numpy(dtype="float64")
111+
assert not np.shares_memory(arr, get_array(ser, "name"))
112+
assert arr.flags.writeable is True

pandas/tests/frame/indexing/test_indexing.py

+13-7
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ def test_setitem2(self):
345345

346346
def test_setitem_boolean(self, float_frame):
347347
df = float_frame.copy()
348-
values = float_frame.values
348+
values = float_frame.values.copy()
349349

350350
df[df["A"] > 0] = 4
351351
values[values[:, 0] > 0] = 4
@@ -381,16 +381,18 @@ def test_setitem_boolean(self, float_frame):
381381
df[df * 0] = 2
382382

383383
# index with DataFrame
384+
df_orig = df.copy()
384385
mask = df > np.abs(df)
385-
expected = df.copy()
386386
df[df > np.abs(df)] = np.nan
387-
expected.values[mask.values] = np.nan
387+
values = df_orig.values.copy()
388+
values[mask.values] = np.nan
389+
expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
388390
tm.assert_frame_equal(df, expected)
389391

390392
# set from DataFrame
391-
expected = df.copy()
392393
df[df > np.abs(df)] = df * 2
393-
np.putmask(expected.values, mask.values, df.values * 2)
394+
np.putmask(values, mask.values, df.values * 2)
395+
expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
394396
tm.assert_frame_equal(df, expected)
395397

396398
def test_setitem_cast(self, float_frame):
@@ -664,16 +666,20 @@ def test_setitem_fancy_boolean(self, float_frame):
664666
# from 2d, set with booleans
665667
frame = float_frame.copy()
666668
expected = float_frame.copy()
669+
values = expected.values.copy()
667670

668671
mask = frame["A"] > 0
669672
frame.loc[mask] = 0.0
670-
expected.values[mask.values] = 0.0
673+
values[mask.values] = 0.0
674+
expected = DataFrame(values, index=expected.index, columns=expected.columns)
671675
tm.assert_frame_equal(frame, expected)
672676

673677
frame = float_frame.copy()
674678
expected = float_frame.copy()
679+
values = expected.values.copy()
675680
frame.loc[mask, ["A", "B"]] = 0.0
676-
expected.values[mask.values, :2] = 0.0
681+
values[mask.values, :2] = 0.0
682+
expected = DataFrame(values, index=expected.index, columns=expected.columns)
677683
tm.assert_frame_equal(frame, expected)
678684

679685
def test_getitem_fancy_ints(self, float_frame):

pandas/tests/frame/indexing/test_insert.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def test_insert_with_columns_dups(self):
7171
)
7272
tm.assert_frame_equal(df, exp)
7373

74-
def test_insert_item_cache(self, using_array_manager):
74+
def test_insert_item_cache(self, using_array_manager, using_copy_on_write):
7575
df = DataFrame(np.random.randn(4, 3))
7676
ser = df[0]
7777

@@ -85,9 +85,14 @@ def test_insert_item_cache(self, using_array_manager):
8585
for n in range(100):
8686
df[n + 3] = df[1] * n
8787

88-
ser.values[0] = 99
89-
90-
assert df.iloc[0, 0] == df[0][0]
88+
if using_copy_on_write:
89+
ser.iloc[0] = 99
90+
assert df.iloc[0, 0] == df[0][0]
91+
assert df.iloc[0, 0] != 99
92+
else:
93+
ser.values[0] = 99
94+
assert df.iloc[0, 0] == df[0][0]
95+
assert df.iloc[0, 0] == 99
9196

9297
def test_insert_EA_no_warning(self):
9398
# PerformanceWarning about fragmented frame should not be raised when

pandas/tests/frame/indexing/test_setitem.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1002,8 +1002,9 @@ def test_setitem_boolean_mask(self, mask_type, float_frame):
10021002
result = df.copy()
10031003
result[mask] = np.nan
10041004

1005-
expected = df.copy()
1006-
expected.values[np.array(mask)] = np.nan
1005+
expected = df.values.copy()
1006+
expected[np.array(mask)] = np.nan
1007+
expected = DataFrame(expected, index=df.index, columns=df.columns)
10071008
tm.assert_frame_equal(result, expected)
10081009

10091010
@pytest.mark.xfail(reason="Currently empty indexers are treated as all False")

pandas/tests/frame/indexing/test_where.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -981,7 +981,7 @@ def test_where_dt64_2d():
981981

982982
df = DataFrame(dta, columns=["A", "B"])
983983

984-
mask = np.asarray(df.isna())
984+
mask = np.asarray(df.isna()).copy()
985985
mask[:, 1] = True
986986

987987
# setting all of one column, none of the other

pandas/tests/frame/methods/test_copy.py

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def test_copy_index_name_checking(self, float_frame, attr):
1818
getattr(cp, attr).name = "foo"
1919
assert getattr(float_frame, attr).name is None
2020

21+
@td.skip_copy_on_write_invalid_test
2122
def test_copy_cache(self):
2223
# GH#31784 _item_cache not cleared on copy causes incorrect reads after updates
2324
df = DataFrame({"a": [1]})

pandas/tests/frame/methods/test_join.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -417,7 +417,7 @@ def test_join(self, multiindex_dataframe_random_data):
417417
b = frame.loc[frame.index[2:], ["B", "C"]]
418418

419419
joined = a.join(b, how="outer").reindex(frame.index)
420-
expected = frame.copy().values
420+
expected = frame.copy().values.copy()
421421
expected[np.isnan(joined.values)] = np.nan
422422
expected = DataFrame(expected, index=frame.index, columns=frame.columns)
423423

pandas/tests/frame/methods/test_quantile.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -766,7 +766,9 @@ def test_quantile_empty_no_columns(self, interp_method):
766766
expected.columns.name = "captain tightpants"
767767
tm.assert_frame_equal(result, expected)
768768

769-
def test_quantile_item_cache(self, using_array_manager, interp_method):
769+
def test_quantile_item_cache(
770+
self, using_array_manager, interp_method, using_copy_on_write
771+
):
770772
# previous behavior incorrectly retained an invalid _item_cache entry
771773
interpolation, method = interp_method
772774
df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"])
@@ -776,9 +778,15 @@ def test_quantile_item_cache(self, using_array_manager, interp_method):
776778
assert len(df._mgr.blocks) == 2
777779

778780
df.quantile(numeric_only=False, interpolation=interpolation, method=method)
779-
ser.values[0] = 99
780781

781-
assert df.iloc[0, 0] == df["A"][0]
782+
if using_copy_on_write:
783+
ser.iloc[0] = 99
784+
assert df.iloc[0, 0] == df["A"][0]
785+
assert df.iloc[0, 0] != 99
786+
else:
787+
ser.values[0] = 99
788+
assert df.iloc[0, 0] == df["A"][0]
789+
assert df.iloc[0, 0] == 99
782790

783791
def test_invalid_method(self):
784792
with pytest.raises(ValueError, match="Invalid method: foo"):

0 commit comments

Comments
 (0)