Skip to content

API / CoW: return read-only numpy arrays in .values/to_numpy() #51082

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Mar 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import lib
from pandas._typing import (
Axis,
Expand Down Expand Up @@ -592,10 +594,16 @@ def to_numpy(

result = np.asarray(values, dtype=dtype)

if copy and na_value is lib.no_default:
if (copy and na_value is lib.no_default) or (
not copy and using_copy_on_write()
):
if np.shares_memory(self._values[:2], result[:2]):
# Take slices to improve performance of check
result = result.copy()
if using_copy_on_write() and not copy:
result = result.view()
result.flags.writeable = False
else:
result = result.copy()

return result

Expand Down
8 changes: 7 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1991,7 +1991,13 @@ def empty(self) -> bool_t:
__array_priority__: int = 1000

def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
return np.asarray(self._values, dtype=dtype)
values = self._values
arr = np.asarray(values, dtype=dtype)
if arr is values and using_copy_on_write():
# TODO(CoW) also properly handle extension dtypes
arr = arr.view()
arr.flags.writeable = False
return arr

@final
def __array_ufunc__(
Expand Down
14 changes: 11 additions & 3 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import (
internals as libinternals,
lib,
Expand Down Expand Up @@ -2592,6 +2594,12 @@ def external_values(values: ArrayLike) -> ArrayLike:
# NB: for datetime64tz this is different from np.asarray(values), since
# that returns an object-dtype ndarray of Timestamps.
# Avoid raising in .astype in casting from dt64tz to dt64
return values._ndarray
else:
return values
values = values._ndarray

if isinstance(values, np.ndarray) and using_copy_on_write():
values = values.view()
values.flags.writeable = False

# TODO(CoW) we should also mark our ExtensionArrays as read-only

return values
13 changes: 8 additions & 5 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1713,13 +1713,16 @@ def as_array(
arr = np.asarray(blk.get_values())
if dtype:
arr = arr.astype(dtype, copy=False)

if copy:
arr = arr.copy()
elif using_copy_on_write():
arr = arr.view()
arr.flags.writeable = False
else:
arr = self._interleave(dtype=dtype, na_value=na_value)
# The underlying data was copied within _interleave
copy = False

if copy:
arr = arr.copy()
# The underlying data was copied within _interleave, so no need
# to further copy if copy=True or setting na_value

if na_value is not lib.no_default:
arr[isna(arr)] = na_value
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,13 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
array(['1999-12-31T23:00:00.000000000', ...],
dtype='datetime64[ns]')
"""
return np.asarray(self._values, dtype)
values = self._values
arr = np.asarray(values, dtype=dtype)
if arr is values and using_copy_on_write():
# TODO(CoW) also properly handle extension dtypes
arr = arr.view()
arr.flags.writeable = False
return arr

# ----------------------------------------------------------------------
# Unary Methods
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1126,7 +1126,7 @@ def converter(*date_cols, col: Hashable):
dayfirst=dayfirst,
errors="ignore",
cache=cache_dates,
).to_numpy()
)._values
else:
try:
result = tools.to_datetime(
Expand Down
112 changes: 112 additions & 0 deletions pandas/tests/copy_view/test_array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import numpy as np
import pytest

from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array

# -----------------------------------------------------------------------------
# Copy/view behaviour for accessing underlying array of Series/DataFrame


@pytest.mark.parametrize(
"method",
[lambda ser: ser.values, lambda ser: np.asarray(ser)],
ids=["values", "asarray"],
)
def test_series_values(using_copy_on_write, method):
ser = Series([1, 2, 3], name="name")
ser_orig = ser.copy()

arr = method(ser)

if using_copy_on_write:
# .values still gives a view but is read-only
assert np.shares_memory(arr, get_array(ser, "name"))
assert arr.flags.writeable is False

# mutating series through arr therefore doesn't work
with pytest.raises(ValueError, match="read-only"):
arr[0] = 0
tm.assert_series_equal(ser, ser_orig)

# mutating the series itself still works
ser.iloc[0] = 0
assert ser.values[0] == 0
else:
assert arr.flags.writeable is True
arr[0] = 0
assert ser.iloc[0] == 0


@pytest.mark.parametrize(
"method",
[lambda df: df.values, lambda df: np.asarray(df)],
ids=["values", "asarray"],
)
def test_dataframe_values(using_copy_on_write, using_array_manager, method):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df_orig = df.copy()

arr = method(df)

if using_copy_on_write:
# .values still gives a view but is read-only
assert np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is False

# mutating the dataframe through arr therefore doesn't work
with pytest.raises(ValueError, match="read-only"):
arr[0, 0] = 0
tm.assert_frame_equal(df, df_orig)

# mutating the dataframe itself still works
df.iloc[0, 0] = 0
assert df.values[0, 0] == 0
else:
assert arr.flags.writeable is True
arr[0, 0] = 0
if not using_array_manager:
assert df.iloc[0, 0] == 0
else:
tm.assert_frame_equal(df, df_orig)


def test_series_to_numpy(using_copy_on_write):
ser = Series([1, 2, 3], name="name")
ser_orig = ser.copy()

# default: copy=False, no dtype or NAs
arr = ser.to_numpy()
if using_copy_on_write:
# to_numpy still gives a view but is read-only
assert np.shares_memory(arr, get_array(ser, "name"))
assert arr.flags.writeable is False

# mutating series through arr therefore doesn't work
with pytest.raises(ValueError, match="read-only"):
arr[0] = 0
tm.assert_series_equal(ser, ser_orig)

# mutating the series itself still works
ser.iloc[0] = 0
assert ser.values[0] == 0
else:
assert arr.flags.writeable is True
arr[0] = 0
assert ser.iloc[0] == 0

# specifying copy=True gives a writeable array (an independent copy)
ser = Series([1, 2, 3], name="name")
arr = ser.to_numpy(copy=True)
assert not np.shares_memory(arr, get_array(ser, "name"))
assert arr.flags.writeable is True

# specifying a dtype that already causes a copy also gives a writeable array
ser = Series([1, 2, 3], name="name")
arr = ser.to_numpy(dtype="float64")
assert not np.shares_memory(arr, get_array(ser, "name"))
assert arr.flags.writeable is True
20 changes: 13 additions & 7 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ def test_setitem2(self):

def test_setitem_boolean(self, float_frame):
df = float_frame.copy()
values = float_frame.values
values = float_frame.values.copy()

df[df["A"] > 0] = 4
values[values[:, 0] > 0] = 4
Expand Down Expand Up @@ -381,16 +381,18 @@ def test_setitem_boolean(self, float_frame):
df[df * 0] = 2

# index with DataFrame
df_orig = df.copy()
mask = df > np.abs(df)
expected = df.copy()
df[df > np.abs(df)] = np.nan
expected.values[mask.values] = np.nan
values = df_orig.values.copy()
values[mask.values] = np.nan
expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
tm.assert_frame_equal(df, expected)

# set from DataFrame
expected = df.copy()
df[df > np.abs(df)] = df * 2
np.putmask(expected.values, mask.values, df.values * 2)
np.putmask(values, mask.values, df.values * 2)
expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
tm.assert_frame_equal(df, expected)

def test_setitem_cast(self, float_frame):
Expand Down Expand Up @@ -664,16 +666,20 @@ def test_setitem_fancy_boolean(self, float_frame):
# from 2d, set with booleans
frame = float_frame.copy()
expected = float_frame.copy()
values = expected.values.copy()

mask = frame["A"] > 0
frame.loc[mask] = 0.0
expected.values[mask.values] = 0.0
values[mask.values] = 0.0
expected = DataFrame(values, index=expected.index, columns=expected.columns)
tm.assert_frame_equal(frame, expected)

frame = float_frame.copy()
expected = float_frame.copy()
values = expected.values.copy()
frame.loc[mask, ["A", "B"]] = 0.0
expected.values[mask.values, :2] = 0.0
values[mask.values, :2] = 0.0
expected = DataFrame(values, index=expected.index, columns=expected.columns)
tm.assert_frame_equal(frame, expected)

def test_getitem_fancy_ints(self, float_frame):
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/frame/indexing/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_insert_with_columns_dups(self):
)
tm.assert_frame_equal(df, exp)

def test_insert_item_cache(self, using_array_manager):
def test_insert_item_cache(self, using_array_manager, using_copy_on_write):
df = DataFrame(np.random.randn(4, 3))
ser = df[0]

Expand All @@ -85,9 +85,14 @@ def test_insert_item_cache(self, using_array_manager):
for n in range(100):
df[n + 3] = df[1] * n

ser.values[0] = 99

assert df.iloc[0, 0] == df[0][0]
if using_copy_on_write:
ser.iloc[0] = 99
assert df.iloc[0, 0] == df[0][0]
assert df.iloc[0, 0] != 99
else:
ser.values[0] = 99
assert df.iloc[0, 0] == df[0][0]
assert df.iloc[0, 0] == 99

def test_insert_EA_no_warning(self):
# PerformanceWarning about fragmented frame should not be raised when
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1002,8 +1002,9 @@ def test_setitem_boolean_mask(self, mask_type, float_frame):
result = df.copy()
result[mask] = np.nan

expected = df.copy()
expected.values[np.array(mask)] = np.nan
expected = df.values.copy()
expected[np.array(mask)] = np.nan
expected = DataFrame(expected, index=df.index, columns=df.columns)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(reason="Currently empty indexers are treated as all False")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_where.py
Original file line number Diff line number Diff line change
Expand Up @@ -981,7 +981,7 @@ def test_where_dt64_2d():

df = DataFrame(dta, columns=["A", "B"])

mask = np.asarray(df.isna())
mask = np.asarray(df.isna()).copy()
mask[:, 1] = True

# setting all of one column, none of the other
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/methods/test_copy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def test_copy_index_name_checking(self, float_frame, attr):
getattr(cp, attr).name = "foo"
assert getattr(float_frame, attr).name is None

@td.skip_copy_on_write_invalid_test
def test_copy_cache(self):
# GH#31784 _item_cache not cleared on copy causes incorrect reads after updates
df = DataFrame({"a": [1]})
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def test_join(self, multiindex_dataframe_random_data):
b = frame.loc[frame.index[2:], ["B", "C"]]

joined = a.join(b, how="outer").reindex(frame.index)
expected = frame.copy().values
expected = frame.copy().values.copy()
expected[np.isnan(joined.values)] = np.nan
expected = DataFrame(expected, index=frame.index, columns=frame.columns)

Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,7 +766,9 @@ def test_quantile_empty_no_columns(self, interp_method):
expected.columns.name = "captain tightpants"
tm.assert_frame_equal(result, expected)

def test_quantile_item_cache(self, using_array_manager, interp_method):
def test_quantile_item_cache(
self, using_array_manager, interp_method, using_copy_on_write
):
# previous behavior incorrectly retained an invalid _item_cache entry
interpolation, method = interp_method
df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"])
Expand All @@ -776,9 +778,15 @@ def test_quantile_item_cache(self, using_array_manager, interp_method):
assert len(df._mgr.blocks) == 2

df.quantile(numeric_only=False, interpolation=interpolation, method=method)
ser.values[0] = 99

assert df.iloc[0, 0] == df["A"][0]
if using_copy_on_write:
ser.iloc[0] = 99
assert df.iloc[0, 0] == df["A"][0]
assert df.iloc[0, 0] != 99
else:
ser.values[0] = 99
assert df.iloc[0, 0] == df["A"][0]
assert df.iloc[0, 0] == 99

def test_invalid_method(self):
with pytest.raises(ValueError, match="Invalid method: foo"):
Expand Down
Loading