Skip to content

BUG: Fix copy semantics in __array__ #60046

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ enhancement1
Other enhancements
^^^^^^^^^^^^^^^^^^

- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
updated to work correctly with NumPy >= 2 (:issue:`57739`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
-

Expand Down
11 changes: 10 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,16 @@ def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
return self.to_numpy(dtype=dtype)
if copy is False:
# TODO: By using `zero_copy_only` it may be possible to implement this
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
elif copy is None:
# `to_numpy(copy=False)` has the meaning of NumPy `copy=None`.
copy = False

return self.to_numpy(dtype=dtype, copy=copy)

def __invert__(self) -> Self:
# This is a bit wise op for integer types
Expand Down
24 changes: 15 additions & 9 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,11 +579,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
raise ValueError("Cannot convert float NaN to integer")

elif len(self.codes) == 0 or len(self.categories) == 0:
result = np.array(
self,
dtype=dtype,
copy=copy,
)
# For NumPy 1.x compatibility we cannot use copy=None. And
# `copy=False` has the meaning of `copy=None` here:
if not copy:
result = np.asarray(self, dtype=dtype)
else:
result = np.array(self, dtype=dtype)

else:
# GH8628 (PERF): astype category codes instead of astyping array
Expand Down Expand Up @@ -1663,7 +1664,7 @@ def __array__(
Specifies the the dtype for the array.

copy : bool or None, optional
Unused.
See :func:`numpy.asarray`.

Returns
-------
Expand All @@ -1686,13 +1687,18 @@ def __array__(
>>> np.asarray(cat)
array(['a', 'b'], dtype=object)
"""
if copy is False:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If lib.is_range_indexer(self._codes) i.e. the self.categories._values are all unique then, a copy could be avoided?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably, but I left it out, it doesn't seem vital to try and improve it. Also fixed things up, because take_nd should presumably always return a copy.

raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

ret = take_nd(self.categories._values, self._codes)
if dtype and np.dtype(dtype) != self.categories.dtype:
return np.asarray(ret, dtype)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not understand why this is needed. If dtypes match, NumPy should make it a no-op? If dtype is None, it is the same as not passing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, also not sure why this is here

# When we're a Categorical[ExtensionArray], like Interval,
# we need to ensure __array__ gets all the way to an
# ndarray.
return np.asarray(ret)

# `take_nd` should already make a copy, so don't force again.
return np.asarray(ret, dtype=dtype)

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# for binary ops, use our custom dunder methods
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,14 @@ def __array__(
) -> np.ndarray:
# used for Timedelta/DatetimeArray, overwritten by PeriodArray
if is_object_dtype(dtype):
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
return np.array(list(self), dtype=object)

if copy is True:
return np.array(self._ndarray, dtype=dtype)
return self._ndarray

@overload
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1622,6 +1622,11 @@ def __array__(
Return the IntervalArray's data as a numpy array of Interval
objects (with dtype='object')
"""
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

left = self._left
right = self._right
mask = self.isna()
Expand Down
12 changes: 11 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,17 @@ def __array__(
the array interface, return my values
We return an object array here to preserve our scalar values
"""
return self.to_numpy(dtype=dtype)
if copy is False:
if not self._hasna:
# special case, here we can simply return the underlying data
return np.array(self._data, dtype=dtype, copy=copy)
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

if copy is None:
copy = False # The NumPy copy=False meaning is different here.
return self.to_numpy(dtype=dtype, copy=copy)

_HANDLED_TYPES: tuple[type, ...]

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype:
def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
if copy is not None:
# Note: branch avoids `copy=None` for NumPy 1.x support
return np.array(self._ndarray, dtype=dtype, copy=copy)
return np.asarray(self._ndarray, dtype=dtype)

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
Expand Down
15 changes: 13 additions & 2 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,8 +390,19 @@ def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
if dtype == "i8":
return self.asi8
elif dtype == bool:
# For NumPy 1.x compatibility we cannot use copy=None. And
# `copy=False` has the meaning of `copy=None` here:
if not copy:
return np.asarray(self.asi8, dtype=dtype)
else:
return np.array(self.asi8, dtype=dtype)

if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

if dtype == bool:
return ~self._isnan

# This will raise TypeError for non-object dtypes
Expand Down
15 changes: 12 additions & 3 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,11 +547,20 @@ def from_spmatrix(cls, data: spmatrix) -> Self:
def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
fill_value = self.fill_value

if self.sp_index.ngaps == 0:
# Compat for na dtype and int values.
return self.sp_values
if copy is True:
return np.array(self.sp_values)
else:
return self.sp_values

if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

fill_value = self.fill_value

if dtype is None:
# Can NumPy represent this type?
# If not, `np.result_type` will raise. We catch that
Expand Down
13 changes: 11 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2015,8 +2015,17 @@ def __array__(
self, dtype: npt.DTypeLike | None = None, copy: bool | None = None
) -> np.ndarray:
values = self._values
arr = np.asarray(values, dtype=dtype)
if astype_is_view(values.dtype, arr.dtype) and self._mgr.is_single_block:
if copy is None:
# Note: branch avoids `copy=None` for NumPy 1.x support
arr = np.asarray(values, dtype=dtype)
else:
arr = np.array(values, dtype=dtype, copy=copy)

if (
copy is not True
and astype_is_view(values.dtype, arr.dtype)
and self._mgr.is_single_block
):
# Check if both conversions can be done without a copy
if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
values.dtype, arr.dtype
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray:
"""
The array interface, return my values.
"""
return np.asarray(self._data, dtype=dtype)
if copy is None:
# Note, that the if branch exists for NumPy 1.x support
return np.asarray(self._data, dtype=dtype)

return np.array(self._data, dtype=dtype, copy=copy)

def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs):
if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs):
Expand Down
9 changes: 9 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1391,6 +1391,15 @@ def copy( # type: ignore[override]

def __array__(self, dtype=None, copy=None) -> np.ndarray:
"""the array interface, return my values"""
if copy is False:
# self.values is always a newly construct array, so raise.
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
if copy is True:
# explicit np.array call to ensure a copy is made and unique objects
# are returned, because self.values is cached
return np.array(self.values, dtype=dtype)
return self.values

def view(self, cls=None) -> Self:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def ndarray_to_mgr(
# and a subsequent `astype` will not already result in a copy
values = np.array(values, copy=True, order="F")
else:
values = np.array(values, copy=False)
values = np.asarray(values)
values = _ensure_2d(values)

else:
Expand Down
13 changes: 10 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@ def __array__(
the dtype is inferred from the data.

copy : bool or None, optional
Unused.
See :func:`numpy.asarray`.

Returns
-------
Expand Down Expand Up @@ -879,8 +879,15 @@ def __array__(
dtype='datetime64[ns]')
"""
values = self._values
arr = np.asarray(values, dtype=dtype)
if astype_is_view(values.dtype, arr.dtype):
if copy is None:
# Note: branch avoids `copy=None` for NumPy 1.x support
arr = np.asarray(values, dtype=dtype)
else:
arr = np.array(values, dtype=dtype, copy=copy)

if copy is True:
return arr
if copy is False or astype_is_view(values.dtype, arr.dtype):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more issue I noticed while doing the backport: for a similar case in generic.py we changed the copy is False to copy is not True, which I think we should have done here as well.

Will try to come up with a test case that would fail because of this, and do a follow-up tomorrow.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, ouch. This is clearly hard to get right without tests :/.

The release snippet looks fine to me.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It also only matters for the copy=None case, which AFAIU is quite new, and so we didn't cover that when adding the logic here for making the resulting array read-only

Copy link
Contributor Author

@seberg seberg Nov 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking closer, I think this is right? Note that it says copy is False or .... If copy is False we don't have to check whether it is a view (because otherwise it would have errored)?

EDIT: Not sure if it's worth to skip the check though. It seems like it may be more confusing then anything...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, that's a good point. I assume the arr = np.array(values, dtype=dtype, copy=copy) line above will already have errored in case of copy=False if a zero-copy conversion was not possible.
So indeed the or is fine: if copy=False we know arr is always a view, and for copy=None (the one remaining case) we have astype_is_view to check it.

Maybe it's then in generic.py that we could change the copy is not True and .. to copy is False or .. (just to use a similar pattern in both cases)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking once more at it, we actually already have tests for this, because we test that we get a read-only array using np.asarray(ser), which essentially means passing copy=None.

I expanded the test with np.array(ser, copy=False) to also explicitly cover the copy=False case (and ensure this branch setting writeable to False is covered for that case) -> #60191

arr = arr.view()
arr.flags.writeable = False
return arr
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from pandas._libs.sparse import IntIndex
from pandas.compat.numpy import np_version_gt2

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -480,3 +481,33 @@ def test_zero_sparse_column():

expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2])
tm.assert_frame_equal(result, expected)


def test_array_interface(arr_data, arr):
# https://github.com/pandas-dev/pandas/pull/60046
result = np.asarray(arr)
tm.assert_numpy_array_equal(result, arr_data)

# it always gives a copy by default
result_copy1 = np.asarray(arr)
result_copy2 = np.asarray(arr)
assert not np.may_share_memory(result_copy1, result_copy2)

# or with explicit copy=True
result_copy1 = np.array(arr, copy=True)
result_copy2 = np.array(arr, copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

if not np_version_gt2:
# copy=False semantics are only supported in NumPy>=2.
return

# for sparse arrays, copy=False is never allowed
with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
np.array(arr, copy=False)

# except when there are actually no sparse filled values
arr2 = SparseArray(np.array([1, 2, 3]))
result_nocopy1 = np.array(arr2, copy=False)
result_nocopy2 = np.array(arr2, copy=False)
assert np.may_share_memory(result_nocopy1, result_nocopy2)
8 changes: 8 additions & 0 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1152,9 +1152,17 @@ def test_array_interface(self, arr1d):
result = np.asarray(arr, dtype=object)
tm.assert_numpy_array_equal(result, expected)

# to int64 gives the underlying representation
result = np.asarray(arr, dtype="int64")
tm.assert_numpy_array_equal(result, arr.asi8)

result2 = np.asarray(arr, dtype="int64")
assert np.may_share_memory(result, result2)

result_copy1 = np.array(arr, dtype="int64", copy=True)
result_copy2 = np.array(arr, dtype="int64", copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

# to other dtypes
msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'"
with pytest.raises(TypeError, match=msg):
Expand Down
Loading
Loading