Skip to content

Commit eacf032

Browse files
BUG: Fix copy semantics in __array__ (#60046)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent cbf6e42 commit eacf032

File tree

20 files changed

+255
-30
lines changed

20 files changed

+255
-30
lines changed

doc/source/whatsnew/v2.3.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ enhancement1
3232
Other enhancements
3333
^^^^^^^^^^^^^^^^^^
3434

35+
- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
36+
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
37+
updated to work correctly with NumPy >= 2 (:issue:`57739`)
3538
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
3639
-
3740

pandas/core/arrays/arrow/array.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,16 @@ def __array__(
668668
self, dtype: NpDtype | None = None, copy: bool | None = None
669669
) -> np.ndarray:
670670
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
671-
return self.to_numpy(dtype=dtype)
671+
if copy is False:
672+
# TODO: By using `zero_copy_only` it may be possible to implement this
673+
raise ValueError(
674+
"Unable to avoid copy while creating an array as requested."
675+
)
676+
elif copy is None:
677+
# `to_numpy(copy=False)` has the meaning of NumPy `copy=None`.
678+
copy = False
679+
680+
return self.to_numpy(dtype=dtype, copy=copy)
672681

673682
def __invert__(self) -> Self:
674683
# This is a bit wise op for integer types

pandas/core/arrays/categorical.py

+15-9
Original file line numberDiff line numberDiff line change
@@ -579,11 +579,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
579579
raise ValueError("Cannot convert float NaN to integer")
580580

581581
elif len(self.codes) == 0 or len(self.categories) == 0:
582-
result = np.array(
583-
self,
584-
dtype=dtype,
585-
copy=copy,
586-
)
582+
# For NumPy 1.x compatibility we cannot use copy=None. And
583+
# `copy=False` has the meaning of `copy=None` here:
584+
if not copy:
585+
result = np.asarray(self, dtype=dtype)
586+
else:
587+
result = np.array(self, dtype=dtype)
587588

588589
else:
589590
# GH8628 (PERF): astype category codes instead of astyping array
@@ -1663,7 +1664,7 @@ def __array__(
16631664
Specifies the dtype for the array.
16641665
16651666
copy : bool or None, optional
1666-
Unused.
1667+
See :func:`numpy.asarray`.
16671668
16681669
Returns
16691670
-------
@@ -1686,13 +1687,18 @@ def __array__(
16861687
>>> np.asarray(cat)
16871688
array(['a', 'b'], dtype=object)
16881689
"""
1690+
if copy is False:
1691+
raise ValueError(
1692+
"Unable to avoid copy while creating an array as requested."
1693+
)
1694+
16891695
ret = take_nd(self.categories._values, self._codes)
1690-
if dtype and np.dtype(dtype) != self.categories.dtype:
1691-
return np.asarray(ret, dtype)
16921696
# When we're a Categorical[ExtensionArray], like Interval,
16931697
# we need to ensure __array__ gets all the way to an
16941698
# ndarray.
1695-
return np.asarray(ret)
1699+
1700+
# `take_nd` should already make a copy, so don't force again.
1701+
return np.asarray(ret, dtype=dtype)
16961702

16971703
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
16981704
# for binary ops, use our custom dunder methods

pandas/core/arrays/datetimelike.py

+7
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,14 @@ def __array__(
359359
) -> np.ndarray:
360360
# used for Timedelta/DatetimeArray, overwritten by PeriodArray
361361
if is_object_dtype(dtype):
362+
if copy is False:
363+
raise ValueError(
364+
"Unable to avoid copy while creating an array as requested."
365+
)
362366
return np.array(list(self), dtype=object)
367+
368+
if copy is True:
369+
return np.array(self._ndarray, dtype=dtype)
363370
return self._ndarray
364371

365372
@overload

pandas/core/arrays/interval.py

+5
Original file line numberDiff line numberDiff line change
@@ -1622,6 +1622,11 @@ def __array__(
16221622
Return the IntervalArray's data as a numpy array of Interval
16231623
objects (with dtype='object')
16241624
"""
1625+
if copy is False:
1626+
raise ValueError(
1627+
"Unable to avoid copy while creating an array as requested."
1628+
)
1629+
16251630
left = self._left
16261631
right = self._right
16271632
mask = self.isna()

pandas/core/arrays/masked.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,17 @@ def __array__(
581581
the array interface, return my values
582582
We return an object array here to preserve our scalar values
583583
"""
584-
return self.to_numpy(dtype=dtype)
584+
if copy is False:
585+
if not self._hasna:
586+
# special case, here we can simply return the underlying data
587+
return np.array(self._data, dtype=dtype, copy=copy)
588+
raise ValueError(
589+
"Unable to avoid copy while creating an array as requested."
590+
)
591+
592+
if copy is None:
593+
copy = False # The NumPy copy=False meaning is different here.
594+
return self.to_numpy(dtype=dtype, copy=copy)
585595

586596
_HANDLED_TYPES: tuple[type, ...]
587597

pandas/core/arrays/numpy_.py

+3
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype:
150150
def __array__(
151151
self, dtype: NpDtype | None = None, copy: bool | None = None
152152
) -> np.ndarray:
153+
if copy is not None:
154+
# Note: branch avoids `copy=None` for NumPy 1.x support
155+
return np.array(self._ndarray, dtype=dtype, copy=copy)
153156
return np.asarray(self._ndarray, dtype=dtype)
154157

155158
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):

pandas/core/arrays/period.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -390,8 +390,19 @@ def __array__(
390390
self, dtype: NpDtype | None = None, copy: bool | None = None
391391
) -> np.ndarray:
392392
if dtype == "i8":
393-
return self.asi8
394-
elif dtype == bool:
393+
# For NumPy 1.x compatibility we cannot use copy=None. And
394+
# `copy=False` has the meaning of `copy=None` here:
395+
if not copy:
396+
return np.asarray(self.asi8, dtype=dtype)
397+
else:
398+
return np.array(self.asi8, dtype=dtype)
399+
400+
if copy is False:
401+
raise ValueError(
402+
"Unable to avoid copy while creating an array as requested."
403+
)
404+
405+
if dtype == bool:
395406
return ~self._isnan
396407

397408
# This will raise TypeError for non-object dtypes

pandas/core/arrays/sparse/array.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -547,11 +547,20 @@ def from_spmatrix(cls, data: spmatrix) -> Self:
547547
def __array__(
548548
self, dtype: NpDtype | None = None, copy: bool | None = None
549549
) -> np.ndarray:
550-
fill_value = self.fill_value
551-
552550
if self.sp_index.ngaps == 0:
553551
# Compat for na dtype and int values.
554-
return self.sp_values
552+
if copy is True:
553+
return np.array(self.sp_values)
554+
else:
555+
return self.sp_values
556+
557+
if copy is False:
558+
raise ValueError(
559+
"Unable to avoid copy while creating an array as requested."
560+
)
561+
562+
fill_value = self.fill_value
563+
555564
if dtype is None:
556565
# Can NumPy represent this type?
557566
# If not, `np.result_type` will raise. We catch that

pandas/core/generic.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -2015,8 +2015,17 @@ def __array__(
20152015
self, dtype: npt.DTypeLike | None = None, copy: bool | None = None
20162016
) -> np.ndarray:
20172017
values = self._values
2018-
arr = np.asarray(values, dtype=dtype)
2019-
if astype_is_view(values.dtype, arr.dtype) and self._mgr.is_single_block:
2018+
if copy is None:
2019+
# Note: branch avoids `copy=None` for NumPy 1.x support
2020+
arr = np.asarray(values, dtype=dtype)
2021+
else:
2022+
arr = np.array(values, dtype=dtype, copy=copy)
2023+
2024+
if (
2025+
copy is not True
2026+
and astype_is_view(values.dtype, arr.dtype)
2027+
and self._mgr.is_single_block
2028+
):
20202029
# Check if both conversions can be done without a copy
20212030
if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
20222031
values.dtype, arr.dtype

pandas/core/indexes/base.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray:
908908
"""
909909
The array interface, return my values.
910910
"""
911-
return np.asarray(self._data, dtype=dtype)
911+
if copy is None:
912+
# Note that the if branch exists for NumPy 1.x support
913+
return np.asarray(self._data, dtype=dtype)
914+
915+
return np.array(self._data, dtype=dtype, copy=copy)
912916

913917
def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs):
914918
if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs):

pandas/core/indexes/multi.py

+9
Original file line numberDiff line numberDiff line change
@@ -1391,6 +1391,15 @@ def copy( # type: ignore[override]
13911391

13921392
def __array__(self, dtype=None, copy=None) -> np.ndarray:
13931393
"""the array interface, return my values"""
1394+
if copy is False:
1395+
# self.values is always a newly constructed array, so raise.
1396+
raise ValueError(
1397+
"Unable to avoid copy while creating an array as requested."
1398+
)
1399+
if copy is True:
1400+
# explicit np.array call to ensure a copy is made and unique objects
1401+
# are returned, because self.values is cached
1402+
return np.array(self.values, dtype=dtype)
13941403
return self.values
13951404

13961405
def view(self, cls=None) -> Self:

pandas/core/internals/construction.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ def ndarray_to_mgr(
258258
# and a subsequent `astype` will not already result in a copy
259259
values = np.array(values, copy=True, order="F")
260260
else:
261-
values = np.array(values, copy=False)
261+
values = np.asarray(values)
262262
values = _ensure_2d(values)
263263

264264
else:

pandas/core/series.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -842,7 +842,7 @@ def __array__(
842842
the dtype is inferred from the data.
843843
844844
copy : bool or None, optional
845-
Unused.
845+
See :func:`numpy.asarray`.
846846
847847
Returns
848848
-------
@@ -879,8 +879,15 @@ def __array__(
879879
dtype='datetime64[ns]')
880880
"""
881881
values = self._values
882-
arr = np.asarray(values, dtype=dtype)
883-
if astype_is_view(values.dtype, arr.dtype):
882+
if copy is None:
883+
# Note: branch avoids `copy=None` for NumPy 1.x support
884+
arr = np.asarray(values, dtype=dtype)
885+
else:
886+
arr = np.array(values, dtype=dtype, copy=copy)
887+
888+
if copy is True:
889+
return arr
890+
if copy is False or astype_is_view(values.dtype, arr.dtype):
884891
arr = arr.view()
885892
arr.flags.writeable = False
886893
return arr

pandas/tests/arrays/sparse/test_array.py

+31
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pytest
55

66
from pandas._libs.sparse import IntIndex
7+
from pandas.compat.numpy import np_version_gt2
78

89
import pandas as pd
910
from pandas import (
@@ -480,3 +481,33 @@ def test_zero_sparse_column():
480481

481482
expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2])
482483
tm.assert_frame_equal(result, expected)
484+
485+
486+
def test_array_interface(arr_data, arr):
487+
# https://github.com/pandas-dev/pandas/pull/60046
488+
result = np.asarray(arr)
489+
tm.assert_numpy_array_equal(result, arr_data)
490+
491+
# it always gives a copy by default
492+
result_copy1 = np.asarray(arr)
493+
result_copy2 = np.asarray(arr)
494+
assert not np.may_share_memory(result_copy1, result_copy2)
495+
496+
# or with explicit copy=True
497+
result_copy1 = np.array(arr, copy=True)
498+
result_copy2 = np.array(arr, copy=True)
499+
assert not np.may_share_memory(result_copy1, result_copy2)
500+
501+
if not np_version_gt2:
502+
# copy=False semantics are only supported in NumPy>=2.
503+
return
504+
505+
# for sparse arrays, copy=False is never allowed
506+
with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
507+
np.array(arr, copy=False)
508+
509+
# except when there are actually no sparse filled values
510+
arr2 = SparseArray(np.array([1, 2, 3]))
511+
result_nocopy1 = np.array(arr2, copy=False)
512+
result_nocopy2 = np.array(arr2, copy=False)
513+
assert np.may_share_memory(result_nocopy1, result_nocopy2)

pandas/tests/arrays/test_datetimelike.py

+8
Original file line numberDiff line numberDiff line change
@@ -1152,9 +1152,17 @@ def test_array_interface(self, arr1d):
11521152
result = np.asarray(arr, dtype=object)
11531153
tm.assert_numpy_array_equal(result, expected)
11541154

1155+
# to int64 gives the underlying representation
11551156
result = np.asarray(arr, dtype="int64")
11561157
tm.assert_numpy_array_equal(result, arr.asi8)
11571158

1159+
result2 = np.asarray(arr, dtype="int64")
1160+
assert np.may_share_memory(result, result2)
1161+
1162+
result_copy1 = np.array(arr, dtype="int64", copy=True)
1163+
result_copy2 = np.array(arr, dtype="int64", copy=True)
1164+
assert not np.may_share_memory(result_copy1, result_copy2)
1165+
11581166
# to other dtypes
11591167
msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'"
11601168
with pytest.raises(TypeError, match=msg):

0 commit comments

Comments
 (0)