diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 5d72fabedcee8..90f9f4ed464c6 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,6 +32,9 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ +- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called + when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been + updated to work correctly with NumPy >= 2 (:issue:`57739`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 52d7fba8798e6..b6f1412066574 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -668,7 +668,16 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) + if copy is False: + # TODO: By using `zero_copy_only` it may be possible to implement this + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + elif copy is None: + # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`. + copy = False + + return self.to_numpy(dtype=dtype, copy=copy) def __invert__(self) -> Self: # This is a bit wise op for integer types diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7cde4c53cb2f5..99e4cb0545e2d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -579,11 +579,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array( - self, - dtype=dtype, - copy=copy, - ) + # For NumPy 1.x compatibility we cannot use copy=None. 
And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + result = np.asarray(self, dtype=dtype) + else: + result = np.array(self, dtype=dtype) else: # GH8628 (PERF): astype category codes instead of astyping array @@ -1663,7 +1664,7 @@ def __array__( Specifies the the dtype for the array. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. Returns ------- @@ -1686,13 +1687,18 @@ def __array__( >>> np.asarray(cat) array(['a', 'b'], dtype=object) """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + ret = take_nd(self.categories._values, self._codes) - if dtype and np.dtype(dtype) != self.categories.dtype: - return np.asarray(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. - return np.asarray(ret) + + # `take_nd` should already make a copy, so don't force again. + return np.asarray(ret, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a25a698856747..9c821bf0d184e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -359,7 +359,14 @@ def __array__( ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." 
+ ) return np.array(list(self), dtype=object) + + if copy is True: + return np.array(self._ndarray, dtype=dtype) return self._ndarray @overload diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index c58d03fefedb5..3e231fb9f8ecb 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1622,6 +1622,11 @@ def __array__( Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + left = self._left right = self._right mask = self.isna() diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 92ed690e527c7..349d2ec4d3cc9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -581,7 +581,17 @@ def __array__( the array interface, return my values We return an object array here to preserve our scalar values """ - return self.to_numpy(dtype=dtype) + if copy is False: + if not self._hasna: + # special case, here we can simply return the underlying data + return np.array(self._data, dtype=dtype, copy=copy) + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if copy is None: + copy = False # The NumPy copy=False meaning is different here. + return self.to_numpy(dtype=dtype, copy=copy) _HANDLED_TYPES: tuple[type, ...] 
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index aafcd82114b97..9f7238a97d808 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is not None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.array(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 7d0ad74f851f0..ae92e17332c76 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -390,8 +390,19 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: if dtype == "i8": - return self.asi8 - elif dtype == bool: + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + return np.asarray(self.asi8, dtype=dtype) + else: + return np.array(self.asi8, dtype=dtype) + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if dtype == bool: return ~self._isnan # This will raise TypeError for non-object dtypes diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 0c76280e7fdb4..a3db7dc1f93e9 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -547,11 +547,20 @@ def from_spmatrix(cls, data: spmatrix) -> Self: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: - fill_value = self.fill_value - if self.sp_index.ngaps == 0: # Compat for na dtype and int values. 
- return self.sp_values + if copy is True: + return np.array(self.sp_values) + else: + return self.sp_values + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + fill_value = self.fill_value + if dtype is None: # Can NumPy represent this type? # If not, `np.result_type` will raise. We catch that diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 756c431022063..bbd627d4f0d73 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2015,8 +2015,17 @@ def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool | None = None ) -> np.ndarray: values = self._values - arr = np.asarray(values, dtype=dtype) - if astype_is_view(values.dtype, arr.dtype) and self._mgr.is_single_block: + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + + if ( + copy is not True + and astype_is_view(values.dtype, arr.dtype) + and self._mgr.is_single_block + ): # Check if both conversions can be done without a copy if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view( values.dtype, arr.dtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 749a5fea4d513..cf3d1e6a2ee2d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -908,7 +908,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. 
""" - return np.asarray(self._data, dtype=dtype) + if copy is None: + # Note that the `if` branch exists for NumPy 1.x support + return np.asarray(self._data, dtype=dtype) + + return np.array(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ae9b272af9fe9..e6ce00cb714a4 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1391,6 +1391,15 @@ def copy( # type: ignore[override] def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" + if copy is False: + # self.values is always a newly constructed array, so raise. + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if copy is True: + # explicit np.array call to ensure a copy is made and unique objects + # are returned, because self.values is cached + return np.array(self.values, dtype=dtype) return self.values def view(self, cls=None) -> Self: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 959e572b2b35b..0812ba5e6def4 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -258,7 +258,7 @@ def ndarray_to_mgr( # and a subsequent `astype` will not already result in a copy values = np.array(values, copy=True, order="F") else: - values = np.array(values, copy=False) + values = np.asarray(values) values = _ensure_2d(values) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index d83d9715878f8..1d601f36d604a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -842,7 +842,7 @@ def __array__( the dtype is inferred from the data. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. 
Returns ------- @@ -879,8 +879,15 @@ def __array__( dtype='datetime64[ns]') """ values = self._values - arr = np.asarray(values, dtype=dtype) - if astype_is_view(values.dtype, arr.dtype): + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + + if copy is True: + return arr + if copy is False or astype_is_view(values.dtype, arr.dtype): arr = arr.view() arr.flags.writeable = False return arr diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index c35e8204f3437..1b685100e4931 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.sparse import IntIndex +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -480,3 +481,33 @@ def test_zero_sparse_column(): expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2]) tm.assert_frame_equal(result, expected) + + +def test_array_interface(arr_data, arr): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(arr) + tm.assert_numpy_array_equal(result, arr_data) + + # it always gives a copy by default + result_copy1 = np.asarray(arr) + result_copy2 = np.asarray(arr) + assert not np.may_share_memory(result_copy1, result_copy2) + + # or with explicit copy=True + result_copy1 = np.array(arr, copy=True) + result_copy2 = np.array(arr, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
+ return + + # for sparse arrays, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(arr, copy=False) + + # except when there are actually no sparse filled values + arr2 = SparseArray(np.array([1, 2, 3])) + result_nocopy1 = np.array(arr2, copy=False) + result_nocopy2 = np.array(arr2, copy=False) + assert np.may_share_memory(result_nocopy1, result_nocopy2) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 0c8eefab95464..d1ef29b0bf8a0 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1152,9 +1152,17 @@ def test_array_interface(self, arr1d): result = np.asarray(arr, dtype=object) tm.assert_numpy_array_equal(result, expected) + # to int64 gives the underlying representation result = np.asarray(arr, dtype="int64") tm.assert_numpy_array_equal(result, arr.asi8) + result2 = np.asarray(arr, dtype="int64") + assert np.may_share_memory(result, result2) + + result_copy1 = np.array(arr, dtype="int64", copy=True) + result_copy2 = np.array(arr, dtype="int64", copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + # to other dtypes msg = r"float\(\) argument must be a string or a( real)? 
number, not 'Period'" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index d8af7abe83084..888e8628f8664 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -4,6 +4,7 @@ from pandas._config import using_string_dtype from pandas.compat import HAS_PYARROW +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -297,24 +298,27 @@ def test_array_multiindex_raises(): @pytest.mark.parametrize( - "arr, expected", + "arr, expected, zero_copy", [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False), ( pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + False, ), - (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), + (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + False, ), - (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False), # tz-naive datetime ( DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), + True, ), # tz-aware stays tz`-aware ( @@ -329,6 +333,7 @@ def test_array_multiindex_raises(): Timestamp("2000-01-02", tz="US/Central"), ] ), + False, ), # Timedelta ( @@ -337,6 +342,7 @@ def test_array_multiindex_raises(): dtype=np.dtype("m8[ns]"), ), np.array([0, 3600000000000], dtype="m8[ns]"), + True, ), # GH#26406 tz is preserved in Categorical[dt64tz] ( @@ -347,10 +353,11 @@ def test_array_multiindex_raises(): 
Timestamp("2016-01-02", tz="US/Pacific"), ] ), + False, ), ], ) -def test_to_numpy(arr, expected, index_or_series_or_array, request): +def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): box = index_or_series_or_array with tm.assert_produces_warning(None): @@ -362,6 +369,28 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): result = np.asarray(thing) tm.assert_numpy_array_equal(result, expected) + # Additionally, we check the `copy=` semantics for array/asarray + # (these are implemented by us via `__array__`). + result_cp1 = np.array(thing, copy=True) + result_cp2 = np.array(thing, copy=True) + # When called with `copy=True` NumPy/we should ensure a copy was made + assert not np.may_share_memory(result_cp1, result_cp2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + if not zero_copy: + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + # An error is always acceptable for `copy=False` + np.array(thing, copy=False) + + else: + result_nocopy1 = np.array(thing, copy=False) + result_nocopy2 = np.array(thing, copy=False) + # If copy=False was given, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + @pytest.mark.xfail( using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6683c87e2b8fc..79eb64b5a654f 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -71,6 +73,25 @@ def test_array_interface(self, data): expected = 
construct_1d_object_array_from_listlike(list(data)) tm.assert_numpy_array_equal(result, expected) + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + try: + result_nocopy1 = np.array(data, copy=False) + except ValueError: + # An error is always acceptable for `copy=False` + return + + result_nocopy2 = np.array(data, copy=False) + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 4fa48023fbc95..a68c8a06e1d18 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -148,12 +148,20 @@ def __ne__(self, other): return NotImplemented def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." 
+ ) + if dtype is None: dtype = object if dtype == object: # on py38 builds it looks like numpy is inferring to a non-1D array return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.data, dtype=dtype) + return np.asarray(self.data, dtype=dtype, copy=copy) @property def nbytes(self) -> int: diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index f6b10c989326f..347d6b206e3b9 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + import pandas as pd from pandas import ( DataFrame, @@ -16,6 +18,40 @@ def test_to_numpy(idx): tm.assert_numpy_array_equal(result, exp) +def test_array_interface(idx): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(idx) + expected = np.empty((6,), dtype=object) + expected[:] = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] + tm.assert_numpy_array_equal(result, expected) + + # it always gives a copy by default, but the values are cached, so results + # are still sharing memory + result_copy1 = np.asarray(idx) + result_copy2 = np.asarray(idx) + assert np.may_share_memory(result_copy1, result_copy2) + + # with explicit copy=True, then it is an actual copy + result_copy1 = np.array(idx, copy=True) + result_copy2 = np.array(idx, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
+ return + + # for MultiIndex, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(idx, copy=False) + + def test_to_frame(): tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")]