From 1c5195c04bd20420bb8e6a50c879ed5f06b4eea2 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 15 Oct 2024 12:57:40 +0200 Subject: [PATCH 01/15] BUG: Fix copy semantics in ``__array__`` This fixes the semantics of ``__array__``. While rejecting ``copy=False`` is pretty harmless, ``copy=True`` should never have been ignored and is dangerous. --- pandas/core/arrays/arrow/array.py | 10 +++++++++- pandas/core/arrays/categorical.py | 14 +++++++++++--- pandas/core/arrays/datetimelike.py | 7 +++++++ pandas/core/arrays/interval.py | 5 +++++ pandas/core/arrays/masked.py | 9 ++++++++- pandas/core/arrays/numpy_.py | 3 +++ pandas/core/arrays/period.py | 5 +++++ pandas/core/arrays/sparse/array.py | 5 +++++ pandas/core/generic.py | 13 +++++++++++-- pandas/core/indexes/base.py | 6 +++++- pandas/core/indexes/multi.py | 3 +++ pandas/core/internals/construction.py | 2 +- pandas/core/series.py | 13 ++++++++++--- pandas/tests/extension/json/array.py | 10 +++++++++- 14 files changed, 92 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 619e7b3ccfb4f..940d08216a963 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -667,7 +667,15 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) + if copy is False: + # TODO: By using `zero_copy_only` it may be possible to implement this + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + elif copy is None: + copy = False # The NumPy copy=False meaning is different here. 
+ + return self.to_numpy(dtype=dtype, copy=copy) def __invert__(self) -> Self: # This is a bit wise op for integer types diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7cde4c53cb2f5..21b32b1c6fa89 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1663,7 +1663,7 @@ def __array__( Specifies the the dtype for the array. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. Returns ------- @@ -1686,13 +1686,21 @@ def __array__( >>> np.asarray(cat) array(['a', 'b'], dtype=object) """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + # TODO: using asarray_func because NumPy 1.x doesn't support copy=None + asarray_func = np.asarray if copy is None else np.array + ret = take_nd(self.categories._values, self._codes) if dtype and np.dtype(dtype) != self.categories.dtype: - return np.asarray(ret, dtype) + return asarray_func(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. - return np.asarray(ret) + return asarray_func(ret) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a25a698856747..9c821bf0d184e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -359,7 +359,14 @@ def __array__( ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." 
+ ) return np.array(list(self), dtype=object) + + if copy is True: + return np.array(self._ndarray, dtype=dtype) return self._ndarray @overload diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2ac9c77bef322..f6b815a874c10 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1606,6 +1606,11 @@ def __array__( Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + left = self._left right = self._right mask = self.isna() diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 92ed690e527c7..1be7787d67c93 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -581,7 +581,14 @@ def __array__( the array interface, return my values We return an object array here to preserve our scalar values """ - return self.to_numpy(dtype=dtype) + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if copy is None: + copy = False # The NumPy copy=False meaning is different here. + return self.to_numpy(dtype=dtype, copy=copy) _HANDLED_TYPES: tuple[type, ...] 
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index aafcd82114b97..e8b33937ca866 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is not None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 7d0ad74f851f0..b2a56b5ca2df3 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -389,6 +389,11 @@ def freqstr(self) -> str: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if dtype == "i8": return self.asi8 elif dtype == bool: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 0c76280e7fdb4..c1b3605c4e5b5 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -547,6 +547,11 @@ def from_spmatrix(cls, data: spmatrix) -> Self: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." 
+ ) + fill_value = self.fill_value if self.sp_index.ngaps == 0: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 42516f0a85e07..02fbb08a14804 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2015,8 +2015,17 @@ def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool | None = None ) -> np.ndarray: values = self._values - arr = np.asarray(values, dtype=dtype) - if astype_is_view(values.dtype, arr.dtype) and self._mgr.is_single_block: + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.asarray(values, dtype=dtype, copy=copy) + + if ( + copy is not False + and astype_is_view(values.dtype, arr.dtype) + and self._mgr.is_single_block + ): # Check if both conversions can be done without a copy if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view( values.dtype, arr.dtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 749a5fea4d513..5fdf073b9e8c9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -908,7 +908,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. 
""" - return np.asarray(self._data, dtype=dtype) + if copy is None: + # Note, that the if branch exists for NumPy 1.x support + return np.asarray(self._data, dtype=dtype) + + return np.asarray(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ae9b272af9fe9..55f96ce777d09 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1391,6 +1391,9 @@ def copy( # type: ignore[override] def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" + if copy is True: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.values, dtype=dtype, copy=copy) return self.values def view(self, cls=None) -> Self: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 959e572b2b35b..0812ba5e6def4 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -258,7 +258,7 @@ def ndarray_to_mgr( # and a subsequent `astype` will not already result in a copy values = np.array(values, copy=True, order="F") else: - values = np.array(values, copy=False) + values = np.asarray(values) values = _ensure_2d(values) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index fe2bb0b5aa5c3..2270e0966a505 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -842,7 +842,7 @@ def __array__( the dtype is inferred from the data. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. 
Returns ------- @@ -879,8 +879,15 @@ def __array__( dtype='datetime64[ns]') """ values = self._values - arr = np.asarray(values, dtype=dtype) - if astype_is_view(values.dtype, arr.dtype): + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.asarray(values, dtype=dtype, copy=copy) + + if copy is True: + return arr + if copy is False or astype_is_view(values.dtype, arr.dtype): arr = arr.view() arr.flags.writeable = False return arr diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 4fa48023fbc95..a68c8a06e1d18 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -148,12 +148,20 @@ def __ne__(self, other): return NotImplemented def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if dtype is None: dtype = object if dtype == object: # on py38 builds it looks like numpy is inferring to a non-1D array return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.data, dtype=dtype) + return np.asarray(self.data, dtype=dtype, copy=copy) @property def nbytes(self) -> int: From 2183861e1addba9c883ed5e206db0450aa161d3b Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 15 Oct 2024 13:34:39 +0200 Subject: [PATCH 02/15] BUG: Fix one more path not translating ``copy=`` correctly --- pandas/core/arrays/categorical.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 21b32b1c6fa89..c29466b13f6ff 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -579,10 +579,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: 
raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array( + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + asarray_func = np.array if copy else np.asarray + result = asarray_func( self, dtype=dtype, - copy=copy, ) else: From 404827b81381439626180fd9a690776d5eacc337 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 15 Oct 2024 14:16:38 +0200 Subject: [PATCH 03/15] BUG: Avoid asarray with copy= (it was added in 2.0) --- pandas/core/arrays/numpy_.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/indexes/base.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index e8b33937ca866..9f7238a97d808 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -152,7 +152,7 @@ def __array__( ) -> np.ndarray: if copy is not None: # Note: branch avoids `copy=None` for NumPy 1.x support - return np.asarray(self._ndarray, dtype=dtype, copy=copy) + return np.array(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 02fbb08a14804..0a43556496973 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2019,7 +2019,7 @@ def __array__( # Note: branch avoids `copy=None` for NumPy 1.x support arr = np.asarray(values, dtype=dtype) else: - arr = np.asarray(values, dtype=dtype, copy=copy) + arr = np.array(values, dtype=dtype, copy=copy) if ( copy is not False diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5fdf073b9e8c9..cf3d1e6a2ee2d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -912,7 +912,7 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: # Note, that the if branch exists for NumPy 
1.x support return np.asarray(self._data, dtype=dtype) - return np.asarray(self._data, dtype=dtype, copy=copy) + return np.array(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): From ec08728216a172f518e230ccc637e44cf0a65582 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 15 Oct 2024 14:44:13 +0200 Subject: [PATCH 04/15] More fixes found by typing checks (or working around them) --- pandas/core/arrays/categorical.py | 20 +++++++++----------- pandas/core/indexes/multi.py | 2 +- pandas/core/series.py | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c29466b13f6ff..ac64c57bece9f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -581,11 +581,10 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: elif len(self.codes) == 0 or len(self.categories) == 0: # For NumPy 1.x compatibility we cannot use copy=None. And # `copy=False` has the meaning of `copy=None` here: - asarray_func = np.array if copy else np.asarray - result = asarray_func( - self, - dtype=dtype, - ) + if not copy: + result = np.asarray(self, dtype=dtype) + else: + result = np.array(self, dtype=dtype) else: # GH8628 (PERF): astype category codes instead of astyping array @@ -1693,16 +1692,15 @@ def __array__( "Unable to avoid copy while creating an array as requested." ) - # TODO: using asarray_func because NumPy 1.x doesn't support copy=None - asarray_func = np.asarray if copy is None else np.array - ret = take_nd(self.categories._values, self._codes) - if dtype and np.dtype(dtype) != self.categories.dtype: - return asarray_func(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. 
- return asarray_func(ret) + + if copy is None: + # Branch required since copy=None is not defined on 1.x + return np.asarray(ret, dtype=dtype) + return np.array(ret, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 55f96ce777d09..f21ab28061e9d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1393,7 +1393,7 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" if copy is True: # Note: branch avoids `copy=None` for NumPy 1.x support - return np.asarray(self.values, dtype=dtype, copy=copy) + return np.array(self.values, dtype=dtype, copy=copy) return self.values def view(self, cls=None) -> Self: diff --git a/pandas/core/series.py b/pandas/core/series.py index 2270e0966a505..97193c7b564e9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -883,7 +883,7 @@ def __array__( # Note: branch avoids `copy=None` for NumPy 1.x support arr = np.asarray(values, dtype=dtype) else: - arr = np.asarray(values, dtype=dtype, copy=copy) + arr = np.array(values, dtype=dtype, copy=copy) if copy is True: return arr From 4ac6323be3f04c2613eb2113f28f437c62fff31d Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 30 Oct 2024 15:50:17 +0100 Subject: [PATCH 05/15] TST: Add test for __array__ copy behavior --- pandas/tests/base/test_conversion.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index d8af7abe83084..386660ddbb227 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -4,6 +4,7 @@ from pandas._config import using_string_dtype from pandas.compat import HAS_PYARROW +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -362,6 
+363,27 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): result = np.asarray(thing) tm.assert_numpy_array_equal(result, expected) + # Additionally, we check the `copy=` semantics for asarray + # (these are implemented by us via `__array__`). + result_cp1 = np.asarray(thing, copy=True) + result_cp2 = np.asarray(thing, copy=True) + # When called with `copy=True` NumPy/we should ensure a copy was made + assert not np.may_share_memory(result_cp1, result_cp2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + try: + result_nocopy1 = np.asarray(thing, copy=False) + except ValueError: + # An error is always acceptable for `copy=False` + return + + result_nocopy2 = np.asarray(thing, copy=False) + # If copy=False was given, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + @pytest.mark.xfail( using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False From 9b6c209f91f6c51328cf4492038a7e1efd88ccf2 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 30 Oct 2024 16:32:31 +0100 Subject: [PATCH 06/15] TST: Fixup test to use array rather than asarray asarray did not support `copy=` on older versions of NumPy --- pandas/tests/base/test_conversion.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 386660ddbb227..90b33bfc408ba 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -363,10 +363,10 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): result = np.asarray(thing) tm.assert_numpy_array_equal(result, expected) - # Additionally, we check the `copy=` semantics for asarray + # Additionally, we check the `copy=` semantics for array/asarray # (these are implemented by us via `__array__`). 
- result_cp1 = np.asarray(thing, copy=True) - result_cp2 = np.asarray(thing, copy=True) + result_cp1 = np.array(thing, copy=True) + result_cp2 = np.array(thing, copy=True) # When called with `copy=True` NumPy/we should ensure a copy was made assert not np.may_share_memory(result_cp1, result_cp2) @@ -375,12 +375,12 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): return try: - result_nocopy1 = np.asarray(thing, copy=False) + result_nocopy1 = np.array(thing, copy=False) except ValueError: # An error is always acceptable for `copy=False` return - result_nocopy2 = np.asarray(thing, copy=False) + result_nocopy2 = np.array(thing, copy=False) # If copy=False was given, these must share the same data assert np.may_share_memory(result_nocopy1, result_nocopy2) From 77058df6b7a58fe61c05502047c4b04161ab22e3 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 4 Nov 2024 12:03:35 +0100 Subject: [PATCH 07/15] BUG: Fixup ``__array__`` copy paths based on review A few of these were just wrong, a few others are enhancements to allow the cases that clearly should work without copy to still pass. Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/arrow/array.py | 3 ++- pandas/core/arrays/categorical.py | 6 ++---- pandas/core/arrays/period.py | 12 +++++++++--- pandas/core/arrays/sparse/array.py | 10 +++++++--- pandas/core/generic.py | 2 +- pandas/core/indexes/multi.py | 8 +++++--- 6 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 940d08216a963..375d15a627e47 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -673,7 +673,8 @@ def __array__( "Unable to avoid copy while creating an array as requested." ) elif copy is None: - copy = False # The NumPy copy=False meaning is different here. + # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`. 
+ copy = False return self.to_numpy(dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ac64c57bece9f..99e4cb0545e2d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1697,10 +1697,8 @@ def __array__( # we need to ensure __array__ gets all the way to an # ndarray. - if copy is None: - # Branch required since copy=None is not defined on 1.x - return np.asarray(ret, dtype=dtype) - return np.array(ret, dtype=dtype) + # `take_nd` should already make a copy, so don't force again. + return np.asarray(ret, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index b2a56b5ca2df3..af880aa1ed727 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -389,14 +389,20 @@ def freqstr(self) -> str: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if dtype == "i8": + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype) + if copy is False: raise ValueError( "Unable to avoid copy while creating an array as requested." ) - if dtype == "i8": - return self.asi8 - elif dtype == bool: + if dtype == bool: return ~self._isnan # This will raise TypeError for non-object dtypes diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c1b3605c4e5b5..a3db7dc1f93e9 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -547,6 +547,13 @@ def from_spmatrix(cls, data: spmatrix) -> Self: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if self.sp_index.ngaps == 0: + # Compat for na dtype and int values. 
+ if copy is True: + return np.array(self.sp_values) + else: + return self.sp_values + if copy is False: raise ValueError( "Unable to avoid copy while creating an array as requested." @@ -554,9 +561,6 @@ def __array__( fill_value = self.fill_value - if self.sp_index.ngaps == 0: - # Compat for na dtype and int values. - return self.sp_values if dtype is None: # Can NumPy represent this type? # If not, `np.result_type` will raise. We catch that diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0a43556496973..eb45afa210028 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2022,7 +2022,7 @@ def __array__( arr = np.array(values, dtype=dtype, copy=copy) if ( - copy is not False + copy is not True and astype_is_view(values.dtype, arr.dtype) and self._mgr.is_single_block ): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f21ab28061e9d..82768ed0b3e99 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1391,9 +1391,11 @@ def copy( # type: ignore[override] def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" - if copy is True: - # Note: branch avoids `copy=None` for NumPy 1.x support - return np.array(self.values, dtype=dtype, copy=copy) + if copy is False: + # self.values is always a newly constructed array, so raise. + raise ValueError( + "Unable to avoid copy while creating an array as requested." 
+ ) return self.values def view(self, cls=None) -> Self: From 6799f55d2fa271de19198e6976ee84f9dbe977b4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 15:39:15 +0100 Subject: [PATCH 08/15] fix period case and add specific test --- pandas/core/arrays/period.py | 4 ++-- pandas/tests/arrays/test_datetimelike.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index af880aa1ed727..ae92e17332c76 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -393,9 +393,9 @@ def __array__( # For NumPy 1.x compatibility we cannot use copy=None. And # `copy=False` has the meaning of `copy=None` here: if not copy: - return np.asarray(self, dtype=dtype) + return np.asarray(self.asi8, dtype=dtype) else: - return np.array(self, dtype=dtype) + return np.array(self.asi8, dtype=dtype) if copy is False: raise ValueError( diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 0c8eefab95464..d1ef29b0bf8a0 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1152,9 +1152,17 @@ def test_array_interface(self, arr1d): result = np.asarray(arr, dtype=object) tm.assert_numpy_array_equal(result, expected) + # to int64 gives the underlying representation result = np.asarray(arr, dtype="int64") tm.assert_numpy_array_equal(result, arr.asi8) + result2 = np.asarray(arr, dtype="int64") + assert np.may_share_memory(result, result2) + + result_copy1 = np.array(arr, dtype="int64", copy=True) + result_copy2 = np.array(arr, dtype="int64", copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + # to other dtypes msg = r"float\(\) argument must be a string or a( real)? 
number, not 'Period'" with pytest.raises(TypeError, match=msg): From 4217bafff3ce6cc90990b8f2981217d5bd2685ec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 15:58:23 +0100 Subject: [PATCH 09/15] update test to be explicit about copy vs nocopy + allow copy=False for masked arrays in case of no NAs --- pandas/core/arrays/masked.py | 4 ++-- pandas/tests/base/test_conversion.py | 35 +++++++++++++++++----------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1be7787d67c93..e129a0ec2e5fb 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -507,7 +507,7 @@ def to_numpy( else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) - data = self._data.astype(dtype, copy=copy) + data = np.array(self._data, dtype=dtype, copy=copy) return data @doc(ExtensionArray.tolist) @@ -581,7 +581,7 @@ def __array__( the array interface, return my values We return an object array here to preserve our scalar values """ - if copy is False: + if copy is False and self._hasna: raise ValueError( "Unable to avoid copy while creating an array as requested." 
) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 90b33bfc408ba..888e8628f8664 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -298,24 +298,27 @@ def test_array_multiindex_raises(): @pytest.mark.parametrize( - "arr, expected", + "arr, expected, zero_copy", [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False), ( pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + False, ), - (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), + (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + False, ), - (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False), # tz-naive datetime ( DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), + True, ), # tz-aware stays tz`-aware ( @@ -330,6 +333,7 @@ def test_array_multiindex_raises(): Timestamp("2000-01-02", tz="US/Central"), ] ), + False, ), # Timedelta ( @@ -338,6 +342,7 @@ def test_array_multiindex_raises(): dtype=np.dtype("m8[ns]"), ), np.array([0, 3600000000000], dtype="m8[ns]"), + True, ), # GH#26406 tz is preserved in Categorical[dt64tz] ( @@ -348,10 +353,11 @@ def test_array_multiindex_raises(): Timestamp("2016-01-02", tz="US/Pacific"), ] ), + False, ), ], ) -def test_to_numpy(arr, expected, index_or_series_or_array, request): +def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): box = index_or_series_or_array with 
tm.assert_produces_warning(None): @@ -374,15 +380,16 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): # copy=False semantics are only supported in NumPy>=2. return - try: - result_nocopy1 = np.array(thing, copy=False) - except ValueError: - # An error is always acceptable for `copy=False` - return + if not zero_copy: + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + # An error is always acceptable for `copy=False` + np.array(thing, copy=False) - result_nocopy2 = np.array(thing, copy=False) - # If copy=False was given, these must share the same data - assert np.may_share_memory(result_nocopy1, result_nocopy2) + else: + result_nocopy1 = np.array(thing, copy=False) + result_nocopy2 = np.array(thing, copy=False) + # If copy=False was given, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) @pytest.mark.xfail( From 5e4cb87d7dcd68ef44135601773a8e2d4a3f0219 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 16:02:10 +0100 Subject: [PATCH 10/15] add similar test to base extension tests --- pandas/tests/extension/base/interface.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6683c87e2b8fc..79eb64b5a654f 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -71,6 +73,25 @@ def test_array_interface(self, data): expected = construct_1d_object_array_from_listlike(list(data)) tm.assert_numpy_array_equal(result, expected) + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + 
result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + try: + result_nocopy1 = np.array(data, copy=False) + except ValueError: + # An error is always acceptable for `copy=False` + return + + result_nocopy2 = np.array(data, copy=False) + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) From 357f8a008ce3286bd675dbf43c9c491c652413d8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 16:09:50 +0100 Subject: [PATCH 11/15] add specific test for sparse corner case --- pandas/tests/arrays/sparse/test_array.py | 31 ++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index c35e8204f3437..1b685100e4931 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.sparse import IntIndex +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -480,3 +481,33 @@ def test_zero_sparse_column(): expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2]) tm.assert_frame_equal(result, expected) + + +def test_array_interface(arr_data, arr): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(arr) + tm.assert_numpy_array_equal(result, arr_data) + + # it always gives a copy by default + result_copy1 = np.asarray(arr) + result_copy2 = np.asarray(arr) + assert not np.may_share_memory(result_copy1, result_copy2) + + # or with explicit copy=True + result_copy1 = np.array(arr, copy=True) + result_copy2 = np.array(arr, copy=True) + assert not 
np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + # for sparse arrays, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(arr, copy=False) + + # except when there are actually no sparse filled values + arr2 = SparseArray(np.array([1, 2, 3])) + result_nocopy1 = np.array(arr2, copy=False) + result_nocopy2 = np.array(arr2, copy=False) + assert np.may_share_memory(result_nocopy1, result_nocopy2) From 9927903a11a44f09ad5a8ca658fb8d15ac373f7e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 16:18:29 +0100 Subject: [PATCH 12/15] add specific test for MultiIndex --- pandas/tests/indexes/multi/test_conversion.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index f6b10c989326f..347d6b206e3b9 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + import pandas as pd from pandas import ( DataFrame, @@ -16,6 +18,40 @@ def test_to_numpy(idx): tm.assert_numpy_array_equal(result, exp) +def test_array_interface(idx): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(idx) + expected = np.empty((6,), dtype=object) + expected[:] = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] + tm.assert_numpy_array_equal(result, expected) + + # the conversion itself always requires a copy, but the result is cached, so + # repeated default calls still return arrays that share memory + result_copy1 = np.asarray(idx) + result_copy2 = np.asarray(idx) + assert np.may_share_memory(result_copy1, result_copy2) + + # with explicit copy=True, then it is an actual copy + result_copy1 = np.array(idx, 
copy=True) + result_copy2 = np.array(idx, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + # for MultiIndex, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(idx, copy=False) + + def test_to_frame(): tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] From 3b000be05751f42dc33eda082c583b122dfc7685 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 17:58:32 +0100 Subject: [PATCH 13/15] fix MultiIndex copy=True case for recent numpy --- pandas/core/indexes/multi.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 82768ed0b3e99..e6ce00cb714a4 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1396,6 +1396,10 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: raise ValueError( "Unable to avoid copy while creating an array as requested." 
) + if copy is True: + # explicit np.array call to ensure a copy is made and unique objects + # are returned, because self.values is cached + return np.array(self.values, dtype=dtype) return self.values def view(self, cls=None) -> Self: From 421f9046d8ff10e1c6e6cc52e0f23b83f6213107 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 18:03:24 +0100 Subject: [PATCH 14/15] fix copy=False case for masked array --- pandas/core/arrays/masked.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e129a0ec2e5fb..349d2ec4d3cc9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -507,7 +507,7 @@ def to_numpy( else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) - data = np.array(self._data, dtype=dtype, copy=copy) + data = self._data.astype(dtype, copy=copy) return data @doc(ExtensionArray.tolist) @@ -581,7 +581,10 @@ def __array__( the array interface, return my values We return an object array here to preserve our scalar values """ - if copy is False and self._hasna: + if copy is False: + if not self._hasna: + # special case, here we can simply return the underlying data + return np.array(self._data, dtype=dtype, copy=copy) raise ValueError( "Unable to avoid copy while creating an array as requested." 
) From d70405e50ba7d8ed9dda4e6bf2a90e49590c7c33 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 20:40:23 +0100 Subject: [PATCH 15/15] add whatsnew note --- doc/source/whatsnew/v2.3.0.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 01c2ed3821d7a..7caf9a2cdbeb6 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,7 +32,9 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called + when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been + updated to work correctly with NumPy >= 2 (:issue:`57739`) - .. ---------------------------------------------------------------------------