-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: Fix copy semantics in __array__
#60046
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
1c5195c
2183861
404827b
ec08728
4ac6323
9b6c209
77058df
6799f55
4217baf
5e4cb87
357f8a0
9927903
3b000be
421f904
d70405e
5289a82
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -298,24 +298,27 @@ def test_array_multiindex_raises(): | |
|
||
|
||
@pytest.mark.parametrize( | ||
"arr, expected", | ||
"arr, expected, zero_copy", | ||
[ | ||
(np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), | ||
(pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), | ||
(np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True), | ||
(pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False), | ||
( | ||
pd.core.arrays.period_array(["2000", "2001"], freq="D"), | ||
np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), | ||
False, | ||
), | ||
(pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), | ||
(pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False), | ||
( | ||
IntervalArray.from_breaks([0, 1, 2]), | ||
np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), | ||
False, | ||
), | ||
(SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), | ||
(SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False), | ||
# tz-naive datetime | ||
( | ||
DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), | ||
np.array(["2000", "2001"], dtype="M8[ns]"), | ||
True, | ||
), | ||
# tz-aware stays tz`-aware | ||
( | ||
|
@@ -330,6 +333,7 @@ def test_array_multiindex_raises(): | |
Timestamp("2000-01-02", tz="US/Central"), | ||
] | ||
), | ||
False, | ||
), | ||
# Timedelta | ||
( | ||
|
@@ -338,6 +342,7 @@ def test_array_multiindex_raises(): | |
dtype=np.dtype("m8[ns]"), | ||
), | ||
np.array([0, 3600000000000], dtype="m8[ns]"), | ||
True, | ||
), | ||
# GH#26406 tz is preserved in Categorical[dt64tz] | ||
( | ||
|
@@ -348,10 +353,11 @@ def test_array_multiindex_raises(): | |
Timestamp("2016-01-02", tz="US/Pacific"), | ||
] | ||
), | ||
False, | ||
), | ||
], | ||
) | ||
def test_to_numpy(arr, expected, index_or_series_or_array, request): | ||
def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): | ||
box = index_or_series_or_array | ||
|
||
with tm.assert_produces_warning(None): | ||
|
@@ -374,15 +380,16 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): | |
# copy=False semantics are only supported in NumPy>=2. | ||
return | ||
|
||
try: | ||
result_nocopy1 = np.array(thing, copy=False) | ||
except ValueError: | ||
# An error is always acceptable for `copy=False` | ||
return | ||
if not zero_copy: | ||
with pytest.raises(ValueError, match="Unable to avoid copy while creating"): | ||
# An error is always acceptable for `copy=False` | ||
np.array(thing, copy=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Very nice to make the test specific! |
||
|
||
result_nocopy2 = np.array(thing, copy=False) | ||
# If copy=False was given, these must share the same data | ||
assert np.may_share_memory(result_nocopy1, result_nocopy2) | ||
else: | ||
result_nocopy1 = np.array(thing, copy=False) | ||
result_nocopy2 = np.array(thing, copy=False) | ||
# If copy=False was given, these must share the same data | ||
assert np.may_share_memory(result_nocopy1, result_nocopy2) | ||
|
||
|
||
@pytest.mark.xfail( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from pandas.compat.numpy import np_version_gt2 | ||
|
||
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike | ||
from pandas.core.dtypes.common import is_extension_array_dtype | ||
from pandas.core.dtypes.dtypes import ExtensionDtype | ||
|
@@ -71,6 +73,25 @@ def test_array_interface(self, data): | |
expected = construct_1d_object_array_from_listlike(list(data)) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_array_interface_copy(self, data): | ||
result_copy1 = np.array(data, copy=True) | ||
result_copy2 = np.array(data, copy=True) | ||
assert not np.may_share_memory(result_copy1, result_copy2) | ||
|
||
if not np_version_gt2: | ||
# copy=False semantics are only supported in NumPy>=2. | ||
return | ||
|
||
try: | ||
result_nocopy1 = np.array(data, copy=False) | ||
except ValueError: | ||
# An error is always acceptable for `copy=False` | ||
return | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can make this test specific presumably. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For this specific case it is harder to make it specific, because we don't know the exact array object and its expected semantics (this is eg also used by external packages implementing an ExtensionArray) |
||
|
||
result_nocopy2 = np.array(data, copy=False) | ||
# If copy=False was given and did not raise, these must share the same data | ||
assert np.may_share_memory(result_nocopy1, result_nocopy2) | ||
|
||
def test_is_extension_array_dtype(self, data): | ||
assert is_extension_array_dtype(data) | ||
assert is_extension_array_dtype(data.dtype) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from pandas.compat.numpy import np_version_gt2 | ||
|
||
import pandas as pd | ||
from pandas import ( | ||
DataFrame, | ||
|
@@ -16,6 +18,40 @@ def test_to_numpy(idx): | |
tm.assert_numpy_array_equal(result, exp) | ||
|
||
|
||
def test_array_interface(idx): | ||
# https://github.com/pandas-dev/pandas/pull/60046 | ||
result = np.asarray(idx) | ||
expected = np.empty((6,), dtype=object) | ||
expected[:] = [ | ||
("foo", "one"), | ||
("foo", "two"), | ||
("bar", "one"), | ||
("baz", "two"), | ||
("qux", "one"), | ||
("qux", "two"), | ||
] | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
# it always gives a copy by default, but the values are cached, so results | ||
# are still sharing memory | ||
result_copy1 = np.asarray(idx) | ||
result_copy2 = np.asarray(idx) | ||
assert np.may_share_memory(result_copy1, result_copy2) | ||
Comment on lines
+35
to
+39
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This might be an unfortunate side effect of our caching, because a user might expect here always to get a new object. But at least when explicitly passing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is confusing, but maybe you tested with NumPy 1.x, which I think is the default dev environment? The test should be failing, and is failing locally to me. I think if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems I am testing locally with numpy 2.0.2 (so not 1.x, but also not latest 2.1.x or main) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, maybe there were fixes with 2.1. |
||
|
||
# with explicit copy=True, then it is an actual copy | ||
result_copy1 = np.array(idx, copy=True) | ||
result_copy2 = np.array(idx, copy=True) | ||
assert not np.may_share_memory(result_copy1, result_copy2) | ||
|
||
if not np_version_gt2: | ||
# copy=False semantics are only supported in NumPy>=2. | ||
return | ||
|
||
# for MultiIndex, copy=False is never allowed | ||
with pytest.raises(ValueError, match="Unable to avoid copy while creating"): | ||
np.array(idx, copy=False) | ||
|
||
|
||
def test_to_frame(): | ||
tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Made this change together with allowing to get here with
copy=False
, to ensure ifcopy=False
is not actually possible depending on the passeddtype
, it still properly errorsThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, but of course for
to_numpy()
itself we want to keep the behaviour of only attempting to avoid a copy, and not raising. Will have to move this logic into__array__
then..There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change (together with the
self._hasna
change below) make sense to me. But unfortunately,to_numpy(copy=False)
change the meaning ofcopy=False
this way. And I think this is public API?Maybe better to special case
_hasna
in the branch below? (could even just ignoredtype
with a comment that NumPy is fine with that).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, will have to update this to not change
to_numpy(copy=False)
, as that is indeed public API we want to keep working that way.