Skip to content

BUG: Fix copy semantics in __array__ #60046

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,7 @@ def to_numpy(
else:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=RuntimeWarning)
data = self._data.astype(dtype, copy=copy)
data = np.array(self._data, dtype=dtype, copy=copy)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made this change together with allowing to get here with copy=False, to ensure if copy=False is not actually possible depending on the passed dtype, it still properly errors

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, but of course for to_numpy() itself we want to keep the behaviour of only attempting to avoid a copy, and not raising. Will have to move this logic into __array__ then..

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change (together with the self._hasna change below) make sense to me. But unfortunately, to_numpy(copy=False) change the meaning of copy=False this way. And I think this is public API?

Maybe better to special case _hasna in the branch below? (could even just ignore dtype with a comment that NumPy is fine with that).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, will have to update this to not change to_numpy(copy=False), as that is indeed public API we want to keep working that way.

return data

@doc(ExtensionArray.tolist)
Expand Down Expand Up @@ -581,7 +581,7 @@ def __array__(
the array interface, return my values
We return an object array here to preserve our scalar values
"""
if copy is False:
if copy is False and self._hasna:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,9 +393,9 @@ def __array__(
# For NumPy 1.x compatibility we cannot use copy=None. And
# `copy=False` has the meaning of `copy=None` here:
if not copy:
return np.asarray(self, dtype=dtype)
return np.asarray(self.asi8, dtype=dtype)
else:
return np.array(self, dtype=dtype)
return np.array(self.asi8, dtype=dtype)

if copy is False:
raise ValueError(
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from pandas._libs.sparse import IntIndex
from pandas.compat.numpy import np_version_gt2

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -480,3 +481,33 @@ def test_zero_sparse_column():

expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2])
tm.assert_frame_equal(result, expected)


def test_array_interface(arr_data, arr):
# https://github.com/pandas-dev/pandas/pull/60046
result = np.asarray(arr)
tm.assert_numpy_array_equal(result, arr_data)

# it always gives a copy by default
result_copy1 = np.asarray(arr)
result_copy2 = np.asarray(arr)
assert not np.may_share_memory(result_copy1, result_copy2)

# or with explicit copy=True
result_copy1 = np.array(arr, copy=True)
result_copy2 = np.array(arr, copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

if not np_version_gt2:
# copy=False semantics are only supported in NumPy>=2.
return

# for sparse arrays, copy=False is never allowed
with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
np.array(arr, copy=False)

# except when there are actually no sparse filled values
arr2 = SparseArray(np.array([1, 2, 3]))
result_nocopy1 = np.array(arr2, copy=False)
result_nocopy2 = np.array(arr2, copy=False)
assert np.may_share_memory(result_nocopy1, result_nocopy2)
8 changes: 8 additions & 0 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1152,9 +1152,17 @@ def test_array_interface(self, arr1d):
result = np.asarray(arr, dtype=object)
tm.assert_numpy_array_equal(result, expected)

# to int64 gives the underlying representation
result = np.asarray(arr, dtype="int64")
tm.assert_numpy_array_equal(result, arr.asi8)

result2 = np.asarray(arr, dtype="int64")
assert np.may_share_memory(result, result2)

result_copy1 = np.array(arr, dtype="int64", copy=True)
result_copy2 = np.array(arr, dtype="int64", copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

# to other dtypes
msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'"
with pytest.raises(TypeError, match=msg):
Expand Down
35 changes: 21 additions & 14 deletions pandas/tests/base/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,24 +298,27 @@ def test_array_multiindex_raises():


@pytest.mark.parametrize(
"arr, expected",
"arr, expected, zero_copy",
[
(np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)),
(pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)),
(np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True),
(pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False),
(
pd.core.arrays.period_array(["2000", "2001"], freq="D"),
np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
False,
),
(pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])),
(pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False),
(
IntervalArray.from_breaks([0, 1, 2]),
np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
False,
),
(SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)),
(SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False),
# tz-naive datetime
(
DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")),
np.array(["2000", "2001"], dtype="M8[ns]"),
True,
),
# tz-aware stays tz`-aware
(
Expand All @@ -330,6 +333,7 @@ def test_array_multiindex_raises():
Timestamp("2000-01-02", tz="US/Central"),
]
),
False,
),
# Timedelta
(
Expand All @@ -338,6 +342,7 @@ def test_array_multiindex_raises():
dtype=np.dtype("m8[ns]"),
),
np.array([0, 3600000000000], dtype="m8[ns]"),
True,
),
# GH#26406 tz is preserved in Categorical[dt64tz]
(
Expand All @@ -348,10 +353,11 @@ def test_array_multiindex_raises():
Timestamp("2016-01-02", tz="US/Pacific"),
]
),
False,
),
],
)
def test_to_numpy(arr, expected, index_or_series_or_array, request):
def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array):
box = index_or_series_or_array

with tm.assert_produces_warning(None):
Expand All @@ -374,15 +380,16 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
# copy=False semantics are only supported in NumPy>=2.
return

try:
result_nocopy1 = np.array(thing, copy=False)
except ValueError:
# An error is always acceptable for `copy=False`
return
if not zero_copy:
with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
# An error is always acceptable for `copy=False`
np.array(thing, copy=False)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very nice to make the test specific!


result_nocopy2 = np.array(thing, copy=False)
# If copy=False was given, these must share the same data
assert np.may_share_memory(result_nocopy1, result_nocopy2)
else:
result_nocopy1 = np.array(thing, copy=False)
result_nocopy2 = np.array(thing, copy=False)
# If copy=False was given, these must share the same data
assert np.may_share_memory(result_nocopy1, result_nocopy2)


@pytest.mark.xfail(
Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/extension/base/interface.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

from pandas.compat.numpy import np_version_gt2

from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import is_extension_array_dtype
from pandas.core.dtypes.dtypes import ExtensionDtype
Expand Down Expand Up @@ -71,6 +73,25 @@ def test_array_interface(self, data):
expected = construct_1d_object_array_from_listlike(list(data))
tm.assert_numpy_array_equal(result, expected)

def test_array_interface_copy(self, data):
result_copy1 = np.array(data, copy=True)
result_copy2 = np.array(data, copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

if not np_version_gt2:
# copy=False semantics are only supported in NumPy>=2.
return

try:
result_nocopy1 = np.array(data, copy=False)
except ValueError:
# An error is always acceptable for `copy=False`
return
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can make this test specific presumably.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this specific case it is harder to make it specific, because we don't know the exact array object and its expected semantics (this is eg also used by external packages implementing an ExtensionArray)


result_nocopy2 = np.array(data, copy=False)
# If copy=False was given and did not raise, these must share the same data
assert np.may_share_memory(result_nocopy1, result_nocopy2)

def test_is_extension_array_dtype(self, data):
assert is_extension_array_dtype(data)
assert is_extension_array_dtype(data.dtype)
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/indexes/multi/test_conversion.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

from pandas.compat.numpy import np_version_gt2

import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -16,6 +18,40 @@ def test_to_numpy(idx):
tm.assert_numpy_array_equal(result, exp)


def test_array_interface(idx):
# https://github.com/pandas-dev/pandas/pull/60046
result = np.asarray(idx)
expected = np.empty((6,), dtype=object)
expected[:] = [
("foo", "one"),
("foo", "two"),
("bar", "one"),
("baz", "two"),
("qux", "one"),
("qux", "two"),
]
tm.assert_numpy_array_equal(result, expected)

# it always gives a copy by default, but the values are cached, so results
# are still sharing memory
result_copy1 = np.asarray(idx)
result_copy2 = np.asarray(idx)
assert np.may_share_memory(result_copy1, result_copy2)
Comment on lines +35 to +39
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be an unfortunate side effect of our caching, because a user might expect here always to get a new object. But at least when explicitly passing copy=True, then it appears to do the right thing. But is this numpy that does that? (still copy even if copy=True was passed down to __array__?) We should probably not keep relying on that, and do an explicit copy on our side in case of copy=True?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is confusing, but maybe you tested with NumPy 1.x, which I think is the default dev environment?

The test should be failing, and is failing locally to me. I think if copy=True, we'll have to add an np.array() call to the __array__ function with a comment that it is there due to caching.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems I am testing locally with numpy 2.0.2 (so not 1.x, but also not latest 2.1.x or main)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, maybe there were fixes with 2.1.


# with explicit copy=True, then it is an actual copy
result_copy1 = np.array(idx, copy=True)
result_copy2 = np.array(idx, copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

if not np_version_gt2:
# copy=False semantics are only supported in NumPy>=2.
return

# for MultiIndex, copy=False is never allowed
with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
np.array(idx, copy=False)


def test_to_frame():
tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")]

Expand Down
Loading