Skip to content

ENH: add ExtensionArray.to_numpy to have control over conversion to numpy array #30322

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jan 7, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 27 additions & 9 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,29 +296,47 @@ def __getitem__(self, item):
return self._data[item]
return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
def to_numpy(self, dtype=None, copy=False, na_value: "Scalar" = libmissing.NA):
"""
Coerce to an ndarray of object dtype or bool dtype (if force_bool=True).
Convert to a numpy array.

By default converts to a numpy object array. Specify the `dtype` and
`na_value` keywords to customize the conversion.

Parameters
----------
dtype : dtype, default object
The numpy dtype to convert to
The numpy dtype to convert to.
copy : bool, default False
Whether to ensure that the returned value is not a view on
the array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we give some guidance on when no-copy is possible? Is it only when there are no missing values and we're going to the numpy dtype (bool in this case)?

And thinking forward, can a pyarrow array with no NAs be converted to an ndarray without any copies?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think it is only possible with no NAs and bool dtype. I first thought int8 would also be possible, but numpy doesn't seem to do such conversion without copy.

And thinking forward, can a pyarrow array with no NAs be converted to an ndarray without any copies?

For boolean not (since it is bits, not bytes), but in general (eg for IntegerArray without nulls) yes

na_value : scalar, optional
Scalar missing value indicator to use in numpy array. Defaults
to the native missing value indicator of this array (pd.NA).

Returns
-------
np.ndarray
"""
if dtype is None:
dtype = object
if is_bool_dtype(dtype):
if is_bool_dtype(dtype) and na_value is libmissing.NA:
if not self.isna().any():
return self._data
data = self._data
if copy:
data = data.copy()
else:
raise ValueError(
"cannot convert to bool numpy array in presence of missing values"
)
data = self._data.astype(dtype)
data[self._mask] = na_value
if self.isna().any():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this up to before L325? It seems like an isna().any() is going to happen at least once, so should avoid ever doing it twice.

# don't pass copy to astype -> always need a copy since we are mutating
data = self._data.astype(dtype)
data[self._mask] = na_value
else:
data = self._data.astype(dtype, copy=copy)
return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
Expand All @@ -329,7 +347,7 @@ def __array__(self, dtype=None):
We return an object array here to preserve our scalar values
"""
# by default (no dtype specified), return an object array
return self._coerce_to_ndarray(dtype=dtype)
return self.to_numpy(dtype=dtype)

def __arrow_array__(self, type=None):
"""
Expand Down Expand Up @@ -505,7 +523,7 @@ def astype(self, dtype, copy=True):
if is_float_dtype(dtype):
na_value = np.nan
# coerce
data = self._coerce_to_ndarray(na_value=na_value)
data = self.to_numpy(na_value=na_value)
return astype_nansafe(data, dtype, copy=None)

def value_counts(self, dropna=True):
Expand Down
62 changes: 62 additions & 0 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,68 @@ def test_coerce_to_numpy_array():
np.array(arr, dtype="bool")


def test_to_numpy():
# default (with or without missing values) -> object dtype
arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, True], dtype="object")
tm.assert_numpy_array_equal(result, expected)

arr = pd.array([True, False, None], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)

# no missing values -> can convert to bool, otherwise raises
arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype="bool")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

arr = pd.array([True, False, None], dtype="boolean")
with pytest.raises(ValueError, match="cannot convert to bool numpy"):
result = arr.to_numpy(dtype="bool")

# specify dtype and na_value
arr = pd.array([True, False, None], dtype="boolean")
result = arr.to_numpy(dtype=object, na_value=None)
expected = np.array([True, False, None], dtype="object")
tm.assert_numpy_array_equal(result, expected)

result = arr.to_numpy(dtype=bool, na_value=False)
expected = np.array([True, False, False], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

result = arr.to_numpy(dtype="int64", na_value=-99)
expected = np.array([1, 0, -99], dtype="int64")
tm.assert_numpy_array_equal(result, expected)

result = arr.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([1, 0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)

# converting to int or float without specifying na_value raises
with pytest.raises(TypeError):
arr.to_numpy(dtype="int64")
with pytest.raises(TypeError):
arr.to_numpy(dtype="float64")


def test_to_numpy_copy():
# to_numpy can be zero-copy if no missing values
arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype=bool)
result[0] = False
tm.assert_extension_array_equal(
arr, pd.array([False, False, True], dtype="boolean")
)

arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype=bool, copy=True)
result[0] = False
tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean"))


def test_astype():
# with missing values
arr = pd.array([True, False, None], dtype="boolean")
Expand Down