Skip to content

ENH: add ExtensionArray.to_numpy to have control over conversion to numpy array #30322

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jan 7, 2020
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ Other enhancements
^^^^^^^^^^^^^^^^^^

- :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`)
- Added the ``na_value`` argument to :meth:`Series.to_numpy`, :meth:`Index.to_numpy` and :meth:`DataFrame.to_numpy` to control the value used for missing data (:issue:`30322`)
- :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`)
- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`)
- The :ref:`integer dtype <integer_na>` with support for missing values and the
Expand Down Expand Up @@ -725,7 +726,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
- Removed the previously deprecated keywords "how", "fill_method", and "limit" from :meth:`DataFrame.resample` (:issue:`30139`)
- Passing an integer to :meth:`Series.fillna` or :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype now raises ``TypeError`` (:issue:`24694`)
- Passing multiple axes to :meth:`DataFrame.dropna` is no longer supported (:issue:`20995`)
- Removed :meth:`Series.nonzero`, use `to_numpy().nonzero()` instead (:issue:`24048`)
- Removed :meth:`Series.nonzero`, use ``to_numpy().nonzero()`` instead (:issue:`24048`)
- Passing floating dtype ``codes`` to :meth:`Categorical.from_codes` is no longer supported, pass ``codes.astype(np.int64)`` instead (:issue:`21775`)
- Removed the previously deprecated keyword "pat" from :meth:`Series.str.partition` and :meth:`Series.str.rpartition`, use "sep" instead (:issue:`23767`)
- Removed :meth:`Series.put` (:issue:`27106`)
Expand Down
34 changes: 34 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import numpy as np

from pandas._libs import lib
from pandas._typing import ArrayLike
from pandas.compat import set_function_name
from pandas.compat.numpy import function as nv
Expand Down Expand Up @@ -350,6 +351,39 @@ def __iter__(self):
for i in range(len(self)):
yield self[i]

def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default):
"""
Convert to a NumPy ndarray.

.. versionadded:: 1.0.0

This is similar to :func:`numpy.asarray`, but may provide additional control
over how the conversion is done.

Parameters
----------
dtype : str or numpy.dtype, optional
The dtype to pass to :func:`numpy.asarray`.
copy : bool, default False
Whether to ensure that the returned value is not a view on
another array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.
na_value : Any, optional
The value to use for missing values. The default value depends
on `dtype` and the type of the array. When given, the converted
data is copied and the positions where ``self.isna()`` is True
are set to this value.

Returns
-------
numpy.ndarray
"""
result = np.asarray(self, dtype=dtype)
# Copy whenever ``na_value`` is supplied (not only when ``copy=True``) so
# the assignment below operates on a fresh array rather than on whatever
# ``np.asarray`` returned.
if copy or na_value is not lib._no_default:
result = result.copy()
if na_value is not lib._no_default:
result[self.isna()] = na_value
return result

# ------------------------------------------------------------------------
# Required attributes
# ------------------------------------------------------------------------
Expand Down
74 changes: 63 additions & 11 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,29 +327,81 @@ def __getitem__(self, item):

return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
def to_numpy(
self, dtype=None, copy=False, na_value: "Scalar" = lib._no_default, **kwargs
):
"""
Coerce to an ndarray of object dtype or bool dtype (if force_bool=True).
Convert to a NumPy Array.

By default converts to an object-dtype NumPy array. Specify the `dtype` and
`na_value` keywords to customize the conversion.

Parameters
----------
dtype : dtype, default object
The numpy dtype to convert to
The numpy dtype to convert to.
copy : bool, default False
Whether to ensure that the returned value is not a view on
the array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary. A no-copy
result is typically only possible when no missing values are
present and `dtype` is a boolean dtype.
na_value : scalar, optional
Scalar missing value indicator to use in numpy array. Defaults
to the native missing value indicator of this array (pd.NA).

Returns
-------
numpy.ndarray

Examples
--------
An object-dtype is the default result

>>> a = pd.array([True, False], dtype="boolean")
>>> a.to_numpy()
array([True, False], dtype=object)

When no missing values are present, a boolean dtype can be used.

>>> a.to_numpy(dtype="bool")
array([ True, False])

However, requesting a bool dtype will raise a ValueError if
missing values are present and the default missing value :attr:`NA`
is used.

>>> a = pd.array([True, False, pd.NA], dtype="boolean")
>>> a
<BooleanArray>
[True, False, NA]
Length: 3, dtype: boolean

>>> a.to_numpy(dtype="bool")
Traceback (most recent call last):
...
ValueError: cannot convert to bool numpy array in presence of missing values

Specify a valid `na_value` instead

>>> a.to_numpy(dtype="bool", na_value=False)
array([ True, False, False])
"""
if na_value is lib._no_default:
na_value = libmissing.NA
if dtype is None:
dtype = object
if is_bool_dtype(dtype):
if not self._hasna:
return self._data
else:
if self._hasna:
if is_bool_dtype(dtype) and na_value is libmissing.NA:
raise ValueError(
"cannot convert to bool numpy array in presence of missing values"
)
data = self._data.astype(dtype)
data[self._mask] = na_value
# don't pass copy to astype -> always need a copy since we are mutating
data = self._data.astype(dtype)
data[self._mask] = na_value
else:
data = self._data.astype(dtype, copy=copy)
return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
Expand All @@ -360,7 +412,7 @@ def __array__(self, dtype=None):
We return an object array here to preserve our scalar values
"""
# by default (no dtype specified), return an object array
return self._coerce_to_ndarray(dtype=dtype)
return self.to_numpy(dtype=dtype)

def __arrow_array__(self, type=None):
"""
Expand Down Expand Up @@ -536,7 +588,7 @@ def astype(self, dtype, copy=True):
if is_float_dtype(dtype):
na_value = np.nan
# coerce
data = self._coerce_to_ndarray(na_value=na_value)
data = self.to_numpy(na_value=na_value)
return astype_nansafe(data, dtype, copy=False)

def value_counts(self, dropna=True):
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):

# ------------------------------------------------------------------------
# Additional Methods
def to_numpy(self, dtype=None, copy=False):
def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default):
"""
Convert the PandasArray to a :class:`numpy.ndarray`.

Expand All @@ -433,15 +433,21 @@ def to_numpy(self, dtype=None, copy=False):
The NumPy dtype to pass to :func:`numpy.asarray`.
copy : bool, default False
Whether to copy the underlying data.
na_value : Scalar, optional
The value to use for missing values.

Returns
-------
ndarray
"""
result = np.asarray(self._ndarray, dtype=dtype)
if copy and result is self._ndarray:

if (copy or na_value is not lib._no_default) and result is self._ndarray:
result = result.copy()

if na_value is not lib._no_default:
result[self.isna()] = na_value

return result

@Appender(ExtensionArray.searchsorted.__doc__)
Expand Down
31 changes: 23 additions & 8 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_datetime64_ns_dtype,
is_datetime64tz_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
Expand Down Expand Up @@ -785,7 +784,7 @@ def array(self) -> ExtensionArray:

return result

def to_numpy(self, dtype=None, copy=False):
def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, **kwargs):
"""
A NumPy ndarray representing the values in this Series or Index.

Expand All @@ -800,6 +799,17 @@ def to_numpy(self, dtype=None, copy=False):
another array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.
na_value : Any, optional
The value to use for missing values. The default value depends
on `dtype` and the type of the array.

.. versionadded:: 1.0.0

**kwargs
Additional keywords passed through to the ``to_numpy`` method
of the underlying array (for extension arrays).

.. versionadded:: 1.0.0

Returns
-------
Expand Down Expand Up @@ -869,16 +879,21 @@ def to_numpy(self, dtype=None, copy=False):
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
dtype='datetime64[ns]')
"""
if is_datetime64tz_dtype(self.dtype) and dtype is None:
# note: this is going to change very soon.
# I have a WIP PR making this unnecessary, but it's
# a bit out of scope for the DatetimeArray PR.
dtype = "object"
if is_extension_array_dtype(self.dtype):
return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
else:
if kwargs:
msg = "to_numpy() got an unexpected keyword argument '{}'".format(
list(kwargs.keys())[0]
)
raise TypeError(msg)

result = np.asarray(self._values, dtype=dtype)
# TODO(GH-24345): Avoid potential double copy
if copy:
if copy or na_value is not lib._no_default:
result = result.copy()
if na_value is not lib._no_default:
result[self.isna()] = na_value
return result

@property
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1242,7 +1242,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None):

return cls(data, index=index, columns=columns, dtype=dtype)

def to_numpy(self, dtype=None, copy=False):
def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default):
"""
Convert the DataFrame to a NumPy array.

Expand All @@ -1264,6 +1264,12 @@ def to_numpy(self, dtype=None, copy=False):
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.

na_value : Any, optional
The value to use for missing values. The default value depends
on `dtype` and the type of the array.

.. versionadded:: 1.0.0

Returns
-------
numpy.ndarray
Expand Down Expand Up @@ -1295,6 +1301,13 @@ def to_numpy(self, dtype=None, copy=False):
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
"""
result = np.array(self.values, dtype=dtype, copy=copy)
if na_value is not lib._no_default:
if not copy:
# copy even if not requested. This may be unnecessary
# if NumPy already copied.
result = result.copy()

result[self.isna()] = na_value
return result

def to_dict(self, orient="dict", into=dict):
Expand Down
64 changes: 64 additions & 0 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,70 @@ def test_coerce_to_numpy_array():
np.array(arr, dtype="bool")


@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
# ``box`` picks the constructor: pd.Series when True, pd.array when False,
# so every check below is run against both containers.
con = pd.Series if box else pd.array
# default (with or without missing values) -> object dtype
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, True], dtype="object")
tm.assert_numpy_array_equal(result, expected)

arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)

# no missing values -> can convert to bool, otherwise raises
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype="bool")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

# missing value present and no na_value given -> bool conversion must raise
arr = con([True, False, None], dtype="boolean")
with pytest.raises(ValueError, match="cannot convert to bool numpy"):
result = arr.to_numpy(dtype="bool")

# specify dtype and na_value
arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy(dtype=object, na_value=None)
expected = np.array([True, False, None], dtype="object")
tm.assert_numpy_array_equal(result, expected)

# an explicit na_value makes the bool/int/float conversions succeed
result = arr.to_numpy(dtype=bool, na_value=False)
expected = np.array([True, False, False], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

result = arr.to_numpy(dtype="int64", na_value=-99)
expected = np.array([1, 0, -99], dtype="int64")
tm.assert_numpy_array_equal(result, expected)

result = arr.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([1, 0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)

# converting to int or float without specifying na_value raises
with pytest.raises(TypeError):
arr.to_numpy(dtype="int64")
with pytest.raises(TypeError):
arr.to_numpy(dtype="float64")


def test_to_numpy_copy():
# to_numpy can be zero-copy if no missing values
arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype=bool)
# mutating the result is expected to show up in ``arr`` (shared data)
result[0] = False
tm.assert_extension_array_equal(
arr, pd.array([False, False, True], dtype="boolean")
)

# with copy=True the result is independent: mutating it leaves ``arr`` unchanged
arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype=bool, copy=True)
result[0] = False
tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean"))


def test_astype():
# with missing values
arr = pd.array([True, False, None], dtype="boolean")
Expand Down
Loading