Skip to content

ENH: add ExtensionArray.to_numpy to have control over conversion to numpy array #30322

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jan 7, 2020
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ Other enhancements
^^^^^^^^^^^^^^^^^^

- :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`)
- Added the ``na_value`` argument to :meth:`Series.to_numpy`, :meth:`Index.to_numpy` and :meth:`DataFrame.to_numpy` to control the value used for missing data (:issue:`30322`)
- :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`)
- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`)
- The :ref:`integer dtype <integer_na>` with support for missing values and the
Expand Down Expand Up @@ -725,7 +726,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
- Removed the previously deprecated keywords "how", "fill_method", and "limit" from :meth:`DataFrame.resample` (:issue:`30139`)
- Passing an integer to :meth:`Series.fillna` or :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype now raises ``TypeError`` (:issue:`24694`)
- Passing multiple axes to :meth:`DataFrame.dropna` is no longer supported (:issue:`20995`)
- Removed :meth:`Series.nonzero`, use `to_numpy().nonzero()` instead (:issue:`24048`)
- Removed :meth:`Series.nonzero`, use ``to_numpy().nonzero()`` instead (:issue:`24048`)
- Passing floating dtype ``codes`` to :meth:`Categorical.from_codes` is no longer supported, pass ``codes.astype(np.int64)`` instead (:issue:`21775`)
- Removed the previously deprecated keyword "pat" from :meth:`Series.str.partition` and :meth:`Series.str.rpartition`, use "sep" instead (:issue:`23767`)
- Removed :meth:`Series.put` (:issue:`27106`)
Expand Down
34 changes: 34 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import numpy as np

from pandas._libs import lib
from pandas._typing import ArrayLike
from pandas.compat import set_function_name
from pandas.compat.numpy import function as nv
Expand Down Expand Up @@ -350,6 +351,39 @@ def __iter__(self):
for i in range(len(self)):
yield self[i]

def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default):
    """
    Convert to a NumPy ndarray.

    .. versionadded:: 1.0.0

    This is similar to :meth:`numpy.asarray`, but may provide additional
    control over how the conversion is done.

    Parameters
    ----------
    dtype : str or numpy.dtype, optional
        The dtype to pass to :meth:`numpy.asarray`.
    copy : bool, default False
        Whether to ensure that the returned value is not a view on
        another array. Note that ``copy=False`` does not *ensure* that
        ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
        a copy is made, even if not strictly necessary. A copy is also
        made whenever `na_value` is supplied.
    na_value : Any, optional
        The value to use for missing values. The default value depends
        on `dtype` and the type of the array. In this implementation,
        omitting it leaves the converted values untouched.

    Returns
    -------
    numpy.ndarray
    """
    fill_requested = na_value is not lib._no_default

    converted = np.asarray(self, dtype=dtype)
    if copy or fill_requested:
        # Copy before filling so the assignment below does not write into
        # a buffer that ``np.asarray`` may have returned without copying.
        converted = converted.copy()
    if fill_requested:
        converted[self.isna()] = na_value
    return converted

# ------------------------------------------------------------------------
# Required attributes
# ------------------------------------------------------------------------
Expand Down
53 changes: 46 additions & 7 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,13 @@ def __getitem__(self, item):

return type(self)(self._data[item], self._mask[item])

def to_numpy(self, dtype=None, copy=False, na_value: "Scalar" = libmissing.NA):
def to_numpy(
self, dtype=None, copy=False, na_value: "Scalar" = lib._no_default, **kwargs
):
"""
Convert to a numpy array.
Convert to a NumPy Array.

By default converts to a numpy object array. Specify the `dtype` and
By default converts to an object-dtype NumPy array. Specify the `dtype` and
`na_value` keywords to customize the conversion.

Parameters
Expand All @@ -342,18 +344,55 @@ def to_numpy(self, dtype=None, copy=False, na_value: "Scalar" = libmissing.NA):
Whether to ensure that the returned value is not a view on
the array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.
a copy is made, even if not strictly necessary. This is typically
only possible when no missing values are present and `dtype`
is a boolean dtype.
na_value : scalar, optional
Scalar missing value indicator to use in numpy array. Defaults
to the native missing value indicator of this array (pd.NA).

Returns
-------
np.ndarray
numpy.ndarray

Examples
--------
An object-dtype is the default result

>>> a = pd.array([True, False], dtype="boolean")
>>> a.to_numpy()
array([True, False], dtype=object)

When no missing values are present, a boolean dtype can be used.

>>> a.to_numpy(dtype="bool")
array([ True, False])

However, requesting a bool dtype will raise a ValueError if
missing values are present and the default missing value :attr:`NA`
is used.

>>> a = pd.array([True, False, pd.NA], dtype="boolean")
>>> a
<BooleanArray>
[True, False, NA]
Length: 3, dtype: boolean

>>> a.to_numpy(dtype="bool")
Traceback (most recent call last):
...
ValueError: cannot convert to bool numpy array in presence of missing values

Specify a valid `na_value` instead

>>> a.to_numpy(dtype="bool", na_value=False)
array([ True, False, False])
"""
if na_value is lib._no_default:
na_value = libmissing.NA
if dtype is None:
dtype = object
if self.isna().any():
if self._hasna:
if is_bool_dtype(dtype) and na_value is libmissing.NA:
raise ValueError(
"cannot convert to bool numpy array in presence of missing values"
Expand Down Expand Up @@ -550,7 +589,7 @@ def astype(self, dtype, copy=True):
na_value = np.nan
# coerce
data = self.to_numpy(na_value=na_value)
return astype_nansafe(data, dtype, copy=None)
return astype_nansafe(data, dtype, copy=False)

def value_counts(self, dropna=True):
"""
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):

# ------------------------------------------------------------------------
# Additional Methods
def to_numpy(self, dtype=None, copy=False):
def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default):
"""
Convert the PandasArray to a :class:`numpy.ndarray`.

Expand All @@ -433,15 +433,21 @@ def to_numpy(self, dtype=None, copy=False):
The NumPy dtype to pass to :func:`numpy.asarray`.
copy : bool, default False
Whether to copy the underlying data.
na_value : Scalar, optional
The missing value to use for missing values.

Returns
-------
ndarray
"""
result = np.asarray(self._ndarray, dtype=dtype)
if copy and result is self._ndarray:

if (copy or na_value is not lib._no_default) and result is self._ndarray:
result = result.copy()

if na_value is not lib._no_default:
result[self.isna()] = na_value

return result

@Appender(ExtensionArray.searchsorted.__doc__)
Expand Down
22 changes: 12 additions & 10 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_datetime64_ns_dtype,
is_datetime64tz_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
Expand Down Expand Up @@ -785,7 +784,7 @@ def array(self) -> ExtensionArray:

return result

def to_numpy(self, dtype=None, copy=False, **kwargs):
def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, **kwargs):
"""
A NumPy ndarray representing the values in this Series or Index.

Expand All @@ -800,6 +799,12 @@ def to_numpy(self, dtype=None, copy=False, **kwargs):
another array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.
na_value : Any, optional
The value to use for missing values. The default value depends
on `dtype` and the type of the array.

.. versionadded:: 1.0.0

**kwargs
Additional keywords passed through to the ``to_numpy`` method
of the underlying array (for extension arrays).
Expand Down Expand Up @@ -874,24 +879,21 @@ def to_numpy(self, dtype=None, copy=False, **kwargs):
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
dtype='datetime64[ns]')
"""
if is_extension_array_dtype(self.dtype) and hasattr(self.array, "to_numpy"):
return self.array.to_numpy(dtype, copy=copy, **kwargs)
if is_extension_array_dtype(self.dtype):
return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
else:
if kwargs:
msg = "to_numpy() got an unexpected keyword argument '{}'".format(
list(kwargs.keys())[0]
)
raise TypeError(msg)
if is_datetime64tz_dtype(self.dtype) and dtype is None:
# note: this is going to change very soon.
# I have a WIP PR making this unnecessary, but it's
# a bit out of scope for the DatetimeArray PR.
dtype = "object"

result = np.asarray(self._values, dtype=dtype)
# TODO(GH-24345): Avoid potential double copy
if copy:
if copy or na_value is not lib._no_default:
result = result.copy()
if na_value is not lib._no_default:
result[self.isna()] = na_value
return result

@property
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1242,7 +1242,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None):

return cls(data, index=index, columns=columns, dtype=dtype)

def to_numpy(self, dtype=None, copy=False):
def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default):
"""
Convert the DataFrame to a NumPy array.

Expand All @@ -1264,6 +1264,12 @@ def to_numpy(self, dtype=None, copy=False):
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.

na_value : Any, optional
The value to use for missing values. The default value depends
on `dtype` and the type of the array.

.. versionadded:: 1.0.0

Returns
-------
numpy.ndarray
Expand Down Expand Up @@ -1295,6 +1301,13 @@ def to_numpy(self, dtype=None, copy=False):
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
"""
result = np.array(self.values, dtype=dtype, copy=copy)
if na_value is not lib._no_default:
if not copy:
# copy even if not requested. This may be unnecessary
# if NumPy already copied.
result = result.copy()

result[self.isna()] = na_value
return result

def to_dict(self, orient="dict", into=dict):
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from pandas._libs import OutOfBoundsDatetime
from pandas.compat.numpy import _np_version_under1p18

import pandas as pd
from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
Expand Down Expand Up @@ -758,3 +759,38 @@ def test_invalid_nat_setitem_array(array, non_casting_nats):
for nat in non_casting_nats:
with pytest.raises(TypeError):
array[0] = nat


@pytest.mark.parametrize(
    "array",
    [
        pd.date_range("2000", periods=4).array,
        pd.timedelta_range("2000", periods=4).array,
    ],
)
def test_to_numpy_extra(array):
    """Exercise ``to_numpy`` with ``dtype`` and ``na_value`` on datetime-like arrays."""
    # np.isnan(NaT) raises when _np_version_under1p18 is set, so use
    # pandas' missing-value check in that case.
    isnan = pd.isna if _np_version_under1p18 else np.isnan

    array[0] = pd.NaT
    snapshot = array.copy()

    # Default conversion: the first element is still reported as missing.
    default_result = array.to_numpy()
    assert isnan(default_result[0])

    # Integer conversion without ``na_value``.
    as_int = array.to_numpy(dtype="int64")
    assert as_int[0] == -9223372036854775808

    # Integer conversion with an explicit replacement.
    zero_filled = array.to_numpy(dtype="int64", na_value=0)
    assert zero_filled[0] == 0

    # Fill with the second element, first taken via ``to_numpy()`` and then
    # via ``to_numpy(copy=False)``.
    for scalar_kwargs in ({}, {"copy": False}):
        fill = array[1].to_numpy(**scalar_kwargs)
        filled = array.to_numpy(na_value=fill)
        assert filled[0] == filled[1]

    # The conversions above must leave the array itself unchanged.
    tm.assert_equal(array, snapshot)
33 changes: 33 additions & 0 deletions pandas/tests/base/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,3 +401,36 @@ def test_to_numpy_dtype(as_series):
result = obj.to_numpy(dtype="M8[ns]")
expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]")
tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize(
    "values, dtype, na_value, expected",
    [
        ([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]),
        (
            [pd.Timestamp("2000"), pd.Timestamp("2000"), pd.NaT],
            None,
            pd.Timestamp("2000"),
            [np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
        ),
    ],
)
@pytest.mark.parametrize("container", [pd.Series, pd.Index])  # type: ignore
def test_to_numpy_na_value_numpy_dtype(container, values, dtype, na_value, expected):
    """``na_value`` replaces the missing entry when converting Series/Index."""
    obj = container(values)
    # The missing entry in ``values`` should come back as ``na_value``.
    tm.assert_numpy_array_equal(
        obj.to_numpy(dtype=dtype, na_value=na_value), np.array(expected)
    )


def test_to_numpy_kwargs_raises():
    """Unknown keywords to ``Series.to_numpy`` raise ``TypeError``."""
    match = r"to_numpy\(\) got an unexpected keyword argument 'foo'"

    # ``None`` gives the default (numpy) Series; "Int64" gives the extension one.
    for series_dtype in (None, "Int64"):
        ser = pd.Series([1, 2, 3], dtype=series_dtype)
        with pytest.raises(TypeError, match=match):
            ser.to_numpy(foo=True)
11 changes: 11 additions & 0 deletions pandas/tests/extension/base/casting.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import numpy as np

import pandas as pd
from pandas.core.internals import ObjectBlock

Expand All @@ -21,3 +23,12 @@ def test_astype_str(self, data):
result = pd.Series(data[:5]).astype(str)
expected = pd.Series(data[:5].astype(str))
self.assert_series_equal(result, expected)

def test_to_numpy(self, data):
    """``to_numpy`` on the array, and on a Series of it, matches ``np.asarray``."""
    expected = np.asarray(data)

    # Directly on the array.
    from_array = data.to_numpy()
    self.assert_equal(from_array, expected)

    # Through a Series wrapping the same data.
    from_series = pd.Series(data).to_numpy()
    self.assert_equal(from_series, expected)
Loading