Skip to content

Commit 2e6f53b

Browse files
ENH: add ExtensionArray.to_numpy to have control over conversion to numpy array (#30322)
Co-authored-by: Tom Augspurger <[email protected]>
1 parent c5948d1 commit 2e6f53b

File tree

14 files changed

+303
-39
lines changed

14 files changed

+303
-39
lines changed

doc/source/reference/extensions.rst

+5
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,8 @@ behaves correctly.
6767
:toctree: api/
6868

6969
api.indexers.check_bool_array_indexer
70+
71+
72+
The sentinel ``pandas.api.extensions._no_default`` is used as the default
73+
value in some methods. Use an ``is`` comparison to check if the user
74+
provides a non-default value.

doc/source/whatsnew/v1.0.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ Other enhancements
197197
^^^^^^^^^^^^^^^^^^
198198

199199
- :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`)
200+
- Added the ``na_value`` argument to :meth:`Series.to_numpy`, :meth:`Index.to_numpy` and :meth:`DataFrame.to_numpy` to control the value used for missing data (:issue:`30322`)
200201
- :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`)
201202
- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`)
202203
- The :ref:`integer dtype <integer_na>` with support for missing values and the
@@ -729,7 +730,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
729730
- Removed the previously deprecated keywords "how", "fill_method", and "limit" from :meth:`DataFrame.resample` (:issue:`30139`)
730731
- Passing an integer to :meth:`Series.fillna` or :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype now raises ``TypeError`` (:issue:`24694`)
731732
- Passing multiple axes to :meth:`DataFrame.dropna` is no longer supported (:issue:`20995`)
732-
- Removed :meth:`Series.nonzero`, use `to_numpy().nonzero()` instead (:issue:`24048`)
733+
- Removed :meth:`Series.nonzero`, use ``to_numpy().nonzero()`` instead (:issue:`24048`)
733734
- Passing floating dtype ``codes`` to :meth:`Categorical.from_codes` is no longer supported, pass ``codes.astype(np.int64)`` instead (:issue:`21775`)
734735
- Removed the previously deprecated keyword "pat" from :meth:`Series.str.partition` and :meth:`Series.str.rpartition`, use "sep" instead (:issue:`23767`)
735736
- Removed :meth:`Series.put` (:issue:`27106`)

pandas/_libs/lib.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -2232,7 +2232,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
22322232
return objects
22332233

22342234

2235-
_no_default = object()
2235+
# Note: _no_default is exported to the public API in pandas.api.extensions
2236+
_no_default = object() #: Sentinel indicating the default value.
22362237

22372238

22382239
@cython.boundscheck(False)

pandas/api/extensions/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
"""Public API for extending pandas objects."""
2+
from pandas._libs.lib import _no_default # noqa: F401
3+
24
from pandas.core.dtypes.dtypes import ( # noqa: F401
35
ExtensionDtype,
46
register_extension_dtype,

pandas/core/arrays/base.py

+34
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import numpy as np
1212

13+
from pandas._libs import lib
1314
from pandas._typing import ArrayLike
1415
from pandas.compat import set_function_name
1516
from pandas.compat.numpy import function as nv
@@ -350,6 +351,39 @@ def __iter__(self):
350351
for i in range(len(self)):
351352
yield self[i]
352353

354+
def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default):
355+
"""
356+
Convert to a NumPy ndarray.
357+
358+
.. versionadded:: 1.0.0
359+
360+
This is similar to :func:`numpy.asarray`, but may provide additional control
361+
over how the conversion is done.
362+
363+
Parameters
364+
----------
365+
dtype : str or numpy.dtype, optional
366+
The dtype to pass to :func:`numpy.asarray`.
367+
copy : bool, default False
368+
Whether to ensure that the returned value is not a view on
369+
another array. Note that ``copy=False`` does not *ensure* that
370+
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
371+
a copy is made, even if not strictly necessary.
372+
na_value : Any, optional
373+
The value to use for missing values. The default value depends
374+
on `dtype` and the type of the array.
375+
376+
Returns
377+
-------
378+
numpy.ndarray
379+
"""
380+
result = np.asarray(self, dtype=dtype)
381+
if copy or na_value is not lib._no_default:
382+
result = result.copy()
383+
if na_value is not lib._no_default:
384+
result[self.isna()] = na_value
385+
return result
386+
353387
# ------------------------------------------------------------------------
354388
# Required attributes
355389
# ------------------------------------------------------------------------

pandas/core/arrays/boolean.py

+63-11
Original file line numberDiff line numberDiff line change
@@ -316,29 +316,81 @@ def __getitem__(self, item):
316316

317317
return type(self)(self._data[item], self._mask[item])
318318

319-
def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
319+
def to_numpy(
320+
self, dtype=None, copy=False, na_value: "Scalar" = lib._no_default,
321+
):
320322
"""
321-
Coerce to an ndarray of object dtype or bool dtype (if force_bool=True).
323+
Convert to a NumPy array.
324+
325+
By default converts to an object-dtype NumPy array. Specify the `dtype` and
326+
`na_value` keywords to customize the conversion.
322327
323328
Parameters
324329
----------
325330
dtype : dtype, default object
326-
The numpy dtype to convert to
331+
The numpy dtype to convert to.
332+
copy : bool, default False
333+
Whether to ensure that the returned value is not a view on
334+
the array. Note that ``copy=False`` does not *ensure* that
335+
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
336+
a copy is made, even if not strictly necessary. This is typically
337+
only possible when no missing values are present and `dtype`
338+
is a boolean dtype.
327339
na_value : scalar, optional
328340
Scalar missing value indicator to use in numpy array. Defaults
329341
to the native missing value indicator of this array (pd.NA).
342+
343+
Returns
344+
-------
345+
numpy.ndarray
346+
347+
Examples
348+
--------
349+
An object-dtype is the default result
350+
351+
>>> a = pd.array([True, False], dtype="boolean")
352+
>>> a.to_numpy()
353+
array([True, False], dtype=object)
354+
355+
When no missing values are present, a boolean dtype can be used.
356+
357+
>>> a.to_numpy(dtype="bool")
358+
array([ True, False])
359+
360+
However, requesting a bool dtype will raise a ValueError if
361+
missing values are present and the default missing value :attr:`NA`
362+
is used.
363+
364+
>>> a = pd.array([True, False, pd.NA], dtype="boolean")
365+
>>> a
366+
<BooleanArray>
367+
[True, False, NA]
368+
Length: 3, dtype: boolean
369+
370+
>>> a.to_numpy(dtype="bool")
371+
Traceback (most recent call last):
372+
...
373+
ValueError: cannot convert to bool numpy array in presence of missing values
374+
375+
Specify a valid `na_value` instead
376+
377+
>>> a.to_numpy(dtype="bool", na_value=False)
378+
array([ True, False, False])
330379
"""
380+
if na_value is lib._no_default:
381+
na_value = libmissing.NA
331382
if dtype is None:
332383
dtype = object
333-
if is_bool_dtype(dtype):
334-
if not self._hasna:
335-
return self._data
336-
else:
384+
if self._hasna:
385+
if is_bool_dtype(dtype) and na_value is libmissing.NA:
337386
raise ValueError(
338387
"cannot convert to bool numpy array in presence of missing values"
339388
)
340-
data = self._data.astype(dtype)
341-
data[self._mask] = na_value
389+
# don't pass copy to astype -> always need a copy since we are mutating
390+
data = self._data.astype(dtype)
391+
data[self._mask] = na_value
392+
else:
393+
data = self._data.astype(dtype, copy=copy)
342394
return data
343395

344396
__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
@@ -349,7 +401,7 @@ def __array__(self, dtype=None):
349401
We return an object array here to preserve our scalar values
350402
"""
351403
# by default (no dtype specified), return an object array
352-
return self._coerce_to_ndarray(dtype=dtype)
404+
return self.to_numpy(dtype=dtype)
353405

354406
def __arrow_array__(self, type=None):
355407
"""
@@ -525,7 +577,7 @@ def astype(self, dtype, copy=True):
525577
if is_float_dtype(dtype):
526578
na_value = np.nan
527579
# coerce
528-
data = self._coerce_to_ndarray(na_value=na_value)
580+
data = self.to_numpy(na_value=na_value)
529581
return astype_nansafe(data, dtype, copy=False)
530582

531583
def value_counts(self, dropna=True):

pandas/core/arrays/numpy_.py

+6-18
Original file line numberDiff line numberDiff line change
@@ -421,27 +421,15 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
421421

422422
# ------------------------------------------------------------------------
423423
# Additional Methods
424-
def to_numpy(self, dtype=None, copy=False):
425-
"""
426-
Convert the PandasArray to a :class:`numpy.ndarray`.
427-
428-
By default, this requires no coercion or copying of data.
429-
430-
Parameters
431-
----------
432-
dtype : numpy.dtype
433-
The NumPy dtype to pass to :func:`numpy.asarray`.
434-
copy : bool, default False
435-
Whether to copy the underlying data.
436-
437-
Returns
438-
-------
439-
ndarray
440-
"""
424+
def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default):
441425
result = np.asarray(self._ndarray, dtype=dtype)
442-
if copy and result is self._ndarray:
426+
427+
if (copy or na_value is not lib._no_default) and result is self._ndarray:
443428
result = result.copy()
444429

430+
if na_value is not lib._no_default:
431+
result[self.isna()] = na_value
432+
445433
return result
446434

447435
@Appender(ExtensionArray.searchsorted.__doc__)

pandas/core/base.py

+23-8
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
from pandas.core.dtypes.common import (
2020
is_categorical_dtype,
2121
is_datetime64_ns_dtype,
22-
is_datetime64tz_dtype,
2322
is_dict_like,
2423
is_extension_array_dtype,
2524
is_list_like,
@@ -769,7 +768,7 @@ def array(self) -> ExtensionArray:
769768

770769
return result
771770

772-
def to_numpy(self, dtype=None, copy=False):
771+
def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, **kwargs):
773772
"""
774773
A NumPy ndarray representing the values in this Series or Index.
775774
@@ -784,6 +783,17 @@ def to_numpy(self, dtype=None, copy=False):
784783
another array. Note that ``copy=False`` does not *ensure* that
785784
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
786785
a copy is made, even if not strictly necessary.
786+
na_value : Any, optional
787+
The value to use for missing values. The default value depends
788+
on `dtype` and the type of the array.
789+
790+
.. versionadded:: 1.0.0
791+
792+
**kwargs
793+
Additional keywords passed through to the ``to_numpy`` method
794+
of the underlying array (for extension arrays).
795+
796+
.. versionadded:: 1.0.0
787797
788798
Returns
789799
-------
@@ -853,16 +863,21 @@ def to_numpy(self, dtype=None, copy=False):
853863
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
854864
dtype='datetime64[ns]')
855865
"""
856-
if is_datetime64tz_dtype(self.dtype) and dtype is None:
857-
# note: this is going to change very soon.
858-
# I have a WIP PR making this unnecessary, but it's
859-
# a bit out of scope for the DatetimeArray PR.
860-
dtype = "object"
866+
if is_extension_array_dtype(self.dtype):
867+
return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
868+
else:
869+
if kwargs:
870+
msg = "to_numpy() got an unexpected keyword argument '{}'".format(
871+
list(kwargs.keys())[0]
872+
)
873+
raise TypeError(msg)
861874

862875
result = np.asarray(self._values, dtype=dtype)
863876
# TODO(GH-24345): Avoid potential double copy
864-
if copy:
877+
if copy or na_value is not lib._no_default:
865878
result = result.copy()
879+
if na_value is not lib._no_default:
880+
result[self.isna()] = na_value
866881
return result
867882

868883
@property

pandas/tests/arrays/test_boolean.py

+64
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,70 @@ def test_coerce_to_numpy_array():
251251
np.array(arr, dtype="bool")
252252

253253

254+
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
255+
def test_to_numpy(box):
256+
con = pd.Series if box else pd.array
257+
# default (with or without missing values) -> object dtype
258+
arr = con([True, False, True], dtype="boolean")
259+
result = arr.to_numpy()
260+
expected = np.array([True, False, True], dtype="object")
261+
tm.assert_numpy_array_equal(result, expected)
262+
263+
arr = con([True, False, None], dtype="boolean")
264+
result = arr.to_numpy()
265+
expected = np.array([True, False, pd.NA], dtype="object")
266+
tm.assert_numpy_array_equal(result, expected)
267+
268+
# no missing values -> can convert to bool, otherwise raises
269+
arr = con([True, False, True], dtype="boolean")
270+
result = arr.to_numpy(dtype="bool")
271+
expected = np.array([True, False, True], dtype="bool")
272+
tm.assert_numpy_array_equal(result, expected)
273+
274+
arr = con([True, False, None], dtype="boolean")
275+
with pytest.raises(ValueError, match="cannot convert to bool numpy"):
276+
result = arr.to_numpy(dtype="bool")
277+
278+
# specify dtype and na_value
279+
arr = con([True, False, None], dtype="boolean")
280+
result = arr.to_numpy(dtype=object, na_value=None)
281+
expected = np.array([True, False, None], dtype="object")
282+
tm.assert_numpy_array_equal(result, expected)
283+
284+
result = arr.to_numpy(dtype=bool, na_value=False)
285+
expected = np.array([True, False, False], dtype="bool")
286+
tm.assert_numpy_array_equal(result, expected)
287+
288+
result = arr.to_numpy(dtype="int64", na_value=-99)
289+
expected = np.array([1, 0, -99], dtype="int64")
290+
tm.assert_numpy_array_equal(result, expected)
291+
292+
result = arr.to_numpy(dtype="float64", na_value=np.nan)
293+
expected = np.array([1, 0, np.nan], dtype="float64")
294+
tm.assert_numpy_array_equal(result, expected)
295+
296+
# converting to int or float without specifying na_value raises
297+
with pytest.raises(TypeError):
298+
arr.to_numpy(dtype="int64")
299+
with pytest.raises(TypeError):
300+
arr.to_numpy(dtype="float64")
301+
302+
303+
def test_to_numpy_copy():
304+
# to_numpy can be zero-copy if no missing values
305+
arr = pd.array([True, False, True], dtype="boolean")
306+
result = arr.to_numpy(dtype=bool)
307+
result[0] = False
308+
tm.assert_extension_array_equal(
309+
arr, pd.array([False, False, True], dtype="boolean")
310+
)
311+
312+
arr = pd.array([True, False, True], dtype="boolean")
313+
result = arr.to_numpy(dtype=bool, copy=True)
314+
result[0] = False
315+
tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean"))
316+
317+
254318
def test_astype():
255319
# with missing values
256320
arr = pd.array([True, False, None], dtype="boolean")

0 commit comments

Comments
 (0)