diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 16a84b5d2ecaf..374e1395b42f7 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -67,3 +67,8 @@ behaves correctly. :toctree: api/ api.indexers.check_bool_array_indexer + + +The sentinel ``pandas.api.extensions._no_default`` is used as the default +value in some methods. Use an ``is`` comparison to check if the user +provides a non-default value. diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b2b6fe393f069..84d120abe4eca 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -197,6 +197,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`) +- Added the ``na_value`` argument to :meth:`Series.to_numpy`, :meth:`Index.to_numpy` and :meth:`DataFrame.to_numpy` to control the value used for missing data (:issue:`30322`) - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`) - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) - The :ref:`integer dtype ` with support for missing values and the @@ -729,7 +730,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. 
- Removed the previously deprecated keywords "how", "fill_method", and "limit" from :meth:`DataFrame.resample` (:issue:`30139`) - Passing an integer to :meth:`Series.fillna` or :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype now raises ``TypeError`` (:issue:`24694`) - Passing multiple axes to :meth:`DataFrame.dropna` is no longer supported (:issue:`20995`) -- Removed :meth:`Series.nonzero`, use `to_numpy().nonzero()` instead (:issue:`24048`) +- Removed :meth:`Series.nonzero`, use ``to_numpy().nonzero()`` instead (:issue:`24048`) - Passing floating dtype ``codes`` to :meth:`Categorical.from_codes` is no longer supported, pass ``codes.astype(np.int64)`` instead (:issue:`21775`) - Removed the previously deprecated keyword "pat" from :meth:`Series.str.partition` and :meth:`Series.str.rpartition`, use "sep" instead (:issue:`23767`) - Removed :meth:`Series.put` (:issue:`27106`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 089a7a04abb63..f88989b5e8d0e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2232,7 +2232,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects -_no_default = object() +# Note: _no_default is exported to the public API in pandas.api.extensions +_no_default = object() #: Sentinel indicating the default value. 
@cython.boundscheck(False) diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 573d700dac43d..1f782e10396e3 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -1,4 +1,6 @@ """Public API for extending pandas objects.""" +from pandas._libs.lib import _no_default # noqa: F401 + from pandas.core.dtypes.dtypes import ( # noqa: F401 ExtensionDtype, register_extension_dtype, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 96a4eb1b3bf32..1be6f5886cb75 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import lib from pandas._typing import ArrayLike from pandas.compat import set_function_name from pandas.compat.numpy import function as nv @@ -350,6 +351,39 @@ def __iter__(self): for i in range(len(self)): yield self[i] + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default): + """ + Convert to a NumPy ndarray. + + .. versionadded:: 1.0.0 + + This is similar to :func:`numpy.asarray`, but may provide additional control + over how the conversion is done. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :func:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. 
+ + Returns + ------- + numpy.ndarray + """ + result = np.asarray(self, dtype=dtype) + if copy or na_value is not lib._no_default: + result = result.copy() + if na_value is not lib._no_default: + result[self.isna()] = na_value + return result + # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 409be244c4327..9ef1c4b1bbb1c 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -316,29 +316,81 @@ def __getitem__(self, item): return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): + def to_numpy( + self, dtype=None, copy=False, na_value: "Scalar" = lib._no_default, + ): """ - Coerce to an ndarray of object dtype or bool dtype (if force_bool=True). + Convert to a NumPy array. + + By default converts to an object-dtype NumPy array. Specify the `dtype` and + `na_value` keywords to customize the conversion. Parameters ---------- dtype : dtype, default object - The numpy dtype to convert to + The numpy dtype to convert to. + copy : bool, default False + Whether to ensure that the returned value is not a view on + the array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that + a copy is made, even if not strictly necessary. This is typically + only possible when no missing values are present and `dtype` + is a boolean dtype. na_value : scalar, optional Scalar missing value indicator to use in numpy array. Defaults to the native missing value indicator of this array (pd.NA). 
+ + Returns + ------- + numpy.ndarray + + Examples + -------- + An object-dtype is the default result + + >>> a = pd.array([True, False], dtype="boolean") + >>> a.to_numpy() + array([True, False], dtype=object) + + When no missing values are present, a boolean dtype can be used. + + >>> a.to_numpy(dtype="bool") + array([ True, False]) + + However, requesting a bool dtype will raise a ValueError if + missing values are present and the default missing value :attr:`NA` + is used. + + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a + <BooleanArray> + [True, False, NA] + Length: 3, dtype: boolean + + >>> a.to_numpy(dtype="bool") + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values + + Specify a valid `na_value` instead + + >>> a.to_numpy(dtype="bool", na_value=False) + array([ True, False, False]) """ + if na_value is lib._no_default: + na_value = libmissing.NA if dtype is None: dtype = object - if is_bool_dtype(dtype): - if not self._hasna: - return self._data - else: + if self._hasna: + if is_bool_dtype(dtype) and na_value is libmissing.NA: raise ValueError( "cannot convert to bool numpy array in presence of missing values" ) - data = self._data.astype(dtype) - data[self._mask] = na_value + # don't pass copy to astype -> always need a copy since we are mutating + data = self._data.astype(dtype) + data[self._mask] = na_value + else: + data = self._data.astype(dtype, copy=copy) return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us @@ -349,7 +401,7 @@ def __array__(self, dtype=None): We return an object array here to preserve our scalar values """ # by default (no dtype specified), return an object array - return self._coerce_to_ndarray(dtype=dtype) + return self.to_numpy(dtype=dtype) def __arrow_array__(self, type=None): """ @@ -525,7 +577,7 @@ def astype(self, dtype, copy=True): if is_float_dtype(dtype): na_value = np.nan # coerce - data = 
self._coerce_to_ndarray(na_value=na_value) + data = self.to_numpy(na_value=na_value) return astype_nansafe(data, dtype, copy=False) def value_counts(self, dropna=True): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index a114be9a21c6c..ec6f9278f6bf7 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -421,27 +421,15 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): # ------------------------------------------------------------------------ # Additional Methods - def to_numpy(self, dtype=None, copy=False): - """ - Convert the PandasArray to a :class:`numpy.ndarray`. - - By default, this requires no coercion or copying of data. - - Parameters - ---------- - dtype : numpy.dtype - The NumPy dtype to pass to :func:`numpy.asarray`. - copy : bool, default False - Whether to copy the underlying data. - - Returns - ------- - ndarray - """ + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default): result = np.asarray(self._ndarray, dtype=dtype) - if copy and result is self._ndarray: + + if (copy or na_value is not lib._no_default) and result is self._ndarray: result = result.copy() + if na_value is not lib._no_default: + result[self.isna()] = na_value + return result @Appender(ExtensionArray.searchsorted.__doc__) diff --git a/pandas/core/base.py b/pandas/core/base.py index 7d499181c6ed1..2a10200960e32 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -19,7 +19,6 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_dict_like, is_extension_array_dtype, is_list_like, @@ -767,7 +766,7 @@ def array(self) -> ExtensionArray: return result - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, **kwargs): """ A NumPy ndarray representing the values in this Series or Index. @@ -782,6 +781,17 @@ def to_numpy(self, dtype=None, copy=False): another array. 
Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + .. versionadded:: 1.0.0 + + **kwargs + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). + + .. versionadded:: 1.0.0 Returns ------- @@ -851,16 +861,21 @@ def to_numpy(self, dtype=None, copy=False): array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ - if is_datetime64tz_dtype(self.dtype) and dtype is None: - # note: this is going to change very soon. - # I have a WIP PR making this unnecessary, but it's - # a bit out of scope for the DatetimeArray PR. - dtype = "object" + if is_extension_array_dtype(self.dtype): + return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) + else: + if kwargs: + msg = "to_numpy() got an unexpected keyword argument '{}'".format( + list(kwargs.keys())[0] + ) + raise TypeError(msg) result = np.asarray(self._values, dtype=dtype) # TODO(GH-24345): Avoid potential double copy - if copy: + if copy or na_value is not lib._no_default: result = result.copy() + if na_value is not lib._no_default: + result[self.isna()] = na_value return result @property diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 278b4d41262b7..089cda7f434e9 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -251,6 +251,70 @@ def test_coerce_to_numpy_array(): np.array(arr, dtype="bool") +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + # default (with or without missing values) -> object dtype + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy() + expected = 
np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # no missing values -> can convert to bool, otherwise raises + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + with pytest.raises(ValueError, match="cannot convert to bool numpy"): + result = arr.to_numpy(dtype="bool") + + # specify dtype and na_value + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([True, False, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([1, 0, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # converting to int or float without specifying na_value raises + with pytest.raises(TypeError): + arr.to_numpy(dtype="int64") + with pytest.raises(TypeError): + arr.to_numpy(dtype="float64") + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool) + result[0] = False + tm.assert_extension_array_equal( + arr, pd.array([False, False, True], dtype="boolean") + ) + + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool, 
copy=True) + result[0] = False + tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) + + def test_astype(): # with missing values arr = pd.array([True, False, None], dtype="boolean") diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index d3108c30df324..fa45db93c6102 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,6 +4,7 @@ import pytest from pandas._libs import OutOfBoundsDatetime +from pandas.compat.numpy import _np_version_under1p18 import pandas as pd import pandas._testing as tm @@ -758,3 +759,38 @@ def test_invalid_nat_setitem_array(array, non_casting_nats): for nat in non_casting_nats: with pytest.raises(TypeError): array[0] = nat + + +@pytest.mark.parametrize( + "array", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("2000", periods=4).array, + ], +) +def test_to_numpy_extra(array): + if _np_version_under1p18: + # np.isnan(NaT) raises, so use pandas' + isnan = pd.isna + else: + isnan = np.isnan + + array[0] = pd.NaT + original = array.copy() + + result = array.to_numpy() + assert isnan(result[0]) + + result = array.to_numpy(dtype="int64") + assert result[0] == -9223372036854775808 + + result = array.to_numpy(dtype="int64", na_value=0) + assert result[0] == 0 + + result = array.to_numpy(na_value=array[1].to_numpy()) + assert result[0] == result[1] + + result = array.to_numpy(na_value=array[1].to_numpy(copy=False)) + assert result[0] == result[1] + + tm.assert_equal(array, original) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index e328cc223c8f2..07a15d0619bb6 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -404,3 +404,36 @@ def test_to_numpy_dtype(as_series): result = obj.to_numpy(dtype="M8[ns]") expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]") tm.assert_numpy_array_equal(result, 
expected) + + +@pytest.mark.parametrize( + "values, dtype, na_value, expected", + [ + ([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]), + ( + [pd.Timestamp("2000"), pd.Timestamp("2000"), pd.NaT], + None, + pd.Timestamp("2000"), + [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + ), + ], +) +@pytest.mark.parametrize("container", [pd.Series, pd.Index]) # type: ignore +def test_to_numpy_na_value_numpy_dtype(container, values, dtype, na_value, expected): + s = container(values) + result = s.to_numpy(dtype=dtype, na_value=na_value) + expected = np.array(expected) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_kwargs_raises(): + # numpy + s = pd.Series([1, 2, 3]) + match = r"to_numpy\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) + + # extension + s = pd.Series([1, 2, 3], dtype="Int64") + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 7146443bf8de5..58859fc6ac54c 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas as pd from pandas.core.internals import ObjectBlock @@ -21,3 +23,12 @@ def test_astype_str(self, data): result = pd.Series(data[:5]).astype(str) expected = pd.Series(data[:5].astype(str)) self.assert_series_equal(result, expected) + + def test_to_numpy(self, data): + expected = np.asarray(data) + + result = data.to_numpy() + self.assert_equal(result, expected) + + result = pd.Series(data).to_numpy() + self.assert_equal(result, expected) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 570cdf5f29d00..b67703c7f80e0 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -5,6 +5,8 @@ import numpy as np +from pandas._libs import lib + from pandas.core.dtypes.base 
import ExtensionDtype import pandas as pd @@ -84,6 +86,12 @@ def _from_factorized(cls, values, original): _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, decimals=None): + result = np.asarray(self, dtype=dtype) + if decimals is not None: + result = np.asarray([round(x, decimals) for x in result]) + return result + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # if not all( diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index d946772a98779..de7c98ab96571 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -499,3 +499,17 @@ def DecimalArray__array__(self, dtype=None): df[s > 0.5] s.at[0] df.at[0, "a"] + + +def test_to_numpy_keyword(): + # test the extra keyword + values = [decimal.Decimal("1.1111"), decimal.Decimal("2.2222")] + expected = np.array( + [decimal.Decimal("1.11"), decimal.Decimal("2.22")], dtype="object" + ) + a = pd.array(values, dtype="decimal") + result = a.to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(a).to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected)