From a3dab78c148c7379a63b26460fe50927cd547d2e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 18 Dec 2019 14:40:44 +0100 Subject: [PATCH 01/13] ENH: add to_numpy to have control over conversion to numpy array --- pandas/core/arrays/boolean.py | 36 ++++++++++++----- pandas/tests/arrays/test_boolean.py | 62 +++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 743d45e1fa400..f9a27f48770e8 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -296,29 +296,47 @@ def __getitem__(self, item): return self._data[item] return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): + def to_numpy(self, dtype=None, copy=False, na_value: "Scalar" = libmissing.NA): """ - Coerce to an ndarray of object dtype or bool dtype (if force_bool=True). + Convert to a numpy array. + + By default converts to a numpy object array. Specify the `dtype` and + `na_value` keywords to customize the conversion. Parameters ---------- dtype : dtype, default object - The numpy dtype to convert to + The numpy dtype to convert to. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + the array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. na_value : scalar, optional Scalar missing value indicator to use in numpy array. Defaults to the native missing value indicator of this array (pd.NA). + + Returns + ------- + np.ndarray """ if dtype is None: dtype = object - if is_bool_dtype(dtype): + if is_bool_dtype(dtype) and na_value is libmissing.NA: if not self.isna().any(): - return self._data + data = self._data + if copy: + data = data.copy() else: raise ValueError( "cannot convert to bool numpy array in presence of missing values" ) - data = self._data.astype(dtype) - data[self._mask] = na_value + if self.isna().any(): + # don't pass copy to astype -> always need a copy since we are mutating + data = self._data.astype(dtype) + data[self._mask] = na_value + else: + data = self._data.astype(dtype, copy=copy) return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us @@ -329,7 +347,7 @@ def __array__(self, dtype=None): We return an object array here to preserve our scalar values """ # by default (no dtype specified), return an object array - return self._coerce_to_ndarray(dtype=dtype) + return self.to_numpy(dtype=dtype) def __arrow_array__(self, type=None): """ @@ -505,7 +523,7 @@ def astype(self, dtype, copy=True): if is_float_dtype(dtype): na_value = np.nan # coerce - data = self._coerce_to_ndarray(na_value=na_value) + data = self.to_numpy(na_value=na_value) return astype_nansafe(data, dtype, copy=None) def value_counts(self, dropna=True): diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 90bcd66987e0d..b805620d84dab 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -251,6 +251,68 @@ def test_coerce_to_numpy_array(): np.array(arr, dtype="bool") +def test_to_numpy(): + # default (with or without missing values) -> object dtype + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = pd.array([True, False, None], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # no missing values -> can convert to bool, otherwise raises + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = pd.array([True, False, None], dtype="boolean") + with pytest.raises(ValueError, match="cannot convert to bool numpy"): + result = arr.to_numpy(dtype="bool") + + # specify dtype and na_value + arr = pd.array([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([True, False, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([1, 0, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # converting to int or float without specifying na_value raises + with pytest.raises(TypeError): + arr.to_numpy(dtype="int64") + with pytest.raises(TypeError): + arr.to_numpy(dtype="float64") + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool) + result[0] = False + tm.assert_extension_array_equal( + arr, pd.array([False, False, True], dtype="boolean") + ) + + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool, copy=True) + result[0] = False + tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) + + def test_astype(): # with missing values arr = pd.array([True, False, None], dtype="boolean") From fd1c04a7cf90a2f970f53300381f5a2a48d5f903 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 21 Dec 2019 12:23:07 +0100 Subject: [PATCH 02/13] move isna check earlier + simplify --- pandas/core/arrays/boolean.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 9d4cc9c70d212..7e74948ff94cb 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -340,16 +340,11 @@ def to_numpy(self, dtype=None, copy=False, na_value: "Scalar" = libmissing.NA): """ if dtype is None: dtype = object - if is_bool_dtype(dtype) and na_value is libmissing.NA: - if not self.isna().any(): - data = self._data - if copy: - data = data.copy() - else: + if self.isna().any(): + if is_bool_dtype(dtype) and na_value is libmissing.NA: raise ValueError( "cannot convert to bool numpy array in presence of missing values" ) - if self.isna().any(): # don't pass copy to astype -> always need a copy since we are mutating data = self._data.astype(dtype) data[self._mask] = na_value From 92f14d2b04bbc622177cf8a3dce52538f97c3e12 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 21 Dec 2019 14:34:31 +0100 Subject: [PATCH 03/13] passthrough keywords in Series.to_numpy --- pandas/core/base.py | 15 ++++++++++++++- pandas/tests/arrays/test_boolean.py | 14 ++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 381d45d829e62..82e27c56985bc 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -780,7 +780,7 @@ def array(self) -> ExtensionArray: return result - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False, **kwargs): """ A NumPy ndarray representing the values in this Series or Index. @@ -795,6 +795,11 @@ def to_numpy(self, dtype=None, copy=False): another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. + **kwargs + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). + + .. versionadded:: 1.0.0 Returns ------- @@ -864,6 +869,14 @@ def to_numpy(self, dtype=None, copy=False): array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ + if is_extension_array_dtype(self.dtype) and hasattr(self.array, "to_numpy"): + return self.array.to_numpy(dtype, copy=copy, **kwargs) + else: + if kwargs: + msg = "to_numpy() got an unexpected keyword argument '{}'".format( + list(kwargs.keys())[0] + ) + raise TypeError(msg) if is_datetime64tz_dtype(self.dtype) and dtype is None: # note: this is going to change very soon. # I have a WIP PR making this unnecessary, but it's diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index e6b538376a739..b133d1c3c4052 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -251,30 +251,32 @@ def test_coerce_to_numpy_array(): np.array(arr, dtype="bool") -def test_to_numpy(): +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array # default (with or without missing values) -> object dtype - arr = pd.array([True, False, True], dtype="boolean") + arr = con([True, False, True], dtype="boolean") result = arr.to_numpy() expected = np.array([True, False, True], dtype="object") tm.assert_numpy_array_equal(result, expected) - arr = pd.array([True, False, None], dtype="boolean") + arr = con([True, False, None], dtype="boolean") result = arr.to_numpy() expected = np.array([True, False, pd.NA], dtype="object") tm.assert_numpy_array_equal(result, expected) # no missing values -> can convert to bool, otherwise raises - arr = pd.array([True, False, True], dtype="boolean") + arr = con([True, False, True], dtype="boolean") result = arr.to_numpy(dtype="bool") expected = np.array([True, False, True], dtype="bool") tm.assert_numpy_array_equal(result, expected) - arr = pd.array([True, False, None], dtype="boolean") + arr = con([True, False, None], dtype="boolean") with pytest.raises(ValueError, match="cannot convert to bool numpy"): result = arr.to_numpy(dtype="bool") # specify dtype and na_value - arr = pd.array([True, False, None], dtype="boolean") + arr = con([True, False, None], dtype="boolean") result = arr.to_numpy(dtype=object, na_value=None) expected = np.array([True, False, None], dtype="object") tm.assert_numpy_array_equal(result, expected) From 278447de903e86a89e7227839c27af5ba196a5d2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Jan 2020 11:09:24 -0600 Subject: [PATCH 04/13] doc note --- pandas/core/arrays/boolean.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 28b132f336a6d..152dc186ade41 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -342,7 +342,9 @@ def to_numpy(self, dtype=None, copy=False, na_value: "Scalar" = libmissing.NA): Whether to ensure that the returned value is a not a view on the array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that - a copy is made, even if not strictly necessary. + a copy is made, even if not strictly necessary. This is typically + only possible when no missing values are present and `dtype` + is a boolean dtype. na_value : scalar, optional Scalar missing value indicator to use in numpy array. Defaults to the native missing value indicator of this array (pd.NA). From 1da45bd57de473855b947b9bd324bb80b87a143d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Jan 2020 11:44:21 -0600 Subject: [PATCH 05/13] update --- pandas/core/arrays/base.py | 9 +++++++ pandas/core/arrays/boolean.py | 45 +++++++++++++++++++++++++++++++---- pandas/core/base.py | 18 +++++++------- 3 files changed, 59 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 96a4eb1b3bf32..2dd73088a0104 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import lib from pandas._typing import ArrayLike from pandas.compat import set_function_name from pandas.compat.numpy import function as nv @@ -350,6 +351,14 @@ def __iter__(self): for i in range(len(self)): yield self[i] + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, **kwargs): + result = np.asarray(self, dtype=dtype) + if copy: + result = result.copy() + if na_value is not lib._no_default: + result[self.isna()] = na_value + return result + # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 152dc186ade41..af84b5316c4ad 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -327,11 +327,13 @@ def __getitem__(self, item): return type(self)(self._data[item], self._mask[item]) - def to_numpy(self, dtype=None, copy=False, na_value: "Scalar" = libmissing.NA): + def to_numpy( + self, dtype=None, copy=False, na_value: "Scalar" = lib._no_default, **kwargs + ): """ - Convert to a numpy array. + Convert to a NumPy Array. - By default converts to a numpy object array. Specify the `dtype` and + By default converts to an object-dtype NumPy array. Specify the `dtype` and `na_value` keywords to customize the conversion. Parameters @@ -351,8 +353,43 @@ def to_numpy(self, dtype=None, copy=False, na_value: "Scalar" = libmissing.NA): Returns ------- - np.ndarray + numpy.ndarray + + Examples + -------- + An object-dtype is the default result + + >>> a = pd.array([True, False], dtype="boolean") + >>> a.to_numpy() + array([True, False], dtype=object) + + When no missing values are present, a boolean dtype can be used. + + >>> a.to_numpy(dtype="bool") + array([ True, False]) + + However, requesting a bool dtype will raise a ValueError if + missing values are present and the default missing value :attr:`NA` + is used. + + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a + + [True, False, NA] + Length: 3, dtype: boolean + + >>> a.to_numpy(dtype="bool") + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values + + Specify a valid `na_value` instead + + >>> a.to_numpy(dtype="bool", na_value=False) + array([ True, False, False]) """ + if na_value is lib._no_default: + na_value = libmissing.NA if dtype is None: dtype = object if self._hasna: diff --git a/pandas/core/base.py b/pandas/core/base.py index 7d8e3020d6776..d685f5495a80b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -19,7 +19,6 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_dict_like, is_extension_array_dtype, is_list_like, @@ -785,7 +784,7 @@ def array(self) -> ExtensionArray: return result - def to_numpy(self, dtype=None, copy=False, **kwargs): + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, **kwargs): """ A NumPy ndarray representing the values in this Series or Index. @@ -800,6 +799,12 @@ def to_numpy(self, dtype=None, copy=False, **kwargs): another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + .. versionadded:: 1.0.0 + **kwargs Additional keywords passed through to the ``to_numpy`` method of the underlying array (for extension arrays). @@ -874,19 +879,14 @@ def to_numpy(self, dtype=None, copy=False, **kwargs): array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ - if is_extension_array_dtype(self.dtype) and hasattr(self.array, "to_numpy"): - return self.array.to_numpy(dtype, copy=copy, **kwargs) + if is_extension_array_dtype(self.dtype): + return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) else: if kwargs: msg = "to_numpy() got an unexpected keyword argument '{}'".format( list(kwargs.keys())[0] ) raise TypeError(msg) - if is_datetime64tz_dtype(self.dtype) and dtype is None: - # note: this is going to change very soon. - # I have a WIP PR making this unnecessary, but it's - # a bit out of scope for the DatetimeArray PR. - dtype = "object" result = np.asarray(self._values, dtype=dtype) # TODO(GH-24345): Avoid potential double copy From 6832f42accb99df16eff2b9ca101d83e61ee2fe0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Jan 2020 12:10:37 -0600 Subject: [PATCH 06/13] updates * Added ExtensionArray.to_numpy * Added na_value to Series/Index.to_numpy --- doc/source/whatsnew/v1.0.0.rst | 3 +- pandas/core/arrays/base.py | 29 ++++++++++++++-- pandas/core/arrays/numpy_.py | 10 ++++-- pandas/core/base.py | 4 ++- pandas/tests/arrays/test_datetimelike.py | 29 ++++++++++++++++ pandas/tests/base/test_conversion.py | 33 +++++++++++++++++++ pandas/tests/extension/base/casting.py | 11 +++++++ pandas/tests/extension/decimal/array.py | 8 +++++ .../tests/extension/decimal/test_decimal.py | 14 ++++++++ 9 files changed, 135 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a5ea60d0a0d19..56afeb64fef12 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -197,6 +197,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`) +- Added the ``na_value`` argument to :meth:`Series.to_numpy` and :meth:`Index.to_numpy` to control the value used for missing data (:issue:`30322`) - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`) - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) - The :ref:`integer dtype ` with support for missing values and the @@ -725,7 +726,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - Removed the previously deprecated keywords "how", "fill_method", and "limit" from :meth:`DataFrame.resample` (:issue:`30139`) - Passing an integer to :meth:`Series.fillna` or :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype now raises ``TypeError`` (:issue:`24694`) - Passing multiple axes to :meth:`DataFrame.dropna` is no longer supported (:issue:`20995`) -- Removed :meth:`Series.nonzero`, use `to_numpy().nonzero()` instead (:issue:`24048`) +- Removed :meth:`Series.nonzero`, use ``to_numpy().nonzero()`` instead (:issue:`24048`) - Passing floating dtype ``codes`` to :meth:`Categorical.from_codes` is no longer supported, pass ``codes.astype(np.int64)`` instead (:issue:`21775`) - Removed the previously deprecated keyword "pat" from :meth:`Series.str.partition` and :meth:`Series.str.rpartition`, use "sep" instead (:issue:`23767`) - Removed :meth:`Series.put` (:issue:`27106`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2dd73088a0104..1be6f5886cb75 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -351,9 +351,34 @@ def __iter__(self): for i in range(len(self)): yield self[i] - def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, **kwargs): + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default): + """ + Convert to a NumPy ndarray. + + .. versionadded:: 1.0.0 + + This is similar to :meth:`numpy.asarray`, but may provide additional control + over how the conversion is done. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + Returns + ------- + numpy.ndarray + """ result = np.asarray(self, dtype=dtype) - if copy: + if copy or na_value is not lib._no_default: result = result.copy() if na_value is not lib._no_default: result[self.isna()] = na_value diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index a114be9a21c6c..b6fcd5f935bca 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -421,7 +421,7 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): # ------------------------------------------------------------------------ # Additional Methods - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default): """ Convert the PandasArray to a :class:`numpy.ndarray`. @@ -433,15 +433,21 @@ def to_numpy(self, dtype=None, copy=False): The NumPy dtype to pass to :func:`numpy.asarray`. copy : bool, default False Whether to copy the underlying data. + na_value : Scalar, optional + The missing value to use for missing values. Returns ------- ndarray """ result = np.asarray(self._ndarray, dtype=dtype) - if copy and result is self._ndarray: + + if (copy or na_value is not lib._no_default) and result is self._ndarray: result = result.copy() + if na_value is not lib._no_default: + result[self.isna()] = na_value + return result @Appender(ExtensionArray.searchsorted.__doc__) diff --git a/pandas/core/base.py b/pandas/core/base.py index d685f5495a80b..37871e55df3f4 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -890,8 +890,10 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, **kwargs): result = np.asarray(self._values, dtype=dtype) # TODO(GH-24345): Avoid potential double copy - if copy: + if copy or na_value is not lib._no_default: result = result.copy() + if na_value is not lib._no_default: + result[self.isna()] = na_value return result @property diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index e9c64d04ec860..a642a60da8ff5 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -758,3 +758,32 @@ def test_invalid_nat_setitem_array(array, non_casting_nats): for nat in non_casting_nats: with pytest.raises(TypeError): array[0] = nat + + +@pytest.mark.parametrize( + "array", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("2000", periods=4).array, + ], +) +def test_to_numpy_extra(array): + array[0] = pd.NaT + original = array.copy() + + result = array.to_numpy() + assert np.isnan(result[0]) + + result = array.to_numpy(dtype="int64") + assert result[0] == -9223372036854775808 + + result = array.to_numpy(dtype="int64", na_value=0) + assert result[0] == 0 + + result = array.to_numpy(na_value=array[1].to_numpy()) + assert result[0] == result[1] + + result = array.to_numpy(na_value=array[1].to_numpy(copy=False)) + assert result[0] == result[1] + + tm.assert_equal(array, original) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 4295d89869a72..02d0d78063d3f 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -401,3 +401,36 @@ def test_to_numpy_dtype(as_series): result = obj.to_numpy(dtype="M8[ns]") expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype, na_value, expected", + [ + ([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]), + ( + [pd.Timestamp("2000"), pd.Timestamp("2000"), pd.NaT], + None, + pd.Timestamp("2000"), + [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + ), + ], +) +@pytest.mark.parametrize("container", [pd.Series, pd.Index]) +def test_to_numpy_na_value_numpy_dtype(container, values, dtype, na_value, expected): + s = container(values) + result = s.to_numpy(dtype=dtype, na_value=na_value) + expected = np.array(expected) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_kwargs_raises(): + # numpy + s = pd.Series([1, 2, 3]) + match = r"to_numpy\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) + + # extension + s = pd.Series([1, 2, 3], dtype="Int64") + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 7146443bf8de5..58859fc6ac54c 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas as pd from pandas.core.internals import ObjectBlock @@ -21,3 +23,12 @@ def test_astype_str(self, data): result = pd.Series(data[:5]).astype(str) expected = pd.Series(data[:5].astype(str)) self.assert_series_equal(result, expected) + + def test_to_numpy(self, data): + expected = np.asarray(data) + + result = data.to_numpy() + self.assert_equal(result, expected) + + result = pd.Series(data).to_numpy() + self.assert_equal(result, expected) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 570cdf5f29d00..b67703c7f80e0 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -5,6 +5,8 @@ import numpy as np +from pandas._libs import lib + from pandas.core.dtypes.base import ExtensionDtype import pandas as pd @@ -84,6 +86,12 @@ def _from_factorized(cls, values, original): _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default, decimals=None): + result = np.asarray(self, dtype=dtype) + if decimals is not None: + result = np.asarray([round(x, decimals) for x in result]) + return result + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # if not all( diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index b5c3abd8ce8f6..f1b1f88ed2884 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -499,3 +499,17 @@ def DecimalArray__array__(self, dtype=None): df[s > 0.5] s.at[0] df.at[0, "a"] + + +def test_to_numpy_keyword(): + # test the extra keyword + values = [decimal.Decimal("1.1111"), decimal.Decimal("2.2222")] + expected = np.array( + [decimal.Decimal("1.11"), decimal.Decimal("2.22")], dtype="object" + ) + a = pd.array(values, dtype="decimal") + result = a.to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(a).to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) From e8460ace298f834ce9eb9884d7d65a03cc337674 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Jan 2020 13:13:34 -0600 Subject: [PATCH 07/13] update --- pandas/core/frame.py | 15 ++++++++++++++- pandas/tests/frame/test_api.py | 16 ++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 65b315167bd58..7597a5539083d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1242,7 +1242,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default): """ Convert the DataFrame to a NumPy array. @@ -1264,6 +1264,12 @@ def to_numpy(self, dtype=None, copy=False): ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + .. versionadded:: 1.0.0 + Returns ------- numpy.ndarray @@ -1295,6 +1301,13 @@ def to_numpy(self, dtype=None, copy=False): [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ result = np.array(self.values, dtype=dtype, copy=copy) + if na_value is not lib._no_default: + if not copy: + # copy even if not requested. This may be unnecessary + # if NumPy already copied. + result = result.copy() + + result[self.isna()] = na_value return result def to_dict(self, orient="dict", into=dict): diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index f6713d703e112..b7f79bed9b99c 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -357,6 +357,22 @@ def test_to_numpy_copy(self): assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is None + def test_to_numpy_na_values(self): + df = pd.DataFrame({"A": [1, None, 3]}) + original = df.copy() + result = df.to_numpy(na_value=0) + expected = np.array([[1.0, 0.0, 3.0]]).T + tm.assert_numpy_array_equal(result, expected) + tm.assert_frame_equal(df, original) + + def test_to_numpy_na_value_heterogenous(self): + df = pd.DataFrame({"A": [1, None, 3], "B": ["a", None, "c"]}) + original = df.copy() + result = df.to_numpy(na_value=0) + expected = np.array([[1.0, 0.0, 3.0], ["a", 0, "c"]], dtype=object).T + tm.assert_numpy_array_equal(result, expected) + tm.assert_frame_equal(df, original) + def test_transpose(self, float_frame): frame = float_frame dft = frame.T From f1e34c6169d0b5b939aeb5dcb0135f4a3dcc005b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Jan 2020 13:14:47 -0600 Subject: [PATCH 08/13] dataframe --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/tests/frame/test_api.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 56afeb64fef12..765e164e116e9 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -197,7 +197,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`) -- Added the ``na_value`` argument to :meth:`Series.to_numpy` and :meth:`Index.to_numpy` to control the value used for missing data (:issue:`30322`) +- Added the ``na_value`` argument to :meth:`Series.to_numpy`, :meth:`Index.to_numpy` and :meth:`DataFrame.to_numpy` to control the value used for missing data (:issue:`30322`) - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`) - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) - The :ref:`integer dtype ` with support for missing values and the diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index b7f79bed9b99c..7084ef4ef403b 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -365,6 +365,11 @@ def test_to_numpy_na_values(self): tm.assert_numpy_array_equal(result, expected) tm.assert_frame_equal(df, original) + result = df.to_numpy(dtype="int", na_value=0) + expected = np.array([[1, 0, 3]]).T + tm.assert_numpy_array_equal(result, expected) + tm.assert_frame_equal(df, original) + def test_to_numpy_na_value_heterogenous(self): df = pd.DataFrame({"A": [1, None, 3], "B": ["a", None, "c"]}) original = df.copy() From 16247120d27bdacdafa49dffc856a22c862fd1b1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Jan 2020 13:44:49 -0600 Subject: [PATCH 09/13] compat --- pandas/tests/arrays/test_datetimelike.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index a642a60da8ff5..799ac79c145fa 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,6 +4,7 @@ import pytest from pandas._libs import OutOfBoundsDatetime +from pandas.compat.numpy import _np_version_under1p18 import pandas as pd from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray @@ -768,11 +769,17 @@ def test_invalid_nat_setitem_array(array, non_casting_nats): ], ) def test_to_numpy_extra(array): + if _np_version_under1p18: + # np.isnan(NaT) raises, so use pandas' + isnan = pd.isna + else: + isnan = np.isnan + array[0] = pd.NaT original = array.copy() result = array.to_numpy() - assert np.isnan(result[0]) + assert isnan(result[0]) result = array.to_numpy(dtype="int64") assert result[0] == -9223372036854775808 From 34307ca7226135f186a028c3cf1ebbc57c0ddad0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Jan 2020 14:23:46 -0600 Subject: [PATCH 10/13] typing --- pandas/tests/base/test_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 02d0d78063d3f..3fb5cce401776 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -415,7 +415,7 @@ def test_to_numpy_dtype(as_series): ), ], ) -@pytest.mark.parametrize("container", [pd.Series, pd.Index]) +@pytest.mark.parametrize("container", [pd.Series, pd.Index]) # type: ignore def test_to_numpy_na_value_numpy_dtype(container, values, dtype, na_value, expected): s = container(values) result = s.to_numpy(dtype=dtype, na_value=na_value) From afc7350f321b957c10a536c527c9c5b6f510bdb8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jan 2020 11:10:42 -0600 Subject: [PATCH 11/13] fixups --- doc/source/reference/extensions.rst | 5 +++++ pandas/_libs/lib.pyx | 3 ++- pandas/api/extensions/__init__.py | 2 ++ pandas/core/arrays/boolean.py | 2 +- pandas/core/arrays/numpy_.py | 18 ------------------ pandas/core/frame.py | 15 +-------------- 6 files changed, 11 insertions(+), 34 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 16a84b5d2ecaf..86672f3baa6f3 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -67,3 +67,8 @@ behaves correctly. :toctree: api/ api.indexers.check_bool_array_indexer + + +The sentinel ``pandas.api.extensions._no_default`` is used as the default +value in some methods. Use an ``is`` comparisoin to check if the user +provides a non-default value. diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 089a7a04abb63..f88989b5e8d0e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2232,7 +2232,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects -_no_default = object() +# Note: _no_default is exported to the public API in pandas.api.extensions +_no_default = object() #: Sentinel indicating the default value. @cython.boundscheck(False) diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 573d700dac43d..1f782e10396e3 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -1,4 +1,6 @@ """Public API for extending pandas objects.""" +from pandas._libs.lib import _no_default # noqa: F401 + from pandas.core.dtypes.dtypes import ( # noqa: F401 ExtensionDtype, register_extension_dtype, diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 8b25ac0095fee..9ef1c4b1bbb1c 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -317,7 +317,7 @@ def __getitem__(self, item): return type(self)(self._data[item], self._mask[item]) def to_numpy( - self, dtype=None, copy=False, na_value: "Scalar" = lib._no_default, **kwargs + self, dtype=None, copy=False, na_value: "Scalar" = lib._no_default, ): """ Convert to a NumPy Array. diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index b6fcd5f935bca..ec6f9278f6bf7 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -422,24 +422,6 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): # ------------------------------------------------------------------------ # Additional Methods def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default): - """ - Convert the PandasArray to a :class:`numpy.ndarray`. - - By default, this requires no coercion or copying of data. - - Parameters - ---------- - dtype : numpy.dtype - The NumPy dtype to pass to :func:`numpy.asarray`. - copy : bool, default False - Whether to copy the underlying data. - na_value : Scalar, optional - The missing value to use for missing values. - - Returns - ------- - ndarray - """ result = np.asarray(self._ndarray, dtype=dtype) if (copy or na_value is not lib._no_default) and result is self._ndarray: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6bb5f90618baf..ba0c0e7d66b1d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1242,7 +1242,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default): + def to_numpy(self, dtype=None, copy=False): """ Convert the DataFrame to a NumPy array. @@ -1264,12 +1264,6 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default): ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. - na_value : Any, optional - The value to use for missing values. The default value depends - on `dtype` and the type of the array. - - .. versionadded:: 1.0.0 - Returns ------- numpy.ndarray @@ -1301,13 +1295,6 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib._no_default): [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ result = np.array(self.values, dtype=dtype, copy=copy) - if na_value is not lib._no_default: - if not copy: - # copy even if not requested. This may be unnecessary - # if NumPy already copied. - result = result.copy() - - result[self.isna()] = na_value return result def to_dict(self, orient="dict", into=dict): From 008b54f086df4e241bfb795fc4d1143c7bdf72e1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jan 2020 12:13:33 -0600 Subject: [PATCH 12/13] remove unused test --- pandas/tests/frame/test_api.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index d9c2b26bf2707..26d6a917fe1ca 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -358,27 +358,6 @@ def test_to_numpy_copy(self): assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is None - def test_to_numpy_na_values(self): - df = pd.DataFrame({"A": [1, None, 3]}) - original = df.copy() - result = df.to_numpy(na_value=0) - expected = np.array([[1.0, 0.0, 3.0]]).T - tm.assert_numpy_array_equal(result, expected) - tm.assert_frame_equal(df, original) - - result = df.to_numpy(dtype="int", na_value=0) - expected = np.array([[1, 0, 3]]).T - tm.assert_numpy_array_equal(result, expected) - tm.assert_frame_equal(df, original) - - def test_to_numpy_na_value_heterogenous(self): - df = pd.DataFrame({"A": [1, None, 3], "B": ["a", None, "c"]}) - original = df.copy() - result = df.to_numpy(na_value=0) - expected = np.array([[1.0, 0.0, 3.0], ["a", 0, "c"]], dtype=object).T - tm.assert_numpy_array_equal(result, expected) - tm.assert_frame_equal(df, original) - def test_transpose(self, float_frame): frame = float_frame dft = frame.T From 91a46395f6c6304d7b8eb4691218eb1c7979ae36 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Jan 2020 20:31:22 +0100 Subject: [PATCH 13/13] Update doc/source/reference/extensions.rst --- doc/source/reference/extensions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 86672f3baa6f3..374e1395b42f7 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -70,5 +70,5 @@ behaves correctly. The sentinel ``pandas.api.extensions._no_default`` is used as the default -value in some methods. Use an ``is`` comparisoin to check if the user +value in some methods. Use an ``is`` comparison to check if the user provides a non-default value.