diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9f0aaecacd383..34f6e2359a054 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -224,6 +224,7 @@ Other enhancements such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`) - :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) - :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`) +- :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`) - The ``ExtensionArray`` class has now an :meth:`~pandas.arrays.ExtensionArray.equals` method, similarly to :meth:`Series.equals` (:issue:`27081`). - The minimum suppported dta version has increased to 105 in :meth:`~pandas.io.stata.read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`). diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 445d168ff875d..31015e3095e7d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1280,7 +1280,9 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray: + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -1301,6 +1303,11 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray: another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. 
+ na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the dtypes of the DataFrame columns. + + .. versionadded:: 1.1.0 Returns ------- @@ -1332,7 +1339,10 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray: array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ - result = np.array(self.values, dtype=dtype, copy=copy) + result = self._mgr.as_array( + transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value + ) + return result def to_dict(self, orient="dict", into=dict): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3b88edabe9eb0..4f6d84e52ea54 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -781,14 +781,28 @@ def copy_func(ax): res.axes = new_axes return res - def as_array(self, transpose: bool = False) -> np.ndarray: + def as_array( + self, + transpose: bool = False, + dtype=None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: """ Convert the blockmanager data into an numpy array. Parameters ---------- transpose : bool, default False - If True, transpose the return array, + If True, transpose the return array. + dtype : object, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. Returns ------- @@ -798,24 +812,41 @@ def as_array(self, transpose: bool = False) -> np.ndarray: arr = np.empty(self.shape, dtype=float) return arr.transpose() if transpose else arr - if self._is_single_block and self.blocks[0].is_datetimetz: - # TODO(Block.get_values): Make DatetimeTZBlock.get_values - # always be object dtype. 
Some callers seem to want the - # DatetimeArray (previously DTI) - arr = self.blocks[0].get_values(dtype=object) + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + + if self._is_single_block and self.blocks[0].is_extension: + # Avoid implicit conversion of extension blocks to object + arr = ( + self.blocks[0] + .values.to_numpy(dtype=dtype, na_value=na_value) + .reshape(self.blocks[0].shape) + ) elif self._is_single_block or not self.is_mixed_type: arr = np.asarray(self.blocks[0].get_values()) + if dtype is not None: + arr = arr.astype(dtype, copy=False) else: - arr = self._interleave() + arr = self._interleave(dtype=dtype, na_value=na_value) + # The underlying data was copied within _interleave + copy = False + + if copy: + arr = arr.copy() + + if na_value is not lib.no_default: + arr[isna(arr)] = na_value return arr.transpose() if transpose else arr - def _interleave(self) -> np.ndarray: + def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray: """ Return ndarray from blocks with specified item order Items must be contained in the blocks """ - dtype = _interleaved_dtype(self.blocks) + if dtype is None: + dtype = _interleaved_dtype(self.blocks) # TODO: https://github.com/pandas-dev/pandas/issues/22791 # Give EAs some input on what happens here. Sparse needs this. 
@@ -830,7 +861,12 @@ def _interleave(self) -> np.ndarray: for blk in self.blocks: rl = blk.mgr_locs - result[rl.indexer] = blk.get_values(dtype) + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + arr = blk.values.to_numpy(dtype=dtype, na_value=na_value) + else: + arr = blk.get_values(dtype) + result[rl.indexer] = arr itemmask[rl.indexer] = 1 if not itemmask.all(): diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index e737a09b2ec6d..b688a048cbe8e 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -407,3 +407,48 @@ def test_to_numpy_kwargs_raises(): s = pd.Series([1, 2, 3], dtype="Int64") with pytest.raises(TypeError, match=msg): s.to_numpy(foo=True) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [1, 2, None]}, + {"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])}, + {"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])}, + ], +) +@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)]) +def test_to_numpy_dataframe_na_value(data, dtype, na_value): + # https://github.com/pandas-dev/pandas/issues/33820 + df = pd.DataFrame(data) + result = df.to_numpy(dtype=dtype, na_value=na_value) + expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "data, expected", + [ + ( + {"a": pd.array([1, 2, None])}, + np.array([[1.0], [2.0], [np.nan]], dtype=float), + ), + ( + {"a": [1, 2, 3], "b": [1, 2, 3]}, + np.array([[1, 1], [2, 2], [3, 3]], dtype=float), + ), + ], +) +def test_to_numpy_dataframe_single_block(data, expected): + # https://github.com/pandas-dev/pandas/issues/33820 + df = pd.DataFrame(data) + result = df.to_numpy(dtype=float, na_value=np.nan) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_dataframe_single_block_no_mutate(): + # https://github.com/pandas-dev/pandas/issues/33820 + result = 
pd.DataFrame(np.array([1.0, 2.0, np.nan])) + expected = pd.DataFrame(np.array([1.0, 2.0, np.nan])) + result.to_numpy(na_value=0.0) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 5cf74d3205a13..2b79fc8cd3406 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -365,7 +365,7 @@ def test_to_numpy_copy(self): df = pd.DataFrame(arr) assert df.values.base is arr assert df.to_numpy(copy=False).base is arr - assert df.to_numpy(copy=True).base is None + assert df.to_numpy(copy=True).base is not arr def test_swapaxes(self): df = DataFrame(np.random.randn(10, 5))