Skip to content

ENH: Add na_value argument to DataFrame.to_numpy #33857

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
May 13, 2020
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
054d74e
ENH: Add na_value argument to DataFrame.to_numpy
dsaxton Apr 29, 2020
cafbf5f
Add some tests
dsaxton Apr 29, 2020
34b9b9f
Issue num
dsaxton Apr 30, 2020
9a2bbd6
A little better
dsaxton Apr 30, 2020
f7dc246
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton Apr 30, 2020
5eb8bb2
as_array
dsaxton May 1, 2020
ec2f729
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 1, 2020
b48b6c9
Black
dsaxton May 1, 2020
d1a60e8
More black
dsaxton May 1, 2020
09fdf51
to_numpy for ExtensionBlock
dsaxton May 2, 2020
f5db15a
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 2, 2020
89e8930
dtype hack
dsaxton May 2, 2020
d24b976
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 2, 2020
bec3889
reshape
dsaxton May 3, 2020
a20f116
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 3, 2020
02405a1
Check for NA first
dsaxton May 3, 2020
055413f
Cast non-extension single block
dsaxton May 3, 2020
ae088e4
Test nit
dsaxton May 3, 2020
d78ba29
Update pandas/core/frame.py
dsaxton May 4, 2020
9c87e00
side effect test
dsaxton May 4, 2020
df5b683
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 4, 2020
ae2b34a
Update pandas/tests/base/test_conversion.py
dsaxton May 4, 2020
bcb69c5
Copy
dsaxton May 4, 2020
c3a7a55
Copy less
dsaxton May 4, 2020
b5ec43f
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 4, 2020
f3e45d7
should_copy
dsaxton May 5, 2020
e54cc28
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 5, 2020
491a5ae
Update pandas/core/internals/managers.py
dsaxton May 5, 2020
4ecccff
Rename and comment
dsaxton May 9, 2020
142c808
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 9, 2020
8d42fd4
Don't special case datetimetz
dsaxton May 9, 2020
c2228bf
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 12, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ Other enhancements
such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`)
- :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`)
- :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`)
-
- :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`)

.. ---------------------------------------------------------------------------

Expand Down
17 changes: 15 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1284,7 +1284,9 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra

return cls(data, index=index, columns=columns, dtype=dtype)

def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
def to_numpy(
self, dtype=None, copy: bool = False, na_value=lib.no_default
) -> np.ndarray:
"""
Convert the DataFrame to a NumPy array.

Expand All @@ -1305,6 +1307,11 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
another array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.
na_value : Any, optional
The value to use for missing values. The default value depends
on `dtype` and the dtypes of the DataFrame columns.

.. versionadded:: 1.1.0

Returns
-------
Expand Down Expand Up @@ -1336,7 +1343,13 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
"""
result = np.array(self.values, dtype=dtype, copy=copy)
result = self._mgr.as_array(
transpose=self._AXIS_REVERSED, dtype=dtype, na_value=na_value
)

if copy:
result = result.copy()

return result

def to_dict(self, orient="dict", into=dict):
Expand Down
38 changes: 32 additions & 6 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,14 +781,20 @@ def copy_func(ax):
res.axes = new_axes
return res

def as_array(self, transpose: bool = False) -> np.ndarray:
def as_array(
self, transpose: bool = False, dtype=None, na_value=lib.no_default
) -> np.ndarray:
"""
Convert the blockmanager data into a NumPy array.

Parameters
----------
transpose : bool, default False
If True, transpose the return array,
If True, transpose the return array.
dtype : object, default None
Data type of the return array.
na_value : object, default lib.no_default
Value to be used as the missing value sentinel.

Returns
-------
Expand All @@ -803,19 +809,34 @@ def as_array(self, transpose: bool = False) -> np.ndarray:
# always be object dtype. Some callers seem to want the
# DatetimeArray (previously DTI)
arr = self.blocks[0].get_values(dtype=object)
elif self._is_single_block and self.blocks[0].is_extension:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might be able to remove the block above (is_datetimetz), as that is already an extension block.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, the DatetimeArray.to_numpy behaviour seems correct for the default (having object dtype with timestamps instead of datetime64)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice 👍

# Avoid implicit conversion of extension blocks to object
arr = (
self.blocks[0]
.values.to_numpy(dtype=dtype, na_value=na_value)
.reshape(self.blocks[0].shape)
)
elif self._is_single_block or not self.is_mixed_type:
arr = np.asarray(self.blocks[0].get_values())
if dtype:
arr = arr.astype(dtype)
else:
arr = self._interleave()
arr = self._interleave(dtype=dtype, na_value=na_value)

if na_value is not lib.no_default:
na_mask = isna(arr)
if na_mask.any():
arr[na_mask] = na_value

return arr.transpose() if transpose else arr

def _interleave(self) -> np.ndarray:
def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray:
"""
Return ndarray from blocks with specified item order
Items must be contained in the blocks
"""
dtype = _interleaved_dtype(self.blocks)
if not dtype:
dtype = _interleaved_dtype(self.blocks)

# TODO: https://github.com/pandas-dev/pandas/issues/22791
# Give EAs some input on what happens here. Sparse needs this.
Expand All @@ -830,7 +851,12 @@ def _interleave(self) -> np.ndarray:

for blk in self.blocks:
rl = blk.mgr_locs
result[rl.indexer] = blk.get_values(dtype)
if blk.is_extension:
# Avoid implicit conversion of extension blocks to object
arr = blk.values.to_numpy(dtype=dtype, na_value=na_value)
else:
arr = blk.get_values(dtype)
result[rl.indexer] = arr
itemmask[rl.indexer] = 1

if not itemmask.all():
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/base/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,3 +407,48 @@ def test_to_numpy_kwargs_raises():
s = pd.Series([1, 2, 3], dtype="Int64")
with pytest.raises(TypeError, match=msg):
s.to_numpy(foo=True)


@pytest.mark.parametrize(
    "data",
    [
        {"a": [1, 2, 3], "b": [1, 2, None]},
        {"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])},
        {"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])},
    ],
)
@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)])
def test_to_numpy_dataframe_na_value(data, dtype, na_value):
    """
    Check that ``DataFrame.to_numpy`` fills the missing entry with ``na_value``.

    Each ``data`` case has one missing value in the last row of column "b",
    supplied as a list, a NumPy array, or a pandas array. The result is
    compared against an array built with the requested ``dtype`` whose
    last-row, second-column entry is ``na_value``.
    """
    # https://github.com/pandas-dev/pandas/issues/33820
    df = pd.DataFrame(data)
    result = df.to_numpy(dtype=dtype, na_value=na_value)
    expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype)
    tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize(
    "data, expected",
    [
        (
            {"a": pd.array([1, 2, None])},
            np.array([[1.0], [2.0], [np.nan]], dtype=float),
        ),
        (
            {"a": [1, 2, 3], "b": [1, 2, 3]},
            np.array([[1, 1], [2, 2], [3, 3]], dtype=float),
        ),
    ],
)
def test_to_numpy_dataframe_single_block(data, expected):
    """
    Check ``DataFrame.to_numpy(dtype=float, na_value=np.nan)`` on two frames.

    The first case is a single column built from a pandas array containing a
    missing value; the second is two integer columns with no missing values.
    Both must come back as the float array given in ``expected``.
    """
    # https://github.com/pandas-dev/pandas/issues/33820
    df = pd.DataFrame(data)
    result = df.to_numpy(dtype=float, na_value=np.nan)
    tm.assert_numpy_array_equal(result, expected)


def test_to_numpy_dataframe_single_block_no_mutate():
    """
    Check that ``to_numpy`` with ``na_value`` does not modify the source frame.

    The frame is converted with ``na_value=0.0`` and the returned array is
    discarded; the frame itself must still equal a freshly built copy that
    keeps its NaN.
    """
    # https://github.com/pandas-dev/pandas/issues/33820
    df = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
    expected = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
    # Return value intentionally ignored: only the side effect on df matters.
    df.to_numpy(dtype=float, na_value=0.0)
    tm.assert_frame_equal(df, expected)