Skip to content

ENH: Add na_value argument to DataFrame.to_numpy #33857

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
May 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
054d74e
ENH: Add na_value argument to DataFrame.to_numpy
dsaxton Apr 29, 2020
cafbf5f
Add some tests
dsaxton Apr 29, 2020
34b9b9f
Issue num
dsaxton Apr 30, 2020
9a2bbd6
A little better
dsaxton Apr 30, 2020
f7dc246
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton Apr 30, 2020
5eb8bb2
as_array
dsaxton May 1, 2020
ec2f729
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 1, 2020
b48b6c9
Black
dsaxton May 1, 2020
d1a60e8
More black
dsaxton May 1, 2020
09fdf51
to_numpy for ExtensionBlock
dsaxton May 2, 2020
f5db15a
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 2, 2020
89e8930
dtype hack
dsaxton May 2, 2020
d24b976
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 2, 2020
bec3889
reshape
dsaxton May 3, 2020
a20f116
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 3, 2020
02405a1
Check for NA first
dsaxton May 3, 2020
055413f
Cast non-extension single block
dsaxton May 3, 2020
ae088e4
Test nit
dsaxton May 3, 2020
d78ba29
Update pandas/core/frame.py
dsaxton May 4, 2020
9c87e00
side effect test
dsaxton May 4, 2020
df5b683
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 4, 2020
ae2b34a
Update pandas/tests/base/test_conversion.py
dsaxton May 4, 2020
bcb69c5
Copy
dsaxton May 4, 2020
c3a7a55
Copy less
dsaxton May 4, 2020
b5ec43f
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 4, 2020
f3e45d7
should_copy
dsaxton May 5, 2020
e54cc28
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 5, 2020
491a5ae
Update pandas/core/internals/managers.py
dsaxton May 5, 2020
4ecccff
Rename and comment
dsaxton May 9, 2020
142c808
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 9, 2020
8d42fd4
Don't special case datetimetz
dsaxton May 9, 2020
c2228bf
Merge remote-tracking branch 'upstream/master' into dataframe-to-numpy
dsaxton May 12, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ Other enhancements
such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`)
- :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`)
- :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`)
- :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`)
- The ``ExtensionArray`` class now has an :meth:`~pandas.arrays.ExtensionArray.equals`
method, similar to :meth:`Series.equals` (:issue:`27081`).
- The minimum supported dta version has increased to 105 in :meth:`~pandas.io.stata.read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`).
Expand Down
14 changes: 12 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1280,7 +1280,9 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra

return cls(data, index=index, columns=columns, dtype=dtype)

def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
def to_numpy(
self, dtype=None, copy: bool = False, na_value=lib.no_default
) -> np.ndarray:
"""
Convert the DataFrame to a NumPy array.

Expand All @@ -1301,6 +1303,11 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
another array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.
na_value : Any, optional
The value to use for missing values. The default value depends
on `dtype` and the dtypes of the DataFrame columns.

.. versionadded:: 1.1.0

Returns
-------
Expand Down Expand Up @@ -1332,7 +1339,10 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
"""
result = np.array(self.values, dtype=dtype, copy=copy)
result = self._mgr.as_array(
transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value
)

return result

def to_dict(self, orient="dict", into=dict):
Expand Down
58 changes: 47 additions & 11 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,14 +781,28 @@ def copy_func(ax):
res.axes = new_axes
return res

def as_array(self, transpose: bool = False) -> np.ndarray:
def as_array(
self,
transpose: bool = False,
dtype=None,
copy: bool = False,
na_value=lib.no_default,
) -> np.ndarray:
"""
Convert the blockmanager data into a numpy array.

Parameters
----------
transpose : bool, default False
If True, transpose the return array,
If True, transpose the return array.
dtype : object, default None
Data type of the return array.
copy : bool, default False
If True then guarantee that a copy is returned. A value of
False does not guarantee that the underlying data is not
copied.
na_value : object, default lib.no_default
Value to be used as the missing value sentinel.

Returns
-------
Expand All @@ -798,24 +812,41 @@ def as_array(self, transpose: bool = False) -> np.ndarray:
arr = np.empty(self.shape, dtype=float)
return arr.transpose() if transpose else arr

if self._is_single_block and self.blocks[0].is_datetimetz:
# TODO(Block.get_values): Make DatetimeTZBlock.get_values
# always be object dtype. Some callers seem to want the
# DatetimeArray (previously DTI)
arr = self.blocks[0].get_values(dtype=object)
# We want to copy when na_value is provided to avoid
# mutating the original object
copy = copy or na_value is not lib.no_default

if self._is_single_block and self.blocks[0].is_extension:
# Avoid implicit conversion of extension blocks to object
arr = (
self.blocks[0]
.values.to_numpy(dtype=dtype, na_value=na_value)
.reshape(self.blocks[0].shape)
)
elif self._is_single_block or not self.is_mixed_type:
arr = np.asarray(self.blocks[0].get_values())
if dtype:
arr = arr.astype(dtype, copy=False)
else:
arr = self._interleave()
arr = self._interleave(dtype=dtype, na_value=na_value)
# The underlying data was copied within _interleave
copy = False

if copy:
arr = arr.copy()

if na_value is not lib.no_default:
arr[isna(arr)] = na_value

return arr.transpose() if transpose else arr

def _interleave(self) -> np.ndarray:
def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray:
"""
Return ndarray from blocks with specified item order
Items must be contained in the blocks
"""
dtype = _interleaved_dtype(self.blocks)
if not dtype:
dtype = _interleaved_dtype(self.blocks)

# TODO: https://github.com/pandas-dev/pandas/issues/22791
# Give EAs some input on what happens here. Sparse needs this.
Expand All @@ -830,7 +861,12 @@ def _interleave(self) -> np.ndarray:

for blk in self.blocks:
rl = blk.mgr_locs
result[rl.indexer] = blk.get_values(dtype)
if blk.is_extension:
# Avoid implicit conversion of extension blocks to object
arr = blk.values.to_numpy(dtype=dtype, na_value=na_value)
else:
arr = blk.get_values(dtype)
result[rl.indexer] = arr
itemmask[rl.indexer] = 1

if not itemmask.all():
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/base/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,3 +407,48 @@ def test_to_numpy_kwargs_raises():
s = pd.Series([1, 2, 3], dtype="Int64")
with pytest.raises(TypeError, match=msg):
s.to_numpy(foo=True)


@pytest.mark.parametrize(
    "data",
    [
        {"a": [1, 2, 3], "b": [1, 2, None]},
        {"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])},
        {"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])},
    ],
)
@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)])
def test_to_numpy_dataframe_na_value(data, dtype, na_value):
    """
    Check that ``DataFrame.to_numpy`` honours ``na_value`` together with ``dtype``.

    Each ``data`` case has a complete column ``a`` and a column ``b`` whose
    last entry is missing; the converted array must carry ``na_value`` in
    that slot and have the requested ``dtype``.
    """
    # https://github.com/pandas-dev/pandas/issues/33820
    frame = pd.DataFrame(data)
    expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype)
    actual = frame.to_numpy(dtype=dtype, na_value=na_value)
    tm.assert_numpy_array_equal(actual, expected)


@pytest.mark.parametrize(
    "data, expected",
    [
        (
            {"a": pd.array([1, 2, None])},
            np.array([[1.0], [2.0], [np.nan]], dtype=float),
        ),
        (
            {"a": [1, 2, 3], "b": [1, 2, 3]},
            np.array([[1, 1], [2, 2], [3, 3]], dtype=float),
        ),
    ],
)
def test_to_numpy_dataframe_single_block(data, expected):
    """
    Check float conversion with ``na_value=np.nan`` on the parametrized frames.

    The cases are a one-column frame built from ``pd.array`` containing a
    missing value, and a two-column frame of plain integer lists; both must
    come back as a float array equal to ``expected``.
    """
    # https://github.com/pandas-dev/pandas/issues/33820
    actual = pd.DataFrame(data).to_numpy(dtype=float, na_value=np.nan)
    tm.assert_numpy_array_equal(actual, expected)


def test_to_numpy_dataframe_single_block_no_mutate():
    """
    Check that filling NA through ``na_value`` does not modify the source frame.

    The returned array is also checked so the test cannot pass vacuously:
    if ``na_value`` were silently ignored, the frame would trivially remain
    unchanged and the no-mutation assertion alone would prove nothing.
    """
    # https://github.com/pandas-dev/pandas/issues/33820
    df = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
    expected = pd.DataFrame(np.array([1.0, 2.0, np.nan]))

    filled = df.to_numpy(na_value=0.0)

    # The NaN must have been replaced in the output array ...
    tm.assert_numpy_array_equal(filled, np.array([[1.0], [2.0], [0.0]]))
    # ... while the frame it was produced from still holds the NaN.
    tm.assert_frame_equal(df, expected)
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def test_to_numpy_copy(self):
df = pd.DataFrame(arr)
assert df.values.base is arr
assert df.to_numpy(copy=False).base is arr
assert df.to_numpy(copy=True).base is None
assert df.to_numpy(copy=True).base is not arr
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This had to change because the output array is now a view, but only of a transposed copy of the original data (the copy is made first, then transposed), so its ``.base`` is that copy rather than ``None``.


def test_swapaxes(self):
df = DataFrame(np.random.randn(10, 5))
Expand Down