From 054d74eee1556fadadc4658e3ceb2a4dbde77c98 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 28 Apr 2020 20:25:23 -0500 Subject: [PATCH 01/22] ENH: Add na_value argument to DataFrame.to_numpy --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/frame.py | 13 ++++++++++++- pandas/tests/base/test_conversion.py | 10 ++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 949109d9bb75a..f780aa94d1ba7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -149,7 +149,7 @@ Other enhancements such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`) - :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) - :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`) -- +- :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4b4801f4e8c58..8e138a47a2009 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1284,7 +1284,9 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray: + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -1305,6 +1307,11 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray: another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + .. versionadded:: 1.1.0 Returns ------- @@ -1337,6 +1344,10 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray: [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ result = np.array(self.values, dtype=dtype, copy=copy) + + if na_value is not lib.no_default: + result[isna(result)] = na_value + return result def to_dict(self, orient="dict", into=dict): diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 59f9103072fe9..d4a5e7b599ca5 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -410,3 +410,13 @@ def test_to_numpy_kwargs_raises(): s = pd.Series([1, 2, 3], dtype="Int64") with pytest.raises(TypeError, match=msg): s.to_numpy(foo=True) + + +@pytest.mark.parametrize( + "dtype, na_value", [(float, np.nan), (object, None), (object, pd.NA)] +) +def test_to_numpy_dataframe_na_value(dtype, na_value): + data = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, None]}) + result = data.to_numpy(dtype=dtype, na_value=na_value) + expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) From cafbf5f1b337a8a0c9cab454f76e72e1450e208c Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 29 Apr 2020 07:37:17 -0500 Subject: [PATCH 02/22] Add some tests --- pandas/tests/base/test_conversion.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index d4a5e7b599ca5..1214d6a148c49 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -412,11 +412,19 @@ def test_to_numpy_kwargs_raises(): s.to_numpy(foo=True) +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [1, 2, None]}, + {"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])}, + {"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])}, + ], +) @pytest.mark.parametrize( "dtype, na_value", [(float, np.nan), (object, None), (object, pd.NA)] ) -def test_to_numpy_dataframe_na_value(dtype, na_value): - data = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, None]}) - result = data.to_numpy(dtype=dtype, na_value=na_value) +def test_to_numpy_dataframe_na_value(data, dtype, na_value): + df = pd.DataFrame(data) + result = df.to_numpy(dtype=dtype, na_value=na_value) expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype) tm.assert_numpy_array_equal(result, expected) From 34b9b9f16ea219c2a7e2fb3e8d5ab818859bdfe6 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 30 Apr 2020 11:18:44 -0500 Subject: [PATCH 03/22] Issue num --- pandas/tests/base/test_conversion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 1214d6a148c49..34daa80249523 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -424,6 +424,7 @@ def test_to_numpy_kwargs_raises(): "dtype, na_value", [(float, np.nan), (object, None), (object, pd.NA)] ) def test_to_numpy_dataframe_na_value(data, dtype, na_value): + # https://github.com/pandas-dev/pandas/issues/33820 df = pd.DataFrame(data) result = df.to_numpy(dtype=dtype, na_value=na_value) expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype) From 9a2bbd6bd4571f4cd4b69a43186de40d70e1390d Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 30 Apr 2020 11:50:13 -0500 Subject: [PATCH 04/22] A little better --- pandas/core/frame.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8e138a47a2009..601e6b3359242 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1285,7 +1285,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra return cls(data, index=index, columns=columns, dtype=dtype) def to_numpy( - self, dtype=None, copy: bool = False, na_value=lib.no_default + self, dtype=None, copy: bool = False, na_value=lib.no_default, **kwargs ) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -1343,10 +1343,11 @@ def to_numpy( array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ - result = np.array(self.values, dtype=dtype, copy=copy) - - if na_value is not lib.no_default: - result[isna(result)] = na_value + arr = [ + self[c].to_numpy(dtype=dtype, na_value=na_value, **kwargs) + for c in self.columns + ] + result = np.array(arr).T return result From 5eb8bb2ebe19df616eae2143bccf3afd594fd1b9 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 1 May 2020 14:17:03 -0500 Subject: [PATCH 05/22] as_array --- pandas/core/frame.py | 13 +++++++------ pandas/core/internals/managers.py | 11 +++++++++-- pandas/tests/base/test_conversion.py | 2 +- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2b975a48fec91..93d4f25e7fab0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1285,7 +1285,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra return cls(data, index=index, columns=columns, dtype=dtype) def to_numpy( - self, dtype=None, copy: bool = False, na_value=lib.no_default, **kwargs + self, dtype=None, copy: bool = False, na_value=lib.no_default ) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -1343,11 +1343,12 @@ def to_numpy( array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ - arr = [ - self[c].to_numpy(dtype=dtype, na_value=na_value, **kwargs) - for c in self.columns - ] - result = np.array(arr).T + result = self._mgr.as_array( + transpose=self._AXIS_REVERSED, na_value=na_value + ) + + if copy or dtype: + result = np.array(result, dtype=dtype, copy=copy) return result diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6368a2498b04c..a1b237c392939 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -781,14 +781,18 @@ def copy_func(ax): res.axes = new_axes return res - def as_array(self, transpose: bool = False) -> np.ndarray: + def as_array( + self, transpose: bool = False, na_value=lib.no_default + ) -> np.ndarray: """ Convert the blockmanager data into an numpy array. Parameters ---------- transpose : bool, default False - If True, transpose the return array, + If True, transpose the return array. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. Returns ------- @@ -808,6 +812,9 @@ def as_array(self, transpose: bool = False) -> np.ndarray: else: arr = self._interleave() + if na_value is not lib.no_default: + arr[isna(arr)] = na_value + return arr.transpose() if transpose else arr def _interleave(self) -> np.ndarray: diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 9aa595631a360..dcb65fe180025 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -418,7 +418,7 @@ def test_to_numpy_kwargs_raises(): ], ) @pytest.mark.parametrize( - "dtype, na_value", [(float, np.nan), (object, None), (object, pd.NA)] + "dtype, na_value", [(float, np.nan), (object, None)] ) def test_to_numpy_dataframe_na_value(data, dtype, na_value): # https://github.com/pandas-dev/pandas/issues/33820 From b48b6c9ded61d724f405c13c63ec71b63a610fad Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 1 May 2020 14:39:24 -0500 Subject: [PATCH 06/22] Black --- pandas/tests/base/test_conversion.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index dcb65fe180025..71b331bf34979 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -417,9 +417,7 @@ def test_to_numpy_kwargs_raises(): {"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])}, ], ) -@pytest.mark.parametrize( - "dtype, na_value", [(float, np.nan), (object, None)] -) +@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)]) def test_to_numpy_dataframe_na_value(data, dtype, na_value): # https://github.com/pandas-dev/pandas/issues/33820 df = pd.DataFrame(data) From d1a60e8119ada4dbadd5d1cb60720dc97f3400e0 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 1 May 2020 14:56:47 -0500 Subject: [PATCH 07/22] More black --- pandas/core/frame.py | 4 +--- pandas/core/internals/managers.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a97cd1aeeb604..ae0cdf45fefaf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1343,9 +1343,7 @@ def to_numpy( array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ - result = self._mgr.as_array( - transpose=self._AXIS_REVERSED, na_value=na_value - ) + result = self._mgr.as_array(transpose=self._AXIS_REVERSED, na_value=na_value) if copy or dtype: result = np.array(result, dtype=dtype, copy=copy) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a1b237c392939..d847e1054d2c5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -781,9 +781,7 @@ def copy_func(ax): res.axes = new_axes return res - def as_array( - self, transpose: bool = False, na_value=lib.no_default - ) -> np.ndarray: + def as_array(self, transpose: bool = False, na_value=lib.no_default) -> np.ndarray: """ Convert the blockmanager data into an numpy array. From 09fdf51f2f4012626345d8725f2e359b81c20e82 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 1 May 2020 20:53:58 -0500 Subject: [PATCH 08/22] to_numpy for ExtensionBlock --- pandas/core/frame.py | 8 +++++--- pandas/core/internals/managers.py | 19 ++++++++++++++----- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ae0cdf45fefaf..9c13189aa3837 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1343,10 +1343,12 @@ def to_numpy( array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ - result = self._mgr.as_array(transpose=self._AXIS_REVERSED, na_value=na_value) + result = self._mgr.as_array( + transpose=self._AXIS_REVERSED, dtype=dtype, na_value=na_value + ) - if copy or dtype: - result = np.array(result, dtype=dtype, copy=copy) + if copy: + result = result.copy() return result diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d847e1054d2c5..721e03fb140f4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -781,7 +781,9 @@ def copy_func(ax): res.axes = new_axes return res - def as_array(self, transpose: bool = False, na_value=lib.no_default) -> np.ndarray: + def as_array( + self, transpose: bool = False, dtype=None, na_value=lib.no_default + ) -> np.ndarray: """ Convert the blockmanager data into an numpy array. @@ -789,6 +791,8 @@ def as_array(self, transpose: bool = False, na_value=lib.no_default) -> np.ndarr ---------- transpose : bool, default False If True, transpose the return array. + dtype : object, default None + Data type of the return array. na_value : object, default lib.no_default Value to be used as the missing value sentinel. @@ -808,19 +812,20 @@ def as_array(self, transpose: bool = False, na_value=lib.no_default) -> np.ndarr elif self._is_single_block or not self.is_mixed_type: arr = np.asarray(self.blocks[0].get_values()) else: - arr = self._interleave() + arr = self._interleave(dtype=dtype, na_value=na_value) if na_value is not lib.no_default: arr[isna(arr)] = na_value return arr.transpose() if transpose else arr - def _interleave(self) -> np.ndarray: + def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray: """ Return ndarray from blocks with specified item order Items must be contained in the blocks """ - dtype = _interleaved_dtype(self.blocks) + if not dtype: + dtype = _interleaved_dtype(self.blocks) # TODO: https://github.com/pandas-dev/pandas/issues/22791 # Give EAs some input on what happens here. Sparse needs this. @@ -835,7 +840,11 @@ def _interleave(self) -> np.ndarray: for blk in self.blocks: rl = blk.mgr_locs - result[rl.indexer] = blk.get_values(dtype) + if blk.is_extension: + arr = blk.values.to_numpy(dtype=dtype, na_value=na_value) + else: + arr = blk.get_values(dtype) + result[rl.indexer] = arr itemmask[rl.indexer] = 1 if not itemmask.all(): From 89e8930900a9d3f7f7727530a8c544bfade2167a Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 2 May 2020 09:21:50 -0500 Subject: [PATCH 09/22] dtype hack --- pandas/core/frame.py | 6 ++++-- pandas/tests/base/test_conversion.py | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c29f30fec4302..660a73a51997a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1347,8 +1347,10 @@ def to_numpy( transpose=self._AXIS_REVERSED, dtype=dtype, na_value=na_value ) - if copy: - result = result.copy() + if copy or result.dtype is not dtype: + # FIXME: The dtype conversion is potentially needed since the + # single extension block case is not correctly cast above + result = np.array(result, dtype=dtype, copy=copy) return result diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 71b331bf34979..edc21f2c8c147 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -424,3 +424,11 @@ def test_to_numpy_dataframe_na_value(data, dtype, na_value): result = df.to_numpy(dtype=dtype, na_value=na_value) expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype) tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_dataframe_single_block(): + # https://github.com/pandas-dev/pandas/issues/33820 + df = pd.DataFrame({"a": pd.array([1, 2, None])}) + result = df.to_numpy(dtype=float, na_value=np.nan) + expected = np.array([[1.0], [2.0], [np.nan]], dtype=float) + tm.assert_numpy_array_equal(result, expected) From bec3889b07d5e4eb30d88d274f1f420cc0d33d86 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 3 May 2020 08:51:48 -0500 Subject: [PATCH 10/22] reshape --- pandas/core/frame.py | 6 ++---- pandas/core/internals/managers.py | 8 ++++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 766a4692f74d4..2192d53e93a8c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1347,10 +1347,8 @@ def to_numpy( transpose=self._AXIS_REVERSED, dtype=dtype, na_value=na_value ) - if copy or result.dtype is not dtype: - # FIXME: The dtype conversion is potentially needed since the - # single extension block case is not correctly cast above - result = np.array(result, dtype=dtype, copy=copy) + if copy: + result = result.copy() return result diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 721e03fb140f4..1590cb677e862 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -809,6 +809,13 @@ def as_array( # always be object dtype. Some callers seem to want the # DatetimeArray (previously DTI) arr = self.blocks[0].get_values(dtype=object) + elif self._is_single_block and self.blocks[0].is_extension: + # Avoid implicit conversion of extension blocks to object + arr = ( + self.blocks[0] + .values.to_numpy(dtype=dtype, na_value=na_value) + .reshape(self.blocks[0].shape) + ) elif self._is_single_block or not self.is_mixed_type: arr = np.asarray(self.blocks[0].get_values()) else: @@ -841,6 +848,7 @@ def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray: for blk in self.blocks: rl = blk.mgr_locs if blk.is_extension: + # Avoid implicit conversion of extension blocks to object arr = blk.values.to_numpy(dtype=dtype, na_value=na_value) else: arr = blk.get_values(dtype) From 02405a1972064dab281a4ca6ba506424a8ce2e30 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 3 May 2020 09:12:57 -0500 Subject: [PATCH 11/22] Check for NA first Otherwise this can raise for an all integer DataFrame without any missing values --- pandas/core/internals/managers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1590cb677e862..2ba922d2d2c96 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -822,7 +822,9 @@ def as_array( arr = self._interleave(dtype=dtype, na_value=na_value) if na_value is not lib.no_default: - arr[isna(arr)] = na_value + na_mask = isna(arr) + if na_mask.any(): + arr[na_mask] = na_value return arr.transpose() if transpose else arr From 055413fbcd8c5bfb95f55e338c7de993345ecf66 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 3 May 2020 10:22:27 -0500 Subject: [PATCH 12/22] Cast non-extension single block --- pandas/core/internals/managers.py | 2 ++ pandas/tests/base/test_conversion.py | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2ba922d2d2c96..de1d3a775651c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -818,6 +818,8 @@ def as_array( ) elif self._is_single_block or not self.is_mixed_type: arr = np.asarray(self.blocks[0].get_values()) + if dtype: + arr = arr.astype(dtype) else: arr = self._interleave(dtype=dtype, na_value=na_value) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index edc21f2c8c147..17d68c954211a 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -426,9 +426,22 @@ def test_to_numpy_dataframe_na_value(data, dtype, na_value): tm.assert_numpy_array_equal(result, expected) -def test_to_numpy_dataframe_single_block(): +@pytest.mark.parametrize( + "data, arr", + [ + ( + {"a": pd.array([1, 2, None])}, + np.array([[1.0], [2.0], [np.nan]], dtype=float), + ), + ( + {"a": [1, 2, 3], "b": [1, 2, 3]}, + np.array([[1, 1], [2, 2], [3, 3]], dtype=float), + ), + ], +) +def test_to_numpy_dataframe_single_block(data, arr): # https://github.com/pandas-dev/pandas/issues/33820 - df = pd.DataFrame({"a": pd.array([1, 2, None])}) + df = pd.DataFrame(data) result = df.to_numpy(dtype=float, na_value=np.nan) - expected = np.array([[1.0], [2.0], [np.nan]], dtype=float) + expected = arr tm.assert_numpy_array_equal(result, expected) From ae088e44a3391643db9ff3219131dd65d3f34578 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 3 May 2020 10:54:07 -0500 Subject: [PATCH 13/22] Test nit --- pandas/tests/base/test_conversion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 17d68c954211a..a8e6835becccc 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -427,7 +427,7 @@ def test_to_numpy_dataframe_na_value(data, dtype, na_value): @pytest.mark.parametrize( - "data, arr", + "data, expected", [ ( {"a": pd.array([1, 2, None])}, @@ -439,9 +439,8 @@ def test_to_numpy_dataframe_na_value(data, dtype, na_value): ), ], ) -def test_to_numpy_dataframe_single_block(data, arr): +def test_to_numpy_dataframe_single_block(data, expected): # https://github.com/pandas-dev/pandas/issues/33820 df = pd.DataFrame(data) result = df.to_numpy(dtype=float, na_value=np.nan) - expected = arr tm.assert_numpy_array_equal(result, expected) From d78ba299e3815d5d68fdcd638be2782c8c5cfd10 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 4 May 2020 07:10:05 -0500 Subject: [PATCH 14/22] Update pandas/core/frame.py Co-authored-by: Joris Van den Bossche --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 60b8b05dd8874..706d5de882c9b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1309,7 +1309,7 @@ def to_numpy( a copy is made, even if not strictly necessary. na_value : Any, optional The value to use for missing values. The default value depends - on `dtype` and the type of the array. + on `dtype` and the dtypes of the DataFrame columns. .. versionadded:: 1.1.0 From 9c87e0031f4893fb9acdc0f50e7b6d17eecede47 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 4 May 2020 07:21:12 -0500 Subject: [PATCH 15/22] side effect test --- pandas/tests/base/test_conversion.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index a8e6835becccc..5d51587152454 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -444,3 +444,11 @@ def test_to_numpy_dataframe_single_block(data, expected): df = pd.DataFrame(data) result = df.to_numpy(dtype=float, na_value=np.nan) tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_dataframe_single_block_no_mutate(): + # https://github.com/pandas-dev/pandas/issues/33820 + result = pd.DataFrame(np.array([1.0, 2.0, np.nan])) + expected = pd.DataFrame(np.array([1.0, 2.0, np.nan])) + result.to_numpy(dtype=float, na_value=0.0) + tm.assert_frame_equal(result, expected) From ae2b34ae95d3bb8e257507f2c685886867aed185 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 4 May 2020 07:30:17 -0500 Subject: [PATCH 16/22] Update pandas/tests/base/test_conversion.py Co-authored-by: Joris Van den Bossche --- pandas/tests/base/test_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 5d51587152454..b688a048cbe8e 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -450,5 +450,5 @@ def test_to_numpy_dataframe_single_block_no_mutate(): # https://github.com/pandas-dev/pandas/issues/33820 result = pd.DataFrame(np.array([1.0, 2.0, np.nan])) expected = pd.DataFrame(np.array([1.0, 2.0, np.nan])) - result.to_numpy(dtype=float, na_value=0.0) + result.to_numpy(na_value=0.0) tm.assert_frame_equal(result, expected) From bcb69c5550f2b1060b95f229d2118805d1b5ad09 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 4 May 2020 07:31:56 -0500 Subject: [PATCH 17/22] Copy --- pandas/core/internals/managers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index de1d3a775651c..aeeef04293562 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -826,6 +826,7 @@ def as_array( if na_value is not lib.no_default: na_mask = isna(arr) if na_mask.any(): + arr = arr.copy() arr[na_mask] = na_value return arr.transpose() if transpose else arr From c3a7a558c4d68ce5cd7f45049365468d0fbe818f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 4 May 2020 18:01:55 -0500 Subject: [PATCH 18/22] Copy less --- pandas/core/frame.py | 5 +---- pandas/core/internals/managers.py | 18 +++++++++++++----- pandas/tests/frame/test_api.py | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 706d5de882c9b..1b974075f8051 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1344,12 +1344,9 @@ def to_numpy( [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ result = self._mgr.as_array( - transpose=self._AXIS_REVERSED, dtype=dtype, na_value=na_value + transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value ) - if copy: - result = result.copy() - return result def to_dict(self, orient="dict", into=dict): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index aeeef04293562..b9eb34e2fd6da 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -782,7 +782,11 @@ def copy_func(ax): return res def as_array( - self, transpose: bool = False, dtype=None, na_value=lib.no_default + self, + transpose: bool = False, + dtype=None, + copy: bool = False, + na_value=lib.no_default, ) -> np.ndarray: """ Convert the blockmanager data into an numpy array. @@ -793,6 +797,10 @@ def as_array( If True, transpose the return array. dtype : object, default None Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. na_value : object, default lib.no_default Value to be used as the missing value sentinel. @@ -823,11 +831,11 @@ def as_array( else: arr = self._interleave(dtype=dtype, na_value=na_value) + if copy or na_value is not lib.no_default: + arr = arr.copy() + if na_value is not lib.no_default: - na_mask = isna(arr) - if na_mask.any(): - arr = arr.copy() - arr[na_mask] = na_value + arr[isna(arr)] = na_value return arr.transpose() if transpose else arr diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 5cf74d3205a13..2b79fc8cd3406 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -365,7 +365,7 @@ def test_to_numpy_copy(self): df = pd.DataFrame(arr) assert df.values.base is arr assert df.to_numpy(copy=False).base is arr - assert df.to_numpy(copy=True).base is None + assert df.to_numpy(copy=True).base is not arr def test_swapaxes(self): df = DataFrame(np.random.randn(10, 5)) From f3e45d7ab9e096d153d5af57f4624a0c9484d446 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 5 May 2020 09:32:25 -0500 Subject: [PATCH 19/22] should_copy --- pandas/core/internals/managers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b9eb34e2fd6da..491bbbe0240bc 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -812,6 +812,8 @@ def as_array( arr = np.empty(self.shape, dtype=float) return arr.transpose() if transpose else arr + should_copy = copy or na_value is not lib.no_default + if self._is_single_block and self.blocks[0].is_datetimetz: # TODO(Block.get_values): Make DatetimeTZBlock.get_values # always be object dtype. Some callers seem to want the @@ -830,8 +832,9 @@ def as_array( arr = arr.astype(dtype) else: arr = self._interleave(dtype=dtype, na_value=na_value) + should_copy = False - if copy or na_value is not lib.no_default: + if should_copy: arr = arr.copy() if na_value is not lib.no_default: From 491a5ae9da3f7a476f73054154e0e85d1e3e9cad Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Tue, 5 May 2020 09:42:36 -0500 Subject: [PATCH 20/22] Update pandas/core/internals/managers.py Co-authored-by: Joris Van den Bossche --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 491bbbe0240bc..fc3ab6c153c3d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -829,7 +829,7 @@ def as_array( elif self._is_single_block or not self.is_mixed_type: arr = np.asarray(self.blocks[0].get_values()) if dtype: - arr = arr.astype(dtype) + arr = arr.astype(dtype, copy=False) else: arr = self._interleave(dtype=dtype, na_value=na_value) should_copy = False From 4ecccff7d8b9bdd5d89c9238d3af122e86a871c2 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 9 May 2020 17:00:39 -0500 Subject: [PATCH 21/22] Rename and comment --- pandas/core/internals/managers.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index fc3ab6c153c3d..dbb7c0950c0c8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -812,7 +812,9 @@ def as_array( arr = np.empty(self.shape, dtype=float) return arr.transpose() if transpose else arr - should_copy = copy or na_value is not lib.no_default + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default if self._is_single_block and self.blocks[0].is_datetimetz: # TODO(Block.get_values): Make DatetimeTZBlock.get_values @@ -832,9 +834,10 @@ def as_array( arr = arr.astype(dtype, copy=False) else: arr = self._interleave(dtype=dtype, na_value=na_value) - should_copy = False + # The underlying data was copied within _interleave + copy = False - if should_copy: + if copy: arr = arr.copy() if na_value is not lib.no_default: From 8d42fd4c5c561858f1769a3269f70092407404a4 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 9 May 2020 17:10:02 -0500 Subject: [PATCH 22/22] Don't special case datetimetz --- pandas/core/internals/managers.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bcdd030a4567c..4f6d84e52ea54 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -816,12 +816,7 @@ def as_array( # mutating the original object copy = copy or na_value is not lib.no_default - if self._is_single_block and self.blocks[0].is_datetimetz: - # TODO(Block.get_values): Make DatetimeTZBlock.get_values - # always be object dtype. Some callers seem to want the - # DatetimeArray (previously DTI) - arr = self.blocks[0].get_values(dtype=object) - elif self._is_single_block and self.blocks[0].is_extension: + if self._is_single_block and self.blocks[0].is_extension: # Avoid implicit conversion of extension blocks to object arr = ( self.blocks[0]