Skip to content

Commit 1ef556e

Browse files
committed
API: Standard signature for to_numpy
This is part 1 of pandas-dev#23995. We standardize the signature as `to_numpy(dtype : Union[str, np.dtype], copy : bool) -> ndarray`.
1 parent a34adac commit 1ef556e

File tree

6 files changed

+144
-19
lines changed

6 files changed

+144
-19
lines changed

doc/source/whatsnew/v0.24.0.rst

+19
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,25 @@ as ``.values``).
7373
ser.array
7474
ser.to_numpy()
7575
76+
:meth:`~Series.to_numpy` gives some control over the ``dtype`` of the resulting :class:`ndarray`,
77+
which :attr:`Series.values` couldn't provide. For example, consider datetimes with timezones.
78+
NumPy doesn't have a dtype to represent datetimes with timezones, so there are two possibly
79+
useful representations:
80+
81+
1. An object-dtype :class:`ndarray` with :class:`Timestamp` objects, each with the correct ``tz``
82+
2. A ``datetime64[ns]``-dtype :class:`ndarray`, where the values have been converted to UTC and the timezone discarded
83+
84+
Timezones may be preserved with ``dtype=object``
85+
86+
.. ipython:: python
87+
88+
ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
89+
ser.to_numpy(dtype=object)
90+
91+
Or thrown away with ``dtype='datetime64[ns]'``
92+
93+
ser.to_numpy(dtype="datetime64[ns]")
94+
7695
We haven't removed or deprecated :attr:`Series.values` or :attr:`DataFrame.values`, but we
7796
recommend using ``.array`` or ``.to_numpy()`` instead.
7897

pandas/core/base.py

+48-12
Original file line numberDiff line numberDiff line change
@@ -841,18 +841,22 @@ def array(self):
841841
"""
842842
return self._values
843843

844-
def to_numpy(self):
844+
def to_numpy(self, dtype=None, copy=False):
845845
"""
846846
A NumPy ndarray representing the values in this Series or Index.
847847
848848
.. versionadded:: 0.24.0
849849
850-
The returned array will be the same up to equality (values equal
851-
in `self` will be equal in the returned array; likewise for values
852-
that are not equal). When `self` contains an ExtensionArray, the
853-
dtype may be different. For example, for a category-dtype Series,
854-
``to_numpy()`` will return a NumPy array and the categorical dtype
855-
will be lost.
850+
851+
Parameters
852+
----------
853+
dtype : str or numpy.dtype, optional
854+
The dtype to pass to :func:`numpy.asarray`
855+
copy : bool, default False
856+
Whether to ensure that the returned value is not a view on
857+
another array. Note that ``copy=False`` does not *ensure* that
858+
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
859+
a copy is made, even if not strictly necessary.
856860
857861
Returns
858862
-------
@@ -866,10 +870,18 @@ def to_numpy(self):
866870
867871
Notes
868872
-----
873+
The returned array will be the same up to equality (values equal
874+
in `self` will be equal in the returned array; likewise for values
875+
that are not equal). When `self` contains an ExtensionArray, the
876+
dtype may be different. For example, for a category-dtype Series,
877+
``to_numpy()`` will return a NumPy array and the categorical dtype
878+
will be lost.
879+
880+
869881
For NumPy dtypes, this will be a reference to the actual data stored
870-
in this Series or Index. Modifying the result in place will modify
871-
the data stored in the Series or Index (not that we recommend doing
872-
that).
882+
in this Series or Index (assuming ``copy=False``). Modifying the result
883+
in place will modify the data stored in the Series or Index (not that
884+
we recommend doing that).
873885
874886
For extension types, ``to_numpy()`` *may* require copying data and
875887
coercing the result to a NumPy type (possibly object), which may be
@@ -894,12 +906,36 @@ def to_numpy(self):
894906
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
895907
>>> ser.to_numpy()
896908
array(['a', 'b', 'a'], dtype=object)
909+
910+
Specify the `dtype` to control how timezone-aware datetime data is represented.
911+
Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
912+
objects, each with the correct ``tz``.
913+
914+
>>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
915+
>>> ser.to_numpy(dtype=object)
916+
array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
917+
Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
918+
dtype=object)
919+
920+
Or ``dtype='datetime64[ns]'`` to return an ndarray of native
921+
datetime64 values. The values are converted to UTC and the timezone
922+
info is dropped.
923+
924+
>>> ser.to_numpy(dtype="datetime64[ns]")
925+
... # doctest: +ELLIPSIS
926+
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
927+
dtype='datetime64[ns]')
897928
"""
898929
if (is_extension_array_dtype(self.dtype) or
899930
is_datetime64tz_dtype(self.dtype)):
900931
# TODO(DatetimeArray): remove the second clause.
901-
return np.asarray(self._values)
902-
return self._values
932+
result = np.asarray(self._values, dtype=dtype)
933+
else:
934+
result = self._values
935+
936+
if copy:
937+
result = result.copy()
938+
return result
903939

904940
@property
905941
def _ndarray_values(self):

pandas/core/frame.py

+18-7
Original file line numberDiff line numberDiff line change
@@ -1072,17 +1072,27 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None):
10721072

10731073
return cls(data, index=index, columns=columns, dtype=dtype)
10741074

1075-
def to_numpy(self):
1075+
def to_numpy(self, dtype=None, copy=False):
10761076
"""
10771077
Convert the DataFrame to a NumPy array.
10781078
10791079
.. versionadded:: 0.24.0
10801080
1081-
The dtype of the returned array will be the common NumPy
1082-
dtype of all types in the DataFrame. For example,
1083-
if the dtypes are ``float16`` and ``float32``, the results
1084-
dtype will be ``float32``. This may require copying data and
1085-
coercing values, which may be expensive.
1081+
By default, the dtype of the returned array will be the common NumPy
1082+
dtype of all types in the DataFrame. For example, if the dtypes are
1083+
``float16`` and ``float32``, the resulting dtype will be ``float32``.
1084+
This may require copying data and coercing values, which may be
1085+
expensive.
1086+
1087+
Parameters
1088+
----------
1089+
dtype : str or numpy.dtype, optional
1090+
The dtype to pass to :func:`numpy.asarray`
1091+
copy : bool, default False
1092+
Whether to ensure that the returned value is not a view on
1093+
another array. Note that ``copy=False`` does not *ensure* that
1094+
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
1095+
a copy is made, even if not strictly necessary.
10861096
10871097
Returns
10881098
-------
@@ -1114,7 +1124,8 @@ def to_numpy(self):
11141124
array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
11151125
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
11161126
"""
1117-
return self.values
1127+
result = np.array(self.values, dtype=dtype, copy=copy)
1128+
return result
11181129

11191130
def to_dict(self, orient='dict', into=dict):
11201131
"""

pandas/tests/frame/test_api.py

+13
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,19 @@ def test_to_numpy(self):
325325
result = df.to_numpy()
326326
tm.assert_numpy_array_equal(result, expected)
327327

328+
def test_to_numpy_dtype(self):
329+
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
330+
expected = np.array([[1, 3], [2, 4]], dtype="int64")
331+
result = df.to_numpy(dtype="int64")
332+
tm.assert_numpy_array_equal(result, expected)
333+
334+
def test_to_numpy_copy(self):
335+
arr = np.random.randn(4, 3)
336+
df = pd.DataFrame(arr)
337+
assert df.values.base is arr
338+
assert df.to_numpy(copy=False).base is arr
339+
assert df.to_numpy(copy=True).base is None
340+
328341
def test_transpose(self, float_frame):
329342
frame = float_frame
330343
dft = frame.T

pandas/tests/indexes/multi/test_conversion.py

+6
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ def test_tolist(idx):
1717
assert result == exp
1818

1919

20+
def test_to_numpy(idx):
21+
result = idx.to_numpy()
22+
exp = idx.values
23+
tm.assert_numpy_array_equal(result, exp)
24+
25+
2026
def test_to_frame():
2127
tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
2228

pandas/tests/test_base.py

+40
Original file line numberDiff line numberDiff line change
@@ -1301,3 +1301,43 @@ def test_to_numpy(array, expected, box):
13011301

13021302
result = thing.to_numpy()
13031303
tm.assert_numpy_array_equal(result, expected)
1304+
1305+
1306+
@pytest.mark.parametrize("as_series", [True, False])
1307+
@pytest.mark.parametrize("arr", [
1308+
np.array([1, 2, 3], dtype="int64"),
1309+
np.array(['a', 'b', 'c'], dtype=object),
1310+
])
1311+
def test_to_numpy_copy(arr, as_series):
1312+
obj = pd.Index(arr, copy=False)
1313+
if as_series:
1314+
obj = pd.Series(obj.values, copy=False)
1315+
1316+
# no copy by default
1317+
result = obj.to_numpy()
1318+
assert np.shares_memory(arr, result) is True
1319+
1320+
result = obj.to_numpy(copy=False)
1321+
assert np.shares_memory(arr, result) is True
1322+
1323+
# copy=True
1324+
result = obj.to_numpy(copy=True)
1325+
assert np.shares_memory(arr, result) is False
1326+
1327+
1328+
@pytest.mark.parametrize("as_series", [True, False])
1329+
def test_to_numpy_dtype(as_series):
1330+
tz = "US/Eastern"
1331+
obj = pd.DatetimeIndex(['2000', '2001'], tz=tz)
1332+
if as_series:
1333+
obj = pd.Series(obj)
1334+
result = obj.to_numpy(dtype=object)
1335+
expected = np.array([pd.Timestamp('2000', tz=tz),
1336+
pd.Timestamp('2001', tz=tz)],
1337+
dtype=object)
1338+
tm.assert_numpy_array_equal(result, expected)
1339+
1340+
result = obj.to_numpy()
1341+
expected = np.array(['2000-01-01T05', '2001-01-01T05'],
1342+
dtype='M8[ns]')
1343+
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)