Skip to content

Commit a89136f

Browse files
TomAugspurger authored and jreback committed
API: Standard signature for to_numpy (#24341)
1 parent 312a8ee commit a89136f

File tree

6 files changed

+147
-19
lines changed

6 files changed

+147
-19
lines changed

doc/source/basics.rst

+21
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,27 @@ be the same as :attr:`~Series.array`. When the Series or Index is backed by
8686
a :class:`~pandas.api.extension.ExtensionArray`, :meth:`~Series.to_numpy`
8787
may involve copying data and coercing values.
8888

89+
:meth:`~Series.to_numpy` gives some control over the ``dtype`` of the
90+
resulting :class:`ndarray`. For example, consider datetimes with timezones.
91+
NumPy doesn't have a dtype to represent timezone-aware datetimes, so there
92+
are two possibly useful representations:
93+
94+
1. An object-dtype :class:`ndarray` with :class:`Timestamp` objects, each
95+
with the correct ``tz``
96+
2. A ``datetime64[ns]``-dtype :class:`ndarray`, where the values have
97+
been converted to UTC and the timezone discarded
98+
99+
Timezones may be preserved with ``dtype=object``
100+
101+
.. ipython:: python
102+
103+
ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
104+
ser.to_numpy(dtype=object)
105+
106+
Or thrown away with ``dtype='datetime64[ns]'``

.. ipython:: python
107+
108+
ser.to_numpy(dtype="datetime64[ns]")
109+
89110
Getting the "raw data" inside a :class:`DataFrame` is possibly a bit more
90111
complex. When your ``DataFrame`` only has a single data type for all the
91112
columns, :meth:`DataFrame.to_numpy` will return the underlying data:

pandas/core/base.py

+49-12
Original file line numberDiff line numberDiff line change
@@ -841,18 +841,22 @@ def array(self):
841841
"""
842842
return self._values
843843

844-
def to_numpy(self):
844+
def to_numpy(self, dtype=None, copy=False):
845845
"""
846846
A NumPy ndarray representing the values in this Series or Index.
847847
848848
.. versionadded:: 0.24.0
849849
850-
The returned array will be the same up to equality (values equal
851-
in `self` will be equal in the returned array; likewise for values
852-
that are not equal). When `self` contains an ExtensionArray, the
853-
dtype may be different. For example, for a category-dtype Series,
854-
``to_numpy()`` will return a NumPy array and the categorical dtype
855-
will be lost.
850+
851+
Parameters
852+
----------
853+
dtype : str or numpy.dtype, optional
854+
The dtype to pass to :func:`numpy.asarray`.
855+
copy : bool, default False
856+
Whether to ensure that the returned value is not a view on
857+
another array. Note that ``copy=False`` does not *ensure* that
858+
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
859+
a copy is made, even if not strictly necessary.
856860
857861
Returns
858862
-------
@@ -866,10 +870,18 @@ def to_numpy(self):
866870
867871
Notes
868872
-----
873+
The returned array will be the same up to equality (values equal
874+
in `self` will be equal in the returned array; likewise for values
875+
that are not equal). When `self` contains an ExtensionArray, the
876+
dtype may be different. For example, for a category-dtype Series,
877+
``to_numpy()`` will return a NumPy array and the categorical dtype
878+
will be lost.
879+
880+
869881
For NumPy dtypes, this will be a reference to the actual data stored
870-
in this Series or Index. Modifying the result in place will modify
871-
the data stored in the Series or Index (not that we recommend doing
872-
that).
882+
in this Series or Index (assuming ``copy=False``). Modifying the result
883+
in place will modify the data stored in the Series or Index (not that
884+
we recommend doing that).
873885
874886
For extension types, ``to_numpy()`` *may* require copying data and
875887
coercing the result to a NumPy type (possibly object), which may be
@@ -894,12 +906,37 @@ def to_numpy(self):
894906
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
895907
>>> ser.to_numpy()
896908
array(['a', 'b', 'a'], dtype=object)
909+
910+
Specify the `dtype` to control how timezone-aware data is represented.
911+
Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
912+
objects, each with the correct ``tz``.
913+
914+
>>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
915+
>>> ser.to_numpy(dtype=object)
916+
array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
917+
Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
918+
dtype=object)
919+
920+
Or ``dtype='datetime64[ns]'`` to return an ndarray of native
921+
datetime64 values. The values are converted to UTC and the timezone
922+
info is dropped.
923+
924+
>>> ser.to_numpy(dtype="datetime64[ns]")
925+
... # doctest: +ELLIPSIS
926+
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
927+
dtype='datetime64[ns]')
897928
"""
898929
if (is_extension_array_dtype(self.dtype) or
899930
is_datetime64tz_dtype(self.dtype)):
900931
# TODO(DatetimeArray): remove the second clause.
901-
return np.asarray(self._values)
902-
return self._values
932+
# TODO(GH-24345): Avoid potential double copy
933+
result = np.asarray(self._values, dtype=dtype)
934+
else:
935+
result = self._values
936+
937+
if copy:
938+
result = result.copy()
939+
return result
903940

904941
@property
905942
def _ndarray_values(self):

pandas/core/frame.py

+18-7
Original file line numberDiff line numberDiff line change
@@ -1126,17 +1126,27 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None):
11261126

11271127
return cls(data, index=index, columns=columns, dtype=dtype)
11281128

1129-
def to_numpy(self):
1129+
def to_numpy(self, dtype=None, copy=False):
11301130
"""
11311131
Convert the DataFrame to a NumPy array.
11321132
11331133
.. versionadded:: 0.24.0
11341134
1135-
The dtype of the returned array will be the common NumPy
1136-
dtype of all types in the DataFrame. For example,
1137-
if the dtypes are ``float16`` and ``float32``, the results
1138-
dtype will be ``float32``. This may require copying data and
1139-
coercing values, which may be expensive.
1135+
By default, the dtype of the returned array will be the common NumPy
1136+
dtype of all types in the DataFrame. For example, if the dtypes are
1137+
``float16`` and ``float32``, the resulting dtype will be ``float32``.
1138+
This may require copying data and coercing values, which may be
1139+
expensive.
1140+
1141+
Parameters
1142+
----------
1143+
dtype : str or numpy.dtype, optional
1144+
The dtype to pass to :func:`numpy.asarray`.
1145+
copy : bool, default False
1146+
Whether to ensure that the returned value is not a view on
1147+
another array. Note that ``copy=False`` does not *ensure* that
1148+
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
1149+
a copy is made, even if not strictly necessary.
11401150
11411151
Returns
11421152
-------
@@ -1168,7 +1178,8 @@ def to_numpy(self):
11681178
array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
11691179
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
11701180
"""
1171-
return self.values
1181+
result = np.array(self.values, dtype=dtype, copy=copy)
1182+
return result
11721183

11731184
def to_dict(self, orient='dict', into=dict):
11741185
"""

pandas/tests/frame/test_api.py

+13
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,19 @@ def test_to_numpy(self):
325325
result = df.to_numpy()
326326
tm.assert_numpy_array_equal(result, expected)
327327

328+
def test_to_numpy_dtype(self):
329+
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
330+
expected = np.array([[1, 3], [2, 4]], dtype="int64")
331+
result = df.to_numpy(dtype="int64")
332+
tm.assert_numpy_array_equal(result, expected)
333+
334+
def test_to_numpy_copy(self):
335+
arr = np.random.randn(4, 3)
336+
df = pd.DataFrame(arr)
337+
assert df.values.base is arr
338+
assert df.to_numpy(copy=False).base is arr
339+
assert df.to_numpy(copy=True).base is None
340+
328341
def test_transpose(self, float_frame):
329342
frame = float_frame
330343
dft = frame.T

pandas/tests/indexes/multi/test_conversion.py

+6
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ def test_tolist(idx):
1717
assert result == exp
1818

1919

20+
def test_to_numpy(idx):
21+
result = idx.to_numpy()
22+
exp = idx.values
23+
tm.assert_numpy_array_equal(result, exp)
24+
25+
2026
def test_to_frame():
2127
tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
2228

pandas/tests/test_base.py

+40
Original file line numberDiff line numberDiff line change
@@ -1301,3 +1301,43 @@ def test_to_numpy(array, expected, box):
13011301

13021302
result = thing.to_numpy()
13031303
tm.assert_numpy_array_equal(result, expected)
1304+
1305+
1306+
@pytest.mark.parametrize("as_series", [True, False])
1307+
@pytest.mark.parametrize("arr", [
1308+
np.array([1, 2, 3], dtype="int64"),
1309+
np.array(['a', 'b', 'c'], dtype=object),
1310+
])
1311+
def test_to_numpy_copy(arr, as_series):
1312+
obj = pd.Index(arr, copy=False)
1313+
if as_series:
1314+
obj = pd.Series(obj.values, copy=False)
1315+
1316+
# no copy by default
1317+
result = obj.to_numpy()
1318+
assert np.shares_memory(arr, result) is True
1319+
1320+
result = obj.to_numpy(copy=False)
1321+
assert np.shares_memory(arr, result) is True
1322+
1323+
# copy=True
1324+
result = obj.to_numpy(copy=True)
1325+
assert np.shares_memory(arr, result) is False
1326+
1327+
1328+
@pytest.mark.parametrize("as_series", [True, False])
1329+
def test_to_numpy_dtype(as_series):
1330+
tz = "US/Eastern"
1331+
obj = pd.DatetimeIndex(['2000', '2001'], tz=tz)
1332+
if as_series:
1333+
obj = pd.Series(obj)
1334+
result = obj.to_numpy(dtype=object)
1335+
expected = np.array([pd.Timestamp('2000', tz=tz),
1336+
pd.Timestamp('2001', tz=tz)],
1337+
dtype=object)
1338+
tm.assert_numpy_array_equal(result, expected)
1339+
1340+
result = obj.to_numpy()
1341+
expected = np.array(['2000-01-01T05', '2001-01-01T05'],
1342+
dtype='M8[ns]')
1343+
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)