Skip to content

Commit 1078b25

Browse files
committed
API: Standard signature for to_numpy (pandas-dev#24341)
1 parent 8db2167 commit 1078b25

File tree

6 files changed

+147
-19
lines changed

6 files changed

+147
-19
lines changed

doc/source/basics.rst

+21
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,27 @@ When the Series or Index is backed by
8787
an :class:`~pandas.api.extension.ExtensionArray`, :meth:`~Series.to_numpy`
8888
may involve copying data and coercing values. See :ref:`basics.dtypes` for more.
8989

90+
:meth:`~Series.to_numpy` gives some control over the ``dtype`` of the
91+
resulting :class:`ndarray`. For example, consider datetimes with timezones.
92+
NumPy doesn't have a dtype to represent timezone-aware datetimes, so there
93+
are two possibly useful representations:
94+
95+
1. An object-dtype :class:`ndarray` with :class:`Timestamp` objects, each
96+
with the correct ``tz``
97+
2. A ``datetime64[ns]``-dtype :class:`ndarray`, where the values have
98+
been converted to UTC and the timezone discarded
99+
100+
Timezones may be preserved with ``dtype=object``
101+
102+
.. ipython:: python
103+
104+
ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
105+
ser.to_numpy(dtype=object)
106+
107+
Or thrown away with ``dtype='datetime64[ns]'``
108+
109+
ser.to_numpy(dtype="datetime64[ns]")
110+
90111
Getting the "raw data" inside a :class:`DataFrame` is possibly a bit more
91112
complex. When your ``DataFrame`` only has a single data type for all the
92113
columns, :meth:`DataFrame.to_numpy` will return the underlying data:

pandas/core/base.py

+49-12
Original file line numberDiff line numberDiff line change
@@ -857,18 +857,22 @@ def array(self):
857857
result = PandasArray(result)
858858
return result
859859

860-
def to_numpy(self):
860+
def to_numpy(self, dtype=None, copy=False):
861861
"""
862862
A NumPy ndarray representing the values in this Series or Index.
863863
864864
.. versionadded:: 0.24.0
865865
866-
The returned array will be the same up to equality (values equal
867-
in `self` will be equal in the returned array; likewise for values
868-
that are not equal). When `self` contains an ExtensionArray, the
869-
dtype may be different. For example, for a category-dtype Series,
870-
``to_numpy()`` will return a NumPy array and the categorical dtype
871-
will be lost.
866+
867+
Parameters
868+
----------
869+
dtype : str or numpy.dtype, optional
870+
The dtype to pass to :meth:`numpy.asarray`
871+
copy : bool, default False
872+
Whether to ensure that the returned value is not a view on
873+
another array. Note that ``copy=False`` does not *ensure* that
874+
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
875+
a copy is made, even if not strictly necessary.
872876
873877
Returns
874878
-------
@@ -882,10 +886,18 @@ def to_numpy(self):
882886
883887
Notes
884888
-----
889+
The returned array will be the same up to equality (values equal
890+
in `self` will be equal in the returned array; likewise for values
891+
that are not equal). When `self` contains an ExtensionArray, the
892+
dtype may be different. For example, for a category-dtype Series,
893+
``to_numpy()`` will return a NumPy array and the categorical dtype
894+
will be lost.
895+
896+
885897
For NumPy dtypes, this will be a reference to the actual data stored
886-
in this Series or Index. Modifying the result in place will modify
887-
the data stored in the Series or Index (not that we recommend doing
888-
that).
898+
in this Series or Index (assuming ``copy=False``). Modifying the result
899+
in place will modify the data stored in the Series or Index (not that
900+
we recommend doing that).
889901
890902
For extension types, ``to_numpy()`` *may* require copying data and
891903
coercing the result to a NumPy type (possibly object), which may be
@@ -910,12 +922,37 @@ def to_numpy(self):
910922
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
911923
>>> ser.to_numpy()
912924
array(['a', 'b', 'a'], dtype=object)
925+
926+
Specify the `dtype` to control how timezone-aware datetime data is represented.
927+
Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
928+
objects, each with the correct ``tz``.
929+
930+
>>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
931+
>>> ser.to_numpy(dtype=object)
932+
array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
933+
Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
934+
dtype=object)
935+
936+
Or ``dtype='datetime64[ns]'`` to return an ndarray of native
937+
datetime64 values. The values are converted to UTC and the timezone
938+
info is dropped.
939+
940+
>>> ser.to_numpy(dtype="datetime64[ns]")
941+
... # doctest: +ELLIPSIS
942+
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
943+
dtype='datetime64[ns]')
913944
"""
914945
if (is_extension_array_dtype(self.dtype) or
915946
is_datetime64tz_dtype(self.dtype)):
916947
# TODO(DatetimeArray): remove the second clause.
917-
return np.asarray(self._values)
918-
return self._values
948+
# TODO(GH-24345): Avoid potential double copy
949+
result = np.asarray(self._values, dtype=dtype)
950+
else:
951+
result = self._values
952+
953+
if copy:
954+
result = result.copy()
955+
return result
919956

920957
@property
921958
def _ndarray_values(self):

pandas/core/frame.py

+18-7
Original file line numberDiff line numberDiff line change
@@ -1126,17 +1126,27 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None):
11261126

11271127
return cls(data, index=index, columns=columns, dtype=dtype)
11281128

1129-
def to_numpy(self):
1129+
def to_numpy(self, dtype=None, copy=False):
11301130
"""
11311131
Convert the DataFrame to a NumPy array.
11321132
11331133
.. versionadded:: 0.24.0
11341134
1135-
The dtype of the returned array will be the common NumPy
1136-
dtype of all types in the DataFrame. For example,
1137-
if the dtypes are ``float16`` and ``float32``, the results
1138-
dtype will be ``float32``. This may require copying data and
1139-
coercing values, which may be expensive.
1135+
By default, the dtype of the returned array will be the common NumPy
1136+
dtype of all types in the DataFrame. For example, if the dtypes are
1137+
``float16`` and ``float32``, the result's dtype will be ``float32``.
1138+
This may require copying data and coercing values, which may be
1139+
expensive.
1140+
1141+
Parameters
1142+
----------
1143+
dtype : str or numpy.dtype, optional
1144+
The dtype to pass to :meth:`numpy.asarray`
1145+
copy : bool, default False
1146+
Whether to ensure that the returned value is not a view on
1147+
another array. Note that ``copy=False`` does not *ensure* that
1148+
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
1149+
a copy is made, even if not strictly necessary.
11401150
11411151
Returns
11421152
-------
@@ -1168,7 +1178,8 @@ def to_numpy(self):
11681178
array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
11691179
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
11701180
"""
1171-
return self.values
1181+
result = np.array(self.values, dtype=dtype, copy=copy)
1182+
return result
11721183

11731184
def to_dict(self, orient='dict', into=dict):
11741185
"""

pandas/tests/frame/test_api.py

+13
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,19 @@ def test_to_numpy(self):
325325
result = df.to_numpy()
326326
tm.assert_numpy_array_equal(result, expected)
327327

328+
def test_to_numpy_dtype(self):
329+
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
330+
expected = np.array([[1, 3], [2, 4]], dtype="int64")
331+
result = df.to_numpy(dtype="int64")
332+
tm.assert_numpy_array_equal(result, expected)
333+
334+
def test_to_numpy_copy(self):
335+
arr = np.random.randn(4, 3)
336+
df = pd.DataFrame(arr)
337+
assert df.values.base is arr
338+
assert df.to_numpy(copy=False).base is arr
339+
assert df.to_numpy(copy=True).base is None
340+
328341
def test_transpose(self, float_frame):
329342
frame = float_frame
330343
dft = frame.T

pandas/tests/indexes/multi/test_conversion.py

+6
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ def test_tolist(idx):
1717
assert result == exp
1818

1919

20+
def test_to_numpy(idx):
21+
result = idx.to_numpy()
22+
exp = idx.values
23+
tm.assert_numpy_array_equal(result, exp)
24+
25+
2026
def test_to_frame():
2127
tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
2228

pandas/tests/test_base.py

+40
Original file line numberDiff line numberDiff line change
@@ -1318,3 +1318,43 @@ def test_to_numpy(array, expected, box):
13181318

13191319
result = thing.to_numpy()
13201320
tm.assert_numpy_array_equal(result, expected)
1321+
1322+
1323+
@pytest.mark.parametrize("as_series", [True, False])
1324+
@pytest.mark.parametrize("arr", [
1325+
np.array([1, 2, 3], dtype="int64"),
1326+
np.array(['a', 'b', 'c'], dtype=object),
1327+
])
1328+
def test_to_numpy_copy(arr, as_series):
1329+
obj = pd.Index(arr, copy=False)
1330+
if as_series:
1331+
obj = pd.Series(obj.values, copy=False)
1332+
1333+
# no copy by default
1334+
result = obj.to_numpy()
1335+
assert np.shares_memory(arr, result) is True
1336+
1337+
result = obj.to_numpy(copy=False)
1338+
assert np.shares_memory(arr, result) is True
1339+
1340+
# copy=True
1341+
result = obj.to_numpy(copy=True)
1342+
assert np.shares_memory(arr, result) is False
1343+
1344+
1345+
@pytest.mark.parametrize("as_series", [True, False])
1346+
def test_to_numpy_dtype(as_series):
1347+
tz = "US/Eastern"
1348+
obj = pd.DatetimeIndex(['2000', '2001'], tz=tz)
1349+
if as_series:
1350+
obj = pd.Series(obj)
1351+
result = obj.to_numpy(dtype=object)
1352+
expected = np.array([pd.Timestamp('2000', tz=tz),
1353+
pd.Timestamp('2001', tz=tz)],
1354+
dtype=object)
1355+
tm.assert_numpy_array_equal(result, expected)
1356+
1357+
result = obj.to_numpy()
1358+
expected = np.array(['2000-01-01T05', '2001-01-01T05'],
1359+
dtype='M8[ns]')
1360+
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)